Now that there are (between this and the other major question about effective reading of fixed-width files) a fair amount of options on the offer for reading in such files, I think some benchmarking is appropriate.
I’ll use the following on-the-large-side (400 MB) file for comparison. It’s just a bunch of random characters with randomly defined fields and widths:
set.seed(21394)
wwidth = 400L
rrows = 1000000
#creating the contents at random
contents = write.table(
replicate(
rrows,
paste0(sample(letters, wwidth, replace = TRUE), collapse = "")
),
file = "testfwf.txt",
quote = FALSE, row.names = FALSE, col.names = FALSE
)
#defining the fields & writing a dictionary
n_fields = 40L
endpoints = unique(
c(1L, sort(sample(wwidth, n_fields - 1L)), wwidth + 1L)
)
cols = list(
beg = endpoints[-(n_fields + 1L)],
end = endpoints[-1L] - 1L
)
dict = data.frame(
column = paste0("V", seq_len(length(endpoints)) - 1L)),
start = endpoints[-length(endpoints)] - 1,
length = diff(endpoints)
)
write.csv(dict, file = "testdic.csv", quote = FALSE, row.names = FALSE)
I’ll compare five methods mentioned between these two threads (I’ll add some others if the authors would like): the base version (read.fwf
), piping the result of in2csv
to fread
(@AnandaMahto’s suggestion), Hadley’s new readr
(read_fwf
), that using LaF
/ffbase
(@jwijffls’ suggestion), and an improved (streamlined) version of that suggested by the question author (@MarkDanese) combining fread
with stri_sub
from stringi
.
Here is the benchmarking code:
library(data.table)
library(stringi)
library(readr)
library(LaF)
library(ffbase)
library(microbenchmark)
microbenchmark(
times = 5L,
utils = read.fwf("testfwf.txt", diff(endpoints), header = FALSE),
in2csv = fread(cmd = sprintf(
"in2csv -f fixed -s %s %s",
"testdic.csv", "testfwf.txt"
)),
readr = read_fwf("testfwf.txt", fwf_widths(diff(endpoints))),
LaF = {
my.data.laf = laf_open_fwf(
'testfwf.txt',
column_widths = diff(endpoints),
column_types = rep("character", length(endpoints) - 1L)
)
my.data = laf_to_ffdf(my.data.laf, nrows = rrows)
as.data.frame(my.data)
},
fread = {
DT = fread("testfwf.txt", header = FALSE, sep = "\n")
DT[ , lapply(seq_len(length(cols$beg)), function(ii) {
stri_sub(V1, cols$beg[ii], cols$end[ii])
})]
}
)
And the output:
# Unit: seconds
# expr min lq mean median uq max neval cld
# utils 423.76786 465.39212 499.00109 501.87568 543.12382 560.84598 5 c
# in2csv 67.74065 68.56549 69.60069 70.11774 70.18746 71.39210 5 a
# readr 10.57945 11.32205 15.70224 14.89057 19.54617 22.17298 5 a
# LaF 207.56267 236.39389 239.45985 237.96155 238.28316 277.09798 5 b
# fread 14.42617 15.44693 26.09877 15.76016 20.45481 64.40581 5 a
So it seems readr
and fread
+ stri_sub
are pretty competitive as the fastest; built-in read.fwf
is the clear loser.
Note that the real advantage of readr
here is that you can pre-specify column types; with fread
you’ll have to type convert afterwards.
EDIT: Adding some alternatives
At @AnandaMahto’s suggestion I am including some more options, including one that appears to be a new winner! To save time I excluded the slowest options above in the new comparison. Here’s the new code:
library(iotools)
microbenchmark(
times = 5L,
readr = read_fwf("testfwf.txt", fwf_widths(diff(endpoints))),
fread = {
DT = fread("testfwf.txt", header = FALSE, sep = "\n")
DT[ , lapply(seq_len(length(cols$beg)), function(ii) {
stri_sub(V1, cols$beg[ii], cols$end[ii])
})]
},
iotools = input.file(
"testfwf.txt", formatter = dstrfw,
col_types = rep("character", length(endpoints) - 1L),
widths = diff(endpoints)
),
awk = fread(header = FALSE, cmd = sprintf(
"awk -v FIELDWIDTHS='%s' -v OFS=', ' '{$1=$1 \"\"; print}' < testfwf.txt",
paste(diff(endpoints), collapse = " ")
))
)
And the new output:
# Unit: seconds
# expr min lq mean median uq max neval cld
# readr 7.892527 8.016857 10.293371 9.527409 9.807145 16.222916 5 a
# fread 9.652377 9.696135 9.796438 9.712686 9.807830 10.113160 5 a
# iotools 5.900362 7.591847 7.438049 7.799729 7.845727 8.052579 5 a
# awk 14.440489 14.457329 14.637879 14.472836 14.666587 15.152156 5 b
So it appears iotools
is both very fast and very consistent.