Faster way to read fixed-width files

Now that there are (between this and the other major question about effective reading of fixed-width files) a fair amount of options on the offer for reading in such files, I think some benchmarking is appropriate.

I’ll use the following on-the-large-side (400 MB) file for comparison. It’s just a bunch of random characters with randomly defined fields and widths:

set.seed(21394)
wwidth = 400L
rrows = 1000000
    
#creating the contents at random
contents = write.table(
  replicate(
    rrows,
    paste0(sample(letters, wwidth, replace = TRUE), collapse = "")
  ),
  file = "testfwf.txt",
  quote = FALSE, row.names = FALSE, col.names = FALSE
)
    
#defining the fields & writing a dictionary
n_fields = 40L
endpoints = unique(
  c(1L, sort(sample(wwidth, n_fields - 1L)), wwidth + 1L)
)
cols = list(
  beg = endpoints[-(n_fields + 1L)], 
  end = endpoints[-1L] - 1L
)
    
dict = data.frame(
  column = paste0("V", seq_len(length(endpoints)) - 1L)),
  start = endpoints[-length(endpoints)] - 1,
  length = diff(endpoints)
)
    
write.csv(dict, file = "testdic.csv", quote = FALSE, row.names = FALSE)

I’ll compare five methods mentioned between these two threads (I’ll add some others if the authors would like): the base version (read.fwf), piping the result of in2csv to fread (@AnandaMahto’s suggestion), Hadley’s new readr (read_fwf), that using LaF/ffbase (@jwijffls’ suggestion), and an improved (streamlined) version of that suggested by the question author (@MarkDanese) combining fread with stri_sub from stringi.

Here is the benchmarking code:

library(data.table)
library(stringi)
library(readr)
library(LaF)
library(ffbase)
library(microbenchmark)
    
microbenchmark(
  times = 5L,
  utils = read.fwf("testfwf.txt", diff(endpoints), header = FALSE),
  in2csv = fread(cmd = sprintf(
    "in2csv -f fixed -s %s %s",
    "testdic.csv", "testfwf.txt"
  )),
  readr = read_fwf("testfwf.txt", fwf_widths(diff(endpoints))),
  LaF = {
    my.data.laf = laf_open_fwf(
      'testfwf.txt', 
      column_widths = diff(endpoints),
      column_types = rep("character", length(endpoints) - 1L)
    )
    my.data = laf_to_ffdf(my.data.laf, nrows = rrows)
    as.data.frame(my.data)
  },
  fread = {
    DT = fread("testfwf.txt", header = FALSE, sep = "\n")
    DT[ , lapply(seq_len(length(cols$beg)), function(ii) {
      stri_sub(V1, cols$beg[ii], cols$end[ii])
    })]
  }
)

And the output:

# Unit: seconds
#    expr       min        lq      mean    median        uq       max neval cld
#   utils 423.76786 465.39212 499.00109 501.87568 543.12382 560.84598     5   c
#  in2csv  67.74065  68.56549  69.60069  70.11774  70.18746  71.39210     5 a  
#   readr  10.57945  11.32205  15.70224  14.89057  19.54617  22.17298     5 a  
#     LaF 207.56267 236.39389 239.45985 237.96155 238.28316 277.09798     5  b 
#   fread  14.42617  15.44693  26.09877  15.76016  20.45481  64.40581     5 a  

So it seems readr and fread + stri_sub are pretty competitive as the fastest; built-in read.fwf is the clear loser.

Note that the real advantage of readr here is that you can pre-specify column types; with fread you’ll have to type convert afterwards.

EDIT: Adding some alternatives

At @AnandaMahto’s suggestion I am including some more options, including one that appears to be a new winner! To save time I excluded the slowest options above in the new comparison. Here’s the new code:

library(iotools)
    
microbenchmark(
  times = 5L,
  readr = read_fwf("testfwf.txt", fwf_widths(diff(endpoints))),
  fread = {
    DT = fread("testfwf.txt", header = FALSE, sep = "\n")
    DT[ , lapply(seq_len(length(cols$beg)), function(ii) {
      stri_sub(V1, cols$beg[ii], cols$end[ii])
    })]
  },
  iotools = input.file(
    "testfwf.txt", formatter = dstrfw, 
    col_types = rep("character", length(endpoints) - 1L), 
    widths = diff(endpoints)
  ),
  awk = fread(header = FALSE, cmd = sprintf(
    "awk -v FIELDWIDTHS='%s' -v OFS=', ' '{$1=$1 \"\"; print}' < testfwf.txt",
    paste(diff(endpoints), collapse = " ")
  ))
)

And the new output:

# Unit: seconds
#     expr       min        lq      mean    median        uq       max neval cld
#    readr  7.892527  8.016857 10.293371  9.527409  9.807145 16.222916     5  a 
#    fread  9.652377  9.696135  9.796438  9.712686  9.807830 10.113160     5  a 
#  iotools  5.900362  7.591847  7.438049  7.799729  7.845727  8.052579     5  a 
#      awk 14.440489 14.457329 14.637879 14.472836 14.666587 15.152156     5   b

So it appears iotools is both very fast and very consistent.

Leave a Comment