read line by line in the most efficient way *platform specific*

Doing read(2) is not very good because of the fragmentation of the lines (e.g. if you read 1000 chars, the last line may start at offset 990 and need 50 chars beyond the buffer).

Better to use fgets [recommended] or mmap [YMMV]. Here are examples of each. Caveat: compiles, but not tested, and doesn’t do much error checking

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>

// FileL -- one line of a text file, as produced by the line readers below
typedef struct {
    unsigned int lidx;      // zero-based index of the line within the file
    char *lncontent;        // malloc'ed, NUL-terminated copy of the line text
} FileL;

// read in file lines using fgets
//
// Reads the text file named by "file" and returns a malloc'ed array with one
// FileL entry per line.  Each entry holds the zero-based line index and a
// malloc'ed, NUL-terminated copy of the line (trailing newline stripped).
//
// Returns NULL if the file cannot be opened, if it contains no lines, or if
// memory runs out (in which case everything allocated so far is released).
//
// NOTE: a line longer than the local buffer (50000 bytes including the NUL)
// is returned by fgets in pieces, so it shows up as several entries.
// NOTE: the number of entries is not reported to the caller.
FileL *
getfileline_fgets(char *file)
{
    FILE *xf;
    char *cp;
    size_t len;
    int linecnt;
    int linemax;
    FileL *linelist;
    FileL *newlist;
    FileL *line;
    char lbuf[50000];

    xf = fopen(file,"r");
    if (xf == NULL)
        return NULL;

    linecnt = 0;
    linemax = 0;
    linelist = NULL;

    while (fgets(lbuf,sizeof(lbuf),xf) != NULL) {
        len = strlen(lbuf);

        // strip newline from string [or not]
#if 1
        if ((len > 0) && (lbuf[len - 1] == '\n'))
            lbuf[--len] = 0;
#endif

        // grow the list in chunks of 100 entries; keep the old pointer until
        // realloc succeeds so it can still be released on failure
        if ((linecnt + 1) > linemax) {
            newlist = realloc(linelist,(size_t) (linemax + 100) * sizeof(FileL));
            if (newlist == NULL)
                goto fail;
            linelist = newlist;
            linemax += 100;
        }

        // copy the text together with its terminating NUL
        cp = malloc(len + 1);
        if (cp == NULL)
            goto fail;
        memcpy(cp,lbuf,len + 1);

        line = &linelist[linecnt];
        line->lidx = linecnt++;
        line->lncontent = cp;
    }

    fclose(xf);

    // trim to maximum used [a failed shrink just leaves the larger block]
    if (linecnt > 0) {
        newlist = realloc(linelist,(size_t) linecnt * sizeof(FileL));
        if (newlist != NULL)
            linelist = newlist;
    }

    return linelist;

fail:
    while (linecnt > 0)
        free(linelist[--linecnt].lncontent);
    free(linelist);
    fclose(xf);

    return NULL;
}

// read in file lines by mmap to entire file
//
// Maps the whole file named by "file" read-only and returns a malloc'ed array
// with one FileL entry per line.  Each entry holds the zero-based line index
// and a malloc'ed, NUL-terminated copy of the line (newline stripped).  A
// final line that lacks a newline is returned as well.
//
// Returns NULL if the file cannot be opened, examined or mapped, if it is
// empty, or if memory runs out (in which case everything allocated so far is
// released).
//
// NOTE: the number of entries is not reported to the caller.
FileL *
getfilelines_mmap(char *file)
{
    int fd;
    char *lhs;
    char *rhs;
    char *end;
    char *cp;
    size_t len;
    size_t maplen;
    int linecnt;
    int linemax;
    FileL *linelist;
    FileL *newlist;
    FileL *line;
    struct stat st;
    char *fbuf;

    fd = open(file,O_RDONLY);
    if (fd < 0)
        return NULL;

    // a zero length mapping is rejected by mmap, so treat an empty file as
    // "no lines" up front
    if ((fstat(fd,&st) < 0) || (st.st_size <= 0)) {
        close(fd);
        return NULL;
    }
    maplen = (size_t) st.st_size;

    fbuf = mmap(NULL,maplen,PROT_READ,MAP_PRIVATE,fd,0);
    if (fbuf == MAP_FAILED) {
        close(fd);
        return NULL;
    }

    linecnt = 0;
    linemax = 0;
    linelist = NULL;

    end = fbuf + maplen;

    for (lhs = fbuf;  lhs < end;  ) {
        // the mapping is _not_ NUL terminated, so the search must be bounded
        // by the bytes that remain [strchr could run off the end of the map]
        rhs = memchr(lhs,'\n',(size_t) (end - lhs));

        // last line has no newline -- it ends at end of file
        if (rhs == NULL)
            rhs = end;

        // strip newline from string [or not]
#if 1
        // rhs points at the newline, so this length already excludes it
        len = (size_t) (rhs - lhs);
#else
        len = (size_t) (rhs - lhs);
        if (rhs < end)
            ++len;
#endif

        // grow the list in chunks of 100 entries; keep the old pointer until
        // realloc succeeds so it can still be released on failure
        if ((linecnt + 1) > linemax) {
            newlist = realloc(linelist,(size_t) (linemax + 100) * sizeof(FileL));
            if (newlist == NULL)
                goto fail;
            linelist = newlist;
            linemax += 100;
        }

        cp = malloc(len + 1);
        if (cp == NULL)
            goto fail;
        memcpy(cp,lhs,len);
        cp[len] = 0;

        line = &linelist[linecnt];
        line->lidx = linecnt++;
        line->lncontent = cp;

        // step over the newline [if there was one]
        lhs = (rhs < end) ? rhs + 1 : end;
    }

    munmap(fbuf,maplen);
    close(fd);

    // trim to maximum used [a failed shrink just leaves the larger block]
    if (linecnt > 0) {
        newlist = realloc(linelist,(size_t) linecnt * sizeof(FileL));
        if (newlist != NULL)
            linelist = newlist;
    }

    return linelist;

fail:
    while (linecnt > 0)
        free(linelist[--linecnt].lncontent);
    free(linelist);
    munmap(fbuf,maplen);
    close(fd);

    return NULL;
}

UPDATE

You wanted benchmarks. Well, you’re gonna get ’em. I generated 167GB of random text data, spanning 140 files. For each file, L: is the number of lines, W: is the maximum line width, and the last field is the file size in MB. Times are in nanoseconds. Note that the speedup factor varies from file to file, but it looks like mmap wins.

23:39:35.528333425 NEWDAY 11/09/15
23:39:35.528333425 ph: starting 23107 ...
23:39:35.528868198 ph: ARGV fastreadgo ...

F001: L:324255 W:2097 324.086MB
    368297556 fgets
    189180143 mmap

F002: L:329608 W:2822 443.649MB
    475989122 fgets
    248517335 mmap

F003: L:401476 W:6186 1185.270MB
   1206999411 fgets
    657703847 mmap

F004: L:729379 W:9350 3253.185MB
   3199692871 fgets
   1776602082 mmap

F005: L:85857 W:5185 212.599MB
    223489564 fgets
    122404608 mmap

F006: L:62871 W:5418 162.384MB
    167640768 fgets
     93127042 mmap

F007: L:298836 W:1083 154.481MB
    196584474 fgets
    100582134 mmap

F008: L:221513 W:2732 288.694MB
    322105867 fgets
    164965547 mmap

F009: L:420815 W:8906 1789.672MB
   1801309998 fgets
    961136893 mmap

F010: L:126712 W:8251 498.905MB
    499274233 fgets
    275901635 mmap

F011: L:443166 W:8822 1865.753MB
   1839816883 fgets
   1001882651 mmap

F012: L:385632 W:2162 398.467MB
    467223648 fgets
    248126909 mmap

F013: L:629448 W:4413 1324.616MB
   1432284339 fgets
    777593198 mmap

F014: L:510357 W:7313 1779.348MB
   1919309671 fgets
   1079111734 mmap

F015: L:188434 W:1254 112.922MB
    152367682 fgets
     78959769 mmap

F016: L:82139 W:4355 170.586MB
    193117015 fgets
    105417805 mmap

F017: L:389499 W:9063 1681.805MB
   1730894028 fgets
    913789253 mmap

F018: L:992849 W:3265 1547.875MB
   1685006767 fgets
    875256226 mmap

F019: L:931502 W:9647 4285.883MB
  11181005402 fgets
   2361255543 mmap

F020: L:266047 W:7454 946.298MB
    955772708 fgets
    537059554 mmap

F021: L:572709 W:67 18.835MB
     86539501 fgets
     43437303 mmap

F022: L:68373 W:3042 98.684MB
    110325296 fgets
     57538963 mmap

F023: L:651839 W:2006 624.153MB
    706094723 fgets
    369122560 mmap

F024: L:414658 W:6482 1284.202MB
   1294352248 fgets
    700279769 mmap

F025: L:984554 W:3441 1616.269MB
   1742233370 fgets
    903755131 mmap

F026: L:527629 W:3214 808.812MB
    872660092 fgets
    465403685 mmap

F027: L:572103 W:6219 1696.582MB
   1758562312 fgets
    933024466 mmap

F028: L:793354 W:5967 2255.653MB
   2341754885 fgets
   1251633414 mmap

F029: L:690669 W:389 128.888MB
    230036016 fgets
    119381427 mmap

F030: L:902519 W:8182 3523.415MB
   6665490426 fgets
   1930049511 mmap

F031: L:179482 W:2361 201.850MB
    225333697 fgets
    120424715 mmap

F032: L:342396 W:4135 675.885MB
    706219203 fgets
    379974402 mmap

F033: L:762237 W:4000 1455.780MB
   1535236381 fgets
    805977762 mmap

F034: L:421947 W:8289 1669.038MB
   1686877811 fgets
    900813641 mmap

F035: L:367349 W:5829 1022.373MB
   1051584165 fgets
    566680706 mmap

F036: L:433973 W:5064 1049.724MB
   1097920811 fgets
    584855289 mmap

F037: L:615918 W:9152 2686.372MB
   2743719787 fgets
   1468536802 mmap

F038: L:365187 W:1564 272.829MB
    326368364 fgets
    171071840 mmap

F039: L:61305 W:477 14.002MB
     22945438 fgets
     11949833 mmap

F040: L:396788 W:8576 1622.049MB
   1633217001 fgets
    884460205 mmap

F041: L:245326 W:5068 592.450MB
    610530077 fgets
    328366102 mmap

F042: L:986409 W:9174 4313.608MB
  17048484450 fgets
   2413375121 mmap

F043: L:367968 W:9703 1703.785MB
   1677764299 fgets
    922735827 mmap

F044: L:630679 W:9763 2942.911MB
   4742195305 fgets
   1585438052 mmap

F045: L:397072 W:7717 1459.554MB
   1533634531 fgets
    860518182 mmap

F046: L:918129 W:9127 3996.179MB
  10259712214 fgets
   2171550789 mmap

F047: L:770706 W:2720 999.584MB
   1097599308 fgets
    604894013 mmap

F048: L:472462 W:5011 1127.896MB
   1164186449 fgets
    621979909 mmap

F049: L:301834 W:4456 642.703MB
    664420452 fgets
    354255131 mmap

F050: L:213878 W:2913 297.159MB
    321396955 fgets
    168664579 mmap

F051: L:549950 W:1681 441.842MB
    510553173 fgets
    260455948 mmap

F052: L:63502 W:8785 267.074MB
    265697002 fgets
    142457939 mmap

F053: L:880396 W:6821 2864.595MB
   3769485430 fgets
   1591318886 mmap

F054: L:180543 W:9055 779.566MB
    773462618 fgets
    428627500 mmap

F055: L:964409 W:8454 3884.437MB
   9085108760 fgets
   2149540695 mmap

F056: L:675120 W:8912 2872.781MB
   2885159527 fgets
   1559580604 mmap

F057: L:345151 W:4157 684.052MB
    724456228 fgets
    387170980 mmap

F058: L:69114 W:4585 150.535MB
    157447952 fgets
     84782951 mmap

F059: L:304627 W:9441 1370.777MB
   1376517664 fgets
    739170571 mmap

F060: L:799770 W:3145 1200.762MB
   1304001986 fgets
    679163462 mmap

F061: L:808699 W:6544 2523.949MB
   2590924710 fgets
   1385627164 mmap

F062: L:270082 W:313 40.592MB
     78777863 fgets
     40733146 mmap

F063: L:308883 W:333 49.262MB
     93696361 fgets
     48580067 mmap

F064: L:237002 W:2618 296.446MB
    347315129 fgets
    178078149 mmap

F065: L:279040 W:1341 178.685MB
    217230537 fgets
    113291912 mmap

F066: L:809386 W:2808 1085.734MB
   1177480987 fgets
    615248653 mmap

F067: L:279448 W:8560 1140.280MB
   1151044788 fgets
    614662533 mmap

F068: L:80012 W:7441 283.334MB
    286915203 fgets
    158077955 mmap

F069: L:366808 W:7197 1260.521MB
   1292679736 fgets
    696686301 mmap

F070: L:272693 W:9275 1206.527MB
   1220763889 fgets
    658383413 mmap

F071: L:792609 W:1419 537.088MB
    645760162 fgets
    334886975 mmap

F072: L:742523 W:8640 3059.604MB
   5711688133 fgets
   1665879727 mmap

F073: L:583753 W:2992 833.759MB
    910037328 fgets
    483847376 mmap

F074: L:252560 W:7178 864.593MB
    868625985 fgets
    471770777 mmap

F075: L:154327 W:7026 515.619MB
    516135586 fgets
    277690063 mmap

F076: L:121839 W:7131 414.684MB
    424518600 fgets
    230518357 mmap

F077: L:760327 W:1421 515.475MB
    622630358 fgets
    314592959 mmap

F078: L:907033 W:3485 1508.042MB
   1622356297 fgets
    845695719 mmap

F079: L:884787 W:7491 3162.774MB
   4932864122 fgets
   1749065509 mmap

F080: L:432556 W:6039 1245.779MB
   1281231973 fgets
    693532807 mmap

F081: L:639804 W:6419 1957.747MB
   2107303517 fgets
   1130299002 mmap

F082: L:388669 W:283 52.804MB
    111686630 fgets
     57177517 mmap

F083: L:300542 W:1943 278.825MB
    336538347 fgets
    177494803 mmap

F084: L:941 W:3878 1.770MB
      2047540 fgets
      1347230 mmap

F085: L:85747 W:1841 75.417MB
     92274672 fgets
     49362653 mmap

F086: L:935559 W:5950 2656.411MB
   2734326131 fgets
   1487147776 mmap

F087: L:936993 W:1197 535.727MB
    672872562 fgets
    348765250 mmap

F088: L:409671 W:5235 1023.358MB
   1099320520 fgets
    606047909 mmap

F089: L:362220 W:5434 938.805MB
    991448256 fgets
    529093412 mmap

F090: L:628156 W:3682 1103.909MB
   1185317812 fgets
    637902310 mmap

F091: L:655456 W:6051 1892.574MB
   1978859918 fgets
   1066368241 mmap

F092: L:356309 W:5946 1012.893MB
   1046818030 fgets
    562463577 mmap

F093: L:878726 W:2946 1236.162MB
   1368885560 fgets
    701514499 mmap

F094: L:583863 W:747 208.701MB
    293177923 fgets
    148045230 mmap

F095: L:51374 W:3752 91.670MB
     98830853 fgets
     52715699 mmap

F096: L:757271 W:4698 1698.664MB
   1790811621 fgets
    946452098 mmap

F097: L:665420 W:1814 575.369MB
    664290848 fgets
    347346293 mmap

F098: L:152806 W:4480 326.336MB
    338683910 fgets
    185037896 mmap

F099: L:39027 W:2368 44.104MB
     49144948 fgets
     26701307 mmap

F100: L:896926 W:8209 3513.328MB
   7460727008 fgets
   1900543480 mmap

F101: L:796628 W:5663 2149.888MB
   2207899454 fgets
   1187397751 mmap

F102: L:876500 W:1986 831.161MB
    934850175 fgets
    486626065 mmap

F103: L:188682 W:773 69.722MB
     97285527 fgets
     48765985 mmap

F104: L:648920 W:9590 2969.446MB
   5021968784 fgets
   1622268239 mmap

F105: L:827850 W:2123 837.892MB
    946063144 fgets
    498163978 mmap

F106: L:879828 W:2867 1205.021MB
   1304295176 fgets
    682155187 mmap

F107: L:970674 W:3830 1771.667MB
   1883664162 fgets
    989569477 mmap

F108: L:4461 W:5634 11.840MB
     12680659 fgets
      7159011 mmap

F109: L:477207 W:1067 243.224MB
    315370392 fgets
    162708299 mmap

F110: L:140308 W:5817 389.132MB
    397510757 fgets
    216204659 mmap

F111: L:253358 W:4425 534.937MB
    559943651 fgets
    297109524 mmap

F112: L:903292 W:7989 3441.851MB
   7327033977 fgets
   1906200470 mmap

F113: L:555989 W:620 164.835MB
    245638038 fgets
    126559933 mmap

F114: L:596425 W:2330 664.143MB
    739017073 fgets
    391237002 mmap

F115: L:298147 W:9741 1387.530MB
   1363229979 fgets
    744420477 mmap

F116: L:180269 W:4522 389.175MB
    402702977 fgets
    213875684 mmap

F117: L:238597 W:9021 1029.314MB
   1033070395 fgets
    550442036 mmap

F118: L:183723 W:8705 764.555MB
    765959712 fgets
    413667801 mmap

F119: L:174802 W:549 45.896MB
     70635625 fgets
     35721310 mmap

F120: L:883013 W:4666 1963.677MB
   2062197751 fgets
   1092583730 mmap

F121: L:858995 W:9218 3776.896MB
   9278222413 fgets
   2309240152 mmap

F122: L:368895 W:5862 1030.174MB
   1076473726 fgets
    582127460 mmap

F123: L:208043 W:5672 563.889MB
    579427255 fgets
    310321934 mmap

F124: L:768482 W:4953 1816.657MB
   1888233155 fgets
    997797932 mmap

F125: L:905425 W:2812 1214.882MB
   1394928053 fgets
    724059403 mmap

F126: L:54137 W:4690 121.066MB
    125124760 fgets
     67811537 mmap

F127: L:448100 W:9643 2061.624MB
   2066282543 fgets
   1126488038 mmap

F128: L:748979 W:2111 754.038MB
    854095589 fgets
    447406977 mmap

F129: L:611388 W:6954 2026.306MB
   2074219917 fgets
   1118353849 mmap

F130: L:782834 W:9946 3715.067MB
   7338500374 fgets
   2029571615 mmap

F131: L:52630 W:7858 197.495MB
    200711062 fgets
    110759659 mmap

F132: L:930983 W:7363 3270.546MB
   3376813502 fgets
   1776365395 mmap

F133: L:73216 W:2127 74.344MB
     85854537 fgets
     46756335 mmap

F134: L:583306 W:2495 694.192MB
    766430638 fgets
    408095226 mmap

F135: L:877424 W:2964 1241.342MB
   1339005805 fgets
    702659289 mmap

F136: L:414854 W:5104 1010.006MB
   1057372341 fgets
    556583887 mmap

F137: L:333176 W:4912 781.109MB
    820007572 fgets
    435433956 mmap

F138: L:564006 W:6933 1863.574MB
   1905024687 fgets
   1030574213 mmap

F139: L:829571 W:9152 3622.399MB
   7338698902 fgets
   2002428493 mmap

F140: L:560210 W:7443 1990.047MB
   2012670010 fgets
   1098720143 mmap

00:00:58.770988225 NEWDAY 11/10/15
00:00:58.770988225 ph: complete (ELAPSED: 00:21:23.190149545)

Here’s the perl script I used to generate the files:

#!/usr/bin/perl
# grpcntgen -- generate test data for fastread algorithms
#
# arguments:
#   "-W" - maximum line width
#   "-L" - maximum number of lines
#   "-T" - number of test files to generate
#   "-O" - output file (e.g. foo%.txt)
#
# NOTE: with no arguments or missing arguments will prompt

#pragma pgmlns
# tstgen -- test generation help routines

# gengetstr -- get a string/number
#
# Looks for the option on the command line first (the caller's @argv, e.g.
# "-W100").  If it is not there, prompts on STDOUT and reads answers from
# STDIN until an acceptable one is given.  A value taken from the command
# line is echoed as "prompt: value".
#
# arguments:
#   numflg -- 0=non-empty string, 1=0/1 question (any unsigned integer is
#             accepted), 2=integer greater than zero
#   opt -- option prefix to look for (e.g. "-W"), matched literally
#   prompt -- text shown to the user
#   lim -- optional maximum for numflg=2; also the value used when a numeric
#          option has no value on the command line (1 if lim is not given)
#          or when the answer to a numflg=2 prompt is empty
#
# returns: the value found or entered
# dies: if STDIN reaches end-of-file before an acceptable answer was read
sub gengetstr
{
    my($numflg,$opt,$prompt,$lim) = @_;
    my($arg);
    my($askflg);
    my($val);
    my($eof);

    select(STDOUT);
    $| = 1;

    {
        # search command line for -whatever
        foreach $arg (@argv) {
            # \Q..\E: the option name is literal text, not a pattern
            if ($arg =~ /^\Q$opt\E(.*)$/) {
                $val = $1;
                if ($numflg && ($val eq "")) {
                    $val = $lim;
                    $val //= 1;
                }
                last;
            }
        }
        last if (defined($val));

        $askflg = 1;

        while (1) {
            printf("Enter ")
                if ($numflg != 1);

            printf("%s",$prompt);

            if ($numflg == 1) {
                printf(" (0/1)? ");
            }
            else {
                printf(": ");
            }

            # end-of-file reads as an empty answer, but is remembered so a
            # rejected answer does not re-prompt forever
            $val = <STDIN>;
            $eof = ! defined($val);
            $val = ""
                if ($eof);
            chomp($val);

            if ($numflg == 0) {
                last if ($val ne "");
            }

            # an empty response for a number with a maximum means use it
            elsif (($numflg == 2) && ($val eq "") && defined($lim)) {
                $val = $lim;
                last;
            }

            elsif ($val =~ /^\d+$/) {
                $val += 0;

                last if ($numflg == 1);

                if ($val > 0) {
                    last unless (defined($lim));
                    last if ($val <= $lim);
                }
            }

            # answer was rejected -- with no more input, asking again can
            # never succeed
            die("gengetstr: unexpected EOF on STDIN reading '$prompt'\n")
                if ($eof);
        }
    }

    unless ($askflg) {
        printf("%s: %s\n",$prompt,$val);
    }

    $val;
}

# genrun -- generate all tests
#
# arguments (after any leading -options, which genvbq strips off):
#   output file name, number of tests, reference to the test routine
#
# When the output name contains "%", every test goes to its own file: the
# "%" becomes "_" plus the zero-padded test number.  Otherwise all tests are
# written to the one named file.
#
# The variables are local() rather than my() on purpose: the test routine
# and genopen read $tstcur, $tstmax and $genvbq through dynamic scoping.
sub genrun
{
    local(@argv) = @_;
    local($ofile,$tstmax,$tstproc);
    local($tstcur);
    local($splitflg);
    local($genvbq);
    my($digits);
    my($curfile);

    $genvbq = genvbq(\@argv);

    ($ofile,$tstmax,$tstproc) = splice(@argv,0,3);

    # split each test into separate file
    if ($ofile =~ /%/) {
        $splitflg = 1;

        # pad the test number to the width of the largest one
        $digits = length(sprintf("%d",$tstmax));
        $digits = sprintf("_%%%d.%dd",$digits,$digits);
        $ofile =~ s/%/$digits/;
    }

    # single output file: open once before the first test
    genopen($ofile)
        unless ($splitflg);

    for ($tstcur = 1;  $tstcur <= $tstmax;  ++$tstcur) {
        if ($splitflg) {
            $curfile = sprintf($ofile,$tstcur);
            genopen($curfile);
        }

        $tstproc->();

        genclose()
            if ($splitflg);
    }

    # single output file: close once after the last test
    genclose()
        unless ($splitflg);
}

# genvbq -- get options
#
# Removes the leading "-name" / "-name=value" entries from the front of the
# given argument list (modifying it in place) and stops at the first entry
# that does not start with "-".
#
# arguments:
#   argv -- reference to the argument list
#
# returns: hash reference mapping each option name (without the "-") to its
#          value, or to 1 when no "=value" was given
sub genvbq
{
    my($argv) = @_;
    my($sym);
    my($val);
    my($env);

    $env = {};

    while (@$argv > 0) {
        $sym = $argv->[0];
        last unless (defined($sym) && ($sym =~ s/^-//));
        shift(@$argv);

        if ($sym =~ /^([^=]+)=(.+)$/) {
            ($sym,$val) = ($1,$2);
        }
        else {
            $val = 1;
        }

        $env->{$sym} = $val;
    }

    $env;
}

# genopen -- open output
#
# Remembers the name in $gen_ofile and opens it for writing on $xfdst.
# Nothing is opened when the "n" option is set in $genvbq.
#
# arguments:
#   ofile -- output file name
#
# dies: if the file can not be opened
sub genopen
{
    my($ofile) = @_;

    $gen_ofile = $ofile;

    {
        last if ($genvbq->{"n"});

        # three argument open: the name is used exactly as given [the two
        # argument form strips whitespace and honors mode characters in it]
        last if (open($xfdst,">",$ofile));
        die("genopen: unable to open '$ofile' -- $!\n");
    }
}

# genclose -- close output
#
# Closes $xfdst.  Does nothing if the handle is not open (e.g. the output was
# never opened or is already closed).
#
# dies: if the close fails -- buffered write errors (such as a full disk)
#       only show up here, and the output would otherwise be silently short
sub genclose
{

    return
        unless (defined($xfdst) && defined(fileno($xfdst)));

    close($xfdst)
        or die("genclose: unable to close '$gen_ofile' -- $!\n");
}

# geninit -- initialize for single test
#
# Resets the genout state: discards the pending output text and the line
# prefix.
sub geninit
{

    $genout_lhs = undef;
    $genout_pre = undef;
}

# genout -- output data
#
# Collects text fragments in $genout_lhs.  When the next fragment would bring
# the line (prefix $genout_pre + pending text + fragment) to 78 characters or
# more, the pending text is first written to $xfdst as one line.  A call
# without a defined argument writes out whatever is pending.
sub genout
{
    my($frag) = @_;
    my($flush);

    # a fragment that still fits on the pending line needs no flush
    $flush = 1;
    if (defined($frag)) {
        $flush = 0
            if ((length($genout_pre) + length($genout_lhs) +
                length($frag)) < 78);
    }

    # nothing pending means nothing to write
    $flush = 0
        if ($genout_lhs eq "");

    if ($flush) {
        print($xfdst $genout_pre,$genout_lhs,"\n");
        undef($genout_lhs);
    }

    $genout_lhs .= $frag
        if (defined($frag));
}

# genrand -- get random number
#
# arguments:
#   lim -- upper limit
#
# returns: random integer in the range 1..lim
sub genrand
{
    my($lim) = @_;

    return 1 + int(rand($lim));
}

# genfmtof -- get number format
#
# arguments:
#   num -- largest number that will be printed
#
# returns: printf format (" %Nd") whose field width N is the number of
#          characters num takes when printed with %d
sub genfmtof
{
    my($num) = @_;
    my($width);

    $width = length(sprintf("%d",$num));

    return sprintf(" %%%dd",$width);
}

# NOTE(review): presumably the end-of-file true value left over from the
# tstgen helper section above -- it has no effect here; confirm
1;

# program entry: run the generator on the command line arguments, then exit
# with success [master is defined below]
master(@ARGV);
exit(0);

# master -- master control
#
# Collects the generation parameters -- from the command line arguments it is
# given, else by prompting -- and runs the generator with dotest as the test
# routine.  The parameters are kept in globals ($Wmax, $Lmax, $tstmax,
# $ofile); dotest reads $Wmax and $Lmax.
#
# arguments: the command line (-W, -L, -T and -O options)
#
# dies: if the -O name given on the command line has no "%" in it
sub master
{
    local(@argv) = @_;

    $Wmax = gengetstr(2,"-W","maximum line width");
    $Lmax = gengetstr(2,"-L","maximum number of lines / file");
    $tstmax = gengetstr(2,"-T","number of tests");

    while (1) {
        $ofile = gengetstr(0,"-O","output file name");
        last if ($ofile =~ /%/);
        printf("fastreadgen: filename must have %% in it (e.g. foo%%.txt)\n");

        # a name from the command line would just be returned again on the
        # next pass, so retrying can never succeed -- only an interactive
        # answer is worth asking for again
        die("fastreadgen: -O value '$ofile' has no % in it\n")
            if (grep { /^-O/ } @argv);
    }

    genrun($ofile,$tstmax,\&dotest);
}

# dotest -- generate a test
#
# Writes one test file to $xfdst: picks a line width limit (1..$Wmax) and a
# line count (1..$Llim from 1..$Lmax) at random, then writes that many
# lines.  Each line has a random length below the width limit and consists
# of a single randomly chosen character (0x20..0x7D) repeated -- only one
# character is drawn per line rather than one per column.
sub dotest
{
    my($lno,$chroff);
    my($lhs);

    $Wlim = genrand($Wmax);
    $Llim = genrand($Lmax);

    printf("dotest: T=%d/%d W=%d L=%d\n",$tstcur,$tstmax,$Wlim,$Llim);

    for ($lno = 1;  $lno <= $Llim;  ++$lno) {
        # line length: 0..Wlim-1
        $Wcur = genrand($Wlim) - 1;

        # an empty line draws no character [the previous one is left in
        # place, but repeating it zero times yields the empty string]
        if ($Wcur > 0) {
            $chroff = chr(genrand(0x7E - 0x20) + 0x20 - 1);
        }

        $lhs = $chroff x $Wcur;

        print($xfdst $lhs,"\n");
    }
}

Leave a Comment