Python – Is a dictionary slow to find frequency of each character?

Question

Performance comparison

Note: time in the table doesn’t include the time it takes to load files.

| approach       | american-english, |      big.txt, | time w.r.t. defaultdict |
|                |     time, seconds | time, seconds |                         |
|----------------+-------------------+---------------+-------------------------|
| Counter        |             0.451 |         3.367 |                     3.6 |
| setdefault     |             0.348 |         2.320 |                     2.5 |
| list           |             0.277 |         1.822 |                       2 |
| try/except     |             0.158 |         1.068 |                     1.2 |
| defaultdict    |             0.141 |         0.925 |                       1 |
| numpy          |             0.012 |         0.076 |                   0.082 |
| S.Mark's ext.  |             0.003 |         0.019 |                   0.021 |
| ext. in Cython |             0.001 |         0.008 |                  0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g

The files used: '/usr/share/dict/american-english' and 'big.txt'.

The script that compares ‘Counter’, ‘setdefault’, ‘list’, ‘try/except’, ‘defaultdict’, ‘numpy’, ‘cython’ -based, and @S.Mark’s solutions is at http://gist.github.com/347000

The fastest solution is Python extension written in Cython:

import cython

@cython.locals(
    chars=unicode,
    i=cython.Py_ssize_t,
    L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
    for i in range(0x10000): # unicode code points > 0xffff are not supported
        L[i] = 0

    for c in chars:
        L[c] += 1

    return {unichr(i): L[i] for i in range(0x10000) if L[i]}

Previous comparison:

* python (dict) : 0.5  seconds
* python (list) : 0.5  (ascii) (0.2 if read whole file in memory)
* perl          : 0.5
* python (numpy): 0.07 
* c++           : 0.05
* c             : 0.008 (ascii)

Input data:

$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études

$ du -h /usr/share/dict/american-english
912K    /usr/share/dict/american-english

python (Counter): 0.5 seconds

#!/usr/bin/env python3.1
import collections, fileinput, textwrap

chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))

Run it:

$ time -p python3.1 count_char.py /usr/share/dict/american-english

Counter({'e': 87823, 's': 86620, 'i': 66548, 'a': 62778, 'n': 56696, 'r':
56286, 't': 51588, 'o': 48425, 'l': 39914, 'c': 30020, 'd': 28068, 'u': 25810,
"'": 24511, 'g': 22262, 'p': 20917, 'm': 20747, 'h': 18453, 'b': 14137, 'y':
12367, 'f': 10049, 'k': 7800, 'v': 7573, 'w': 6924, 'z': 3088, 'x': 2082, 'M':
1686, 'C': 1549, 'S': 1515, 'q': 1447, 'B': 1387, 'j': 1376, 'A': 1345, 'P':
974, 'L': 912, 'H': 860, 'T': 858, 'G': 811, 'D': 809, 'R': 749, 'K': 656, 'E':
618, 'J': 539, 'N': 531, 'W': 507, 'F': 502, 'O': 354, 'I': 344, 'V': 330, 'Z':
150, 'Y': 140, 'é': 128, 'U': 117, 'Q': 63, 'X': 42, 'è': 29, 'ö': 12, 'ü': 12,
'ó': 10, 'á': 10, 'ä': 7, 'ê': 6, 'â': 6, 'ñ': 6, 'ç': 4, 'å': 3, 'û': 3, 'í':
2, 'ô': 2, 'Å': 1})
real 0.44
user 0.43
sys 0.01

perl: 0.5 seconds

time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english

Output:

{'S' => 1515,'K' => 656,'' => 29,'d' => 28068,'Y' => 140,'E' => 618,'y' => 12367,'g' => 22262,'e' => 87823,'' => 2,'J' => 539,'' => 241,'' => 3,'' => 6,'' => 4,'' => 128,'D' => 809,'q' => 1447,'b' => 14137,'z' => 3088,'w' => 6924,'Q' => 63,'' => 10,'M' => 1686,'C' => 1549,'' => 10,'L' => 912,'X' => 42,'P' => 974,'' => 12,'\'' => 24511,'' => 6,'a' => 62778,'T' => 858,'N' => 531,'j' => 1376,'Z' => 150,'u' => 25810,'k' => 7800,'t' => 51588,'' => 6,'W' => 507,'v' => 7573,'s' => 86620,'B' => 1387,'H' => 860,'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' => 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,'R' => 749,'o' => 48425}
real 0.51
user 0.49
sys 0.02

python (numpy): 0.07 seconds

Based on Ants Aasma’s answer (modified to support unicode):

#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy

filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'

# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]

# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)

# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts  if c[0] not in ' \t\n')

Output:

("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01

c++: 0.05 seconds

// $ g++ *.cc -lboost_program_options 
// $ ./a.out /usr/share/dict/american-english    
#include <iostream>
#include <fstream>
#include <cstdlib> // exit

#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>

int main(int argc, char* argv[]) {
  using namespace std;

  // open input file
  if (argc != 2) {
    cerr << "Usage: " << argv[0] << " <filename>\n";
    exit(2);
  }
  wifstream f(argv[argc-1]); 

  // assume the file has utf-8 encoding
  locale utf8_locale(locale(""), 
      new boost::program_options::detail::utf8_codecvt_facet);
  f.imbue(utf8_locale); 

  // count characters frequencies
  typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;  
  hashtable_t counts;
  for (wchar_t ch; f >> ch; )
    counts[ch]++;
  
  // print result
  wofstream of("output.utf8");
  of.imbue(utf8_locale);
  BOOST_FOREACH(hashtable_t::value_type i, counts) 
    of << "(" << i.first << " " << i.second << ") ";
  of << endl;
}

Result:

$ cat output.utf8

(í 2) (O 354) (P 974) (Q 63) (R 749) (S 1,515) (ñ 6) (T 858) (U 117) (ó 10) (ô 2) (V 330) (W 507) (X 42) (ö 12) (Y 140) (Z 150) (û 3) (ü 12) (a 62,778) (b 14,137) (c 30,020) (d 28,068) (e 87,823) (f 10,049) (g 22,262) (h 18,453) (i 66,548) (j 1,376) (k 7,800) (l 39,914) (m 20,747) (n 56,696) (o 48,425) (p 20,917) (q 1,447) (r 56,286) (s 86,620) (t 51,588) (u 25,810) (Å 1) (' 24,511) (v 7,573) (w 6,924) (x 2,082) (y 12,367) (z 3,088) (A 1,345) (B 1,387) (C 1,549) (á 10) (â 6) (D 809) (E 618) (F 502) (ä 7) (å 3) (G 811) (H 860) (ç 4) (I 344) (J 539) (è 29) (K 656) (é 128) (ê 6) (L 912) (M 1,686) (N 531)

c (ascii): 0.0079 seconds

// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>

enum { N = 256 };
size_t counts[N];

int main(void) {
  // count characters
  int ch = -1;
  while((ch = getchar()) != EOF)
    ++counts[ch];
  
  // print result
  size_t i = 0;
  for (; i < N; ++i) 
    if (counts[i])
      printf("('%c' %zu) ", (int)i, counts[i]);
  return 0;
}