Python UTF-16 CSV reader

At the moment, the csv module does not support UTF-16.

In Python 3.x, csv expects a text-mode file and you can simply use the encoding parameter of open to force another encoding:

# Python 3.x only
import csv
with open('utf16.csv', 'r', encoding='utf16') as csvf:
    for line in csv.reader(csvf):
        print(line) # do something with the line

In Python 2.x, you can recode the input:

# Python 2.x only
import codecs
import csv

class Recoder(object):
    def __init__(self, stream, decoder, encoder, eol="\r\n"):
        self._stream = stream
        self._decoder = decoder if isinstance(decoder, codecs.IncrementalDecoder) else codecs.getincrementaldecoder(decoder)()
        self._encoder = encoder if isinstance(encoder, codecs.IncrementalEncoder) else codecs.getincrementalencoder(encoder)()
        self._buf=""
        self._eol = eol
        self._reachedEof = False

    def read(self, size=None):
        r = self._stream.read(size)
        raw = self._decoder.decode(r, size is None)
        return self._encoder.encode(raw)

    def __iter__(self):
        return self

    def __next__(self):
        if self._reachedEof:
            raise StopIteration()
        while True:
            line,eol,rest = self._buf.partition(self._eol)
            if eol == self._eol:
                self._buf = rest
                return self._encoder.encode(line + eol)
            raw = self._stream.read(1024)
            if raw == '':
                self._decoder.decode(b'', True)
                self._reachedEof = True
                return self._encoder.encode(self._buf)
            self._buf += self._decoder.decode(raw)
    next = __next__

    def close(self):
        return self._stream.close()

with open('test.csv','rb') as f:
    sr = Recoder(f, 'utf-16', 'utf-8')

    for row in csv.reader(sr):
        print (row)

open and codecs.open require the file to start with a BOM. If it doesn’t (or you’re on Python 2.x), you can still convert it in memory, like this:

try:
    from io import BytesIO
except ImportError: # Python < 2.6
    from StringIO import StringIO as BytesIO
import csv
with open('utf16.csv', 'rb') as binf:
    c = binf.read().decode('utf-16').encode('utf-8')
for line in csv.reader(BytesIO(c)):
    print(line) # do something with the line

Leave a Comment