How to read utf16 text file to string in golang?

The latest version of golang.org/x/text/encoding/unicode makes it easier to do this because it includes unicode.BOMOverride, which will intelligently interpret the BOM.

Here is ReadFileUTF16(), which is like os.ReadFile() but decodes UTF-16.

package main

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "log"
    "strings"

    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// Similar to ioutil.ReadFile() but decodes UTF-16.  Useful when
// reading data from MS-Windows systems that generate UTF-16BE files,
// but will do the right thing if other BOMs are found.
func ReadFileUTF16(filename string) ([]byte, error) {

    // Read the file into a []byte:
    raw, err := ioutil.ReadFile(filename)
    if err != nil {
        return nil, err
    }

    // Make an tranformer that converts MS-Win default to UTF8:
    win16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
    // Make a transformer that is like win16be, but abides by BOM:
    utf16bom := unicode.BOMOverride(win16be.NewDecoder())

    // Make a Reader that uses utf16bom:
    unicodeReader := transform.NewReader(bytes.NewReader(raw), utf16bom)

    // decode and print:
    decoded, err := ioutil.ReadAll(unicodeReader)
    return decoded, err
}

func main() {
    data, err := ReadFileUTF16("inputfile.txt")
    if err != nil {
        log.Fatal(err)
    }
    final := strings.Replace(string(data), "\r\n", "\n", -1)
    fmt.Println(final)

}

Here is NewScannerUTF16 which is like os.Open() but returns a scanner.

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"

    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

type utfScanner interface {
    Read(p []byte) (n int, err error)
}

// Creates a scanner similar to os.Open() but decodes the file as UTF-16.
// Useful when reading data from MS-Windows systems that generate UTF-16BE
// files, but will do the right thing if other BOMs are found.
func NewScannerUTF16(filename string) (utfScanner, error) {

    // Read the file into a []byte:
    file, err := os.Open(filename)
    if err != nil {
        return nil, err
    }

    // Make an tranformer that converts MS-Win default to UTF8:
    win16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
    // Make a transformer that is like win16be, but abides by BOM:
    utf16bom := unicode.BOMOverride(win16be.NewDecoder())

    // Make a Reader that uses utf16bom:
    unicodeReader := transform.NewReader(file, utf16bom)
    return unicodeReader, nil
}

func main() {

    s, err := NewScannerUTF16("inputfile.txt")
    if err != nil {
        log.Fatal(err)
    }

    scanner := bufio.NewScanner(s)
    for scanner.Scan() {
        fmt.Println(scanner.Text()) // Println will add back the final '\n'
    }
    if err := scanner.Err(); err != nil {
        fmt.Fprintln(os.Stderr, "reading inputfile:", err)
    }

}

FYI: I have put these functions into an open source module and have made further improvements. See https://github.com/TomOnTime/utfutil/

Leave a Comment