Storing numpy sparse matrix in HDF5 (PyTables)

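The answer by DaveP is almost right, but it can cause problems for very sparse matrices: if the last column(s) or row(s) are empty, they are dropped, because a matrix rebuilt from “data”, “indices” and “indptr” alone has its shape inferred from the stored index arrays. So, to be sure that everything works, the “shape” attribute must be stored too.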

This is the code I regularly use:

import tables as tb
from numpy import array
from scipy import sparse

def store_sparse_mat(m, name, store="store.h5"):
    """Write the CSR matrix ``m`` into the HDF5 file ``store`` under ``name``.

    Four arrays are created as children of the file's root group, named
    ``<name>_data``, ``<name>_indices``, ``<name>_indptr`` and
    ``<name>_shape``.  A node that already exists under one of those names
    is removed first, so storing again under the same ``name`` overwrites
    the previous matrix.

    Args:
        m: The matrix to store; must be a ``scipy.sparse.csr_matrix``
            (subclasses are accepted).
        name: Prefix used for the four node names.
        store: Path of the HDF5 file; it is opened in ``'a'`` mode.

    Raises:
        AssertionError: If ``m`` is not a CSR matrix.
    """
    msg = "This code only works for csr matrices"
    # Raised explicitly instead of via ``assert`` so the check is not stripped
    # under ``python -O``; the exception type is kept for existing callers.
    # ``sparse.csr_matrix`` is the public name of the class.
    if not isinstance(m, sparse.csr_matrix):
        raise AssertionError(msg)

    # Prefer the PEP 8 names of newer PyTables releases and fall back to the
    # camelCase names of PyTables 2.x, so the function runs on either.
    open_file = getattr(tb, "open_file", None) or tb.openFile
    with open_file(store, 'a') as f:
        create_carray = getattr(f, "create_carray", None) or f.createCArray
        for par in ('data', 'indices', 'indptr', 'shape'):
            full_name = "%s_%s" % (name, par)

            # Only the lookup is guarded: a missing node is expected on the
            # first write, while a failure during removal should surface.
            try:
                existing = getattr(f.root, full_name)
            except AttributeError:
                pass
            else:
                existing._f_remove()

            # ``shape`` is a tuple, the other attributes are already arrays;
            # ``array`` gives all four a dtype and a shape.
            # NOTE(review): a matrix with no stored entries has zero-length
            # ``data``/``indices`` -- confirm that CArray creation accepts a
            # zero-length shape.
            arr = array(getattr(m, par))
            atom = tb.Atom.from_dtype(arr.dtype)
            ds = create_carray(f.root, full_name, atom, arr.shape)
            ds[:] = arr

def load_sparse_mat(name, store="store.h5"):
    """Read back a CSR matrix stored under ``name`` in the HDF5 file ``store``.

    The nodes ``<name>_data``, ``<name>_indices``, ``<name>_indptr`` and
    ``<name>_shape`` are read from the file's root group and passed to the
    ``scipy.sparse.csr_matrix`` constructor.

    Args:
        name: Prefix of the four node names.
        store: Path of the HDF5 file; it is opened with the default mode.

    Returns:
        The ``scipy.sparse.csr_matrix`` built from the four stored arrays.
    """
    # Prefer the PEP 8 name of newer PyTables releases and fall back to the
    # camelCase name of PyTables 2.x, so the function runs on either.
    open_file = getattr(tb, "open_file", None) or tb.openFile
    with open_file(store) as f:
        data, indices, indptr, shape = [
            getattr(f.root, '%s_%s' % (name, par)).read()
            for par in ('data', 'indices', 'indptr', 'shape')
        ]

    # The shape comes back from ``read()`` as whatever the node yields (an
    # array when written by ``store_sparse_mat``); hand the constructor a
    # plain tuple of ints.
    dims = tuple(int(dim) for dim in shape)
    return sparse.csr_matrix((data, indices, indptr), shape=dims)

It is trivial to adapt it to csc matrices.

Leave a Comment