How to use numpy to get the cumulative count by unique values in linear time?

May 28, 2023 by Tarik Billa

setup

short_list = np.array(list('aaabaaacaaadaaac'))

functions

dfill takes an array and returns the positions where the array changes and repeats that index position until the next change.

# dfill
# 
# Example with short_list
#
#    0  0  0  3  4  4  4  7  8  8  8 11 12 12 12 15
# [  a  a  a  b  a  a  a  c  a  a  a  d  a  a  a  c]
#
# Example with short_list after sorting
#
#    0  0  0  0  0  0  0  0  0  0  0  0 12 13 13 15
# [  a  a  a  a  a  a  a  a  a  a  a  a  b  c  c  d]

argunsort returns the permutation necessary to undo a sort given the argsort array. The existence of this method became know to me via this post.. With this, I can get the argsort array and sort my array with it. Then I can undo the sort without the overhead of sorting again.

cumcount will take an array sort it, find the dfill array. An np.arange less dfill will give me cumulative count. Then I un-sort

# cumcount
# 
# Example with short_list
#
# short_list:
# [ a  a  a  b  a  a  a  c  a  a  a  d  a  a  a  c]
# 
# short_list.argsort():
# [ 0  1  2  4  5  6  8  9 10 12 13 14  3  7 15 11]
#
# Example with short_list after sorting
#
# short_list[short_list.argsort()]:
# [ a  a  a  a  a  a  a  a  a  a  a  a  b  c  c  d]
# 
# dfill(short_list[short_list.argsort()]):
# [ 0  0  0  0  0  0  0  0  0  0  0  0 12 13 13 15]
# 
# np.range(short_list.size):
# [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
#
# np.range(short_list.size) - 
#     dfill(short_list[short_list.argsort()]):
# [ 0  1  2  3  4  5  6  7  8  9 10 11  0  0  1  0]
# 
# unsorted:
# [ 0  1  2  0  3  4  5  0  6  7  8  0  9 10 11  1]

foo function recommended by @hpaulj using defaultdict
div function recommended by @Divakar (old, I’m sure he’d update it)

code

def dfill(a):
    n = a.size
    b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
    return np.arange(n)[b[:-1]].repeat(np.diff(b))

def argunsort(s):
    n = s.size
    u = np.empty(n, dtype=np.int64)
    u[s] = np.arange(n)
    return u

def cumcount(a):
    n = a.size
    s = a.argsort(kind='mergesort')
    i = argunsort(s)
    b = a[s]
    return (np.arange(n) - dfill(b))[i]

def foo(l):
    n = len(l)
    r = np.empty(n, dtype=np.int64)
    counter = defaultdict(int)
    for i in range(n):
        counter[l[i]] += 1
        r[i] = counter[l[i]]
    return r - 1

def div(l):
    a = np.unique(l, return_counts=1)[1]
    idx = a.cumsum()
    id_arr = np.ones(idx[-1],dtype=int)
    id_arr[0] = 0
    id_arr[idx[:-1]] = -a[:-1]+1
    rng = id_arr.cumsum()
    return rng[argunsort(np.argsort(l))]

demonstration

cumcount(short_list)

array([ 0,  1,  2,  0,  3,  4,  5,  0,  6,  7,  8,  0,  9, 10, 11,  1])

time testing

code

functions = pd.Index(['cumcount', 'foo', 'foo2', 'div'], name="function")
lengths = pd.RangeIndex(100, 1100, 100, 'array length')
results = pd.DataFrame(index=lengths, columns=functions)

from string import ascii_letters

for i in lengths:
    a = np.random.choice(list(ascii_letters), i)
    for j in functions:
        results.set_value(
            i, j,
            timeit(
                '{}(a)'.format(j),
                'from __main__ import a, {}'.format(j),
                number=1000
            )
        )

results.plot()

Leave a Comment Cancel reply