setup
short_list = np.array(list('aaabaaacaaadaaac'))
functions
-
dfill
takes an array and returns the positions where the array changes and repeats that index position until the next change.
# dfill
#
# Example with short_list
#
# 0 0 0 3 4 4 4 7 8 8 8 11 12 12 12 15
# [ a a a b a a a c a a a d a a a c]
#
# Example with short_list after sorting
#
# 0 0 0 0 0 0 0 0 0 0 0 0 12 13 13 15
# [ a a a a a a a a a a a a b c c d]
argunsort
returns the permutation necessary to undo a sort given the argsort
array. The existence of this method became know to me via this post.. With this, I can get the argsort
array and sort my array with it. Then I can undo the sort without the overhead of sorting again.
-
cumcount
will take an array sort it, find the dfill
array. An np.arange
less dfill
will give me cumulative count. Then I un-sort
# cumcount
#
# Example with short_list
#
# short_list:
# [ a a a b a a a c a a a d a a a c]
#
# short_list.argsort():
# [ 0 1 2 4 5 6 8 9 10 12 13 14 3 7 15 11]
#
# Example with short_list after sorting
#
# short_list[short_list.argsort()]:
# [ a a a a a a a a a a a a b c c d]
#
# dfill(short_list[short_list.argsort()]):
# [ 0 0 0 0 0 0 0 0 0 0 0 0 12 13 13 15]
#
# np.range(short_list.size):
# [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
#
# np.range(short_list.size) -
# dfill(short_list[short_list.argsort()]):
# [ 0 1 2 3 4 5 6 7 8 9 10 11 0 0 1 0]
#
# unsorted:
# [ 0 1 2 0 3 4 5 0 6 7 8 0 9 10 11 1]
foo
function recommended by @hpaulj using defaultdict
div
function recommended by @Divakar (old, I’m sure he’d update it)
code
def dfill(a):
n = a.size
b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
return np.arange(n)[b[:-1]].repeat(np.diff(b))
def argunsort(s):
n = s.size
u = np.empty(n, dtype=np.int64)
u[s] = np.arange(n)
return u
def cumcount(a):
n = a.size
s = a.argsort(kind='mergesort')
i = argunsort(s)
b = a[s]
return (np.arange(n) - dfill(b))[i]
def foo(l):
n = len(l)
r = np.empty(n, dtype=np.int64)
counter = defaultdict(int)
for i in range(n):
counter[l[i]] += 1
r[i] = counter[l[i]]
return r - 1
def div(l):
a = np.unique(l, return_counts=1)[1]
idx = a.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -a[:-1]+1
rng = id_arr.cumsum()
return rng[argunsort(np.argsort(l))]
demonstration
cumcount(short_list)
array([ 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1])
time testing
code
functions = pd.Index(['cumcount', 'foo', 'foo2', 'div'], name="function")
lengths = pd.RangeIndex(100, 1100, 100, 'array length')
results = pd.DataFrame(index=lengths, columns=functions)
from string import ascii_letters
for i in lengths:
a = np.random.choice(list(ascii_letters), i)
for j in functions:
results.set_value(
i, j,
timeit(
'{}(a)'.format(j),
'from __main__ import a, {}'.format(j),
number=1000
)
)
results.plot()