Pandas – Find and index rows that match row sequence pattern

I think you have two options – a simpler but slower solution, or a faster but more complicated one.

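  • use Rolling.apply and test each window against the pattern
  • replace 0s with NaNs using mask
  • use bfill with limit (same as fillna with method='bfill') to repeat each 1 backwards over the rest of the matched window
  • then replace the remaining NaNs with 0 using fillna
  • finally cast to bool with astype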

# Pattern to look for, stored as an array so a whole window can be compared
# element-wise in one expression.
pat = np.asarray([1, 2, 2, 0])
N = len(pat)


# Step 1: test every full window of N consecutive rows against the pattern.
# The result is stored at the LAST row of each window; rows without a full
# window stay NaN because min_periods=N.
window_hits = (df['row_pat']
               .rolling(window=N, min_periods=N)
               .apply(lambda w: (w == pat).all()))

# Step 2: turn non-matches (0) into NaN so that only matches are left to spread.
hits_only = window_hits.mask(window_hits == 0)

# Step 3: back-fill each match over the N-1 preceding rows, zero the rest,
# and convert the flags to booleans.
df['rm0'] = hits_only.bfill(limit=N - 1).fillna(0).astype(bool)

If performance is important, use strides; the solution from the link was modified:

  • use the rolling window approach
  • compare with the pattern and return True for matching windows by all
  • get the indices of the first occurrences with np.mgrid and boolean indexing
  • create all indices with a list comprehension
  • test membership with numpy.in1d and create the new column

def rolling_window(a, window):
    """Return a read-only view of every length-``window`` slice of the last axis.

    Args:
        a: NumPy array; the windows slide along its last axis.
        window: Number of consecutive elements per window (must be >= 1).

    Returns:
        A view of shape ``a.shape[:-1] + (n_windows, window)`` with
        ``n_windows = max(a.shape[-1] - window + 1, 0)``. No data is copied:
        entry ``i`` along the new axis aliases ``a[..., i:i + window]``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer, got %r" % (window,))
    # Clamp at zero so an input shorter than the window simply yields no
    # windows instead of requesting a negative dimension.
    n_windows = max(a.shape[-1] - window + 1, 0)
    shape = a.shape[:-1] + (n_windows, window)
    # Re-using the last stride makes each window start one element after
    # the previous one.
    strides = a.strides + (a.strides[-1],)
    # The windows overlap in memory, so expose them read-only: a write through
    # the view would change several windows (and the input) at once.
    return np.lib.stride_tricks.as_strided(
        a, shape=shape, strides=strides, writeable=False
    )

# Raw values of the column that is scanned for the pattern.
arr = df['row_pat'].values

# True at every position where the N values starting there equal `pat`.
window_matches = (rolling_window(arr, N) == pat).all(axis=1)

# Positions at which a matching window starts.
starts = np.mgrid[0:len(window_matches)][window_matches]

# Every row position covered by some match: start, start+1, ..., start+N-1.
covered = [start + offset for start in starts for offset in range(N)]

# Flag the rows whose position is among the covered ones.
df['rm2'] = np.in1d(np.arange(len(arr)), covered)

Another solution, thanks @divakar:

# Raw values of the column that is scanned for the pattern.
arr = df['row_pat'].values

# True at every position where the N values starting there equal `pat`.
m = (rolling_window(arr, N) == pat).all(axis=1)
# There are fewer windows than rows, so pad with False up to len(arr).
m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
# Dilate each start flag with a length-N structuring element so that the
# whole matched run is flagged; the origin offset positions that run
# relative to the start flag.
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))

Timings:

import string

import numpy as np
import pandas as pd
from numpy.random import choice, randn
# binary_dilation lives in the public scipy.ndimage namespace; the
# scipy.ndimage.morphology path is deprecated.
from scipy.ndimage import binary_dilation

# Seed NumPy's global RNG (used by choice/randn) so the sample is reproducible.
np.random.seed(456)

# Pattern searched for by the snippets below (same values as defined earlier),
# repeated here so this script runs on its own.
pat = np.asarray([1, 2, 2, 0])
N = len(pat)

# df constructor
n_rows = 100000
# Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in recent pandas.
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='h'),
                   'group_var': choice(list(string.ascii_uppercase), n_rows),
                   'row_pat': choice([0, 1, 2, 3], n_rows),
                   'values': randn(n_rows)})

# sorting
df.sort_values(by=['group_var', 'date_time'], inplace=True)

def rolling_window(a, window):
    """Return a read-only view of every length-``window`` slice of the last axis.

    Args:
        a: NumPy array; the windows slide along its last axis.
        window: Number of consecutive elements per window (must be >= 1).

    Returns:
        A view of shape ``a.shape[:-1] + (n_windows, window)`` with
        ``n_windows = max(a.shape[-1] - window + 1, 0)``. No data is copied:
        entry ``i`` along the new axis aliases ``a[..., i:i + window]``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer, got %r" % (window,))
    # Clamp at zero so an input shorter than the window simply yields no
    # windows instead of requesting a negative dimension.
    n_windows = max(a.shape[-1] - window + 1, 0)
    shape = a.shape[:-1] + (n_windows, window)
    # Re-using the last stride makes each window start one element after
    # the previous one.
    strides = a.strides + (a.strides[-1],)
    # The windows overlap in memory, so expose them read-only: a write through
    # the view would change several windows (and the input) at once.
    return np.lib.stride_tricks.as_strided(
        a, shape=shape, strides=strides, writeable=False
    )


# Raw values of the column that is scanned for the pattern.
arr = df['row_pat'].values

# True at every position where the N values starting there equal `pat`.
m = (rolling_window(arr, N) == pat).all(axis=1)
# There are fewer windows than rows, so pad with False up to len(arr).
m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
# Dilate each start flag with a length-N structuring element so that the
# whole matched run is flagged; the origin offset positions that run
# relative to the start flag.
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))

# Raw values of the column that is scanned for the pattern.
arr = df['row_pat'].values

# True at every position where the N values starting there equal `pat`.
window_matches = (rolling_window(arr, N) == pat).all(axis=1)

# Positions at which a matching window starts.
starts = np.mgrid[0:len(window_matches)][window_matches]

# Every row position covered by some match: start, start+1, ..., start+N-1.
covered = [start + offset for start in starts for offset in range(N)]

# Flag the rows whose position is among the covered ones.
df['rm2'] = np.in1d(np.arange(len(arr)), covered)

# Show a slice of the frame that contains a match.
print(df.iloc[460:480])

                date_time group_var  row_pat    values    rm0    rm1    rm2
12045 2019-06-25 21:00:00         A        3 -0.081152  False  False  False
12094 2019-06-27 22:00:00         A        1 -0.818167  False  False  False
12125 2019-06-29 05:00:00         A        0 -0.051088  False  False  False
12143 2019-06-29 23:00:00         A        0 -0.937589  False  False  False
12145 2019-06-30 01:00:00         A        3  0.298460  False  False  False
12158 2019-06-30 14:00:00         A        1  0.647161  False  False  False
12164 2019-06-30 20:00:00         A        3 -0.735538  False  False  False
12210 2019-07-02 18:00:00         A        1 -0.881740  False  False  False
12341 2019-07-08 05:00:00         A        3  0.525652  False  False  False
12343 2019-07-08 07:00:00         A        1  0.311598  False  False  False
12358 2019-07-08 22:00:00         A        1 -0.710150   True   True   True
12360 2019-07-09 00:00:00         A        2 -0.752216   True   True   True
12400 2019-07-10 16:00:00         A        2 -0.205122   True   True   True
12404 2019-07-10 20:00:00         A        0  1.342591   True   True   True
12413 2019-07-11 05:00:00         A        1  1.707748  False  False  False
12506 2019-07-15 02:00:00         A        2  0.319227  False  False  False
12527 2019-07-15 23:00:00         A        3  2.130917  False  False  False
12600 2019-07-19 00:00:00         A        1 -1.314070  False  False  False
12604 2019-07-19 04:00:00         A        0  0.869059  False  False  False
12613 2019-07-19 13:00:00         A        2  1.342101  False  False  False

In [225]: %%timeit
     ...: df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
     ...:                           .apply(lambda x: (x==pat).all())
     ...:                           .mask(lambda x: x == 0) 
     ...:                           .bfill(limit=N-1)
     ...:                           .fillna(0)
     ...:                           .astype(bool)
     ...:              )
     ...: 
1 loop, best of 3: 356 ms per loop

In [226]: %%timeit
     ...: arr = df['row_pat'].values
     ...: b = np.all(rolling_window(arr, N) == pat, axis=1)
     ...: c = np.mgrid[0:len(b)][b]
     ...: d = [i  for x in c for i in range(x, x+N)]
     ...: df['rm2'] = np.in1d(np.arange(len(arr)), d)
     ...: 
100 loops, best of 3: 7.63 ms per loop

In [227]: %%timeit
     ...: arr = df['row_pat'].values
     ...: b = np.all(rolling_window(arr, N) == pat, axis=1)
     ...: 
     ...: m = (rolling_window(arr, len(pat)) == pat).all(1)
     ...: m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
     ...: df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
     ...: 
100 loops, best of 3: 7.25 ms per loop

Leave a Comment