Pandas regex filtering


In [36]:
import pandas as pd
import re

In [37]:
# Eighteen column names in three families:
#   PAY_<n>      for n = 0, 2, 3, 4, 5, 6  (note: there is no PAY_1)
#   BILL_AMT<n>  for n = 1..6
#   PAY_AMT<n>   for n = 1..6
cols = (
    [f'PAY_{n}' for n in (0, 2, 3, 4, 5, 6)]
    + [f'BILL_AMT{n}' for n in range(1, 7)]
    + [f'PAY_AMT{n}' for n in range(1, 7)]
)

In [38]:
# Toy frame to exercise the regex against: one column per name in `cols`,
# each holding the same three rows (1, 2, 3). Only the column labels matter here.
df = pd.DataFrame(dict.fromkeys(cols, [1, 2, 3]))

In [39]:
# Match labels that are exactly "PAY_" followed by one or more digits.
# The ^...$ anchors are what keep PAY_AMT1..PAY_AMT6 out: "AMT1" is not all digits.
regex_string = r"^PAY_[0-9]+$"

In [40]:
# Select columns by label: DataFrame.filter keeps the labels the pattern matches
# (it looks at names only, never at cell values). Axis spelled out for clarity;
# "columns" is already the default for a DataFrame.
df.filter(regex=regex_string, axis="columns")


Out[40]:
PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
0 1 1 1 1 1 1
1 2 2 2 2 2 2
2 3 3 3 3 3 3

In [43]:
# Precompile the pattern so its bound `match` method can serve as the predicate.
regex = re.compile(regex_string)
# Built-in filter() is lazy: it returns a single-use `filter` iterator, not a list.
# Once something consumes it (e.g. list(...)), it is exhausted; re-run this cell
# to get a fresh one before consuming it again.
columns_of_interest = filter(regex.match, df.columns)
type(columns_of_interest)


Out[43]:
filter

In [42]:
# Materialise the lazy filter iterator into a concrete list of column names.
# NOTE: this consumes `columns_of_interest`; re-running only this cell yields []
# unless the cell that creates the iterator is run again first.
columns_of_interest_list = [*columns_of_interest]
columns_of_interest_list


Out[42]:
['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']