Sometimes a column in a DataFrame has high cardinality (and therefore high dimensionality once encoded). For example, a categorical column may have 20 frequent values that cover 80% of the rows, with the remaining values forming a long tail. In such cases we can collapse the long tail into a single 'other' category, based on a cutoff on how often each value occurs.

In [1]:
import pandas as pd

In [2]:
# Toy data, one (group label, value) record per row. 'group 3' and 'group 4'
# appear once and 'group 5' twice, so they form the long tail of 'groups'.
df = pd.DataFrame(
    data=[
        ('group 1', 1),
        ('group 2', 2),
        ('group 1', 3),
        ('group 2', 4),
        ('group 3', 5),
        ('group 4', 6),
        ('group 5', 7),
        ('group 1', 8),
        ('group 2', 9),
        ('group 5', 10),
    ],
    columns=['groups', 'vals'],
)

In [3]:
# Display the toy frame as constructed, before any values are collapsed.
df


Out[3]:
groups vals
0 group 1 1
1 group 2 2
2 group 1 3
3 group 2 4
4 group 3 5
5 group 4 6
6 group 5 7
7 group 1 8
8 group 2 9
9 group 5 10

In [4]:
# Count how many rows carry each distinct 'groups' value.
df['groups'].value_counts()


Out[4]:
group 1    3
group 2    3
group 5    2
group 4    1
group 3    1
Name: groups, dtype: int64

In [5]:
# Columns whose rarely occurring values should be collapsed into 'other'.
high_dim_columns = ['groups']

# A value is treated as long tail when it occurs at most this many times.
max_rare_count = 2

for column in high_dim_columns:
    # Work on the value_counts() Series directly. Wrapping it in a DataFrame
    # and selecting `a[column]` only works on pandas < 2.0: since 2.0 the
    # result is named 'count' rather than after the column, so that lookup
    # raises KeyError.
    counts = df[column].value_counts()
    rare_values = counts.index[counts <= max_rare_count]

    # Overwrite the rare labels in place. Note that this mutates `df`.
    df.loc[df[column].isin(rare_values), column] = 'other'

In [6]:
# Display the frame again: the previous cell assigned 'other' to the rare
# 'groups' values in place.
df


Out[6]:
groups vals
0 group 1 1
1 group 2 2
2 group 1 3
3 group 2 4
4 other 5
5 other 6
6 other 7
7 group 1 8
8 group 2 9
9 other 10

In [ ]: