Split a text column into multiple column based on some delimiter. Then convert the values into one hot encoded columns. Basically converting a categorical variable into one hot encoded values.

In [1]:
import pandas as pd

In [2]:
data = [{'test': 'vikash|Arpit', 'val': 6},
        {'test': 'vikash_1|arpit|Vinayp', 'val': 3},
        {'test': 'arpit|vinayp', 'val': 2}]

In [3]:
df = pd.DataFrame.from_dict(data, orient='columns')

In [4]:
df


Out[4]:
test val
0 vikash|Arpit 6
1 vikash_1|arpit|Vinayp 3
2 arpit|vinayp 2

In [5]:
chosen_columns = set()
for idx, row in df.iterrows():
    for val in str(row['test']).lower().split('|'):
        chosen_columns.add(val.strip())

In [6]:
chosen_columns_list = list(chosen_columns)

In [7]:
chosen_columns_list.sort(key=len, reverse=True)

In [8]:
chosen_columns_list


Out[8]:
['vikash_1', 'vinayp', 'vikash', 'arpit']

In [9]:
def get_one_hot_encoded_column(col_value):
    col_value = col_value.lower()
    new_col_value = ''
    for val in chosen_columns_list:
        if val in col_value.split('|'):
            col_value = col_value.replace(val, '')
            new_col_value += '1,'
        else:
            new_col_value += '0,'
    return new_col_value[:-1]

In [10]:
df['test_new'] = df['test'].map(get_one_hot_encoded_column)

In [11]:
df


Out[11]:
test val test_new
0 vikash|Arpit 6 0,0,1,1
1 vikash_1|arpit|Vinayp 3 1,1,0,1
2 arpit|vinayp 2 0,1,0,1

In [12]:
df2 = df['test_new'].apply(lambda x: pd.Series([i for i in x.lower().split(',')]))

In [15]:
df2


Out[15]:
vikash_1 vinayp vikash arpit
0 0 0 1 1
1 1 1 0 1
2 0 1 0 1

In [16]:
df2.columns = chosen_columns_list

In [17]:
df2


Out[17]:
vikash_1 vinayp vikash arpit
0 0 0 1 1
1 1 1 0 1
2 0 1 0 1

In [18]:
df2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
vikash_1    3 non-null object
vinayp      3 non-null object
vikash      3 non-null object
arpit       3 non-null object
dtypes: object(4)
memory usage: 176.0+ bytes

In [19]:
df2 = df2.apply(pd.to_numeric)

In [20]:
df2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
vikash_1    3 non-null int64
vinayp      3 non-null int64
vikash      3 non-null int64
arpit       3 non-null int64
dtypes: int64(4)
memory usage: 176.0 bytes

In [28]:
df_new = pd.concat([df, df2], axis=1)

In [29]:
df_new.drop(['test', 'test_new'], inplace=True, axis=1)

In [30]:
df_new


Out[30]:
val vikash_1 vinayp vikash arpit
0 6 0 0 1 1
1 3 1 1 0 1
2 2 0 1 0 1

In [ ]: