In [1]:
import pandas as pd
In [2]:
data = [{'test': 'vikash|Arpit', 'val': 6},
{'test': 'vikash_1|arpit|Vinayp', 'val': 3},
{'test': 'arpit|vinayp', 'val': 2}]
In [3]:
df = pd.DataFrame.from_dict(data, orient='columns')
In [4]:
df
Out[4]:
In [5]:
chosen_columns = set()
for idx, row in df.iterrows():
for val in str(row['test']).lower().split('|'):
chosen_columns.add(val.strip())
In [6]:
chosen_columns_list = list(chosen_columns)
In [7]:
chosen_columns_list.sort(key=len, reverse=True)
In [8]:
chosen_columns_list
Out[8]:
In [9]:
def get_one_hot_encoded_column(col_value):
col_value = col_value.lower()
new_col_value = ''
for val in chosen_columns_list:
if val in col_value.split('|'):
col_value = col_value.replace(val, '')
new_col_value += '1,'
else:
new_col_value += '0,'
return new_col_value[:-1]
In [10]:
df['test_new'] = df['test'].map(get_one_hot_encoded_column)
In [11]:
df
Out[11]:
In [12]:
df2 = df['test_new'].apply(lambda x: pd.Series([i for i in x.lower().split(',')]))
In [15]:
df2
Out[15]:
In [16]:
df2.columns = chosen_columns_list
In [17]:
df2
Out[17]:
In [18]:
df2.info()
In [19]:
df2 = df2.apply(pd.to_numeric)
In [20]:
df2.info()
In [28]:
df_new = pd.concat([df, df2], axis=1)
In [29]:
df_new.drop(['test', 'test_new'], inplace=True, axis=1)
In [30]:
df_new
Out[30]:
In [ ]: