notebook.community

Edit and run

Split a text column into multiple column based on some delimiter. Then convert the values into one hot encoded columns. Basically converting a categorical variable into one hot encoded values.



In [1]:

    
import pandas as pd



In [2]:

    
data = [{'test': 'vikash|Arpit', 'val': 6},
        {'test': 'vikash_1|arpit|Vinayp', 'val': 3},
        {'test': 'arpit|vinayp', 'val': 2}]



In [3]:

    
df = pd.DataFrame.from_dict(data, orient='columns')



In [4]:

    
df









    Out[4]:






  
    
      
      test
      val
    
  
  
    
      0
      vikash|Arpit
      6
    
    
      1
      vikash_1|arpit|Vinayp
      3
    
    
      2
      arpit|vinayp
      2



In [5]:

    
chosen_columns = set()
for idx, row in df.iterrows():
    for val in str(row['test']).lower().split('|'):
        chosen_columns.add(val.strip())



In [6]:

    
chosen_columns_list = list(chosen_columns)



In [7]:

    
chosen_columns_list.sort(key=len, reverse=True)



In [8]:

    
chosen_columns_list









    Out[8]:





['vikash_1', 'vinayp', 'vikash', 'arpit']



In [9]:

    
def get_one_hot_encoded_column(col_value):
    col_value = col_value.lower()
    new_col_value = ''
    for val in chosen_columns_list:
        if val in col_value.split('|'):
            col_value = col_value.replace(val, '')
            new_col_value += '1,'
        else:
            new_col_value += '0,'
    return new_col_value[:-1]



In [10]:

    
df['test_new'] = df['test'].map(get_one_hot_encoded_column)



In [11]:

    
df









    Out[11]:






  
    
      
      test
      val
      test_new
    
  
  
    
      0
      vikash|Arpit
      6
      0,0,1,1
    
    
      1
      vikash_1|arpit|Vinayp
      3
      1,1,0,1
    
    
      2
      arpit|vinayp
      2
      0,1,0,1



In [12]:

    
df2 = df['test_new'].apply(lambda x: pd.Series([i for i in x.lower().split(',')]))



In [15]:

    
df2



In [16]:

    
df2.columns = chosen_columns_list



In [17]:

    
df2



In [18]:

    
df2.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
vikash_1    3 non-null object
vinayp      3 non-null object
vikash      3 non-null object
arpit       3 non-null object
dtypes: object(4)
memory usage: 176.0+ bytes



In [19]:

    
df2 = df2.apply(pd.to_numeric)



In [20]:

    
df2.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
vikash_1    3 non-null int64
vinayp      3 non-null int64
vikash      3 non-null int64
arpit       3 non-null int64
dtypes: int64(4)
memory usage: 176.0 bytes



In [28]:

    
df_new = pd.concat([df, df2], axis=1)



In [29]:

    
df_new.drop(['test', 'test_new'], inplace=True, axis=1)



In [30]:

    
df_new



In [ ]:

	test	val	test_new
0	vikash\|Arpit	6	0,0,1,1
1	vikash_1\|arpit\|Vinayp	3	1,1,0,1
2	arpit\|vinayp	2	0,1,0,1