In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
print(const.TRAIN_FILES)
print(const.TEST_FILES)
In [3]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.set_index('name_dat', inplace=True)
lut.head(3)
Out[3]:
In [4]:
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])
In [5]:
date_data = vstack([date_train['data']['features'], date_test['data']['features']], format='csr')
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
y = date_train['data']['y']
In [6]:
st_columns = lut.groupby('station_V2')['col_dat'].first().values
st_names = lut.groupby('station_V2')['col_dat'].first().index.values
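In [ ]:
# Sketch only (toy lookup table, not the real one): groupby('station_V2').first() keeps a
# single representative date column per station, presumably because the date columns within
# a station carry (nearly) the same timestamp.
toy_lut = pd.DataFrame({'station_V2': [0.0, 0.0, 1.0], 'col_dat': [0, 1, 2]})
toy_lut.groupby('station_V2')['col_dat'].first()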
In [7]:
date_data = pd.DataFrame(date_data[:, st_columns].todense()).replace(0, np.nan)
date_data.columns = [str(st_names[n]) for n in date_data.columns]
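In [ ]:
# Toy sketch of the step above (hypothetical column names and values): pick one date column
# per station from the sparse matrix and treat 0 as "station not visited" (NaN).
toy = csr_matrix(np.array([[0.0, 1.2, 0.0],
                           [3.4, 0.0, 5.6]]))
toy_df = pd.DataFrame(toy[:, [0, 2]].todense()).replace(0, np.nan)
toy_df.columns = ['0.0', '1.0']
toy_df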
In [8]:
# Add clusters, response and Id to the data
# Load cluster info
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'))
cluster_info.head(3)
Out[8]:
In [9]:
date_data = date_data.merge(ids.reset_index(), left_index=True, right_index=True, how='left')
In [10]:
date_data = date_data.merge(cluster_info, left_on='Id', right_on='Id', how='left')
In [11]:
date_data = date_data.merge(y, left_on='Id', right_index=True, how='left')
In [12]:
print(date_data.shape)
date_data.head(3)
Out[12]:
In [35]:
d_cols = date_data.columns[:128]
In [13]:
n_samples = date_data.shape[0]
In [14]:
lines = lut['line_V2'].unique()
In [15]:
path_feat = pd.DataFrame(ids.Id.values)
In [16]:
for line in lines:
    # Stations belonging to this production line
    stations = [str(float(x)) for x in lut[lut['line_V2']==line]['station_V2'].unique()]
    # Number of stations visited on this line per sample (NaN if the line was skipped),
    # expressed as deviation from the most common count
    df = (~date_data.loc[:, date_data.columns.isin(stations)].isnull()).sum(1)
    df = df.replace(0, np.nan)
    df -= df.value_counts().index[0]
    path_feat = pd.concat([path_feat, df], axis=1)
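In [ ]:
# Toy sketch of the per-line feature built above (hypothetical station columns/values):
# count the stations visited on a line per sample, then subtract the most common count,
# so 0 means "typical number of stations for this line" and NaN means "line skipped".
toy = pd.DataFrame({'24.0': [0.1, np.nan, 0.3, 0.4],
                    '25.0': [0.2, np.nan, np.nan, 0.5]})
cnt = (~toy.isnull()).sum(1).replace(0, np.nan)
cnt - cnt.value_counts().index[0]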
In [17]:
path_feat.columns = ['Id'] + ['V2_' + str(x) for x in lines if x!='Id']
In [43]:
# First and last station visited per sample
path_feat['first_station'] = date_data[d_cols].apply(lambda x: x.first_valid_index(), axis=1)
path_feat['last_station'] = date_data[d_cols].apply(lambda x: x.last_valid_index(), axis=1)
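In [ ]:
# Sketch of the first/last station extraction: first_valid_index()/last_valid_index() return
# the first and last column label with a non-null timestamp (toy row, hypothetical labels).
row = pd.Series([np.nan, 0.05, np.nan, 0.91], index=['0.0', '1.0', '2.0', '3.0'])
row.first_valid_index(), row.last_valid_index()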
In [18]:
path_feat.head()
Out[18]:
In [19]:
# Which line was used at the end of the process (stage 2)
path_feat['stage_2'] = path_feat.loc[:,['V2_5.0','V2_6.0']].abs().idxmin(1)
In [20]:
# Which line was used at the beginning of the process (stage 1)
path_feat['stage_1'] = path_feat.loc[:, ['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                         'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].abs().idxmin(1)
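In [ ]:
# Sketch of the stage assignment above (toy values): each V2_* column holds the deviation
# from that line's modal station count (NaN if the line was skipped), so abs().idxmin(1)
# returns the visited line whose station count deviates least from normal.
toy = pd.DataFrame({'V2_1.0': [0.0, np.nan], 'V2_2.0': [np.nan, -2.0]})
toy.abs().idxmin(1)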
In [21]:
# How many stage-1 lines were visited
path_feat['stage_1_cnt'] = path_feat.loc[:, ['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                             'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].abs().count(1)
In [22]:
# Compress stage 1 into a single feature (sum of line deviations)
path_feat['stage_1_sum'] = path_feat.loc[:, ['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                             'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].sum(1)
In [23]:
# Compress stage 2 into a single feature (sum of line deviations)
path_feat['stage_2_sum'] = path_feat.loc[:, ['V2_5.0','V2_6.0']].sum(1)
In [24]:
# How many stations in total path
path_feat['stationV2_cnt'] = date_data.loc[:,'0.0':'51.0'].count(1)
In [25]:
# Path nr & clusters
path_feat['unique_path'] = date_data['unique_path']
path_feat['cluster_n8'] = date_data['cluster_n8']
path_feat['cluster_n50'] = date_data['cluster_n50']
path_feat['cluster_n100'] = date_data['cluster_n100']
path_feat['cluster_n500'] = date_data['cluster_n500']
In [26]:
# Stage 1 sum (deviation from the cluster n=8 median)
path_feat['stage_1_sum_devn8'] = path_feat['stage_1_sum']
for cl in path_feat['cluster_n8'].unique():
    path_feat.loc[path_feat['cluster_n8']==cl, 'stage_1_sum_devn8'] -= \
        path_feat.loc[path_feat['cluster_n8']==cl, 'stage_1_sum'].median()
In [27]:
# How many stations in the total path (deviation from the cluster n=8 median)
path_feat['stationV2_cnt_devn8'] = path_feat['stationV2_cnt']
for cl in path_feat['cluster_n8'].unique():
    path_feat.loc[path_feat['cluster_n8']==cl, 'stationV2_cnt_devn8'] -= \
        path_feat.loc[path_feat['cluster_n8']==cl, 'stationV2_cnt'].median()
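In [ ]:
# The deviation-from-cluster-median loops above can equivalently be written with a group-wise
# transform; alternative sketch on toy data (assumes cluster_n8 has no missing values).
toy = pd.DataFrame({'cluster_n8': [0, 0, 1, 1, 1],
                    'stationV2_cnt': [10, 12, 5, 7, 9]})
toy['stationV2_cnt_devn8'] = toy['stationV2_cnt'] - \
    toy.groupby('cluster_n8')['stationV2_cnt'].transform('median')
toy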
In [28]:
# Frequency of cluster (n=500), as count per 10,000 samples
n500_cnt = ((path_feat['cluster_n500'].value_counts()/n_samples).round(4)*10000).astype(int) \
    .reset_index(name='n500_cnt') \
    .rename(columns={'index': 'cluster_n500'})
path_feat = path_feat.merge(n500_cnt, on='cluster_n500', how='left')
In [29]:
# Frequency of unique path, as count per 10,000 samples
upath_cnt = ((path_feat['unique_path'].value_counts()/n_samples).round(4)*10000).astype(int) \
    .reset_index(name='upath_cnt') \
    .rename(columns={'index': 'unique_path'})
path_feat = path_feat.merge(upath_cnt, on='unique_path', how='left')
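In [ ]:
# Frequency-encoding sketch (toy data): the two merges above attach, per sample, how often its
# cluster / unique path occurs, scaled to counts per 10,000 samples; map() gives the same
# result as the value_counts + merge pattern used above.
toy = pd.DataFrame({'unique_path': ['a', 'a', 'b', 'c']})
freq = ((toy['unique_path'].value_counts()/len(toy)).round(4)*10000).astype(int)
toy['upath_cnt'] = toy['unique_path'].map(freq)
toy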
In [51]:
# Station 32 visited but station 33 skipped
path_feat['path_32'] = ((~date_data['32.0'].isnull()) & (date_data['33.0'].isnull()))
In [30]:
path_feat.head()
Out[30]:
In [52]:
path_feat.to_csv(os.path.join(const.DATA_PATH, 'feat_set_path.csv'), index=False)
In [ ]: