Create path features

Paths are reconstructed from the date features alone (roughly 0.2% error).


In [1]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

Load data


In [2]:
print const.TRAIN_FILES
print const.TEST_FILES


['train_numeric', 'train_categorical_to_num', 'train_date']
['test_numeric', 'test_categorical_to_num', 'test_date']

In [3]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.set_index('name_dat', inplace=True)
lut.head(3)


Out[3]:
          line  station  feature_nr  feat_nr_dat name_cat  name_num  col_dat  col_num col_cat  station_V2  line_V2
name_dat
L0_S0_D1     0        0           0          1.0      NaN  L0_S0_F0      0.0      0.0     NaN         0.0      1.0
L0_S0_D3     0        0           2          3.0      NaN  L0_S0_F2      1.0      1.0     NaN         0.0      1.0
L0_S0_D5     0        0           4          5.0      NaN  L0_S0_F4      2.0      2.0     NaN         0.0      1.0

In [4]:
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])


Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x115b3bb70>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x115b3bb70>.pkl
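
For context, the layout of the loaded pickles is defined in the custom func module; inferred from how they are used in the cells below, each is assumed to look roughly like this sketch:

In [ ]:
# Hypothetical layout of each loaded pickle, inferred from usage below
# (the authoritative definition lives in the custom func module):
#
# date_train = {
#     'data': {
#         'features': csr_matrix,   # sparse date features, one row per sample
#         'ids': pd.DataFrame,      # holds the 'Id' column, same row order
#         'y': pd.Series,           # Response indexed by Id (train file only)
#     }
# }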

In [5]:
# Stack train and test date features (train rows first) and align ids/response
date_data = vstack([date_train['data']['features'], date_test['data']['features']], format='csr')
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
y = date_train['data']['y']

In [6]:
# Use each station's first date column as that station's timestamp (one column per station_V2)
st_columns = lut.groupby('station_V2')['col_dat'].first().values
st_names = lut.groupby('station_V2')['col_dat'].first().index.values

In [7]:
# Densify to one timestamp column per station; zeros in the sparse matrix mean missing
date_data = pd.DataFrame(date_data[:, st_columns].todense()).replace(0, np.nan)
date_data.columns = [str(st_names[n]) for n in date_data.columns]

In [8]:
# Add clusters, response and Id to the data
# Precomputed cluster assignments per sample
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'))
cluster_info.head(3)


Out[8]:
   Id  unique_path  cluster_n8  cluster_n50  cluster_n100  cluster_n500
0   4        13409           3           25             3           127
1   6         7029           1           45            80            20
2   7        12763           3           40            11           348

In [9]:
# Attach Id to each row by position (row order matches the stacked sparse matrix)
date_data = date_data.merge(ids.reset_index(), left_index=True, right_index=True, how='left')

In [10]:
date_data = date_data.merge(cluster_info, left_on='Id', right_on='Id', how='left')

In [11]:
# Response is only defined for train rows; test rows get NaN
date_data = date_data.merge(y, left_on='Id', right_index=True, how='left')

In [12]:
print date_data.shape
date_data.head(3)


(2367495, 136)
Out[12]:
           0.0          1.0          2.0  3.0        4.0          5.0          6.0        7.0          8.0  9.0  ...  50.0  51.0
0    82.239998    82.239998    82.239998  NaN  82.260002          NaN          NaN  82.260002    82.269997  NaN  ...   NaN   NaN
1          NaN          NaN          NaN  NaN        NaN          NaN          NaN        NaN          NaN  NaN  ...   NaN   NaN
2  1618.699951  1618.699951  1618.699951  NaN        NaN  1618.719971  1618.719971        NaN  1618.729980  NaN  ...   NaN   NaN

   index  Id  unique_path  cluster_n8  cluster_n50  cluster_n100  cluster_n500  Response
0      0   4        13409           3           25             3           127       0.0
1      1   6         7029           1           45            80            20       0.0
2      2   7        12763           3           40            11           348       0.0

3 rows × 136 columns

Calculate features


In [35]:
# The first 128 columns of date_data are the per-station timestamp columns
d_cols = date_data.columns[:128]

In [13]:
n_samples = date_data.shape[0]

In [14]:
lines = lut['line_V2'].unique()

In [15]:
path_feat = pd.DataFrame(ids.Id.values)

In [16]:
for line in lines:
    # Station columns (station_V2 values) belonging to this production line
    stations = [str(float(x)) for x in lut[lut['line_V2']==line]['station_V2'].unique()]

    # Number of stations visited on this line per sample; 0 -> NaN for parts skipping the line
    df = (~date_data.loc[:, date_data.columns.isin(stations)].isnull()).sum(1)
    df = df.replace(0, np.nan)

    # Express as deviation from the most common station count on this line
    df -= df.value_counts().index[0]

    path_feat = pd.concat([path_feat, df], axis=1)
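
To make the encoding concrete: each V2_ feature is the number of stations visited on that line, minus the most common count for the line. A toy illustration with hypothetical values:

In [ ]:
# Toy illustration of the deviation-from-mode encoding (hypothetical values)
toy = pd.Series([2, 2, 3, np.nan, 2])   # stations visited on one line per sample
toy - toy.value_counts().index[0]       # mode is 2 -> 0.0, 0.0, 1.0, NaN, 0.0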

In [17]:
# Name the per-line deviation features after their line_V2 value
path_feat.columns = ['Id'] + ['V2_' + str(x) for x in lines]

In [43]:
# First and last station visited (first/last non-null timestamp column per row)
path_feat['first_station'] = date_data[d_cols].apply(lambda x: x.first_valid_index(), axis=1)
path_feat['last_station'] = date_data[d_cols].apply(lambda x: x.last_valid_index(), axis=1)
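
The row-wise apply above runs Python code over ~2.4M rows and can be slow. A vectorized sketch that should give the same result: idxmax on a boolean frame returns the first True column per row, so rows with no dates at all (which would wrongly return the first column) need masking afterwards.

In [ ]:
# Hypothetical faster alternative using boolean idxmax
mask = date_data[d_cols].notnull()
first = mask.idxmax(axis=1)              # first non-null station per row
last = mask.iloc[:, ::-1].idxmax(axis=1) # last non-null station per row
no_dates = ~mask.any(axis=1)             # rows without any date
first[no_dates] = np.nan
last[no_dates] = np.nan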

In [18]:
path_feat.head()


Out[18]:
   Id  V2_1.0  V2_2.0  V2_3.1  V2_3.2  V2_3.3  V2_4.1  V2_4.0  V2_4.2  V2_4.3  V2_4.4  V2_5.0  V2_6.0  V2_7.0
0   4     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     1.0     NaN
1   6     NaN     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     0.0     NaN
2   7     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     0.0     NaN
3   9     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     0.0     NaN
4  11     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     0.0     NaN

In [19]:
# Which line the part used at the end of the process (smallest absolute deviation)
path_feat['stage_2'] = path_feat.loc[:,['V2_5.0','V2_6.0']].abs().idxmin(1)
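
Note on the idxmin-based assignment: it picks the line whose station count is closest to that line's modal count; rows that touched neither line yield NaN. A toy illustration with hypothetical values:

In [ ]:
# Toy illustration of the idxmin-based line assignment (hypothetical values)
toy = pd.DataFrame({'V2_5.0': [0.0, np.nan, -2.0],
                    'V2_6.0': [np.nan, 1.0, 0.0]})
toy.abs().idxmin(1)   # -> 'V2_5.0', 'V2_6.0', 'V2_6.0'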

In [20]:
# Which line the part used at the beginning of the process
path_feat['stage_1'] = path_feat.loc[:,['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                        'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].abs().idxmin(1)

In [21]:
# How many stage 1 lines were visited
path_feat['stage_1_cnt'] = path_feat.loc[:,['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                            'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].abs().count(1)

In [22]:
# Compress stage 1 into a single sum of the line deviations
path_feat['stage_1_sum'] = path_feat.loc[:,['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                            'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].sum(1)

In [23]:
# Compress stage 2 into a single sum of the line deviations
path_feat['stage_2_sum'] = path_feat.loc[:,['V2_5.0','V2_6.0']].sum(1)

In [24]:
# How many stations in the total path (count of non-null station timestamps)
path_feat['stationV2_cnt'] = date_data.loc[:,'0.0':'51.0'].count(1)

In [25]:
# Path nr & clusters
path_feat['unique_path'] = date_data['unique_path']
path_feat['cluster_n8'] = date_data['cluster_n8']
path_feat['cluster_n50'] = date_data['cluster_n50']
path_feat['cluster_n100'] = date_data['cluster_n100']
path_feat['cluster_n500'] = date_data['cluster_n500']

In [26]:
# Stage 1 sum as deviation from the cluster (n=8) median
path_feat['stage_1_sum_devn8'] = path_feat['stage_1_sum']
for cl in path_feat['cluster_n8'].unique():
    path_feat.loc[path_feat['cluster_n8']==cl, 'stage_1_sum_devn8'] -= \
        path_feat.loc[path_feat['cluster_n8']==cl, 'stage_1_sum'].median()

In [27]:
# How many stations in total path, as deviation from the cluster (n=8) median
path_feat['stationV2_cnt_devn8'] = path_feat['stationV2_cnt']
for cl in path_feat['cluster_n8'].unique():
    path_feat.loc[path_feat['cluster_n8']==cl, 'stationV2_cnt_devn8'] -= \
        path_feat.loc[path_feat['cluster_n8']==cl, 'stationV2_cnt'].median()
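
Both per-cluster loops above can be expressed as a groupby transform, which is usually faster at this row count. A sketch of the equivalent computation, assuming cluster_n8 has no missing values (rows with a NaN cluster key would come out NaN here, whereas the loops leave them unchanged):

In [ ]:
# Equivalent deviation-from-cluster-median features via groupby-transform
med1 = path_feat.groupby('cluster_n8')['stage_1_sum'].transform('median')
med2 = path_feat.groupby('cluster_n8')['stationV2_cnt'].transform('median')
path_feat['stage_1_sum_devn8'] = path_feat['stage_1_sum'] - med1
path_feat['stationV2_cnt_devn8'] = path_feat['stationV2_cnt'] - med2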

In [28]:
# Frequency of cluster (n=500), expressed per 10,000 samples
n500_cnt = ((path_feat['cluster_n500'].value_counts()/n_samples).round(4)*10000).astype(int) \
                                         .reset_index(name='n500_cnt') \
                                         .rename(columns={'index': 'cluster_n500'})

path_feat = path_feat.merge(n500_cnt, on='cluster_n500', how='left')

In [29]:
# Frequency of unique path, expressed per 10,000 samples
upath_cnt = ((path_feat['unique_path'].value_counts()/n_samples).round(4)*10000).astype(int) \
                                         .reset_index(name='upath_cnt') \
                                         .rename(columns={'index': 'unique_path'})

path_feat = path_feat.merge(upath_cnt, on='unique_path', how='left')
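
The same frequency encoding can also be done without a merge by mapping the value counts back onto the column. A sketch of the equivalent computation (upath_cnt_map is a hypothetical name, used here to avoid clobbering the merged column):

In [ ]:
# Alternative frequency encoding via map (same per-10,000 scaling as above)
freq = ((path_feat['unique_path'].value_counts() / n_samples).round(4) * 10000).astype(int)
path_feat['upath_cnt_map'] = path_feat['unique_path'].map(freq)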

In [51]:
# Combination of S32/S33: part passed station 32 but not station 33
path_feat['path_32'] = ((~date_data['32.0'].isnull()) & (date_data['33.0'].isnull()))

In [30]:
path_feat.head()


Out[30]:
   Id  V2_1.0  V2_2.0  V2_3.1  V2_3.2  V2_3.3  V2_4.1  V2_4.0  V2_4.2  V2_4.3  ...
0   4     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN  ...
1   6     NaN     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN  ...
2   7     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN  ...
3   9     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN  ...
4  11     0.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN  ...

   stationV2_cnt  unique_path  cluster_n8  cluster_n50  cluster_n100  cluster_n500  stage_1_sum_devn8  stationV2_cnt_devn8  n500_cnt  upath_cnt
0             14        13409           3           25             3           127                0.0                  1.0       108          4
1             13         7029           1           45            80            20                0.0                  0.0        38         31
2             13        12763           3           40            11           348                0.0                  0.0       110         86
3             13        13658           3           43            34           204                0.0                  0.0       112         88
4             13         9865           3           38            26           180                0.0                  0.0       110         86

5 rows × 29 columns



Store feature set as CSV


In [52]:
path_feat.to_csv(os.path.join(const.DATA_PATH, 'feat_set_path.csv'), index=False)
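
A quick, optional sanity check that the file round-trips (hypothetical; only compares shapes):

In [ ]:
# Hypothetical check: reload the CSV and confirm the shape matches
check = pd.read_csv(os.path.join(const.DATA_PATH, 'feat_set_path.csv'))
assert check.shape == path_feat.shape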
