In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
print(const.TRAIN_FILES)
print(const.TEST_FILES)
In [3]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.set_index('name_dat', inplace=True)
lut.head(3)
Out[3]:
In [4]:
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])
In [5]:
date_data = vstack([date_train['data']['features'], date_test['data']['features']], format='csr')
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
y = date_train['data']['y']
In [6]:
st_columns = lut.groupby('station_V2')['col_dat'].first().values
st_names = lut.groupby('station_V2')['col_dat'].first().index.values
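In [ ]:
# Sketch only (toy lookup table, not the real one): groupby('station_V2').first() keeps a
# single representative date column per station, presumably because the date columns within
# a station carry (nearly) the same timestamp.
toy_lut = pd.DataFrame({'station_V2': [0.0, 0.0, 1.0], 'col_dat': [0, 1, 2]})
toy_lut.groupby('station_V2')['col_dat'].first()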
In [7]:
date_data = pd.DataFrame(date_data[:, st_columns].todense()).replace(0, np.nan)
date_data.columns = [str(st_names[n]) for n in date_data.columns]
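In [ ]:
# Toy sketch of the step above (hypothetical column names and values): pick one date column
# per station from the sparse matrix and treat 0 as "station not visited" (NaN).
toy = csr_matrix(np.array([[0.0, 1.2, 0.0],
                           [3.4, 0.0, 5.6]]))
toy_df = pd.DataFrame(toy[:, [0, 2]].todense()).replace(0, np.nan)
toy_df.columns = ['0.0', '1.0']
toy_df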
In [8]:
# Add clusters, response and Id to the data
# Load cluster info
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'))
cluster_info.head(3)
Out[8]:
In [9]:
date_data = date_data.merge(ids.reset_index(), left_index=True, right_index=True, how='left')
In [10]:
date_data = date_data.merge(cluster_info, left_on='Id', right_on='Id', how='left')
In [11]:
date_data = date_data.merge(y, left_on='Id', right_index=True, how='left')
In [12]:
print(date_data.shape)
date_data.head(3)
Out[12]:
In [35]:
d_cols = date_data.columns[:128]
In [13]:
n_samples = date_data.shape[0]
In [14]:
lines = lut['line_V2'].unique()
In [15]:
path_feat = pd.DataFrame(ids.Id.values)
In [16]:
for line in lines:
    # Stations belonging to this production line
    stations = [str(float(x)) for x in lut[lut['line_V2']==line]['station_V2'].unique()]
    # Number of stations visited on this line per sample (NaN if the line was skipped),
    # expressed as deviation from the most common count
    df = (~date_data.loc[:, date_data.columns.isin(stations)].isnull()).sum(1)
    df = df.replace(0, np.nan)
    df -= df.value_counts().index[0]
    path_feat = pd.concat([path_feat, df], axis=1)
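In [ ]:
# Toy sketch of the per-line feature built above (hypothetical station columns/values):
# count the stations visited on a line per sample, then subtract the most common count,
# so 0 means "typical number of stations for this line" and NaN means "line skipped".
toy = pd.DataFrame({'24.0': [0.1, np.nan, 0.3, 0.4],
                    '25.0': [0.2, np.nan, np.nan, 0.5]})
cnt = (~toy.isnull()).sum(1).replace(0, np.nan)
cnt - cnt.value_counts().index[0]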
In [17]:
path_feat.columns = ['Id'] + ['V2_' + str(x) for x in lines if x!='Id']
In [43]:
# First and last station visited per sample
path_feat['first_station'] = date_data[d_cols].apply(lambda x: x.first_valid_index(), axis=1)
path_feat['last_station'] = date_data[d_cols].apply(lambda x: x.last_valid_index(), axis=1)
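In [ ]:
# Sketch of the first/last station extraction: first_valid_index()/last_valid_index() return
# the first and last column label with a non-null timestamp (toy row, hypothetical labels).
row = pd.Series([np.nan, 0.05, np.nan, 0.91], index=['0.0', '1.0', '2.0', '3.0'])
row.first_valid_index(), row.last_valid_index()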
In [18]:
path_feat.head()
Out[18]:
In [19]:
# Which line was used at the end of the process (stage 2)
path_feat['stage_2'] = path_feat.loc[:,['V2_5.0','V2_6.0']].abs().idxmin(1)
In [20]:
# Which line was used at the beginning of the process (stage 1)
path_feat['stage_1'] = path_feat.loc[:, ['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                         'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].abs().idxmin(1)
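In [ ]:
# Sketch of the stage assignment above (toy values): each V2_* column holds the deviation
# from that line's modal station count (NaN if the line was skipped), so abs().idxmin(1)
# returns the visited line whose station count deviates least from normal.
toy = pd.DataFrame({'V2_1.0': [0.0, np.nan], 'V2_2.0': [np.nan, -2.0]})
toy.abs().idxmin(1)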
In [21]:
# How many stage-1 lines were visited
path_feat['stage_1_cnt'] = path_feat.loc[:, ['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                             'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].abs().count(1)
In [22]:
# Compress stage 1 into a single feature (sum of line deviations)
path_feat['stage_1_sum'] = path_feat.loc[:, ['V2_1.0','V2_2.0','V2_3.1','V2_3.2','V2_3.3',
                                             'V2_4.1','V2_4.2','V2_4.3','V2_4.4']].sum(1)
In [23]:
# Compress stage 2 into a single feature (sum of line deviations)
path_feat['stage_2_sum'] = path_feat.loc[:, ['V2_5.0','V2_6.0']].sum(1)
In [24]:
# How many stations in total path
path_feat['stationV2_cnt'] = date_data.loc[:,'0.0':'51.0'].count(1)
In [25]:
# Path nr & clusters
path_feat['unique_path'] = date_data['unique_path']
path_feat['cluster_n8'] = date_data['cluster_n8']
path_feat['cluster_n50'] = date_data['cluster_n50']
path_feat['cluster_n100'] = date_data['cluster_n100']
path_feat['cluster_n500'] = date_data['cluster_n500']
In [26]:
# Stage 1 sum (deviation from the cluster n=8 median)
path_feat['stage_1_sum_devn8'] = path_feat['stage_1_sum']
for cl in path_feat['cluster_n8'].unique():
    path_feat.loc[path_feat['cluster_n8']==cl, 'stage_1_sum_devn8'] -= \
        path_feat.loc[path_feat['cluster_n8']==cl, 'stage_1_sum'].median()
In [27]:
# How many stations in the total path (deviation from the cluster n=8 median)
path_feat['stationV2_cnt_devn8'] = path_feat['stationV2_cnt']
for cl in path_feat['cluster_n8'].unique():
    path_feat.loc[path_feat['cluster_n8']==cl, 'stationV2_cnt_devn8'] -= \
        path_feat.loc[path_feat['cluster_n8']==cl, 'stationV2_cnt'].median()
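In [ ]:
# The deviation-from-cluster-median loops above can equivalently be written with a group-wise
# transform; alternative sketch on toy data (assumes cluster_n8 has no missing values).
toy = pd.DataFrame({'cluster_n8': [0, 0, 1, 1, 1],
                    'stationV2_cnt': [10, 12, 5, 7, 9]})
toy['stationV2_cnt_devn8'] = toy['stationV2_cnt'] - \
    toy.groupby('cluster_n8')['stationV2_cnt'].transform('median')
toy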
In [28]:
# Frequency of cluster (n=500), as count per 10,000 samples
n500_cnt = ((path_feat['cluster_n500'].value_counts()/n_samples).round(4)*10000).astype(int) \
    .reset_index(name='n500_cnt') \
    .rename(columns={'index': 'cluster_n500'})
path_feat = path_feat.merge(n500_cnt, on='cluster_n500', how='left')
In [29]:
# Frequency of unique path, as count per 10,000 samples
upath_cnt = ((path_feat['unique_path'].value_counts()/n_samples).round(4)*10000).astype(int) \
    .reset_index(name='upath_cnt') \
    .rename(columns={'index': 'unique_path'})
path_feat = path_feat.merge(upath_cnt, on='unique_path', how='left')
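In [ ]:
# Frequency-encoding sketch (toy data): the two merges above attach, per sample, how often its
# cluster / unique path occurs, scaled to counts per 10,000 samples; map() gives the same
# result as the value_counts + merge pattern used above.
toy = pd.DataFrame({'unique_path': ['a', 'a', 'b', 'c']})
freq = ((toy['unique_path'].value_counts()/len(toy)).round(4)*10000).astype(int)
toy['upath_cnt'] = toy['unique_path'].map(freq)
toy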
In [51]:
# Station 32 visited but station 33 skipped
path_feat['path_32'] = ((~date_data['32.0'].isnull()) & (date_data['33.0'].isnull()))
In [30]:
path_feat.head()
Out[30]:
In [52]:
path_feat.to_csv(os.path.join(const.DATA_PATH, 'feat_set_path.csv'), index=False)
In [ ]: