In [2]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
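# const and func are project-local helpers: const holds the data file paths
# (TRAIN_FILES, TEST_FILES, LOOK_UP_TABLE, DATA_PATH), func the sparse-data loader used below.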
In [3]:
print(const.TRAIN_FILES)
print(const.TEST_FILES)
In [4]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.set_index('name_dat', inplace=True)
lut.head(3)
Out[4]:
In [5]:
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])
In [6]:
date_data = vstack([date_train['data']['features'], date_test['data']['features']], format='csr')
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
y = date_train['data']['y']
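# Stack the sparse train and test date features into one matrix; ids follows the same
# row order, and y covers the train rows only.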
In [7]:
st_columns = lut.groupby('station_V2')['col_dat'].first().values
st_names = lut.groupby('station_V2')['col_dat'].first().index.values
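# One representative date column per station_V2 (the first date column of each station),
# plus the matching station names for labelling.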
In [8]:
date_data = pd.DataFrame(date_data[:, st_columns].todense()).replace(0, np.nan)
date_data.columns = [str(st_names[n]) for n in date_data.columns]
In [9]:
# Add clusters, response and Id to the data
# Add cluster info
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'))
cluster_info.head(3)
Out[9]:
In [10]:
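# Attach the Id of each sample by positional index (rows follow the stacked train/test order)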
date_data = date_data.merge(ids.reset_index(), left_index=True, right_index=True, how='left')
In [11]:
date_data = date_data.merge(cluster_info, left_on='Id', right_on='Id', how='left')
In [12]:
date_data = date_data.merge(y, left_on='Id', right_index=True, how='left')
In [13]:
# Get rid of decimals in timestamp
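# (the raw timestamps carry two decimals, so multiplying by 100 gives integer ticks)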
date_data = (100*date_data).round(0)
In [14]:
n_samples = date_data.shape[0]
In [15]:
print(date_data.shape)
date_data.head(3)
Out[15]:
In [16]:
d_cols = date_data.columns[:128]
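# The first 128 columns are the per-station timestamps; the Id, cluster and Response
# columns merged above come after them.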
In [17]:
d_feat = pd.DataFrame(ids.Id.values, columns=['Id'])
In [18]:
# Time at station
d_feat = pd.concat([d_feat, date_data[d_cols]], axis=1)
In [19]:
d_feat.columns = ['Id'] + ['t_' + c for c in d_feat.columns if c!='Id']
In [20]:
d_feat.head()
Out[20]:
In [21]:
# Delay at station
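# (forward-fill carries the previous visited station's timestamp along each row,
#  so diff over the columns gives the delay since the last visited station)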
d_date = date_data.loc[:,d_cols].fillna(method='ffill', axis=1).diff(axis=1)
# Replace 0.0 by nan if station was not passed
d_date[date_data[d_cols].isnull()] = np.nan
# Change columns
d_date.columns = ['d_' + c for c in d_date.columns]
d_feat = pd.concat([d_feat, d_date], axis=1)
In [22]:
d_date.shape
Out[22]:
In [23]:
# Delay relative to cluster median (rel)
df_delta_devn8_rel = d_date.copy()
for cl in date_data['cluster_n50'].unique():
    df_delta_devn8_rel.loc[date_data['cluster_n50']==cl, :] /= \
        df_delta_devn8_rel.loc[date_data['cluster_n50']==cl, :].median()
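# Division produces inf wherever a cluster's median delay is 0; those are zeroed in the next cell.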
In [24]:
df_delta_devn8_rel[df_delta_devn8_rel==np.inf] = 0
df_delta_devn8_rel.columns = ['d_rel_' + c for c in df_delta_devn8_rel.columns]
In [25]:
d_feat['ID'] = ids.Id.values
In [26]:
df_delta_devn8_rel['ID'] = ids.Id.values
In [27]:
d_feat.shape
Out[27]:
In [28]:
df_delta_devn8_rel.shape
Out[28]:
In [29]:
# Put ID column first
cols = [df_delta_devn8_rel.columns[-1]] + list(df_delta_devn8_rel.columns[:-1])
df_delta_devn8_rel = df_delta_devn8_rel[cols]
In [30]:
cols = [d_feat.columns[-1]] + list(d_feat.columns[:-1])
d_feat = d_feat[cols]
In [31]:
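# Drop the duplicate lowercase 'Id' column; the uppercase 'ID' added above is kept as identifier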
d_feat.drop('Id', axis=1, inplace=True)
In [32]:
d_date.head()
Out[32]:
In [73]:
d_feat.to_csv(os.path.join(const.DATA_PATH, 'feat_set_date_station.csv'), index=False)
In [33]:
df_delta_devn8_rel.head()
Out[33]:
In [74]:
df_delta_devn8_rel.to_csv(os.path.join(const.DATA_PATH, 'feat_set_date_station_rel_n50.csv'), index=False)
In [ ]:
del d_feat, d_date
In [23]:
lines = lut['line_V2'].unique()
In [41]:
d_feat = pd.DataFrame(ids.Id.values, columns=['Id'])
In [42]:
# Use different line definition for 3 and 4
blocks = {'1': ('0.0', '11.0'),
          '2': ('12.0', '23.0'),
          '3.1': ('24.1', '24.111'),
          '3.2': ('24.2', '24.211'),
          '3.3': ('24.3', '24.311'),
          '3': ('24.1', '28.0'),
          '4.1': ('25.1', '25.11'),
          '4.2': ('25.202', '25.21'),
          '4.3': ('25.212', '25.22'),
          '4.4': ('25.222', '25.23'),
          '4': ('25.1', '28.0'),
          '5': ('26.0', '28.0'),
          '6': ('29.0', '38.0'),
          '7': ('39.0', '51.0')}
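# Each block maps a line_V2 label to its (first, last) station column; the min/max over
# that column slice give the entry and exit timestamps per block.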
df_min = pd.concat([date_data.loc[:, v[0]:v[1]].min(1) for k, v in blocks.items()], axis=1)
df_max = pd.concat([date_data.loc[:, v[0]:v[1]].max(1) for k, v in blocks.items()], axis=1)
df_delta = df_max - df_min
df_delta = df_delta.replace(0, np.nan)
# Change column names
df_min.columns = ['tmin_' + k for k in blocks]
df_max.columns = ['tmax_' + k for k in blocks]
df_delta.columns = ['tdelta_' + k for k in blocks]
In [43]:
# Where block 4 was passed, clear the overlapping block 3 delta
df_delta.loc[~df_delta['tdelta_4'].isnull(), 'tdelta_3'] = np.nan
# Adjust the block totals for their sublines
df_delta.loc[:, 'tdelta_3'] -= df_delta.loc[:, ['tdelta_3.1', 'tdelta_3.2', 'tdelta_3.3']].fillna(0).sum(1)
df_delta.loc[:, 'tdelta_4'] -= df_delta.loc[:, ['tdelta_4.1', 'tdelta_4.2', 'tdelta_4.3', 'tdelta_4.4']].fillna(0).sum(1)
In [44]:
df_delta.describe()
Out[44]:
In [45]:
# Delay relative to cluster median (abs)
df_delta_devn8_abs = df_delta.copy()
for cl in date_data['cluster_n50'].unique():
    df_delta_devn8_abs.loc[date_data['cluster_n50']==cl, :] -= \
        df_delta_devn8_abs.loc[date_data['cluster_n50']==cl, :].median()
df_delta_devn8_abs.columns = ['tdeltadevabs_' + k for k in blocks]
# Delay relative to cluster median (rel)
df_delta_devn8_rel = df_delta.copy()
for cl in date_data['cluster_n50'].unique():
    df_delta_devn8_rel.loc[date_data['cluster_n50']==cl, :] /= \
        df_delta_devn8_rel.loc[date_data['cluster_n50']==cl, :].median()
df_delta_devn8_rel.columns = ['tdeltadevrel_' + k for k in blocks]
In [46]:
df_delta_devn8_rel[df_delta_devn8_rel==np.inf] = 0
In [47]:
df_delta_devn8_rel.describe()
Out[47]:
In [48]:
d_feat = pd.concat([d_feat, df_min, df_max, df_delta, df_delta_devn8_abs, df_delta_devn8_rel], axis=1)
In [49]:
d_feat.head()
Out[49]:
In [50]:
d_feat.to_csv(os.path.join(const.DATA_PATH, 'feat_set_date_lineV2.csv'), index=False)
In [33]:
def compress(df):
    """Coalesce the per-block columns into three combined block columns by filling
    NaNs from overlapping blocks, then print the remaining NaN fraction per column."""
    df = df.copy()
    tag = df.columns[0].split('_')[0] + '_'
    # Merge second block (line 6 & 7) (100% coverage)
    df.loc[df[tag + '7'].isnull(), tag + '7'] = df.loc[df[tag + '7'].isnull(), tag + '6']
    #print df['tdelta_7'].isnull().sum().astype(float)/n_samples
    df.drop(tag + '6', axis=1, inplace=True)
    # Merge 3.1-3.3
    df.loc[df[tag + '3.1'].isnull(), tag + '3.1'] = df.loc[df[tag + '3.1'].isnull(), tag + '3.2']
    df.loc[df[tag + '3.1'].isnull(), tag + '3.1'] = df.loc[df[tag + '3.1'].isnull(), tag + '3.3']
    #print (~df['tdelta_3.1'].isnull()).sum().astype(float)/n_samples
    df.drop([tag + '3.2', tag + '3.3'], axis=1, inplace=True)
    # Merge 4.1-4.4
    df.loc[df[tag + '4.1'].isnull(), tag + '4.1'] = df.loc[df[tag + '4.1'].isnull(), tag + '4.2']
    df.loc[df[tag + '4.1'].isnull(), tag + '4.1'] = df.loc[df[tag + '4.1'].isnull(), tag + '4.3']
    df.loc[df[tag + '4.1'].isnull(), tag + '4.1'] = df.loc[df[tag + '4.1'].isnull(), tag + '4.4']
    #print (~df['tdelta_4.1'].isnull()).sum().astype(float)/n_samples
    df.drop([tag + '4.2', tag + '4.3', tag + '4.4'], axis=1, inplace=True)
    # Merge 1, 2, 3.1-3.3, 4.1-4.4
    df.loc[df[tag + '1'].isnull(), tag + '1'] = df.loc[df[tag + '1'].isnull(), tag + '2']
    df.loc[df[tag + '1'].isnull(), tag + '1'] = df.loc[df[tag + '1'].isnull(), tag + '3.1']
    df.loc[df[tag + '1'].isnull(), tag + '1'] = df.loc[df[tag + '1'].isnull(), tag + '4.1']
    #print (~df['tdelta_1'].isnull()).sum().astype(float)/n_samples
    df.drop([tag + '2', tag + '3.1', tag + '4.1'], axis=1, inplace=True)
    # Merge 3 and 4 (and drop 5)
    df.loc[df[tag + '3'].isnull(), tag + '3'] = df.loc[df[tag + '3'].isnull(), tag + '4']
    #print (~df['tdelta_3'].isnull()).sum().astype(float)/n_samples
    df.drop([tag + '4', tag + '5'], axis=1, inplace=True)
    df.columns = [tag + 'block1', tag + 'block1a', tag + 'block2']
    print(df.isnull().sum(0).astype(float)/n_samples)
    return df
In [34]:
d_feat = pd.DataFrame(ids.Id.values, columns=['Id'])
d_feat = pd.concat([d_feat,
                    compress(df_min),
                    compress(df_max),
                    compress(df_delta),
                    compress(df_delta_devn8_abs),
                    compress(df_delta_devn8_rel)],
                   axis=1)
In [35]:
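# Overall first/last timestamp over all stations and the total time spent in production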
d_feat['tmax'] = date_data[d_cols].max(1)
d_feat['tmin'] = date_data[d_cols].min(1)
d_feat['tdelta'] = d_feat['tmax'] - d_feat['tmin']
# Delay relative to cluster median (rel)
d_feat['tdelta_devrel'] = d_feat['tdelta']
for cl in date_data['cluster_n50'].unique():
    d_feat.loc[date_data['cluster_n50']==cl, 'tdelta_devrel'] /= \
        d_feat.loc[date_data['cluster_n50']==cl, 'tdelta'].median()
# Delay relative to cluster median (abs)
d_feat['tdelta_devabs'] = d_feat['tdelta']
for cl in date_data['cluster_n50'].unique():
    d_feat.loc[date_data['cluster_n50']==cl, 'tdelta_devabs'] -= \
        d_feat.loc[date_data['cluster_n50']==cl, 'tdelta'].median()
In [36]:
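# Coarsen the features: relative (rel) columns are scaled up by 100 before rounding,
# absolute time columns are scaled back down by 100 (undoing the earlier x100)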
for col in d_feat.columns[1:]:
    if col.find('rel') > 0:
        d_feat[col] = (d_feat[col]*100).round(0)
    else:
        d_feat[col] = (d_feat[col]/100).round(0)
In [37]:
d_feat.head()
Out[37]:
In [38]:
d_feat.to_csv(os.path.join(const.DATA_PATH, 'feat_set_date_all_compressed.csv'), index=False)
In [ ]: