In [182]:
import os
import re
import pickle
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
%matplotlib inline
# Custom modules
import const
import func
In [183]:
# Display the training-file list. The next cell unpacks it into three lookups
# (num_info, cat_info, date_info), so it is expected to hold exactly three
# entries -- presumably numeric, categorical, date in that order (TODO confirm).
const.TRAIN_FILES
Out[183]:
In [184]:
# Build one station/feature lookup per training file. The unpacking requires
# const.TRAIN_FILES to yield exactly three entries -- presumably in numeric,
# categorical, date order (TODO confirm against const).
num_info, cat_info, date_info = map(func.get_station_info, const.TRAIN_FILES)

# Date features refer to numeric/categorical features: the referenced feature
# number is taken to be the date feature's own number minus one.
date_info['ref_feat_nr'] = date_info['feature_nr'].sub(1)

# Positional relabel of all five columns. NOTE(review): this assumes
# get_station_info returns its columns in the order line, station,
# feature_nr, <name>; under that assumption the date feature's own number
# becomes 'feat_nr_dat' and the reference number computed above takes over
# the name 'feature_nr', which is the key the later joins use.
date_info.columns = ['line', 'station', 'feat_nr_dat', 'name_dat', 'feature_nr']
In [185]:
# Peek at the date lookup after the column relabel
# (line, station, feat_nr_dat, name_dat, feature_nr).
date_info.head()
Out[185]:
In [186]:
# Key all three lookups by (line, station, feature_nr) so that they can be
# joined on their index. For date_info, 'feature_nr' is the *referenced*
# feature number set up in the loading cell, not the date feature's own number.
index_keys = ['line', 'station', 'feature_nr']
date_info, num_info, cat_info = (
    frame.set_index(index_keys) for frame in (date_info, num_info, cat_info)
)
In [187]:
# Same peek at the date lookup, now indexed by (line, station, feature_nr).
date_info.head()
Out[187]:
In [188]:
# Give the remaining column of each lookup a type-specific name so the two do
# not collide when merged into date_info. Assigning a one-element list only
# succeeds if exactly one non-index column is left in each frame.
for frame, label in ((num_info, 'name_num'), (cat_info, 'name_cat')):
    frame.columns = [label]
In [189]:
# Check the numeric lookup: indexed by (line, station, feature_nr) with the
# single column 'name_num'.
num_info.head()
Out[189]:
In [190]:
# Outer-join the categorical and numeric lookups onto the date lookup using
# the shared (line, station, feature_nr) index. 'outer' keeps features that
# have no date column as well as date columns without a matching feature.
# NOTE: this rebinds date_info, so the cell is not meant to be run twice
# without re-running the cells above it.
outer_on_index = dict(how='outer', left_index=True, right_index=True)
date_info = (
    date_info
    .merge(cat_info, **outer_on_index)
    .merge(num_info, **outer_on_index)
    .reset_index()
)

# Random spot check of the combined table (unseeded, so rows differ per run).
date_info.sample(10)
Out[190]:
In [192]:
# First 20 rows of the combined lookup: date, categorical and numeric feature
# names side by side, one row per (line, station, feature_nr).
date_info.head(20)
Out[192]:
In [198]:
def _number_names(lut, kind):
    """Number the non-null ``name_<kind>`` entries of ``lut`` from 0 upwards.

    Rows are numbered in the order they appear in ``lut``. Returns a new
    two-column frame ``['col_<kind>', 'name_<kind>']``.
    """
    name_col = 'name_' + kind
    has_name = lut[name_col].notnull()
    numbered = lut.loc[has_name, [name_col]].reset_index(drop=True).reset_index()
    numbered.columns = ['col_' + kind, name_col]
    return numbered


# One position table per feature type, built from the same combined lookup.
df_dat = _number_names(date_info, 'dat')
df_num = _number_names(date_info, 'num')
df_cat = _number_names(date_info, 'cat')

# Attach the positions to the lookup. Left joins keep every lookup row; rows
# without a name of a given type end up with a missing col_<type>.
for positions, key in ((df_dat, 'name_dat'), (df_num, 'name_num'), (df_cat, 'name_cat')):
    date_info = date_info.merge(positions, how='left', on=key)
In [199]:
# Inspect the newly attached col_dat / col_num / col_cat position columns.
date_info.head(5)
Out[199]:
In [200]:
# Feature number L3_S37_D3942 is wrongly labeled.
# However, if you look at the timestamp values, it belongs to something else...
# So the categorical feature carries the wrong label: L3_S36_F3941...
In [201]:
# Feature number 3941 shows up on two rows of the lookup: the station-37 row
# holds the date column, while the station-36 row holds the categorical
# feature 'L3_S36_F3941'. Fold both into the station-37 row.
is_3941 = date_info.feature_nr == 3941
date_row = is_3941 & (date_info.station == 37)
stale_row = is_3941 & (date_info.station == 36)

date_info.loc[date_row, 'name_cat'] = 'L3_S36_F3941'

if date_row.any() and stale_row.any():
    # col_cat was attached before this fix, so the position of the categorical
    # feature lives only on the station-36 row. Carry it over before dropping
    # that row; otherwise the saved lookup has a categorical name without a
    # column position.
    date_info.loc[date_row, 'col_cat'] = date_info.loc[stale_row, 'col_cat'].iloc[0]
    # Drop the now-redundant row only when its replacement exists, so the
    # categorical feature is never removed from the lookup altogether.
    date_info.drop(date_info[stale_row].index, axis=0, inplace=True)
In [202]:
# If all features are correctly labeled there should not be any duplicate
# feature numbers. value_counts() sorts by descending count, so every count
# shown here should be 1.
date_info.feature_nr.value_counts().head()
Out[202]:
In [203]:
# Persist the finished date-feature lookup table next to the other data files
# (row index omitted).
lut_path = os.path.join(const.DATA_PATH, 'date_feat_lut.csv')
date_info.to_csv(lut_path, index=False)
In [ ]: