Create a lookup table between date column index and other indices

(Excluding ID / Reponse columns)


In [182]:
import os
import re
import pickle
import time

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix

%matplotlib inline

# Custom modules
import const
import func

Load data


In [183]:
const.TRAIN_FILES


Out[183]:
['train_numeric', 'train_categorical_to_num', 'train_date']

In [184]:
[num_info, cat_info, date_info] = [func.get_station_info(f) for f in const.TRAIN_FILES]

# Date features refer to numeric/categorical features
date_info['ref_feat_nr'] = date_info['feature_nr'] - 1
date_info.columns = ['line','station','feat_nr_dat', 'name_dat','feature_nr']

In [185]:
date_info.head()


Out[185]:
line station feat_nr_dat name_dat feature_nr
0 0 0 1 L0_S0_D1 0
1 0 0 3 L0_S0_D3 2
2 0 0 5 L0_S0_D5 4
3 0 0 7 L0_S0_D7 6
4 0 0 9 L0_S0_D9 8

In [186]:
date_info.set_index(['line','station','feature_nr'], inplace=True)
num_info.set_index(['line','station','feature_nr'], inplace=True)
cat_info.set_index(['line','station','feature_nr'], inplace=True)

Date info LUT


In [187]:
date_info.head()


Out[187]:
feat_nr_dat name_dat
line station feature_nr
0 0 0 1 L0_S0_D1
2 3 L0_S0_D3
4 5 L0_S0_D5
6 7 L0_S0_D7
8 9 L0_S0_D9

In [188]:
num_info.columns = ['name_num']
cat_info.columns = ['name_cat']

In [189]:
num_info.head()


Out[189]:
name_num
line station feature_nr
0 0 0 L0_S0_F0
2 L0_S0_F2
4 L0_S0_F4
6 L0_S0_F6
8 L0_S0_F8

In [190]:
date_info = date_info.merge(cat_info,
                            how='outer',
                            left_index=True,
                            right_index=True). \
                      merge(num_info, 
                            how='outer',
                            left_index=True,
                            right_index=True). \
                      reset_index()
date_info.sample(10)


Out[190]:
line station feature_nr feat_nr_dat name_dat name_cat name_num
1408 1 25 1881 NaN NaN NaN L1_S25_F1881
1629 1 25 2167 NaN NaN NaN L1_S25_F2167
2797 3 30 3777 NaN NaN L3_S30_F3777 NaN
2871 3 34 3878 3879.0 L3_S34_D3879 NaN L3_S34_F3878
1118 1 24 1486 NaN NaN NaN L1_S24_F1486
2170 1 25 2918 NaN NaN L1_S25_F2918 NaN
494 0 23 671 NaN NaN NaN L0_S23_F671
810 1 24 1084 1085.0 L1_S24_D1085 L1_S24_F1084 NaN
2927 3 38 3959 NaN NaN L3_S38_F3959 NaN
3070 3 47 4164 4165.0 L3_S47_D4165 L3_S47_F4164 NaN

In [192]:
date_info.head(20)


Out[192]:
line station feature_nr feat_nr_dat name_dat name_cat name_num
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4
3 0 0 6 7.0 L0_S0_D7 NaN L0_S0_F6
4 0 0 8 9.0 L0_S0_D9 NaN L0_S0_F8
5 0 0 10 11.0 L0_S0_D11 NaN L0_S0_F10
6 0 0 12 13.0 L0_S0_D13 NaN L0_S0_F12
7 0 0 14 15.0 L0_S0_D15 NaN L0_S0_F14
8 0 0 16 17.0 L0_S0_D17 NaN L0_S0_F16
9 0 0 18 19.0 L0_S0_D19 NaN L0_S0_F18
10 0 0 20 21.0 L0_S0_D21 NaN L0_S0_F20
11 0 0 22 23.0 L0_S0_D23 NaN L0_S0_F22
12 0 1 24 NaN NaN NaN L0_S1_F24
13 0 1 25 26.0 L0_S1_D26 L0_S1_F25 NaN
14 0 1 27 NaN NaN L0_S1_F27 NaN
15 0 1 28 NaN NaN NaN L0_S1_F28
16 0 1 29 30.0 L0_S1_D30 L0_S1_F29 NaN
17 0 1 31 NaN NaN L0_S1_F31 NaN
18 0 2 32 NaN NaN NaN L0_S2_F32
19 0 2 33 34.0 L0_S2_D34 L0_S2_F33 NaN

Insert column numbers


In [198]:
df_dat = date_info.loc[~date_info.name_dat.isnull(),['name_dat']].reset_index(drop=True).reset_index(drop=False)
df_dat.columns = ['col_dat', 'name_dat']

df_num = date_info.loc[~date_info.name_num.isnull(),['name_num']].reset_index(drop=True).reset_index(drop=False)
df_num.columns = ['col_num', 'name_num']

df_cat = date_info.loc[~date_info.name_cat.isnull(),['name_cat']].reset_index(drop=True).reset_index(drop=False)
df_cat.columns = ['col_cat', 'name_cat']

date_info = date_info.merge(df_dat, how='left', on='name_dat') \
                     .merge(df_num, how='left', on='name_num') \
                     .merge(df_cat, how='left', on='name_cat')

In [199]:
date_info.head(5)


Out[199]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN
3 0 0 6 7.0 L0_S0_D7 NaN L0_S0_F6 3.0 3.0 NaN
4 0 0 8 9.0 L0_S0_D9 NaN L0_S0_F8 4.0 4.0 NaN
5 0 0 10 11.0 L0_S0_D11 NaN L0_S0_F10 5.0 5.0 NaN
6 0 0 12 13.0 L0_S0_D13 NaN L0_S0_F12 6.0 6.0 NaN
7 0 0 14 15.0 L0_S0_D15 NaN L0_S0_F14 7.0 7.0 NaN
8 0 0 16 17.0 L0_S0_D17 NaN L0_S0_F16 8.0 8.0 NaN
9 0 0 18 19.0 L0_S0_D19 NaN L0_S0_F18 9.0 9.0 NaN
10 0 0 20 21.0 L0_S0_D21 NaN L0_S0_F20 10.0 10.0 NaN
11 0 0 22 23.0 L0_S0_D23 NaN L0_S0_F22 11.0 11.0 NaN
12 0 1 24 NaN NaN NaN L0_S1_F24 NaN 12.0 NaN
13 0 1 25 26.0 L0_S1_D26 L0_S1_F25 NaN 12.0 NaN 0.0
14 0 1 27 NaN NaN L0_S1_F27 NaN NaN NaN 1.0
15 0 1 28 NaN NaN NaN L0_S1_F28 NaN 13.0 NaN
16 0 1 29 30.0 L0_S1_D30 L0_S1_F29 NaN 13.0 NaN 2.0
17 0 1 31 NaN NaN L0_S1_F31 NaN NaN NaN 3.0
18 0 2 32 NaN NaN NaN L0_S2_F32 NaN 14.0 NaN
19 0 2 33 34.0 L0_S2_D34 L0_S2_F33 NaN 14.0 NaN 4.0
20 0 2 35 NaN NaN L0_S2_F35 NaN NaN NaN 5.0
21 0 2 36 NaN NaN NaN L0_S2_F36 NaN 15.0 NaN
22 0 2 37 38.0 L0_S2_D38 L0_S2_F37 NaN 15.0 NaN 6.0
23 0 2 39 NaN NaN L0_S2_F39 NaN NaN NaN 7.0
24 0 2 40 NaN NaN NaN L0_S2_F40 NaN 16.0 NaN
25 0 2 41 42.0 L0_S2_D42 L0_S2_F41 NaN 16.0 NaN 8.0
26 0 2 43 NaN NaN L0_S2_F43 NaN NaN NaN 9.0
27 0 2 44 NaN NaN NaN L0_S2_F44 NaN 17.0 NaN
28 0 2 45 46.0 L0_S2_D46 L0_S2_F45 NaN 17.0 NaN 10.0
29 0 2 47 NaN NaN L0_S2_F47 NaN NaN NaN 11.0
30 0 2 48 NaN NaN NaN L0_S2_F48 NaN 18.0 NaN
31 0 2 49 50.0 L0_S2_D50 L0_S2_F49 NaN 18.0 NaN 12.0
32 0 2 51 NaN NaN L0_S2_F51 NaN NaN NaN 13.0
33 0 2 52 NaN NaN NaN L0_S2_F52 NaN 19.0 NaN
34 0 2 53 54.0 L0_S2_D54 L0_S2_F53 NaN 19.0 NaN 14.0
35 0 2 55 NaN NaN L0_S2_F55 NaN NaN NaN 15.0
36 0 2 56 NaN NaN NaN L0_S2_F56 NaN 20.0 NaN
37 0 2 57 58.0 L0_S2_D58 L0_S2_F57 NaN 20.0 NaN 16.0
38 0 2 59 NaN NaN L0_S2_F59 NaN NaN NaN 17.0
39 0 2 60 NaN NaN NaN L0_S2_F60 NaN 21.0 NaN

Update wrong numeric features


In [200]:
# Feature number L3_S37_D3942 is wrongly labeled
# However, if you look at the timestamp values it belongs the something else...
# So the category label has the wrong label: L3_S36_F3941...

In [201]:
date_info.loc[(date_info.feature_nr==3941) & (date_info.station==37),['name_cat']] = ['L3_S36_F3941']
date_info.drop( date_info[(date_info.feature_nr==3941) & (date_info.station==36)].index, axis=0, inplace=True)

In [202]:
# If all are correctly labeled there shouldn't not be any duplicates
date_info.feature_nr.value_counts().head()


Out[202]:
2047    1
1350    1
3395    1
1346    1
1344    1
Name: feature_nr, dtype: int64

Save look-up table


In [203]:
date_info.to_csv(os.path.join(const.DATA_PATH,'date_feat_lut.csv'), index=False)

In [ ]: