In [1]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

In [2]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)


Out[2]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat station_V2 line_V2
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN 0.0 1.0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN 0.0 1.0
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN 0.0 1.0
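
The lookup table maps each raw feature to its line/station and to its column position in the sparse date matrix (col_dat). As a quick sanity check, the number of date columns each line contributes can be read straight off the table (a small sketch, assuming lut is loaded as above; rows with NaN col_dat have no timestamp column):

# Count of date-feature columns per production line
lut.dropna(subset=['col_dat']).groupby('line')['col_dat'].count()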

In [7]:
use_sets = 0  # 0: both, 1: train, 2: test

dat = func.load_data_file(const.TRAIN_FILES[2])
dat_train = dat['data']['features']
id_train = dat['data']['ids']

dat = func.load_data_file(const.TEST_FILES[2])

if use_sets==0:
    dat_data = vstack([dat_train, dat['data']['features']], format='csr')
    ids = pd.concat([id_train, dat['data']['ids']], axis=0)
elif use_sets==1: # Only train
    dat_data = dat_train
    ids = id_train
elif use_sets==2: # Only test
    dat_data = dat['data']['features']
    ids = dat['data']['ids']
    
del dat, dat_train, id_train


Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x115a58c90>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x115a58c90>.pkl
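
func is a project-specific helper module whose implementation is not shown here. Judging from how the result is used above and from the printed output, load_data_file presumably unpickles a dict holding a scipy CSR matrix of features plus a DataFrame of Ids. A minimal sketch under that assumption (the name and structure are inferred, not the actual implementation):

import pickle

def load_data_file_sketch(path):
    # Assumed layout: {'data': {'features': <csr_matrix>, 'ids': <DataFrame with an 'Id' column>}}
    with open(path, 'rb') as f:
        print('Returning {}.pkl'.format(f))
        return pickle.load(f)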

Calculate features based on the jayjay definition


In [8]:
# Prepare dataframe with the maximum timestamp per sample for each line
df = pd.DataFrame(columns=['L0max','L1max','L2max','L3max'], index=ids.Id)

for l in range(4):
    
    # Get column numbers of sparse matrix belonging to this line
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]

    # Get maximum timestamp for this line
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    
    # In the sparse matrix, missing values are encoded as zero
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    
    # Round timestamps to two decimals
    df['L{}max'.format(l)] = df['L{}max'.format(l)].round(2)

# The first reset_index turns Id into a column; the second adds an 'index'
# column holding the original row order so it can be restored after sorting
df.reset_index(inplace=True)
df.reset_index(inplace=True)

# Sort by ID
df.sort_values(['Id'], inplace=True)

# Add columns with next and previous timestamp per line
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

df.set_index('Id', inplace=True)
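
The prev/next construction relies on the rows being sorted by Id, so that shift(1)/shift(-1) pick up the previous and next part by Id. A small self-contained illustration on toy data (not the actual dataset):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'L0max': [82.27, np.nan, 1618.73]},
                   index=pd.Index([4, 6, 7], name='Id'))
toy.reset_index(inplace=True)   # 'Id' becomes a regular column
toy.reset_index(inplace=True)   # new 'index' column records the original row order
toy.sort_values('Id', inplace=True)
toy['L0max_prev'] = toy['L0max'].shift(1)   # value of the previous Id
toy['L0max_next'] = toy['L0max'].shift(-1)  # value of the next Id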

In [9]:
# List to keep track of all the columns we generated. Useful for some operations later.
feat_cols = []

for l in range(4):
    
    # Samples are the same if they are both NA or have the same timestamp
    df['sameL{}_next'.format(l)] = 1 * (df['L{}max'.format(l)]==df['L{}max_next'.format(l)]).astype(int) + \
                              1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_next'.format(l)].isnull())).astype(int)
    
    # Samples are the same if they are both NA or have the same timestamp
    df['sameL{}_prev'.format(l)] = 1 * (df['L{}max'.format(l)]==df['L{}max_prev'.format(l)]).astype(int) + \
                              1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_prev'.format(l)].isnull())).astype(int)
        
    feat_cols += ['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)]
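
In this jayjay encoding the flag is 1 both when the two neighbouring parts share exactly the same maximum timestamp and when both have no timestamp on that line, and 0 otherwise. A quick check on toy values:

import numpy as np
import pandas as pd

a = pd.Series([82.27, np.nan, 100.0])   # this sample's line max
b = pd.Series([82.27, np.nan, 200.0])   # neighbour's line max
same = 1 * (a == b).astype(int) + 1 * (a.isnull() & b.isnull()).astype(int)
# same -> [1, 1, 0]

The variants further below use a factor of 2 for the equal-timestamp case, so the resulting 2/1/0 values keep the two "same" situations distinguishable.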

In [11]:
# Restore the original row order
df.sort_values('index', inplace=True)

In [25]:
df.head()


Out[25]:
index L1.0_V2_MAX L2.0_V2_MAX L3.1_V2_MAX L3.2_V2_MAX L3.3_V2_MAX L4.1_V2_MAX L4.0_V2_MAX L4.2_V2_MAX L4.3_V2_MAX ... sameL4.3_V2_next sameL4.3_V2_prev sameL4.4_V2_next sameL4.4_V2_prev sameL5.0_V2_next sameL5.0_V2_prev sameL6.0_V2_next sameL6.0_V2_prev sameL7.0_V2_next sameL7.0_V2_prev
Id
4 0 82.269997 NaN NaN NaN NaN NaN NaN NaN NaN ... 1 1 1 1 0 0 0 0 1 1
6 1 NaN 1313.150024 NaN NaN NaN NaN NaN NaN NaN ... 1 1 1 1 1 0 0 0 1 1
7 2 1618.729980 NaN NaN NaN NaN NaN NaN NaN NaN ... 1 1 1 1 0 1 0 0 1 1
9 3 1149.219971 NaN NaN NaN NaN NaN NaN NaN NaN ... 1 1 1 1 1 0 0 0 1 1
11 4 602.669983 NaN NaN NaN NaN NaN NaN NaN NaN ... 1 1 1 1 1 1 0 0 1 1

5 rows × 70 columns


In [13]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_{}.csv'.format(use_sets)), index_label='ID')

Calculate features based on the base line definition


In [32]:
# First get max per line for all train and test samples
df = pd.DataFrame(columns=['L0max','L1max','L2max','L3max'], index=ids.Id)
for l in range(4):
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]

    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    df['L{}max'.format(l)] = df['L{}max'.format(l)].round(2)  # round timestamps to two decimals

# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)

# Sort by ID
df.sort_values(['Id'], inplace=True)

for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

df.set_index('Id', inplace=True)

In [33]:
feat_cols = []

for l in range(4):
    # 2: neighbour has the same timestamp, 1: both have no timestamp on this line, 0: different
    df['sameL{}_next'.format(l)] = 2 * (df['L{}max'.format(l)]==df['L{}max_next'.format(l)]).astype(int) + \
                              1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_next'.format(l)].isnull())).astype(int)
        
    df['sameL{}_prev'.format(l)] = 2 * (df['L{}max'.format(l)]==df['L{}max_prev'.format(l)]).astype(int) + \
                              1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_prev'.format(l)].isnull())).astype(int)
        
    feat_cols += ['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)]

In [34]:
df.sort_values('index', inplace=True)

In [35]:
feat_cols


Out[35]:
['sameL0_prev',
 'sameL0_next',
 'sameL1_prev',
 'sameL1_next',
 'sameL2_prev',
 'sameL2_next',
 'sameL3_prev',
 'sameL3_next']

In [17]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_new_{}.csv'.format(use_sets)), index_label='ID')

Calculate features based on the new line (line_V2) definition


In [18]:
line_V2s = lut['line_V2'].unique()
print line_V2s


[ 1.   2.   3.1  3.2  3.3  4.1  4.   4.2  4.3  4.4  5.   6.   7. ]
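
The line_V2 grouping uses a finer line numbering (e.g. the 3.1-3.3 and 4.1-4.4 sub-lines). Which stations fall under each V2 line can be inspected directly from the lookup table (a quick sketch using the lut loaded above):

# Stations covered by each V2 line
lut.groupby('line_V2')['station'].unique()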

In [19]:
# First get max per line for all train and test samples
df = pd.DataFrame(columns=['L{}_V2_MAX'.format(x) for x in line_V2s], index=ids.Id)
for l in line_V2s:
    col_date = [int(i) for i in lut[lut['line_V2']==l].col_dat.values if not np.isnan(i)]

    df['L{}_V2_MAX'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    
    df['L{}_V2_MAX'.format(l)].replace(0, np.nan, inplace=True)

# Get row index as column to restore the original order afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)

# Sort by ID
df.sort_values(['Id'], inplace=True)

for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

df.set_index('Id', inplace=True)

In [20]:
feat_cols = []

for l in line_V2s:
    df['sameL{}_V2_next'.format(l)] = 2 * (df['L{}_V2_MAX'.format(l)]==df['L{}_V2_MAX_next'.format(l)]).astype(int) + \
                              1 * ((df['L{}_V2_MAX'.format(l)].isnull()) & (df['L{}_V2_MAX_next'.format(l)].isnull())).astype(int)
        
    df['sameL{}_V2_prev'.format(l)] = 2 * (df['L{}_V2_MAX'.format(l)]==df['L{}_V2_MAX_prev'.format(l)]).astype(int) + \
                              1 * ((df['L{}_V2_MAX'.format(l)].isnull()) & (df['L{}_V2_MAX_prev'.format(l)].isnull())).astype(int)
        
    feat_cols += ['sameL{}_V2_prev'.format(l), 'sameL{}_V2_next'.format(l)]

In [21]:
df.sort_values('index', inplace=True)

In [22]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_V2_same_L_new_{}.csv'.format(use_sets)), index_label='ID')
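
The three feature sets written above can later be joined back together on the Id column. A hedged usage sketch (file names as generated above for use_sets = 0; const.DATA_PATH assumed to point at the same folder):

import os
import pandas as pd

import const

feat_files = ['feat_set_jayjay_same_L_0.csv',
              'feat_set_jayjay_same_L_new_0.csv',
              'feat_set_V2_same_L_new_0.csv']

feats = [pd.read_csv(os.path.join(const.DATA_PATH, f), index_col='ID') for f in feat_files]
all_feats = pd.concat(feats, axis=1)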