In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)
Out[2]:
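The original look-up table is not reproduced here; judging from how it is used below, these cells only rely on the columns `line`, `line_V2`, and `col_dat` (the column index of the corresponding date feature in the sparse matrix, NaN if there is none). A hypothetical stand-in with made-up values, just to show the expected shape:

# Hypothetical stand-in for the look-up table (made-up values; the real file comes from const.LOOK_UP_TABLE)
lut_example = pd.DataFrame({
    'line':    [0, 0, 1, 2, 3],         # production line of each feature
    'line_V2': [1, 1, 2, 3, 4],         # finer-grained line split used further below
    'col_dat': [0, 5, 12, np.nan, 20],  # column index of the date feature in the sparse matrix (NaN = none)
})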
In [7]:
use_sets = 0  # 0: both, 1: train, 2: test
dat = func.load_data_file(const.TRAIN_FILES[2])
dat_train = dat['data']['features']
id_train = dat['data']['ids']
dat = func.load_data_file(const.TEST_FILES[2])
if use_sets == 0:  # Both train and test
    dat_data = vstack([dat_train, dat['data']['features']], format='csr')
    ids = pd.concat([id_train, dat['data']['ids']], axis=0)
elif use_sets == 1:  # Only train
    dat_data = dat_train
    ids = id_train
elif use_sets == 2:  # Only test
    dat_data = dat['data']['features']
    ids = dat['data']['ids']
del dat, dat_train, id_train
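`func.load_data_file` is a project-specific helper that is not shown here; the cell above only assumes it returns a nested dict with a sparse feature matrix and the matching sample IDs. A quick sanity check on that assumed row alignment:

# Assumed structure of the dict returned by func.load_data_file (inferred from its use above):
#   dat['data']['features'] -> scipy.sparse.csr_matrix, one row per sample
#   dat['data']['ids']      -> pd.DataFrame with an 'Id' column, row-aligned with the features
# Sanity check: the stacked feature matrix and the concatenated IDs must have the same number of rows
assert dat_data.shape[0] == len(ids)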
In [8]:
# Prepare dataframe with the maximum timestamp per sample and production line
df = pd.DataFrame(columns=['L0max', 'L1max', 'L2max', 'L3max'], index=ids.Id)
for l in range(4):
    # Get column numbers of the sparse matrix belonging to this line
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]
    # Get maximum timestamp for this line
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    # Because of the sparse matrix format, missing values were encoded as zero
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    # Round timestamps to two decimals to avoid spurious float differences
    df['L{}max'.format(l)] = df['L{}max'.format(l)].round(2)
# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)
# Sort by ID
df.sort_values(['Id'], inplace=True)
# Add columns with next and previous timestamp per line
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)
df.set_index('Id', inplace=True)
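The idea behind the prev/next columns: after sorting by Id, `shift(1)` and `shift(-1)` put each sample next to its immediate neighbours in Id order, so consecutive parts that passed a line at the exact same time end up side by side. A tiny toy illustration with made-up Ids and timestamps:

# Toy illustration of the prev/next trick (hypothetical Ids and timestamps)
toy = pd.DataFrame({'L0max': [82.24, 82.24, np.nan]}, index=pd.Index([4, 6, 7], name='Id'))
toy['L0max_prev'] = toy['L0max'].shift(1)   # timestamp of the previous Id
toy['L0max_next'] = toy['L0max'].shift(-1)  # timestamp of the next Id
print(toy)
# Ids 4 and 6 share the same L0 timestamp, so the flags computed below will mark them as "same"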
In [9]:
# List to keep track of all the generated feature columns. Useful for some operations later.
feat_cols = []
for l in range(4):
    # Samples are flagged as the same if they have the same timestamp or are both missing on this line
    df['sameL{}_next'.format(l)] = 1 * (df['L{}max'.format(l)]==df['L{}max_next'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_next'.format(l)].isnull())).astype(int)
    df['sameL{}_prev'.format(l)] = 1 * (df['L{}max'.format(l)]==df['L{}max_prev'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_prev'.format(l)].isnull())).astype(int)
    feat_cols += ['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)]
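Note that the two indicator terms are mutually exclusive (a missing timestamp never compares equal, not even to another missing one), so with both weights at 1 these flags are binary: 1 means the neighbouring sample looks identical on that line, 0 means it does not. A quick way to inspect how often that happens:

# Distribution of the binary same-line flags (how often neighbours share a line timestamp)
for col in feat_cols:
    print('{}: {}'.format(col, df[col].value_counts(dropna=False).to_dict()))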
In [11]:
# Restore the original (pre-sort) row order so the features line up with the sparse matrix rows
df.sort_values('index', inplace=True)
In [25]:
df.head()
Out[25]:
In [13]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_{}.csv'.format(use_sets)), index_label='ID')
In [32]:
# First get the maximum timestamp per line for all train and test samples
df = pd.DataFrame(columns=['L0max', 'L1max', 'L2max', 'L3max'], index=ids.Id)
for l in range(4):
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    df['L{}max'.format(l)] = df['L{}max'.format(l)].round(2)
# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)
# Sort by ID
df.sort_values(['Id'], inplace=True)
# Add columns with next and previous timestamp per line
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)
df.set_index('Id', inplace=True)
In [33]:
feat_cols = []
for l in range(4):
    # Three-state encoding: 2 = identical timestamp, 1 = both missing, 0 = different
    df['sameL{}_next'.format(l)] = 2 * (df['L{}max'.format(l)]==df['L{}max_next'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_next'.format(l)].isnull())).astype(int)
    df['sameL{}_prev'.format(l)] = 2 * (df['L{}max'.format(l)]==df['L{}max_prev'.format(l)]).astype(int) + \
                                   1 * ((df['L{}max'.format(l)].isnull()) & (df['L{}max_prev'.format(l)].isnull())).astype(int)
    feat_cols += ['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)]
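Compared with the first version above, the equality term now has weight 2, so each flag carries three states instead of two: 0 = different (or exactly one timestamp missing), 1 = both timestamps missing on that line, 2 = both timestamps identical. A quick check of the resulting distribution:

# Per-column counts of the three flag values (0 = different, 1 = both missing, 2 = identical)
print(df[feat_cols].apply(pd.Series.value_counts).fillna(0).astype(int))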
In [34]:
# Restore the original (pre-sort) row order so the features line up with the sparse matrix rows
df.sort_values('index', inplace=True)
In [35]:
feat_cols
Out[35]:
In [17]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_new_{}.csv'.format(use_sets)), index_label='ID')
In [18]:
line_V2s = lut['line_V2'].unique()
print(line_V2s)
In [19]:
# Get the maximum timestamp per line_V2 segment for all train and test samples
df = pd.DataFrame(columns=['L{}_V2_MAX'.format(x) for x in line_V2s], index=ids.Id)
for l in line_V2s:
    col_date = [int(i) for i in lut[lut['line_V2']==l].col_dat.values if not np.isnan(i)]
    df['L{}_V2_MAX'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    df['L{}_V2_MAX'.format(l)].replace(0, np.nan, inplace=True)
# Get row index as column to check sorting afterwards
df.reset_index(inplace=True)
df.reset_index(inplace=True)
# Sort by ID
df.sort_values(['Id'], inplace=True)
# Add columns with next and previous timestamp per line_V2 segment
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)
df.set_index('Id', inplace=True)
In [20]:
feat_cols = []
for l in line_V2s:
    # Same three-state encoding as above: 2 = identical timestamp, 1 = both missing, 0 = different
    df['sameL{}_V2_next'.format(l)] = 2 * (df['L{}_V2_MAX'.format(l)]==df['L{}_V2_MAX_next'.format(l)]).astype(int) + \
                                      1 * ((df['L{}_V2_MAX'.format(l)].isnull()) & (df['L{}_V2_MAX_next'.format(l)].isnull())).astype(int)
    df['sameL{}_V2_prev'.format(l)] = 2 * (df['L{}_V2_MAX'.format(l)]==df['L{}_V2_MAX_prev'.format(l)]).astype(int) + \
                                      1 * ((df['L{}_V2_MAX'.format(l)].isnull()) & (df['L{}_V2_MAX_prev'.format(l)].isnull())).astype(int)
    feat_cols += ['sameL{}_V2_prev'.format(l), 'sameL{}_V2_next'.format(l)]
In [21]:
# Restore the original (pre-sort) row order so the features line up with the sparse matrix rows
df.sort_values('index', inplace=True)
In [22]:
df[feat_cols].to_csv(os.path.join(const.DATA_PATH, 'feat_set_V2_same_L_new_{}.csv'.format(use_sets)), index_label='ID')