In [3]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

Load data


In [2]:
print const.TRAIN_FILES


['train_numeric', 'train_categorical_to_num', 'train_date']

In [57]:
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)


Out[57]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat station_V2 line_V2
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN 0.0 1.0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN 0.0 1.0
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN 0.0 1.0

In [127]:
dat = func.load_data_file(const.TRAIN_FILES[2])
dat_train = dat['data']['features']
id_train = dat['data']['ids']

dat = func.load_data_file(const.TEST_FILES[2])

dat_data = vstack([dat_train, dat['data']['features']], format='csr')
ids = pd.concat([id_train, dat['data']['ids']], axis=0)


Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x115eba540>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x115eba540>.pkl

In [128]:
print dat_data.shape
print ids.shape
ids.head(3)


(2367495, 1156)
(2367495, 1)
Out[128]:
Id
0 4
1 6
2 7

In [20]:
dat_our = pd.read_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[2] + '.csv'), nrows=1000)
dat_our.head(3)


Out[20]:
Id L0_S0_D1 L0_S0_D3 L0_S0_D5 L0_S0_D7 L0_S0_D9 L0_S0_D11 L0_S0_D13 L0_S0_D15 L0_S0_D17 ... L3_S50_D4246 L3_S50_D4248 L3_S50_D4250 L3_S50_D4252 L3_S50_D4254 L3_S51_D4255 L3_S51_D4257 L3_S51_D4259 L3_S51_D4261 L3_S51_D4263
0 4 82.24 82.24 82.24 82.24 82.24 82.24 82.24 82.24 82.24 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 6 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 7 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 1618.70 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3 rows × 1157 columns


In [162]:
# Load jayjay's features
#dat_jay = pd.read_csv('data_jayjay/train.csv', nrows=1000)
dat_jay = pd.read_csv('data_jayjay/train.csv')
#cat_cols = cat_jay.filter(like='CATEGORICAL').columns
#cat_jay = cat_jay[cat_cols]
#print cat_jay.shape
#cat_jay.head(3)

In [50]:
# On all samples
print (dat_our.iloc[:,1:].kurtosis(axis=1)-dat_jay['Kurtosis']).mean()
print (dat_our.iloc[:,1:].max(axis=1)!=dat_jay['Max']).sum() # Nans are different
print (dat_our.iloc[:,1:].min(axis=1)!=dat_jay['Min']).sum() # Nans are different
print (dat_our.iloc[:,1:].mean(axis=1)-dat_jay['Mean']).mean()
print (dat_our.iloc[:,1:].apply(lambda x: x.nunique(), axis=1)!=dat_jay['Unique count']).sum()


4.46731303101e-14
1
1
5.62173829899e-11
0

In [87]:
a=pd.Series([1,np.nan,2])
b=pd.Series([1,np.nan,2])

In [88]:
compare(a,b)


Out[88]:
True

In [168]:
def compare(x, y):
    '''Compare two pandas Series for equality.

    Returns True when the two Series are identical (``Series.equals``
    treats NaNs in the same positions as equal). Otherwise returns a
    short summary string of the elementwise difference.

    Note: the mean/sum of (x - y) skip NaN positions, so rows where only
    one side is NaN do not show up in the reported difference.
    '''
    if x.equals(y):
        return True
    # Not equal: report the magnitude of the elementwise difference.
    # (The original had an unreachable `return False` after this point.)
    diff = x - y
    return 'Different: mean: {} sum: {}'.format(diff.mean(), diff.sum())

In [113]:
# Line 0
line=0
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 0')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L0_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L0_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].mean(axis=1), dat_jay['L0_Mean'])
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L0_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L0_Unique count'])
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['DATE_L0max']) # Nans are different


Line 0
True
True
8.2864247259e-11
-1.4875440359e-14
True
True

In [ ]:
# Line 1
line=1
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 1')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L1_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L1_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L1_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L1_Unique count'])
print compare(dat_our.iloc[:,col_date].kurtosis(axis=1), dat_jay['DATE_L1kurt'])

In [111]:
# Line 2
line=2
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 2')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L2_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L2_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L2_Range'])


Line 2
True
True
True

In [115]:
# Line 3
line=3
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]

print('Line 3')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L3_Max']) 
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L3_Min']) 
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L3_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L3_Unique count'])
print compare(dat_our.iloc[:,col_date].kurtosis(axis=1), dat_jay['DATE_L3kurt'])


Line 3
True
True
-1.23021153797e-14
True
-1.89971991518e-13

In [116]:
# Between lines and stations
col_dateL0 = [int(i)+1 for i in lut[lut['line']==0].col_dat.values if not np.isnan(i)]
col_dateL2 = [int(i)+1 for i in lut[lut['line']==2].col_dat.values if not np.isnan(i)]
col_dateL3 = [int(i)+1 for i in lut[lut['line']==3].col_dat.values if not np.isnan(i)]
col_dateS37 = [int(i)+1 for i in lut[lut['station']==37].col_dat.values if not np.isnan(i)]
col_dateS26 = [int(i)+1 for i in lut[lut['station']==26].col_dat.values if not np.isnan(i)]
col_dateS30 = [int(i)+1 for i in lut[lut['station']==30].col_dat.values if not np.isnan(i)]
col_dateS34 = [int(i)+1 for i in lut[lut['station']==34].col_dat.values if not np.isnan(i)]

print('Line 3')
print compare(dat_our.iloc[:,col_dateL0].max(axis=1) - dat_our.iloc[:,col_dateL3].max(axis=1), dat_jay['L0max_L3max'])
print compare(dat_our.iloc[:,col_dateL2].max(axis=1) - dat_our.iloc[:,col_dateS37].max(axis=1), dat_jay['L2max_S37max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateL0].min(axis=1), dat_jay['L3max_L0min'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS26].max(axis=1), dat_jay['L3max_S26max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS30].max(axis=1), dat_jay['L3max_S30max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS34].min(axis=1), dat_jay['L3max_S34min'])


Line 3
-1.01295843022e-17
-5.11415987649e-16
2.70646024476e-15
2.45490308551e-16
-8.93419814403e-15
1.38480289532e-15

In [132]:
max(col_date)


Out[132]:
1156

In [225]:
# Same station

# First get max per line for all train and test samples
df = pd.DataFrame(columns=['L0max','L1max','L2max','L3max'], index=ids.Id)
for l in range(4):
    # Column positions of this line's date features. No +1 offset here:
    # dat_data is the sparse feature matrix without the leading Id column
    # (unlike the raw CSV used earlier, where +1 was needed).
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]

    # Row-wise max over the sparse date columns, densified to a 1-d array.
    df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
    
    # Zero encodes "no date recorded" in the sparse matrix; map it to NaN.
    df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
    # NOTE(review): the result of .round(2) is discarded (not assigned
    # back), so this line is a no-op — confirm whether rounding was
    # intended; assigning it back would change the comparisons below.
    df['L{}max'.format(l)].round(2)

# Keep the original row position so sorting can be checked afterwards:
# the first reset turns Id into a column, the second adds the row index.
df.reset_index(inplace=True)
df.reset_index(inplace=True)

In [226]:
# Sort by ID
df.sort_values(['Id'], inplace=True)

In [227]:
# For every column, record the value of the previous / next row
# (rows were sorted by Id in the preceding cell).
for col in df.columns:
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

# Use only train id
# (features were built on train+test stacked; keep only train rows now)
df = df[df['Id'].isin(id_train.Id)]

In [228]:
df.head()


Out[228]:
index Id L0max L1max L2max L3max index_prev index_next Id_prev Id_next L0max_prev L0max_next L1max_prev L1max_next L2max_prev L2max_next L3max_prev L3max_next
0 0 4 82.269997 NaN NaN 87.290001 1183749.0 1183750.0 3.0 5.0 NaN 255.490005 671.950012 NaN 704.109985 255.500000 711.080017 256.279999
1 1 6 1313.150024 NaN NaN 1315.750000 1183750.0 2.0 5.0 7.0 255.490005 1618.729980 NaN NaN 255.500000 NaN 256.279999 1624.420044
2 2 7 1618.729980 NaN NaN 1624.420044 1.0 1183751.0 6.0 8.0 1313.150024 NaN NaN 743.400024 NaN 760.929993 1315.750000 770.280029
3 3 9 1149.219971 NaN NaN 1154.160034 1183751.0 1183752.0 8.0 10.0 NaN 907.369995 743.400024 NaN 760.929993 NaN 770.280029 911.289978
4 4 11 602.669983 NaN NaN 606.020020 1183752.0 1183753.0 10.0 12.0 907.369995 602.669983 NaN NaN NaN NaN 911.289978 623.280029

In [229]:
# Now compare
print('Line 0')
print compare(((df['L0max']==df['L0max_prev']) | (df['L0max'].isnull()) & (df['L0max_prev'].isnull())).astype(int), 
              dat_jay['sameL0'])
print compare(((df['L0max']==df['L0max_next']) | (df['L0max'].isnull()) & (df['L0max_next'].isnull())).astype(int), 
              dat_jay['sameL0 (#1)'])
print('Line 1')
print compare(((df['L1max']==df['L1max_prev']) | (df['L1max'].isnull()) & (df['L1max_prev'].isnull())).astype(int), 
              dat_jay['sameL1'])
print compare(((df['L1max']==df['L1max_next']) | (df['L1max'].isnull()) & (df['L1max_next'].isnull())).astype(int), 
              dat_jay['sameL1 (#1)'])
print('Line 2')
print compare(((df['L2max']==df['L2max_prev']) | (df['L2max'].isnull()) & (df['L2max_prev'].isnull())).astype(int), 
              dat_jay['sameL2'])
print compare(((df['L2max']==df['L2max_next']) | (df['L2max'].isnull()) & (df['L2max_next'].isnull())).astype(int), 
              dat_jay['sameL2 (#1)'])
print('Line 3')
print compare(((df['L3max']==df['L3max_prev']) | (df['L3max'].isnull()) & (df['L3max_prev'].isnull())).astype(int), 
              dat_jay['sameL3'])
print compare(((df['L3max']==df['L3max_next']) | (df['L3max'].isnull()) & (df['L3max_next'].isnull())).astype(int), 
              dat_jay['sameL3 (#1)'])


Line 0
True
True
Line 1
True
True
Line 2
True
True
Line 3
True
True

In [8]:
date_cols = ['Kurtosis', 'Max', 'Mean', 'Min', 'Range','Unique count',
             'L0_Max', 'L0_Mean', 'L0_Min', 'L0_Range', 'L0_Unique count', 'DATE_L0max',
             'L1_Max', 'L1_Min', 'L1_Range', 'L1_Unique count', 'DATE_L1kurt',
             'L2_Max', 'L2_Min', 'L2_Range', 
             'L3_Max', 'L3_Min', 'L3_Range', 'L3_Unique count', 'DATE_L3kurt', 'DATE_L3min',
             'L0max_L3max', 'L2max_S37max', 'L3max_L0min', 'L3max_S26max', 'L3max_S30max', 'L3max_S34min',
             'sameL0', 'sameL0 (#1)', 'sameL1', 'sameL1 (#1)', 'sameL2', 'sameL2 (#1)', 'sameL3', 'sameL3 (#1)',
'L1_L1_Missing value count',
'L3_L3_Missing value count',
'L3_L3_Unique count',]

In [9]:
jay_feat_diffs = ['S13min_S33min',
'S22max_S32min',
'S22min_S32min',
'S26min_S24min',
'S26min_S37min',
'S27min_S32min',
'S29max_S35max',
'S29min_S32min',
'S29min_S34min',
'S29min_S37min',
'S30min_S35min',
'S30min_S37min',
'S32max_S37min',
'S32min_S10min',
'S32min_S30min',
'S32min_S34min',
'S33max_S29min',
'S33min_S30min',
'S33min_S34min',
'S33min_S35min',
'S33min_S36min',
'S33min_S37min',
'S34min_S35min',
'S35min_L1min',
'S36max_S29min',
'S37min_S34min']
jay_feat_dates = [
'DATE_S0_max',
'DATE_S1_max',
'DATE_S10max',
'DATE_S10min',
'DATE_S11max',
'DATE_S13max',
'DATE_S13min',
'DATE_S18max',
'DATE_S19max',
'DATE_S2_max',
'DATE_S20max',
'DATE_S20min',
'DATE_S21max',
'DATE_S21min',
'DATE_S23max',
'DATE_S24max',
'DATE_S24min',
'DATE_S25max',
'DATE_S25min',
'DATE_S26max',
'DATE_S27max',
'DATE_S28max',
'DATE_S3_max',
'DATE_S30max',
'DATE_S32max',
'DATE_S32min',
'DATE_S33max',
'DATE_S34max',
'DATE_S35max',
'DATE_S36max',
'DATE_S37max',
'DATE_S38max',
'DATE_S4_max',
'DATE_S40max',
'DATE_S43max',
'DATE_S44max',
'DATE_S45min',
'DATE_S47max',
'DATE_S49max',
'DATE_S50max',
'DATE_S6_max',
'DATE_S7_max',
'DATE_S8_max',
'DATE_S8_min',
'DATE_S9_max']

In [6]:
cat_jay.filter(like='Max').columns


Out[6]:
Index([u'L0_Max', u'L3_Max', u'L1_Max', u'L2_Max', u'Max',
       u'CATEGORICAL_Max______1', u'CATEGORICAL_Max______3'],
      dtype='object')

In [8]:
cat_jay.columns[:100]


Out[8]:
Index([u'Id', u'Response', u'ratio', u'FOR100_Sum', u'L3_S33_F3857',
       u'FOR165_Sum', u'BAC165_Sum', u'BAC100_Sum', u'sameL0 (#1)',
       u'L3_S30_F3754', u'L3_S30_F3809', u'FOR165_log_lag_L2', u'L3_S30_F3744',
       u'BAC100_log_lag_L3', u'DATE_L3kurt', u'sameL3', u'sameL3 (#1)',
       u'FOR100_log_lag_L3', u'FOR165_log_lag', u'BAC165_log_lag',
       u'L3_S30_F3774', u'sameL1 (#1)', u'L3_S33_F3859', u'L3_S30_F3494',
       u'L3_S29_F3333', u'BAC165_log_lag_L3', u'L3_S30_F3759', u'L0_S1_F28',
       u'L3_S29_F3373', u'Range', u'Kurtosis', u'L3_S33_F3865', u'BAC30_Sum_S',
       u'DATE_S33max', u'FOR165_log_lag_L3', u'FOR60_Sum_S3', u'L3_S29_F3348',
       u'sameL2 (#1)', u'L0_Max', u'L3_S30_F3769', u'Response(-1)',
       u'L3_S30_F3749', u'BAC100_log_lag', u'L3_S29_F3351', u'L3_S36_F3920',
       u'FOR100_log_lag', u'BAC165_log_lag_L2', u'BAC60_Sum_S3',
       u'L3_S29_F3339', u'L3_S30_F3804', u'L3max_L0min', u'L3_S29_F3379',
       u'L3_S29_F3479', u'FOR100_log_lag_L2', u'BAC165_log_lag_L1',
       u'BAC165_log_lag_L0', u'FOR30_Sum_S', u'BAC100_log_lag_L2',
       u'L3_S29_F3342', u'L3_S29_F3327', u'L0_S2_F44', u'L3_S30_F3604',
       u'L0_S3_F100', u'L3_S30_F3534', u'L3_S29_F3354', u'L3_S30_F3574',
       u'L1_S24_F1844', u'L0_S0_F20_*_L0_S0_F20', u'L3_S30_F3639',
       u'FOR165_log_lag_L1', u'FOR100_log_lag_L1', u'FOR165_log_lag_L0',
       u'L3_S30_F3504', u'L3_S30_F3609', u'L0_S7_F138', u'L3_S29_F3321',
       u'L3_Min', u'L3_S29_F3336', u'Response(-1) (#1)', u'L0max_L3max',
       u'BAC100_log_lag_L1', u'L3_S30_F3709', u'L3_S30_F3829',
       u'FOR60_log_lag_S33', u'L3_S29_F3315', u'L0_S0_F18', u'L3_S30_F3544',
       u'L3_S29_F3330', u'L0_S1_F24', u'L0_S5_F116', u'L3_S29_F3324',
       u'CATEGORICAL_Last_____1', u'FOR60_log_lag_S30', u'L0_S0_F0',
       u'L0_S4_F109', u'L0_S9_F180', u'L3_S35_F3889', u'L1_S24_F1581',
       u'CATEGORICAL_out_out_L3_S32_F3854_class2', u'L2_Min'],
      dtype='object')

Reproduce JayJay's features


In [48]:
jay_means = cat_jay.mean()
jay_sums = cat_jay.sum()

In [95]:
def value_last_element_row(X):
    ''' Return last stored value of each row of sparse csr matrix X.

    Rows with no stored elements are returned as NaN.
    '''
    
    # Index of each row's last stored element: the entry just before the
    # next row's data starts in X.data.
    last = X.indptr[1:] - 1
    
    # Cast to float so NaN can be written below even when X.data holds an
    # integer dtype (the original raised ValueError in that case).
    output = X.data[last].astype(float)
    
    # Rows with zero stored elements indexed a neighbouring row's value
    # above; overwrite those positions with NaN.
    output[np.diff(X.indptr)==0] = np.nan
    
    return output

def max_element_row(X):
    ''' Return the maximum stored value of each row of sparse csr matrix X.

    NaN values are assumed to be encoded as zero, so any row whose maximum
    is 0 (including rows with no stored elements) is reported as NaN.
    '''
    
    row_max = np.asarray(X.max(axis=1).todense()).ravel()
    
    row_max[row_max == 0] = np.nan
    
    return row_max

def alpha_num_max_element_row(X):
    ''' Return the alphanumeric (string-wise) maximum value of each row of
    sparse csr matrix X.

    NaN values are assumed to be encoded as zero. Values are compared by
    their decimal-string representation, so e.g. '9' ranks above '10'.
    Rows without non-zero entries yield 0.
    Lazy, slow implementation; going via data/indptr would be much faster.
    '''
    
    result = []
    
    for row_idx in range(X.shape[0]):
        cols = X[row_idx, :].nonzero()[1]
        
        if cols.shape[0] == 0:
            # No non-zero entries in this row.
            result.append(0)
            continue
        
        values = X[row_idx, cols].todense().A1
        as_text = {'{:d}'.format(int(v)) for v in values}
        result.append(int(float(max(as_text))))
    
    return result

def nunique_row(X):
    ''' Return the number of distinct non-zero values per row of sparse
    csr matrix X.

    Rows without non-zero entries count as 0.
    Lazy, slow implementation; going via data/indptr would be much faster.
    '''
    
    counts = []
    
    for row_idx in range(X.shape[0]):
        cols = X[row_idx, :].nonzero()[1]
        
        if cols.shape[0] == 0:
            counts.append(0)
        else:
            distinct = {v for v in X[row_idx, cols].todense().A1}
            counts.append(len(distinct))
        
    return counts

In [10]:
# CATEGORICAL_Last_____1
n_last = cat_data[n,:].nonzero()[1][-1]
sum([2, 4, 514] == cat_data[n, n_last])
pd.Series(value_last_element_row(cat_data)).isin([2, 4, 514]).mean()


Out[10]:
0.0041672756087238238

In [65]:
# CATEGORICAL_Last_____2
pd.Series(value_last_element_row(cat_data)).isin([16, 48]).mean()


Out[65]:
0.0034889211968435821

In [149]:
## CATEGORICAL_Missing value count
pd.Series(cat_data.shape[1] - np.diff(cat_data.indptr)).mean()


Out[149]:
2082.8501723763607

In [102]:
# CATEGORICAL_Max______1 (takes a while)
list1 = [2, 8389632, 514]
pd.Series(alpha_num_max_element_row(cat_data)).isin(list1).mean()


Out[102]:
0.0027488982020651372

In [103]:
# CATEGORICAL_Max______3 (takes a while)
list3 = [3, 145, 4, 143, 8, 512, 6, 32]
pd.Series(alpha_num_max_element_row(cat_data)).isin(list3).mean()


Out[103]:
0.30736635446594585

In [148]:
# CATEGORICAL_Unique count
pd.Series(nunique_row(cat_data)).mean()


Out[148]:
1.433054529388459

In [15]:
# CATEGORICAL_out_L3_S32_F3854_class1
# CATEGORICAL_out_L3_S32_F3854_class1           0.003434
col_nr = lut[lut['name_cat']=='L3_S32_F3854'].col_cat.values[0].astype(int)

In [25]:
d = pd.Series(cat_data[:, 1986].todense().A1)
d.replace(0, np.nan, inplace=True)

In [52]:
tmp = np.zeros(d.shape)
tmp[(d==16).values] = 1
tmp[(d==48).values] = 1

In [53]:
tmp.mean()


Out[53]:
0.0034340108148109352

In [58]:
# CATEGORICAL_out_L3_S32_F3854_class2
# CATEGORICAL_out_out_L3_S32_F3854_class2       0.008123
tmp = np.zeros(d.shape)
tmp[(d==2).values] = 2
tmp[(d==4).values] = 2

In [59]:
tmp.mean()


Out[59]:
0.0081233574403990049

In [ ]: