In [2]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

Load data


In [3]:
# Show which training files the custom `const` module points at.
# Parenthesised form prints identically on Python 2 and also runs on Python 3.
print(const.TRAIN_FILES)


['train_numeric', 'train_categorical_to_num', 'train_date']

In [4]:
# Look-up table with one row per feature; its col_dat / col_num / col_cat
# columns hold the column position of that feature in each data file
# (NaN when the feature does not occur in that file).
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(n=3)


Out[4]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat station_V2 line_V2
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN 0.0 1.0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN 0.0 1.0
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN 0.0 1.0

In [5]:
# Second entry of TRAIN_FILES is the categorical data converted to numbers
cat_file = const.TRAIN_FILES[1]
cat = func.load_data_file(cat_file)

# Feature matrix only; used below as a scipy CSR sparse matrix
# (the helpers read its .indptr / .data attributes).
cat_data = cat['data']['features']


Returning <open file '/Volumes/My Book/kaggle_bosch/train_categorical_to_num.pkl', mode 'rb' at 0x115c4fc00>.pkl

In [7]:
# Load jayjay's features
cat_jay = pd.read_csv('data_jayjay/train.csv')

# Keep every CATEGORICAL_* aggregate plus the three per-line count features
# that are reproduced further down in this notebook.
cat_cols = list(cat_jay.filter(like='CATEGORICAL').columns) + [
    'L1_L1_Missing value count',
    'L3_L3_Missing value count',
    'L3_L3_Unique count',
]
cat_jay = cat_jay[cat_cols]

# Parenthesised print works on both Python 2 and Python 3
print(cat_jay.shape)
cat_jay.head(3)


(1183747, 11)
Out[7]:
CATEGORICAL_Last_____1 CATEGORICAL_out_out_L3_S32_F3854_class2 CATEGORICAL_Unique count CATEGORICAL_Missing value count CATEGORICAL_Last_____2 CATEGORICAL_out_L3_S32_F3854_class1 CATEGORICAL_Max______1 CATEGORICAL_Max______3 L1_L1_Missing value count L3_L3_Missing value count L3_L3_Unique count
0 False 0 0 2140 False 0 False False 1227 431 0
1 False 0 0 2140 False 0 False False 1227 431 0
2 False 0 1 2077 False 0 False False 1227 368 1

Reproduce JayJay's features


In [8]:
# Column-wise reference statistics of Jay's features; the means are the
# targets each reproduced feature is compared against below.
jay_means = cat_jay.mean()
jay_sums = cat_jay.sum()

# Parenthesised print works on both Python 2 and Python 3
print(jay_means)


CATEGORICAL_Last_____1                        0.004167
CATEGORICAL_out_out_L3_S32_F3854_class2       0.008123
CATEGORICAL_Unique count                      1.433055
CATEGORICAL_Missing value count            2082.850172
CATEGORICAL_Last_____2                        0.003489
CATEGORICAL_out_L3_S32_F3854_class1           0.003434
CATEGORICAL_Max______1                        0.002749
CATEGORICAL_Max______3                        0.307366
L1_L1_Missing value count                  1220.368586
L3_L3_Missing value count                   388.670341
L3_L3_Unique count                            0.737975
dtype: float64

In [37]:
def missing_value_count(X):
    '''Count the entries without a stored value in every row of X.

    X is a sparse CSR matrix; an entry counts as missing when it is not
    explicitly stored. Returns a 1-D array holding one count per row.
    '''
    n_columns = X.shape[1]

    # Consecutive indptr entries delimit a row, so their difference is the
    # number of stored values in that row.
    stored_per_row = X.indptr[1:] - X.indptr[:-1]

    return n_columns - stored_per_row

def value_last_element_row(X):
    '''Return the last stored value of each row of sparse CSR matrix X.

    "Last" means the stored value with the highest column index. Rows
    without any stored value yield nan.

    Returns a 1-D array with one element per row. Floating-point (or
    complex) data keeps its dtype; any other dtype is returned as float64
    because nan cannot be represented in an integer array.
    '''

    # Storage order only matches column order when the indices are sorted
    if not X.has_sorted_indices:
        X = X.sorted_indices()

    # Rows that actually contain at least one stored value
    has_values = np.diff(X.indptr) > 0

    if np.issubdtype(X.data.dtype, np.inexact):
        out_dtype = X.data.dtype
    else:
        out_dtype = np.float64

    # Start from all-nan so empty rows need no special treatment and
    # X.data is never indexed with -1 (which fails when X.data is empty).
    output = np.full(X.shape[0], np.nan, dtype=out_dtype)

    # The element just before the start of the next row is the row's last one
    output[has_values] = X.data[X.indptr[1:][has_values] - 1]

    return output

def max_element_row(X):
    '''Return the maximum value of each row of sparse CSR matrix X.

    nan values are assumed to be encoded as zero, so a row maximum of zero
    (for instance a row without stored values) is reported as nan.

    Returns a 1-D array with one element per row. Non-floating data is
    converted to float64 first, because nan cannot be stored in an
    integer array.
    '''

    output = np.asarray(X.max(axis=1).todense()).ravel()

    if not np.issubdtype(output.dtype, np.inexact):
        output = output.astype(np.float64)

    output[output == 0] = np.nan

    return output

def alpha_num_max_element_row(X):
    '''Return the alphanumeric maximum of each row of sparse matrix X.

    Every non-zero value of a row is truncated to an integer and written
    as a decimal string; the strings are compared lexicographically (so
    '9' ranks above '100') and the winner is converted back to int.

    nan values are assumed to be encoded as zero: zeros are ignored and a
    row without non-zero values yields 0.

    Returns a list with one int per row.
    '''

    # Read the rows straight from the CSR arrays instead of slicing and
    # densifying the sparse matrix once per row (no-op if X is already CSR).
    X = X.tocsr()
    data, indptr = X.data, X.indptr

    output = []

    for start, stop in zip(indptr[:-1], indptr[1:]):
        row = data[start:stop]
        # Explicitly stored zeros count as missing, like implicit ones
        row = row[row != 0]

        if row.shape[0] > 0:
            labels = ['{:d}'.format(int(v)) for v in set(row.tolist())]
            # max() on strings is the lexicographic maximum
            output.append(int(max(labels)))
        else:
            output.append(0)

    return output

def nunique_row(X):
    '''Return the number of distinct non-zero values in each row of X.

    X is a sparse matrix. Zeros (stored or not) are ignored, so a row
    without non-zero values yields 0.

    Returns a list with one int per row.
    '''

    # Read the rows straight from the CSR arrays instead of slicing and
    # densifying the sparse matrix once per row (no-op if X is already CSR).
    X = X.tocsr()
    data, indptr = X.data, X.indptr

    output = []

    for start, stop in zip(indptr[:-1], indptr[1:]):
        row = data[start:stop]
        output.append(len(set(row[row != 0].tolist())))

    return output

In [57]:
# 'L1_L1_Missing value count'
# Categorical column indices of all line-1 features; features without a
# categorical column have NaN in col_cat and are dropped.
col_l1 = lut.loc[lut['line'] == 1, 'col_cat'].dropna().astype(int).tolist()

# Jay's mean first, reproduced mean second (print form valid on Python 2 and 3)
print(jay_means['L1_L1_Missing value count'])
print(pd.Series(missing_value_count(cat_data[:, col_l1])).mean())


1220.36858636
1220.36858636

In [58]:
# 'L3_L3_Missing value count'
# Categorical column indices of all line-3 features; features without a
# categorical column have NaN in col_cat and are dropped.
col_l3 = lut.loc[lut['line'] == 3, 'col_cat'].dropna().astype(int).tolist()

# Jay's mean first, reproduced mean second (print form valid on Python 2 and 3).
# NOTE(review): in the recorded output the reproduced mean is about 1.0 lower
# than Jay's (387.67 vs 388.67) - presumably Jay counts one extra line-3
# column; TODO confirm which column differs.
print(jay_means['L3_L3_Missing value count'])
print(pd.Series(missing_value_count(cat_data[:, col_l3])).mean())


388.670340875
387.67034172

In [59]:
# 'L3_L3_Unique count'
# Categorical column indices of all line-3 features (recomputed so this
# cell does not depend on the previous one having been run).
col_l3 = lut.loc[lut['line'] == 3, 'col_cat'].dropna().astype(int).tolist()

# Jay's mean first, reproduced mean second (print form valid on Python 2 and 3)
print(jay_means['L3_L3_Unique count'])
print(pd.Series(nunique_row(cat_data[:, col_l3])).mean())


0.737975259916
0.737975259916

In [10]:
# CATEGORICAL_Last_____1
# Fraction of rows whose last stored categorical value is 2, 4 or 514.
# Two single-row scratch lines were removed here: they read a row index `n`
# that no cell defines (so the cell failed on a fresh kernel) and their
# results were never used.
print(jay_means['CATEGORICAL_Last_____1'])
pd.Series(value_last_element_row(cat_data)).isin([2, 4, 514]).mean()


Out[10]:
0.0041672756087238238

In [65]:
# CATEGORICAL_Last_____2
# Fraction of rows whose last stored categorical value is 16 or 48
# (print form valid on Python 2 and 3).
print(jay_means['CATEGORICAL_Last_____2'])
pd.Series(value_last_element_row(cat_data)).isin([16, 48]).mean()


Out[65]:
0.0034889211968435821

In [149]:
## CATEGORICAL_Missing value count
# Same per-row count as the per-line checks, taken over all categorical
# columns; reuses the helper instead of repeating its formula.
print(jay_means['CATEGORICAL_Missing value count'])
pd.Series(missing_value_count(cat_data)).mean()


Out[149]:
2082.8501723763607

In [102]:
# CATEGORICAL_Max______1 (takes a while)
# Fraction of rows whose alphanumeric row maximum is one of these values
list1 = [2, 8389632, 514]
print(jay_means['CATEGORICAL_Max______1'])
pd.Series(alpha_num_max_element_row(cat_data)).isin(list1).mean()


Out[102]:
0.0027488982020651372

In [103]:
# CATEGORICAL_Max______3 (takes a while)
# Fraction of rows whose alphanumeric row maximum is one of these values
list3 = [3, 145, 4, 143, 8, 512, 6, 32]
print(jay_means['CATEGORICAL_Max______3'])
pd.Series(alpha_num_max_element_row(cat_data)).isin(list3).mean()


Out[103]:
0.30736635446594585

In [148]:
# CATEGORICAL_Unique count
# Mean number of distinct non-zero categorical values per row
# (print form valid on Python 2 and 3).
print(jay_means['CATEGORICAL_Unique count'])
pd.Series(nunique_row(cat_data)).mean()


Out[148]:
1.433054529388459

In [58]:
# CATEGORICAL_out_L3_S32_F3854_class2
# Target: CATEGORICAL_out_out_L3_S32_F3854_class2 has mean 0.008123 in Jay's data.
# NOTE(review): `d` is not defined in any cell shown in this notebook -
# presumably the L3_S32_F3854 categorical column as a pandas object; it has
# to be created explicitly before this cell can run on a fresh kernel.
is_class2 = (d == 2).values | (d == 4).values

# Encode the class as 2 where the value is 2 or 4, and as 0 everywhere else
tmp = np.where(is_class2, 2.0, 0.0)

In [59]:
# Mean of the reproduced class2 feature, to compare with Jay's 0.008123
np.mean(tmp)


Out[59]:
0.0081233574403990049

In [ ]: