In [3]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [2]:
# NOTE(review): Python 2 notebook — `print` is a statement throughout.
# `const` and `func` are project-local modules (imported in the first
# cell); their contents are not visible here.
print const.TRAIN_FILES
In [57]:
# Look-up table mapping raw Bosch column names to feature-matrix column
# indices (columns used below: line, station, col_dat, col_cat, name_cat).
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)
Out[57]:
In [127]:
# Load the sparse feature matrices for train and test (file index 2,
# presumably the date features — TODO confirm) and stack them
# vertically; keep the matching Ids for both sets.
dat = func.load_data_file(const.TRAIN_FILES[2])
dat_train = dat['data']['features']
id_train = dat['data']['ids']
dat = func.load_data_file(const.TEST_FILES[2])
dat_data = vstack([dat_train, dat['data']['features']], format='csr')
ids = pd.concat([id_train, dat['data']['ids']], axis=0)
In [128]:
# Sanity check: row count of the stacked matrix should equal len(ids).
print dat_data.shape
print ids.shape
ids.head(3)
Out[128]:
In [20]:
# Dense sample (first 1000 rows) of the same train file read as CSV —
# used below to recompute row statistics for comparison.
dat_our = pd.read_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[2] + '.csv'), nrows=1000)
dat_our.head(3)
Out[20]:
In [162]:
# Load jayjay's features
#dat_jay = pd.read_csv('data_jayjay/train.csv', nrows=1000)
dat_jay = pd.read_csv('data_jayjay/train.csv')
#cat_cols = cat_jay.filter(like='CATEGORICAL').columns
#cat_jay = cat_jay[cat_cols]
#print cat_jay.shape
#cat_jay.head(3)
In [50]:
# On all samples
# Compare our row-wise statistics (column 0 is skipped — presumably the
# Id column; verify) against jayjay's precomputed columns.  Differences
# near 0 / small mismatch counts mean the features agree.
print (dat_our.iloc[:,1:].kurtosis(axis=1)-dat_jay['Kurtosis']).mean()
print (dat_our.iloc[:,1:].max(axis=1)!=dat_jay['Max']).sum() # Nans are different
print (dat_our.iloc[:,1:].min(axis=1)!=dat_jay['Min']).sum() # Nans are different
print (dat_our.iloc[:,1:].mean(axis=1)-dat_jay['Mean']).mean()
print (dat_our.iloc[:,1:].apply(lambda x: x.nunique(), axis=1)!=dat_jay['Unique count']).sum()
In [87]:
# Minimal example: two identical Series including a NaN.
a=pd.Series([1,np.nan,2])
b=pd.Series([1,np.nan,2])
In [88]:
# NOTE(review): `compare` is defined in a later cell (In [168]); this
# cell only works via out-of-order execution — reorder for Run-All.
compare(a,b)
Out[88]:
In [168]:
def compare(x, y):
    '''Compare two pandas Series element-wise.

    Parameters
    ----------
    x, y : pd.Series
        Series to compare; NaNs in matching positions count as equal
        (semantics of Series.equals).

    Returns
    -------
    True when the Series are identical, otherwise a diagnostic string
    with the mean and sum of the element-wise difference (NaNs
    propagate through the subtraction).
    '''
    if x.equals(y):
        return True
    # Fix: removed an unreachable `return False` that followed this
    # return in the original.
    return 'Different: mean: {} sum: {}'.format((x - y).mean(), (x - y).sum())
In [113]:
# Lines 0
# Per production line: pick that line's date columns from the look-up
# table (+1 offset — presumably because CSV column 0 is Id; verify) and
# compare our row statistics with jayjay's per-line features.
# NOTE(review): this cell is copy-pasted for lines 0-3 below; a helper
# function taking the line number would remove the duplication.
line=0
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]
print('Line 0')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L0_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L0_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].mean(axis=1), dat_jay['L0_Mean'])
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L0_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L0_Unique count'])
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['DATE_L0max']) # Nans are different
In [ ]:
# Lines 1
# Same check for line 1 (note: this one also checks kurtosis).
line=1
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]
print('Line 1')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L1_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L1_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L1_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L1_Unique count'])
print compare(dat_our.iloc[:,col_date].kurtosis(axis=1), dat_jay['DATE_L1kurt'])
In [111]:
# Lines 2
# Same check for line 2 (only max/min/range features exist for it).
line=2
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]
print('Line 2')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L2_Max']) # Nans are different
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L2_Min']) # Nans are different
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L2_Range'])
In [115]:
# Lines 3
# Same check for line 3.
line=3
col_date = lut[lut['line']==line].col_dat.values
col_date = [int(i)+1 for i in col_date if not np.isnan(i)]
print('Line 3')
print compare(dat_our.iloc[:,col_date].max(axis=1), dat_jay['L3_Max'])
print compare(dat_our.iloc[:,col_date].min(axis=1), dat_jay['L3_Min'])
print compare(dat_our.iloc[:,col_date].max(axis=1) - dat_our.iloc[:,col_date].min(axis=1), dat_jay['L3_Range'])
print compare(dat_our.iloc[:,col_date].apply(lambda x: x.nunique(), axis=1), dat_jay['L3_Unique count'])
print compare(dat_our.iloc[:,col_date].kurtosis(axis=1), dat_jay['DATE_L3kurt'])
In [116]:
# Between lines and stations
# Cross-line / cross-station difference features (e.g. max of line 0
# minus max of line 3) compared against jayjay's columns.
col_dateL0 = [int(i)+1 for i in lut[lut['line']==0].col_dat.values if not np.isnan(i)]
col_dateL2 = [int(i)+1 for i in lut[lut['line']==2].col_dat.values if not np.isnan(i)]
col_dateL3 = [int(i)+1 for i in lut[lut['line']==3].col_dat.values if not np.isnan(i)]
col_dateS37 = [int(i)+1 for i in lut[lut['station']==37].col_dat.values if not np.isnan(i)]
col_dateS26 = [int(i)+1 for i in lut[lut['station']==26].col_dat.values if not np.isnan(i)]
col_dateS30 = [int(i)+1 for i in lut[lut['station']==30].col_dat.values if not np.isnan(i)]
col_dateS34 = [int(i)+1 for i in lut[lut['station']==34].col_dat.values if not np.isnan(i)]
print('Line 3')
print compare(dat_our.iloc[:,col_dateL0].max(axis=1) - dat_our.iloc[:,col_dateL3].max(axis=1), dat_jay['L0max_L3max'])
print compare(dat_our.iloc[:,col_dateL2].max(axis=1) - dat_our.iloc[:,col_dateS37].max(axis=1), dat_jay['L2max_S37max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateL0].min(axis=1), dat_jay['L3max_L0min'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS26].max(axis=1), dat_jay['L3max_S26max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS30].max(axis=1), dat_jay['L3max_S30max'])
print compare(dat_our.iloc[:,col_dateL3].max(axis=1) - dat_our.iloc[:,col_dateS34].min(axis=1), dat_jay['L3max_S34min'])
In [132]:
# Ad-hoc check: highest column index used (col_date is whatever the
# last executed per-line cell left behind — hidden state).
max(col_date)
Out[132]:
In [225]:
# Same station
# First get max per line for all train and test samples
# NOTE(review): indentation of the loop body was lost in this export.
df = pd.DataFrame(columns=['L0max','L1max','L2max','L3max'], index=ids.Id)
for l in range(4):
col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]
df['L{}max'.format(l)] = dat_data[:, col_date].max(1).todense().A1
# Zeros in the sparse matrix encode missing values -> NaN.
df['L{}max'.format(l)].replace(0, np.nan, inplace=True)
# NOTE(review): the result of .round(2) is discarded — this line is a
# no-op as written; assign it back if rounding was intended.
df['L{}max'.format(l)].round(2)
# To go row index to check sorting afterwards
# (reset twice: once to turn Id into a column, once to keep the
# original positional order as an 'index' column).
df.reset_index(inplace=True)
df.reset_index(inplace=True)
In [226]:
# Sort by ID
df.sort_values(['Id'], inplace=True)
In [227]:
# For every column, add the value of the previous/next row (in Id
# order) so that neighbouring parts can be compared.
for col in df.columns:
df[col + '_prev'] = df[col].shift(1)
df[col + '_next'] = df[col].shift(-1)
# Use only train id
df = df[df['Id'].isin(id_train.Id)]
In [228]:
df.head()
Out[228]:
In [229]:
# Now compare
# "same" = equal line-max OR both NaN.  Note operator precedence:
# `A | B & C` parses as `A | (B & C)`, which is the intended
# "equal or both missing" — explicit parentheses would be clearer.
print('Line 0')
print compare(((df['L0max']==df['L0max_prev']) | (df['L0max'].isnull()) & (df['L0max_prev'].isnull())).astype(int),
dat_jay['sameL0'])
print compare(((df['L0max']==df['L0max_next']) | (df['L0max'].isnull()) & (df['L0max_next'].isnull())).astype(int),
dat_jay['sameL0 (#1)'])
print('Line 1')
print compare(((df['L1max']==df['L1max_prev']) | (df['L1max'].isnull()) & (df['L1max_prev'].isnull())).astype(int),
dat_jay['sameL1'])
print compare(((df['L1max']==df['L1max_next']) | (df['L1max'].isnull()) & (df['L1max_next'].isnull())).astype(int),
dat_jay['sameL1 (#1)'])
print('Line 2')
print compare(((df['L2max']==df['L2max_prev']) | (df['L2max'].isnull()) & (df['L2max_prev'].isnull())).astype(int),
dat_jay['sameL2'])
print compare(((df['L2max']==df['L2max_next']) | (df['L2max'].isnull()) & (df['L2max_next'].isnull())).astype(int),
dat_jay['sameL2 (#1)'])
print('Line 3')
print compare(((df['L3max']==df['L3max_prev']) | (df['L3max'].isnull()) & (df['L3max_prev'].isnull())).astype(int),
dat_jay['sameL3'])
print compare(((df['L3max']==df['L3max_next']) | (df['L3max'].isnull()) & (df['L3max_next'].isnull())).astype(int),
dat_jay['sameL3 (#1)'])
In [8]:
# Date-based feature columns from jayjay's set that were reproduced /
# checked in the cells above.
date_cols = ['Kurtosis', 'Max', 'Mean', 'Min', 'Range','Unique count',
'L0_Max', 'L0_Mean', 'L0_Min', 'L0_Range', 'L0_Unique count', 'DATE_L0max',
'L1_Max', 'L1_Min', 'L1_Range', 'L1_Unique count', 'DATE_L1kurt',
'L2_Max', 'L2_Min', 'L2_Range',
'L3_Max', 'L3_Min', 'L3_Range', 'L3_Unique count', 'DATE_L3kurt', 'DATE_L3min',
'L0max_L3max', 'L2max_S37max', 'L3max_L0min', 'L3max_S26max', 'L3max_S30max', 'L3max_S34min',
'sameL0', 'sameL0 (#1)', 'sameL1', 'sameL1 (#1)', 'sameL2', 'sameL2 (#1)', 'sameL3', 'sameL3 (#1)',
'L1_L1_Missing value count',
'L3_L3_Missing value count',
'L3_L3_Unique count',]
In [9]:
# Remaining jayjay feature names: station-level min/max difference
# features ("SAmin_SBmin" etc.) ...
jay_feat_diffs = ['S13min_S33min',
'S22max_S32min',
'S22min_S32min',
'S26min_S24min',
'S26min_S37min',
'S27min_S32min',
'S29max_S35max',
'S29min_S32min',
'S29min_S34min',
'S29min_S37min',
'S30min_S35min',
'S30min_S37min',
'S32max_S37min',
'S32min_S10min',
'S32min_S30min',
'S32min_S34min',
'S33max_S29min',
'S33min_S30min',
'S33min_S34min',
'S33min_S35min',
'S33min_S36min',
'S33min_S37min',
'S34min_S35min',
'S35min_L1min',
'S36max_S29min',
'S37min_S34min']
# ... and station-level date aggregates.
jay_feat_dates = [
'DATE_S0_max',
'DATE_S1_max',
'DATE_S10max',
'DATE_S10min',
'DATE_S11max',
'DATE_S13max',
'DATE_S13min',
'DATE_S18max',
'DATE_S19max',
'DATE_S2_max',
'DATE_S20max',
'DATE_S20min',
'DATE_S21max',
'DATE_S21min',
'DATE_S23max',
'DATE_S24max',
'DATE_S24min',
'DATE_S25max',
'DATE_S25min',
'DATE_S26max',
'DATE_S27max',
'DATE_S28max',
'DATE_S3_max',
'DATE_S30max',
'DATE_S32max',
'DATE_S32min',
'DATE_S33max',
'DATE_S34max',
'DATE_S35max',
'DATE_S36max',
'DATE_S37max',
'DATE_S38max',
'DATE_S4_max',
'DATE_S40max',
'DATE_S43max',
'DATE_S44max',
'DATE_S45min',
'DATE_S47max',
'DATE_S49max',
'DATE_S50max',
'DATE_S6_max',
'DATE_S7_max',
'DATE_S8_max',
'DATE_S8_min',
'DATE_S9_max']
In [6]:
# NOTE(review): `cat_jay` is never assigned in the visible notebook
# (the load is commented out in the In [162] cell) — these cells rely
# on hidden kernel state and will fail on a fresh run.
cat_jay.filter(like='Max').columns
Out[6]:
In [8]:
cat_jay.columns[:100]
Out[8]:
In [48]:
# Column-wise mean/sum of jayjay's categorical features, used as
# reference values for the reproduction attempts below.
jay_means = cat_jay.mean()
jay_sums = cat_jay.sum()
In [95]:
def value_last_element_row(X):
    '''Return the last stored value of each row of a sparse CSR matrix X.

    Rows with no stored elements yield NaN.
    '''
    # indptr[i+1] - 1 is the position of row i's final stored element.
    last_positions = X.indptr[1:] - 1
    result = X.data[last_positions]
    # Rows with zero stored elements picked up a neighbouring row's
    # value above; overwrite those entries with NaN.
    empty_rows = np.diff(X.indptr) == 0
    result[empty_rows] = np.nan
    return result
def max_element_row(X):
    '''Return the maximum value of each row of a sparse CSR matrix X.

    NaN values are assumed to be encoded as zero, so any row whose
    maximum comes out as 0 is reported as NaN.
    '''
    row_max = X.max(axis=1).todense().A1
    row_max[row_max == 0] = np.nan
    return row_max
def alpha_num_max_element_row(X):
    '''Return the alphanumeric maximum value of each row of a sparse CSR
    matrix X.

    Each row's distinct stored values are formatted as integer strings
    and the lexicographically largest string wins (so '9' beats '10'),
    then is converted back to int.  Rows with no stored elements yield 0.
    NaN values are assumed to be encoded as zero.
    Lazy, slow implementation; going via data/indptr would be much faster.
    '''
    result = []
    for row in range(X.shape[0]):
        nonzero_cols = X[row, :].nonzero()[1]
        if nonzero_cols.shape[0] == 0:
            result.append(0)
            continue
        distinct_vals = set(X[row, nonzero_cols].todense().A1)
        as_strings = ['{:d}'.format(int(v)) for v in distinct_vals]
        result.append(int(float(max(as_strings))))
    return result
def nunique_row(X):
    '''Return the number of distinct stored values in each row of a
    sparse CSR matrix X.  Rows with no stored elements count as 0.

    Lazy, slow implementation; going via data/indptr would be much faster.
    '''
    counts = []
    for row in range(X.shape[0]):
        nonzero_cols = X[row, :].nonzero()[1]
        if nonzero_cols.shape[0] == 0:
            counts.append(0)
        else:
            counts.append(len(set(X[row, nonzero_cols].todense().A1)))
    return counts
In [10]:
# CATEGORICAL_Last_____1
# NOTE(review): `cat_data` and `n` are not defined anywhere in the
# visible notebook — these cells depend on hidden kernel state.
# Fraction of rows whose last categorical value is in {2, 4, 514},
# compared against jayjay's feature mean.
n_last = cat_data[n,:].nonzero()[1][-1]
sum([2, 4, 514] == cat_data[n, n_last])
pd.Series(value_last_element_row(cat_data)).isin([2, 4, 514]).mean()
Out[10]:
In [65]:
# CATEGORICAL_Last_____2
pd.Series(value_last_element_row(cat_data)).isin([16, 48]).mean()
Out[65]:
In [149]:
## CATEGORICAL_Missing value count
# Missing = columns with no stored element; np.diff(indptr) gives the
# stored-element count per row.
pd.Series(cat_data.shape[1] - np.diff(cat_data.indptr)).mean()
Out[149]:
In [102]:
# CATEGORICAL_Max______1 (takes a while)
list1 = [2, 8389632, 514]
pd.Series(alpha_num_max_element_row(cat_data)).isin(list1).mean()
Out[102]:
In [103]:
# CATEGORICAL_Max______3 (takes a while)
list3 = [3, 145, 4, 143, 8, 512, 6, 32]
pd.Series(alpha_num_max_element_row(cat_data)).isin(list3).mean()
Out[103]:
In [148]:
# CATEGORICAL_Unique count
pd.Series(nunique_row(cat_data)).mean()
Out[148]:
In [15]:
# CATEGORICAL_out_L3_S32_F3854_class1
# CATEGORICAL_out_L3_S32_F3854_class1 0.003434
# Column index of feature L3_S32_F3854 in the categorical matrix.
col_nr = lut[lut['name_cat']=='L3_S32_F3854'].col_cat.values[0].astype(int)
In [25]:
# NOTE(review): hard-coded 1986 — presumably the value of `col_nr`
# from the previous cell; use the variable instead.
d = pd.Series(cat_data[:, 1986].todense().A1)
d.replace(0, np.nan, inplace=True)
In [52]:
# Binary indicator: value is 16 or 48 (class 1).
tmp = np.zeros(d.shape)
tmp[(d==16).values] = 1
tmp[(d==48).values] = 1
In [53]:
tmp.mean()
Out[53]:
In [58]:
# CATEGORICAL_out_L3_S32_F3854_class2
# CATEGORICAL_out_out_L3_S32_F3854_class2 0.008123
# NOTE(review): indicator value 2 (not 1) — the mean therefore counts
# each match twice; confirm that matches jayjay's encoding.
tmp = np.zeros(d.shape)
tmp[(d==2).values] = 2
tmp[(d==4).values] = 2
In [59]:
tmp.mean()
Out[59]:
In [ ]: