In [2]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [3]:
# Show the training-file paths configured in the project's const module.
print(const.TRAIN_FILES)
In [4]:
# Read the look-up table from the path configured in const and preview it.
# Later cells read its 'line' and 'col_cat' columns to select feature columns.
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)
Out[4]:
In [5]:
# Load the second entry of const.TRAIN_FILES through the project helper.
# NOTE(review): presumably this is the categorical training file -- confirm.
cat = func.load_data_file(const.TRAIN_FILES[1])
# Feature matrix; later cells use .indptr / .shape on it, so it is expected
# to be a scipy CSR matrix -- TODO confirm against func.load_data_file.
cat_data = cat['data']['features']
In [7]:
# Load jayjay's features and keep only the columns compared in this notebook:
# every column whose name contains 'CATEGORICAL' plus three per-line counts.
cat_jay = pd.read_csv('data_jayjay/train.csv')
cat_cols = cat_jay.filter(like='CATEGORICAL').columns.tolist() + [
    'L1_L1_Missing value count',
    'L3_L3_Missing value count',
    'L3_L3_Unique count',
]
cat_jay = cat_jay[cat_cols]
print(cat_jay.shape)
cat_jay.head(3)
Out[7]:
In [8]:
# Column-wise reference statistics of jayjay's features; the cells below
# compare re-computed features against these means.
jay_sums = cat_jay.sum()
jay_means = cat_jay.mean()
print(jay_means)
In [37]:
def missing_value_count(X):
    """Count, per row of sparse CSR matrix X, the columns without a stored entry.

    Entries absent from the sparse structure are treated as missing; the
    count is the number of columns minus the number of stored entries in
    the row (taken from consecutive differences of ``X.indptr``).

    Returns a 1-D integer array with one count per row.
    """
    n_cols = X.shape[1]
    stored_per_row = X.indptr[1:] - X.indptr[:-1]
    return n_cols - stored_per_row
def value_last_element_row(X):
    """Return the last stored value of each row of sparse CSR matrix X.

    "Last" means the final entry of the row in CSR storage order, which is
    the highest-column entry when X has sorted indices.

    Rows without any stored entry yield NaN. The result is always a
    floating array so NaN can be represented: inexact input dtypes are
    kept, integer/bool data is returned as float64.

    Returns a 1-D array with one value per row.
    """
    row_counts = np.diff(X.indptr)
    has_data = row_counts > 0

    # Integer arrays cannot hold NaN, so promote them to float64.
    if np.issubdtype(X.data.dtype, np.inexact):
        out_dtype = X.data.dtype
    else:
        out_dtype = np.float64
    output = np.full(X.shape[0], np.nan, dtype=out_dtype)

    # indptr[i + 1] - 1 is the position of row i's last stored entry. Only
    # index rows that have entries, so empty rows (and a fully empty matrix)
    # never touch X.data.
    last = X.indptr[1:][has_data] - 1
    output[has_data] = X.data[last]
    return output
def max_element_row(X):
    """Return the maximum value of each row of sparse CSR matrix X.

    Missing values are assumed to be encoded as zero, so a row whose
    maximum is exactly 0 (for example an empty row) is reported as NaN.

    The result is always a floating array so NaN can be represented:
    inexact input dtypes are kept, integer/bool data is returned as float64.

    Returns a 1-D array with one value per row.
    """
    output = X.max(axis=1).toarray().ravel()
    # Integer arrays cannot hold NaN, so promote them to float64.
    if not np.issubdtype(output.dtype, np.inexact):
        output = output.astype(np.float64)
    output[output == 0] = np.nan
    return output
def alpha_num_max_element_row(X):
    """Return the alpha-numeric maximum of each row of sparse CSR matrix X.

    Every non-zero value of a row is truncated to an integer, rendered as
    a decimal string, and the strings are compared lexicographically
    (so 9 beats 100, and 2 beats 10). The winning string is converted back
    to an integer.

    Missing values are assumed to be encoded as zero; rows without any
    non-zero value yield 0.

    Assumes X has no duplicate (row, column) entries -- TODO confirm for
    matrices that were not built in canonical form.

    Returns a list with one int per row.
    """
    X = X.tocsr()
    indptr = X.indptr
    output = []
    # Walk the CSR buffers directly instead of slicing the matrix per row.
    for start, stop in zip(indptr[:-1], indptr[1:]):
        row = X.data[start:stop]
        row = row[row != 0]  # explicitly stored zeros count as missing
        if row.size > 0:
            labels = set('{:d}'.format(int(v)) for v in row)
            # Parse the decimal string directly; a float round-trip would
            # lose precision above 2**53.
            output.append(int(max(labels)))
        else:
            output.append(0)
    return output
def nunique_row(X):
    """Return the number of distinct non-zero values in each row of X.

    Rows without any non-zero value yield 0. This is a simple but slow
    implementation that slices the sparse matrix row by row; working on
    data/indptr directly would be much faster.

    Returns a list with one int per row.
    """
    def _distinct_in_row(row_idx):
        # Column positions of the non-zero entries in this row.
        cols = X[row_idx, :].nonzero()[1]
        if cols.shape[0] == 0:
            return 0
        return len(set(X[row_idx, cols].todense().A1))

    return [_distinct_in_row(row_idx) for row_idx in range(X.shape[0])]
In [57]:
# 'L1_L1_Missing value count'
# Categorical column indices of line 1 from the look-up table (NaN rows dropped).
col_l1 = lut.loc[lut['line'] == 1, 'col_cat'].dropna().astype(int).tolist()
# Reference mean first, then the mean re-computed from the sparse features.
print(jay_means['L1_L1_Missing value count'])
print(pd.Series(missing_value_count(cat_data[:, col_l1])).mean())
In [58]:
# 'L3_L3_Missing value count'
# Categorical column indices of line 3 from the look-up table (NaN rows dropped).
col_l3 = lut.loc[lut['line'] == 3, 'col_cat'].dropna().astype(int).tolist()
# Reference mean first, then the mean re-computed from the sparse features.
print(jay_means['L3_L3_Missing value count'])
print(pd.Series(missing_value_count(cat_data[:, col_l3])).mean())
In [59]:
# 'L3_L3_Unique count'
# Categorical column indices of line 3 from the look-up table (NaN rows dropped).
col_l3 = lut.loc[lut['line'] == 3, 'col_cat'].dropna().astype(int).tolist()
# Reference mean first, then the mean re-computed from the sparse features.
print(jay_means['L3_L3_Unique count'])
print(pd.Series(nunique_row(cat_data[:, col_l3])).mean())
In [10]:
# CATEGORICAL_Last_____1
# The former single-row scratch check was dropped: it indexed with `n`, which
# is never defined in this notebook (NameError on Restart & Run All), and its
# result was discarded anyway.
# Reference mean first, then the share of rows whose last stored value is one
# of the listed codes.
print(jay_means['CATEGORICAL_Last_____1'])
pd.Series(value_last_element_row(cat_data)).isin([2, 4, 514]).mean()
Out[10]:
In [65]:
# CATEGORICAL_Last_____2
# Reference mean first, then the share of rows whose last stored value is one
# of the listed codes.
print(jay_means['CATEGORICAL_Last_____2'])
pd.Series(value_last_element_row(cat_data)).isin([16, 48]).mean()
Out[65]:
In [149]:
## CATEGORICAL_Missing value count
# Reference mean first, then the mean missing-value count over all
# categorical columns, computed with the shared missing_value_count helper
# instead of repeating its formula inline.
print(jay_means['CATEGORICAL_Missing value count'])
pd.Series(missing_value_count(cat_data)).mean()
Out[149]:
In [102]:
# CATEGORICAL_Max______1 (takes a while)
# Codes whose alpha-numeric row maximum should switch this indicator on.
list1 = [2, 8389632, 514]
# Reference mean first, then the share of rows whose maximum is in list1.
print(jay_means['CATEGORICAL_Max______1'])
pd.Series(alpha_num_max_element_row(cat_data)).isin(list1).mean()
Out[102]:
In [103]:
# CATEGORICAL_Max______3 (takes a while)
# Codes whose alpha-numeric row maximum should switch this indicator on.
list3 = [3, 145, 4, 143, 8, 512, 6, 32]
# Reference mean first, then the share of rows whose maximum is in list3.
print(jay_means['CATEGORICAL_Max______3'])
pd.Series(alpha_num_max_element_row(cat_data)).isin(list3).mean()
Out[103]:
In [148]:
# CATEGORICAL_Unique count
# Reference mean first, then the mean number of distinct non-zero values per row.
print(jay_means['CATEGORICAL_Unique count'])
pd.Series(nunique_row(cat_data)).mean()
Out[148]:
In [58]:
# CATEGORICAL_out_L3_S32_F3854_class2
# CATEGORICAL_out_out_L3_S32_F3854_class2 0.008123
# NOTE(review): `d` is not defined anywhere in the visible notebook; it is
# presumably the pandas column for L3_S32_F3854 -- confirm and define it
# before this cell so Restart & Run All works.
# Mark every position where d equals 2 or 4 with the value 2, zero elsewhere.
tmp = np.zeros(d.shape)
tmp[((d == 2) | (d == 4)).values] = 2
In [59]:
# Mean of `tmp`, the array of zeros with 2 written where d is 2 or 4.
tmp.mean()
Out[59]:
In [ ]: