In [37]:
import feather
import os
import re
import pickle
import time
import datetime
import random
import numpy as np
import pandas as pd
from numba import jit
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, hstack, vstack
from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm
%matplotlib inline
import xgboost as xgb
import subprocess
# Custom modules
import const
import func
In [2]:
# Based on: https://www.kaggle.com/c/caterpillar-tube-pricing/forums/t/15748/strategies-to-encode-categorical-variables-with-many-categories/88207
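The linked thread describes leave-one-out (LOO) mean encoding for high-cardinality categoricals: each train row gets its category's response mean computed without that row's own response (plus a little noise), while test rows get the plain in-sample mean. A minimal sketch of the idea on toy data (hypothetical column names, not part of this pipeline):
In [ ]:
# Toy illustration of leave-one-out mean encoding (hypothetical data)
df = pd.DataFrame({'cat': ['a', 'a', 'a', 'b', 'b'],
                   'y':   [1, 0, 0, 1, 1]})
grp = df.groupby('cat')['y']
sums, counts = grp.transform('sum'), grp.transform('count')
# Each row's encoding excludes its own response; noise limits leakage-driven overfitting
df['cat_loo'] = (sums - df['y']) / (counts - 1) * np.random.uniform(0.95, 1.05, len(df))
df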
In [239]:
print(const.TRAIN_FILES)
In [3]:
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)
Out[3]:
In [87]:
# Load cluster info
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'), index_col=0)
print(cluster_info.shape)
cluster_info.head(3)
Out[87]:
In [12]:
# Load timestamps
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])
In [15]:
date_data = vstack([date_train['data']['features'], date_test['data']['features']], format='csr')
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
y = date_train['data']['y']
del date_train, date_test
In [16]:
# Load response
#y = func.read_last_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0] + '.csv'))
print(y.shape)
y.head(3)
Out[16]:
In [17]:
# Load IDs of train + test
#ids = pd.concat([func.read_first_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0])),
# func.read_first_column(os.path.join(const.BASE_PATH, const.TEST_FILES[0]))],
# axis=0)
print(ids.shape)
ids.head(3)
Out[17]:
In [427]:
# Add response to cluster info
cluster_info['R'] = y.Response
# Add sample time to cluster info
def max_element_row(X):
    '''Return the maximum value of each row of sparse csr matrix X.
    NaN values are assumed to be encoded as zero.'''
    output = X.max(1).todense().A1
    return output
cluster_info['tmax'] = (max_element_row(date_data) * 5).astype(int)
In [592]:
cluster_mean = cluster_info.groupby(['cluster_n500'])['R'].agg(['mean','count','sum'])
cluster_n500 = cluster_info.merge(cluster_mean, left_on='cluster_n500', right_index=True, how='left')
In [605]:
def loo_mean(mean, count, sample_val):
    '''Leave-one-out mean: exclude the sample's own response from its group mean.'''
    output = (mean * count - sample_val) / (count - 1)
    # Groups with a single train sample fall back to the overall train mean
    output[count <= 1] = 0.0058
    return output

def adjust_low_count_mean(count, mean):
    '''Shrink means of groups with few samples towards the overall train mean.'''
    # Cut-off point to trim groups with a low number of samples
    cutoff = 1000
    train_mean = 0.0058
    output = (count * mean + (cutoff - count) * train_mean) / cutoff
    # Groups with enough (weighted) positives keep their unshrunk mean
    high = count > (cutoff - 10 * count * mean)
    output[high] = mean[high]
    return output

def cut_off_loo_mean(r1_count, mean):
    '''Replace the mean by the overall train mean for groups with fewer than 15 positives.'''
    train_mean = 0.0058
    output = mean.copy()
    output[r1_count < 15] = train_mean
    return output

def taper_mean_bin_prob(mean, bin_prob):
    '''Blend the group mean with the overall train mean, weighted by the binomial probability.'''
    train_mean = 0.0058
    output = bin_prob * train_mean + (1 - bin_prob) * mean
    return output

def random_loo_mean(mean, count, sample_val):
    '''Leave-one-out mean with multiplicative noise (random factor between 0.9 and 1.1).'''
    train_mean = 0.0058
    r = pd.Series([1 + random.random() / 5 - 0.1 for x in mean], index=mean.index)
    # Train samples get the (noisy) out-of-sample mean
    output = r * loo_mean(mean, count, sample_val)
    # Test samples get the in-sample mean
    output[sample_val.isnull()] = mean[sample_val.isnull()]
    # Groups whose mean is null (categorical values not seen in train) get the overall train mean
    output[mean.isnull()] = train_mean
    return output

from scipy import special
lgam = special.gammaln

def bin_prob(n, k, p):
    '''Binomial pmf via scipy.special.comb.'''
    return special.comb(n, k) * (p ** k) * ((1 - p) ** (n - k))

def binomial2(n, k, p):
    '''Binomial pmf computed in log space for numerical stability.'''
    return np.exp(lgam(n + 1) - lgam(n - k + 1) - lgam(k + 1)
                  + k * np.log(p) + (n - k) * np.log(1. - p))
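A quick toy check (hypothetical values, not part of the pipeline) of what these helpers return: for a category seen three times in train with one positive, the LOO mean of the positive sample drops to 0, the negatives get 0.5, and a test sample (null response) keeps the in-sample mean.
In [ ]:
# Toy example illustrating loo_mean / random_loo_mean (hypothetical values)
toy_mean = pd.Series([1/3., 1/3., 1/3.])
toy_count = pd.Series([3, 3, 3])
toy_resp = pd.Series([1.0, 0.0, np.nan])   # third entry mimics a test sample
print(loo_mean(toy_mean, toy_count, toy_resp))          # 0.0, 0.5, NaN
print(random_loo_mean(toy_mean, toy_count, toy_resp))   # ~0.0, ~0.5 with noise, 0.3333 for the test row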
In [ ]:
cluster_n500['bin_prob'] = cluster_n500[['count','sum']].apply(lambda x: binomial2(x[0], x[1], 0.0058), axis=1)
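Since `binomial2` is built from NumPy ufuncs (`gammaln`, `np.log`, `np.exp`), the row-wise `apply` above can also be replaced by a single vectorized call on the columns; a sketch with the same 0.0058 train mean, which would apply equally to the `cluster_upath` and `cluster_tmax` frames below:
In [ ]:
# Vectorized equivalent of the row-wise apply above
cluster_n500['bin_prob'] = binomial2(cluster_n500['count'].values,
                                     cluster_n500['sum'].values,
                                     0.0058)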
In [606]:
cluster_n500['loo_mean'] = random_loo_mean(cluster_n500['mean'],
                                           cluster_n500['count'],
                                           cluster_n500['R'])
cluster_n500['loo_mean_tapered'] = adjust_low_count_mean(cluster_n500['count'],
                                                         cluster_n500['loo_mean'])
cluster_n500['loo_mean_cutoff'] = cut_off_loo_mean(cluster_n500['sum'],
                                                   cluster_n500['loo_mean'])
cluster_n500['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_n500['loo_mean'],
                                                        cluster_n500['bin_prob'])
In [595]:
cluster_n500.isnull().sum()
Out[595]:
In [611]:
cluster_n500[['loo_mean',
              'loo_mean_tapered',
              'loo_mean_cutoff',
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_n500_loo.csv'),
                                           index_label='ID')
In [607]:
cluster_n500.sort_values('loo_mean', ascending=False)
Out[607]:
In [609]:
cluster_n500.sample(20)
Out[609]:
In [582]:
cluster_n500.groupby('R')['loo_mean'].mean()
Out[582]:
In [612]:
cluster_mean = cluster_info.groupby(['unique_path'])['R'].agg(['mean','count','sum'])
cluster_upath = cluster_info.merge(cluster_mean, left_on='unique_path', right_index=True, how='left')
In [614]:
cluster_upath['bin_prob'] = cluster_upath[['count','sum']].apply(lambda x: binomial2(x[0], x[1], 0.0058), axis=1)
In [615]:
is_train = ~cluster_upath['R'].isnull()
cluster_upath['loo_mean'] = random_loo_mean(cluster_upath['mean'],
                                            cluster_upath['count'],
                                            cluster_upath['R'])
cluster_upath['loo_mean_tapered'] = adjust_low_count_mean(cluster_upath['count'],
                                                          cluster_upath['loo_mean'])
cluster_upath['loo_mean_cutoff'] = cut_off_loo_mean(cluster_upath['sum'],
                                                    cluster_upath['loo_mean'])
cluster_upath['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_upath['loo_mean'],
                                                         cluster_upath['bin_prob'])
In [616]:
cluster_upath.isnull().sum()
Out[616]:
In [620]:
cluster_upath.sort_values('loo_mean', ascending=False).head(20)
Out[620]:
In [629]:
cluster_upath.head(20)
Out[629]:
In [621]:
cluster_upath[['loo_mean',
               'loo_mean_tapered',
               'loo_mean_cutoff',
               'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_upath_loo.csv'),
                                            index_label='ID')
In [492]:
cluster_mean = cluster_info.groupby(['tmax'])['R'].agg(['mean','count','sum'])
cluster_tmax = cluster_info.merge(cluster_mean, left_on='tmax', right_index=True, how='left')
In [493]:
#cluster_tmax['adj_mean'] = adjust_mean(cluster_tmax['count'], cluster_tmax['mean'])
In [622]:
cluster_tmax['bin_prob'] = cluster_tmax[['count','sum']].apply(lambda x: binomial2(x[0], x[1], 0.0058), axis=1)
In [623]:
cluster_tmax['loo_mean'] = random_loo_mean(cluster_tmax['mean'],
                                           cluster_tmax['count'],
                                           cluster_tmax['R'])
cluster_tmax['loo_mean_tapered'] = adjust_low_count_mean(cluster_tmax['count'],
                                                         cluster_tmax['loo_mean'])
cluster_tmax['loo_mean_cutoff'] = cut_off_loo_mean(cluster_tmax['sum'],
                                                   cluster_tmax['loo_mean'])
cluster_tmax['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_tmax['loo_mean'],
                                                        cluster_tmax['bin_prob'])
In [624]:
cluster_tmax.sort_values('loo_mean', ascending=False)
Out[624]:
In [625]:
cluster_tmax[['loo_mean',
              'loo_mean_tapered',
              'loo_mean_cutoff',
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_tmax_loo.csv'),
                                           index_label='ID')
In [626]:
cols = ['loo_mean',
        'loo_mean_tapered',
        'loo_mean_cutoff',
        'loo_mean_prob_bin']
for col in cols:
    print(col)
    print(cluster_tmax.groupby('R')[col].mean())
    print('')
In [627]:
cols = ['loo_mean',
        'loo_mean_tapered',
        'loo_mean_cutoff',
        'loo_mean_prob_bin']
for col in cols:
    print(col)
    print(cluster_n500.groupby('R')[col].mean())
    print('')
In [628]:
cols = ['loo_mean',
        'loo_mean_tapered',
        'loo_mean_cutoff',
        'loo_mean_prob_bin']
for col in cols:
    print(col)
    print(cluster_upath.groupby('R')[col].mean())
    print('')
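As a rough sanity check of how informative the encodings are, one could compute an in-sample AUC per column on the train rows. This is optimistic, since the LOO means were built from the same labels, so it is for orientation only, not validation:
In [ ]:
# Rough in-sample AUC per LOO feature; optimistic because the encodings saw these labels
for name, frame in [('n500', cluster_n500), ('upath', cluster_upath), ('tmax', cluster_tmax)]:
    train = frame[~frame['R'].isnull()]
    for col in cols:
        print('%s %s: %.4f' % (name, col, roc_auc_score(train['R'], train[col])))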
In [ ]: