In [1]:
import feather
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
from numba import jit
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, hstack
from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm
%matplotlib inline
import xgboost as xgb
import subprocess
# Custom modules
import const
import func
In [2]:
# Empty frame that the cells below fill column-by-column with feature sets
# read from CSV; the first assignment determines its index.
df_out = pd.DataFrame()
In [3]:
# Read via the project helper func.read_first_column (defined outside this
# notebook) from the first train file; `id_tr.Id` is used further down as
# the index of the stacked train predictions.
id_tr = func.read_first_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]))
In [4]:
# Same as for the train ids, but from the first test file; `id_te.Id` is
# used further down as the index of the stacked test predictions.
id_te = func.read_first_column(os.path.join(const.BASE_PATH, const.TEST_FILES[0]))
In [5]:
# Copy the '32.0' and '36.0' columns of feat_set_destination_station.csv
# (indexed by 'ID') into df_out as 'S32' and 'S36'.
destination_stations = pd.read_csv(
    os.path.join(const.DATA_PATH, 'feat_set_destination_station.csv'),
    index_col='ID'
)
for target_col, source_col in (('S32', '32.0'), ('S36', '36.0')):
    df_out[target_col] = destination_stations[source_col]
df_out.head(2)
Out[5]:
In [6]:
# Fold the one-hot gf0/gf1 indicator columns back into one signed column
# each (weights -1, 0, +1), then copy both into df_out.
# NOTE(review): this path is hard-coded while the other cells use
# const.DATA_PATH -- confirm both point at the same directory.
df = pd.read_csv('data/feat_set_gf0_gf1_OHE.csv', index_col=0)
for prefix in ('gf0', 'gf1'):
    df[prefix] = (
        df[prefix + '_-1'] * -1
        + df[prefix + '_0'] * 0
        + df[prefix + '_1'] * 1
    )
df_out[['gf0', 'gf1']] = df[['gf0', 'gf1']]
df_out.head(2)
Out[6]:
In [7]:
# Features f1..f4 from feat_set_leaks.csv (indexed by 'Id'), aligned onto
# df_out's index.
leaks_path = os.path.join(const.DATA_PATH, 'feat_set_leaks.csv')
df2 = pd.read_csv(leaks_path, index_col='Id')
leak_cols = ['f1', 'f2', 'f3', 'f4']
df_out[leak_cols] = df2[leak_cols]
df_out.head(3)
Out[7]:
In [9]:
# Every column of feat_set_jayjay_same_L_new_0.csv (indexed by 'ID') is
# copied into df_out under its own name.
jayjay_path = os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_new_0.csv')
df2 = pd.read_csv(jayjay_path, index_col='ID')
jayjay_cols = df2.columns
df_out[jayjay_cols] = df2[jayjay_cols]
df_out.head(3)
Out[9]:
In [10]:
# Persist the combined feature frame; the index is written under the
# header 'ID'.
df_out.to_csv(os.path.join(const.BASE_PATH, 'submission_analysis_feat_set.csv'), index_label='ID')
In [11]:
#y_train = func.read_last_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0] + '.csv'))
In [12]:
@jit
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from the four confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : numbers
        True positives, true negatives, false positives, false negatives.

    Returns
    -------
    float
        ``(tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn))``, or 0.0
        when the denominator is zero (some row or column of the confusion
        matrix is empty).
    """
    # Work in floating point: with integer counts in the millions the
    # four-factor product exceeds the int64 range (silent wrap-around when
    # compiled, np.sqrt failure on a big Python int otherwise).
    tp = float(tp)
    tn = float(tn)
    fp = float(fp)
    fn = float(fn)
    numerator = tp * tn - fp * fn
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denominator == 0:
        return 0.0
    return numerator / np.sqrt(denominator)
# Object mode is required here: the `show` branch prints and plots, and the
# two branches return differently shaped tuples. Bare @jit no longer falls
# back to object mode on recent numba releases, so request it explicitly.
@jit(forceobj=True)
def eval_mcc(y_true, y_prob, show=False):
    """Try every distinct score in ``y_prob`` as a threshold; keep the best MCC.

    Samples are sorted by score. At sorted position ``i`` every sample from
    ``i`` onwards (score >= that sample's score) counts as predicted
    positive and every earlier one as predicted negative. The MCC is only
    re-evaluated when the score changes, so tied scores are never split.

    Parameters
    ----------
    y_true : numpy array
        Labels; entries equal to 1 are positives, anything else negatives.
    y_prob : numpy array
        Scores, same length as ``y_true``.
    show : bool
        If True, print a cross-check of the best MCC, plot the MCC curve
        and return the thresholded predictions.

    Returns
    -------
    show=False : ``(best_mcc, best_proba, tp, tn, fp, fn)`` where the four
        counts describe the confusion matrix at ``best_proba``.
    show=True : ``(best_proba, best_mcc, y_pred)`` with
        ``y_pred = (y_prob >= best_proba)`` as ints.
    Note that the two modes order ``best_mcc`` / ``best_proba`` differently.
    """
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positives
    numn = n - nump              # number of negatives
    # Start from "everything predicted positive".
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_proba = -1.0
    # Confusion counts at the best threshold found so far.
    best_tp = tp
    best_tn = tn
    best_fp = fp
    best_fn = fn
    # NaN never compares equal, so the first sample always triggers an
    # evaluation (a numeric sentinel could collide with a real score).
    prev_proba = np.nan
    new_mcc = 0.0
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_proba = proba
                best_tp = tp
                best_tn = tn
                best_fp = fp
                best_fn = fn
        mccs[i] = new_mcc
        # Sample i moves to the "predicted negative" side for the next step.
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        # Recount the confusion matrix from y_pred so the printed score is
        # an independent check of best_mcc (the loop counters are exhausted
        # at this point and would always give 0).
        pred_pos = y_pred == 1
        true_pos = y_true == 1
        score = mcc(1.0 * np.sum(pred_pos & true_pos),
                    1.0 * np.sum(~pred_pos & ~true_pos),
                    1.0 * np.sum(pred_pos & ~true_pos),
                    1.0 * np.sum(~pred_pos & true_pos))
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc, best_proba, best_tp, best_tn, best_fp, best_fn
In [13]:
# Level-1 stacked predictions for train and test, re-indexed with the ids
# read earlier in the notebook.
train = feather.read_dataframe('divers/tr_stack1.feather')
test = feather.read_dataframe('divers/te_stack1.feather')
train.index = id_tr.Id
test.index = id_te.Id
# Split the 'Y' column off into R; `train` keeps the remaining columns only.
R = train.pop('Y')
In [14]:
# One row per remaining column of `train`; the cells below add the
# per-column statistics.
df_agg = pd.DataFrame(index=train.columns)
In [15]:
# Determine threshold/mcc per model in train set
for col in train.columns:
tmp = eval_mcc(R.values, train[col].values)
df_agg.loc[col, 'mcc'] = tmp[0]
df_agg.loc[col, 'tr'] = tmp[1]
In [16]:
# Transform train and test to 0/1 predictions
for col in train.columns:
train.loc[:, col] = (train.loc[:,col]>df_agg.loc[col, 'tr']).astype(int)
test.loc[:, col] = (test.loc[:,col]>df_agg.loc[col, 'tr']).astype(int)
In [17]:
# Add number of positives to each model
df_agg['r_count'] = R.sum()
df_agg['r_count_train'] = train.sum(0)
df_agg['r_count_test'] = test.sum(0)
In [18]:
# Preview of the first three rows of the per-column summary.
df_agg.head(3)
Out[18]:
In [19]:
# Workbook (written relative to the working directory, via the xlsxwriter
# engine) and the name of the sheet that the export cell below fills.
writer = pd.ExcelWriter('submission_compared.xlsx', engine='xlsxwriter')
sheet = 'model_lvl1_compared'
In [20]:
from xlsxwriter.utility import xl_range
In [22]:
def test_pred_on_feature(feat, y_pred, showplot=False, bins=None, output=['mean']):
#target_name = y_train.columns[0]
y = y_pred.copy()
# Bin feat data if requested
if bins:
feat = pd.cut(feat, bins)
y[feat.name] = feat
output = y.groupby(feat.name).agg(output)
#output['Ratio 0v1'] = np.abs(output[labels[0]]['mean'] / output['0: Train']['mean'])
#output['Diff 0v1'] = np.abs(output['Train']['sum'] - output[labels[0]]['sum'])
#output['Ratio 0v2'] = np.abs(output[labels[1]]['mean'] / output['0: Train']['mean'])
#output['Diff 0v2'] = np.abs(output['Train']['sum'] - output[labels[1]]['sum'])
#output.replace(np.inf, 0, inplace=True)
return output
In [23]:
# Sheet layout: the transposed per-model summary at the top, then one table
# per feature column of df_out stacked underneath it.
startrow = 0
df_agg.transpose().to_excel(writer, sheet_name=sheet, startrow=startrow)
startrow += df_agg.shape[1] + 2
for col in df_out.columns:
    print('Inserting {}'.format(col))
    bins = None
    if col in ['f1','f2','f3','f4']:
        # Three bins around the most frequent value: below / at / above.
        # NOTE(review): pd.cut intervals are right-closed by default, so rows
        # equal to the column minimum fall outside the first bin, and the
        # edges are not increasing when the most frequent value is the
        # minimum or maximum -- confirm this is acceptable for these columns.
        top = df_out[col].value_counts().index[0]
        bins = [df_out[col].min(), top-0.5, top+0.5, df_out[col].max()]
    # Per feature value/bin: mean and sum of R, and the sum of each column
    # of `test`.
    output_base = test_pred_on_feature(df_out[col], pd.DataFrame(R), bins=bins, output=['mean','sum'])
    output_pred = test_pred_on_feature(df_out[col], test, bins=bins, output=['sum'])
    # Ratio of each `test` column sum to the sum of R in the same group.
    output = (output_pred.apply(lambda x: x.astype(float) / output_base['Y']['sum'], axis=0))
    output = pd.concat([output_base, output], axis=1, join='inner')
    # Groups where the sum of R is zero give inf ratios; show them as 0.
    output = output.replace(np.inf, 0)
    # Column means, taken before the column labels are flattened below.
    summed = pd.DataFrame(output.mean(axis=0)).transpose()
    summed.index = ['mean']
    cols = list(output.columns.droplevel(1))
    cols[0] = 'y'
    output.columns = cols
    output.to_excel(writer, sheet_name=sheet, startrow=startrow)
    startrow += output.shape[0] + 3
    # NOTE(review): the -3 / -2 offsets below are kept exactly as written;
    # where the row lands depends on how to_excel lays out a frame with
    # MultiIndex columns and header=False -- verify against the workbook.
    summed.to_excel(writer, sheet_name=sheet, startrow=startrow-3, header=False)
    worksheet = writer.sheets[sheet]
    ran = xl_range(startrow-2, 3, startrow-2, output.shape[1])
    worksheet.conditional_format(ran, {'type': '3_color_scale'})
# ExcelWriter.save() was removed in pandas 2.0; close() writes the file and
# is also available on older releases.
writer.close()