In [1]:
import feather
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold  # formerly sklearn.cross_validation, removed in sklearn 0.20
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

# Custom modules
import const
import func

Create a single set of features to compare on


In [2]:
df_out = pd.DataFrame()

In [3]:
# Read the train and test IDs (first column of the raw files)
id_tr = func.read_first_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]))

In [4]:
id_te = func.read_first_column(os.path.join(const.BASE_PATH, const.TEST_FILES[0]))
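
func.read_first_column is a helper from the project's custom func module; a minimal sketch of its assumed behaviour (illustrative only, the name read_first_column_sketch and the implementation are not from the source):

# Sketch only: assumed behaviour of func.read_first_column, i.e. read just
# the first (ID) column without loading the full wide CSV into memory.
def read_first_column_sketch(path):
    return pd.read_csv(path, usecols=[0])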

In [5]:
# Load the destination-station features (S32/S36) from feat_set_destination_station.csv
destination_stations = pd.read_csv(os.path.join(const.DATA_PATH, 
                                                'feat_set_destination_station.csv'), 
                                   index_col='ID')

df_out['S32'] = destination_stations['32.0']
df_out['S36'] = destination_stations['36.0']

df_out.head(2)


Out[5]:
S32 S36
ID
4 NaN NaN
6 NaN NaN

In [6]:
df = pd.read_csv('data/feat_set_gf0_gf1_OHE.csv', index_col=0)

# Collapse the one-hot encoded gf0/gf1 back to single ordinal features in {-1, 0, 1}
df['gf0'] = df['gf0_-1']*-1 + df['gf0_0']*0 + df['gf0_1']*1
df['gf1'] = df['gf1_-1']*-1 + df['gf1_0']*0 + df['gf1_1']*1

df_out[['gf0','gf1']] = df[['gf0','gf1']]
df_out.head(2)


Out[6]:
S32 S36 gf0 gf1
ID
4 NaN NaN -1.0 -1.0
6 NaN NaN -1.0 -1.0
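
The same collapse generalises to any one-hot block via a dot product with the encoded values; a minimal sketch (collapse_ohe is not in the source; it assumes the prefix_value column naming used above, e.g. gf0_-1, gf0_0, gf0_1):

# Generic collapse of one-hot columns back to their ordinal values
def collapse_ohe(df, prefix):
    cols = [c for c in df.columns if c.startswith(prefix + '_')]
    values = np.array([float(c.split('_', 1)[1]) for c in cols])
    return df[cols].values.dot(values)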

In [7]:
df2 = pd.read_csv(os.path.join(const.DATA_PATH, 'feat_set_leaks.csv'), index_col='Id')

df_out[['f1','f2','f3','f4']] = df2[['f1','f2','f3','f4']]
df_out.head(3)


Out[7]:
S32 S36 gf0 gf1 f1 f2 f3 f4
ID
4 NaN NaN -1.0 -1.0 0 -2 -2314450 -224451
6 NaN NaN -1.0 -1.0 2 -1 -2284042 -67530
7 NaN NaN -1.0 -1.0 1 -2 -2313512 -26876

In [9]:
df2 = pd.read_csv(os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_new_0.csv'), index_col='ID')

df_out[df2.columns] = df2[df2.columns]
df_out.head(3)


Out[9]:
S32 S36 gf0 gf1 f1 f2 f3 f4 sameL0_prev sameL0_next sameL1_prev sameL1_next sameL2_prev sameL2_next sameL3_prev sameL3_next
ID
4 NaN NaN -1.0 -1.0 0 -2 -2314450 -224451 0 0 0 1 0 0 0 0
6 NaN NaN -1.0 -1.0 2 -1 -2284042 -67530 0 0 1 1 0 1 0 0
7 NaN NaN -1.0 -1.0 1 -2 -2313512 -26876 0 0 1 0 1 0 0 0

In [10]:
df_out.to_csv(os.path.join(const.BASE_PATH, 'submission_analysis_feat_set.csv'), index_label='ID')

In [11]:
#y_train = func.read_last_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0] + '.csv'))

In [12]:
@jit
def mcc(tp, tn, fp, fn):
    # Matthews correlation coefficient computed directly from the confusion counts
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0.0
    else:
        return sup / np.sqrt(inf)
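
In formula form this is the standard Matthews correlation coefficient:

\mathrm{MCC} = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)\,(TP+FN)\,(TN+FP)\,(TN+FN)}}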

@jit
def eval_mcc(y_true, y_prob, show=False):
    # Scan every candidate threshold over the sorted probabilities and keep
    # the one that maximises MCC, updating the confusion counts incrementally.
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true) # number of positives
    numn = n - nump             # number of negatives
    # Start with everything predicted positive, then move items to negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    prev_proba = -1
    best_proba = -1
    new_mcc = 0.0
    mccs = np.zeros(n)
    for i in range(n):
        # all items with sorted index < i are predicted negative, the rest positive
        # only evaluate mcc when the probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_id = i
                best_proba = proba
        mccs[i] = new_mcc
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        # Sanity check against sklearn; the counters are exhausted after the
        # loop (tp == fp == 0), so mcc(tp, tn, fp, fn) would always return 0 here.
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc, best_proba, tp, tn, fp, fn
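
A quick sanity check of eval_mcc on synthetic data (illustrative only; positives get systematically higher probabilities, so the optimal threshold should recover an MCC close to 1):

# Synthetic check: separable classes should yield a near-perfect best MCC
rng = np.random.RandomState(0)
y_true_demo = rng.binomial(1, 0.1, 1000).astype(float)
y_prob_demo = y_true_demo * 0.6 + rng.uniform(0, 0.5, 1000)
best_mcc_demo, best_proba_demo = eval_mcc(y_true_demo, y_prob_demo)[:2]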

In [13]:
train = feather.read_dataframe('divers/tr_stack1.feather')
test = feather.read_dataframe('divers/te_stack1.feather')

train.index = id_tr.Id
test.index = id_te.Id

# Split off the target: R holds Y, train keeps only the level-1 model predictions
R = train['Y']
train.drop('Y', axis=1, inplace=True)

In [14]:
df_agg = pd.DataFrame(index=train.columns)

In [15]:
# Determine threshold/mcc per model in train set
for col in train.columns:
    tmp = eval_mcc(R.values, train[col].values)
    df_agg.loc[col, 'mcc'] = tmp[0]
    df_agg.loc[col, 'tr'] = tmp[1]


/Users/joostbloom/anaconda/lib/python2.7/site-packages/numba/dataflow.py:297: RuntimeWarning: Python2 style print partially supported.  Please use Python3 style print.
  "Python3 style print.", RuntimeWarning)

In [16]:
# Transform train and test to 0/1 predictions
for col in train.columns:
    train.loc[:, col] = (train.loc[:,col]>df_agg.loc[col, 'tr']).astype(int)
    test.loc[:, col] = (test.loc[:,col]>df_agg.loc[col, 'tr']).astype(int)
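
The same binarisation can be written without the Python loop; a vectorised sketch (train_bin/test_bin are new names so the originals are not clobbered; it relies on train's columns matching df_agg's index, which holds here by construction):

# Vectorised equivalent: compare every column against its own threshold at once
train_bin = train.gt(df_agg['tr'], axis=1).astype(int)
test_bin = test.gt(df_agg['tr'], axis=1).astype(int)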

In [17]:
# Actual positive count, plus predicted positive counts per model on train and test
df_agg['r_count'] = R.sum()
df_agg['r_count_train'] = train.sum(0)
df_agg['r_count_test'] = test.sum(0)

In [18]:
df_agg.head(3)


Out[18]:
mcc tr r_count r_count_train r_count_test
xgb_jay 0.450187 0.422528 6879.0 2816 2735
gbm_jay 0.474026 0.411417 6879.0 2939 3000
rf_jay 0.397868 0.239798 6879.0 1735 1782

Compare predictions per feature and write to Excel


In [19]:
writer = pd.ExcelWriter('submission_compared.xlsx', engine='xlsxwriter')
sheet = 'model_lvl1_compared'

In [20]:
from xlsxwriter.utility import xl_range

In [22]:
def test_pred_on_feature(feat, y_pred, showplot=False, bins=None, output=['mean']):
    # Aggregate the prediction columns grouped by the (optionally binned) feature
    y = y_pred.copy()
    
    # Bin the feature data if requested
    if bins:
        feat = pd.cut(feat, bins)
    
    y[feat.name] = feat
    
    return y.groupby(feat.name).agg(output)
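
On a toy frame the helper returns one row per feature value with the requested aggregates per prediction column (hypothetical data, demo_feat/demo_pred are made up just to show the shape of the result):

# Hypothetical example: two prediction columns grouped by a binary feature
demo_feat = pd.Series([0, 0, 1, 1], name='f_demo')
demo_pred = pd.DataFrame({'m1': [0, 1, 1, 1], 'm2': [0, 0, 1, 0]})
test_pred_on_feature(demo_feat, demo_pred, output=['mean', 'sum'])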

In [23]:
startrow = 0

df_agg.transpose().to_excel(writer, sheet_name=sheet, startrow=startrow)
startrow += df_agg.shape[1] + 2

for col in df_out.columns:
    print('Inserting {}'.format(col))
    bins = None
    if col in ['f1','f2','f3','f4']:
        # Bin the high-cardinality leak features into below/at/above their modal value
        top = df_out[col].value_counts().index[0]
        bins = [df_out[col].min(), top-0.5, top+0.5, df_out[col].max()]
    
    # Base rates from the train target, and predicted-positive counts per model
    output_base = test_pred_on_feature(df_out[col], pd.DataFrame(R), bins=bins, output=['mean','sum'])
    output_pred = test_pred_on_feature(df_out[col], test, bins=bins, output=['sum'])
    
    # Normalise each model's predicted positives by the actual positives per bin
    output = output_pred.apply(lambda x: x.astype(float) / output_base['Y']['sum'], axis=0)
    output = pd.concat([output_base, output], axis=1, join='inner')
    output.replace(np.inf, 0, inplace=True)
    summed = pd.DataFrame(output.mean(0)).transpose()
    summed.index = ['mean']
    
    # Flatten the MultiIndex columns for the Excel sheet
    cols = list(output.columns.droplevel(1))
    cols[0] = 'y'
    output.columns = cols
    
    output.to_excel(writer, sheet_name=sheet, startrow=startrow)
    startrow += output.shape[0] + 3
    summed.to_excel(writer, sheet_name=sheet, startrow=startrow-3, header=False)
        
    worksheet = writer.sheets[sheet]
    # Apply a 3-colour scale conditional format across the per-model columns
    ran = xl_range(startrow-2, 3, startrow-2, output.shape[1])
    worksheet.conditional_format(ran, {'type': '3_color_scale'})
    

writer.save()


Inserting S32
Inserting S36
Inserting gf0
Inserting gf1
Inserting f1
Inserting f2
Inserting f3
Inserting f4
Inserting sameL0_prev
Inserting sameL0_next
Inserting sameL1_prev
Inserting sameL1_next
Inserting sameL2_prev
Inserting sameL2_next
Inserting sameL3_prev
Inserting sameL3_next
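
For reference, xl_range simply converts zero-based (row, col) coordinates into the A1-style range string that conditional_format expects:

# xl_range(first_row, first_col, last_row, last_col) -> A1-style string
xl_range(0, 3, 0, 5)  # 'D1:F1'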