In [1]:

    
import feather
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Custom modules
import const
import func

Load data



In [2]:

    
y = func.read_last_column(os.path.join(const.BASE_PATH,const.TRAIN_FILES[0]+'.csv'))
print y.head(3)
y = y.Response.values









    



    Response
Id          
4          0
6          0
7          0



In [3]:

    
# Load columns name
num_cols = func.get_columns_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]))[:200]



In [51]:

    
train_stack = feather.read_dataframe('divers/tr_stack1.feather')
#test_stack = feather.read_dataframe('divers/te_stack1.feather')
#tr_lauren = feather.read_dataframe('../input/tr_lauren.feather')
#te_lauren = feather.read_dataframe('../input/te_lauren.feather')

#leak = pd.read_csv('../input/leak_feature.csv')
tr_feather_set1 = feather.read_dataframe('divers/train.feather')
#te_feather_set1 = pd.read_csv('divers/test_eng.csv')
tr_feather_set1.columns = [x + '_v2' for x in tr_feather_set1.columns]
train = pd.concat([train_stack,tr_feather_set1],axis = 1)



In [47]:

    
set(train_stack.columns) & set(tr_feather_set1.columns)









    Out[47]:





set()



In [48]:

    
features = list(train.columns)
features.remove("Y")
#features.remove("Id")
#features.remove("Id")
features.remove("Response")
#features.remove("tdeltadevrel_block1a")
features.remove("cluster_n500")
features.remove("unique_path")
features.remove('magic3')
features.remove('magic4')









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-48-aa35992fc773> in <module>()
      3 #features.remove("Id")
      4 #features.remove("Id")
----> 5 features.remove("Response")
      6 #features.remove("tdeltadevrel_block1a")
      7 features.remove("cluster_n500")

ValueError: list.remove(x): x not in list



In [52]:

    
X = train[features]



In [53]:

    
del train_stack,tr_feather_set1,train
import gc
gc.collect()









    Out[53]:





40



In [54]:

    
print('X_num_raw: {}'.format(X.shape))









    



X_num_raw: (1183747, 152)



In [55]:

    
print const.CV
with open(const.CV, 'rb') as f:
    cv = pickle.load(f)
n_cv = len(cv)









    



/Users/joostbloom/Documents/kaggle/bosch/data/folds_V1.pkl



In [56]:

    
n_cv









    Out[56]:





5



In [57]:

    
x_train = xgb.DMatrix(X, 
                      label=y)

Train simple model



In [67]:

    
def score_xgboost_full(params):
    
    global counter
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting XGBoost score with ({}):'.format(counter))
    print('\t {} samples'.format(x_train.num_row()))
    print('\t {} features'.format(x_train.num_col()))
    print('\t {} parameters'.format(params))
    
    preds_val = np.zeros(y.shape)
    
    for (itrain, ival) in cv:
    
        x_tr = x_train.slice(itrain)
        x_va = x_train.slice(ival)

        watchlist = [ (x_tr, 'train'), (x_va, 'eval')]

        eval_result = {}

        bst = xgb.train(params, 
                        x_tr, 
                        num_boost_round=params['num_round'], 
                        evals=watchlist,
                        evals_result=eval_result,
                        early_stopping_rounds=params['early_stopping'],
                        verbose_eval=5)

        #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))

        train_score = eval_result['train']['auc'][bst.best_iteration]
        val_score = eval_result['eval']['auc'][bst.best_iteration]

        # pick the best threshold based on oof predictions
        preds_val[ival] = bst.predict(x_va, ntree_limit=bst.best_ntree_limit)
        thresholds = np.linspace(0.01, 0.99, 50)
        mcc = np.array([matthews_corrcoef(y[ival], preds_val[ival]>thr) for thr in thresholds])
        th_val = thresholds[mcc.argmax()]
        mcc_val = mcc.max()

        print train_score
        print val_score
        print th_val
        print mcc_val
    
    return preds_val



In [58]:

    
def score_xgboost(params):
    
    global counter
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting XGBoost score with ({}):'.format(counter))
    print('\t {} samples'.format(x_train.num_row()))
    print('\t {} features'.format(x_train.num_col()))
    print('\t {} parameters'.format(params))
        
    
    (itrain, ival) = cv[3]
    
    x_tr = x_train.slice(itrain)
    x_va = x_train.slice(ival)
    
    watchlist = [ (x_tr, 'train'), (x_va, 'eval')]
    
    eval_result = {}
        
    bst = xgb.train(params, 
                    x_tr, 
                    num_boost_round=params['num_round'], 
                    evals=watchlist,
                    evals_result=eval_result,
                    early_stopping_rounds=params['early_stopping'],
                    verbose_eval=5)

    #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))
    
    train_score = eval_result['train']['auc'][bst.best_iteration]
    val_score = eval_result['eval']['auc'][bst.best_iteration]
    
    # pick the best threshold based on oof predictions
    preds_val = bst.predict(x_va, ntree_limit=bst.best_ntree_limit)
    thresholds = np.linspace(0.01, 0.99, 50)
    mcc = np.array([matthews_corrcoef(y[ival], preds_val>thr) for thr in thresholds])
    th_val = thresholds[mcc.argmax()]
    mcc_val = mcc.max()
    
    print train_score
    print val_score
    print th_val
    print mcc_val
    
    return {'loss': 1-val_score, 
            'status': STATUS_OK, 
            'train_score': train_score, 
            'best_iter': bst.best_iteration, 
            'mcc': mcc_val, 
            'threshold': th_val}



In [ ]:

    
params = {'max_depth': 7, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 1
params['eval_metric'] = 'auc'
params['subsample'] = 0.9
params['colsample_bytree']= 0.8
params['min_child_weight'] = 12
params['booster'] = "gbtree"
params['seed'] = 1712
params['num_round'] = 200
params['early_stopping'] = 100
df = score_xgboost_full(params)









    



Predicting XGBoost score with (7):
	 1183747 samples
	 152 features
	 {'num_round': 200, 'colsample_bytree': 0.8, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 12, 'subsample': 0.9, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 100, 'booster': 'gbtree'} parameters
[0]	train-auc:0.882689	eval-auc:0.874368
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 100 rounds.
[5]	train-auc:0.910296	eval-auc:0.899835



In [63]:

    
params = {'max_depth': 7, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 1
params['eval_metric'] = 'auc'
params['subsample'] = hp.uniform('subsample', 0.7, 0.9) #,0.86
params['colsample_bytree']= hp.uniform('colsample_bytree', 0.7, 0.9) #0.92
params['min_child_weight'] = hp.choice('min_child_weight', range(50))
params['booster'] = "gbtree"
params['seed'] = 1712
params['num_round'] = 200
params['early_stopping'] = 30



In [60]:

    
# Hyperopt
trials = Trials()
counter = 0
best = fmin(score_xgboost, 
                    params, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)









    



Predicting XGBoost score with (1):
	 1183747 samples
	 152 features
	 {'num_round': 200, 'colsample_bytree': 0.7453702907128406, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 2, 'subsample': 0.8392938371195724, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.881851	eval-auc:0.887866
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.905195	eval-auc:0.907201
[10]	train-auc:0.911597	eval-auc:0.915983
[15]	train-auc:0.922715	eval-auc:0.91963
[20]	train-auc:0.924145	eval-auc:0.919616
[25]	train-auc:0.927452	eval-auc:0.922379
[30]	train-auc:0.929131	eval-auc:0.923269
[35]	train-auc:0.93105	eval-auc:0.923566
[40]	train-auc:0.933071	eval-auc:0.923991
[45]	train-auc:0.936111	eval-auc:0.924359
[50]	train-auc:0.940188	eval-auc:0.924883
[55]	train-auc:0.942824	eval-auc:0.924543
[60]	train-auc:0.947407	eval-auc:0.924586
[65]	train-auc:0.952034	eval-auc:0.92418
[70]	train-auc:0.957042	eval-auc:0.925099
[75]	train-auc:0.960417	eval-auc:0.924847
[80]	train-auc:0.964366	eval-auc:0.924491
[85]	train-auc:0.967466	eval-auc:0.923561
[90]	train-auc:0.969468	eval-auc:0.923094
[95]	train-auc:0.970715	eval-auc:0.922135
[100]	train-auc:0.972658	eval-auc:0.922331
Stopping. Best iteration:
[73]	train-auc:0.959057	eval-auc:0.925428

0.959057
0.925428
0.35
0.488913050663
Predicting XGBoost score with (2):
	 1183747 samples
	 152 features
	 {'num_round': 200, 'colsample_bytree': 0.8980266639177703, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 17, 'subsample': 0.7212129811909465, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.893134	eval-auc:0.898917
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.904599	eval-auc:0.906852
[10]	train-auc:0.911777	eval-auc:0.915794
[15]	train-auc:0.919464	eval-auc:0.923524
[20]	train-auc:0.923815	eval-auc:0.924568
[25]	train-auc:0.926368	eval-auc:0.924528
[30]	train-auc:0.928424	eval-auc:0.924501
[35]	train-auc:0.931506	eval-auc:0.925535






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-7bd170faee54> in <module>()
      6                     algo=tpe.suggest,
      7                     max_evals=200,
----> 8                     trials=trials)

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rseed)
    332 
    333     rval = FMinIter(algo, domain, trials, max_evals=max_evals)
--> 334     rval.exhaust()
    335     return trials.argmin
    336 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in exhaust(self)
    292     def exhaust(self):
    293         n_done = len(self.trials)
--> 294         self.run(self.max_evals - n_done, block_until_done=self.async)
    295         self.trials.refresh()
    296         return self

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    266             else:
    267                 # -- loop over trials and do the jobs directly
--> 268                 self.serial_evaluate()
    269 
    270             if stopped:

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
    185                 ctrl = base.Ctrl(self.trials, current_trial=trial)
    186                 try:
--> 187                     result = self.domain.evaluate(spec, ctrl)
    188                 except Exception, e:
    189                     logger.info('job exception: %s' % str(e))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in evaluate(self, config, ctrl, attach_attachments)
    112             pyll_rval = pyll.rec_eval(self.expr, memo=memo,
    113                     print_node_on_error=self.rec_eval_print_node_on_error)
--> 114             rval = self.fn(pyll_rval)
    115 
    116         if isinstance(rval, (float, int, np.number)):

<ipython-input-58-540c28882d45> in score_xgboost(params)
     27                     evals_result=eval_result,
     28                     early_stopping_rounds=params['early_stopping'],
---> 29                     verbose_eval=5)
     30 
     31     #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    201                            evals=evals,
    202                            obj=obj, feval=feval,
--> 203                            xgb_model=xgb_model, callbacks=callbacks)
    204 
    205 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

KeyboardInterrupt:



In [16]:

    
par_values = {'max_depth': range(8,21)}
parameters = trials.trials[0]['misc']['vals'].keys()
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(16,16))
cmap = plt.cm.Dark2
par_best_score = {}
df = pd.DataFrame(columns=parameters + ['train_auc','val_auc'])
for i, val in enumerate(parameters):
    xs = np.array([t['misc']['vals'][val] for t in trials.trials if 'loss' in t['result']]).ravel()
    val_auc = [1-t['result']['loss'] for t in trials.trials if 'loss' in t['result']]
    train_auc = [t['result']['train_score'] for t in trials.trials if 'train_score' in t['result']]
    best_iter = [t['result']['best_iter'] for t in trials.trials if 'best_iter' in t['result']]
    mcc = [t['result']['mcc'] for t in trials.trials if 'mcc' in t['result']]
    tr = [t['result']['threshold'] for t in trials.trials if 'threshold' in t['result']]
    
    df[val] = xs
    df['val_auc'] = val_auc
    df['train_auc'] = train_auc
    df['best_iter'] = best_iter
    df['threshold'] = tr
    df['mcc'] = mcc
    
    
    par_best_score[val] = xs[val_auc.index(min(val_auc))]
    #print trials.trials[ys.index(max(ys))]
    #print i, val, max(ys)
    #xs, ys = zip(sorted(xs), sorted(ys))
    #ys = np.array(ys)
    axes[i/2,i%2].scatter(xs, mcc, s=20, linewidth=0.01, alpha=0.5, c=cmap(float(i)/len(parameters)))
    axes[i/2,i%2].set_title(val)
print par_best_score
df['diffs'] = df['train_auc'] - df['val_auc']









    



{'subsample': 0.85100526437557555, 'colsample_bytree': 0.88555831490955395, 'min_child_weight': 7}



In [17]:

    
ax = df.plot.scatter('threshold','mcc')
#ax.set_xlim([0.921, 0.926])



In [19]:

    
ax = df.plot.scatter('val_auc','mcc')
ax.set_xlim([0.924, 0.928])









    Out[19]:





(0.924, 0.928)



In [22]:

    
ax = df.plot.scatter('subsample','diffs')
#ax.set_xlim([0.924, 0.928])



In [23]:

    
ax = df.plot.scatter('colsample_bytree','diffs')



In [24]:

    
ax = df.plot.scatter('min_child_weight','diffs')



In [25]:

    
df.sort_values('mcc', ascending=False)









    Out[25]:






  
    
      
      subsample
      colsample_bytree
      min_child_weight
      train_auc
      val_auc
      best_iter
      threshold
      mcc
      diffs
    
  
  
    
      120
      0.895085
      0.786997
      15
      0.939548
      0.926382
      48
      0.37
      0.486593
      0.013166
    
    
      130
      0.880313
      0.783069
      17
      0.939069
      0.925431
      48
      0.35
      0.486413
      0.013638
    
    
      75
      0.881585
      0.785355
      12
      0.951036
      0.925846
      69
      0.37
      0.486052
      0.025190
    
    
      79
      0.899020
      0.724160
      14
      0.944426
      0.926235
      58
      0.31
      0.485772
      0.018191
    
    
      56
      0.812120
      0.766352
      12
      0.945547
      0.926282
      60
      0.37
      0.485350
      0.019265
    
    
      18
      0.883477
      0.755889
      25
      0.937977
      0.925750
      47
      0.47
      0.485346
      0.012227
    
    
      107
      0.878744
      0.742034
      19
      0.938234
      0.925939
      47
      0.49
      0.484729
      0.012295
    
    
      41
      0.868346
      0.814026
      15
      0.956644
      0.926550
      86
      0.35
      0.484634
      0.030094
    
    
      99
      0.899894
      0.746819
      30
      0.937995
      0.925775
      48
      0.47
      0.484619
      0.012220
    
    
      46
      0.851005
      0.885558
      7
      0.926866
      0.924712
      28
      0.35
      0.484173
      0.002154
    
    
      29
      0.712419
      0.777799
      26
      0.932349
      0.926178
      38
      0.39
      0.483842
      0.006171
    
    
      5
      0.873327
      0.869573
      37
      0.943992
      0.925883
      66
      0.37
      0.483785
      0.018109
    
    
      103
      0.833497
      0.772789
      24
      0.938730
      0.925618
      48
      0.41
      0.483778
      0.013112
    
    
      96
      0.854625
      0.768942
      41
      0.939363
      0.925901
      54
      0.45
      0.483701
      0.013462
    
    
      32
      0.896444
      0.834576
      42
      0.937163
      0.926547
      49
      0.41
      0.483590
      0.010616
    
    
      106
      0.806648
      0.779703
      39
      0.944256
      0.926175
      75
      0.43
      0.483521
      0.018081
    
    
      45
      0.833690
      0.841824
      30
      0.942620
      0.926209
      60
      0.39
      0.483494
      0.016411
    
    
      34
      0.739197
      0.871161
      28
      0.936528
      0.926053
      44
      0.47
      0.483299
      0.010475
    
    
      17
      0.849581
      0.887454
      23
      0.935343
      0.924794
      42
      0.37
      0.483204
      0.010549
    
    
      105
      0.849638
      0.712691
      8
      0.928402
      0.924822
      30
      0.33
      0.483027
      0.003580
    
    
      54
      0.761035
      0.747670
      16
      0.939215
      0.925500
      49
      0.45
      0.482930
      0.013715
    
    
      81
      0.726999
      0.809627
      36
      0.942292
      0.925382
      71
      0.45
      0.482916
      0.016910
    
    
      14
      0.888770
      0.757614
      13
      0.926696
      0.925046
      30
      0.39
      0.482783
      0.001650
    
    
      68
      0.892973
      0.776842
      40
      0.936393
      0.925846
      47
      0.49
      0.482708
      0.010547
    
    
      123
      0.864691
      0.739736
      40
      0.938774
      0.926377
      52
      0.37
      0.482651
      0.012397
    
    
      7
      0.728326
      0.764351
      29
      0.925730
      0.925200
      30
      0.43
      0.482577
      0.000530
    
    
      61
      0.829872
      0.755507
      34
      0.938620
      0.926275
      50
      0.49
      0.482532
      0.012345
    
    
      97
      0.773013
      0.730274
      32
      0.947059
      0.925930
      87
      0.51
      0.482532
      0.021129
    
    
      38
      0.820305
      0.887809
      33
      0.936668
      0.925595
      47
      0.37
      0.482532
      0.011073
    
    
      110
      0.820132
      0.878300
      47
      0.937734
      0.925804
      53
      0.41
      0.482413
      0.011930
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      113
      0.710836
      0.853990
      18
      0.932794
      0.925158
      38
      0.35
      0.479573
      0.007636
    
    
      87
      0.876259
      0.874622
      22
      0.948921
      0.926082
      71
      0.37
      0.479532
      0.022839
    
    
      64
      0.760006
      0.726369
      2
      0.930094
      0.924862
      33
      0.33
      0.479500
      0.005232
    
    
      66
      0.890211
      0.743009
      48
      0.937383
      0.926908
      50
      0.49
      0.479434
      0.010475
    
    
      6
      0.814106
      0.824706
      40
      0.926890
      0.925131
      32
      0.33
      0.479431
      0.001759
    
    
      118
      0.781745
      0.735219
      37
      0.936042
      0.925880
      47
      0.31
      0.479425
      0.010162
    
    
      73
      0.853200
      0.894529
      49
      0.942018
      0.926268
      71
      0.35
      0.479404
      0.015750
    
    
      37
      0.715954
      0.774858
      12
      0.945479
      0.925589
      60
      0.37
      0.479396
      0.019890
    
    
      52
      0.877743
      0.750292
      48
      0.943692
      0.926721
      84
      0.47
      0.479337
      0.016971
    
    
      19
      0.880412
      0.722898
      27
      0.937720
      0.926283
      47
      0.53
      0.479314
      0.011437
    
    
      72
      0.858856
      0.708198
      25
      0.937482
      0.925843
      46
      0.51
      0.479212
      0.011639
    
    
      94
      0.796606
      0.752986
      11
      0.954200
      0.925978
      76
      0.47
      0.479189
      0.028222
    
    
      27
      0.750597
      0.701843
      46
      0.936484
      0.926346
      50
      0.47
      0.479065
      0.010138
    
    
      3
      0.721034
      0.741014
      10
      0.953339
      0.926428
      77
      0.47
      0.478972
      0.026911
    
    
      2
      0.801345
      0.739144
      34
      0.939154
      0.926157
      51
      0.43
      0.478957
      0.012997
    
    
      42
      0.838683
      0.827446
      11
      0.944097
      0.925572
      57
      0.33
      0.478883
      0.018525
    
    
      127
      0.733754
      0.760963
      4
      0.954633
      0.927021
      72
      0.35
      0.478631
      0.027612
    
    
      117
      0.830155
      0.701378
      29
      0.939066
      0.925821
      51
      0.47
      0.478558
      0.013245
    
    
      112
      0.868097
      0.748808
      16
      0.938571
      0.925837
      47
      0.37
      0.478400
      0.012734
    
    
      25
      0.701303
      0.850956
      47
      0.937811
      0.925886
      59
      0.37
      0.478400
      0.011925
    
    
      60
      0.737347
      0.806847
      29
      0.935730
      0.925844
      43
      0.37
      0.478259
      0.009886
    
    
      26
      0.785795
      0.806017
      35
      0.939065
      0.925758
      53
      0.49
      0.478230
      0.013307
    
    
      9
      0.856181
      0.788519
      17
      0.952733
      0.925932
      79
      0.49
      0.478230
      0.026801
    
    
      89
      0.783331
      0.800958
      4
      0.945395
      0.925134
      57
      0.31
      0.478105
      0.020261
    
    
      12
      0.832266
      0.730399
      27
      0.948218
      0.926191
      79
      0.55
      0.478083
      0.022027
    
    
      100
      0.766700
      0.894550
      9
      0.954679
      0.926079
      78
      0.33
      0.477572
      0.028600
    
    
      59
      0.803396
      0.719244
      43
      0.943719
      0.926553
      80
      0.53
      0.477169
      0.017166
    
    
      131
      0.839156
      0.732313
      5
      0.949754
      0.925730
      61
      0.49
      0.476963
      0.024024
    
    
      69
      0.889458
      0.716714
      13
      0.938119
      0.925155
      45
      0.43
      0.476526
      0.012964
    
    
      92
      0.755688
      0.851466
      26
      0.942694
      0.925916
      61
      0.47
      0.476252
      0.016778
    
  

133 rows × 9 columns



In [66]:

    
#df.drop(['gamma'], axis=1, inplace=True)
#df.to_csv('./data/xgboost_hyperopt_1fold_100iter.csv', index=False)



In [122]:

    
df['colsample_bytree'] = df['colsample_bytree'].round(2)



In [127]:

    
df.sort_values('val_auc', ascending=False)









    Out[127]:






  
    
      
      colsample_bytree
      min_child_weight
      train_auc
      val_auc
      best_iter
      diffs
    
  
  
    
      18
      0.73
      49
      0.943012
      0.924580
      92
      0.018432
    
    
      2
      0.70
      29
      0.947958
      0.924540
      98
      0.023418
    
    
      61
      0.72
      49
      0.942023
      0.924430
      93
      0.017593
    
    
      26
      0.72
      49
      0.942023
      0.924430
      93
      0.017593
    
    
      47
      0.73
      47
      0.943834
      0.924389
      101
      0.019445
    
    
      75
      0.73
      30
      0.947750
      0.924377
      96
      0.023373
    
    
      14
      0.73
      30
      0.947750
      0.924377
      96
      0.023373
    
    
      43
      0.74
      37
      0.945657
      0.924363
      96
      0.021294
    
    
      73
      0.70
      49
      0.943502
      0.924312
      98
      0.019190
    
    
      71
      0.71
      25
      0.952072
      0.924288
      107
      0.027784
    
    
      80
      0.71
      34
      0.946239
      0.924287
      97
      0.021952
    
    
      12
      0.71
      31
      0.947255
      0.924267
      97
      0.022988
    
    
      54
      0.71
      5
      0.961884
      0.924240
      98
      0.037644
    
    
      11
      0.71
      35
      0.946406
      0.924147
      98
      0.022259
    
    
      72
      0.72
      35
      0.947205
      0.924069
      97
      0.023136
    
    
      0
      0.71
      45
      0.943809
      0.924064
      93
      0.019745
    
    
      68
      0.71
      49
      0.942241
      0.924047
      94
      0.018194
    
    
      36
      0.71
      44
      0.943859
      0.924041
      97
      0.019818
    
    
      76
      0.73
      21
      0.950278
      0.924026
      93
      0.026252
    
    
      19
      0.72
      21
      0.950868
      0.923988
      97
      0.026880
    
    
      13
      0.75
      31
      0.946689
      0.923966
      94
      0.022723
    
    
      21
      0.73
      10
      0.962495
      0.923921
      123
      0.038574
    
    
      48
      0.72
      48
      0.942325
      0.923880
      92
      0.018445
    
    
      7
      0.75
      25
      0.943072
      0.923878
      71
      0.019194
    
    
      51
      0.72
      3
      0.963697
      0.923843
      96
      0.039854
    
    
      39
      0.72
      27
      0.949131
      0.923785
      95
      0.025346
    
    
      34
      0.73
      26
      0.948836
      0.923749
      97
      0.025087
    
    
      4
      0.70
      9
      0.958787
      0.923718
      99
      0.035069
    
    
      64
      0.74
      41
      0.945394
      0.923635
      97
      0.021759
    
    
      66
      0.70
      33
      0.947523
      0.923630
      101
      0.023893
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      46
      0.73
      6
      0.959922
      0.923078
      96
      0.036844
    
    
      6
      0.72
      31
      0.948381
      0.922992
      98
      0.025389
    
    
      65
      0.73
      18
      0.952082
      0.922988
      97
      0.029094
    
    
      45
      0.71
      1
      0.968336
      0.922968
      92
      0.045368
    
    
      30
      0.74
      29
      0.932928
      0.922929
      49
      0.009999
    
    
      25
      0.74
      19
      0.947738
      0.922893
      80
      0.024845
    
    
      10
      0.72
      33
      0.947813
      0.922869
      99
      0.024944
    
    
      41
      0.73
      15
      0.952293
      0.922809
      93
      0.029484
    
    
      60
      0.72
      9
      0.957347
      0.922775
      98
      0.034572
    
    
      29
      0.71
      36
      0.946716
      0.922715
      99
      0.024001
    
    
      55
      0.70
      0
      0.956594
      0.922713
      69
      0.033881
    
    
      38
      0.74
      20
      0.951745
      0.922702
      97
      0.029043
    
    
      53
      0.72
      32
      0.947660
      0.922628
      99
      0.025032
    
    
      58
      0.73
      8
      0.956824
      0.922622
      94
      0.034202
    
    
      3
      0.75
      38
      0.945914
      0.922615
      98
      0.023299
    
    
      63
      0.71
      38
      0.944283
      0.922557
      93
      0.021726
    
    
      62
      0.72
      29
      0.933078
      0.922531
      49
      0.010547
    
    
      37
      0.71
      16
      0.954216
      0.922526
      96
      0.031690
    
    
      5
      0.74
      18
      0.934604
      0.922522
      49
      0.012082
    
    
      57
      0.73
      14
      0.947438
      0.922505
      71
      0.024933
    
    
      1
      0.73
      14
      0.947438
      0.922505
      71
      0.024933
    
    
      52
      0.71
      12
      0.953922
      0.922501
      92
      0.031421
    
    
      40
      0.72
      17
      0.951551
      0.922240
      89
      0.029311
    
    
      79
      0.72
      2
      0.965250
      0.922161
      95
      0.043089
    
    
      24
      0.73
      2
      0.943175
      0.922096
      57
      0.021079
    
    
      42
      0.73
      11
      0.941744
      0.922063
      60
      0.019681
    
    
      69
      0.71
      4
      0.942256
      0.922049
      58
      0.020207
    
    
      16
      0.71
      4
      0.942256
      0.922049
      58
      0.020207
    
    
      77
      0.71
      13
      0.953533
      0.921879
      92
      0.031654
    
    
      28
      0.72
      13
      0.954321
      0.921653
      93
      0.032668
    
  

81 rows × 6 columns



In [78]:

    
df.head()









    Out[78]:






  
    
      
      subsample
      colsample_bytree
      gamma
      min_child_weight
      train_auc
      val_auc
      diffs
    
  
  
    
      0
      0.919647
      0.734544
      28
      2
      0.922576
      0.918840
      0.003736
    
    
      1
      0.860606
      0.728616
      7
      17
      0.947655
      0.921702
      0.025953
    
    
      2
      0.900673
      0.731379
      23
      34
      0.922294
      0.919387
      0.002907
    
    
      3
      0.860517
      0.704370
      1
      10
      0.947460
      0.920868
      0.026592
    
    
      4
      0.902383
      0.709298
      28
      34
      0.919536
      0.918013
      0.001523



In [79]:

    
df['subsample'] = df['subsample'].round(2)
df['colsample_bytree'] = df['colsample_bytree'].round(2)



In [80]:

    
def plot_scores_for_pars(par):
    f, ax = plt.subplots(1,3, figsize=(16,6), sharex=True)

    df.groupby(par)['val_auc'].mean().plot(ax=ax[0])
    df.groupby(par)['train_auc'].mean().plot(ax=ax[1])
    df.groupby(par)['diffs'].mean().plot(ax=ax[2])

    ax[0].set_ylabel('Test auc')
    ax[1].set_ylabel('Train auc')
    ax[2].set_ylabel('Difference')

    ax[0].set_xlabel(par)
    ax[1].set_xlabel(par)
    ax[2].set_xlabel(par)



In [81]:

    
plot_scores_for_pars('subsample')



In [82]:

    
plot_scores_for_pars('colsample_bytree')



In [85]:

    
plot_scores_for_pars('min_child_weight')



In [84]:

    
plot_scores_for_pars('gamma')



In [64]:

    
plot_scores_for_pars('gamma')



In [42]:

    
df.groupby('sub_r')['val_auc'].mean().plot()
df.groupby('sub_r')['train_auc'].mean().plot()









    Out[42]:





<matplotlib.axes._subplots.AxesSubplot at 0x11ef145d0>



In [44]:

    
df.groupby('colt_r')['val_auc'].mean().plot()
df.groupby('colt_r')['train_auc'].mean().plot()









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x11effd550>



In [45]:

    
df.groupby('coll_r')['val_auc'].mean().plot()
df.groupby('coll_r')['train_auc'].mean().plot()









    Out[45]:





<matplotlib.axes._subplots.AxesSubplot at 0x11f238d50>



In [24]:

    
df.plot('train_auc', 'val_auc',kind='scatter', ylim=[0.918, 0.922])









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x11db2c090>



In [29]:

    
df.plot('val_auc', 'diffs', kind='scatter', xlim=[0.918, 0.922])









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x11e3be6d0>



In [20]:

    
df.plot('gamma', 'diffs',kind='scatter')









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x11cdb0710>



In [44]:

    
df.plot.scatter('colsample_bytree', 'val_auc', by='max_depth')









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x139938190>



In [ ]:

	subsample	colsample_bytree	min_child_weight	train_auc	val_auc	best_iter	threshold	mcc	diffs
120	0.895085	0.786997	15	0.939548	0.926382	48	0.37	0.486593	0.013166
130	0.880313	0.783069	17	0.939069	0.925431	48	0.35	0.486413	0.013638
75	0.881585	0.785355	12	0.951036	0.925846	69	0.37	0.486052	0.025190
79	0.899020	0.724160	14	0.944426	0.926235	58	0.31	0.485772	0.018191
56	0.812120	0.766352	12	0.945547	0.926282	60	0.37	0.485350	0.019265
18	0.883477	0.755889	25	0.937977	0.925750	47	0.47	0.485346	0.012227
107	0.878744	0.742034	19	0.938234	0.925939	47	0.49	0.484729	0.012295
41	0.868346	0.814026	15	0.956644	0.926550	86	0.35	0.484634	0.030094
99	0.899894	0.746819	30	0.937995	0.925775	48	0.47	0.484619	0.012220
46	0.851005	0.885558	7	0.926866	0.924712	28	0.35	0.484173	0.002154
29	0.712419	0.777799	26	0.932349	0.926178	38	0.39	0.483842	0.006171
5	0.873327	0.869573	37	0.943992	0.925883	66	0.37	0.483785	0.018109
103	0.833497	0.772789	24	0.938730	0.925618	48	0.41	0.483778	0.013112
96	0.854625	0.768942	41	0.939363	0.925901	54	0.45	0.483701	0.013462
32	0.896444	0.834576	42	0.937163	0.926547	49	0.41	0.483590	0.010616
106	0.806648	0.779703	39	0.944256	0.926175	75	0.43	0.483521	0.018081
45	0.833690	0.841824	30	0.942620	0.926209	60	0.39	0.483494	0.016411
34	0.739197	0.871161	28	0.936528	0.926053	44	0.47	0.483299	0.010475
17	0.849581	0.887454	23	0.935343	0.924794	42	0.37	0.483204	0.010549
105	0.849638	0.712691	8	0.928402	0.924822	30	0.33	0.483027	0.003580
54	0.761035	0.747670	16	0.939215	0.925500	49	0.45	0.482930	0.013715
81	0.726999	0.809627	36	0.942292	0.925382	71	0.45	0.482916	0.016910
14	0.888770	0.757614	13	0.926696	0.925046	30	0.39	0.482783	0.001650
68	0.892973	0.776842	40	0.936393	0.925846	47	0.49	0.482708	0.010547
123	0.864691	0.739736	40	0.938774	0.926377	52	0.37	0.482651	0.012397
7	0.728326	0.764351	29	0.925730	0.925200	30	0.43	0.482577	0.000530
61	0.829872	0.755507	34	0.938620	0.926275	50	0.49	0.482532	0.012345
97	0.773013	0.730274	32	0.947059	0.925930	87	0.51	0.482532	0.021129
38	0.820305	0.887809	33	0.936668	0.925595	47	0.37	0.482532	0.011073
110	0.820132	0.878300	47	0.937734	0.925804	53	0.41	0.482413	0.011930
...	...	...	...	...	...	...	...	...	...
113	0.710836	0.853990	18	0.932794	0.925158	38	0.35	0.479573	0.007636
87	0.876259	0.874622	22	0.948921	0.926082	71	0.37	0.479532	0.022839
64	0.760006	0.726369	2	0.930094	0.924862	33	0.33	0.479500	0.005232
66	0.890211	0.743009	48	0.937383	0.926908	50	0.49	0.479434	0.010475
6	0.814106	0.824706	40	0.926890	0.925131	32	0.33	0.479431	0.001759
118	0.781745	0.735219	37	0.936042	0.925880	47	0.31	0.479425	0.010162
73	0.853200	0.894529	49	0.942018	0.926268	71	0.35	0.479404	0.015750
37	0.715954	0.774858	12	0.945479	0.925589	60	0.37	0.479396	0.019890
52	0.877743	0.750292	48	0.943692	0.926721	84	0.47	0.479337	0.016971
19	0.880412	0.722898	27	0.937720	0.926283	47	0.53	0.479314	0.011437
72	0.858856	0.708198	25	0.937482	0.925843	46	0.51	0.479212	0.011639
94	0.796606	0.752986	11	0.954200	0.925978	76	0.47	0.479189	0.028222
27	0.750597	0.701843	46	0.936484	0.926346	50	0.47	0.479065	0.010138
3	0.721034	0.741014	10	0.953339	0.926428	77	0.47	0.478972	0.026911
2	0.801345	0.739144	34	0.939154	0.926157	51	0.43	0.478957	0.012997
42	0.838683	0.827446	11	0.944097	0.925572	57	0.33	0.478883	0.018525
127	0.733754	0.760963	4	0.954633	0.927021	72	0.35	0.478631	0.027612
117	0.830155	0.701378	29	0.939066	0.925821	51	0.47	0.478558	0.013245
112	0.868097	0.748808	16	0.938571	0.925837	47	0.37	0.478400	0.012734
25	0.701303	0.850956	47	0.937811	0.925886	59	0.37	0.478400	0.011925
60	0.737347	0.806847	29	0.935730	0.925844	43	0.37	0.478259	0.009886
26	0.785795	0.806017	35	0.939065	0.925758	53	0.49	0.478230	0.013307
9	0.856181	0.788519	17	0.952733	0.925932	79	0.49	0.478230	0.026801
89	0.783331	0.800958	4	0.945395	0.925134	57	0.31	0.478105	0.020261
12	0.832266	0.730399	27	0.948218	0.926191	79	0.55	0.478083	0.022027
100	0.766700	0.894550	9	0.954679	0.926079	78	0.33	0.477572	0.028600
59	0.803396	0.719244	43	0.943719	0.926553	80	0.53	0.477169	0.017166
131	0.839156	0.732313	5	0.949754	0.925730	61	0.49	0.476963	0.024024
69	0.889458	0.716714	13	0.938119	0.925155	45	0.43	0.476526	0.012964
92	0.755688	0.851466	26	0.942694	0.925916	61	0.47	0.476252	0.016778

	colsample_bytree	min_child_weight	train_auc	val_auc	best_iter	diffs
18	0.73	49	0.943012	0.924580	92	0.018432
2	0.70	29	0.947958	0.924540	98	0.023418
61	0.72	49	0.942023	0.924430	93	0.017593
26	0.72	49	0.942023	0.924430	93	0.017593
47	0.73	47	0.943834	0.924389	101	0.019445
75	0.73	30	0.947750	0.924377	96	0.023373
14	0.73	30	0.947750	0.924377	96	0.023373
43	0.74	37	0.945657	0.924363	96	0.021294
73	0.70	49	0.943502	0.924312	98	0.019190
71	0.71	25	0.952072	0.924288	107	0.027784
80	0.71	34	0.946239	0.924287	97	0.021952
12	0.71	31	0.947255	0.924267	97	0.022988
54	0.71	5	0.961884	0.924240	98	0.037644
11	0.71	35	0.946406	0.924147	98	0.022259
72	0.72	35	0.947205	0.924069	97	0.023136
0	0.71	45	0.943809	0.924064	93	0.019745
68	0.71	49	0.942241	0.924047	94	0.018194
36	0.71	44	0.943859	0.924041	97	0.019818
76	0.73	21	0.950278	0.924026	93	0.026252
19	0.72	21	0.950868	0.923988	97	0.026880
13	0.75	31	0.946689	0.923966	94	0.022723
21	0.73	10	0.962495	0.923921	123	0.038574
48	0.72	48	0.942325	0.923880	92	0.018445
7	0.75	25	0.943072	0.923878	71	0.019194
51	0.72	3	0.963697	0.923843	96	0.039854
39	0.72	27	0.949131	0.923785	95	0.025346
34	0.73	26	0.948836	0.923749	97	0.025087
4	0.70	9	0.958787	0.923718	99	0.035069
64	0.74	41	0.945394	0.923635	97	0.021759
66	0.70	33	0.947523	0.923630	101	0.023893
...	...	...	...	...	...	...
46	0.73	6	0.959922	0.923078	96	0.036844
6	0.72	31	0.948381	0.922992	98	0.025389
65	0.73	18	0.952082	0.922988	97	0.029094
45	0.71	1	0.968336	0.922968	92	0.045368
30	0.74	29	0.932928	0.922929	49	0.009999
25	0.74	19	0.947738	0.922893	80	0.024845
10	0.72	33	0.947813	0.922869	99	0.024944
41	0.73	15	0.952293	0.922809	93	0.029484
60	0.72	9	0.957347	0.922775	98	0.034572
29	0.71	36	0.946716	0.922715	99	0.024001
55	0.70	0	0.956594	0.922713	69	0.033881
38	0.74	20	0.951745	0.922702	97	0.029043
53	0.72	32	0.947660	0.922628	99	0.025032
58	0.73	8	0.956824	0.922622	94	0.034202
3	0.75	38	0.945914	0.922615	98	0.023299
63	0.71	38	0.944283	0.922557	93	0.021726
62	0.72	29	0.933078	0.922531	49	0.010547
37	0.71	16	0.954216	0.922526	96	0.031690
5	0.74	18	0.934604	0.922522	49	0.012082
57	0.73	14	0.947438	0.922505	71	0.024933
1	0.73	14	0.947438	0.922505	71	0.024933
52	0.71	12	0.953922	0.922501	92	0.031421
40	0.72	17	0.951551	0.922240	89	0.029311
79	0.72	2	0.965250	0.922161	95	0.043089
24	0.73	2	0.943175	0.922096	57	0.021079
42	0.73	11	0.941744	0.922063	60	0.019681
69	0.71	4	0.942256	0.922049	58	0.020207
16	0.71	4	0.942256	0.922049	58	0.020207
77	0.71	13	0.953533	0.921879	92	0.031654
28	0.72	13	0.954321	0.921653	93	0.032668

	subsample	colsample_bytree	gamma	min_child_weight	train_auc	val_auc	diffs
0	0.919647	0.734544	28	2	0.922576	0.918840	0.003736
1	0.860606	0.728616	7	17	0.947655	0.921702	0.025953
2	0.900673	0.731379	23	34	0.922294	0.919387	0.002907
3	0.860517	0.704370	1	10	0.947460	0.920868	0.026592
4	0.902383	0.709298	28	34	0.919536	0.918013	0.001523