In [1]:
import feather
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Custom modules
import const
import func

Load data


In [2]:
y = func.read_last_column(os.path.join(const.BASE_PATH,const.TRAIN_FILES[0]+'.csv'))
print y.head(3)
y = y.Response.values


    Response
Id          
4          0
6          0
7          0

In [3]:
# Load columns name
num_cols = func.get_columns_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]))[:200]

In [51]:
train_stack = feather.read_dataframe('divers/tr_stack1.feather')
#test_stack = feather.read_dataframe('divers/te_stack1.feather')
#tr_lauren = feather.read_dataframe('../input/tr_lauren.feather')
#te_lauren = feather.read_dataframe('../input/te_lauren.feather')

#leak = pd.read_csv('../input/leak_feature.csv')
tr_feather_set1 = feather.read_dataframe('divers/train.feather')
#te_feather_set1 = pd.read_csv('divers/test_eng.csv')
tr_feather_set1.columns = [x + '_v2' for x in tr_feather_set1.columns]
train = pd.concat([train_stack,tr_feather_set1],axis = 1)

In [47]:
set(train_stack.columns) & set(tr_feather_set1.columns)


Out[47]:
set()

In [48]:
features = list(train.columns)
features.remove("Y")
#features.remove("Id")
#features.remove("Id")
features.remove("Response")
#features.remove("tdeltadevrel_block1a")
features.remove("cluster_n500")
features.remove("unique_path")
features.remove('magic3')
features.remove('magic4')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-48-aa35992fc773> in <module>()
      3 #features.remove("Id")
      4 #features.remove("Id")
----> 5 features.remove("Response")
      6 #features.remove("tdeltadevrel_block1a")
      7 features.remove("cluster_n500")

ValueError: list.remove(x): x not in list

In [52]:
X = train[features]

In [53]:
del train_stack,tr_feather_set1,train
import gc
gc.collect()


Out[53]:
40

In [54]:
print('X_num_raw: {}'.format(X.shape))


X_num_raw: (1183747, 152)

In [55]:
print const.CV
with open(const.CV, 'rb') as f:
    cv = pickle.load(f)
n_cv = len(cv)


/Users/joostbloom/Documents/kaggle/bosch/data/folds_V1.pkl

In [56]:
n_cv


Out[56]:
5

In [57]:
x_train = xgb.DMatrix(X, 
                      label=y)

Train simple model


In [67]:
def score_xgboost_full(params):
    
    global counter
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting XGBoost score with ({}):'.format(counter))
    print('\t {} samples'.format(x_train.num_row()))
    print('\t {} features'.format(x_train.num_col()))
    print('\t {} parameters'.format(params))
    
    preds_val = np.zeros(y.shape)
    
    for (itrain, ival) in cv:
    
        x_tr = x_train.slice(itrain)
        x_va = x_train.slice(ival)

        watchlist = [ (x_tr, 'train'), (x_va, 'eval')]

        eval_result = {}

        bst = xgb.train(params, 
                        x_tr, 
                        num_boost_round=params['num_round'], 
                        evals=watchlist,
                        evals_result=eval_result,
                        early_stopping_rounds=params['early_stopping'],
                        verbose_eval=5)

        #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))

        train_score = eval_result['train']['auc'][bst.best_iteration]
        val_score = eval_result['eval']['auc'][bst.best_iteration]

        # pick the best threshold based on oof predictions
        preds_val[ival] = bst.predict(x_va, ntree_limit=bst.best_ntree_limit)
        thresholds = np.linspace(0.01, 0.99, 50)
        mcc = np.array([matthews_corrcoef(y[ival], preds_val[ival]>thr) for thr in thresholds])
        th_val = thresholds[mcc.argmax()]
        mcc_val = mcc.max()

        print train_score
        print val_score
        print th_val
        print mcc_val
    
    return preds_val

In [58]:
def score_xgboost(params):
    
    global counter
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting XGBoost score with ({}):'.format(counter))
    print('\t {} samples'.format(x_train.num_row()))
    print('\t {} features'.format(x_train.num_col()))
    print('\t {} parameters'.format(params))
        
    
    (itrain, ival) = cv[3]
    
    x_tr = x_train.slice(itrain)
    x_va = x_train.slice(ival)
    
    watchlist = [ (x_tr, 'train'), (x_va, 'eval')]
    
    eval_result = {}
        
    bst = xgb.train(params, 
                    x_tr, 
                    num_boost_round=params['num_round'], 
                    evals=watchlist,
                    evals_result=eval_result,
                    early_stopping_rounds=params['early_stopping'],
                    verbose_eval=5)

    #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))
    
    train_score = eval_result['train']['auc'][bst.best_iteration]
    val_score = eval_result['eval']['auc'][bst.best_iteration]
    
    # pick the best threshold based on oof predictions
    preds_val = bst.predict(x_va, ntree_limit=bst.best_ntree_limit)
    thresholds = np.linspace(0.01, 0.99, 50)
    mcc = np.array([matthews_corrcoef(y[ival], preds_val>thr) for thr in thresholds])
    th_val = thresholds[mcc.argmax()]
    mcc_val = mcc.max()
    
    print train_score
    print val_score
    print th_val
    print mcc_val
    
    return {'loss': 1-val_score, 
            'status': STATUS_OK, 
            'train_score': train_score, 
            'best_iter': bst.best_iteration, 
            'mcc': mcc_val, 
            'threshold': th_val}

In [ ]:
params = {'max_depth': 7, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 1
params['eval_metric'] = 'auc'
params['subsample'] = 0.9
params['colsample_bytree']= 0.8
params['min_child_weight'] = 12
params['booster'] = "gbtree"
params['seed'] = 1712
params['num_round'] = 200
params['early_stopping'] = 100
df = score_xgboost_full(params)


Predicting XGBoost score with (7):
	 1183747 samples
	 152 features
	 {'num_round': 200, 'colsample_bytree': 0.8, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 12, 'subsample': 0.9, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 100, 'booster': 'gbtree'} parameters
[0]	train-auc:0.882689	eval-auc:0.874368
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 100 rounds.
[5]	train-auc:0.910296	eval-auc:0.899835

In [63]:
params = {'max_depth': 7, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 1
params['eval_metric'] = 'auc'
params['subsample'] = hp.uniform('subsample', 0.7, 0.9) #,0.86
params['colsample_bytree']= hp.uniform('colsample_bytree', 0.7, 0.9) #0.92
params['min_child_weight'] = hp.choice('min_child_weight', range(50))
params['booster'] = "gbtree"
params['seed'] = 1712
params['num_round'] = 200
params['early_stopping'] = 30

In [60]:
# Hyperopt
trials = Trials()
counter = 0
best = fmin(score_xgboost, 
                    params, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)


Predicting XGBoost score with (1):
	 1183747 samples
	 152 features
	 {'num_round': 200, 'colsample_bytree': 0.7453702907128406, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 2, 'subsample': 0.8392938371195724, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.881851	eval-auc:0.887866
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.905195	eval-auc:0.907201
[10]	train-auc:0.911597	eval-auc:0.915983
[15]	train-auc:0.922715	eval-auc:0.91963
[20]	train-auc:0.924145	eval-auc:0.919616
[25]	train-auc:0.927452	eval-auc:0.922379
[30]	train-auc:0.929131	eval-auc:0.923269
[35]	train-auc:0.93105	eval-auc:0.923566
[40]	train-auc:0.933071	eval-auc:0.923991
[45]	train-auc:0.936111	eval-auc:0.924359
[50]	train-auc:0.940188	eval-auc:0.924883
[55]	train-auc:0.942824	eval-auc:0.924543
[60]	train-auc:0.947407	eval-auc:0.924586
[65]	train-auc:0.952034	eval-auc:0.92418
[70]	train-auc:0.957042	eval-auc:0.925099
[75]	train-auc:0.960417	eval-auc:0.924847
[80]	train-auc:0.964366	eval-auc:0.924491
[85]	train-auc:0.967466	eval-auc:0.923561
[90]	train-auc:0.969468	eval-auc:0.923094
[95]	train-auc:0.970715	eval-auc:0.922135
[100]	train-auc:0.972658	eval-auc:0.922331
Stopping. Best iteration:
[73]	train-auc:0.959057	eval-auc:0.925428

0.959057
0.925428
0.35
0.488913050663
Predicting XGBoost score with (2):
	 1183747 samples
	 152 features
	 {'num_round': 200, 'colsample_bytree': 0.8980266639177703, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 17, 'subsample': 0.7212129811909465, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.893134	eval-auc:0.898917
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.904599	eval-auc:0.906852
[10]	train-auc:0.911777	eval-auc:0.915794
[15]	train-auc:0.919464	eval-auc:0.923524
[20]	train-auc:0.923815	eval-auc:0.924568
[25]	train-auc:0.926368	eval-auc:0.924528
[30]	train-auc:0.928424	eval-auc:0.924501
[35]	train-auc:0.931506	eval-auc:0.925535
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-7bd170faee54> in <module>()
      6                     algo=tpe.suggest,
      7                     max_evals=200,
----> 8                     trials=trials)

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rseed)
    332 
    333     rval = FMinIter(algo, domain, trials, max_evals=max_evals)
--> 334     rval.exhaust()
    335     return trials.argmin
    336 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in exhaust(self)
    292     def exhaust(self):
    293         n_done = len(self.trials)
--> 294         self.run(self.max_evals - n_done, block_until_done=self.async)
    295         self.trials.refresh()
    296         return self

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    266             else:
    267                 # -- loop over trials and do the jobs directly
--> 268                 self.serial_evaluate()
    269 
    270             if stopped:

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
    185                 ctrl = base.Ctrl(self.trials, current_trial=trial)
    186                 try:
--> 187                     result = self.domain.evaluate(spec, ctrl)
    188                 except Exception, e:
    189                     logger.info('job exception: %s' % str(e))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in evaluate(self, config, ctrl, attach_attachments)
    112             pyll_rval = pyll.rec_eval(self.expr, memo=memo,
    113                     print_node_on_error=self.rec_eval_print_node_on_error)
--> 114             rval = self.fn(pyll_rval)
    115 
    116         if isinstance(rval, (float, int, np.number)):

<ipython-input-58-540c28882d45> in score_xgboost(params)
     27                     evals_result=eval_result,
     28                     early_stopping_rounds=params['early_stopping'],
---> 29                     verbose_eval=5)
     30 
     31     #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    201                            evals=evals,
    202                            obj=obj, feval=feval,
--> 203                            xgb_model=xgb_model, callbacks=callbacks)
    204 
    205 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

KeyboardInterrupt: 

In [16]:
par_values = {'max_depth': range(8,21)}
parameters = trials.trials[0]['misc']['vals'].keys()
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(16,16))
cmap = plt.cm.Dark2
par_best_score = {}
df = pd.DataFrame(columns=parameters + ['train_auc','val_auc'])
for i, val in enumerate(parameters):
    xs = np.array([t['misc']['vals'][val] for t in trials.trials if 'loss' in t['result']]).ravel()
    val_auc = [1-t['result']['loss'] for t in trials.trials if 'loss' in t['result']]
    train_auc = [t['result']['train_score'] for t in trials.trials if 'train_score' in t['result']]
    best_iter = [t['result']['best_iter'] for t in trials.trials if 'best_iter' in t['result']]
    mcc = [t['result']['mcc'] for t in trials.trials if 'mcc' in t['result']]
    tr = [t['result']['threshold'] for t in trials.trials if 'threshold' in t['result']]
    
    df[val] = xs
    df['val_auc'] = val_auc
    df['train_auc'] = train_auc
    df['best_iter'] = best_iter
    df['threshold'] = tr
    df['mcc'] = mcc
    
    
    par_best_score[val] = xs[val_auc.index(min(val_auc))]
    #print trials.trials[ys.index(max(ys))]
    #print i, val, max(ys)
    #xs, ys = zip(sorted(xs), sorted(ys))
    #ys = np.array(ys)
    axes[i/2,i%2].scatter(xs, mcc, s=20, linewidth=0.01, alpha=0.5, c=cmap(float(i)/len(parameters)))
    axes[i/2,i%2].set_title(val)
print par_best_score
df['diffs'] = df['train_auc'] - df['val_auc']


{'subsample': 0.85100526437557555, 'colsample_bytree': 0.88555831490955395, 'min_child_weight': 7}

In [17]:
ax = df.plot.scatter('threshold','mcc')
#ax.set_xlim([0.921, 0.926])



In [19]:
ax = df.plot.scatter('val_auc','mcc')
ax.set_xlim([0.924, 0.928])


Out[19]:
(0.924, 0.928)

In [22]:
ax = df.plot.scatter('subsample','diffs')
#ax.set_xlim([0.924, 0.928])



In [23]:
ax = df.plot.scatter('colsample_bytree','diffs')



In [24]:
ax = df.plot.scatter('min_child_weight','diffs')



In [25]:
df.sort_values('mcc', ascending=False)


Out[25]:
subsample colsample_bytree min_child_weight train_auc val_auc best_iter threshold mcc diffs
120 0.895085 0.786997 15 0.939548 0.926382 48 0.37 0.486593 0.013166
130 0.880313 0.783069 17 0.939069 0.925431 48 0.35 0.486413 0.013638
75 0.881585 0.785355 12 0.951036 0.925846 69 0.37 0.486052 0.025190
79 0.899020 0.724160 14 0.944426 0.926235 58 0.31 0.485772 0.018191
56 0.812120 0.766352 12 0.945547 0.926282 60 0.37 0.485350 0.019265
18 0.883477 0.755889 25 0.937977 0.925750 47 0.47 0.485346 0.012227
107 0.878744 0.742034 19 0.938234 0.925939 47 0.49 0.484729 0.012295
41 0.868346 0.814026 15 0.956644 0.926550 86 0.35 0.484634 0.030094
99 0.899894 0.746819 30 0.937995 0.925775 48 0.47 0.484619 0.012220
46 0.851005 0.885558 7 0.926866 0.924712 28 0.35 0.484173 0.002154
29 0.712419 0.777799 26 0.932349 0.926178 38 0.39 0.483842 0.006171
5 0.873327 0.869573 37 0.943992 0.925883 66 0.37 0.483785 0.018109
103 0.833497 0.772789 24 0.938730 0.925618 48 0.41 0.483778 0.013112
96 0.854625 0.768942 41 0.939363 0.925901 54 0.45 0.483701 0.013462
32 0.896444 0.834576 42 0.937163 0.926547 49 0.41 0.483590 0.010616
106 0.806648 0.779703 39 0.944256 0.926175 75 0.43 0.483521 0.018081
45 0.833690 0.841824 30 0.942620 0.926209 60 0.39 0.483494 0.016411
34 0.739197 0.871161 28 0.936528 0.926053 44 0.47 0.483299 0.010475
17 0.849581 0.887454 23 0.935343 0.924794 42 0.37 0.483204 0.010549
105 0.849638 0.712691 8 0.928402 0.924822 30 0.33 0.483027 0.003580
54 0.761035 0.747670 16 0.939215 0.925500 49 0.45 0.482930 0.013715
81 0.726999 0.809627 36 0.942292 0.925382 71 0.45 0.482916 0.016910
14 0.888770 0.757614 13 0.926696 0.925046 30 0.39 0.482783 0.001650
68 0.892973 0.776842 40 0.936393 0.925846 47 0.49 0.482708 0.010547
123 0.864691 0.739736 40 0.938774 0.926377 52 0.37 0.482651 0.012397
7 0.728326 0.764351 29 0.925730 0.925200 30 0.43 0.482577 0.000530
61 0.829872 0.755507 34 0.938620 0.926275 50 0.49 0.482532 0.012345
97 0.773013 0.730274 32 0.947059 0.925930 87 0.51 0.482532 0.021129
38 0.820305 0.887809 33 0.936668 0.925595 47 0.37 0.482532 0.011073
110 0.820132 0.878300 47 0.937734 0.925804 53 0.41 0.482413 0.011930
... ... ... ... ... ... ... ... ... ...
113 0.710836 0.853990 18 0.932794 0.925158 38 0.35 0.479573 0.007636
87 0.876259 0.874622 22 0.948921 0.926082 71 0.37 0.479532 0.022839
64 0.760006 0.726369 2 0.930094 0.924862 33 0.33 0.479500 0.005232
66 0.890211 0.743009 48 0.937383 0.926908 50 0.49 0.479434 0.010475
6 0.814106 0.824706 40 0.926890 0.925131 32 0.33 0.479431 0.001759
118 0.781745 0.735219 37 0.936042 0.925880 47 0.31 0.479425 0.010162
73 0.853200 0.894529 49 0.942018 0.926268 71 0.35 0.479404 0.015750
37 0.715954 0.774858 12 0.945479 0.925589 60 0.37 0.479396 0.019890
52 0.877743 0.750292 48 0.943692 0.926721 84 0.47 0.479337 0.016971
19 0.880412 0.722898 27 0.937720 0.926283 47 0.53 0.479314 0.011437
72 0.858856 0.708198 25 0.937482 0.925843 46 0.51 0.479212 0.011639
94 0.796606 0.752986 11 0.954200 0.925978 76 0.47 0.479189 0.028222
27 0.750597 0.701843 46 0.936484 0.926346 50 0.47 0.479065 0.010138
3 0.721034 0.741014 10 0.953339 0.926428 77 0.47 0.478972 0.026911
2 0.801345 0.739144 34 0.939154 0.926157 51 0.43 0.478957 0.012997
42 0.838683 0.827446 11 0.944097 0.925572 57 0.33 0.478883 0.018525
127 0.733754 0.760963 4 0.954633 0.927021 72 0.35 0.478631 0.027612
117 0.830155 0.701378 29 0.939066 0.925821 51 0.47 0.478558 0.013245
112 0.868097 0.748808 16 0.938571 0.925837 47 0.37 0.478400 0.012734
25 0.701303 0.850956 47 0.937811 0.925886 59 0.37 0.478400 0.011925
60 0.737347 0.806847 29 0.935730 0.925844 43 0.37 0.478259 0.009886
26 0.785795 0.806017 35 0.939065 0.925758 53 0.49 0.478230 0.013307
9 0.856181 0.788519 17 0.952733 0.925932 79 0.49 0.478230 0.026801
89 0.783331 0.800958 4 0.945395 0.925134 57 0.31 0.478105 0.020261
12 0.832266 0.730399 27 0.948218 0.926191 79 0.55 0.478083 0.022027
100 0.766700 0.894550 9 0.954679 0.926079 78 0.33 0.477572 0.028600
59 0.803396 0.719244 43 0.943719 0.926553 80 0.53 0.477169 0.017166
131 0.839156 0.732313 5 0.949754 0.925730 61 0.49 0.476963 0.024024
69 0.889458 0.716714 13 0.938119 0.925155 45 0.43 0.476526 0.012964
92 0.755688 0.851466 26 0.942694 0.925916 61 0.47 0.476252 0.016778

133 rows × 9 columns


In [66]:
#df.drop(['gamma'], axis=1, inplace=True)
#df.to_csv('./data/xgboost_hyperopt_1fold_100iter.csv', index=False)

In [122]:
df['colsample_bytree'] = df['colsample_bytree'].round(2)

In [127]:
df.sort_values('val_auc', ascending=False)


Out[127]:
colsample_bytree min_child_weight train_auc val_auc best_iter diffs
18 0.73 49 0.943012 0.924580 92 0.018432
2 0.70 29 0.947958 0.924540 98 0.023418
61 0.72 49 0.942023 0.924430 93 0.017593
26 0.72 49 0.942023 0.924430 93 0.017593
47 0.73 47 0.943834 0.924389 101 0.019445
75 0.73 30 0.947750 0.924377 96 0.023373
14 0.73 30 0.947750 0.924377 96 0.023373
43 0.74 37 0.945657 0.924363 96 0.021294
73 0.70 49 0.943502 0.924312 98 0.019190
71 0.71 25 0.952072 0.924288 107 0.027784
80 0.71 34 0.946239 0.924287 97 0.021952
12 0.71 31 0.947255 0.924267 97 0.022988
54 0.71 5 0.961884 0.924240 98 0.037644
11 0.71 35 0.946406 0.924147 98 0.022259
72 0.72 35 0.947205 0.924069 97 0.023136
0 0.71 45 0.943809 0.924064 93 0.019745
68 0.71 49 0.942241 0.924047 94 0.018194
36 0.71 44 0.943859 0.924041 97 0.019818
76 0.73 21 0.950278 0.924026 93 0.026252
19 0.72 21 0.950868 0.923988 97 0.026880
13 0.75 31 0.946689 0.923966 94 0.022723
21 0.73 10 0.962495 0.923921 123 0.038574
48 0.72 48 0.942325 0.923880 92 0.018445
7 0.75 25 0.943072 0.923878 71 0.019194
51 0.72 3 0.963697 0.923843 96 0.039854
39 0.72 27 0.949131 0.923785 95 0.025346
34 0.73 26 0.948836 0.923749 97 0.025087
4 0.70 9 0.958787 0.923718 99 0.035069
64 0.74 41 0.945394 0.923635 97 0.021759
66 0.70 33 0.947523 0.923630 101 0.023893
... ... ... ... ... ... ...
46 0.73 6 0.959922 0.923078 96 0.036844
6 0.72 31 0.948381 0.922992 98 0.025389
65 0.73 18 0.952082 0.922988 97 0.029094
45 0.71 1 0.968336 0.922968 92 0.045368
30 0.74 29 0.932928 0.922929 49 0.009999
25 0.74 19 0.947738 0.922893 80 0.024845
10 0.72 33 0.947813 0.922869 99 0.024944
41 0.73 15 0.952293 0.922809 93 0.029484
60 0.72 9 0.957347 0.922775 98 0.034572
29 0.71 36 0.946716 0.922715 99 0.024001
55 0.70 0 0.956594 0.922713 69 0.033881
38 0.74 20 0.951745 0.922702 97 0.029043
53 0.72 32 0.947660 0.922628 99 0.025032
58 0.73 8 0.956824 0.922622 94 0.034202
3 0.75 38 0.945914 0.922615 98 0.023299
63 0.71 38 0.944283 0.922557 93 0.021726
62 0.72 29 0.933078 0.922531 49 0.010547
37 0.71 16 0.954216 0.922526 96 0.031690
5 0.74 18 0.934604 0.922522 49 0.012082
57 0.73 14 0.947438 0.922505 71 0.024933
1 0.73 14 0.947438 0.922505 71 0.024933
52 0.71 12 0.953922 0.922501 92 0.031421
40 0.72 17 0.951551 0.922240 89 0.029311
79 0.72 2 0.965250 0.922161 95 0.043089
24 0.73 2 0.943175 0.922096 57 0.021079
42 0.73 11 0.941744 0.922063 60 0.019681
69 0.71 4 0.942256 0.922049 58 0.020207
16 0.71 4 0.942256 0.922049 58 0.020207
77 0.71 13 0.953533 0.921879 92 0.031654
28 0.72 13 0.954321 0.921653 93 0.032668

81 rows × 6 columns


In [78]:
df.head()


Out[78]:
subsample colsample_bytree gamma min_child_weight train_auc val_auc diffs
0 0.919647 0.734544 28 2 0.922576 0.918840 0.003736
1 0.860606 0.728616 7 17 0.947655 0.921702 0.025953
2 0.900673 0.731379 23 34 0.922294 0.919387 0.002907
3 0.860517 0.704370 1 10 0.947460 0.920868 0.026592
4 0.902383 0.709298 28 34 0.919536 0.918013 0.001523

In [79]:
df['subsample'] = df['subsample'].round(2)
df['colsample_bytree'] = df['colsample_bytree'].round(2)

In [80]:
def plot_scores_for_pars(par):
    f, ax = plt.subplots(1,3, figsize=(16,6), sharex=True)

    df.groupby(par)['val_auc'].mean().plot(ax=ax[0])
    df.groupby(par)['train_auc'].mean().plot(ax=ax[1])
    df.groupby(par)['diffs'].mean().plot(ax=ax[2])

    ax[0].set_ylabel('Test auc')
    ax[1].set_ylabel('Train auc')
    ax[2].set_ylabel('Difference')

    ax[0].set_xlabel(par)
    ax[1].set_xlabel(par)
    ax[2].set_xlabel(par)

In [81]:
plot_scores_for_pars('subsample')



In [82]:
plot_scores_for_pars('colsample_bytree')



In [85]:
plot_scores_for_pars('min_child_weight')



In [84]:
plot_scores_for_pars('gamma')



In [64]:
plot_scores_for_pars('gamma')



In [42]:
df.groupby('sub_r')['val_auc'].mean().plot()
df.groupby('sub_r')['train_auc'].mean().plot()


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ef145d0>

In [44]:
df.groupby('colt_r')['val_auc'].mean().plot()
df.groupby('colt_r')['train_auc'].mean().plot()


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x11effd550>

In [45]:
df.groupby('coll_r')['val_auc'].mean().plot()
df.groupby('coll_r')['train_auc'].mean().plot()


Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f238d50>

In [24]:
df.plot('train_auc', 'val_auc',kind='scatter', ylim=[0.918, 0.922])


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x11db2c090>

In [29]:
df.plot('val_auc', 'diffs', kind='scatter', xlim=[0.918, 0.922])


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e3be6d0>

In [20]:
df.plot('gamma', 'diffs',kind='scatter')


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x11cdb0710>

In [44]:
df.plot.scatter('colsample_bytree', 'val_auc', by='max_depth')


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x139938190>

In [ ]: