In [1]:

    
import feather
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Custom modules
import const
import func

Load data



In [2]:

    
y = func.read_last_column(os.path.join(const.BASE_PATH,const.TRAIN_FILES[0]+'.csv'))
print y.head(3)
y = y.Response.values









    



    Response
Id          
4          0
6          0
7          0



In [3]:

    
# Load columns name
num_cols = func.get_columns_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]))[:200]



In [4]:

    
# Load raw numeric features
X = feather.read_dataframe('./divers/train_top199.feather')



In [5]:

    
X.fillna(9999, inplace=True)



In [6]:

    
# Load some extra features
#X2 = pd.read_csv('feat_set_test.csv', index_col='ID')



In [7]:

    
#X = pd.concat([X, 
#               pd.get_dummies(X2.reset_index(drop=True)['ID_diff']),
#               pd.get_dummies(X2.reset_index(drop=True)['ID_diff_rev'], prefix='rev')], axis=1)



In [8]:

    
print('X_num_raw: {}'.format(X.shape))









    



X_num_raw: (1183747, 199)



In [9]:

    
cv = StratifiedKFold(y, 5, shuffle=True, random_state=123)



In [10]:

    
print const.CV
with open(const.CV, 'rb') as f:
    cv = pickle.load(f)
n_cv = len(cv)









    



/Users/joostbloom/Documents/kaggle/bosch/data/folds_V1.pkl



In [11]:

    
n_cv









    Out[11]:





5



In [211]:

    
x_train = xgb.DMatrix(X, 
                      label=y)

Train simple model



In [15]:

    
from sklearn.ensemble import RandomForestClassifier



In [ ]:

    
def score_xgboost(params):
    
    global counter
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting XGBoost score with ({}):'.format(counter))
    print('\t {} samples'.format(x_train.num_row()))
    print('\t {} features'.format(x_train.num_col()))
    print('\t {} parameters'.format(params))
        
    
    (itrain, ival) = cv[3]
    
    x_tr = x_train.slice(itrain)
    x_va = x_train.slice(ival)
    
    watchlist = [ (x_tr, 'train'), (x_va, 'eval')]
    
    eval_result = {}
        
    bst = xgb.train(params, 
                    x_tr, 
                    num_boost_round=params['num_round'], 
                    evals=watchlist,
                    evals_result=eval_result,
                    early_stopping_rounds=params['early_stopping'],
                    verbose_eval=5)

    #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))
    
    train_score = eval_result['train']['auc'][bst.best_iteration]
    val_score = eval_result['eval']['auc'][bst.best_iteration]
    
    # pick the best threshold based on oof predictions
    preds_val = bst.predict(x_va, ntree_limit=bst.best_ntree_limit)
    thresholds = np.linspace(0.01, 0.99, 50)
    mcc = np.array([matthews_corrcoef(y[ival], preds_val>thr) for thr in thresholds])
    th_val = thresholds[mcc.argmax()]
    mcc_val = mcc.max()
    
    print train_score
    print val_score
    print th_val
    print mcc_val
    
    return {'loss': 1-val_score, 
            'status': STATUS_OK, 
            'train_score': train_score, 
            'best_iter': bst.best_iteration, 
            'mcc': mcc_val, 
            'threshold': th_val}



In [37]:

    
def score_rf(params):
    
    global counter
    t = time.time()
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting RandomForest score with ({}):'.format(counter))
    print('\t {} samples'.format(X.shape[0]))
    print('\t {} features'.format(X.shape[1]))
    print('\t {} parameters'.format(params))
    
    n_est = params['n_estimators']
    max_feat = params['max_features']
    max_depth = params['max_depth']
    verb = params['verbose']
    rs = params['random_state']
        
    
    (itrain, ival) = cv[3]
    #itrain = itrain[:100000]
    
    bst = RandomForestClassifier(n_estimators=n_est, 
                                 verbose=verb,
                                random_state=rs,
                                max_features=max_feat,
                                max_depth=max_depth,
                                n_jobs=-1)
    print bst
    bst = bst.fit(X.iloc[itrain, :], y[itrain])
    
    train_score = roc_auc_score(y[itrain], bst.predict(X.iloc[itrain, :]))
    val_score = roc_auc_score(y[ival], bst.predict(X.iloc[ival, :]))
    
    # pick the best threshold based on oof predictions
    preds_val = bst.predict_proba(X.iloc[ival, :])[:,0]
    print preds_val
    thresholds = np.linspace(0.01, 0.99, 50)
    mcc = np.array([matthews_corrcoef(y[ival], preds_val>thr) for thr in thresholds])
    th_val = thresholds[mcc.argmax()]
    mcc_val = mcc.max()
    
    print train_score
    print val_score
    print th_val
    print mcc_val
    print time.time()-t
    
    return {'loss': 1-val_score, 
            'status': STATUS_OK, 
            'train_score': train_score, 
            'mcc': mcc_val, 
            'threshold': th_val,
            't': time.time()-t}



In [38]:

    
# Random Forest Params
params = {'n_estimators': 100}
params['random_state'] = 100
params['max_features'] = hp.choice('max_features', range(10, 199))
params['max_depth'] = hp.choice('max_depth', range(7,30))
params['verbose'] = 10
params['n_jobs'] = -1



In [39]:

    
# Hyperopt
trials = Trials()
counter = 0
best = fmin(score_rf, 
                    params, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)









    



Predicting RandomForest score with (1):
	 1183747 samples
	 199 features
	 {'n_estimators': 100, 'n_jobs': -1, 'verbose': 10, 'max_features': 119, 'random_state': 100, 'max_depth': 9} parameters
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features=119, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=100, verbose=10,
            warm_start=False)
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100






    



[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.5min






    



building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100






    



[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.7min






    



building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100






    



[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  6.2min






    



building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100






    



[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  7.4min






    



building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100






    



[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 11.1min






    



building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100






    



[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.6min






    



building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100






    



[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 17.3min






    



building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100






    



[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 19.8min






    



building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100






    



[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 24.7min






    



building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100






    



[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 28.4min






    



building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100






    



[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 30.9min finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished






    



[ 0.99844015  0.99684843  0.99929201 ...,  0.05667837  0.94516149
  0.78148215]
0.617804450538
0.605295060103
0.01
-0.0894200485013
1864.41564393
Predicting RandomForest score with (2):
	 1183747 samples
	 199 features
	 {'n_estimators': 100, 'n_jobs': -1, 'verbose': 10, 'max_features': 166, 'random_state': 100, 'max_depth': 24} parameters
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=24, max_features=166, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=100, verbose=10,
            warm_start=False)
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-39-3a45ec492ba1> in <module>()
      6                     algo=tpe.suggest,
      7                     max_evals=200,
----> 8                     trials=trials)

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rseed)
    332 
    333     rval = FMinIter(algo, domain, trials, max_evals=max_evals)
--> 334     rval.exhaust()
    335     return trials.argmin
    336 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in exhaust(self)
    292     def exhaust(self):
    293         n_done = len(self.trials)
--> 294         self.run(self.max_evals - n_done, block_until_done=self.async)
    295         self.trials.refresh()
    296         return self

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    266             else:
    267                 # -- loop over trials and do the jobs directly
--> 268                 self.serial_evaluate()
    269 
    270             if stopped:

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
    185                 ctrl = base.Ctrl(self.trials, current_trial=trial)
    186                 try:
--> 187                     result = self.domain.evaluate(spec, ctrl)
    188                 except Exception, e:
    189                     logger.info('job exception: %s' % str(e))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in evaluate(self, config, ctrl, attach_attachments)
    112             pyll_rval = pyll.rec_eval(self.expr, memo=memo,
    113                     print_node_on_error=self.rec_eval_print_node_on_error)
--> 114             rval = self.fn(pyll_rval)
    115 
    116         if isinstance(rval, (float, int, np.number)):

<ipython-input-37-45a349fd2368> in score_rf(params)
     29                                 n_jobs=-1)
     30     print bst
---> 31     bst = bst.fit(X.iloc[itrain, :], y[itrain])
     32 
     33     train_score = roc_auc_score(y[itrain], bst.predict(X.iloc[itrain, :]))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
    288                     t, self, X, y, sample_weight, i, len(trees),
    289                     verbose=self.verbose, class_weight=self.class_weight)
--> 290                 for i, t in enumerate(trees))
    291 
    292             # Collect newly grown trees

/Users/joostbloom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time

/Users/joostbloom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in retrieve(self)
    725                 job = self._jobs.pop(0)
    726             try:
--> 727                 self._output.extend(job.get())
    728             except tuple(self.exceptions) as exception:
    729                 # Stop dispatching any new job in the async callback thread

/Users/joostbloom/anaconda/lib/python2.7/multiprocessing/pool.pyc in get(self, timeout)
    559 
    560     def get(self, timeout=None):
--> 561         self.wait(timeout)
    562         if not self._ready:
    563             raise TimeoutError

/Users/joostbloom/anaconda/lib/python2.7/multiprocessing/pool.pyc in wait(self, timeout)
    554         try:
    555             if not self._ready:
--> 556                 self._cond.wait(timeout)
    557         finally:
    558             self._cond.release()

/Users/joostbloom/anaconda/lib/python2.7/threading.pyc in wait(self, timeout)
    338         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    339             if timeout is None:
--> 340                 waiter.acquire()
    341                 if __debug__:
    342                     self._note("%s.wait(): got it", self)

KeyboardInterrupt:



In [213]:

    
# XGBoost Params
params = {'max_depth': 7, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 1
params['eval_metric'] = 'auc'
params['subsample'] = hp.uniform('subsample', 0.7, 0.9) #,0.86
params['colsample_bytree']= hp.uniform('colsample_bytree', 0.7, 0.9) #0.92
params['min_child_weight'] = hp.choice('min_child_weight', range(50))
params['booster'] = "gbtree"
params['seed'] = 1712
params['num_round'] = 200
params['early_stopping'] = 30



In [214]:

    
# Hyperopt
trials = Trials()
counter = 0
best = fmin(score_xgboost, 
                    params, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)









    



Predicting XGBoost score with (1):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.7453702907128406, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 2, 'subsample': 0.8392938371195724, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.867207	eval-auc:0.873051
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.898239	eval-auc:0.900304
[10]	train-auc:0.905708	eval-auc:0.909996
[15]	train-auc:0.910038	eval-auc:0.912381
[20]	train-auc:0.916738	eval-auc:0.915035
[25]	train-auc:0.919084	eval-auc:0.915347
[30]	train-auc:0.92289	eval-auc:0.917575
[35]	train-auc:0.924875	eval-auc:0.917585
[40]	train-auc:0.930593	eval-auc:0.918583
[45]	train-auc:0.933461	eval-auc:0.919082
[50]	train-auc:0.937429	eval-auc:0.919758
[55]	train-auc:0.941318	eval-auc:0.918685
[60]	train-auc:0.946065	eval-auc:0.919393
[65]	train-auc:0.9496	eval-auc:0.920822
[70]	train-auc:0.953826	eval-auc:0.920526
[75]	train-auc:0.957756	eval-auc:0.921945
[80]	train-auc:0.960299	eval-auc:0.922854
[85]	train-auc:0.962387	eval-auc:0.92339
[90]	train-auc:0.965019	eval-auc:0.923267
[95]	train-auc:0.966425	eval-auc:0.922979
[100]	train-auc:0.967711	eval-auc:0.922816
[105]	train-auc:0.969176	eval-auc:0.923066
[110]	train-auc:0.970298	eval-auc:0.923205
Stopping. Best iteration:
[82]	train-auc:0.960947	eval-auc:0.923542

0.960947
0.923542
0.43
0.467485658843
Predicting XGBoost score with (2):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.8980266639177703, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 17, 'subsample': 0.7212129811909465, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.864343	eval-auc:0.869565
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.889238	eval-auc:0.889885
[10]	train-auc:0.892675	eval-auc:0.893925
[15]	train-auc:0.911287	eval-auc:0.91405
[20]	train-auc:0.915807	eval-auc:0.915146
[25]	train-auc:0.919535	eval-auc:0.915715
[30]	train-auc:0.923535	eval-auc:0.919075
[35]	train-auc:0.928089	eval-auc:0.919679
[40]	train-auc:0.931808	eval-auc:0.920687
[45]	train-auc:0.933352	eval-auc:0.920361
[50]	train-auc:0.935335	eval-auc:0.919503
[55]	train-auc:0.937538	eval-auc:0.920457
[60]	train-auc:0.939877	eval-auc:0.921004
[65]	train-auc:0.941501	eval-auc:0.922674
[70]	train-auc:0.94331	eval-auc:0.923136
[75]	train-auc:0.944683	eval-auc:0.922491
[80]	train-auc:0.947022	eval-auc:0.922225
[85]	train-auc:0.948472	eval-auc:0.92271
[90]	train-auc:0.949981	eval-auc:0.922523
[95]	train-auc:0.951713	eval-auc:0.921723
Stopping. Best iteration:
[68]	train-auc:0.94286	eval-auc:0.923329

0.94286
0.923329
0.35
0.462257549847
Predicting XGBoost score with (3):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.739144317250818, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 34, 'subsample': 0.801345202299914, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.858663	eval-auc:0.863485
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.891567	eval-auc:0.893424
[10]	train-auc:0.90638	eval-auc:0.910631
[15]	train-auc:0.909735	eval-auc:0.912267
[20]	train-auc:0.916526	eval-auc:0.917744
[25]	train-auc:0.917784	eval-auc:0.91753
[30]	train-auc:0.921465	eval-auc:0.919432
[35]	train-auc:0.923342	eval-auc:0.919033
[40]	train-auc:0.928687	eval-auc:0.919672
[45]	train-auc:0.930837	eval-auc:0.919901
[50]	train-auc:0.933015	eval-auc:0.921099
[55]	train-auc:0.934305	eval-auc:0.921451
[60]	train-auc:0.936361	eval-auc:0.921169
[65]	train-auc:0.938001	eval-auc:0.921948
[70]	train-auc:0.939736	eval-auc:0.922118
[75]	train-auc:0.941142	eval-auc:0.921947
[80]	train-auc:0.942057	eval-auc:0.922473
[85]	train-auc:0.943369	eval-auc:0.922307
[90]	train-auc:0.944584	eval-auc:0.92295
[95]	train-auc:0.945345	eval-auc:0.922934
[100]	train-auc:0.946451	eval-auc:0.922676
[105]	train-auc:0.947304	eval-auc:0.922459
[110]	train-auc:0.948216	eval-auc:0.922615
[115]	train-auc:0.948894	eval-auc:0.922701
[120]	train-auc:0.950044	eval-auc:0.922428
Stopping. Best iteration:
[94]	train-auc:0.945158	eval-auc:0.923018

0.945158
0.923018
0.25
0.468943862623
Predicting XGBoost score with (4):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.7410143734327053, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 10, 'subsample': 0.721034425443535, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.861042	eval-auc:0.866097
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.874991	eval-auc:0.88022
[10]	train-auc:0.902645	eval-auc:0.907294
[15]	train-auc:0.91004	eval-auc:0.912556
[20]	train-auc:0.917408	eval-auc:0.915894
[25]	train-auc:0.918707	eval-auc:0.915874
[30]	train-auc:0.922567	eval-auc:0.917761
[35]	train-auc:0.924465	eval-auc:0.917982
[40]	train-auc:0.929008	eval-auc:0.920142
[45]	train-auc:0.931855	eval-auc:0.919786
[50]	train-auc:0.935537	eval-auc:0.920682
[55]	train-auc:0.938028	eval-auc:0.920825
[60]	train-auc:0.94116	eval-auc:0.921698
[65]	train-auc:0.943445	eval-auc:0.922397
[70]	train-auc:0.945271	eval-auc:0.92203
[75]	train-auc:0.94793	eval-auc:0.922402
[80]	train-auc:0.949659	eval-auc:0.922743
[85]	train-auc:0.95143	eval-auc:0.923158
[90]	train-auc:0.95282	eval-auc:0.922941
[95]	train-auc:0.954532	eval-auc:0.922561
[100]	train-auc:0.955805	eval-auc:0.922697
[105]	train-auc:0.956831	eval-auc:0.922748
[110]	train-auc:0.958109	eval-auc:0.922656
[115]	train-auc:0.959173	eval-auc:0.922332
Stopping. Best iteration:
[85]	train-auc:0.95143	eval-auc:0.923158

0.95143
0.923158
0.45
0.466437997444
Predicting XGBoost score with (5):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.8189064535289485, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 34, 'subsample': 0.8047666639909352, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.859828	eval-auc:0.864589
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.887744	eval-auc:0.890307
[10]	train-auc:0.896932	eval-auc:0.898908






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-214-7bd170faee54> in <module>()
      6                     algo=tpe.suggest,
      7                     max_evals=200,
----> 8                     trials=trials)

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rseed)
    332 
    333     rval = FMinIter(algo, domain, trials, max_evals=max_evals)
--> 334     rval.exhaust()
    335     return trials.argmin
    336 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in exhaust(self)
    292     def exhaust(self):
    293         n_done = len(self.trials)
--> 294         self.run(self.max_evals - n_done, block_until_done=self.async)
    295         self.trials.refresh()
    296         return self

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    266             else:
    267                 # -- loop over trials and do the jobs directly
--> 268                 self.serial_evaluate()
    269 
    270             if stopped:

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
    185                 ctrl = base.Ctrl(self.trials, current_trial=trial)
    186                 try:
--> 187                     result = self.domain.evaluate(spec, ctrl)
    188                 except Exception, e:
    189                     logger.info('job exception: %s' % str(e))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in evaluate(self, config, ctrl, attach_attachments)
    112             pyll_rval = pyll.rec_eval(self.expr, memo=memo,
    113                     print_node_on_error=self.rec_eval_print_node_on_error)
--> 114             rval = self.fn(pyll_rval)
    115 
    116         if isinstance(rval, (float, int, np.number)):

<ipython-input-212-540c28882d45> in score_xgboost(params)
     27                     evals_result=eval_result,
     28                     early_stopping_rounds=params['early_stopping'],
---> 29                     verbose_eval=5)
     30 
     31     #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    201                            evals=evals,
    202                            obj=obj, feval=feval,
--> 203                            xgb_model=xgb_model, callbacks=callbacks)
    204 
    205 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

KeyboardInterrupt:



In [64]:

    
par_values = {'max_depth': range(8,21)}
parameters = trials.trials[0]['misc']['vals'].keys()
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(16,16))
cmap = plt.cm.Dark2
par_best_score = {}
df = pd.DataFrame(columns=parameters + ['train_auc','val_auc'])
for i, val in enumerate(parameters):
    xs = np.array([t['misc']['vals'][val] for t in trials.trials if 'loss' in t['result']]).ravel()
    val_auc = [1-t['result']['loss'] for t in trials.trials if 'loss' in t['result']]
    train_auc = [t['result']['train_score'] for t in trials.trials if 'train_score' in t['result']]
    best_iter = [t['result']['best_iter'] for t in trials.trials if 'best_iter' in t['result']]
    mcc = [t['result']['mcc'] for t in trials.trials if 'mcc' in t['result']]
    tr = [t['result']['threshold'] for t in trials.trials if 'threshold' in t['result']]
    
    df[val] = xs
    df['val_auc'] = val_auc
    df['train_auc'] = train_auc
    #df['best_iter'] = best_iter
    df['threshold'] = tr
    df['mcc'] = mcc
    
    
    par_best_score[val] = xs[val_auc.index(min(val_auc))]
    #print trials.trials[ys.index(max(ys))]
    #print i, val, max(ys)
    #xs, ys = zip(sorted(xs), sorted(ys))
    #ys = np.array(ys)
    axes[i/2,i%2].scatter(xs, mcc, s=20, linewidth=0.01, alpha=0.5, c=cmap(float(i)/len(parameters)))
    axes[i/2,i%2].set_title(val)
print par_best_score
df['diffs'] = df['train_auc'] - df['val_auc']









    



{'max_features': 60, 'max_depth': 2}



In [67]:

    
df.sort_values('mcc', ascending=False)









    Out[67]:






  
    
      
      max_features
      max_depth
      train_auc
      val_auc
      threshold
      mcc
      diffs
    
  
  
    
      7
      153
      21
      0.766582
      0.627767
      0.01
      0.462687
      0.138815
    
    
      3
      102
      15
      0.712884
      0.625604
      0.01
      0.462653
      0.087280
    
    
      1
      156
      17
      0.728784
      0.626322
      0.01
      0.461887
      0.102463
    
    
      6
      95
      7
      0.641454
      0.614734
      0.01
      0.448279
      0.026720
    
    
      5
      83
      5
      0.629273
      0.607834
      0.01
      0.434193
      0.021439
    
    
      2
      157
      2
      0.618620
      0.605658
      0.01
      0.430550
      0.012962
    
    
      0
      109
      2
      0.617804
      0.605295
      0.01
      0.429720
      0.012509
    
    
      4
      60
      2
      0.616809
      0.602388
      0.01
      0.423024
      0.014421



In [158]:

    
ax = df.plot.scatter('threshold','mcc')
#ax.set_xlim([0.921, 0.926])



In [153]:

    
ax = df.plot.scatter('val_auc','mcc')
ax.set_xlim([0.921, 0.926])









    Out[153]:





(0.921, 0.926)



In [154]:

    
df.sort_values('mcc', ascending=False)









    Out[154]:






  
    
      
      subsample
      colsample_bytree
      min_child_weight
      train_auc
      val_auc
      best_iter
      threshold
      mcc
      diffs
    
  
  
    
      14
      0.888770
      0.757614
      13
      0.948389
      0.923325
      76
      0.35
      0.476790
      0.025064
    
    
      27
      0.896235
      0.702619
      9
      0.953168
      0.922122
      79
      0.29
      0.475241
      0.031046
    
    
      9
      0.856181
      0.788519
      17
      0.951301
      0.922990
      90
      0.35
      0.474586
      0.028311
    
    
      61
      0.722819
      0.801908
      8
      0.956034
      0.923575
      92
      0.27
      0.473831
      0.032459
    
    
      0
      0.839294
      0.745370
      2
      0.963283
      0.922013
      89
      0.35
      0.473750
      0.041270
    
    
      68
      0.793881
      0.720002
      1
      0.968214
      0.922843
      98
      0.27
      0.473465
      0.045371
    
    
      12
      0.832266
      0.730399
      27
      0.948297
      0.924090
      93
      0.33
      0.473453
      0.024207
    
    
      34
      0.794364
      0.899290
      19
      0.951723
      0.924133
      103
      0.39
      0.472870
      0.027590
    
    
      74
      0.726263
      0.772719
      25
      0.947585
      0.925005
      95
      0.47
      0.472759
      0.022580
    
    
      29
      0.729298
      0.797721
      7
      0.952093
      0.923446
      78
      0.33
      0.472499
      0.028647
    
    
      13
      0.730643
      0.742692
      5
      0.953240
      0.921878
      78
      0.27
      0.472447
      0.031362
    
    
      44
      0.838611
      0.871020
      11
      0.945875
      0.923337
      67
      0.33
      0.472267
      0.022538
    
    
      48
      0.721228
      0.713318
      16
      0.952191
      0.923898
      102
      0.49
      0.472134
      0.028293
    
    
      28
      0.767491
      0.711478
      29
      0.943764
      0.923496
      86
      0.29
      0.471925
      0.020268
    
    
      52
      0.709163
      0.736322
      28
      0.942105
      0.923434
      78
      0.33
      0.471905
      0.018671
    
    
      63
      0.780771
      0.845361
      2
      0.963321
      0.922288
      85
      0.35
      0.471752
      0.041033
    
    
      47
      0.871268
      0.890522
      6
      0.960705
      0.923033
      98
      0.41
      0.471436
      0.037672
    
    
      64
      0.745138
      0.830198
      10
      0.941657
      0.921939
      61
      0.27
      0.471366
      0.019718
    
    
      11
      0.862281
      0.739042
      20
      0.951935
      0.923533
      98
      0.41
      0.470664
      0.028402
    
    
      36
      0.866948
      0.822641
      3
      0.962460
      0.923178
      90
      0.49
      0.470628
      0.039282
    
    
      40
      0.841258
      0.811096
      31
      0.946516
      0.923932
      95
      0.35
      0.470541
      0.022584
    
    
      75
      0.700611
      0.792820
      25
      0.947977
      0.923819
      96
      0.25
      0.470476
      0.024158
    
    
      3
      0.721034
      0.741014
      10
      0.954357
      0.923990
      91
      0.35
      0.470253
      0.030367
    
    
      42
      0.789075
      0.796233
      14
      0.946784
      0.921633
      75
      0.27
      0.470022
      0.025151
    
    
      38
      0.765356
      0.861523
      15
      0.950956
      0.923238
      89
      0.33
      0.469946
      0.027718
    
    
      23
      0.700015
      0.808804
      21
      0.954367
      0.923501
      116
      0.29
      0.469904
      0.030866
    
    
      51
      0.891505
      0.749526
      41
      0.944882
      0.923361
      97
      0.37
      0.469642
      0.021521
    
    
      18
      0.883477
      0.755889
      25
      0.949422
      0.923863
      95
      0.29
      0.469456
      0.025559
    
    
      8
      0.830031
      0.727011
      12
      0.956502
      0.923187
      99
      0.23
      0.469421
      0.033315
    
    
      56
      0.720414
      0.877109
      4
      0.941983
      0.921817
      57
      0.23
      0.469417
      0.020166
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      72
      0.715002
      0.776406
      25
      0.947588
      0.925590
      96
      0.43
      0.465791
      0.021998
    
    
      77
      0.717439
      0.760744
      25
      0.946491
      0.923965
      92
      0.29
      0.465534
      0.022526
    
    
      58
      0.754234
      0.803023
      24
      0.937602
      0.923397
      59
      0.39
      0.465481
      0.014205
    
    
      43
      0.825580
      0.850991
      30
      0.945602
      0.923256
      88
      0.49
      0.464824
      0.022346
    
    
      7
      0.728326
      0.764351
      29
      0.944889
      0.924246
      88
      0.39
      0.464788
      0.020643
    
    
      39
      0.815170
      0.808814
      47
      0.940374
      0.923357
      85
      0.41
      0.464724
      0.017017
    
    
      16
      0.798192
      0.886845
      12
      0.942308
      0.922977
      63
      0.35
      0.464354
      0.019331
    
    
      55
      0.735418
      0.766978
      40
      0.942307
      0.924307
      93
      0.37
      0.464354
      0.018000
    
    
      37
      0.783444
      0.839686
      35
      0.943493
      0.923331
      86
      0.41
      0.464224
      0.020162
    
    
      1
      0.721213
      0.898027
      17
      0.939529
      0.923031
      60
      0.41
      0.464053
      0.016498
    
    
      46
      0.740479
      0.789720
      26
      0.937137
      0.922958
      59
      0.43
      0.463952
      0.014179
    
    
      30
      0.711568
      0.843878
      42
      0.944003
      0.925081
      101
      0.39
      0.463900
      0.018922
    
    
      5
      0.873327
      0.869573
      37
      0.935023
      0.922882
      56
      0.37
      0.463842
      0.012141
    
    
      41
      0.712886
      0.831764
      36
      0.941796
      0.924019
      84
      0.49
      0.463653
      0.017777
    
    
      45
      0.805731
      0.773100
      45
      0.942412
      0.923387
      94
      0.29
      0.463626
      0.019025
    
    
      70
      0.768679
      0.733253
      20
      0.938015
      0.923316
      59
      0.35
      0.463619
      0.014699
    
    
      65
      0.762682
      0.750389
      43
      0.945967
      0.924177
      115
      0.49
      0.462542
      0.021790
    
    
      15
      0.849868
      0.756917
      49
      0.943393
      0.924129
      101
      0.29
      0.462517
      0.019264
    
    
      2
      0.801345
      0.739144
      34
      0.943840
      0.923203
      88
      0.35
      0.462433
      0.020637
    
    
      69
      0.707292
      0.862217
      29
      0.943418
      0.923348
      79
      0.37
      0.461647
      0.020070
    
    
      35
      0.739287
      0.877974
      46
      0.943048
      0.923769
      97
      0.45
      0.461318
      0.019279
    
    
      33
      0.713263
      0.860015
      33
      0.936903
      0.923830
      62
      0.45
      0.460675
      0.013073
    
    
      24
      0.703677
      0.842038
      38
      0.949978
      0.923266
      129
      0.27
      0.460240
      0.026712
    
    
      67
      0.730173
      0.729144
      42
      0.938717
      0.922742
      78
      0.31
      0.459604
      0.015975
    
    
      20
      0.755408
      0.780418
      49
      0.940010
      0.922737
      91
      0.45
      0.457516
      0.017273
    
    
      53
      0.774294
      0.816412
      48
      0.943967
      0.923696
      110
      0.35
      0.456941
      0.020271
    
    
      6
      0.814106
      0.824706
      40
      0.931967
      0.922060
      48
      0.31
      0.456523
      0.009907
    
    
      49
      0.808068
      0.851656
      22
      0.930251
      0.922660
      39
      0.35
      0.455818
      0.007591
    
    
      31
      0.819847
      0.852363
      42
      0.931549
      0.922102
      47
      0.35
      0.454549
      0.009447
    
    
      60
      0.744576
      0.783634
      44
      0.935360
      0.923383
      59
      0.29
      0.454160
      0.011977
    
  

78 rows × 9 columns



In [66]:

    
#df.drop(['gamma'], axis=1, inplace=True)
#df.to_csv('./data/xgboost_hyperopt_1fold_100iter.csv', index=False)



In [122]:

    
df['colsample_bytree'] = df['colsample_bytree'].round(2)



In [127]:

    
df.sort_values('val_auc', ascending=False)









    Out[127]:






  
    
      
      colsample_bytree
      min_child_weight
      train_auc
      val_auc
      best_iter
      diffs
    
  
  
    
      18
      0.73
      49
      0.943012
      0.924580
      92
      0.018432
    
    
      2
      0.70
      29
      0.947958
      0.924540
      98
      0.023418
    
    
      61
      0.72
      49
      0.942023
      0.924430
      93
      0.017593
    
    
      26
      0.72
      49
      0.942023
      0.924430
      93
      0.017593
    
    
      47
      0.73
      47
      0.943834
      0.924389
      101
      0.019445
    
    
      75
      0.73
      30
      0.947750
      0.924377
      96
      0.023373
    
    
      14
      0.73
      30
      0.947750
      0.924377
      96
      0.023373
    
    
      43
      0.74
      37
      0.945657
      0.924363
      96
      0.021294
    
    
      73
      0.70
      49
      0.943502
      0.924312
      98
      0.019190
    
    
      71
      0.71
      25
      0.952072
      0.924288
      107
      0.027784
    
    
      80
      0.71
      34
      0.946239
      0.924287
      97
      0.021952
    
    
      12
      0.71
      31
      0.947255
      0.924267
      97
      0.022988
    
    
      54
      0.71
      5
      0.961884
      0.924240
      98
      0.037644
    
    
      11
      0.71
      35
      0.946406
      0.924147
      98
      0.022259
    
    
      72
      0.72
      35
      0.947205
      0.924069
      97
      0.023136
    
    
      0
      0.71
      45
      0.943809
      0.924064
      93
      0.019745
    
    
      68
      0.71
      49
      0.942241
      0.924047
      94
      0.018194
    
    
      36
      0.71
      44
      0.943859
      0.924041
      97
      0.019818
    
    
      76
      0.73
      21
      0.950278
      0.924026
      93
      0.026252
    
    
      19
      0.72
      21
      0.950868
      0.923988
      97
      0.026880
    
    
      13
      0.75
      31
      0.946689
      0.923966
      94
      0.022723
    
    
      21
      0.73
      10
      0.962495
      0.923921
      123
      0.038574
    
    
      48
      0.72
      48
      0.942325
      0.923880
      92
      0.018445
    
    
      7
      0.75
      25
      0.943072
      0.923878
      71
      0.019194
    
    
      51
      0.72
      3
      0.963697
      0.923843
      96
      0.039854
    
    
      39
      0.72
      27
      0.949131
      0.923785
      95
      0.025346
    
    
      34
      0.73
      26
      0.948836
      0.923749
      97
      0.025087
    
    
      4
      0.70
      9
      0.958787
      0.923718
      99
      0.035069
    
    
      64
      0.74
      41
      0.945394
      0.923635
      97
      0.021759
    
    
      66
      0.70
      33
      0.947523
      0.923630
      101
      0.023893
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      46
      0.73
      6
      0.959922
      0.923078
      96
      0.036844
    
    
      6
      0.72
      31
      0.948381
      0.922992
      98
      0.025389
    
    
      65
      0.73
      18
      0.952082
      0.922988
      97
      0.029094
    
    
      45
      0.71
      1
      0.968336
      0.922968
      92
      0.045368
    
    
      30
      0.74
      29
      0.932928
      0.922929
      49
      0.009999
    
    
      25
      0.74
      19
      0.947738
      0.922893
      80
      0.024845
    
    
      10
      0.72
      33
      0.947813
      0.922869
      99
      0.024944
    
    
      41
      0.73
      15
      0.952293
      0.922809
      93
      0.029484
    
    
      60
      0.72
      9
      0.957347
      0.922775
      98
      0.034572
    
    
      29
      0.71
      36
      0.946716
      0.922715
      99
      0.024001
    
    
      55
      0.70
      0
      0.956594
      0.922713
      69
      0.033881
    
    
      38
      0.74
      20
      0.951745
      0.922702
      97
      0.029043
    
    
      53
      0.72
      32
      0.947660
      0.922628
      99
      0.025032
    
    
      58
      0.73
      8
      0.956824
      0.922622
      94
      0.034202
    
    
      3
      0.75
      38
      0.945914
      0.922615
      98
      0.023299
    
    
      63
      0.71
      38
      0.944283
      0.922557
      93
      0.021726
    
    
      62
      0.72
      29
      0.933078
      0.922531
      49
      0.010547
    
    
      37
      0.71
      16
      0.954216
      0.922526
      96
      0.031690
    
    
      5
      0.74
      18
      0.934604
      0.922522
      49
      0.012082
    
    
      57
      0.73
      14
      0.947438
      0.922505
      71
      0.024933
    
    
      1
      0.73
      14
      0.947438
      0.922505
      71
      0.024933
    
    
      52
      0.71
      12
      0.953922
      0.922501
      92
      0.031421
    
    
      40
      0.72
      17
      0.951551
      0.922240
      89
      0.029311
    
    
      79
      0.72
      2
      0.965250
      0.922161
      95
      0.043089
    
    
      24
      0.73
      2
      0.943175
      0.922096
      57
      0.021079
    
    
      42
      0.73
      11
      0.941744
      0.922063
      60
      0.019681
    
    
      69
      0.71
      4
      0.942256
      0.922049
      58
      0.020207
    
    
      16
      0.71
      4
      0.942256
      0.922049
      58
      0.020207
    
    
      77
      0.71
      13
      0.953533
      0.921879
      92
      0.031654
    
    
      28
      0.72
      13
      0.954321
      0.921653
      93
      0.032668
    
  

81 rows × 6 columns



In [78]:

    
df.head()









    Out[78]:






  
    
      
      subsample
      colsample_bytree
      gamma
      min_child_weight
      train_auc
      val_auc
      diffs
    
  
  
    
      0
      0.919647
      0.734544
      28
      2
      0.922576
      0.918840
      0.003736
    
    
      1
      0.860606
      0.728616
      7
      17
      0.947655
      0.921702
      0.025953
    
    
      2
      0.900673
      0.731379
      23
      34
      0.922294
      0.919387
      0.002907
    
    
      3
      0.860517
      0.704370
      1
      10
      0.947460
      0.920868
      0.026592
    
    
      4
      0.902383
      0.709298
      28
      34
      0.919536
      0.918013
      0.001523



In [79]:

    
df['subsample'] = df['subsample'].round(2)
df['colsample_bytree'] = df['colsample_bytree'].round(2)



In [80]:

    
def plot_scores_for_pars(par):
    f, ax = plt.subplots(1,3, figsize=(16,6), sharex=True)

    df.groupby(par)['val_auc'].mean().plot(ax=ax[0])
    df.groupby(par)['train_auc'].mean().plot(ax=ax[1])
    df.groupby(par)['diffs'].mean().plot(ax=ax[2])

    ax[0].set_ylabel('Test auc')
    ax[1].set_ylabel('Train auc')
    ax[2].set_ylabel('Difference')

    ax[0].set_xlabel(par)
    ax[1].set_xlabel(par)
    ax[2].set_xlabel(par)



In [81]:

    
plot_scores_for_pars('subsample')



In [82]:

    
plot_scores_for_pars('colsample_bytree')



In [85]:

    
plot_scores_for_pars('min_child_weight')



In [84]:

    
plot_scores_for_pars('gamma')



In [64]:

    
plot_scores_for_pars('gamma')



In [42]:

    
df.groupby('sub_r')['val_auc'].mean().plot()
df.groupby('sub_r')['train_auc'].mean().plot()









    Out[42]:





<matplotlib.axes._subplots.AxesSubplot at 0x11ef145d0>



In [44]:

    
df.groupby('colt_r')['val_auc'].mean().plot()
df.groupby('colt_r')['train_auc'].mean().plot()









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x11effd550>



In [45]:

    
df.groupby('coll_r')['val_auc'].mean().plot()
df.groupby('coll_r')['train_auc'].mean().plot()









    Out[45]:





<matplotlib.axes._subplots.AxesSubplot at 0x11f238d50>



In [24]:

    
df.plot('train_auc', 'val_auc',kind='scatter', ylim=[0.918, 0.922])









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x11db2c090>



In [29]:

    
df.plot('val_auc', 'diffs', kind='scatter', xlim=[0.918, 0.922])









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x11e3be6d0>



In [20]:

    
df.plot('gamma', 'diffs',kind='scatter')









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x11cdb0710>



In [44]:

    
df.plot.scatter('colsample_bytree', 'val_auc', by='max_depth')









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x139938190>



In [ ]:

	max_features	max_depth	train_auc	val_auc	threshold	mcc	diffs
7	153	21	0.766582	0.627767	0.01	0.462687	0.138815
3	102	15	0.712884	0.625604	0.01	0.462653	0.087280
1	156	17	0.728784	0.626322	0.01	0.461887	0.102463
6	95	7	0.641454	0.614734	0.01	0.448279	0.026720
5	83	5	0.629273	0.607834	0.01	0.434193	0.021439
2	157	2	0.618620	0.605658	0.01	0.430550	0.012962
0	109	2	0.617804	0.605295	0.01	0.429720	0.012509
4	60	2	0.616809	0.602388	0.01	0.423024	0.014421

	subsample	colsample_bytree	min_child_weight	train_auc	val_auc	best_iter	threshold	mcc	diffs
14	0.888770	0.757614	13	0.948389	0.923325	76	0.35	0.476790	0.025064
27	0.896235	0.702619	9	0.953168	0.922122	79	0.29	0.475241	0.031046
9	0.856181	0.788519	17	0.951301	0.922990	90	0.35	0.474586	0.028311
61	0.722819	0.801908	8	0.956034	0.923575	92	0.27	0.473831	0.032459
0	0.839294	0.745370	2	0.963283	0.922013	89	0.35	0.473750	0.041270
68	0.793881	0.720002	1	0.968214	0.922843	98	0.27	0.473465	0.045371
12	0.832266	0.730399	27	0.948297	0.924090	93	0.33	0.473453	0.024207
34	0.794364	0.899290	19	0.951723	0.924133	103	0.39	0.472870	0.027590
74	0.726263	0.772719	25	0.947585	0.925005	95	0.47	0.472759	0.022580
29	0.729298	0.797721	7	0.952093	0.923446	78	0.33	0.472499	0.028647
13	0.730643	0.742692	5	0.953240	0.921878	78	0.27	0.472447	0.031362
44	0.838611	0.871020	11	0.945875	0.923337	67	0.33	0.472267	0.022538
48	0.721228	0.713318	16	0.952191	0.923898	102	0.49	0.472134	0.028293
28	0.767491	0.711478	29	0.943764	0.923496	86	0.29	0.471925	0.020268
52	0.709163	0.736322	28	0.942105	0.923434	78	0.33	0.471905	0.018671
63	0.780771	0.845361	2	0.963321	0.922288	85	0.35	0.471752	0.041033
47	0.871268	0.890522	6	0.960705	0.923033	98	0.41	0.471436	0.037672
64	0.745138	0.830198	10	0.941657	0.921939	61	0.27	0.471366	0.019718
11	0.862281	0.739042	20	0.951935	0.923533	98	0.41	0.470664	0.028402
36	0.866948	0.822641	3	0.962460	0.923178	90	0.49	0.470628	0.039282
40	0.841258	0.811096	31	0.946516	0.923932	95	0.35	0.470541	0.022584
75	0.700611	0.792820	25	0.947977	0.923819	96	0.25	0.470476	0.024158
3	0.721034	0.741014	10	0.954357	0.923990	91	0.35	0.470253	0.030367
42	0.789075	0.796233	14	0.946784	0.921633	75	0.27	0.470022	0.025151
38	0.765356	0.861523	15	0.950956	0.923238	89	0.33	0.469946	0.027718
23	0.700015	0.808804	21	0.954367	0.923501	116	0.29	0.469904	0.030866
51	0.891505	0.749526	41	0.944882	0.923361	97	0.37	0.469642	0.021521
18	0.883477	0.755889	25	0.949422	0.923863	95	0.29	0.469456	0.025559
8	0.830031	0.727011	12	0.956502	0.923187	99	0.23	0.469421	0.033315
56	0.720414	0.877109	4	0.941983	0.921817	57	0.23	0.469417	0.020166
...	...	...	...	...	...	...	...	...	...
72	0.715002	0.776406	25	0.947588	0.925590	96	0.43	0.465791	0.021998
77	0.717439	0.760744	25	0.946491	0.923965	92	0.29	0.465534	0.022526
58	0.754234	0.803023	24	0.937602	0.923397	59	0.39	0.465481	0.014205
43	0.825580	0.850991	30	0.945602	0.923256	88	0.49	0.464824	0.022346
7	0.728326	0.764351	29	0.944889	0.924246	88	0.39	0.464788	0.020643
39	0.815170	0.808814	47	0.940374	0.923357	85	0.41	0.464724	0.017017
16	0.798192	0.886845	12	0.942308	0.922977	63	0.35	0.464354	0.019331
55	0.735418	0.766978	40	0.942307	0.924307	93	0.37	0.464354	0.018000
37	0.783444	0.839686	35	0.943493	0.923331	86	0.41	0.464224	0.020162
1	0.721213	0.898027	17	0.939529	0.923031	60	0.41	0.464053	0.016498
46	0.740479	0.789720	26	0.937137	0.922958	59	0.43	0.463952	0.014179
30	0.711568	0.843878	42	0.944003	0.925081	101	0.39	0.463900	0.018922
5	0.873327	0.869573	37	0.935023	0.922882	56	0.37	0.463842	0.012141
41	0.712886	0.831764	36	0.941796	0.924019	84	0.49	0.463653	0.017777
45	0.805731	0.773100	45	0.942412	0.923387	94	0.29	0.463626	0.019025
70	0.768679	0.733253	20	0.938015	0.923316	59	0.35	0.463619	0.014699
65	0.762682	0.750389	43	0.945967	0.924177	115	0.49	0.462542	0.021790
15	0.849868	0.756917	49	0.943393	0.924129	101	0.29	0.462517	0.019264
2	0.801345	0.739144	34	0.943840	0.923203	88	0.35	0.462433	0.020637
69	0.707292	0.862217	29	0.943418	0.923348	79	0.37	0.461647	0.020070
35	0.739287	0.877974	46	0.943048	0.923769	97	0.45	0.461318	0.019279
33	0.713263	0.860015	33	0.936903	0.923830	62	0.45	0.460675	0.013073
24	0.703677	0.842038	38	0.949978	0.923266	129	0.27	0.460240	0.026712
67	0.730173	0.729144	42	0.938717	0.922742	78	0.31	0.459604	0.015975
20	0.755408	0.780418	49	0.940010	0.922737	91	0.45	0.457516	0.017273
53	0.774294	0.816412	48	0.943967	0.923696	110	0.35	0.456941	0.020271
6	0.814106	0.824706	40	0.931967	0.922060	48	0.31	0.456523	0.009907
49	0.808068	0.851656	22	0.930251	0.922660	39	0.35	0.455818	0.007591
31	0.819847	0.852363	42	0.931549	0.922102	47	0.35	0.454549	0.009447
60	0.744576	0.783634	44	0.935360	0.923383	59	0.29	0.454160	0.011977

	colsample_bytree	min_child_weight	train_auc	val_auc	best_iter	diffs
18	0.73	49	0.943012	0.924580	92	0.018432
2	0.70	29	0.947958	0.924540	98	0.023418
61	0.72	49	0.942023	0.924430	93	0.017593
26	0.72	49	0.942023	0.924430	93	0.017593
47	0.73	47	0.943834	0.924389	101	0.019445
75	0.73	30	0.947750	0.924377	96	0.023373
14	0.73	30	0.947750	0.924377	96	0.023373
43	0.74	37	0.945657	0.924363	96	0.021294
73	0.70	49	0.943502	0.924312	98	0.019190
71	0.71	25	0.952072	0.924288	107	0.027784
80	0.71	34	0.946239	0.924287	97	0.021952
12	0.71	31	0.947255	0.924267	97	0.022988
54	0.71	5	0.961884	0.924240	98	0.037644
11	0.71	35	0.946406	0.924147	98	0.022259
72	0.72	35	0.947205	0.924069	97	0.023136
0	0.71	45	0.943809	0.924064	93	0.019745
68	0.71	49	0.942241	0.924047	94	0.018194
36	0.71	44	0.943859	0.924041	97	0.019818
76	0.73	21	0.950278	0.924026	93	0.026252
19	0.72	21	0.950868	0.923988	97	0.026880
13	0.75	31	0.946689	0.923966	94	0.022723
21	0.73	10	0.962495	0.923921	123	0.038574
48	0.72	48	0.942325	0.923880	92	0.018445
7	0.75	25	0.943072	0.923878	71	0.019194
51	0.72	3	0.963697	0.923843	96	0.039854
39	0.72	27	0.949131	0.923785	95	0.025346
34	0.73	26	0.948836	0.923749	97	0.025087
4	0.70	9	0.958787	0.923718	99	0.035069
64	0.74	41	0.945394	0.923635	97	0.021759
66	0.70	33	0.947523	0.923630	101	0.023893
...	...	...	...	...	...	...
46	0.73	6	0.959922	0.923078	96	0.036844
6	0.72	31	0.948381	0.922992	98	0.025389
65	0.73	18	0.952082	0.922988	97	0.029094
45	0.71	1	0.968336	0.922968	92	0.045368
30	0.74	29	0.932928	0.922929	49	0.009999
25	0.74	19	0.947738	0.922893	80	0.024845
10	0.72	33	0.947813	0.922869	99	0.024944
41	0.73	15	0.952293	0.922809	93	0.029484
60	0.72	9	0.957347	0.922775	98	0.034572
29	0.71	36	0.946716	0.922715	99	0.024001
55	0.70	0	0.956594	0.922713	69	0.033881
38	0.74	20	0.951745	0.922702	97	0.029043
53	0.72	32	0.947660	0.922628	99	0.025032
58	0.73	8	0.956824	0.922622	94	0.034202
3	0.75	38	0.945914	0.922615	98	0.023299
63	0.71	38	0.944283	0.922557	93	0.021726
62	0.72	29	0.933078	0.922531	49	0.010547
37	0.71	16	0.954216	0.922526	96	0.031690
5	0.74	18	0.934604	0.922522	49	0.012082
57	0.73	14	0.947438	0.922505	71	0.024933
1	0.73	14	0.947438	0.922505	71	0.024933
52	0.71	12	0.953922	0.922501	92	0.031421
40	0.72	17	0.951551	0.922240	89	0.029311
79	0.72	2	0.965250	0.922161	95	0.043089
24	0.73	2	0.943175	0.922096	57	0.021079
42	0.73	11	0.941744	0.922063	60	0.019681
69	0.71	4	0.942256	0.922049	58	0.020207
16	0.71	4	0.942256	0.922049	58	0.020207
77	0.71	13	0.953533	0.921879	92	0.031654
28	0.72	13	0.954321	0.921653	93	0.032668

	subsample	colsample_bytree	gamma	min_child_weight	train_auc	val_auc	diffs
0	0.919647	0.734544	28	2	0.922576	0.918840	0.003736
1	0.860606	0.728616	7	17	0.947655	0.921702	0.025953
2	0.900673	0.731379	23	34	0.922294	0.919387	0.002907
3	0.860517	0.704370	1	10	0.947460	0.920868	0.026592
4	0.902383	0.709298	28	34	0.919536	0.918013	0.001523