In [1]:
import feather
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Custom modules
import const
import func

Load data


In [2]:
y = func.read_last_column(os.path.join(const.BASE_PATH,const.TRAIN_FILES[0]+'.csv'))
print y.head(3)
y = y.Response.values


    Response
Id          
4          0
6          0
7          0

In [3]:
# Load columns name
num_cols = func.get_columns_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]))[:200]

In [4]:
# Load raw numeric features
X = feather.read_dataframe('./divers/train_top199.feather')

In [5]:
X.fillna(9999, inplace=True)

In [6]:
# Load some extra features
#X2 = pd.read_csv('feat_set_test.csv', index_col='ID')

In [7]:
#X = pd.concat([X, 
#               pd.get_dummies(X2.reset_index(drop=True)['ID_diff']),
#               pd.get_dummies(X2.reset_index(drop=True)['ID_diff_rev'], prefix='rev')], axis=1)

In [8]:
print('X_num_raw: {}'.format(X.shape))


X_num_raw: (1183747, 199)

In [9]:
cv = StratifiedKFold(y, 5, shuffle=True, random_state=123)

In [10]:
print const.CV
with open(const.CV, 'rb') as f:
    cv = pickle.load(f)
n_cv = len(cv)


/Users/joostbloom/Documents/kaggle/bosch/data/folds_V1.pkl

In [11]:
n_cv


Out[11]:
5

In [211]:
x_train = xgb.DMatrix(X, 
                      label=y)

Train simple model


In [15]:
from sklearn.ensemble import RandomForestClassifier

In [ ]:
def score_xgboost(params):
    
    global counter
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting XGBoost score with ({}):'.format(counter))
    print('\t {} samples'.format(x_train.num_row()))
    print('\t {} features'.format(x_train.num_col()))
    print('\t {} parameters'.format(params))
        
    
    (itrain, ival) = cv[3]
    
    x_tr = x_train.slice(itrain)
    x_va = x_train.slice(ival)
    
    watchlist = [ (x_tr, 'train'), (x_va, 'eval')]
    
    eval_result = {}
        
    bst = xgb.train(params, 
                    x_tr, 
                    num_boost_round=params['num_round'], 
                    evals=watchlist,
                    evals_result=eval_result,
                    early_stopping_rounds=params['early_stopping'],
                    verbose_eval=5)

    #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))
    
    train_score = eval_result['train']['auc'][bst.best_iteration]
    val_score = eval_result['eval']['auc'][bst.best_iteration]
    
    # pick the best threshold based on oof predictions
    preds_val = bst.predict(x_va, ntree_limit=bst.best_ntree_limit)
    thresholds = np.linspace(0.01, 0.99, 50)
    mcc = np.array([matthews_corrcoef(y[ival], preds_val>thr) for thr in thresholds])
    th_val = thresholds[mcc.argmax()]
    mcc_val = mcc.max()
    
    print train_score
    print val_score
    print th_val
    print mcc_val
    
    return {'loss': 1-val_score, 
            'status': STATUS_OK, 
            'train_score': train_score, 
            'best_iter': bst.best_iteration, 
            'mcc': mcc_val, 
            'threshold': th_val}

In [37]:
def score_rf(params):
    
    global counter
    t = time.time()
    
    #print ('Params testing %d: %s' % (counter, params))
    counter += 1
    
    print('Predicting RandomForest score with ({}):'.format(counter))
    print('\t {} samples'.format(X.shape[0]))
    print('\t {} features'.format(X.shape[1]))
    print('\t {} parameters'.format(params))
    
    n_est = params['n_estimators']
    max_feat = params['max_features']
    max_depth = params['max_depth']
    verb = params['verbose']
    rs = params['random_state']
        
    
    (itrain, ival) = cv[3]
    #itrain = itrain[:100000]
    
    bst = RandomForestClassifier(n_estimators=n_est, 
                                 verbose=verb,
                                random_state=rs,
                                max_features=max_feat,
                                max_depth=max_depth,
                                n_jobs=-1)
    print bst
    bst = bst.fit(X.iloc[itrain, :], y[itrain])
    
    train_score = roc_auc_score(y[itrain], bst.predict(X.iloc[itrain, :]))
    val_score = roc_auc_score(y[ival], bst.predict(X.iloc[ival, :]))
    
    # pick the best threshold based on oof predictions
    preds_val = bst.predict_proba(X.iloc[ival, :])[:,0]
    print preds_val
    thresholds = np.linspace(0.01, 0.99, 50)
    mcc = np.array([matthews_corrcoef(y[ival], preds_val>thr) for thr in thresholds])
    th_val = thresholds[mcc.argmax()]
    mcc_val = mcc.max()
    
    print train_score
    print val_score
    print th_val
    print mcc_val
    print time.time()-t
    
    return {'loss': 1-val_score, 
            'status': STATUS_OK, 
            'train_score': train_score, 
            'mcc': mcc_val, 
            'threshold': th_val,
            't': time.time()-t}

In [38]:
# Random Forest Params
params = {'n_estimators': 100}
params['random_state'] = 100
params['max_features'] = hp.choice('max_features', range(10, 199))
params['max_depth'] = hp.choice('max_depth', range(7,30))
params['verbose'] = 10
params['n_jobs'] = -1

In [39]:
# Hyperopt
trials = Trials()
counter = 0
best = fmin(score_rf, 
                    params, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)


Predicting RandomForest score with (1):
	 1183747 samples
	 199 features
	 {'n_estimators': 100, 'n_jobs': -1, 'verbose': 10, 'max_features': 119, 'random_state': 100, 'max_depth': 9} parameters
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features=119, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=100, verbose=10,
            warm_start=False)
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.5min
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.7min
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  6.2min
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  7.4min
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 11.1min
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.6min
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 17.3min
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 19.8min
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 24.7min
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 28.4min
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 30.9min finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished
[ 0.99844015  0.99684843  0.99929201 ...,  0.05667837  0.94516149
  0.78148215]
0.617804450538
0.605295060103
0.01
-0.0894200485013
1864.41564393
Predicting RandomForest score with (2):
	 1183747 samples
	 199 features
	 {'n_estimators': 100, 'n_jobs': -1, 'verbose': 10, 'max_features': 166, 'random_state': 100, 'max_depth': 24} parameters
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=24, max_features=166, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=100, verbose=10,
            warm_start=False)
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-39-3a45ec492ba1> in <module>()
      6                     algo=tpe.suggest,
      7                     max_evals=200,
----> 8                     trials=trials)

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rseed)
    332 
    333     rval = FMinIter(algo, domain, trials, max_evals=max_evals)
--> 334     rval.exhaust()
    335     return trials.argmin
    336 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in exhaust(self)
    292     def exhaust(self):
    293         n_done = len(self.trials)
--> 294         self.run(self.max_evals - n_done, block_until_done=self.async)
    295         self.trials.refresh()
    296         return self

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    266             else:
    267                 # -- loop over trials and do the jobs directly
--> 268                 self.serial_evaluate()
    269 
    270             if stopped:

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
    185                 ctrl = base.Ctrl(self.trials, current_trial=trial)
    186                 try:
--> 187                     result = self.domain.evaluate(spec, ctrl)
    188                 except Exception, e:
    189                     logger.info('job exception: %s' % str(e))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in evaluate(self, config, ctrl, attach_attachments)
    112             pyll_rval = pyll.rec_eval(self.expr, memo=memo,
    113                     print_node_on_error=self.rec_eval_print_node_on_error)
--> 114             rval = self.fn(pyll_rval)
    115 
    116         if isinstance(rval, (float, int, np.number)):

<ipython-input-37-45a349fd2368> in score_rf(params)
     29                                 n_jobs=-1)
     30     print bst
---> 31     bst = bst.fit(X.iloc[itrain, :], y[itrain])
     32 
     33     train_score = roc_auc_score(y[itrain], bst.predict(X.iloc[itrain, :]))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
    288                     t, self, X, y, sample_weight, i, len(trees),
    289                     verbose=self.verbose, class_weight=self.class_weight)
--> 290                 for i, t in enumerate(trees))
    291 
    292             # Collect newly grown trees

/Users/joostbloom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time

/Users/joostbloom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in retrieve(self)
    725                 job = self._jobs.pop(0)
    726             try:
--> 727                 self._output.extend(job.get())
    728             except tuple(self.exceptions) as exception:
    729                 # Stop dispatching any new job in the async callback thread

/Users/joostbloom/anaconda/lib/python2.7/multiprocessing/pool.pyc in get(self, timeout)
    559 
    560     def get(self, timeout=None):
--> 561         self.wait(timeout)
    562         if not self._ready:
    563             raise TimeoutError

/Users/joostbloom/anaconda/lib/python2.7/multiprocessing/pool.pyc in wait(self, timeout)
    554         try:
    555             if not self._ready:
--> 556                 self._cond.wait(timeout)
    557         finally:
    558             self._cond.release()

/Users/joostbloom/anaconda/lib/python2.7/threading.pyc in wait(self, timeout)
    338         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    339             if timeout is None:
--> 340                 waiter.acquire()
    341                 if __debug__:
    342                     self._note("%s.wait(): got it", self)

KeyboardInterrupt: 

In [213]:
# XGBoost Params
params = {'max_depth': 7, 'eta':0.1, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 1
params['eval_metric'] = 'auc'
params['subsample'] = hp.uniform('subsample', 0.7, 0.9) #,0.86
params['colsample_bytree']= hp.uniform('colsample_bytree', 0.7, 0.9) #0.92
params['min_child_weight'] = hp.choice('min_child_weight', range(50))
params['booster'] = "gbtree"
params['seed'] = 1712
params['num_round'] = 200
params['early_stopping'] = 30

In [214]:
# Hyperopt
trials = Trials()
counter = 0
best = fmin(score_xgboost, 
                    params, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)


Predicting XGBoost score with (1):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.7453702907128406, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 2, 'subsample': 0.8392938371195724, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.867207	eval-auc:0.873051
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.898239	eval-auc:0.900304
[10]	train-auc:0.905708	eval-auc:0.909996
[15]	train-auc:0.910038	eval-auc:0.912381
[20]	train-auc:0.916738	eval-auc:0.915035
[25]	train-auc:0.919084	eval-auc:0.915347
[30]	train-auc:0.92289	eval-auc:0.917575
[35]	train-auc:0.924875	eval-auc:0.917585
[40]	train-auc:0.930593	eval-auc:0.918583
[45]	train-auc:0.933461	eval-auc:0.919082
[50]	train-auc:0.937429	eval-auc:0.919758
[55]	train-auc:0.941318	eval-auc:0.918685
[60]	train-auc:0.946065	eval-auc:0.919393
[65]	train-auc:0.9496	eval-auc:0.920822
[70]	train-auc:0.953826	eval-auc:0.920526
[75]	train-auc:0.957756	eval-auc:0.921945
[80]	train-auc:0.960299	eval-auc:0.922854
[85]	train-auc:0.962387	eval-auc:0.92339
[90]	train-auc:0.965019	eval-auc:0.923267
[95]	train-auc:0.966425	eval-auc:0.922979
[100]	train-auc:0.967711	eval-auc:0.922816
[105]	train-auc:0.969176	eval-auc:0.923066
[110]	train-auc:0.970298	eval-auc:0.923205
Stopping. Best iteration:
[82]	train-auc:0.960947	eval-auc:0.923542

0.960947
0.923542
0.43
0.467485658843
Predicting XGBoost score with (2):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.8980266639177703, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 17, 'subsample': 0.7212129811909465, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.864343	eval-auc:0.869565
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.889238	eval-auc:0.889885
[10]	train-auc:0.892675	eval-auc:0.893925
[15]	train-auc:0.911287	eval-auc:0.91405
[20]	train-auc:0.915807	eval-auc:0.915146
[25]	train-auc:0.919535	eval-auc:0.915715
[30]	train-auc:0.923535	eval-auc:0.919075
[35]	train-auc:0.928089	eval-auc:0.919679
[40]	train-auc:0.931808	eval-auc:0.920687
[45]	train-auc:0.933352	eval-auc:0.920361
[50]	train-auc:0.935335	eval-auc:0.919503
[55]	train-auc:0.937538	eval-auc:0.920457
[60]	train-auc:0.939877	eval-auc:0.921004
[65]	train-auc:0.941501	eval-auc:0.922674
[70]	train-auc:0.94331	eval-auc:0.923136
[75]	train-auc:0.944683	eval-auc:0.922491
[80]	train-auc:0.947022	eval-auc:0.922225
[85]	train-auc:0.948472	eval-auc:0.92271
[90]	train-auc:0.949981	eval-auc:0.922523
[95]	train-auc:0.951713	eval-auc:0.921723
Stopping. Best iteration:
[68]	train-auc:0.94286	eval-auc:0.923329

0.94286
0.923329
0.35
0.462257549847
Predicting XGBoost score with (3):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.739144317250818, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 34, 'subsample': 0.801345202299914, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.858663	eval-auc:0.863485
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.891567	eval-auc:0.893424
[10]	train-auc:0.90638	eval-auc:0.910631
[15]	train-auc:0.909735	eval-auc:0.912267
[20]	train-auc:0.916526	eval-auc:0.917744
[25]	train-auc:0.917784	eval-auc:0.91753
[30]	train-auc:0.921465	eval-auc:0.919432
[35]	train-auc:0.923342	eval-auc:0.919033
[40]	train-auc:0.928687	eval-auc:0.919672
[45]	train-auc:0.930837	eval-auc:0.919901
[50]	train-auc:0.933015	eval-auc:0.921099
[55]	train-auc:0.934305	eval-auc:0.921451
[60]	train-auc:0.936361	eval-auc:0.921169
[65]	train-auc:0.938001	eval-auc:0.921948
[70]	train-auc:0.939736	eval-auc:0.922118
[75]	train-auc:0.941142	eval-auc:0.921947
[80]	train-auc:0.942057	eval-auc:0.922473
[85]	train-auc:0.943369	eval-auc:0.922307
[90]	train-auc:0.944584	eval-auc:0.92295
[95]	train-auc:0.945345	eval-auc:0.922934
[100]	train-auc:0.946451	eval-auc:0.922676
[105]	train-auc:0.947304	eval-auc:0.922459
[110]	train-auc:0.948216	eval-auc:0.922615
[115]	train-auc:0.948894	eval-auc:0.922701
[120]	train-auc:0.950044	eval-auc:0.922428
Stopping. Best iteration:
[94]	train-auc:0.945158	eval-auc:0.923018

0.945158
0.923018
0.25
0.468943862623
Predicting XGBoost score with (4):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.7410143734327053, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 10, 'subsample': 0.721034425443535, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.861042	eval-auc:0.866097
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.874991	eval-auc:0.88022
[10]	train-auc:0.902645	eval-auc:0.907294
[15]	train-auc:0.91004	eval-auc:0.912556
[20]	train-auc:0.917408	eval-auc:0.915894
[25]	train-auc:0.918707	eval-auc:0.915874
[30]	train-auc:0.922567	eval-auc:0.917761
[35]	train-auc:0.924465	eval-auc:0.917982
[40]	train-auc:0.929008	eval-auc:0.920142
[45]	train-auc:0.931855	eval-auc:0.919786
[50]	train-auc:0.935537	eval-auc:0.920682
[55]	train-auc:0.938028	eval-auc:0.920825
[60]	train-auc:0.94116	eval-auc:0.921698
[65]	train-auc:0.943445	eval-auc:0.922397
[70]	train-auc:0.945271	eval-auc:0.92203
[75]	train-auc:0.94793	eval-auc:0.922402
[80]	train-auc:0.949659	eval-auc:0.922743
[85]	train-auc:0.95143	eval-auc:0.923158
[90]	train-auc:0.95282	eval-auc:0.922941
[95]	train-auc:0.954532	eval-auc:0.922561
[100]	train-auc:0.955805	eval-auc:0.922697
[105]	train-auc:0.956831	eval-auc:0.922748
[110]	train-auc:0.958109	eval-auc:0.922656
[115]	train-auc:0.959173	eval-auc:0.922332
Stopping. Best iteration:
[85]	train-auc:0.95143	eval-auc:0.923158

0.95143
0.923158
0.45
0.466437997444
Predicting XGBoost score with (5):
	 1183747 samples
	 205 features
	 {'num_round': 200, 'colsample_bytree': 0.8189064535289485, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 34, 'subsample': 0.8047666639909352, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': 1712, 'max_depth': 7, 'early_stopping': 30, 'booster': 'gbtree'} parameters
[0]	train-auc:0.859828	eval-auc:0.864589
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[5]	train-auc:0.887744	eval-auc:0.890307
[10]	train-auc:0.896932	eval-auc:0.898908
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-214-7bd170faee54> in <module>()
      6                     algo=tpe.suggest,
      7                     max_evals=200,
----> 8                     trials=trials)

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rseed)
    332 
    333     rval = FMinIter(algo, domain, trials, max_evals=max_evals)
--> 334     rval.exhaust()
    335     return trials.argmin
    336 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in exhaust(self)
    292     def exhaust(self):
    293         n_done = len(self.trials)
--> 294         self.run(self.max_evals - n_done, block_until_done=self.async)
    295         self.trials.refresh()
    296         return self

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    266             else:
    267                 # -- loop over trials and do the jobs directly
--> 268                 self.serial_evaluate()
    269 
    270             if stopped:

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
    185                 ctrl = base.Ctrl(self.trials, current_trial=trial)
    186                 try:
--> 187                     result = self.domain.evaluate(spec, ctrl)
    188                 except Exception, e:
    189                     logger.info('job exception: %s' % str(e))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/hyperopt/fmin.pyc in evaluate(self, config, ctrl, attach_attachments)
    112             pyll_rval = pyll.rec_eval(self.expr, memo=memo,
    113                     print_node_on_error=self.rec_eval_print_node_on_error)
--> 114             rval = self.fn(pyll_rval)
    115 
    116         if isinstance(rval, (float, int, np.number)):

<ipython-input-212-540c28882d45> in score_xgboost(params)
     27                     evals_result=eval_result,
     28                     early_stopping_rounds=params['early_stopping'],
---> 29                     verbose_eval=5)
     30 
     31     #print('\t score: {}'.format(roc_auc_score(y_val, y_pred_val)))

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    201                            evals=evals,
    202                            obj=obj, feval=feval,
--> 203                            xgb_model=xgb_model, callbacks=callbacks)
    204 
    205 

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/training.pyc in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

/Users/joostbloom/anaconda/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

KeyboardInterrupt: 

In [64]:
par_values = {'max_depth': range(8,21)}
parameters = trials.trials[0]['misc']['vals'].keys()
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(16,16))
cmap = plt.cm.Dark2
par_best_score = {}
df = pd.DataFrame(columns=parameters + ['train_auc','val_auc'])
for i, val in enumerate(parameters):
    xs = np.array([t['misc']['vals'][val] for t in trials.trials if 'loss' in t['result']]).ravel()
    val_auc = [1-t['result']['loss'] for t in trials.trials if 'loss' in t['result']]
    train_auc = [t['result']['train_score'] for t in trials.trials if 'train_score' in t['result']]
    best_iter = [t['result']['best_iter'] for t in trials.trials if 'best_iter' in t['result']]
    mcc = [t['result']['mcc'] for t in trials.trials if 'mcc' in t['result']]
    tr = [t['result']['threshold'] for t in trials.trials if 'threshold' in t['result']]
    
    df[val] = xs
    df['val_auc'] = val_auc
    df['train_auc'] = train_auc
    #df['best_iter'] = best_iter
    df['threshold'] = tr
    df['mcc'] = mcc
    
    
    par_best_score[val] = xs[val_auc.index(min(val_auc))]
    #print trials.trials[ys.index(max(ys))]
    #print i, val, max(ys)
    #xs, ys = zip(sorted(xs), sorted(ys))
    #ys = np.array(ys)
    axes[i/2,i%2].scatter(xs, mcc, s=20, linewidth=0.01, alpha=0.5, c=cmap(float(i)/len(parameters)))
    axes[i/2,i%2].set_title(val)
print par_best_score
df['diffs'] = df['train_auc'] - df['val_auc']


{'max_features': 60, 'max_depth': 2}

In [67]:
df.sort_values('mcc', ascending=False)


Out[67]:
max_features max_depth train_auc val_auc threshold mcc diffs
7 153 21 0.766582 0.627767 0.01 0.462687 0.138815
3 102 15 0.712884 0.625604 0.01 0.462653 0.087280
1 156 17 0.728784 0.626322 0.01 0.461887 0.102463
6 95 7 0.641454 0.614734 0.01 0.448279 0.026720
5 83 5 0.629273 0.607834 0.01 0.434193 0.021439
2 157 2 0.618620 0.605658 0.01 0.430550 0.012962
0 109 2 0.617804 0.605295 0.01 0.429720 0.012509
4 60 2 0.616809 0.602388 0.01 0.423024 0.014421

In [158]:
ax = df.plot.scatter('threshold','mcc')
#ax.set_xlim([0.921, 0.926])



In [153]:
ax = df.plot.scatter('val_auc','mcc')
ax.set_xlim([0.921, 0.926])


Out[153]:
(0.921, 0.926)

In [154]:
df.sort_values('mcc', ascending=False)


Out[154]:
subsample colsample_bytree min_child_weight train_auc val_auc best_iter threshold mcc diffs
14 0.888770 0.757614 13 0.948389 0.923325 76 0.35 0.476790 0.025064
27 0.896235 0.702619 9 0.953168 0.922122 79 0.29 0.475241 0.031046
9 0.856181 0.788519 17 0.951301 0.922990 90 0.35 0.474586 0.028311
61 0.722819 0.801908 8 0.956034 0.923575 92 0.27 0.473831 0.032459
0 0.839294 0.745370 2 0.963283 0.922013 89 0.35 0.473750 0.041270
68 0.793881 0.720002 1 0.968214 0.922843 98 0.27 0.473465 0.045371
12 0.832266 0.730399 27 0.948297 0.924090 93 0.33 0.473453 0.024207
34 0.794364 0.899290 19 0.951723 0.924133 103 0.39 0.472870 0.027590
74 0.726263 0.772719 25 0.947585 0.925005 95 0.47 0.472759 0.022580
29 0.729298 0.797721 7 0.952093 0.923446 78 0.33 0.472499 0.028647
13 0.730643 0.742692 5 0.953240 0.921878 78 0.27 0.472447 0.031362
44 0.838611 0.871020 11 0.945875 0.923337 67 0.33 0.472267 0.022538
48 0.721228 0.713318 16 0.952191 0.923898 102 0.49 0.472134 0.028293
28 0.767491 0.711478 29 0.943764 0.923496 86 0.29 0.471925 0.020268
52 0.709163 0.736322 28 0.942105 0.923434 78 0.33 0.471905 0.018671
63 0.780771 0.845361 2 0.963321 0.922288 85 0.35 0.471752 0.041033
47 0.871268 0.890522 6 0.960705 0.923033 98 0.41 0.471436 0.037672
64 0.745138 0.830198 10 0.941657 0.921939 61 0.27 0.471366 0.019718
11 0.862281 0.739042 20 0.951935 0.923533 98 0.41 0.470664 0.028402
36 0.866948 0.822641 3 0.962460 0.923178 90 0.49 0.470628 0.039282
40 0.841258 0.811096 31 0.946516 0.923932 95 0.35 0.470541 0.022584
75 0.700611 0.792820 25 0.947977 0.923819 96 0.25 0.470476 0.024158
3 0.721034 0.741014 10 0.954357 0.923990 91 0.35 0.470253 0.030367
42 0.789075 0.796233 14 0.946784 0.921633 75 0.27 0.470022 0.025151
38 0.765356 0.861523 15 0.950956 0.923238 89 0.33 0.469946 0.027718
23 0.700015 0.808804 21 0.954367 0.923501 116 0.29 0.469904 0.030866
51 0.891505 0.749526 41 0.944882 0.923361 97 0.37 0.469642 0.021521
18 0.883477 0.755889 25 0.949422 0.923863 95 0.29 0.469456 0.025559
8 0.830031 0.727011 12 0.956502 0.923187 99 0.23 0.469421 0.033315
56 0.720414 0.877109 4 0.941983 0.921817 57 0.23 0.469417 0.020166
... ... ... ... ... ... ... ... ... ...
72 0.715002 0.776406 25 0.947588 0.925590 96 0.43 0.465791 0.021998
77 0.717439 0.760744 25 0.946491 0.923965 92 0.29 0.465534 0.022526
58 0.754234 0.803023 24 0.937602 0.923397 59 0.39 0.465481 0.014205
43 0.825580 0.850991 30 0.945602 0.923256 88 0.49 0.464824 0.022346
7 0.728326 0.764351 29 0.944889 0.924246 88 0.39 0.464788 0.020643
39 0.815170 0.808814 47 0.940374 0.923357 85 0.41 0.464724 0.017017
16 0.798192 0.886845 12 0.942308 0.922977 63 0.35 0.464354 0.019331
55 0.735418 0.766978 40 0.942307 0.924307 93 0.37 0.464354 0.018000
37 0.783444 0.839686 35 0.943493 0.923331 86 0.41 0.464224 0.020162
1 0.721213 0.898027 17 0.939529 0.923031 60 0.41 0.464053 0.016498
46 0.740479 0.789720 26 0.937137 0.922958 59 0.43 0.463952 0.014179
30 0.711568 0.843878 42 0.944003 0.925081 101 0.39 0.463900 0.018922
5 0.873327 0.869573 37 0.935023 0.922882 56 0.37 0.463842 0.012141
41 0.712886 0.831764 36 0.941796 0.924019 84 0.49 0.463653 0.017777
45 0.805731 0.773100 45 0.942412 0.923387 94 0.29 0.463626 0.019025
70 0.768679 0.733253 20 0.938015 0.923316 59 0.35 0.463619 0.014699
65 0.762682 0.750389 43 0.945967 0.924177 115 0.49 0.462542 0.021790
15 0.849868 0.756917 49 0.943393 0.924129 101 0.29 0.462517 0.019264
2 0.801345 0.739144 34 0.943840 0.923203 88 0.35 0.462433 0.020637
69 0.707292 0.862217 29 0.943418 0.923348 79 0.37 0.461647 0.020070
35 0.739287 0.877974 46 0.943048 0.923769 97 0.45 0.461318 0.019279
33 0.713263 0.860015 33 0.936903 0.923830 62 0.45 0.460675 0.013073
24 0.703677 0.842038 38 0.949978 0.923266 129 0.27 0.460240 0.026712
67 0.730173 0.729144 42 0.938717 0.922742 78 0.31 0.459604 0.015975
20 0.755408 0.780418 49 0.940010 0.922737 91 0.45 0.457516 0.017273
53 0.774294 0.816412 48 0.943967 0.923696 110 0.35 0.456941 0.020271
6 0.814106 0.824706 40 0.931967 0.922060 48 0.31 0.456523 0.009907
49 0.808068 0.851656 22 0.930251 0.922660 39 0.35 0.455818 0.007591
31 0.819847 0.852363 42 0.931549 0.922102 47 0.35 0.454549 0.009447
60 0.744576 0.783634 44 0.935360 0.923383 59 0.29 0.454160 0.011977

78 rows × 9 columns


In [66]:
#df.drop(['gamma'], axis=1, inplace=True)
#df.to_csv('./data/xgboost_hyperopt_1fold_100iter.csv', index=False)

In [122]:
df['colsample_bytree'] = df['colsample_bytree'].round(2)

In [127]:
df.sort_values('val_auc', ascending=False)


Out[127]:
colsample_bytree min_child_weight train_auc val_auc best_iter diffs
18 0.73 49 0.943012 0.924580 92 0.018432
2 0.70 29 0.947958 0.924540 98 0.023418
61 0.72 49 0.942023 0.924430 93 0.017593
26 0.72 49 0.942023 0.924430 93 0.017593
47 0.73 47 0.943834 0.924389 101 0.019445
75 0.73 30 0.947750 0.924377 96 0.023373
14 0.73 30 0.947750 0.924377 96 0.023373
43 0.74 37 0.945657 0.924363 96 0.021294
73 0.70 49 0.943502 0.924312 98 0.019190
71 0.71 25 0.952072 0.924288 107 0.027784
80 0.71 34 0.946239 0.924287 97 0.021952
12 0.71 31 0.947255 0.924267 97 0.022988
54 0.71 5 0.961884 0.924240 98 0.037644
11 0.71 35 0.946406 0.924147 98 0.022259
72 0.72 35 0.947205 0.924069 97 0.023136
0 0.71 45 0.943809 0.924064 93 0.019745
68 0.71 49 0.942241 0.924047 94 0.018194
36 0.71 44 0.943859 0.924041 97 0.019818
76 0.73 21 0.950278 0.924026 93 0.026252
19 0.72 21 0.950868 0.923988 97 0.026880
13 0.75 31 0.946689 0.923966 94 0.022723
21 0.73 10 0.962495 0.923921 123 0.038574
48 0.72 48 0.942325 0.923880 92 0.018445
7 0.75 25 0.943072 0.923878 71 0.019194
51 0.72 3 0.963697 0.923843 96 0.039854
39 0.72 27 0.949131 0.923785 95 0.025346
34 0.73 26 0.948836 0.923749 97 0.025087
4 0.70 9 0.958787 0.923718 99 0.035069
64 0.74 41 0.945394 0.923635 97 0.021759
66 0.70 33 0.947523 0.923630 101 0.023893
... ... ... ... ... ... ...
46 0.73 6 0.959922 0.923078 96 0.036844
6 0.72 31 0.948381 0.922992 98 0.025389
65 0.73 18 0.952082 0.922988 97 0.029094
45 0.71 1 0.968336 0.922968 92 0.045368
30 0.74 29 0.932928 0.922929 49 0.009999
25 0.74 19 0.947738 0.922893 80 0.024845
10 0.72 33 0.947813 0.922869 99 0.024944
41 0.73 15 0.952293 0.922809 93 0.029484
60 0.72 9 0.957347 0.922775 98 0.034572
29 0.71 36 0.946716 0.922715 99 0.024001
55 0.70 0 0.956594 0.922713 69 0.033881
38 0.74 20 0.951745 0.922702 97 0.029043
53 0.72 32 0.947660 0.922628 99 0.025032
58 0.73 8 0.956824 0.922622 94 0.034202
3 0.75 38 0.945914 0.922615 98 0.023299
63 0.71 38 0.944283 0.922557 93 0.021726
62 0.72 29 0.933078 0.922531 49 0.010547
37 0.71 16 0.954216 0.922526 96 0.031690
5 0.74 18 0.934604 0.922522 49 0.012082
57 0.73 14 0.947438 0.922505 71 0.024933
1 0.73 14 0.947438 0.922505 71 0.024933
52 0.71 12 0.953922 0.922501 92 0.031421
40 0.72 17 0.951551 0.922240 89 0.029311
79 0.72 2 0.965250 0.922161 95 0.043089
24 0.73 2 0.943175 0.922096 57 0.021079
42 0.73 11 0.941744 0.922063 60 0.019681
69 0.71 4 0.942256 0.922049 58 0.020207
16 0.71 4 0.942256 0.922049 58 0.020207
77 0.71 13 0.953533 0.921879 92 0.031654
28 0.72 13 0.954321 0.921653 93 0.032668

81 rows × 6 columns


In [78]:
df.head()


Out[78]:
subsample colsample_bytree gamma min_child_weight train_auc val_auc diffs
0 0.919647 0.734544 28 2 0.922576 0.918840 0.003736
1 0.860606 0.728616 7 17 0.947655 0.921702 0.025953
2 0.900673 0.731379 23 34 0.922294 0.919387 0.002907
3 0.860517 0.704370 1 10 0.947460 0.920868 0.026592
4 0.902383 0.709298 28 34 0.919536 0.918013 0.001523

In [79]:
df['subsample'] = df['subsample'].round(2)
df['colsample_bytree'] = df['colsample_bytree'].round(2)

In [80]:
def plot_scores_for_pars(par):
    f, ax = plt.subplots(1,3, figsize=(16,6), sharex=True)

    df.groupby(par)['val_auc'].mean().plot(ax=ax[0])
    df.groupby(par)['train_auc'].mean().plot(ax=ax[1])
    df.groupby(par)['diffs'].mean().plot(ax=ax[2])

    ax[0].set_ylabel('Test auc')
    ax[1].set_ylabel('Train auc')
    ax[2].set_ylabel('Difference')

    ax[0].set_xlabel(par)
    ax[1].set_xlabel(par)
    ax[2].set_xlabel(par)

In [81]:
plot_scores_for_pars('subsample')



In [82]:
plot_scores_for_pars('colsample_bytree')



In [85]:
plot_scores_for_pars('min_child_weight')



In [84]:
plot_scores_for_pars('gamma')



In [64]:
plot_scores_for_pars('gamma')



In [42]:
df.groupby('sub_r')['val_auc'].mean().plot()
df.groupby('sub_r')['train_auc'].mean().plot()


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ef145d0>

In [44]:
df.groupby('colt_r')['val_auc'].mean().plot()
df.groupby('colt_r')['train_auc'].mean().plot()


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x11effd550>

In [45]:
df.groupby('coll_r')['val_auc'].mean().plot()
df.groupby('coll_r')['train_auc'].mean().plot()


Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f238d50>

In [24]:
df.plot('train_auc', 'val_auc',kind='scatter', ylim=[0.918, 0.922])


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x11db2c090>

In [29]:
df.plot('val_auc', 'diffs', kind='scatter', xlim=[0.918, 0.922])


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e3be6d0>

In [20]:
df.plot('gamma', 'diffs',kind='scatter')


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x11cdb0710>

In [44]:
df.plot.scatter('colsample_bytree', 'val_auc', by='max_depth')


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x139938190>

In [ ]: