In [10]:
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import log_loss
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error
from sklearn import preprocessing
from random import randint

import numpy as np
import pandas as pd

# Consolidated: `hp` was previously imported twice on consecutive lines.
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import sys
import xgboost as xgb
import os
from utils import get_allstate_train_valid_test_testids

# Shift used in the log(loss + shift) target transform done inside the loader;
# `score` undoes it (np.exp(...) - shift) before computing MAE.
shift = 200
train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)

# Split features from the target for the train and validation sets.
y_train = train["loss"]
x_train = train.drop("loss", axis=1)

y_valid = valid["loss"]
x_valid = valid.drop("loss", axis=1)

#xgtrain = xgb.DMatrix(x_train, label=y_train)
#xgvalid = xgb.DMatrix(x_valid, label=y_valid)


Train shape is: (188318, 132)
Test shape is: (125546, 131)
/home/arvc/t81_558_deep_learning/utils.py:139: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train.drop("type", axis=1, inplace=True)
/home/arvc/t81_558_deep_learning/utils.py:140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test.drop("type", axis=1, inplace=True)
Final Train shape is: (160070, 131)
Final Valid shape is: (28248, 131)
Final Test shape is: (125546, 131)

In [ ]:
def score(params):
    """Hyperopt objective: train an XGBRegressor with `params` and return
    validation MAE in hyperopt's result-dict format.

    Relies on module-level globals: x_train, y_train, x_valid, y_valid, shift.
    Targets appear to be log(loss + shift)-transformed upstream (the inverse
    np.exp(...) - shift is applied here before scoring) — confirm in utils.py.
    """
    print("Training with params : ")
    print(params)

    regressor = XGBRegressor(**params)
    regressor.fit(x_train, y_train)
    predictions = regressor.predict(x_valid)

    # Undo the log-shift transform so the MAE is on the original loss scale.
    mae = mean_absolute_error(np.exp(y_valid) - shift,
                              np.exp(predictions) - shift)
    print("\tMAE {0}\n\n".format(mae))
    return {'loss': mae, 'status': STATUS_OK}

def custom_evalfn(preds, dtrain):
    """Custom XGBoost eval metric: MAE after undoing the log(y + shift) transform.

    Returns the ('error', value) pair the xgb.train feval protocol expects.
    Uses the module-level `shift` constant. (Currently referenced only from
    commented-out code in `score`.)
    """
    labels = dtrain.get_label()
    restored_preds = np.exp(preds) - shift
    restored_labels = np.exp(labels) - shift
    return 'error', mean_absolute_error(restored_preds, restored_labels)


def optimize(trials):
    """Run a 250-evaluation hyperopt TPE search over XGBRegressor hyperparameters.

    Parameters
    ----------
    trials : hyperopt.Trials
        Collector for the search history; inspectable after the run.

    Notes
    -----
    The module-level `score` function is the minimized objective. For
    `hp.choice` dimensions (max_depth, n_estimators), `fmin` returns the
    *index* into the options list, not the value itself — use
    hyperopt.space_eval(space, best) to recover the actual values.
    """
    space = {
        # FIX: label was 'eta', mismatching the dict key, so the value
        # appeared under the wrong name in fmin's returned `best` dict.
        'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'max_depth': hp.choice('max_depth', [9, 10, 11, 12, 13, 14, 15]),
        'n_estimators': hp.choice('n_estimators', np.arange(85, 3000, dtype=int)),
        'gamma': hp.quniform('gamma', 0.0, 3, 0.2),
        'min_child_weight': hp.quniform('min_child_weight', 1, 3, 1),
        'subsample': hp.quniform('subsample', 0.6, 1.0, 0.1),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 0.8, 0.1),
        'reg_lambda': hp.quniform('reg_lambda', 0.0, 3, 0.2),
        'reg_alpha': 1,
        'base_score': hp.quniform('base_score', 0.0, 3, 0.2),
        'objective': 'reg:linear',
        'nthread': -1,
        'silent': True,
        # Fresh seed per call so repeated searches are not identical.
        'seed': randint(1, 429496),
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print("Best params are:")
    print(best)




#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)


Training with params : 
{'learning_rate': 0.35000000000000003, 'max_depth': 9, 'reg_alpha': 1, 'min_child_weight': 2.0, 'silent': True, 'colsample_bytree': 0.7000000000000001, 'nthread': -1, 'reg_lambda': 2.2, 'base_score': 1.2000000000000002, 'subsample': 0.9, 'objective': 'reg:linear', 'n_estimators': 2971, 'gamma': 1.2000000000000002, 'seed': 145932}

In [ ]: