In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [2]:
train = pd.read_csv("data/train_without_noise.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", index_col="id", parse_dates=["timestamp"])
macro = pd.read_csv("data/macro.csv", parse_dates=["timestamp"])

In [3]:
y_train = train["price_doc"]
x_train = train.drop(["timestamp", "price_doc"], axis=1)

In [4]:
# transform non-numerical variables
for c in x_train.columns:
    if x_train[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values)) 
        x_train[c] = lbl.transform(list(x_train[c].values))

In [5]:
x_test = test.drop(["timestamp"], axis=1)

In [6]:
# transform non-numerical variables
for c in x_test.columns:
    if x_test[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values)) 
        x_test[c] = lbl.transform(list(x_test[c].values))

In [7]:
# base values
xgb_params = {
    "eta": 0.05,
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "objective": "reg:linear",
    "eval_metric": "rmse",
    "silent": 1,
    "seed":42
}

In [7]:
xgb1 = XGBRegressor(learning_rate=0.05,
                     max_depth=5,
                     subsample=0.7,
                     colsample_bylevel=0.7,
                     objective="reg:linear",
                     seed=42)

In [8]:
xgb1.get_xgb_params()


Out[8]:
{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 0.7,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'objective': 'reg:linear',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 42,
 'silent': 1,
 'subsample': 0.7}

In [10]:
# search for optimal max_depth 3-8 is usually good size
param_test1 = {
    "max_depth": range(3,10,2),
    "min_child_weight": range(1, 6, 2)
}
gsearch1 = GridSearchCV(estimator=xgb1, param_grid=param_test1, scoring="r2", verbose=1)

In [11]:
gsearch1.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 15.5min finished
Out[11]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_child_weight': range(1, 6, 2), 'max_depth': range(3, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [12]:
print(gsearch1.cv_results_)
print(gsearch1.best_score_)
print(gsearch1.best_params_)


{'mean_score_time': array([ 0.06701175,  0.06393933,  0.06573129,  0.08408968,  0.08918341,
        0.08569034,  0.11710993,  0.11174297,  0.11425265,  0.13856006,
        0.13878004,  0.13838251]), 'split1_train_score': array([ 0.6832496 ,  0.68139135,  0.68042287,  0.7805569 ,  0.77216189,
        0.76830724,  0.85269423,  0.8432753 ,  0.83369414,  0.90689041,
        0.896589  ,  0.88596617]), 'mean_test_score': array([ 0.59756906,  0.59920533,  0.59746256,  0.60988449,  0.61355777,
        0.6161175 ,  0.6177487 ,  0.61778345,  0.62144769,  0.6043361 ,
        0.6050557 ,  0.60500439]), 'split1_test_score': array([ 0.62661742,  0.62747537,  0.6311931 ,  0.66397505,  0.65987904,
        0.66573214,  0.66774137,  0.66796597,  0.66766953,  0.65825125,
        0.66715402,  0.66651737]), 'std_test_score': array([ 0.05736395,  0.0549439 ,  0.05571959,  0.08756236,  0.0805866 ,
        0.08246068,  0.08173085,  0.08564962,  0.07723884,  0.09030452,
        0.09037289,  0.0912571 ]), 'rank_test_score': array([11, 10, 12,  6,  5,  4,  3,  2,  1,  9,  7,  8], dtype=int32), 'split2_test_score': array([ 0.64864571,  0.64775367,  0.6422739 ,  0.6793128 ,  0.68057883,
        0.68271103,  0.68300812,  0.68816905,  0.6840515 ,  0.67764342,
        0.67075058,  0.67250579]), 'mean_fit_time': array([ 13.07302992,  13.15797043,  13.01734273,  21.49819056,
        21.26326164,  21.31241139,  30.77172462,  29.885336  ,
        30.40557528,  37.43503626,  37.31391358,  37.19336613]), 'params': ({'min_child_weight': 1, 'max_depth': 3}, {'min_child_weight': 3, 'max_depth': 3}, {'min_child_weight': 5, 'max_depth': 3}, {'min_child_weight': 1, 'max_depth': 5}, {'min_child_weight': 3, 'max_depth': 5}, {'min_child_weight': 5, 'max_depth': 5}, {'min_child_weight': 1, 'max_depth': 7}, {'min_child_weight': 3, 'max_depth': 7}, {'min_child_weight': 5, 'max_depth': 7}, {'min_child_weight': 1, 'max_depth': 9}, {'min_child_weight': 3, 'max_depth': 9}, {'min_child_weight': 5, 'max_depth': 9}), 'std_fit_time': array([ 0.14305627,  0.14002662,  0.13217906,  0.46156347,  0.38823841,
        0.397003  ,  0.63965822,  0.3643096 ,  0.54928606,  0.74824168,
        0.61978809,  0.49741589]), 'param_min_child_weight': masked_array(data = [1 3 5 1 3 5 1 3 5 1 3 5],
             mask = [False False False False False False False False False False False False],
       fill_value = ?)
, 'split2_train_score': array([ 0.67473488,  0.67204205,  0.66958508,  0.77074367,  0.76463405,
        0.76050072,  0.84348797,  0.83337141,  0.82390455,  0.89840832,
        0.88687628,  0.87354472]), 'std_score_time': array([ 0.00158808,  0.00337306,  0.00124667,  0.00199825,  0.00193964,
        0.00198318,  0.00380707,  0.00253872,  0.00087463,  0.00307121,
        0.00050006,  0.00333371]), 'split0_test_score': array([ 0.51744936,  0.52239198,  0.51892532,  0.48637284,  0.50022239,
        0.49991624,  0.50250338,  0.49722264,  0.51262855,  0.47712123,
        0.47726932,  0.47599703]), 'param_max_depth': masked_array(data = [3 3 3 5 5 5 7 7 7 9 9 9],
             mask = [False False False False False False False False False False False False],
       fill_value = ?)
, 'std_train_score': array([ 0.01924706,  0.02036275,  0.0215431 ,  0.01072061,  0.01348896,
        0.01415135,  0.0062124 ,  0.00939856,  0.01250133,  0.00581404,
        0.00823754,  0.01081873]), 'split0_train_score': array([ 0.71915002,  0.719147  ,  0.71972958,  0.79674453,  0.79625983,
        0.79365235,  0.85858425,  0.85632151,  0.85392698,  0.91255665,
        0.90704944,  0.90002814]), 'mean_train_score': array([ 0.69237817,  0.69086013,  0.68991251,  0.7826817 ,  0.77768526,
        0.77415343,  0.85158882,  0.84432274,  0.83717522,  0.90595179,
        0.89683824,  0.88651301])}
0.621447694568
{'min_child_weight': 5, 'max_depth': 7}

In [16]:
param_test2 = {
    "max_depth": [6, 7, 8],
    "min_child_weight": [4, 5, 6]
}
gsearch2 = GridSearchCV(estimator=xgb1, param_grid=param_test2, scoring="r2", verbose=1)

In [17]:
gsearch2.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 12.6min finished
Out[17]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_child_weight': [4, 5, 6], 'max_depth': [6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [18]:
print(gsearch2.cv_results_)
print(gsearch2.best_score_)
print(gsearch2.best_params_)


{'mean_score_time': array([ 0.09515031,  0.09352493,  0.09306359,  0.10771243,  0.10707641,
        0.10636965,  0.12179669,  0.1200006 ,  0.11703531]), 'split1_train_score': array([ 0.804579  ,  0.80184064,  0.79810338,  0.8392366 ,  0.83369414,
        0.82997376,  0.86772108,  0.86288497,  0.85916154]), 'mean_test_score': array([ 0.61279463,  0.61706851,  0.61888388,  0.61979564,  0.62144769,
        0.62404684,  0.60346076,  0.60608769,  0.61016511]), 'split1_test_score': array([ 0.66250872,  0.66436937,  0.66476746,  0.6675491 ,  0.66766953,
        0.66509784,  0.6639709 ,  0.66494551,  0.66695904]), 'std_test_score': array([ 0.08587164,  0.07994363,  0.0795714 ,  0.08108955,  0.07723884,
        0.07185241,  0.09401714,  0.09467049,  0.09233051]), 'rank_test_score': array([6, 5, 4, 3, 2, 1, 9, 8, 7], dtype=int32), 'split2_test_score': array([ 0.68389624,  0.6823512 ,  0.68493141,  0.68621649,  0.6840515 ,
        0.68402484,  0.67574122,  0.68080497,  0.68359669]), 'mean_fit_time': array([ 23.62828565,  23.71099997,  23.47338907,  27.62242993,
        27.70262527,  27.65972741,  31.89469568,  31.73113179,  31.74262937]), 'params': ({'min_child_weight': 4, 'max_depth': 6}, {'min_child_weight': 5, 'max_depth': 6}, {'min_child_weight': 6, 'max_depth': 6}, {'min_child_weight': 4, 'max_depth': 7}, {'min_child_weight': 5, 'max_depth': 7}, {'min_child_weight': 6, 'max_depth': 7}, {'min_child_weight': 4, 'max_depth': 8}, {'min_child_weight': 5, 'max_depth': 8}, {'min_child_weight': 6, 'max_depth': 8}), 'std_fit_time': array([ 0.2874106 ,  0.46070622,  0.29822477,  0.41805901,  0.36369775,
        0.32042699,  0.41725812,  0.45623025,  0.3396758 ]), 'param_min_child_weight': masked_array(data = [4 5 6 4 5 6 4 5 6],
             mask = [False False False False False False False False False],
       fill_value = ?)
, 'split2_train_score': array([ 0.79870909,  0.7947246 ,  0.78933099,  0.82912878,  0.82390455,
        0.81789021,  0.85712781,  0.85072698,  0.84516848]), 'std_score_time': array([ 0.00072833,  0.00204345,  0.00019679,  0.00149226,  0.00528428,
        0.00393314,  0.00304605,  0.00263438,  0.00115748]), 'split0_test_score': array([ 0.49198632,  0.50449173,  0.50695962,  0.50562822,  0.51262855,
        0.52302407,  0.47067767,  0.47252033,  0.47994723]), 'param_max_depth': masked_array(data = [6 6 6 7 7 7 8 8 8],
             mask = [False False False False False False False False False],
       fill_value = ?)
, 'std_train_score': array([ 0.01198254,  0.01289474,  0.01404275,  0.0108985 ,  0.01250133,
        0.01309754,  0.00897944,  0.0100301 ,  0.01087245]), 'split0_train_score': array([ 0.82654935,  0.82493325,  0.82252133,  0.85558063,  0.85392698,
        0.84967   ,  0.87911797,  0.87529518,  0.87178875]), 'mean_train_score': array([ 0.80994581,  0.80716616,  0.80331857,  0.84131534,  0.83717522,
        0.83251133,  0.86798896,  0.86296905,  0.85870625])}
0.624046836469
{'min_child_weight': 6, 'max_depth': 7}

In [17]:
xgb2 = XGBRegressor(learning_rate=0.05,
                     max_depth=7,
                     min_child_weight=6,
                     gamma=0,
                     subsample=0.7,
                     colsample_bylevel=0.7,
                     objective="reg:linear",
                     seed=42)

In [18]:
param_test3 = {
    "gamma": np.arange(0, 1.2, 0.2)
}
gsearch3 = GridSearchCV(estimator=xgb2, param_grid=param_test3, scoring="r2", verbose=1)

In [19]:
gsearch3.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.1min finished
Out[19]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gamma': array([ 0. ,  0.2,  0.4,  0.6,  0.8,  1. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [20]:
print(gsearch3.cv_results_)
print(gsearch3.best_score_)
print(gsearch3.best_params_)


{'mean_fit_time': array([ 30.24002806,  30.48722537,  29.1691583 ,  30.44474498,
        30.30343501,  29.50940537]), 'split1_train_score': array([ 0.82997376,  0.82997376,  0.82997376,  0.82997376,  0.82997376,
        0.82997376]), 'std_test_score': array([ 0.07185241,  0.07185241,  0.07185241,  0.07185241,  0.07185241,
        0.07185241]), 'mean_score_time': array([ 0.11473529,  0.11179773,  0.10508498,  0.1105334 ,  0.11938731,
        0.11219994]), 'split1_test_score': array([ 0.66509784,  0.66509784,  0.66509784,  0.66509784,  0.66509784,
        0.66509784]), 'split2_test_score': array([ 0.68402484,  0.68402484,  0.68402484,  0.68402484,  0.68402484,
        0.68402484]), 'std_score_time': array([ 0.00217996,  0.00314583,  0.00381724,  0.0059813 ,  0.01124935,
        0.00417286]), 'std_train_score': array([ 0.01309754,  0.01309754,  0.01309754,  0.01309754,  0.01309754,
        0.01309754]), 'split0_test_score': array([ 0.52302407,  0.52302407,  0.52302407,  0.52302407,  0.52302407,
        0.52302407]), 'std_fit_time': array([ 0.54438604,  0.42107532,  1.1663721 ,  0.3943172 ,  0.55198361,
        0.81877176]), 'param_gamma': masked_array(data = [0.0 0.20000000000000001 0.40000000000000002 0.60000000000000009
 0.80000000000000004 1.0],
             mask = [False False False False False False],
       fill_value = ?)
, 'split0_train_score': array([ 0.84967,  0.84967,  0.84967,  0.84967,  0.84967,  0.84967]), 'params': ({'gamma': 0.0}, {'gamma': 0.20000000000000001}, {'gamma': 0.40000000000000002}, {'gamma': 0.60000000000000009}, {'gamma': 0.80000000000000004}, {'gamma': 1.0}), 'mean_test_score': array([ 0.62404684,  0.62404684,  0.62404684,  0.62404684,  0.62404684,
        0.62404684]), 'rank_test_score': array([1, 1, 1, 1, 1, 1], dtype=int32), 'mean_train_score': array([ 0.83251133,  0.83251133,  0.83251133,  0.83251133,  0.83251133,
        0.83251133]), 'split2_train_score': array([ 0.81789021,  0.81789021,  0.81789021,  0.81789021,  0.81789021,
        0.81789021])}
0.624046836469
{'gamma': 0.0}

In [52]:
xgb3 = XGBRegressor(learning_rate=0.05,
                     max_depth=7,
                     min_child_weight=6,
                     gamma=0,
                     subsample=0.7,
                     colsample_bytree=0.7,
                     objective="reg:linear",
                     seed=42)

In [53]:
param_test4 = {
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1)
}
gsearch4 = GridSearchCV(estimator=xgb3, param_grid=param_test4, scoring="r2", verbose=1)

In [54]:
gsearch4.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 37.2min finished
Out[54]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'subsample': array([ 0.5,  0.6,  0.7,  0.8,  0.9]), 'colsample_bytree': array([ 0.5,  0.6,  0.7,  0.8,  0.9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [55]:
print(gsearch4.cv_results_)
print(gsearch4.best_score_)
print(gsearch4.best_params_)


{'mean_fit_time': array([ 21.97196261,  22.55792165,  22.82459633,  21.99888261,
        22.22140702,  25.26790325,  25.72357694,  25.84685032,
        25.39442937,  24.98320524,  28.40661867,  29.47806629,
        29.57766366,  29.06855226,  28.74974394,  32.99740299,
        33.93686136,  32.69801434,  33.21947956,  33.254445  ,
        36.5933876 ,  37.35075307,  37.35155725,  37.19543401,  36.50886933]), 'split1_train_score': array([ 0.80285941,  0.81573785,  0.82427744,  0.83262439,  0.83767667,
        0.80436458,  0.81773754,  0.82874569,  0.83695997,  0.84032511,
        0.80645391,  0.81876318,  0.83107344,  0.83783258,  0.84478245,
        0.80947238,  0.820264  ,  0.83041896,  0.83815992,  0.84423326,
        0.81129656,  0.82248296,  0.83095846,  0.83519809,  0.84571339]), 'param_colsample_bytree': masked_array(data = [0.5 0.5 0.5 0.5 0.5 0.59999999999999998 0.59999999999999998
 0.59999999999999998 0.59999999999999998 0.59999999999999998
 0.69999999999999996 0.69999999999999996 0.69999999999999996
 0.69999999999999996 0.69999999999999996 0.79999999999999993
 0.79999999999999993 0.79999999999999993 0.79999999999999993
 0.79999999999999993 0.89999999999999991 0.89999999999999991
 0.89999999999999991 0.89999999999999991 0.89999999999999991],
             mask = [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False],
       fill_value = ?)
, 'std_test_score': array([ 0.11322163,  0.0972665 ,  0.10792994,  0.09383125,  0.09988799,
        0.12380951,  0.0947291 ,  0.10472924,  0.10557792,  0.13316617,
        0.10409124,  0.10978099,  0.10119156,  0.11051819,  0.10488389,
        0.09248377,  0.09940575,  0.08740566,  0.09603984,  0.09114049,
        0.12133959,  0.09126171,  0.1116557 ,  0.099718  ,  0.11386891]), 'mean_score_time': array([ 0.11654568,  0.11720761,  0.12089769,  0.11871823,  0.12098273,
        0.11073144,  0.1098357 ,  0.12178302,  0.12462473,  0.11448272,
        0.10732404,  0.11081902,  0.11279265,  0.11801799,  0.11292434,
        0.11179892,  0.11859298,  0.11156607,  0.10987306,  0.10922003,
        0.10295804,  0.11244297,  0.11141276,  0.11324557,  0.11249169]), 'split1_test_score': array([ 0.66002967,  0.66506092,  0.65995153,  0.6636246 ,  0.66140996,
        0.66533097,  0.66431454,  0.66208666,  0.65440724,  0.66416187,
        0.65810299,  0.66504705,  0.66156557,  0.66766598,  0.66552802,
        0.66680585,  0.66538018,  0.65965832,  0.66412887,  0.6671072 ,
        0.6700523 ,  0.6684202 ,  0.66533318,  0.66222828,  0.66755502]), 'split2_test_score': array([ 0.68272453,  0.68514629,  0.68235405,  0.68270795,  0.68815656,
        0.68144272,  0.68882791,  0.69152463,  0.68991783,  0.6870187 ,
        0.67710564,  0.68302979,  0.68174745,  0.68961841,  0.68770587,
        0.67743801,  0.67724035,  0.6819949 ,  0.68642719,  0.68683571,
        0.68175261,  0.67308339,  0.67913951,  0.68228262,  0.68819644]), 'std_score_time': array([ 0.00515197,  0.00652802,  0.00477143,  0.00540393,  0.00491151,
        0.00363181,  0.00350078,  0.0025169 ,  0.00899159,  0.00829351,
        0.00275274,  0.00749222,  0.0037727 ,  0.00926194,  0.00351383,
        0.00452341,  0.01088076,  0.00522775,  0.00287998,  0.00234877,
        0.00174677,  0.00262562,  0.00947355,  0.00476022,  0.00193382]), 'param_subsample': masked_array(data = [0.5 0.59999999999999998 0.69999999999999996 0.79999999999999993
 0.89999999999999991 0.5 0.59999999999999998 0.69999999999999996
 0.79999999999999993 0.89999999999999991 0.5 0.59999999999999998
 0.69999999999999996 0.79999999999999993 0.89999999999999991 0.5
 0.59999999999999998 0.69999999999999996 0.79999999999999993
 0.89999999999999991 0.5 0.59999999999999998 0.69999999999999996
 0.79999999999999993 0.89999999999999991],
             mask = [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False],
       fill_value = ?)
, 'std_train_score': array([ 0.01608218,  0.01541611,  0.01424178,  0.01278714,  0.01099711,
        0.01642833,  0.01456426,  0.01301148,  0.01245916,  0.01096433,
        0.01633237,  0.01560868,  0.01352986,  0.01169069,  0.01041023,
        0.01752066,  0.0152392 ,  0.01335559,  0.01175404,  0.01073932,
        0.01679671,  0.01523964,  0.01451446,  0.01243783,  0.01109318]), 'split0_test_score': array([ 0.43200473,  0.46950595,  0.44302368,  0.47480863,  0.4641598 ,
        0.41111996,  0.47674603,  0.4561101 ,  0.45032033,  0.3937984 ,
        0.44740895,  0.44168061,  0.45771085,  0.44497125,  0.45495649,
        0.47615183,  0.46069062,  0.48642404,  0.4724652 ,  0.48439083,
        0.41870375,  0.47720014,  0.43568249,  0.46143709,  0.4369872 ]), 'std_fit_time': array([ 0.29769326,  0.50653333,  0.43687925,  0.60637034,  0.21317109,
        0.53933105,  0.30759731,  0.82026782,  0.37673718,  0.70687718,
        0.22072994,  0.38900328,  0.33355438,  0.14707755,  0.28969808,
        0.45170022,  1.00115208,  0.62846533,  0.31933723,  0.74194885,
        0.55090275,  0.49779446,  0.78971384,  0.47130468,  1.13094746]), 'split0_train_score': array([ 0.83297573,  0.84119577,  0.84919621,  0.85497107,  0.85770309,
        0.83455996,  0.84338757,  0.85197403,  0.85788915,  0.85924956,
        0.83503352,  0.84385019,  0.85174615,  0.85699613,  0.86033477,
        0.83754721,  0.84459018,  0.85269769,  0.85672276,  0.86198947,
        0.83640648,  0.84530127,  0.85466426,  0.85604464,  0.86197926]), 'params': ({'subsample': 0.5, 'colsample_bytree': 0.5}, {'subsample': 0.59999999999999998, 'colsample_bytree': 0.5}, {'subsample': 0.69999999999999996, 'colsample_bytree': 0.5}, {'subsample': 0.79999999999999993, 'colsample_bytree': 0.5}, {'subsample': 0.89999999999999991, 'colsample_bytree': 0.5}, {'subsample': 0.5, 'colsample_bytree': 0.59999999999999998}, {'subsample': 0.59999999999999998, 'colsample_bytree': 0.59999999999999998}, {'subsample': 0.69999999999999996, 'colsample_bytree': 0.59999999999999998}, {'subsample': 0.79999999999999993, 'colsample_bytree': 0.59999999999999998}, {'subsample': 0.89999999999999991, 'colsample_bytree': 0.59999999999999998}, {'subsample': 0.5, 'colsample_bytree': 0.69999999999999996}, {'subsample': 0.59999999999999998, 'colsample_bytree': 0.69999999999999996}, {'subsample': 0.69999999999999996, 'colsample_bytree': 0.69999999999999996}, {'subsample': 0.79999999999999993, 'colsample_bytree': 0.69999999999999996}, {'subsample': 0.89999999999999991, 'colsample_bytree': 0.69999999999999996}, {'subsample': 0.5, 'colsample_bytree': 0.79999999999999993}, {'subsample': 0.59999999999999998, 'colsample_bytree': 0.79999999999999993}, {'subsample': 0.69999999999999996, 'colsample_bytree': 0.79999999999999993}, {'subsample': 0.79999999999999993, 'colsample_bytree': 0.79999999999999993}, {'subsample': 0.89999999999999991, 'colsample_bytree': 0.79999999999999993}, {'subsample': 0.5, 'colsample_bytree': 0.89999999999999991}, {'subsample': 0.59999999999999998, 'colsample_bytree': 0.89999999999999991}, {'subsample': 0.69999999999999996, 'colsample_bytree': 0.89999999999999991}, {'subsample': 0.79999999999999993, 'colsample_bytree': 0.89999999999999991}, {'subsample': 0.89999999999999991, 'colsample_bytree': 0.89999999999999991}), 'mean_test_score': array([ 0.59158316,  0.60656833,  0.59510673,  0.60704444,  0.60457255,
        0.58596125,  0.6099601 ,  0.60323741,  0.59821196,  0.58165601,
        0.59420299,  0.59658283,  0.60033847,  0.60074881,  0.60272719,
        0.60679612,  0.60110108,  0.60935657,  0.60767103,  0.61277535,
        0.59016639,  0.60623226,  0.59338209,  0.60197988,  0.59757642]), 'rank_test_score': array([22,  7, 19,  5,  9, 24,  2, 10, 16, 25, 20, 18, 15, 14, 11,  6, 13,
        3,  4,  1, 23,  8, 21, 12, 17], dtype=int32), 'mean_train_score': array([ 0.81058711,  0.82041575,  0.82968922,  0.83746204,  0.84248923,
        0.81178341,  0.82340491,  0.83406015,  0.84101263,  0.84428609,
        0.81266244,  0.82297789,  0.83393215,  0.84127167,  0.84672507,
        0.81413943,  0.82425378,  0.83464283,  0.84108809,  0.84750843,
        0.81444289,  0.82536371,  0.83516255,  0.83920771,  0.84756293]), 'split2_train_score': array([ 0.7959262 ,  0.80431364,  0.81559402,  0.82479066,  0.83208792,
        0.79642568,  0.80908963,  0.82146074,  0.82818878,  0.83328361,
        0.7964999 ,  0.80632029,  0.81897687,  0.82898629,  0.83505799,
        0.7953987 ,  0.80790716,  0.82081185,  0.82838158,  0.83630256,
        0.79562563,  0.80830689,  0.81986492,  0.82638039,  0.83499613])}
0.612775350719
{'subsample': 0.89999999999999991, 'colsample_bytree': 0.79999999999999993}

In [56]:
xgb4 = XGBRegressor(learning_rate=0.05,
                     max_depth=7,
                     min_child_weight=6,
                     gamma=0,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     objective="reg:linear",
                     seed=42)

In [57]:
param_test5 = {
    "learning_rate": np.arange(0.01, 0.12, 0.02)
}
gsearch5 = GridSearchCV(estimator=xgb4, param_grid=param_test5, scoring="r2", verbose=1)

In [58]:
gsearch5.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.8min finished
Out[58]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.9),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'learning_rate': array([ 0.01,  0.03,  0.05,  0.07,  0.09,  0.11])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [59]:
print(gsearch5.cv_results_)
print(gsearch5.best_score_)
print(gsearch5.best_params_)


{'mean_fit_time': array([ 34.23959025,  34.2434996 ,  31.95830027,  31.46197629,
        31.30107164,  31.03376762]), 'split1_train_score': array([ 0.30940852,  0.80281028,  0.84423326,  0.86204105,  0.87536992,
        0.88139915]), 'std_test_score': array([ 0.06727064,  0.07700247,  0.09114049,  0.10448471,  0.12765139,
        0.10479015]), 'mean_score_time': array([ 0.10457738,  0.11623812,  0.11232471,  0.10873866,  0.10951837,
        0.10567331]), 'split1_test_score': array([ 0.19489699,  0.64886758,  0.6671072 ,  0.65912031,  0.6649731 ,
        0.65853678]), 'split2_test_score': array([ 0.18976284,  0.66537266,  0.68683571,  0.69682493,  0.69373271,
        0.69153157]), 'std_score_time': array([ 0.00422106,  0.00333164,  0.00174697,  0.00452629,  0.00743915,
        0.00196409]), 'std_train_score': array([ 0.00573531,  0.01381136,  0.01073932,  0.01087661,  0.00937434,
        0.00840055]), 'split0_test_score': array([ 0.33496212,  0.49440077,  0.48439083,  0.4587464 ,  0.40971282,
        0.45458589]), 'std_fit_time': array([ 0.45616947,  0.62075503,  0.4570077 ,  0.68920061,  0.43560633,
        0.49929145]), 'split0_train_score': array([ 0.31884513,  0.82664354,  0.86198947,  0.8783966 ,  0.88542717,
        0.89276111]), 'param_learning_rate': masked_array(data = [0.01 0.029999999999999999 0.049999999999999996 0.069999999999999993
 0.089999999999999983 0.10999999999999997],
             mask = [False False False False False False],
       fill_value = ?)
, 'params': ({'learning_rate': 0.01}, {'learning_rate': 0.029999999999999999}, {'learning_rate': 0.049999999999999996}, {'learning_rate': 0.069999999999999993}, {'learning_rate': 0.089999999999999983}, {'learning_rate': 0.10999999999999997}), 'mean_test_score': array([ 0.23987572,  0.60287817,  0.61277535,  0.60489403,  0.58946927,
        0.6015483 ]), 'rank_test_score': array([6, 3, 1, 2, 5, 4], dtype=int32), 'mean_train_score': array([ 0.31112248,  0.80779571,  0.84750843,  0.86414772,  0.87443951,
        0.88212768]), 'split2_train_score': array([ 0.30511378,  0.79393331,  0.83630256,  0.85200551,  0.86252145,
        0.87222277])}
0.612775350719
{'learning_rate': 0.049999999999999996}

In [60]:
param_test6 = {
    "learning_rate": [0.04, 0.05, 0.06]
}
gsearch6 = GridSearchCV(estimator=xgb4, param_grid=param_test6, scoring="r2", verbose=1)

In [61]:
gsearch6.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.0min finished
Out[61]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.9),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.04, 0.05, 0.06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [62]:
print(gsearch6.cv_results_)
print(gsearch6.best_score_)
print(gsearch6.best_params_)


{'mean_fit_time': array([ 33.27052132,  33.16510908,  31.805921  ]), 'split1_train_score': array([ 0.82878414,  0.84423326,  0.85414104]), 'std_test_score': array([ 0.0895207 ,  0.09114049,  0.10798384]), 'mean_score_time': array([ 0.1187044 ,  0.10999537,  0.11767554]), 'split1_test_score': array([ 0.66483569,  0.6671072 ,  0.66251147]), 'split2_test_score': array([ 0.67581682,  0.68683571,  0.6920061 ]), 'std_score_time': array([ 0.00586206,  0.00478827,  0.00750173]), 'std_train_score': array([ 0.01207467,  0.01073932,  0.01089907]), 'split0_test_score': array([ 0.4806638 ,  0.48439083,  0.44962024]), 'std_fit_time': array([ 0.47589441,  1.45608282,  0.55955369]), 'split0_train_score': array([ 0.84990347,  0.86198947,  0.86991835]), 'param_learning_rate': masked_array(data = [0.04 0.05 0.06],
             mask = [False False False],
       fill_value = ?)
, 'params': ({'learning_rate': 0.04}, {'learning_rate': 0.05}, {'learning_rate': 0.06}), 'mean_test_score': array([ 0.60710306,  0.61277535,  0.60137613]), 'rank_test_score': array([2, 1, 3], dtype=int32), 'mean_train_score': array([ 0.83336634,  0.84750843,  0.85581268]), 'split2_train_score': array([ 0.82141142,  0.83630256,  0.84337866])}
0.612775350719
{'learning_rate': 0.05}

In [63]:
xgb5 = XGBRegressor(learning_rate=0.05,
                     max_depth=7,
                     min_child_weight=6,
                     gamma=0,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     objective="reg:linear",
                     reg_alpha=0,
                     seed=42)

In [64]:
param_test7 = {
    "reg_alpha": np.arange(0.01, 0.11, 0.02)
}
gsearch7 = GridSearchCV(estimator=xgb5, param_grid=param_test7, scoring="r2", verbose=1)

In [65]:
gsearch7.fit(x_train.values, y_train.values)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  8.5min finished
Out[65]:
GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.9),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'reg_alpha': array([ 0.01,  0.03,  0.05,  0.07,  0.09])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [66]:
print(gsearch7.cv_results_)
print(gsearch7.best_score_)
print(gsearch7.best_params_)


{'mean_fit_time': array([ 33.35795204,  33.54191502,  33.4405396 ,  33.59721828,  33.59879231]), 'split1_train_score': array([ 0.84423326,  0.84423326,  0.84423326,  0.84423326,  0.84423326]), 'std_test_score': array([ 0.09114049,  0.09114049,  0.09114049,  0.09114049,  0.09114049]), 'mean_score_time': array([ 0.10843492,  0.10982132,  0.11626943,  0.10852631,  0.10972071]), 'split1_test_score': array([ 0.6671072,  0.6671072,  0.6671072,  0.6671072,  0.6671072]), 'split2_test_score': array([ 0.68683571,  0.68683571,  0.68683571,  0.68683571,  0.68683571]), 'std_score_time': array([ 0.0017089 ,  0.00453805,  0.00562956,  0.00307045,  0.00395006]), 'std_train_score': array([ 0.01073932,  0.01073932,  0.01073932,  0.01073932,  0.01073932]), 'split0_test_score': array([ 0.48439082,  0.48439082,  0.48439082,  0.48439082,  0.48439082]), 'std_fit_time': array([ 0.76139841,  0.77667015,  0.46750333,  1.28307987,  1.01757436]), 'split0_train_score': array([ 0.86198947,  0.86198947,  0.86198947,  0.86198947,  0.86198947]), 'params': ({'reg_alpha': 0.01}, {'reg_alpha': 0.029999999999999999}, {'reg_alpha': 0.049999999999999996}, {'reg_alpha': 0.069999999999999993}, {'reg_alpha': 0.089999999999999983}), 'param_reg_alpha': masked_array(data = [0.01 0.029999999999999999 0.049999999999999996 0.069999999999999993
 0.089999999999999983],
             mask = [False False False False False],
       fill_value = ?)
, 'mean_test_score': array([ 0.61277535,  0.61277535,  0.61277535,  0.61277535,  0.61277535]), 'rank_test_score': array([1, 1, 3, 4, 5], dtype=int32), 'mean_train_score': array([ 0.84750843,  0.84750843,  0.84750843,  0.84750843,  0.84750843]), 'split2_train_score': array([ 0.83630256,  0.83630256,  0.83630256,  0.83630256,  0.83630256])}
0.612775349564
{'reg_alpha': 0.01}

In [ ]:
xgb6 = XGBRegressor(learning_rate=0.05,
                     max_depth=7,
                     min_child_weight=6,
                     gamma=0,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     objective="reg:linear",
                     reg_alpha=0.01,
                     seed=42)

In [ ]:


In [ ]:


In [ ]:


In [ ]:

To-Do

  • Learning rate 0.01 - 0.2
  • Subsample 0.5 - 1
  • colsample_bytree" 0.5 - 1
  • Seed

In [15]:
list(range(3,10,2))


Out[15]:
[3, 5, 7, 9]

In [ ]: