# Hyperparameter Optimization for XGBoost

What are the options for tuning?

All right! Xgboost has about 20 params:

1. base_score
2. colsample_bylevel
3. colsample_bytree
4. gamma
5. learning_rate
6. max_delta_step
7. max_depth
8. min_child_weight
9. missing
10. n_estimators
11. nthread
12. objective
13. reg_alpha
14. reg_lambda
15. scale_pos_weight
16. seed
17. silent
18. subsample

Suppose we tune 12 of them, each with 5-10 candidate values. An exhaustive grid then has between 5^12 (about 2.4 * 10^8) and 10^12 combinations. Even if checking one combination takes only 10 s, 5^12 combinations need roughly 77 years and 10^12 need over 300,000 years :).

This is too long... but there's a third option, beyond exhaustive grid search and random search: Bayesian optimisation.

``````

In [1]:

import pandas as pd
import xgboost as xgb
import numpy as np
import seaborn as sns

from hyperopt import hp
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

%matplotlib inline

``````
``````

In [2]:

# Parse the timestamp column once and reuse the parsed Series to derive the
# day-of-month column. `.map` (rather than `.dt.day`) is kept on purpose so
# the resulting dtype stays whatever the per-element access produces.
parsed_datetime = pd.to_datetime(train['datetime'])
train['datetime'] = parsed_datetime
train['day'] = parsed_datetime.map(lambda stamp: stamp.day)

``````

## Modeling

``````

In [6]:

def assing_test_samples(data, last_training_day=0.3, seed=1):
    """Flag a random subset of the distinct ``day`` values as hold-out rows.

    The split is made on whole days, so all rows sharing a ``day`` value end
    up on the same side of the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``day`` column. It is modified in place: a boolean
        ``is_test`` column is added (or overwritten).
    last_training_day : float, optional
        Fraction of the distinct ``day`` values to hold out (despite the
        name, this is a fraction, not a day number). The resulting count is
        truncated towards zero. Defaults to 0.3.
    seed : int, optional
        Seed given to ``np.random.seed`` before shuffling, which makes the
        selection repeatable. Note that this reseeds NumPy's global
        generator.

    Returns
    -------
    None
    """
    # Work on a private copy so the in-place shuffle can never touch (or be
    # blocked by) a buffer owned by pandas.
    days = np.array(data.day.unique())
    np.random.seed(seed)
    np.random.shuffle(days)

    # Honour the requested fraction instead of a hard-coded 0.3.
    n_test_days = int(len(days) * last_training_day)
    test_days = days[:n_test_days]

    data['is_test'] = data.day.isin(test_days)

def select_features(data):
    """Return the names of the columns that are used as model inputs.

    A column qualifies when its dtype is int64, float64 or bool, it is not
    one of the target columns (``count``, ``casual``, ``registered``) and
    its name does not contain ``'log'`` (the derived log-targets).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected; column names are expected to be
        strings because of the ``'log' not in name`` check.

    Returns
    -------
    list
        Qualifying column names, in the frame's column order.
    """
    dtypes = data.dtypes
    # `np.bool` was only an alias of the builtin `bool` and was removed in
    # NumPy 1.24; `np.bool_` is the boolean scalar type in every version.
    is_model_dtype = (
        (dtypes == np.int64) | (dtypes == np.float64) | (dtypes == np.bool_)
    )
    columns = data.columns[is_model_dtype.values].values

    targets = ('count', 'casual', 'registered')
    return [feat for feat in columns
            if feat not in targets and 'log' not in feat]

def get_X_y(data, target_variable):
    """Build the feature matrix and target vector for one target column.

    The feature columns are whatever ``select_features(data)`` returns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    target_variable : str
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)`` as the ``.values`` of the selected feature columns and of
        the target column.
    """
    feature_names = select_features(data)
    feature_matrix = data[feature_names].values
    target_vector = data[target_variable].values
    return feature_matrix, target_vector

def train_test_split(train, target_variable):
    """Split ``train`` into fit / hold-out arrays using its ``is_test`` flag.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame carrying an ``is_test`` column.
    target_variable : str
        Name of the target column forwarded to ``get_X_y``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    is_test = train.is_test
    fit_rows = train[is_test.eq(False)]
    holdout_rows = train[is_test.eq(True)]

    X_fit, y_fit = get_X_y(fit_rows, target_variable)
    X_holdout, y_holdout = get_X_y(holdout_rows, target_variable)
    return X_fit, X_holdout, y_fit, y_holdout

def fit_and_predict(train, model, target_variable):
    """Fit ``model`` on the non-test rows and predict the hold-out rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame passed on to ``train_test_split``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.
    target_variable : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: hold-out targets and the model's predictions
        for the hold-out features.
    """
    split = train_test_split(train, target_variable)
    features_fit, features_holdout, target_fit, target_holdout = split

    model.fit(features_fit, target_fit)
    return (target_holdout, model.predict(features_holdout))

def post_pred(y_pred):
    """Clamp negative predictions to zero, in place.

    Parameters
    ----------
    y_pred : array-like supporting boolean-mask assignment
        Predictions; modified in place.

    Returns
    -------
    The same ``y_pred`` object that was passed in.
    """
    below_zero = y_pred < 0
    y_pred[below_zero] = 0
    return y_pred

def rmsle(y_true, y_pred, y_pred_only_positive=True):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values. Never modified by this function.
    y_pred_only_positive : bool, optional
        When true (the default), negative predictions are treated as 0
        before the logarithm is taken.

    Returns
    -------
    float
        The RMSLE score.
    """
    if y_pred_only_positive:
        # np.maximum builds a new array, so the caller's predictions are not
        # clamped behind their back (the previous in-place clamp did that).
        y_pred = np.maximum(y_pred, 0)

    # log1p(x) == log(x + 1), but stays accurate for values close to zero.
    diff = np.log1p(y_pred) - np.log1p(y_true)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

# Mark the hold-out rows using the function's defaults (30% of the distinct
# days, seed 1); this adds the boolean `is_test` column to `train` in place.
# `train` must already have its `day` column at this point.
assing_test_samples(train)

``````
``````

In [4]:

def etl_datetime(df):
    """Add calendar feature columns derived from ``df['datetime']``, in place.

    Columns written, in this order: ``year``, ``month``, ``hour``,
    ``minute``, ``dayofweek`` and ``weekend`` (true when ``dayofweek`` is
    5 or 6).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``datetime`` column holds objects exposing the
        attributes listed above.

    Returns
    -------
    None
    """
    stamps = df['datetime']

    # Each plain calendar attribute becomes a column of the same name.
    for part in ('year', 'month', 'hour', 'minute', 'dayofweek'):
        df[part] = stamps.map(lambda stamp, part=part: getattr(stamp, part))

    df['weekend'] = stamps.map(lambda stamp: stamp.dayofweek in [5, 6])

# Add the calendar feature columns to `train` in place.
etl_datetime(train)

# Log-transformed targets, stored as `<name>_log`. All three use
# log2(x + 1): the `+ 1` keeps zero counts finite (log2(0) is -inf) and
# matches the `exp2(prediction) - 1` inverse applied when scoring.
# Previously `count_log` alone was computed as log2(x), without the shift.
for name in ['count', 'registered', 'casual']:
    train['{0}_log'.format(name)] = np.log2(train[name] + 1)

``````

## Tuning hyperparameters using Bayesian optimization algorithms

``````

In [42]:

def objective(space):
    """Hyperopt objective: score one XGBoost configuration on the hold-out set.

    One ``XGBRegressor`` is built from the sampled hyperparameters and fitted
    twice, once on ``registered_log`` and once on ``casual_log``. The two
    predictions are mapped back from log2 space, summed, and compared with
    the hold-out ``count`` column using ``rmsle``.

    Parameters
    ----------
    space : dict
        One sample of the search space with the keys ``max_depth``,
        ``n_estimators``, ``subsample``, ``colsample_bytree``,
        ``learning_rate`` and ``reg_alpha``.

    Returns
    -------
    dict
        ``{'loss': <rmsle score>, 'status': STATUS_OK}``, the result format
        passed back to ``fmin``.
    """
    model = xgb.XGBRegressor(
        # hp.quniform samples floats (e.g. 17.0), but the tree depth and the
        # number of trees are integer parameters, so cast both explicitly.
        max_depth=int(space['max_depth']),
        n_estimators=int(space['n_estimators']),
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        learning_rate=space['learning_rate'],
        reg_alpha=space['reg_alpha'],
    )

    # The same estimator object is refitted for each of the two targets.
    (_, registered_pred) = fit_and_predict(train, model, 'registered_log')
    (_, casual_pred) = fit_and_predict(train, model, 'casual_log')

    y_test = train[train.is_test == True]['count']
    # Undo the log2(x + 1) transform of each target before adding them up.
    y_pred = (np.exp2(registered_pred) - 1) + (np.exp2(casual_pred) - 1)

    score = rmsle(y_test, y_pred)
    # Function-call form with a single argument prints the same text on
    # Python 2 and Python 3 (the old `print "..."` statement is Python 2 only).
    print("SCORE: {0}".format(score))

    return {'loss': score, 'status': STATUS_OK}

# Search space handed to hyperopt. The first argument of every `hp.*` call is
# the label under which the sampled value is reported in `best`.
space = dict(
    max_depth=hp.quniform("x_max_depth", 2, 20, 1),
    n_estimators=hp.quniform("n_estimators", 100, 1000, 1),
    subsample=hp.uniform('x_subsample', 0.8, 1),
    colsample_bytree=hp.uniform('x_colsample_bytree', 0.1, 1),
    learning_rate=hp.uniform('x_learning_rate', 0.01, 0.1),
    reg_alpha=hp.uniform('x_reg_alpha', 0.1, 1),
)

# Run 15 evaluations of `objective` with the TPE algorithm, recording every
# evaluation in `trials`.
trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials,
)

print(best)

``````
``````

SCORE: 0.327769943579
SCORE: 0.402119793524
SCORE: 0.441702998659
SCORE: 0.344952075056
SCORE: 0.332483052772
SCORE: 0.415230694098
SCORE: 0.326159133525
SCORE: 0.366755440868
SCORE: 0.336209948966
SCORE: 0.320813982928
SCORE: 0.33925039026
SCORE: 0.363387131966
SCORE: 0.324682064912
SCORE: 0.382678760754
SCORE: 0.488176057958
{'x_learning_rate': 0.0803514512536536, 'x_reg_alpha': 0.44303008763740737, 'n_estimators': 421.0, 'x_max_depth': 17.0, 'x_subsample': 0.9561807797584932, 'x_colsample_bytree': 0.8214374064161822}

``````
``````

In [ ]:

``````