In [1]:
"""
Baysian hyperparameter optimization [https://github.com/fmfn/BayesianOptimization]
for Mean Absoulte Error objective
on default features for https://www.kaggle.com/c/allstate-claims-severity
"""

__author__ = "Vladimir Iglovikov"

import os
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
from tqdm import tqdm

In [ ]:
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)


    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(20)])

    return -cv_result['test-mae-mean'].values[-1]


def prepare_data():
    path = "./data/allstate"
    inputFilePath = os.path.join(path, "train.csv.zip")
    train = pd.read_csv(inputFilePath, compression="zip", header=0)
    train = train.sample(frac=0.01)
    categorical_columns = train.select_dtypes(include=['object']).columns

    for column in tqdm(categorical_columns):
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])

    y = train['loss']

    X = train.drop(['loss', 'id'], 1)
    xgtrain = xgb.DMatrix(X, label=y)

    return xgtrain


if __name__ == '__main__':
    xgtrain = prepare_data()

    num_rounds = 100000
    random_state = 2016
    num_iter = 25
    init_points = 5
    params = {
        'eta': 0.1,
        'silent': 1,
        'eval_metric': 'mae',
        'verbose_eval': True,
        'seed': random_state
    }

    xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 50),
                                                'colsample_bytree': (0, 1),
                                                'max_depth': (5, 50),
                                                'subsample': (0, 1),
                                                'gamma': (0, 50),
                                                'alpha': (0, 50),
                                                })

    xgbBO.maximize(init_points=init_points, n_iter=num_iter)


100%|██████████| 116/116 [00:00<00:00, 627.81it/s]
Initialization
---------------------------------------------------------------------------------------------------------------------------
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 20 rounds.
Stopping. Best iteration:
[26]	train-mae:1160.06+10.3619	test-mae:1303.72+52.9484

    1 | 00m01s | -1303.71733 |   15.9118 |             0.8436 |   19.5128 |     41.1343 |            29.8087 |      0.3912 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 20 rounds.
Stopping. Best iteration:
[23]	train-mae:759.988+10.3987	test-mae:1289.41+36.4478

    2 | 00m01s | -1289.40584 |   27.2547 |             0.4422 |    7.3513 |     49.2704 |            12.5711 |      0.7972 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 20 rounds.
Stopping. Best iteration:
[117]	train-mae:2101.13+210.337	test-mae:2102.28+194.247

    3 | 00m01s | -2102.28472 |   10.3205 |             0.7323 |   46.2793 |      8.8176 |            24.2801 |      0.0104 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 20 rounds.
Stopping. Best iteration:
[32]	train-mae:1159.72+17.6586	test-mae:1312.58+64.3994

    4 | 00m01s | -1312.57510 |   26.4966 |             0.3325 |   46.2954 |     22.9669 |            43.1862 |      0.5021 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 20 rounds.
Stopping. Best iteration:
[29]	train-mae:1064.6+14.2422	test-mae:1299.43+55.4582


In [ ]: