In [ ]:
"""
Bayesian hyperparameter optimization [https://github.com/fmfn/BayesianOptimization]
for Mean Absolute Error objective
on default features for https://www.kaggle.com/c/allstate-claims-severity
"""

__author__ = "Vladimir Iglovikov"

import os
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
from tqdm import tqdm
import numpy as np
from scipy.stats import skew, boxcox


def evalerror(preds, dtrain):
    """Custom XGBoost eval function: MAE on the original (exponentiated) scale.

    The target was log-transformed before training (y = log(loss + shift)),
    so both predictions and labels are mapped back through exp() before the
    error is computed.  Returns the ('mae', value) pair xgb.cv expects.
    """
    score = mean_absolute_error(np.exp(preds), np.exp(dtrain.get_label()))
    return 'mae', score


def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    """Objective for BayesianOptimization: negated CV MAE of one config.

    Writes the sampled hyperparameters into the module-level ``params``
    dict, runs 5-fold ``xgb.cv`` on the global ``xgtrain`` and returns the
    negated final test MAE (negated because BayesianOptimization maximizes).
    """
    # The optimizer samples real numbers: integer-valued parameters are
    # truncated, bounded ones are clipped to xgboost's valid ranges.
    params['min_child_weight'] = int(min_child_weight)
    # BUG FIX: key was misspelled 'cosample_bytree', so the sampled
    # colsample_bytree value landed on a dead key and was never used.
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       feval=evalerror,
                       callbacks=[xgb.callback.early_stop(50)])

    # Last row of the CV table holds the score at the early-stopped round.
    return -cv_result['test-mae-mean'].values[-1]


def prepare_data():
    """Load and preprocess the Allstate training set into an xgb.DMatrix.

    Steps:
      * read ``./data/allstate/train.csv.zip`` and subsample 20% of rows
        (now seeded via the module-level ``random_state`` so repeated
        optimization runs tune on the same data)
      * build the target as log(loss + shift) using the global ``shift``
      * Box-Cox transform numeric features with skew > 0.25
      * label-encode all categorical (object-dtype) columns

    Returns:
        xgb.DMatrix with the feature matrix and log-shifted labels.
    """
    path = "./data/allstate"
    input_file_path = os.path.join(path, "train.csv.zip")
    train = pd.read_csv(input_file_path, compression="zip", header=0, na_values=['NULL'])
    # FIX: seed the subsample — without random_state every run optimized
    # hyperparameters against a different random 20% of the data.
    train = train.sample(frac=0.2, random_state=random_state)

    # Target is computed BEFORE the Box-Cox loop below touches 'loss'.
    y = np.log(train['loss'] + shift)

    print(train.info())
    numerical_feats = train.dtypes[train.dtypes != "object"].index
    # compute skew and do Box-Cox transformation
    skewed_feats = train[numerical_feats].apply(lambda x: skew(x.dropna()))
    print('Skew in numeric features:')
    print(skewed_feats)
    # transform features with skew > 0.25 (this can be varied to find optimal value)
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in tqdm(skewed_feats):
        # +1 keeps values strictly positive, as Box-Cox requires.
        train[feats] = train[feats] + 1
        train[feats], lam = boxcox(train[feats])

    categorical_columns = train.select_dtypes(include=['object']).columns

    for column in tqdm(categorical_columns):
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])

    # FIX: pass axis by keyword — the positional form df.drop(labels, 1)
    # was deprecated and removed in pandas 2.0.
    X = train.drop(['loss', 'id'], axis=1)
    xgtrain = xgb.DMatrix(X, label=y)

    return xgtrain


if __name__ == '__main__':
    # Module-level settings — read as globals by prepare_data (shift,
    # random_state) and xgb_evaluate (num_rounds, random_state).
    num_rounds = 100000
    random_state = 2016
    num_iter = 250
    init_points = 5
    shift = 200

    xgtrain = prepare_data()

    # Fixed (non-tuned) booster settings; the tuned keys are filled in by
    # xgb_evaluate on every optimization step.
    params = {
        'eta': 0.1,
        'silent': 1,
        'eval_metric': 'mae',
        'verbose_eval': True,
        'seed': random_state,
    }

    # Search bounds for each tuned hyperparameter.
    search_space = {
        'min_child_weight': (1, 20),
        'colsample_bytree': (0, 0.7),
        'max_depth': (5, 15),
        'subsample': (0.3, 1),
        'gamma': (0, 10),
        'alpha': (0, 10),
    }

    xgbBO = BayesianOptimization(xgb_evaluate, search_space)
    xgbBO.maximize(init_points=init_points, n_iter=num_iter)


/home/andy/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
  0%|          | 0/12 [00:00<?, ?it/s]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 37664 entries, 132975 to 132698
Columns: 132 entries, id to loss
dtypes: float64(15), int64(1), object(116)
memory usage: 38.2+ MB
None
Skew in numeric features:
id        0.001300
cont1     0.510914
cont2    -0.297998
cont3    -0.005130
cont4     0.419437
cont5     0.683353
cont6     0.467183
cont7     0.816377
cont8     0.673707
cont9     1.068756
cont10    0.355044
cont11    0.288620
cont12    0.300982
cont13    0.380568
cont14    0.249562
loss      3.360506
dtype: float64
100%|██████████| 12/12 [00:03<00:00,  3.39it/s]
100%|██████████| 116/116 [00:07<00:00, 15.99it/s]
Initialization
---------------------------------------------------------------------------------------------------------------------------
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.

In [ ]: