In [3]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import uniform as sp_rand
from sklearn.metrics import mean_absolute_error
# sklearn.grid_search was deprecated in 0.18 and removed in 0.20;
# RandomizedSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

In [ ]:
# Load the Allstate claims training set and label-encode every
# object-dtype (categorical) column so xgboost can consume the matrix.
path = "./data/allstate"
inputFilePath = os.path.join(path, "train.csv.zip")
train = pd.read_csv(inputFilePath, compression="zip", header=0)
categorical_columns = train.select_dtypes(include=['object']).columns

# NOTE(review): the original wrapped this loop in tqdm() without ever
# importing it, which raises NameError on a fresh kernel.  A plain loop
# avoids the undeclared dependency.
for column in categorical_columns:
    le = LabelEncoder()
    train[column] = le.fit_transform(train[column])

y = train['loss']

# drop(columns=...) replaces the deprecated positional axis argument
# (train.drop([...], 1)).
X = train.drop(columns=['loss', 'id'])
xgtrain = xgb.DMatrix(X, label=y)

# Hyper-parameter search space.  RandomizedSearchCV samples from either a
# list/sequence or a scipy.stats distribution exposing .rvs; the original
# two-element tuples (lo, hi) would only ever sample the two endpoint
# values.  sp_rand(loc, scale) is uniform on [loc, loc + scale].
param_grid = {'min_child_weight': list(range(1, 51)),
              'colsample_bytree': sp_rand(0, 1),   # uniform on [0, 1]
              'max_depth': list(range(5, 51)),
              'subsample': sp_rand(0, 1),          # uniform on [0, 1]
              'gamma': sp_rand(0, 50),             # uniform on [0, 50]
              'reg_alpha': sp_rand(0, 50),         # L1 reg (xgboost alias: alpha)
              }

num_rounds = 100000          # upper bound on boosting rounds; early stopping trims it
random_state = 2016          # single seed reused everywhere for reproducibility
num_iter = 25                # NOTE(review): unused below -- leftover from a
init_points = 5              # bayesian-optimization template; confirm before deleting

# Booster parameters shared by the cross-validation baseline.
params = {
    'eta': 0.1,
    'silent': 1,
    'eval_metric': 'mae',
    'verbose_eval': True,
    'seed': random_state
}

# Baseline cross-validation with early stopping to find a sensible number
# of boosting rounds.  early_stopping_rounds= is the stable API; the
# xgb.callback.early_stop factory was removed in xgboost >= 1.3.
cv_results = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                    seed=random_state,
                    early_stopping_rounds=500)

# xgb.cv returns the per-round evaluation history (a DataFrame), not an
# estimator, so it cannot be passed to RandomizedSearchCV.  Use the
# scikit-learn wrapper instead, capped at the early-stopped round count.
estimator = xgb.XGBRegressor(learning_rate=params['eta'],
                             n_estimators=len(cv_results),
                             seed=random_state)

rsearch = RandomizedSearchCV(estimator=estimator,
                             param_distributions=param_grid,
                             n_iter=10,
                             scoring='neg_mean_absolute_error',
                             random_state=random_state)
# The original fit on an undefined `dataset`; the features/target built
# above are X and y.
rsearch.fit(X, y)
print(rsearch)

# Summarize the results of the random parameter search.  best_params_ is
# the documented way to read the winning configuration (the original
# best_estimator_.alpha attribute is not guaranteed to exist).
print(rsearch.best_score_)
print(rsearch.best_params_)