# In[3]:  (Jupyter cell marker, commented out so the exported script parses)
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_rand
from sklearn.grid_search import RandomizedSearchCV  # NOTE(review): removed in sklearn>=0.20; superseded by the import below
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV  # modern location of RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
# In[ ]:  (Jupyter cell marker, commented out so the exported script parses)
# --- Allstate Claims Severity: XGBoost hyper-parameter search ---
# Loads the Kaggle training data, label-encodes categorical features,
# establishes a CV baseline with xgboost's native cv() + early stopping,
# then runs a randomized hyper-parameter search over an XGBRegressor.

path = "./data/allstate"
input_file_path = os.path.join(path, "train.csv.zip")
train = pd.read_csv(input_file_path, compression="zip", header=0)

# Label-encode every string-typed column in place.
# (The original looped with tqdm, which was never imported; a plain loop works.)
categorical_columns = train.select_dtypes(include=['object']).columns
for column in categorical_columns:
    train[column] = LabelEncoder().fit_transform(train[column])

y = train['loss']
X = train.drop(['loss', 'id'], axis=1)
xgtrain = xgb.DMatrix(X, label=y)

random_state = 2016
num_rounds = 100000  # generous upper bound; early stopping picks the real count

# Baseline: native 5-fold CV with early stopping to gauge MAE and find a
# sensible number of boosting rounds for the search below.
# (verbose_eval is a cv() keyword, not a learning parameter, so it is not in params.)
params = {
    'eta': 0.1,
    'silent': 1,
    'eval_metric': 'mae',
    'seed': random_state,
}
cv_results = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                    seed=random_state,
                    callbacks=[xgb.callback.early_stop(500)])
best_rounds = len(cv_results)
print("baseline CV MAE: {:.4f} after {} rounds".format(
    cv_results['test-mae-mean'].iloc[-1], best_rounds))

# RandomizedSearchCV needs a scikit-learn estimator, not the evaluation-history
# DataFrame returned by xgb.cv() -- use the sklearn wrapper instead.
# Continuous parameters are sampled from uniform(loc, scale) over
# [loc, loc + scale]; integer-valued ones from randint(low, high).
param_distributions = {
    'min_child_weight': sp_randint(1, 51),
    'colsample_bytree': sp_rand(0, 1),
    'max_depth': sp_randint(5, 51),
    'subsample': sp_rand(0, 1),
    'gamma': sp_rand(0, 50),
    'reg_alpha': sp_rand(0, 50),  # 'alpha' is spelled reg_alpha on the wrapper
}
estimator = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=best_rounds,
    seed=random_state,
)
rsearch = RandomizedSearchCV(estimator=estimator,
                             param_distributions=param_distributions,
                             n_iter=10,
                             scoring='neg_mean_absolute_error',  # align with the MAE baseline
                             random_state=random_state)
rsearch.fit(X, y)
print(rsearch)
# Summarize the results of the random parameter search.
print(rsearch.best_score_)
print(rsearch.best_estimator_.reg_alpha)