In [1]:
import os
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sigopt_sklearn.search import SigOptSearchCV
from utils import encode_numeric_zscore_list, encode_text_index_list, to_xy


/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [10]:
path = "./data/allstate"
inputFilePath = os.path.join(path, "train.csv.zip")
df = pd.read_csv(inputFilePath, compression="zip", header=0, na_values=['NULL'])
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)
df.drop('id', axis=1, inplace=True)
#df = df.sample(frac=0.01)

# encode the categorical columns (cat1..cat116) as integer indexes
encode_text_index_list(df, ['cat{}'.format(i) for i in range(1, 117)])

# z-score the continuous predictors (cont1..cont14)
encode_numeric_zscore_list(df, ['cont{}'.format(i) for i in range(1, 15)])

# fill any remaining NaNs with 0 (fillna returns a copy unless inplace=True)
df.fillna(0, inplace=True)
# rows with |z-score| > 2 could also be discarded here; see the sketch below

# create X (predictors) and Y (expected outcome)
X, Y = to_xy(df, "loss")


float64
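
A minimal sketch of the outlier filter mentioned above, assuming encode_numeric_zscore_list z-scored the cont columns in place; it would have to run before the to_xy call:

cont_cols = ['cont{}'.format(i) for i in range(1, 15)]
# keep only rows where every z-scored predictor lies within 2 standard deviations
df = df[(df[cont_cols].abs() <= 2).all(axis=1)].reset_index(drop=True)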

In [31]:
# find your SigOpt client token here : https://sigopt.com/user/profile
client_token = "UAJKINHBEGLJVIYYMGWANLUPRORPFRLTJMESGZKNPTHKOSIW"

# each [low, high] pair defines the bounds of a SigOpt search dimension
xgb_params = {
    'learning_rate':     [0.01, 0.5],
    'n_estimators':      [10, 70],
    'max_depth':         [3, 50],
    'min_child_weight':  [1, 15],
    'gamma':             [0, 1.0],
    'subsample':         [0.1, 1.0],
    'colsample_bytree':  [0.1, 1.0],
    'max_delta_step':    [1, 15],
    'colsample_bylevel': [0.1, 1.0],
    #'reg_lambda': [1, 5],
    #'reg_alpha': [1, 5],
    'scale_pos_weight':  [0, 5],   # mainly relevant for imbalanced classification
    #'objective': 'reg:linear',
    #'booster': ['gblinear', 'gbtree'],
    #'eval_metric': 'mae',
    #'tree_method': ['exact', 'approx']
}
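
Since the pairs are domain bounds rather than grid points, the search samples anywhere within each range, as the continuously varying values in the log below bear out. Without a SigOpt token, a rough stand-in (a sketch, not part of the original run) is scikit-learn's RandomizedSearchCV with scipy distributions over the same ranges:

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

# uniform(loc, scale) samples from [loc, loc + scale]; randint excludes the upper bound
param_dists = {
    'learning_rate':    uniform(0.01, 0.49),
    'n_estimators':     randint(10, 71),
    'max_depth':        randint(3, 51),
    'min_child_weight': randint(1, 16),
    'subsample':        uniform(0.1, 0.9),
}
rnd_search = RandomizedSearchCV(XGBRegressor(), param_dists,
                                n_iter=50, cv=5, n_jobs=-1, verbose=1)
# rnd_search.fit(X, Y)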

In [ ]:
xgb_model = XGBRegressor()

# 700 SigOpt-suggested configurations, each scored with 5-fold CV, 25 fits in parallel
clf = SigOptSearchCV(xgb_model, xgb_params, cv=5,
                     client_token=client_token, n_jobs=25, n_iter=700, verbose=1)

clf.fit(X, Y)


Creating SigOpt experiment:  XGBRegressor (sklearn)
Experiment progress available at : https://sigopt.com/experiment/10601
Evaluating params :  [{'scale_pos_weight': 0, 'learning_rate': 0.45767903718668396, 'gamma': 0.6490173787833018, 'max_depth': 31, 'n_estimators': 18, 'colsample_bylevel': 0.6177293232669435, 'colsample_bytree': 0.8858130111489914, 'subsample': 0.2707613887264699, 'min_child_weight': 4, 'max_delta_step': 3}]
[Parallel(n_jobs=25)]: Done   5 out of   5 | elapsed:    6.9s finished
Evaluating params :  [{'scale_pos_weight': 2, 'learning_rate': 0.3582230447850348, 'gamma': 0.9605930722762156, 'max_depth': 36, 'n_estimators': 29, 'colsample_bylevel': 0.5116534648370143, 'colsample_bytree': 0.27101314612768995, 'subsample': 0.7553763823390154, 'min_child_weight': 7, 'max_delta_step': 10}]
[Parallel(n_jobs=25)]: Done   5 out of   5 | elapsed:    8.6s finished
Evaluating params :  [{'scale_pos_weight': 3, 'learning_rate': 0.29423487603930776, 'gamma': 0.8782106250015249, 'max_depth': 43, 'n_estimators': 45, 'colsample_bylevel': 0.9658504117574152, 'colsample_bytree': 0.12441877638838487, 'subsample': 0.7126723857099755, 'min_child_weight': 8, 'max_delta_step': 9}]
[Parallel(n_jobs=25)]: Done   5 out of   5 | elapsed:   11.6s finished
Evaluating params :  [{'scale_pos_weight': 4, 'learning_rate': 0.12551043341859588, 'gamma': 0.4534513401622303, 'max_depth': 21, 'n_estimators': 70, 'colsample_bylevel': 0.26380232316008223, 'colsample_bytree': 0.7580178180787949, 'subsample': 0.44330843230985695, 'min_child_weight': 1, 'max_delta_step': 4}]
[Parallel(n_jobs=25)]: Done   5 out of   5 | elapsed:   17.8s finished
Evaluating params :  [{'scale_pos_weight': 5, 'learning_rate': 0.4140696140357177, 'gamma': 0.26045170718465277, 'max_depth': 4, 'n_estimators': 23, 'colsample_bylevel': 0.4017021198898454, 'colsample_bytree': 0.6685330641197106, 'subsample': 0.1791194277068413, 'min_child_weight': 15, 'max_delta_step': 2}]
[Parallel(n_jobs=25)]: Done   5 out of   5 | elapsed:    7.2s finished
Evaluating params :  [{'scale_pos_weight': 1, 'learning_rate': 0.19384443029107729, 'gamma': 0.02815238988760721, 'max_depth': 22, 'n_estimators': 38, 'colsample_bylevel': 0.28400657490767117, 'colsample_bytree': 0.4747380581558507, 'subsample': 0.3463395498648917, 'min_child_weight': 13, 'max_delta_step': 11}]
[Parallel(n_jobs=25)]: Done   5 out of   5 | elapsed:    9.1s finished
Evaluating params :  [{'scale_pos_weight': 1, 'learning_rate': 0.32441412844702994, 'gamma': 0.12522567885894398, 'max_depth': 33, 'n_estimators': 50, 'colsample_bylevel': 0.16303890568522914, 'colsample_bytree': 0.36205408387101884, 'subsample': 0.9637354448274572, 'min_child_weight': 10, 'max_delta_step': 7}]
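
Once the search finishes, results come off the fitted object; sigopt_sklearn mirrors scikit-learn's *SearchCV interface, so these attribute names are assumed from that convention:

# best hyperparameters found and the model refit with them
print(clf.best_params_)
best_model = clf.best_estimator_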

In [2]:
# list the tunable hyperparameters exposed by the sklearn wrapper
a = XGBRegressor()
a.get_params().keys()


Out[2]:
dict_keys(['max_delta_step', 'nthread', 'reg_alpha', 'n_estimators', 'min_child_weight', 'colsample_bytree', 'objective', 'colsample_bylevel', 'subsample', 'seed', 'learning_rate', 'silent', 'base_score', 'missing', 'gamma', 'max_depth', 'reg_lambda', 'scale_pos_weight'])
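
Only keys from this set are valid search dimensions; note that the wrapper spells the regularization terms reg_lambda and reg_alpha rather than XGBoost's native lambda and alpha. A quick sanity check for typos in the search space:

unknown = set(xgb_params) - set(XGBRegressor().get_params())
assert not unknown, "unrecognized parameters: {}".format(unknown)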
