In [1]:
import os
import xgboost as xgb
import pandas as pd
import numpy as np
from utils import encode_numeric_zscore_list, encode_numeric_zscore_all, to_xy, encode_text_index_list, encode_numeric_log_all
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn import datasets
from sigopt_sklearn.search import SigOptSearchCV
In [10]:
path = "./data/allstate"
inputFilePath = os.path.join(path, "train.csv.zip")
df = pd.read_csv(inputFilePath, compression="zip", header=0, na_values=['NULL'])

# Shuffle rows so the cross-validation folds below are not ordered by the
# original file order, then restore a clean 0..n-1 index.
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# 'id' is a row identifier with no predictive value.
df.drop('id', axis=1, inplace=True)

# Uncomment to iterate quickly on a 1% subsample.
#df = df.sample(frac=0.01)

# Label-encode the 116 categorical columns (cat1 .. cat116).
encode_text_index_list(df, ['cat{}'.format(i) for i in range(1, 117)])

# Z-score the 14 continuous columns (cont1 .. cont14).
encode_numeric_zscore_list(df, ['cont{}'.format(i) for i in range(1, 15)])

# BUG FIX: fillna() returns a new DataFrame; the original call discarded the
# result, so NaNs were never actually replaced. Assign it back.
df = df.fillna(0)

# Split into predictors X and the regression target Y ("loss").
X, Y = to_xy(df, "loss")
In [31]:
# SECURITY FIX: never hardcode an API token in a notebook — the original cell
# committed a live SigOpt token. Read it from the environment instead.
# Find your SigOpt client token here: https://sigopt.com/user/profile
client_token = os.environ.get("SIGOPT_CLIENT_TOKEN")

# Search bounds [min, max] for each XGBoost hyperparameter SigOpt will tune.
xgb_params = {
    'learning_rate': [0.01, 0.5],
    'n_estimators': [10, 70],
    'max_depth': [3, 50],
    'min_child_weight': [1, 15],
    'gamma': [0, 1.0],
    'subsample': [0.1, 1.0],
    'colsample_bytree': [0.1, 1.0],
    'max_delta_step': [1, 15],
    'colsample_bylevel': [0.1, 1.0],
    # Intentionally excluded from the search for now:
    #'reg_lambda': [1, 5],   # (originally misspelled 'lamda')
    #'reg_alpha': [1, 5],
    'scale_pos_weight': [0, 5],
    #'objective': 'reg:linear',
    #'booster': ['gblinear', 'gbtree'],
    #'eval_metric': 'mae',
    #'tree_method': ['exact', 'approx'],
}
In [ ]:
# BUG FIX: the original bound the estimator to the name `xgb`, shadowing the
# `import xgboost as xgb` module alias from the imports cell. Use a distinct
# name so the module stays accessible.
xgb_model = XGBRegressor()

# Bayesian hyperparameter search over xgb_params via the SigOpt service.
# NOTE(review): n_jobs=25 runs 25 concurrent evaluations and n_iter=700 is a
# long search — confirm these against your SigOpt plan limits before running.
clf = SigOptSearchCV(xgb_model, xgb_params, cv=5,
                     client_token=client_token, n_jobs=25, n_iter=700, verbose=1)
clf.fit(X, Y)
In [2]:
# Inspect the tunable hyperparameter names exposed by XGBRegressor — useful
# for choosing valid keys for the xgb_params search space.
# NOTE(review): execution count In[2] is out of order relative to the cells
# above; restart the kernel and Run All to verify the notebook is reproducible.
a = XGBRegressor()
a.get_params().keys()
Out[2]:
In [ ]: