BayesSearchCV implements a "fit" and a "score" method. It also implements "predict", "predict_proba", "decision_function", "transform" and "inverse_transform" if they are implemented in the estimator used.
The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings.
In contrast to GridSearchCV, not all parameter values are tried out; instead, a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter (remember to set n_iter!).
In [16]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
In [2]:
%config InlineBackend.figure_format = 'retina'
In [3]:
# Run-size knobs. The trailing comments are presumably the values for a
# full-scale run -- confirm before changing.
ITERATIONS = 10  # 1000
TRAINING_SIZE = 100000  # 20000000
TEST_SIZE = 25000

# Read the first TRAINING_SIZE rows of the training sample.
X = pd.read_csv(
    './data/train_sample.csv',
    nrows=TRAINING_SIZE,
    parse_dates=['click_time'],
)

# Separate the target column from the features, then discard the
# timestamp columns so only the remaining columns are used as features.
y = X.pop('is_attributed')
X = X.drop(columns=['click_time', 'attributed_time'])
In [4]:
# Classifier
# Bayesian hyper-parameter search over an XGBoost binary classifier,
# scored by ROC-AUC on a shuffled 3-fold stratified split.
bayes_cv_tuner = BayesSearchCV(
    estimator=xgb.XGBClassifier(
        n_jobs=1,  # single-threaded fits; the search itself runs with n_jobs=3 below
        objective='binary:logistic',
        eval_metric='auc',
        # NOTE(review): newer XGBoost releases use `verbosity` instead of
        # `silent` -- confirm against the installed version.
        silent=1,
        tree_method='approx'
    ),
    search_spaces={
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        # This key used to appear twice in the dict ((0, 10) and (0, 5)).
        # Python keeps the last value for a duplicated key, so (0, 5) is the
        # range that was actually searched; it is now listed only once.
        'min_child_weight': (0, 5),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },
    scoring='roc_auc',
    cv=StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs=3,
    n_iter=ITERATIONS,
    verbose=0,
    refit=True,
    random_state=42
)
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Reads the module-level ``bayes_cv_tuner``, prints how many models have
    been evaluated so far together with the best score and best parameters,
    and writes all results so far to ``<EstimatorClassName>_cv_results.csv``.

    Args:
        optim_result: Value passed in by the search when it invokes the
            callback; it is not used here.
    """
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress: one row of cv_results_ per evaluated model
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results (the file is overwritten on every call)
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name + "_cv_results.csv")
In [5]:
xgb_result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)
In [6]:
xgb_result.best_score_
Out[6]:
In [7]:
xgb_result.best_params_
Out[7]:
In [8]:
xgb_result.best_estimator_
Out[8]:
In [9]:
new_model = xgb_result.best_estimator_
In [10]:
xgb.plot_importance(new_model);
In [11]:
xgb_result.cv_results_
Out[11]:
In [12]:
# Bayesian hyper-parameter search over a LightGBM binary classifier,
# scored by ROC-AUC on a shuffled 3-fold stratified split.
bayes_cv_tuner = BayesSearchCV(
    # The objective is 'binary' and the scorer is 'roc_auc', so this must be
    # the classifier wrapper: the scorer needs predict_proba /
    # decision_function, which LGBMRegressor does not provide.
    estimator=lgb.LGBMClassifier(
        objective='binary',
        metric='auc',
        n_jobs=1,  # single-threaded fits; the search itself runs with n_jobs=3 below
        verbose=0
    ),
    search_spaces={
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        # LightGBM requires num_leaves > 1, so the lower bound is 2, not 1.
        'num_leaves': (2, 100),
        'max_depth': (0, 50),
        'min_child_samples': (0, 50),
        'max_bin': (100, 1000),
        'subsample': (0.01, 1.0, 'uniform'),
        'subsample_freq': (0, 10),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'min_child_weight': (0, 10),
        'subsample_for_bin': (100000, 500000),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'scale_pos_weight': (1e-6, 500, 'log-uniform'),
        'n_estimators': (50, 100),
    },
    scoring='roc_auc',
    cv=StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs=3,
    n_iter=ITERATIONS,
    verbose=0,
    refit=True,
    random_state=42
)

# Fit the model; status_print is called back during the search
lgbm_result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)
In [13]:
lgbm_result.best_params_
Out[13]:
In [14]:
lgbm_result.estimator
Out[14]:
In [21]:
# Bayesian hyper-parameter search over a LightGBM regressor, scored by
# negative mean squared error on a shuffled 5-fold split.
bayes_cv_tuner = BayesSearchCV(
    estimator=lgb.LGBMRegressor(
        objective='regression',
        boosting_type='gbdt',
        subsample=0.6143  # fixed here, so it is not part of the search space
    ),
    search_spaces={
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'num_leaves': (10, 100),
        'max_depth': (0, 50),
        'min_child_samples': (0, 50),
        'max_bin': (100, 1000),
        'subsample_freq': (0, 10),
        'min_child_weight': (0, 10),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        # 'scale_pos_weight' is intentionally not searched: LightGBM only
        # uses it for binary/multiclassova objectives, so with
        # objective='regression' it was a dimension with no effect.
        'n_estimators': (50, 150),
    },
    scoring='neg_mean_squared_error',
    cv=KFold(
        n_splits=5,
        shuffle=True,
        random_state=42
    ),
    n_jobs=1,
    n_iter=100,
    verbose=0,
    refit=True,
    random_state=42
)
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Reads the module-level ``bayes_cv_tuner``, prints how many models have
    been evaluated so far together with the best score and best parameters,
    and writes all results so far to ``<EstimatorClassName>_cv_results.csv``.

    Args:
        optim_result: Value passed in by the search when it invokes the
            callback; it is not used here.
    """
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress. NOTE(review): the printed value is whatever
    # best_score_ holds for the tuner's scorer; with a negated-error scorer
    # the "MSE" shown would be negative -- confirm the label is as intended.
    print('Model #{}\nBest MSE: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results (the file is overwritten on every call)
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name + "_cv_results.csv")
# Fit the model
result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)
In [22]:
import sklearn
In [26]:
# Collect the names of all built-in scorers. Newer scikit-learn releases
# dropped the ``SCORERS`` dict in favour of ``get_scorer_names()``, so try
# the new API first and fall back to the old one on older installs.
try:
    keys = sklearn.metrics.get_scorer_names()
except AttributeError:
    keys = sklearn.metrics.SCORERS.keys()
In [32]:
for key in keys:
print(key)
In [ ]: