In [1]:
import time
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from xgboost.callback import _get_callback_context
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["font.size"] = 14
In [2]:
# Adapted from reset_learning_rate(learning_rate) [https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/callback.py]
def reset_parameters(param_name, param_values):
    """Reset a parameter value at the start of each boosting round.

    Parameters
    ----------
    param_name : str
        Name of the booster parameter to reset (e.g. 'eta', 'lambda').
    param_values : list or function
        List of parameter values for each boosting round,
        or a customized function that calculates the value in terms of
        the current round and the total number of boosting rounds
        (e.g. yields learning rate decay)
        - list l: value = l[boosting round]
        - function f: value = f(boosting round, num_boost_round)

    Returns
    -------
    callback : function
        The requested callback function.
    """
    def get_param_value(i, n, param_values):
        """Helper providing the parameter value for round i of n."""
        if isinstance(param_values, (list, np.ndarray)):
            if len(param_values) != n:
                raise ValueError("Length of list 'param_values' has to equal 'num_boost_round'.")
            new_param_value = param_values[i]
        else:
            new_param_value = param_values(i, n)
        return new_param_value

    def callback(env):
        """Internal function: set the parameter on the booster(s) before each iteration."""
        context = _get_callback_context(env)
        if context == 'train':
            bst, i, n = env.model, env.iteration, env.end_iteration
            bst.set_param(param_name, get_param_value(i, n, param_values))
        elif context == 'cv':
            i, n = env.iteration, env.end_iteration
            for cvpack in env.cvfolds:
                bst = cvpack.bst
                bst.set_param(param_name, get_param_value(i, n, param_values))

    callback.before_iteration = True
    return callback
def get_leaf_values(tree_str):
    # To find 'leaf=0.123\n'
    prog = re.compile(r"(?<=leaf\=)(.+)\n")
    result = [float(rval) for rval in prog.findall(tree_str)]
    return np.array(result)

def get_all_leaves(bst):
    dmp = bst.get_dump()
    return [get_leaf_values(tree) for tree in dmp]
def experiment(X_train, y_train, X_valid, y_valid,
               n_rounds, params_xgb,
               param_name=None, param_values=None, lowest_score=0.0001):
    if param_name is None:
        callbacks = None
    else:
        callbacks = [reset_parameters(param_name, param_values)]
    xgmat_train = xgb.DMatrix(X_train, label=y_train)
    xgmat_valid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(xgmat_train, 'train'), (xgmat_valid, 'valid')]
    evals_result = {}
    t0 = time.time()
    bst = xgb.train(params_xgb, xgmat_train, n_rounds, watchlist,
                    callbacks=callbacks,
                    early_stopping_rounds=50,
                    evals_result=evals_result, verbose_eval=False)
    ntree = len(evals_result['valid']['logloss'])
    df_train_scores = pd.DataFrame({'train_loss': evals_result['train']['logloss']},
                                   index=pd.Index(range(1, ntree+1), name='Boosting iteration'))
    df_valid_scores = pd.DataFrame({'valid_loss': evals_result['valid']['logloss']},
                                   index=pd.Index(range(1, ntree+1), name='Boosting iteration'))
    leaves_lst = get_all_leaves(bst)[:ntree]
    df_leaf_cnts = pd.DataFrame({'leaf_cnts': [len(leaves) for leaves in leaves_lst]},
                                index=pd.Index(range(1, ntree+1), name='Boosting iteration'))
    df_w_L2 = pd.DataFrame({'w_L2': [np.sqrt(np.sum(leaves**2)) for leaves in leaves_lst]},
                           index=pd.Index(range(1, ntree+1), name='Boosting iteration'))
    sc_best = evals_result['valid']['logloss'][bst.best_iteration]
    print("valid_loss:%.4f, ntree:%d, %.1fs" %
          (sc_best,
           bst.best_ntree_limit,
           (time.time() - t0)))
    fig, axes = plt.subplots(3, sharex=True, figsize=(13, 9))
    #df_train_scores.plot(ax=axes[0], logy=True)
    (df_valid_scores - sc_best + lowest_score).plot(ax=axes[0], logy=True)
    df_leaf_cnts.plot(ax=axes[1])
    df_w_L2.plot(ax=axes[2])
    for ax in axes:
        #print(ax.get_xlim())
        ax.axvline(x=100, ymin=0, ymax=0.5, color='red')
        ax.axvline(x=bst.best_ntree_limit, ymin=0.25, ymax=0.5, color='blue')
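The experiment cells below schedule parameters with explicit per-round value lists. reset_parameters also accepts a function of the current round and the total number of rounds; the sketch below is a hypothetical illustration of that form (not run in this notebook).

# Hypothetical sketch: function-form schedule for reset_parameters (not run here).
# eta decays geometrically from 0.1 down to roughly 0.01 over the boosting rounds.
def eta_decay(i, n):
    return 0.1 * (0.01 / 0.1) ** (i / max(n - 1, 1))

# experiment(X_train, y_train, X_valid, y_valid, n_rounds,
#            params_xgb, 'eta', eta_decay)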
In [3]:
n_train = 10 ** 6
n_valid = n_train // 4
n_all = n_train + n_valid
n_classes = 2
n_clusters_per_class = 8
n_features = 32
n_informative = n_redundant = n_features // 4
In [4]:
X, y = make_classification(n_samples=n_all,
                           n_classes=n_classes,
                           n_features=n_features,
                           n_informative=n_informative,
                           n_redundant=n_redundant,
                           n_clusters_per_class=n_clusters_per_class,
                           shuffle=True,
                           random_state=987)
In [5]:
X_train = X[:n_train]
y_train = y[:n_train]
X_valid = X[n_train:]
y_valid = y[n_train:]
In [6]:
params_xgb = {'objective'        : 'binary:logistic',
              'eval_metric'      : 'logloss',
              'eta'              : 0.1, #default=0.3
              'max_depth'        : 20,  #default=6
              'lambda'           : 1,   #default
              'min_child_weight' : 1,   #default
              'silent'           : True,
}
n_rounds = 1000
In [7]:
experiment(X_train, y_train, X_valid, y_valid, n_rounds,
           params_xgb)
In [8]:
param_name = 'lambda'
param_values = np.linspace(1, 10, n_rounds)
experiment(X_train, y_train, X_valid, y_valid, n_rounds,
           params_xgb, param_name, param_values)
In [9]:
param_name = 'alpha'
param_values = np.linspace(0, 10, n_rounds)
experiment(X_train, y_train, X_valid, y_valid, n_rounds,
           params_xgb, param_name, param_values)
In [10]:
param_name = 'max_depth'
param_values = np.linspace(20, 10, n_rounds).astype(int)
experiment(X_train, y_train, X_valid, y_valid, n_rounds,
           params_xgb, param_name, param_values)
In [11]:
params_xgb.update({'booster'  : 'dart',
                   'rate_drop': 0.01,
                   })
experiment(X_train, y_train, X_valid, y_valid, n_rounds,
           params_xgb)
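A hedged note on DART: for the xgboost generation this notebook targets, the documentation warns that predict() on a DART booster applies dropout unless ntree_limit is set to a nonzero value. experiment() does not return the trained booster, but if it did, deterministic validation predictions could be obtained roughly as sketched below (bst and xgmat_valid are the names used inside experiment, hypothetical at this scope).

# Hypothetical sketch (names local to experiment above): avoid dropout at
# prediction time with a DART booster by passing a nonzero ntree_limit.
# pred = bst.predict(xgmat_valid, ntree_limit=bst.best_ntree_limit)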