In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import fetch_covtype
from sklearn.metrics import log_loss
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["font.size"] = 16
In [2]:
# Load the Forest Covertype dataset and look at the class balance of the
# 7-class target (classes are labeled 1..7).
covtype = fetch_covtype()
y = covtype.target
target_counts = pd.Series(y)
target_counts.value_counts(sort=False)
Out[2]:
In [3]:
# Reduce the multiclass problem to a binary one: keep only the two largest
# cover types (labels 1 and 2) and remap them to {0, 1}.
keep = y < 3
X = covtype.data[keep]
y = y[keep] - 1
print(X.shape, y.mean())
In [4]:
# Shuffle rows with a fixed seed so the train/validation split below is
# reproducible.  np.random.permutation(n) is equivalent to shuffling
# np.arange(n) in place with the same RNG state.
np.random.seed(1234)
idx = np.random.permutation(y.size)
X = X[idx]
y = y[idx]
In [5]:
# Take the first N shuffled rows for training and the last N for validation,
# and wrap them as XGBoost DMatrix objects with synthetic feature names.
N = 100000
feature_names = ['f{:02d}'.format(i) for i in range(X.shape[1])]
xgmat_train = xgb.DMatrix(X[:N], label=y[:N], feature_names=feature_names)
xgmat_valid = xgb.DMatrix(X[-N:], label=y[-N:], feature_names=feature_names)
y_valid = y[-N:]
watchlist = [(xgmat_train, 'train'), (xgmat_valid, 'valid')]
In [6]:
# Baseline training configuration.
params_xgb = {
    'objective': 'binary:logistic',  # binary classification, probability output
    'eta': 0.2,                      # learning rate
    'max_depth': 10,
    'eval_metric': 'logloss',
    # 'seed': 2017,                  # left unset in the original run
    'silent': True,                  # NOTE(review): deprecated in XGBoost >= 1.0; 'verbosity' replaces it
}
n_rounds = 100
In [7]:
# Train the baseline booster on the full training matrix, collecting the
# per-round train/valid logloss into evals_result for plotting below.
evals_result = {}
t0 = time.time()
print("Training ...")
bst = xgb.train(
    params_xgb, xgmat_train, n_rounds, watchlist,
    evals_result=evals_result, verbose_eval=False,
)
print("Done: %.1fs" % (time.time() - t0))
In [8]:
# Learning curves: training vs. validation logloss per boosting round.
df_score = pd.DataFrame({
    'train': evals_result['train']['logloss'],
    'valid': evals_result['valid']['logloss'],
})
ax = df_score.plot(figsize=(13, 6))
ax.set_ylabel("logloss")
ax.set_xlabel("Boosting iteration")
Out[8]:
In [9]:
# Validation logloss at the final boosting round — the reference score that
# the refreshed models are compared against.
baseline = df_score['valid'].iloc[-1]
In [10]:
# Switch training into refresh mode: keep the existing tree structures and
# recompute node statistics (including leaf values) from the new data.
params_xgb['process_type'] = 'update'
params_xgb['updater'] = 'refresh'
params_xgb['refresh_leaf'] = True
In [11]:
# Bootstrap-sample size multipliers (relative to the training size N) to try.
r_lst = [0.5, 0.8, 1, 2, 4, 8]
In [12]:
# For each size multiplier r, refresh the trained booster 10 times on
# independent bootstrap samples (sampling with replacement from the training
# rows) and keep each refreshed model's validation predictions.
# NOTE(review): no RNG seed is set here, so the bootstrap draws depend on the
# kernel's RNG state at this point — confirm if exact reproducibility matters.
print("Refreshing ...")
idx_base = np.arange(N)
pr_dict = {}
for r in r_lst:
    t0 = time.time()
    pr_lst = []
    for _ in range(10):
        sample_rows = np.random.choice(idx_base, int(r * N), replace=True)
        xgmat_sample = xgb.DMatrix(X[sample_rows], label=y[sample_rows],
                                   feature_names=feature_names)
        bst_after = xgb.train(params_xgb, xgmat_sample, n_rounds, xgb_model=bst)
        pr_lst.append(bst_after.predict(xgmat_valid))
    pr_dict[r] = pr_lst
    print("r:%.1f, %.1fs" % (r, (time.time() - t0)))
In [13]:
# Score every refreshed model: for each r, record the logloss of each single
# refreshed model ('one_r') and of the running average of the first i+1
# prediction vectors ('avg_r').  Columns for each r are joined side by side.
df_scores = pd.DataFrame(index=range(10))
for r in r_lst:
    pr_lst = pr_dict[r]
    pr_avg = np.zeros(y_valid.size)
    scores = []
    for i, pr in enumerate(pr_lst):
        pr_avg += pr
        scores.append({
            'one_%.1f' % r: log_loss(y_valid, pr),
            'avg_%.1f' % r: log_loss(y_valid, pr_avg / (i + 1)),
        })
    df_scores = df_scores.join(pd.DataFrame(scores))
In [14]:
# Show the baseline (full-data model) validation logloss for comparison.
baseline
Out[14]:
In [15]:
# Last row of the single-model ('one_*') columns — these sit at the even
# column positions of df_scores after the interleaved joins.
df_scores.iloc[:, ::2].tail(1)
Out[15]:
In [16]:
# Box plot of the averaged-model ('avg_*') logloss distributions, with the
# full-data baseline drawn as a red horizontal reference line.
ax = df_scores.iloc[:, 1::2].plot(kind='box', figsize=(12, 5))
ax.axhline(y=baseline, color='red')
Out[16]:
In [ ]: