In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.datasets import fetch_covtype
from sklearn.metrics import log_loss
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["font.size"] = 16

In [2]:
# Load the Forest Covertype dataset; `target` holds integer class labels.
covtype = fetch_covtype()
y = covtype.target

# Class distribution (labels 1..7 per Out[2]); sort=False keeps label order
# instead of ordering by frequency. Classes 1 and 2 dominate the data.
pd.Series(y).value_counts(sort=False)


Out[2]:
1    211840
2    283301
3     35754
4      2747
5      9493
6     17367
7     20510
dtype: int64

In [3]:
# Multiclass to Binary: keep only the two majority classes (labels 1 and 2)
# and relabel them {1, 2} -> {0, 1} for binary:logistic.
mask = y < 3
X = covtype.data[mask]
y = y[mask] - 1
print(X.shape, y.mean())


(495141, 54) 0.572162272969

In [4]:
# Deterministically shuffle rows so the head/tail slices used below for
# train/validation are random samples. np.random.permutation(n) is
# arange(n) + shuffle under the hood, so the RNG stream is unchanged.
np.random.seed(1234)
perm = np.random.permutation(y.size)
X = X[perm]
y = y[perm]

In [5]:
# Train on the first N shuffled rows, validate on the last N.
N = 100000
feature_names = ['f{:02d}'.format(i) for i in range(X.shape[1])]

y_valid = y[-N:]
xgmat_train = xgb.DMatrix(X[:N], label=y[:N], feature_names=feature_names)
xgmat_valid = xgb.DMatrix(X[-N:], label=y_valid, feature_names=feature_names)
watchlist = [(xgmat_train, 'train'), (xgmat_valid, 'valid')]

In [6]:
# Hyperparameters for the baseline XGBoost binary classifier.
params_xgb = {'objective'  : 'binary:logistic',
              'eta'        : 0.2,          # learning rate
              'max_depth'  : 10,
              'eval_metric': 'logloss',    # reported for each watchlist entry
              #'seed'       : 2017,
              # NOTE(review): 'silent' was removed in XGBoost >= 1.0 in favor
              # of 'verbosity' — fine for the older version this ran on, but
              # confirm before re-running on a modern install.
              'silent':True}
n_rounds = 100  # number of boosting iterations

In [7]:
# Train the baseline model, capturing per-iteration logloss into evals_result
# for the learning-curve plot below.
print("Training ...")
evals_result = {}
t0 = time.time()
bst = xgb.train(params_xgb, xgmat_train, n_rounds, watchlist,
                verbose_eval=False, evals_result=evals_result)
elapsed = time.time() - t0
print("Done: %.1fs" % elapsed)


Training ...
Done: 6.7s

In [8]:
# Learning curves: train vs. validation logloss per boosting iteration.
df_score = pd.DataFrame({'train': evals_result['train']['logloss'],
                         'valid': evals_result['valid']['logloss']})
# Use the explicit axes interface rather than the pyplot state machine, and
# suppress the stray `<matplotlib.text.Text ...>` repr that leaked into the
# cell output before (Out[8]) with a trailing semicolon.
ax = df_score.plot(figsize=(13, 6))
ax.set_xlabel("Boosting iteration")
ax.set_ylabel("logloss");


Out[8]:
<matplotlib.text.Text at 0x7f1e718d7048>

In [9]:
# Final-iteration validation logloss of the baseline model — the reference
# every refreshed model below is compared against.
baseline = df_score['valid'].iloc[-1]

In [10]:
# Switch xgb.train into "refresh" mode: instead of growing new trees, it
# re-computes the statistics (and leaf values, via refresh_leaf) of the
# existing trees passed through xgb_model on a new data sample.
params_xgb['process_type'] = 'update'
params_xgb['updater'] = 'refresh'
params_xgb['refresh_leaf'] = True

In [11]:
# Bootstrap-sample size ratios (sample size = r * N) to sweep.
r_lst = [0.5, 0.8] + [2 ** k for k in range(4)]

In [12]:
# For each sample-size ratio r, refresh the baseline model's leaves on 10
# independent bootstrap samples of size r*N and collect validation predictions.
print("Refreshing ...")
n_repeats = 10                 # refresh runs per ratio (was a magic 10)
idx_base = np.arange(N)        # training-row indices to resample from
pr_dict = {}
for r in r_lst:
    t0 = time.time()
    pr_lst = []
    for _ in range(n_repeats):
        # Bootstrap sample (with replacement); dedicated name so we do not
        # shadow the module-level shuffle index from the earlier cell.
        idx_sample = np.random.choice(idx_base, int(r * N), replace=True)
        xgmat_sample = xgb.DMatrix(X[idx_sample], label=y[idx_sample],
                                   feature_names=feature_names)
        # process_type='update' + updater='refresh' (set above): this does NOT
        # grow trees — it re-estimates the existing trees of `bst` on the sample.
        bst_after = xgb.train(params_xgb, xgmat_sample, n_rounds, xgb_model=bst)
        pr_lst.append(bst_after.predict(xgmat_valid))
    pr_dict[r] = pr_lst
    print("r:%.1f, %.1fs" % (r, (time.time() - t0)))


Refreshing ...
r:0.5, 3.8s
r:0.8, 5.6s
r:1.0, 6.8s
r:2.0, 12.5s
r:4.0, 23.5s
r:8.0, 45.8s

In [13]:
# Score each refreshed model individually ('one_r') and as a growing ensemble
# average of the first i+1 models ('avg_r') against the validation labels.
n_repeats = 10  # must match the number of refresh runs per ratio above
df_scores = pd.DataFrame(index=range(n_repeats))
for r in r_lst:
    pr_lst = pr_dict[r]
    pr_avg = np.zeros(y_valid.size)   # running sum of predictions
    scores = []
    # Iterate the predictions directly instead of indexing via a magic
    # range(10) — robust if the number of runs ever changes.
    for i, pr in enumerate(pr_lst):
        pr_avg += pr
        scores.append({'one_%.1f' % r: log_loss(y_valid, pr),
                       'avg_%.1f' % r: log_loss(y_valid, pr_avg / (i + 1)),
                      })
    df_scores = df_scores.join(pd.DataFrame(scores))

In [14]:
# Reference: validation logloss of the original (un-refreshed) model.
baseline


Out[14]:
0.17676600000000001

In [15]:
# Final (10-model) ensemble-average logloss per ratio — the 'avg_*' columns
# (every other column, as shown in the output) selected by name.
df_scores.filter(like='avg_').tail(1)


Out[15]:
avg_0.5 avg_0.8 avg_1.0 avg_2.0 avg_4.0 avg_8.0
9 0.186526 0.180568 0.178471 0.174201 0.172343 0.17126

In [16]:
# Distribution of single-model ('one_*') logloss per ratio, with the baseline
# model's validation logloss as a red reference line.
ax = df_scores.iloc[:, 1::2].plot(kind='box', figsize=(12, 5))
ax.axhline(y=baseline, color='red')
# Label the y-axis so the figure stands alone; the trailing semicolon
# suppresses the `<matplotlib.lines.Line2D ...>` repr that leaked into Out[16].
ax.set_ylabel("logloss");


Out[16]:
<matplotlib.lines.Line2D at 0x7f1e80d02f28>

In [ ]: