In [1]:
# Core numerics / tabular data handling.
import numpy as np
import pandas as pd
import time
# Covertype dataset loader (downloads on first use, then cached by sklearn).
from sklearn.datasets import fetch_covtype
import xgboost as xgb
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Larger default font for all figures below.
plt.rcParams["font.size"] = 16
In [2]:
# Fetch the UCI Covertype dataset (network I/O on first run, cached after).
covtype = fetch_covtype()
y = covtype.target
# Show the class distribution; labels are the integer cover types, unsorted
# so the classes appear in label order rather than by frequency.
pd.Series(y).value_counts(sort=False)
Out[2]:
In [3]:
# Reduce the multiclass problem to a binary one: keep only cover types 1
# and 2, then shift the labels to {0, 1} for binary:logistic.
binary_mask = y < 3
X = covtype.data[binary_mask]
y = y[binary_mask] - 1
print(X.shape, y.mean())
In [4]:
# Shuffle samples and labels together with a fixed seed so the train/valid
# split below is reproducible. Under the legacy RandomState API,
# permutation(n) draws from the RNG exactly as shuffling arange(n) does,
# so the resulting order is identical to the original arange+shuffle code.
np.random.seed(1234)
idx = np.random.permutation(y.size)
X = X[idx]
y = y[idx]
In [5]:
# Label the columns f00, f01, ... so importance plots are readable.
feature_names = ['f{:02d}'.format(i) for i in range(X.shape[1])]
# First 100k rows for training, last 100k for validation
# (assumes the filtered set has >200k rows so the two do not overlap — verify).
xgmat_train = xgb.DMatrix(X[:100000], label=y[:100000], feature_names=feature_names)
xgmat_valid = xgb.DMatrix(X[-100000:], label=y[-100000:], feature_names=feature_names)
watchlist = [(xgmat_train, 'train'), (xgmat_valid, 'valid')]
In [6]:
# XGBoost configuration for the binary task.
# NOTE(review): 'silent' is deprecated in modern XGBoost (superseded by
# 'verbosity'); kept unchanged here to reproduce the original run.
params_xgb = dict(
    objective='binary:logistic',  # predict P(class == 1)
    eta=0.2,                      # learning rate
    max_depth=10,
    eval_metric='logloss',
    seed=2017,
    silent=True,
)
n_rounds = 100
In [7]:
# Train the base model; per-round train/valid logloss is collected into
# evals_result for plotting, while console output stays quiet.
evals_result = {}
start = time.time()
print("Training ...")
bst = xgb.train(params_xgb, xgmat_train, n_rounds, watchlist,
                evals_result=evals_result, verbose_eval=False)
print("Done: %.1fs" % (time.time() - start))
In [8]:
# Learning curves: per-iteration logloss on the train and validation sets.
df_score = pd.DataFrame({split: evals_result[split]['logloss']
                         for split in ('train', 'valid')})
ax = df_score.plot(figsize=(13, 6))
ax.set_ylabel("logloss")
ax.set_xlabel("Boosting iteration")
Out[8]:
In [9]:
# Switch to "refresh" mode: later xgb.train() calls will not grow new trees
# but recompute the statistics of the existing trees on the supplied data;
# refresh_leaf=False keeps the leaf values fixed as well (see XGBoost docs).
params_xgb['process_type'] = 'update'
params_xgb['updater'] = 'refresh'
params_xgb['refresh_leaf'] = False
In [10]:
# Re-run training on the VALIDATION data with process_type='update' (set
# above): the existing trees of `bst` are kept and only their statistics
# are refreshed, giving an out-of-bag view of the same model.
start = time.time()
print("Refreshing ...")
bst_after = xgb.train(params_xgb, xgmat_valid, n_rounds, xgb_model=bst)
print("Done: %.1fs" % (time.time() - start))
In [11]:
# Per-feature gain importance: in-sample model ('train') vs. the refreshed
# model whose statistics were recomputed on held-out data ('OOB').
# The Series are aligned to the full feature index by the constructor.
imp_2 = pd.DataFrame(
    {'train': pd.Series(bst.get_score(importance_type='gain')),
     'OOB': pd.Series(bst_after.get_score(importance_type='gain'))},
    index=xgmat_train.feature_names,
)
imp_2 = imp_2.fillna(0)                 # features never split on get 0 gain
imp_2 = imp_2 / (imp_2.sum(0) * .01)    # normalise each column to percent
imp_2 = imp_2.sort_values('train')
print(imp_2.tail())
In [12]:
# Horizontal bars for the 8 features with the highest in-sample gain.
imp_2.tail(8).plot.barh(title="Feature importance", figsize=(13, 6))
Out[12]:
In [13]:
# Sensitivity of the refreshed importances to L2 regularisation: refresh
# the same base model once per 'lambda' value and keep every result.
key = 'lambda'
val_lst = [0, 1, 2, 4, 8]
start = time.time()
bst_lst = []
print("Refreshing ...")
for val in val_lst:
    params_xgb[key] = val  # NOTE: mutates the shared params dict in place
    bst_lst.append(xgb.train(params_xgb, xgmat_valid, n_rounds, xgb_model=bst))
    print("%s:%d done: %.1fs" % (key, val, (time.time() - start)))
In [14]:
# Gain importance of the original model next to each refreshed variant,
# one column per lambda value; Series are aligned to the feature index.
imp_s_cols = {'train': pd.Series(bst.get_score(importance_type='gain'))}
for val, bst_after in zip(val_lst, bst_lst):
    imp_s_cols['%s_%d' % (key, val)] = pd.Series(bst_after.get_score(importance_type='gain'))
imp_s = pd.DataFrame(imp_s_cols, index=xgmat_train.feature_names)
imp_s = imp_s.fillna(0)                 # unused features get 0 gain
imp_s = imp_s / (imp_s.sum(0) * .01)    # normalise each column to percent
imp_s = imp_s.sort_values('train')
print(imp_s.tail())
In [15]:
# Top-5 features: compare importance across the lambda sweep.
imp_s.tail(5).plot.barh(title="Feature importance", figsize=(13, 6))
Out[15]: