Tutorial: how to use xgboost for regression


In [1]:
import xgboost as xgb

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

import matplotlib.pyplot as plt

%matplotlib inline

In [55]:
FIGSIZE = (15, 8)

def ground_truth(x):
    """Ground truth -- function to approximate"""
    return x*np.sin(x) + 2 * np.sin(2 * x) + np.sin(3 * x)

def gen_data(n_samples=200):
    """generate training and testing data"""
    np.random.seed(15)
    X = np.random.uniform(0, 10, size=n_samples)[:, np.newaxis]
    y = ground_truth(X.ravel()) + np.random.normal(scale=2, size=n_samples)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = gen_data(100)

# plot ground truth
x_plot = np.linspace(0, 10, 500)

def plot_data(alpha=0.4, s=20):
    fig = plt.figure(figsize=FIGSIZE)
    gt = plt.plot(x_plot, ground_truth(x_plot), alpha=alpha, label='ground truth')

    # plot training and testing data
    plt.scatter(X_train, y_train, s=s, alpha=alpha)
    plt.scatter(X_test, y_test, s=s, alpha=alpha, color='red')
    plt.xlim((0, 10))
    plt.ylabel('y')
    plt.xlabel('x')
    
annotation_kw = {'xycoords': 'data', 'textcoords': 'data',
                 'arrowprops': {'arrowstyle': '->', 'connectionstyle': 'arc'}}
    
plot_data()



In [3]:
plot_data()

est = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
x_pred_1 = est.predict(x_plot[:, np.newaxis])
plt.plot(x_plot, x_pred_1, label='RT max_depth=1', color='g', alpha=0.9, linewidth=3)

est = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
         label='RT max_depth=3', color='g', alpha=0.7, linewidth=2)

est = DecisionTreeRegressor(max_depth=10).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
         label='RT max_depth=10', color='g', alpha=0.5, linewidth=1)


plt.legend(loc='upper left')


Out[3]:
<matplotlib.legend.Legend at 0x10ed98b50>

In [4]:
plot_data()

est = RandomForestRegressor(n_estimators=1, max_depth=1).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='RF n_estimators=1, max_depth=1', color='g', alpha=0.9, linewidth=3)

est = RandomForestRegressor(n_estimators=1, max_depth=5).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='RF n_estimators=1, max_depth=5', color='g', alpha=0.7, linewidth=2)

est = RandomForestRegressor(n_estimators=5, max_depth=5).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='RF n_estimators=5, max_depth=5', color='g', alpha=0.5, linewidth=1)


plt.legend(loc='upper left')


Out[4]:
<matplotlib.legend.Legend at 0x10e887510>


In [5]:
plot_data()

est = xgb.XGBRegressor(n_estimators=1, max_depth=5).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='XGB n_estimators=1, max_depth=5', color='g', alpha=0.9, linewidth=3)

est = xgb.XGBRegressor(n_estimators=10, max_depth=5).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='XGB n_estimators=10, max_depth=5', color='g', alpha=0.7, linewidth=2)

est = xgb.XGBRegressor(n_estimators=100, max_depth=5).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='XGB n_estimators=100, max_depth=5', color='g', alpha=0.5, linewidth=1)


plt.legend(loc='upper left')


Out[5]:
<matplotlib.legend.Legend at 0x10ed39310>

Let's analyze which parameters (and default values) an XGBRegressor has and what they mean; a sketch for printing the live defaults follows the list.

  • base_score=0.5
  • colsample_bylevel=1
  • colsample_bytree=1
  • gamma=0
  • learning_rate=0.1
  • max_delta_step=0
  • max_depth=3
  • min_child_weight=1
  • missing=None
  • n_estimators=100
  • nthread=-1
  • objective='reg:linear'
  • reg_alpha=0
  • reg_lambda=1
  • scale_pos_weight=1
  • seed=0
  • silent=True
  • subsample=1
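
The defaults listed above are from the xgboost version used here; to check what your installed version actually uses, you can read them off an unfitted estimator. A minimal sketch using get_params(), which the scikit-learn wrapper provides:

In [ ]:
# Print the wrapper's hyperparameters and their current (default) values.
# The exact set of keys depends on the installed xgboost version.
for name, value in sorted(xgb.XGBRegressor().get_params().items()):
    print('{0} = {1}'.format(name, value))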

Play around with parameters such as learning_rate, n_estimators, max_depth, and colsample_bylevel.
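
The plots show the shape of the fit; to quantify it, here is a quick sketch (using the r2_score imported above, with illustrative parameter choices) that scores a few learning rates on the held-out test set:

In [ ]:
# Compare a few learning rates by their R^2 on the test split.
for lr in [0.01, 0.1, 0.3]:
    est = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=lr).fit(X_train, y_train)
    print('learning_rate={0}: R^2 = {1:.3f}'.format(lr, r2_score(y_test, est.predict(X_test))))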


In [33]:
plot_data()

params = [(1, 100)]  # also try (10, 100), (100, 100)
for (i, (n_estimators, max_depth)) in enumerate(params):
    
    est = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth).fit(X_train, y_train)
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='XGB n_estimators={0}, max_depth={1}'.format(n_estimators, max_depth), color='g', alpha=0.9, linewidth=len(params)-i)


plt.legend(loc='upper left')


Out[33]:
<matplotlib.legend.Legend at 0x113f05410>

In [65]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from ipywidgets import interact, IntSlider, FloatSlider

n_estimators_slider = IntSlider(min=1, max=1000, step=20, value=30)
max_depth_slider = IntSlider(min=1, max=15, step=1, value=3)
learning_rate_slider = FloatSlider(min=0.01, max=0.3, step=0.01, value=0.1)
subsample_slider = FloatSlider(min=0.1, max=1, step=0.1, value=1.0)

gamma_slider = FloatSlider(min=0, max=1, step=0.1, value=0)
reg_alpha_slider = FloatSlider(min=0, max=1, step=0.1, value=0)
reg_lambda_slider = FloatSlider(min=0.1, max=1, step=0.1, value=1.0)


@interact(n_estimators=n_estimators_slider, max_depth=max_depth_slider, learning_rate=learning_rate_slider,
          subsample=subsample_slider, gamma=gamma_slider, reg_alpha=reg_alpha_slider, reg_lambda=reg_lambda_slider)
def plot(n_estimators, max_depth, learning_rate, subsample, gamma, reg_alpha, reg_lambda):
    # Refit with the current slider values and draw the prediction over the data.
    est = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                           subsample=subsample, gamma=gamma, reg_alpha=reg_alpha, reg_lambda=reg_lambda).fit(X_train, y_train)

    plot_data()
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
             label='XGB n_estimators={0}, max_depth={1}, learning_rate={2}, subsample={3}, '
                   'gamma={4}, reg_alpha={5}, reg_lambda={6}'.format(
                       n_estimators, max_depth, learning_rate, subsample, gamma, reg_alpha, reg_lambda),
             color='g', alpha=0.9, linewidth=2)

    plt.legend(loc='upper left')


