# Tutorial: how to use XGBoost

``````

In [1]:

import xgboost as xgb

import numpy as np
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.  Fall back
# to the old module only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

%pylab inline

``````
``````

Populating the interactive namespace from numpy and matplotlib

``````
``````

In [55]:

# (width, height) handed to plt.figure(figsize=...) by plot_data();
# matplotlib interprets figsize in inches.
FIGSIZE = (15, 8)

def ground_truth(x):
    """Evaluate the noise-free target curve that the models try to approximate.

    Computes ``x*sin(x) + 2*sin(2x) + sin(3x)`` element-wise, so ``x`` may be
    a scalar or a NumPy array; the result has the same shape as ``x``.
    """
    scaled_sine = x * np.sin(x)
    double_freq = 2 * np.sin(2 * x)
    triple_freq = np.sin(3 * x)
    return scaled_sine + double_freq + triple_freq

def gen_data(n_samples=200):
    """Build a noisy 1-D regression problem and split it into train/test parts.

    Inputs are drawn uniformly from [0, 10); targets are ``ground_truth``
    values plus Gaussian noise with standard deviation 2.  The global NumPy
    RNG is re-seeded with 15 on every call, so the same ``n_samples`` always
    yields the same data.  20% of the samples are held out for testing.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``; the ``X`` arrays
    have a single feature column.
    """
    np.random.seed(15)
    # Draw order matters for reproducibility: inputs first, then the noise.
    inputs = np.random.uniform(0, 10, size=n_samples)
    noise = np.random.normal(scale=2, size=n_samples)

    features = inputs[:, np.newaxis]  # 2-D (n_samples, 1) array for the estimators
    targets = ground_truth(inputs) + noise

    split = train_test_split(features, targets, test_size=0.2, random_state=3)
    return tuple(split)

# Generate the 100-sample train/test split that plot_data() and the model
# cells read as module-level variables.
X_train, X_test, y_train, y_test = gen_data(100)

# Evenly spaced x grid (500 points on [0, 10]) on which the ground truth and
# the model predictions are drawn.
x_plot = np.linspace(0, 10, 500)

def plot_data(alpha=0.4, s=20):
    """Start a new figure showing the ground-truth curve and the data points.

    Reads the module-level ``FIGSIZE``, ``x_plot``, ``X_train``/``y_train``
    and ``X_test``/``y_test``.  The figure is left as the current pyplot
    figure so callers can overlay model predictions and then call
    ``plt.legend``.

    Parameters
    ----------
    alpha : float
        Opacity used for the curve and for both scatter series.
    s : float
        Marker size passed to ``plt.scatter``.
    """
    plt.figure(figsize=FIGSIZE)
    plt.plot(x_plot, ground_truth(x_plot), alpha=alpha, label='ground truth')

    # Label both point clouds so the legend explains the two colours.
    plt.scatter(X_train, y_train, s=s, alpha=alpha, label='training data')
    plt.scatter(X_test, y_test, s=s, alpha=alpha, color='red', label='test data')
    plt.xlim((0, 10))
    plt.ylabel('y')
    plt.xlabel('x')

# Keyword arguments named like those of matplotlib's annotate() (data
# coordinates for point and text, '->' arrow style).
# NOTE(review): not referenced by any cell visible here -- confirm whether it
# is still needed.
annotation_kw = {'xycoords': 'data', 'textcoords': 'data',
'arrowprops': {'arrowstyle': '->', 'connectionstyle': 'arc'}}

# Show the ground truth and the raw train/test points with no model overlaid.
plot_data()

``````
``````

``````
``````

In [3]:

plot_data()

# Fit one regression tree per depth and overlay its prediction curve.
# Each entry is (max_depth, alpha, linewidth): deeper trees are drawn thinner
# and more transparent.  Labels are derived from the depth actually used, so
# they cannot drift out of sync with the model.
tree_styles = [(1, 0.9, 3), (3, 0.7, 2), (10, 0.5, 1)]
for depth, line_alpha, line_width in tree_styles:
    est = DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train)
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
             label='RT max_depth={0}'.format(depth),
             color='g', alpha=line_alpha, linewidth=line_width)

plt.legend(loc='upper left')

``````
``````

Out[3]:

<matplotlib.legend.Legend at 0x10ed98b50>

``````
``````

In [4]:

plot_data()

# A random forest bootstraps its training set, so without a fixed seed every
# re-run of this cell would draw different curves.
RF_RANDOM_STATE = 0

# Each entry is (n_estimators, max_depth, alpha, linewidth).  Labels are built
# from the parameters actually passed to the model.
forest_styles = [(1, 1, 0.9, 3), (1, 5, 0.7, 2), (5, 5, 0.5, 1)]
for n_trees, depth, line_alpha, line_width in forest_styles:
    est = RandomForestRegressor(n_estimators=n_trees, max_depth=depth,
                                random_state=RF_RANDOM_STATE).fit(X_train, y_train)
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
             label='RF n_estimators={0}, max_depth={1}'.format(n_trees, depth),
             color='g', alpha=line_alpha, linewidth=line_width)

plt.legend(loc='upper left')

``````
``````

Out[4]:

<matplotlib.legend.Legend at 0x10e887510>

``````
``````

In [ ]:

``````
``````

In [5]:

plot_data()

# Tree depth is held at 5 so that only the number of boosting rounds varies
# between the three curves.
tree_depth = 5

# Each entry is (n_estimators, alpha, linewidth).  Labels are built from the
# parameters actually passed to the model.
boosting_styles = [(1, 0.9, 3), (10, 0.7, 2), (100, 0.5, 1)]
for n_rounds, line_alpha, line_width in boosting_styles:
    est = xgb.XGBRegressor(n_estimators=n_rounds, max_depth=tree_depth).fit(X_train, y_train)
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
             label='XGB n_estimators={0}, max_depth={1}'.format(n_rounds, tree_depth),
             color='g', alpha=line_alpha, linewidth=line_width)

plt.legend(loc='upper left')

``````
``````

Out[5]:

<matplotlib.legend.Legend at 0x10ed39310>

``````

Let's look at which parameters a model has (with their default values) and what they mean:

• base_score=0.5
• colsample_bylevel=1
• colsample_bytree=1
• gamma=0
• learning_rate=0.1
• max_delta_step=0
• max_depth=3
• min_child_weight=1
• missing=None
• n_estimators=100
• objective='reg:linear'
• reg_alpha=0
• reg_lambda=1
• scale_pos_weight=1
• seed=0
• silent=True
• subsample=1

Play around with parameters such as: learning_rate, n_estimators, max_depth, colsample_bylevel

``````

In [33]:

plot_data()

# Each entry is an (n_estimators, max_depth) pair; append pairs such as
# (10, 100) or (100, 100) to overlay more models.
params = [(1, 100)]
for i, (n_estimators, max_depth) in enumerate(params):
    est = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth)
    est.fit(X_train, y_train)

    curve_label = 'XGB n_estimators={0}, max_depth={1}'.format(n_estimators, max_depth)
    # Line width shrinks by one per entry: len(params) for the first, 1 for the last.
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label=curve_label,
             color='g', alpha=0.9, linewidth=len(params) - i)

plt.legend(loc='upper left')

``````
``````

Out[33]:

<matplotlib.legend.Legend at 0x113f05410>

``````
``````

In [65]:

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from ipywidgets import interact, IntSlider, FloatSlider

# Slider widgets for the XGBRegressor hyper-parameters explored interactively;
# `value` is the position each slider starts at.
n_estimators_slider = IntSlider(min=1, max=1000, step=20, value=30)
max_depth_slider = IntSlider(min=1, max=15, step=1, value=3)
learning_rate_slider = FloatSlider(min=0.01, max=0.3, step=0.01, value=0.1)
subsample_slider = FloatSlider(min=0.1, max=1, step=0.1, value=1.0)

# gamma and reg_alpha start at 0, so their ranges must begin at 0 as well:
# a bounded slider cannot hold a value below its `min`, which previously made
# the intended starting value of 0 unreachable.
gamma_slider = FloatSlider(min=0.0, max=1, step=0.1, value=0)
reg_alpha_slider = FloatSlider(min=0.0, max=1, step=0.1, value=0)
reg_lambda_slider = FloatSlider(min=0.1, max=1, step=0.1, value=1.0)

@interact(n_estimators=n_estimators_slider, max_depth=max_depth_slider,
          learning_rate=learning_rate_slider, subsample=subsample_slider,
          gamma=gamma_slider, reg_alpha=reg_alpha_slider, reg_lambda=reg_lambda_slider)
def plot(n_estimators, max_depth, learning_rate, subsample, gamma, reg_alpha, reg_lambda):
    """Fit an XGBRegressor with the given settings and draw its prediction curve.

    Called by ``interact`` with the current slider values.  Each call fits a
    new model on the module-level ``X_train``/``y_train``, starts a new figure
    via ``plot_data()`` and overlays the model's predictions on ``x_plot``.
    All seven parameters are forwarded unchanged to ``xgb.XGBRegressor``.
    """
    est = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth,
                           learning_rate=learning_rate, subsample=subsample,
                           gamma=gamma, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    est.fit(X_train, y_train)

    plot_data()
    curve_label = 'XGB n_estimators={0}, max_depth={1}, learning_rate={2}, subsample={3}, gamma={4}, reg_alpha={5}, reg_lambda={6}'.format(
        n_estimators, max_depth, learning_rate, subsample, gamma, reg_alpha, reg_lambda)
    # Constant line width.  This used to be `len(params) - i`, which read
    # variables left behind by an earlier cell (evaluating to 1 there) and
    # raised NameError when that cell had not been run.
    plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label=curve_label,
             color='g', alpha=0.9, linewidth=1)

    plt.legend(loc='upper left')

``````
``````

``````
``````

In [ ]:

``````