Chapter 7 – Ensemble Learning and Random Forests

This notebook contains all the sample code and solutions to the exercises in Chapter 7.

Setup

First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:


In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id)

def save_fig(fig_id, tight_layout=True):
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(image_path(fig_id) + ".png", format='png', dpi=300)

Voting classifiers


In [2]:
heads_proba = 0.51
coin_tosses = (np.random.rand(10000, 10) < heads_proba).astype(np.int32)
cumulative_heads_ratio = np.cumsum(coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1)

In [3]:
plt.figure(figsize=(8,3.5))
plt.plot(cumulative_heads_ratio)
plt.plot([0, 10000], [0.51, 0.51], "k--", linewidth=2, label="51%")
plt.plot([0, 10000], [0.5, 0.5], "k-", label="50%")
plt.xlabel("Number of coin tosses")
plt.ylabel("Heads ratio")
plt.legend(loc="lower right")
plt.axis([0, 10000, 0.42, 0.58])
save_fig("law_of_large_numbers_plot")
plt.show()


Saving figure law_of_large_numbers_plot
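
As a quick back-of-the-envelope check (not part of the original code), we can compute the probability that a majority vote of n independent, 51%-accurate voters is correct directly from the binomial distribution. This sketch assumes SciPy is available (it is not imported elsewhere in this notebook):


In [ ]:
from scipy.stats import binom

for n_voters in (1, 10, 100, 1000, 10000):
    # probability that strictly more than half of the voters are correct
    print(n_voters, binom.sf(n_voters // 2, n_voters, heads_proba))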

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)


Out[5]:
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [6]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))


LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.896

In [7]:
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)


Out[7]:
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [8]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))


LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.912
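
Soft voting simply averages the estimators' predicted class probabilities and picks the class with the highest average. As a minimal sketch (not in the original notebook, and assuming the three classifiers above have already been fitted, which they were in the previous loop), we can reproduce the idea by hand; the score should be very close to the VotingClassifier's:


In [ ]:
# average the class probabilities of the three fitted classifiers, then take the argmax
avg_probas = np.mean([clf.predict_proba(X_test)
                      for clf in (log_clf, rnd_clf, svm_clf)], axis=0)
y_pred_manual = np.argmax(avg_probas, axis=1)
print(accuracy_score(y_test, y_pred_manual))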

Bagging ensembles


In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))


0.904

In [11]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))


0.856
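
For comparison, the same ensemble with bootstrap=False performs pasting (sampling without replacement) instead of bagging. This is an optional side experiment, not part of the original notebook, and its score may differ slightly from the numbers above:


In [ ]:
paste_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=False, n_jobs=-1, random_state=42)
paste_clf.fit(X_train, y_train)
print(accuracy_score(y_test, paste_clf.predict(X_test)))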

In [12]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

In [13]:
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.subplot(122)
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
save_fig("decision_tree_without_and_with_bagging_plot")
plt.show()


Saving figure decision_tree_without_and_with_bagging_plot

Random Forests


In [14]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)

In [15]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [16]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [17]:
np.sum(y_pred == y_pred_rf) / len(y_pred)  # almost identical predictions


Out[17]:
0.97599999999999998
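
Extra-Trees (Extremely Randomized Trees) push the randomization one step further by also using random split thresholds. The API mirrors RandomForestClassifier, so the following is a drop-in sketch (not in the original notebook); its score should be in the same ballpark but not necessarily identical:


In [ ]:
from sklearn.ensemble import ExtraTreesClassifier

ext_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
ext_clf.fit(X_train, y_train)
print(accuracy_score(y_test, ext_clf.predict(X_test)))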

In [18]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)


sepal length (cm) 0.112492250999
sepal width (cm) 0.0231192882825
petal length (cm) 0.441030464364
petal width (cm) 0.423357996355

In [19]:
rnd_clf.feature_importances_


Out[19]:
array([ 0.11249225,  0.02311929,  0.44103046,  0.423358  ])
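
Optionally (not in the original notebook), the same importances can be shown as a simple bar chart:


In [ ]:
plt.figure(figsize=(6, 3))
plt.barh(range(4), rnd_clf.feature_importances_)
plt.yticks(range(4), iris["feature_names"])
plt.xlabel("Feature importance")
plt.show()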

In [20]:
plt.figure(figsize=(6, 4))

for i in range(15):
    tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)
    indices_with_replacement = np.random.randint(0, len(X_train), len(X_train))
    tree_clf.fit(X_train[indices_with_replacement], y_train[indices_with_replacement])
    plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.02, contour=False)

plt.show()


Out-of-Bag evaluation


In [21]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_


Out[21]:
0.90133333333333332

In [22]:
bag_clf.oob_decision_function_


Out[22]:
array([[ 0.31746032,  0.68253968],
       [ 0.34117647,  0.65882353],
       [ 1.        ,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.08379888,  0.91620112],
       ...,
       [ 0.97619048,  0.02380952],
       [ 1.        ,  0.        ],
       [ 0.03108808,  0.96891192],
       [ 0.57291667,  0.42708333]])

In [23]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)


Out[23]:
0.91200000000000003
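
As a cross-check (not in the original notebook), the OOB score above is just the accuracy of the out-of-bag predictions on the training set, so we should be able to reproduce it from oob_decision_function_. This assumes every training instance was left out of the bag by at least one of the 500 trees, which is virtually certain here:


In [ ]:
oob_pred = np.argmax(bag_clf.oob_decision_function_, axis=1)
print(accuracy_score(y_train, oob_pred))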

Feature importance


In [24]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
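
Note: mldata.org is defunct and fetch_mldata has been removed from recent scikit-learn releases, so the cell above may fail. A rough alternative sketch (not in the original notebook; it assumes scikit-learn >= 0.20, and the dtypes and row ordering may differ) is to fetch the dataset from OpenML instead:


In [ ]:
# Optional alternative, only needed if fetch_mldata fails:
# from sklearn.datasets import fetch_openml
# mnist = fetch_openml('mnist_784', version=1)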

In [25]:
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(mnist["data"], mnist["target"])


Out[25]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [26]:
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = matplotlib.cm.hot,
               interpolation="nearest")
    plt.axis("off")

In [27]:
plot_digit(rnd_clf.feature_importances_)

cbar = plt.colorbar(ticks=[rnd_clf.feature_importances_.min(), rnd_clf.feature_importances_.max()])
cbar.ax.set_yticklabels(['Not important', 'Very important'])

save_fig("mnist_feature_importance_plot")
plt.show()


Saving figure mnist_feature_importance_plot

AdaBoost


In [28]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)


Out[28]:
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=42)

In [29]:
plot_decision_boundary(ada_clf, X, y)



In [30]:
m = len(X_train)

plt.figure(figsize=(11, 4))
for subplot, learning_rate in ((121, 1), (122, 0.5)):
    sample_weights = np.ones(m)
    for i in range(5):
        plt.subplot(subplot)
        svm_clf = SVC(kernel="rbf", C=0.05, random_state=42)
        svm_clf.fit(X_train, y_train, sample_weight=sample_weights)
        y_pred = svm_clf.predict(X_train)
        sample_weights[y_pred != y_train] *= (1 + learning_rate)
        plot_decision_boundary(svm_clf, X, y, alpha=0.2)
        plt.title("learning_rate = {}".format(learning_rate), fontsize=16)

plt.subplot(121)
plt.text(-0.7, -0.65, "1", fontsize=14)
plt.text(-0.6, -0.10, "2", fontsize=14)
plt.text(-0.5,  0.10, "3", fontsize=14)
plt.text(-0.4,  0.55, "4", fontsize=14)
plt.text(-0.3,  0.90, "5", fontsize=14)
save_fig("boosting_plot")
plt.show()


Saving figure boosting_plot
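
The loop above only mimics boosting by inflating the weights of misclassified instances by a fixed factor. For reference (not in the original notebook), AdaBoost's own update computes a predictor weight from the weighted error rate and then renormalizes. Reusing y_pred and sample_weights from the cell above (and assuming the error rate is strictly between 0 and 1), one update step looks roughly like this sketch:


In [ ]:
eta = 0.5                                                            # learning rate
r = sample_weights[y_pred != y_train].sum() / sample_weights.sum()   # weighted error rate
alpha = eta * np.log((1 - r) / r)                                    # predictor weight
sample_weights[y_pred != y_train] *= np.exp(alpha)                   # boost misclassified instances
sample_weights /= sample_weights.sum()                               # renormalize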

In [31]:
[attr for attr in dir(ada_clf) if not attr.startswith("_") and attr.endswith("_")]


Out[31]:
['base_estimator_',
 'classes_',
 'estimator_errors_',
 'estimator_weights_',
 'estimators_',
 'feature_importances_',
 'n_classes_']

Gradient Boosting


In [32]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [33]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)


Out[33]:
DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [34]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)


Out[34]:
DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [35]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)


Out[35]:
DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [36]:
X_new = np.array([[0.8]])

In [37]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [38]:
y_pred


Out[38]:
array([ 0.75026781])

In [39]:
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)

plt.figure(figsize=(11,11))

plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)

plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)

plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)

plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)

plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)

plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)

save_fig("gradient_boosting_plot")
plt.show()


Saving figure gradient_boosting_plot

In [40]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt.fit(X, y)


Out[40]:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)
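
As a quick sanity check (not in the original notebook), this ensemble should make a very similar prediction for X_new to the manual sum of tree_reg1, tree_reg2 and tree_reg3 above (roughly 0.75), since with learning_rate=1.0 each new tree is fit to the previous trees' residuals:


In [ ]:
print(gbrt.predict(X_new))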

In [41]:
gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)
gbrt_slow.fit(X, y)


Out[41]:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [42]:
plt.figure(figsize=(11,4))

plt.subplot(121)
plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)

plt.subplot(122)
plot_predictions([gbrt_slow], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("learning_rate={}, n_estimators={}".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14)

save_fig("gbrt_learning_rate_plot")
plt.show()


Saving figure gbrt_learning_rate_plot

Gradient Boosting with Early stopping


In [43]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]  # MSE after 1, 2, ..., 120 trees
bst_n_estimators = np.argmin(errors)  # index of the lowest error (one less than the best tree count)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)


Out[43]:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=55, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [44]:
min_error = np.min(errors)

In [45]:
plt.figure(figsize=(11, 4))

plt.subplot(121)
plt.plot(errors, "b.-")
plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)

plt.subplot(122)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (%d trees)" % bst_n_estimators, fontsize=14)

save_fig("early_stopping_gbrt_plot")
plt.show()


Saving figure early_stopping_gbrt_plot

In [46]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping

In [47]:
print(gbrt.n_estimators)


61
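
If the optional xgboost package is installed, it provides its own gradient boosting implementation with built-in early stopping. The sketch below is not part of the original notebook; note that older xgboost versions accept early_stopping_rounds in fit(), while newer ones expect it in the constructor, so the exact call may need adjusting:


In [ ]:
try:
    import xgboost
except ImportError:
    xgboost = None
    print("xgboost is not installed")

if xgboost is not None:
    xgb_reg = xgboost.XGBRegressor(random_state=42)
    xgb_reg.fit(X_train, y_train,
                eval_set=[(X_val, y_val)], early_stopping_rounds=2)
    print("Validation MSE:", mean_squared_error(y_val, xgb_reg.predict(X_val)))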

Exercise solutions

Coming soon


In [ ]: