In [1]:
from sklearn import ensemble
from sklearn import datasets
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
plt.figure(figsize=(10,8))
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

for label, color, setting in [('No shrinkage', 'orange',
                               {'learning_rate': 1.0, 'subsample': 1.0}),
                              ('learning_rate=0.1', 'turquoise',
                               {'learning_rate': 0.1, 'subsample': 1.0}),
                              ('subsample=0.5', 'blue',
                               {'learning_rate': 1.0, 'subsample': 0.5}),
                              ('learning_rate=0.1, subsample=0.5', 'gray',
                               {'learning_rate': 0.1, 'subsample': 0.5}),
                              ('learning_rate=0.1, max_features=2', 'magenta',
                               {'learning_rate': 0.1, 'max_features': 2})]:
    params = dict(original_params)
    params.update(setting)

    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute test set deviance
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[i] = clf.loss_(y_test, y_pred)

    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
            '-', color=color, label=label)

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')

plt.show()
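
Note: `loss_` is a private attribute of the fitted estimator and was deprecated in scikit-learn 1.1 (and later removed). On newer releases the same curves can be recovered from `staged_predict_proba`, since binomial deviance is twice the mean log-loss. A minimal sketch, assuming the `clf` from the last loop iteration is still in scope:

In [ ]:
from sklearn.metrics import log_loss

# Binomial deviance == 2 * log-loss, so staged class probabilities
# reproduce the curves above without the removed `loss_` attribute.
test_deviance = np.array([2.0 * log_loss(y_test, proba)
                          for proba in clf.staged_predict_proba(X_test)])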



In [22]:
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

# use a smaller training set (700 samples) but keep the same test split as above
X_train, X_test = X[:700], X[2000:]
y_train, y_test = y[:700], y[2000:]
plt.figure(figsize=(10,8))
original_params = {'n_estimators': 1500, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

for label, color, setting in [('subsample=1.0', 'orange',
                               {'subsample': 1.0}),
                              ('subsample=0.5', 'turquoise',
                               {'subsample': 0.5}),
                              ('subsample=0.3', 'gray',
                               {'subsample': 0.3})]:
    params = dict(original_params)
    params.update(setting)

    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute train and test set deviance
    train_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
        train_deviance[i] = clf.loss_(y_train, y_pred)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        test_deviance[i] = clf.loss_(y_test, y_pred)

    plt.plot((np.arange(train_deviance.shape[0]) + 1)[::5], train_deviance[::5],
             '--', color=color, label=label + ' (train)')
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
             '-', color=color, label=label + ' (test)')

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Train vs Test Set Deviance')

plt.show()
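
The vertical gap between each train/test pair of curves is the overfitting of that setting; stronger subsampling narrows it. A simple follow-up, assuming the `test_deviance` array from the last loop iteration, is to read off the iteration where test deviance bottoms out:

In [ ]:
# Crude early-stopping heuristic: the boosting iteration that minimizes
# the test set deviance of the last fitted model.
best_iter = int(np.argmin(test_deviance)) + 1
print('best iteration: %d (test deviance %.4f)'
      % (best_iter, test_deviance[best_iter - 1]))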



In [26]:
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

X_train, X_test = X[:700], X[2000:]
y_train, y_test = y[:700], y[2000:]
plt.figure(figsize=(10,8))
original_params = {'n_estimators': 1500, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

for label, color, setting in [('max_features=1.0', 'orange',
                               {'max_features': 1.0}),
                              ('max_features=0.8', 'turquoise',
                               {'max_features': 0.8}),
                              ('max_features=0.7', 'gray',
                               {'max_features': 0.7})]:
    params = dict(original_params)
    params.update(setting)

    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute train and test set deviance
    train_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
        train_deviance[i] = clf.loss_(y_train, y_pred)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        test_deviance[i] = clf.loss_(y_test, y_pred)

    plt.plot((np.arange(train_deviance.shape[0]) + 1)[::5], train_deviance[::5],
             '--', color=color, label=label + ' (train)')
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
             '-', color=color, label=label + ' (test)')

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Train vs Test Set Deviance')

plt.show()
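
A float `max_features` is interpreted as a fraction of the feature count, so 0.8 means int(0.8 * 10) = 8 candidate features per split here. To see which of the 10 features the last fitted model actually relies on, a quick sketch using its `feature_importances_`:

In [ ]:
# Impurity-based importances of the last fitted model, one per feature,
# sorted so the most-used features come first.
pd.Series(clf.feature_importances_,
          index=['x%d' % i for i in range(X.shape[1])]).sort_values(ascending=False)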



In [35]:
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

X_train, X_test = X[:700], X[2000:]
y_train, y_test = y[:700], y[2000:]
plt.figure(figsize=(10,8))
original_params = {'n_estimators': 1500, 'max_depth': None, 'random_state': 2, 'subsample': 0.5}

for label, color, setting in [('min_samples_split=9', 'orange',
                               {'min_samples_split': 9}),
                              ('min_samples_split=7', 'turquoise',
                               {'min_samples_split': 7}),
                              ('min_samples_split=5', 'gray',
                               {'min_samples_split': 5})]:
    params = dict(original_params)
    params.update(setting)

    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute train and test set deviance
    train_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
        train_deviance[i] = clf.loss_(y_train, y_pred)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        test_deviance[i] = clf.loss_(y_test, y_pred)

    plt.plot((np.arange(train_deviance.shape[0]) + 1)[::5], train_deviance[::5],
             '--', color=color, label=label + ' (train)')
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
             '-', color=color, label=label + ' (test)')

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Train vs Test Set Deviance')

plt.show()
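
Because `subsample=0.5` is fixed in `original_params`, each fitted model also exposes `oob_improvement_`, the out-of-bag estimate of the per-stage loss reduction. Its cumulative sum is a test-set-free proxy for the deviance curves above; a minimal sketch using the `clf` from the last loop iteration:

In [ ]:
# Cumulative out-of-bag improvement: available only when subsample < 1.0.
cum_oob_improvement = np.cumsum(clf.oob_improvement_)
plt.figure(figsize=(10, 8))
plt.plot(np.arange(cum_oob_improvement.shape[0]) + 1, cum_oob_improvement, '-')
plt.xlabel('Boosting Iterations')
plt.ylabel('Cumulative OOB Improvement')
plt.show()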



In [19]:
X.shape


Out[19]:
(12000, 10)

In [12]:
clf.staged_decision_function?

In [36]:
ensemble.GradientBoostingRegressor?
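
The regressor exposes the same shrinkage/subsampling knobs; the main differences are the loss (least squares by default) and `staged_predict` in place of a staged decision function. A hedged sketch on a toy regression problem (the dataset and split sizes here are illustrative, not from the experiments above):

In [ ]:
from sklearn.metrics import mean_squared_error

# Same regularization story for regression: staged test MSE per iteration.
X_r, y_r = datasets.make_friedman1(n_samples=1200, random_state=1)
reg = ensemble.GradientBoostingRegressor(n_estimators=500, learning_rate=0.1,
                                         subsample=0.5, random_state=2)
reg.fit(X_r[:200], y_r[:200])
test_mse = [mean_squared_error(y_r[200:], y_pred)
            for y_pred in reg.staged_predict(X_r[200:])]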
