In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Learning Curves


In [ ]:
from figures import make_dataset

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for n_samples, ax in zip([10, 100, 500], axes.ravel()):
    X, y = make_dataset(n_samples=n_samples)
    ax.scatter(X, y)
    ax.set_xticks(())
    ax.set_yticks(())

In [ ]:
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import KFold
from sklearn.linear_model import Ridge

from sklearn.datasets import load_boston
boston = load_boston()
# print(boston.DESCR) to learn more about the dataset
cv = KFold(n_folds=10, n=len(boston.data), shuffle=True)
train_sizes, train_scores, validation_scores = learning_curve(Ridge(alpha=1), boston.data, boston.target,
                                                              cv=cv)
print("dataset size: %d" % boston.data.shape[0])
print("training set sizes: %s" % train_sizes)

In [ ]:
train_scores.shape

In [ ]:
def plot_learning_curve(train_sizes, train_scores, validation_scores):
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.ylim(max(-3, validation_scores_mean.min() - .1), train_scores_mean.max() + .1)

In [ ]:
plot_learning_curve(train_sizes, train_scores, validation_scores)

In [ ]:
train_sizes, train_scores, validation_scores = learning_curve(Ridge(alpha=0.001), boston.data, boston.target,
                                                              cv=cv)

In [ ]:
plot_learning_curve(train_sizes, train_scores, validation_scores)

In [ ]: