In [39]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 9)

import numpy as np
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

In [40]:
# Fixed seed so the sampled points (and everything derived from them) repeat.
np.random.seed(0)
n_samples = 30


def true_fn(X):
    """Return the noise-free signal cos(1.5 * pi * X)."""
    return np.cos(1.5 * np.pi * X)


# Sorted sample locations, then the signal at those locations plus scaled
# normal noise. The two random draws stay in this order to keep the same data.
X = np.sort(np.random.rand(n_samples))
noise = 0.1 * np.random.randn(n_samples)
y = true_fn(X) + noise

# Dense grid used for drawing the underlying signal as a smooth curve.
continuous_X = np.linspace(0, 1, num=500)

plt.plot(continuous_X, true_fn(continuous_X), color='aquamarine', label='Signal')
plt.scatter(X, y, label='Samples')
print('Signal and noisy samples taken')
plt.legend()
plt.show()


Signal and noisy samples taken

In [41]:
# Fit one polynomial-regression pipeline per degree and score each of them
# with 10-fold cross-validation, remembering the pipeline with the lowest
# mean cross-validated MSE.
mses = []        # mean cross-validated MSE, one entry per degree
stds = []        # standard deviation of the per-fold scores, one per degree
pipelines = []   # fitted pipelines, in the same order as `degrees`
degrees = range(1, 16)

# +inf is a sentinel that any finite MSE beats; a finite integer bound would
# leave `best_model` as None if every model scored above it.
min_mse = np.inf
best_model = None

# The estimators expect a 2-D (n_samples, 1) array; build it once.
X_column = X[:, np.newaxis]

for d in degrees:
    poly_features = PolynomialFeatures(degree=d, include_bias=False)
    model = LinearRegression()
    pipeline = Pipeline([('polynomial_features', poly_features),
                         ('linear_regression', model)])
    # Fit on all samples so the pipeline can be used for prediction later on.
    pipeline.fit(X_column, y)

    # 'neg_mean_squared_error' replaces the scoring name 'mean_squared_error',
    # which was removed in scikit-learn 0.20. The scorer yields the negated
    # MSE per fold, hence the sign flip below.
    scores = cross_val_score(pipeline, X_column, y,
                             scoring='neg_mean_squared_error', cv=10)
    mean_mse = -scores.mean()

    mses.append(mean_mse)
    stds.append(scores.std())
    pipelines.append(pipeline)
    if mean_mse < min_mse:
        min_mse = mean_mse
        best_model = pipeline

In [42]:
# Grid of small panels: the first one shows the raw samples, every following
# one shows the predictions of the next pipeline in `pipelines`.
height = 4
width = 4
fig, axs = plt.subplots(height, width)

n = 0  # number of model panels drawn so far
for ax in axs.flat:  # row-major order: left to right, then top to bottom
    # Hide tick labels to keep the small panels uncluttered.
    ax.set_xticklabels([])
    ax.set_yticklabels([])

    if ax is axs[0, 0]:
        ax.set_title('Raw data', fontsize=10)
        ax.scatter(X, y, color='teal', s=7)
        continue

    n += 1
    fitted = pipelines[n - 1]
    ax.set_title('{} degrees'.format(n), fontsize=10)
    ax.plot(X, fitted.predict(X[:, np.newaxis]), color='teal')

print('Plots varying degrees of coefficients for independent (and Raw data)')


Plots varying degrees of coefficients for independent (and Raw data)

In [43]:
# Mean CV error and its spread against polynomial degree, drawn on two
# log-scaled y-axes that share one x-axis.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

for axis, series, label, colour, ylabel in (
        (ax1, mses, 'MSE', 'cornflowerblue', 'Mean sq error'),
        (ax2, stds, 'Sigma', 'teal', 'Std deviation')):
    axis.plot(degrees, series, label=label, color=colour)
    axis.set_yscale('log')
    # Colour each axis label like its curve so the two scales can be told apart.
    axis.set_ylabel(ylabel, color=colour)

ax1.set_xlabel('Degrees')
print('Approximating noisy signal varying polynomial coefficients')


Approximating noisy signal varying polynomial coefficients

In [44]:
# One line per degree: the mean cross-validated MSE and the score spread.
row_template = 'Degree {:>2}: mse = {:16.3f}, std = {:16.3f}'
for row in zip(degrees, mses, stds):
    print(row_template.format(*row))


Degree  1: mse =            0.408, std =            0.425
Degree  2: mse =            0.058, std =            0.057
Degree  3: mse =            0.019, std =            0.013
Degree  4: mse =            0.043, std =            0.071
Degree  5: mse =            0.099, std =            0.229
Degree  6: mse =            0.114, std =            0.252
Degree  7: mse =            0.197, std =            0.446
Degree  8: mse =            0.360, std =            0.537
Degree  9: mse =           21.233, std =           62.966
Degree 10: mse =         1516.437, std =         4548.559
Degree 11: mse =        24016.937, std =        72050.221
Degree 12: mse =         2071.781, std =         6160.328
Degree 13: mse =      1323087.436, std =      3968927.109
Degree 14: mse =      7518478.412, std =     22555278.576
Degree 15: mse =    181521531.240, std =    544552181.277

In [45]:
# Overlay the noise-free signal, the selected model's predictions at the
# sample locations, and the noisy samples themselves.
signal_curve = true_fn(continuous_X)
model_fit = best_model.predict(X[:, np.newaxis])

plt.plot(continuous_X, signal_curve, color='aquamarine', label='Signal')
plt.plot(X, model_fit, color='teal', linestyle='-.', label='Model')
plt.scatter(X, y, label='Samples')
print('Signal, samples and best model')
plt.legend()
plt.show()


Signal, samples and best model