In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Why pipelines for preprocessing

Preprocessing steps like feature selection must be fit on the training data only. A pipeline ties these steps to the estimator, so they are always refit together and applied consistently to new data.

In [ ]:
from sklearn.datasets import make_regression

X, y = make_regression(random_state=42, noise=100)
print(X.shape)

In [ ]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=.5)

In [ ]:
from sklearn.feature_selection import SelectFpr, f_regression
from sklearn.linear_model import Ridge

fpr = SelectFpr(score_func=f_regression, alpha=.05)
fpr.fit(X_train, y_train)
X_train_fpr = fpr.transform(X_train)
X_test_fpr = fpr.transform(X_test)

print(X_train_fpr.shape)

In [ ]:
ridge = Ridge()
ridge.fit(X_train_fpr, y_train)
ridge.score(X_test_fpr, y_test)
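
The same two steps can be wrapped into a single estimator with a Pipeline, so the feature selection is always fit together with the model. A minimal sketch using make_pipeline (the result should match the manual version above):

In [ ]:
from sklearn.pipeline import make_pipeline

# chain the FPR feature selection and the ridge regression into one estimator
pipe = make_pipeline(SelectFpr(score_func=f_regression, alpha=.05), Ridge())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)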

How not to do grid-searches

The grid search below is run on features that were already selected using the whole training set, so every cross-validation fold has seen information from outside that fold and the scores are biased.

In [ ]:
# DON'T DO THIS:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': 10. ** np.arange(-3, 5)}
grid = GridSearchCV(ridge, param_grid, cv=5)
grid.fit(X_train_fpr, y_train)
print("test set accuracy: %.2f" % grid.score(X_test_fpr, y_test))

A more extreme example

Here the target is replaced by pure noise, so no feature can truly be informative. Selecting features on the full training set before cross-validating still picks columns that correlate with the noise by chance, which can make the cross-validation score look deceptively good even though there is nothing to learn.

In [ ]:
rng = np.random.RandomState(0)
y = rng.rand(X.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=.6)

In [ ]:
fpr = SelectFpr(score_func=f_regression)
fpr.fit(X_train, y_train)
X_train_fpr = fpr.transform(X_train)
X_test_fpr = fpr.transform(X_test)

X_train_fpr.shape

In [ ]:
# DON'T DO THIS:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': 10. ** np.arange(-3, 3)}
grid = GridSearchCV(ridge, param_grid, cv=5)
grid.fit(X_train_fpr, y_train)
print("best cross-validation score: %.2f" % grid.best_score_)
print("test set accuracy: %.2f" % grid.score(X_test_fpr, y_test))