Preprocessing and Pipelines


In [1]:
from sklearn.datasets import load_digits
# in new versions: from sklearn.model_selection import train_test_split
from sklearn.cross_validation import train_test_split
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

For cross-validated pipelines that include scaling, we need to estimate the mean and standard deviation separately on the training portion of each fold. To do that, we build a pipeline that chains the scaler and the classifier.
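
Why this matters: if the scaler is fit on all of X_train before cross-validation, information from the validation folds leaks into the mean and standard deviation used to scale the training folds. A minimal sketch of the contrast (assuming a recent scikit-learn, where cross_val_score lives in sklearn.model_selection):

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# leaky: the scaler sees all of X_train, including the validation folds
X_scaled = StandardScaler().fit_transform(X_train)
leaky_scores = cross_val_score(SVC(), X_scaled, y_train)

# correct: the scaler is re-fit inside each fold on the training portion only
pipe = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])
clean_scores = cross_val_score(pipe, X_train, y_train)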


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [3]:
pipeline = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])
# in new versions: make_pipeline(StandardScaler(), SVC())
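
For reference, a sketch of the make_pipeline shortcut mentioned in the comment above (assuming a scikit-learn version that provides it); it names the steps automatically after the lower-cased class names:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# equivalent to the Pipeline above, but the steps are named "standardscaler" and "svc"
pipe = make_pipeline(StandardScaler(), SVC())
print(pipe.steps)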

In [4]:
pipeline.fit(X_train, y_train)


Out[4]:
Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])
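
After fitting, the individual steps are accessible under the names given in the Pipeline; a small sketch using the named_steps attribute (the attributes shown are standard for a fitted StandardScaler and SVC):

# the fitted scaler and SVM can be inspected separately
scaler = pipeline.named_steps["scaler"]
svm = pipeline.named_steps["svm"]
print(scaler.mean_.shape)          # per-feature means learned from X_train
print(svm.support_vectors_.shape)  # support vectors of the fitted SVC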

In [5]:
pipeline.predict(X_test)


Out[5]:
array([6, 4, 5, 7, 8, 5, 2, 7, 3, 2, 7, 8, 4, 5, 3, 6, 4, 9, 5, 6, 7, 4, 7,
       2, 0, 3, 1, 5, 4, 7, 7, 2, 5, 3, 9, 7, 8, 2, 8, 1, 1, 3, 9, 5, 1, 7,
       5, 1, 6, 0, 2, 4, 5, 0, 1, 4, 7, 1, 3, 3, 6, 3, 1, 8, 5, 1, 7, 2, 2,
       9, 1, 2, 0, 6, 2, 4, 4, 9, 1, 0, 2, 4, 9, 2, 3, 4, 1, 7, 4, 0, 6, 1,
       6, 8, 1, 6, 9, 4, 9, 6, 8, 8, 1, 2, 7, 7, 6, 2, 0, 3, 0, 7, 7, 8, 1,
       6, 7, 0, 9, 5, 6, 3, 0, 8, 1, 7, 9, 7, 3, 9, 3, 0, 8, 8, 5, 5, 0, 8,
       9, 2, 5, 7, 5, 2, 9, 7, 0, 6, 1, 8, 0, 4, 3, 7, 9, 5, 4, 7, 1, 3, 0,
       3, 7, 7, 1, 8, 6, 3, 4, 5, 9, 2, 6, 3, 6, 4, 6, 1, 8, 3, 3, 6, 6, 5,
       8, 3, 5, 7, 8, 1, 0, 3, 6, 5, 5, 6, 0, 7, 2, 0, 7, 7, 6, 6, 5, 8, 1,
       6, 7, 8, 8, 4, 7, 2, 2, 9, 4, 6, 4, 8, 7, 0, 2, 8, 9, 6, 0, 7, 3, 3,
       5, 9, 4, 3, 7, 2, 9, 1, 6, 8, 3, 9, 9, 1, 3, 3, 0, 0, 2, 8, 5, 7, 9,
       6, 3, 6, 2, 0, 6, 5, 7, 7, 9, 2, 3, 9, 0, 5, 4, 8, 2, 5, 4, 8, 5, 0,
       8, 6, 6, 5, 2, 0, 3, 7, 8, 4, 1, 1, 2, 3, 1, 6, 7, 8, 8, 8, 1, 8, 7,
       4, 8, 0, 6, 9, 4, 3, 8, 7, 6, 8, 0, 6, 0, 7, 2, 1, 0, 4, 5, 1, 1, 1,
       3, 5, 9, 4, 9, 4, 8, 1, 8, 0, 7, 6, 9, 3, 2, 7, 3, 9, 5, 8, 8, 3, 3,
       0, 9, 1, 4, 1, 2, 7, 4, 7, 2, 7, 3, 7, 9, 6, 4, 8, 9, 9, 8, 5, 3, 5,
       3, 1, 6, 5, 9, 8, 0, 2, 4, 8, 2, 4, 7, 6, 4, 6, 2, 8, 6, 8, 8, 9, 6,
       9, 6, 4, 9, 0, 8, 9, 4, 1, 6, 3, 5, 9, 7, 5, 6, 0, 2, 1, 5, 7, 8, 7,
       6, 9, 8, 7, 7, 1, 8, 9, 0, 4, 0, 6, 8, 6, 6, 5, 4, 3, 8, 2, 5, 4, 4,
       6, 8, 7, 4, 4, 0, 6, 8, 9, 9, 2, 7, 0])

Cross-validation with a pipeline

Passing the whole pipeline to cross_val_score re-fits the scaler on the training portion of each fold, so no information leaks from the validation folds.


In [6]:
# in new versions: from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_score
cross_val_score(pipeline, X_train, y_train)


Out[6]:
array([ 0.97787611,  0.97327394,  0.97757848])
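
Three scores are returned because this version of scikit-learn defaults to 3-fold (stratified) cross-validation; the number of folds and the scoring metric can be set explicitly. A sketch, assuming a recent scikit-learn where cross_val_score lives in sklearn.model_selection:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation, scoring by accuracy
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="accuracy")
print(scores.mean(), scores.std())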

Grid Search with a pipeline

Parameters of the pipeline's steps are addressed as <step name>__<parameter name>, so svm__C and svm__gamma reach the C and gamma parameters of the "svm" step.


In [7]:
import numpy as np
# in new versions: from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import GridSearchCV

param_grid = {'svm__C': 10. ** np.arange(-3, 3),
              'svm__gamma': 10. ** np.arange(-3, 3)}

grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)

In [8]:
grid_pipeline.fit(X_train, y_train)


Out[8]:
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'svm__C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02]), 'svm__gamma': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
grid_pipeline.score(X_test, y_test)


Out[9]:
0.98222222222222222
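
The best parameter combination and its mean cross-validated score can be inspected after the search, and since refit=True (the default) the pipeline re-fit on all of X_train with those parameters is available as best_estimator_; a quick sketch:

print(grid_pipeline.best_params_)   # e.g. values for 'svm__C' and 'svm__gamma'
print(grid_pipeline.best_score_)    # mean cross-validated score of the best setting
best_pipe = grid_pipeline.best_estimator_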
