## Building a pipeline

``````

In [1]:

# %pylab populates the interactive namespace from numpy and matplotlib;
# later cells rely on the `np` alias it provides.
%pylab inline

``````
``````

Populating the interactive namespace from numpy and matplotlib

``````
``````

In [2]:

import sklearn
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

``````
``````

In [3]:

# Load the digits dataset explicitly so this cell works on a fresh kernel
# instead of depending on a `digits` object left over from an earlier session.
digits = load_digits()
X_digits = digits.data      # feature matrix fed to the pipeline
y_digits = digits.target    # class labels

``````
``````

In [4]:

# Two-step pipeline: PCA runs first, then logistic regression is applied to
# its output. The step names ("pca", "logistic") are the prefixes used later
# to address each step's parameters in the grid search.
pca = PCA()
logistic = LogisticRegression()
pipe = Pipeline(
    steps=[
        ("pca", pca),
        ("logistic", logistic),
    ]
)

``````
``````

In [5]:

# Fit every step of the pipeline (PCA, then logistic regression) on the digits data.
pipe.fit(X_digits, y_digits)

``````
``````

Out[5]:

Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False))])

``````
``````

In [6]:

# Quick sanity check: predict the label of the first sample only
# (the [:1] slice passes a one-row batch rather than a single flat row).
pipe.predict(X_digits[:1])

``````
``````

Out[6]:

array([0])

``````

## Finding the best model

``````

In [7]:

from sklearn.grid_search import GridSearchCV

``````
``````

In [8]:

# Candidate values for the grid search. Keys of the form `<step>__<param>`
# address a parameter of the named pipeline step.
n_components = [20, 40, 64]  # number of components kept by PCA
# num=3 with numpy's default endpoint=True gives exactly 1e-4, 1e-2 and 1.
Cs = np.logspace(-4, 0, num=3)  # inverse of regularization strength
penalty = ["l1", "l2"]  # norm used by the logistic regression penalization
class_weight = [None, "balanced"]  # weights associated with classes

param_grid = {
    "pca__n_components": n_components,
    "logistic__C": Cs,
    "logistic__class_weight": class_weight,
    "logistic__penalty": penalty,
}

# Search over every combination in the grid with 5-fold cross-validation,
# using 8 parallel jobs.
estimator = GridSearchCV(pipe, param_grid, n_jobs=8, cv=5)
estimator.fit(X_digits, y_digits)

``````
``````

Out[8]:

GridSearchCV(cv=5, error_score='raise',
estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False))]),
fit_params={}, iid=True, n_jobs=8,
param_grid={'logistic__class_weight': [None, 'balanced'], 'logistic__C': array([  1.00000e-04,   1.00000e-02,   1.00000e+00]), 'logistic__penalty': ['l1', 'l2'], 'pca__n_components': [20, 40, 64]},
pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

``````
``````

In [9]:

# Show the mean and std of the cross-validation score for every parameter
# combination that was evaluated.
# NOTE(review): `grid_scores_` belongs to the legacy sklearn.grid_search API
# (newer releases expose `cv_results_` instead) — confirm the targeted
# scikit-learn version before re-running.
estimator.grid_scores_

``````
``````

Out[9]:

[mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
mean: 0.87702, std: 0.03613, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
mean: 0.87590, std: 0.03502, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
mean: 0.89482, std: 0.03603, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
mean: 0.89538, std: 0.03581, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
mean: 0.89538, std: 0.03581, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
mean: 0.88481, std: 0.03741, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
mean: 0.90484, std: 0.03247, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
mean: 0.90262, std: 0.03276, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
mean: 0.89482, std: 0.03603, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
mean: 0.89649, std: 0.03544, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
mean: 0.89649, std: 0.03544, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
mean: 0.88481, std: 0.03862, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
mean: 0.90484, std: 0.03247, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
mean: 0.90373, std: 0.03219, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
mean: 0.91653, std: 0.02974, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
mean: 0.92209, std: 0.03039, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
mean: 0.91820, std: 0.02980, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
mean: 0.91820, std: 0.02660, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
mean: 0.91820, std: 0.03286, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
mean: 0.91653, std: 0.03731, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
mean: 0.91653, std: 0.02974, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
mean: 0.92154, std: 0.03069, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
mean: 0.91820, std: 0.02980, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
mean: 0.91875, std: 0.02700, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
mean: 0.91708, std: 0.03293, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
mean: 0.91597, std: 0.03789, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 1.0}]

``````
``````

In [10]:

# Report the best cross-validated score, then the parameters that achieved it,
# one per line.
print("%s\n%s" % (estimator.best_score_, estimator.best_params_))

``````
``````

0.922092376183
{'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0}

``````

## Exercise

``````

In [ ]:

``````