Building a pipeline


In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [3]:
# Load the digits dataset once, then split the returned bunch into the
# feature matrix (`data`) and the label vector (`target`).
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

In [4]:
# Two-stage model: PCA transforms the features, then logistic regression
# classifies the transformed data. The step names ('pca', 'logistic') are the
# prefixes used later to address each step's parameters in the grid search.
pca = PCA()
logistic = LogisticRegression()
pipe = Pipeline(
    steps=[
        ('pca', pca),
        ('logistic', logistic),
    ]
)

In [5]:
# Fit the whole pipeline (PCA, then logistic regression) on the full dataset.
# The call is left as the last expression so the fitted pipeline is displayed.
pipe.fit(X_digits, y_digits)


Out[5]:
Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [6]:
# Sanity check: predict the label of the first sample. The slice `[:1]`
# (rather than `[0]`) passes a one-row batch instead of a single flat row.
pipe.predict(X_digits[:1])


Out[6]:
array([0])

Finding the best model


In [7]:
from sklearn.grid_search import GridSearchCV

In [8]:
# Candidate values for each hyper-parameter explored by the search.
n_components = [20, 40, 64]  # number of components kept by PCA
# Three log-spaced values from 1e-4 to 1e0. The stray fourth positional
# argument (4) was dropped: it was bound to `endpoint`, where any truthy value
# behaves like the default True, so the generated values are unchanged.
Cs = np.logspace(-4, 0, num=3)  # inverse of regularization strength
penalty = ["l1", "l2"]  # norm used by the logistic regression penalization
class_weight = [None, "balanced"]  # weights associated with classes

# Keys follow the Pipeline convention `<step name>__<parameter name>`.
param_grid = {
    "pca__n_components": n_components,
    "logistic__C": Cs,
    "logistic__class_weight": class_weight,
    "logistic__penalty": penalty,
}

# Exhaustive search over every combination in the grid (3 * 3 * 2 * 2 = 36
# candidates), each scored with 5-fold cross-validation on 8 parallel jobs.
estimator = GridSearchCV(pipe, param_grid, n_jobs=8, cv=5)
# Left as the last expression so the fitted search object is displayed.
estimator.fit(X_digits, y_digits)


Out[8]:
GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'logistic__class_weight': [None, 'balanced'], 'logistic__C': array([  1.00000e-04,   1.00000e-02,   1.00000e+00]), 'logistic__penalty': ['l1', 'l2'], 'pca__n_components': [20, 40, 64]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
# Per-candidate summary of the search: mean and standard deviation of the
# cross-validation score together with the parameter combination evaluated.
estimator.grid_scores_


Out[9]:
[mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.87702, std: 0.03613, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.87590, std: 0.03502, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.89482, std: 0.03603, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89538, std: 0.03581, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89538, std: 0.03581, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.88481, std: 0.03741, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90484, std: 0.03247, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90262, std: 0.03276, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.89482, std: 0.03603, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89649, std: 0.03544, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89649, std: 0.03544, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.88481, std: 0.03862, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90484, std: 0.03247, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90373, std: 0.03219, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.91653, std: 0.02974, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.92209, std: 0.03039, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.02980, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.02660, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.03286, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91653, std: 0.03731, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91653, std: 0.02974, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.92154, std: 0.03069, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.02980, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91875, std: 0.02700, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91708, std: 0.03293, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91597, std: 0.03789, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 1.0}]

In [10]:
# Best mean cross-validation score found by the search ...
print(estimator.best_score_)
# ... and the parameter combination that achieved it.
print(estimator.best_params_)


0.922092376183
{'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0}

Exercise


In [ ]: