Building a pipeline



In [1]:

    
# Enable inline plots and pull numpy/matplotlib names into the interactive
# namespace; later cells rely on the `np` alias that this magic provides.
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA



In [3]:

    
# Load the digits dataset and split it into the feature matrix and the labels.
digits = load_digits()
X_digits, y_digits = digits.data, digits.target



In [4]:

    
# Chain two steps into a single estimator: PCA first, then logistic regression.
# The step names ('pca', 'logistic') are what the grid search uses as prefixes.
pca = PCA()
logistic = LogisticRegression()
pipe = Pipeline(steps=[
    ('pca', pca),
    ('logistic', logistic),
])



In [5]:

    
# Fit every step of the pipeline (PCA, then logistic regression) on the
# complete digits data; no samples are held out here.
pipe.fit(X_digits, y_digits)









    Out[5]:





Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])



In [6]:

    
# Predict the label of the first sample; the [:1] slice passes a one-row batch
# instead of a single bare sample.
pipe.predict(X_digits[:1])









    Out[6]:





array([0])

Finding the best model



In [7]:

    
from sklearn.grid_search import GridSearchCV



In [8]:

    
# Candidate values for the grid search over the pipeline's hyper-parameters.
n_components = [20, 40, 64]  # number of components kept by PCA
# Three log-spaced values: 1e-4, 1e-2 and 1. `num` is passed by keyword because
# a fourth positional argument to np.logspace is interpreted as `endpoint`,
# not as an extra sizing parameter.
Cs = np.logspace(-4, 0, num=3)  # inverse of regularization strength
penalty = ["l1", "l2"]  # norm used by the logistic regression penalty
class_weight = [None, "balanced"]  # weights associated with classes

# Keys follow the "<step name>__<parameter>" convention so that each entry is
# routed to the matching step of `pipe` ('pca' or 'logistic').
param_grid = {
    "pca__n_components": n_components,
    "logistic__C": Cs,
    "logistic__class_weight": class_weight,
    "logistic__penalty": penalty,
}

# Exhaustively evaluate every combination with 5-fold cross-validation,
# running up to 8 jobs in parallel.
estimator = GridSearchCV(pipe, param_grid, n_jobs=8, cv=5)
estimator.fit(X_digits, y_digits)









    Out[8]:





GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'logistic__class_weight': [None, 'balanced'], 'logistic__C': array([  1.00000e-04,   1.00000e-02,   1.00000e+00]), 'logistic__penalty': ['l1', 'l2'], 'pca__n_components': [20, 40, 64]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)



In [9]:

    
# Show the cross-validation summary of every candidate: mean score, standard
# deviation and the parameter combination that produced them.
estimator.grid_scores_









    Out[9]:





[mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.87702, std: 0.03613, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.87590, std: 0.03502, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.89482, std: 0.03603, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89538, std: 0.03581, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89538, std: 0.03581, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.88481, std: 0.03741, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90484, std: 0.03247, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90262, std: 0.03276, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.89482, std: 0.03603, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89649, std: 0.03544, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.89649, std: 0.03544, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.01},
 mean: 0.88481, std: 0.03862, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90484, std: 0.03247, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.90373, std: 0.03219, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.01},
 mean: 0.91653, std: 0.02974, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.92209, std: 0.03039, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.02980, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.02660, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.03286, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91653, std: 0.03731, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91653, std: 0.02974, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.92154, std: 0.03069, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91820, std: 0.02980, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 1.0},
 mean: 0.91875, std: 0.02700, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91708, std: 0.03293, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 1.0},
 mean: 0.91597, std: 0.03789, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 1.0}]



In [10]:

    
# Report the best cross-validated score, then the parameters that achieved it.
for best_item in (estimator.best_score_, estimator.best_params_):
    print(best_item)









    



0.922092376183
{'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0}

Exercise

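Find the best model for the diabetes dataset. Note that, unlike digits, diabetes is a regression problem, so build the pipeline around a regressor rather than `LogisticRegression`.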

http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes



In [ ]: