This Jupyter notebook contains the complementary code for the Appendix section of the article "Model evaluation, model selection, and algorithm selection in machine learning - Part IV" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.

A "nested cross-validation for algorithm selection" example using scikit-learn


In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v


Sebastian Raschka 2016-09-30 

CPython 3.5.2
IPython 5.1.0

sklearn 0.17.1
mlxtend 0.4.2

In [2]:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.data import mnist_data
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
import random

np.random.seed(1)
random.seed(1)

# Loading and splitting the dataset
# Note that this is a small, stratified subset of MNIST
# consisting of only 5000 samples, that is,
# 10% of the original MNIST dataset
# http://yann.lecun.com/exdb/mnist/
X, y = mnist_data()
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.8,
                                                    random_state=1,
                                                    stratify=y)

# Initializing Classifiers
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = SVC(random_state=1)

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('clf1', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2', clf2)])

pipe4 = Pipeline([('std', StandardScaler()),
                  ('clf4', clf4)])


# Setting up the parameter grids
param_grid1 = [{'clf1__penalty': ['l2'],
                'clf1__C': np.power(10., np.arange(-4, 4))}]

param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),
                'clf2__p': [1, 2]}]

param_grid3 = [{'max_depth': list(range(1, 10)) + [None],
                'criterion': ['gini', 'entropy']}]

param_grid4 = [{'clf4__kernel': ['rbf'],
                'clf4__C': np.power(10., np.arange(-4, 4)),
                'clf4__gamma': np.power(10., np.arange(-5, 0))},
               {'clf4__kernel': ['linear'],
                'clf4__C': np.power(10., np.arange(-4, 4))}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2,
                             param_grid3, param_grid4),
                            (pipe1, pipe2, clf3, pipe4),
                            ('Softmax', 'KNN', 'DTree', 'SVM')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=2,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

In [3]:
cv_scores = {name: [] for name in gridcvs}

# scikit-learn 0.17 API: StratifiedKFold takes the labels directly,
# and the resulting object is itself iterable
skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
for c, (outer_train_idx, outer_valid_idx) in enumerate(skfold, 1):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train[outer_train_idx], y_train[outer_train_idx])
        y_pred = gs_est.predict(X_train[outer_valid_idx])
        acc = accuracy_score(y_true=y_train[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)



outer fold 1/5 | tuning DTree    | inner ACC 72.38% | outer ACC 81.25%
outer fold 1/5 | tuning KNN      | inner ACC 88.19% | outer ACC 90.62%
outer fold 1/5 | tuning SVM      | inner ACC 89.88% | outer ACC 92.62%
outer fold 1/5 | tuning Softmax  | inner ACC 88.22% | outer ACC 91.88%
outer fold 2/5 | tuning DTree    | inner ACC 75.16% | outer ACC 76.25%
outer fold 2/5 | tuning KNN      | inner ACC 88.62% | outer ACC 90.62%
outer fold 2/5 | tuning SVM      | inner ACC 90.84% | outer ACC 91.25%
outer fold 2/5 | tuning Softmax  | inner ACC 89.00% | outer ACC 90.62%
outer fold 3/5 | tuning DTree    | inner ACC 74.25% | outer ACC 78.75%
outer fold 3/5 | tuning KNN      | inner ACC 87.81% | outer ACC 93.00%
outer fold 3/5 | tuning SVM      | inner ACC 89.69% | outer ACC 92.12%
outer fold 3/5 | tuning Softmax  | inner ACC 89.03% | outer ACC 90.38%
outer fold 4/5 | tuning DTree    | inner ACC 75.03% | outer ACC 73.62%
outer fold 4/5 | tuning KNN      | inner ACC 88.88% | outer ACC 90.50%
outer fold 4/5 | tuning SVM      | inner ACC 90.78% | outer ACC 90.38%
outer fold 4/5 | tuning Softmax  | inner ACC 89.25% | outer ACC 86.50%
outer fold 5/5 | tuning DTree    | inner ACC 73.31% | outer ACC 76.25%
outer fold 5/5 | tuning KNN      | inner ACC 88.41% | outer ACC 90.88%
outer fold 5/5 | tuning SVM      | inner ACC 90.28% | outer ACC 93.00%
outer fold 5/5 | tuning Softmax  | inner ACC 88.16% | outer ACC 90.62%

In [4]:
# Looking at the results
for name in sorted(cv_scores):
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
# Note: best_params_ stems from the GridSearchCV fit
# on the last outer fold
print('\nSVM Best parameters', gridcvs['SVM'].best_params_)


DTree    | outer CV acc. 77.22% +/- 2.584
KNN      | outer CV acc. 91.13% +/- 0.945
Softmax  | outer CV acc. 90.00% +/- 1.827
SVM      | outer CV acc. 91.88% +/- 0.952

SVM Best parameters {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}

In [5]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['SVM']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))


Accuracy 90.80% (average over CV test folds)
Best Parameters: {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}
Training Accuracy: 99.92%
Test Accuracy: 93.00%

In [6]:
# Fitting a model to the whole dataset
# using the "best" algorithm and hyperparameter settings
best_clf = best_algo.best_estimator_
final_model = best_clf.fit(X, y)
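
At this point, final_model could be applied to new data or persisted for later reuse. A minimal sketch using the joblib copy that shipped with scikit-learn in this release series (the filename is illustrative):

from sklearn.externals import joblib

# Persisting the fitted model to disk for later reuse
joblib.dump(final_model, 'final_model.pkl')  # illustrative filename
# restoring it later:
# final_model = joblib.load('final_model.pkl')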

Nested CV for algorithm selection in scikit-learn 0.18


In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v


Sebastian Raschka 2016-09-30 

CPython 3.5.2
IPython 5.1.0

sklearn 0.18
mlxtend 0.4.3dev0

There were a lot of neat changes introduced in scikit-learn 0.18, released on 28 Sep 2016, that make nested CV a lot more convenient.
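
In particular, GridSearchCV now lives in the new model_selection module alongside cross_val_score, and a grid-search object can be passed to cross_val_score directly as the estimator, so the outer loop no longer has to be written by hand. The essential pattern, using the names defined in the cells below, boils down to one call:

# cross_val_score runs the outer loop; each fit of gs_est on an
# outer training fold runs the inner grid search
nested_score = cross_val_score(gs_est, X=X_train, y=y_train, cv=outer_cv)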


In [2]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.data import mnist_data
from sklearn.metrics import accuracy_score

# Loading and splitting the dataset
# Note that this is a small, stratified subset of MNIST
# consisting of only 5000 samples, that is,
# 10% of the original MNIST dataset
# http://yann.lecun.com/exdb/mnist/
X, y = mnist_data()
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.8,
                                                    random_state=1,
                                                    stratify=y)

# Initializing Classifiers
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = SVC(random_state=1)

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('clf1', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2', clf2)])

pipe4 = Pipeline([('std', StandardScaler()),
                  ('clf4', clf4)])


# Setting up the parameter grids
param_grid1 = [{'clf1__penalty': ['l2'],
                'clf1__C': np.power(10., np.arange(-4, 4))}]

param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),
                'clf2__p': [1, 2]}]

param_grid3 = [{'max_depth': list(range(1, 10)) + [None],
                'criterion': ['gini', 'entropy']}]

param_grid4 = [{'clf4__kernel': ['rbf'],
                'clf4__C': np.power(10., np.arange(-4, 4)),
                'clf4__gamma': np.power(10., np.arange(-5, 0))},
               {'clf4__kernel': ['linear'],
                'clf4__C': np.power(10., np.arange(-4, 4))}]

# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}
# 0.18 API: the fold generator is configured first; the data are
# passed later, when GridSearchCV calls its split() method internally
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

for pgrid, est, name in zip((param_grid1, param_grid2,
                             param_grid3, param_grid4),
                            (pipe1, pipe2, clf3, pipe4),
                            ('Softmax', 'KNN', 'DTree', 'SVM')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

In [3]:
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for name, gs_est in sorted(gridcvs.items()):
    nested_score = cross_val_score(gs_est, 
                                   X=X_train, 
                                   y=y_train, 
                                   cv=outer_cv,
                                   n_jobs=1)
    print('%s | outer ACC %.2f%% +/- %.2f' % 
          (name, nested_score.mean() * 100, nested_score.std() * 100))


DTree | outer ACC 77.33% +/- 2.72
KNN | outer ACC 91.10% +/- 0.96
SVM | outer ACC 91.95% +/- 1.04
Softmax | outer ACC 90.32% +/- 1.22

In [4]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['SVM']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))


Accuracy 91.03% (average over CV test folds)
Best Parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}
Training Accuracy: 99.92%
Test Accuracy: 93.00%