This Jupyter notebook contains the complementary code for the Appendix section of the article "Model evaluation, model selection, and algorithm selection in machine learning - Part IV" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html. The first version below uses the pre-0.18 scikit-learn API (sklearn.grid_search and sklearn.cross_validation); the second version uses the model_selection module introduced in scikit-learn 0.18.
In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v
In [2]:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.data import mnist_data
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
import random
np.random.seed(1)
random.seed(1)
# Loading and splitting the dataset
# Note that this is a small (stratified) subset
# of MNIST; it consists of 5000 samples only, that is,
# 10% of the original MNIST dataset
# http://yann.lecun.com/exdb/mnist/
X, y = mnist_data()
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     train_size=0.8,
                                                     random_state=1,
                                                     stratify=y)
# Initializing Classifiers
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = SVC(random_state=1)
# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('clf1', clf1)])
pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2', clf2)])
pipe4 = Pipeline([('std', StandardScaler()),
                  ('clf4', clf4)])
# Setting up the parameter grids
param_grid1 = [{'clf1__penalty': ['l2'],
                'clf1__C': np.power(10., np.arange(-4, 4))}]

param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),
                'clf2__p': [1, 2]}]

param_grid3 = [{'max_depth': list(range(1, 10)) + [None],
                'criterion': ['gini', 'entropy']}]

param_grid4 = [{'clf4__kernel': ['rbf'],
                'clf4__C': np.power(10., np.arange(-4, 4)),
                'clf4__gamma': np.power(10., np.arange(-5, 0))},
               {'clf4__kernel': ['linear'],
                'clf4__C': np.power(10., np.arange(-4, 4))}]
# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}
for pgrid, est, name in zip((param_grid1, param_grid2,
                             param_grid3, param_grid4),
                            (pipe1, pipe2, clf3, pipe4),
                            ('Softmax', 'KNN', 'DTree', 'SVM')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=2,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv
In [3]:
cv_scores = {name: [] for name, gs_est in gridcvs.items()}
skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)
# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold:
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train[outer_train_idx], y_train[outer_train_idx])
        y_pred = gs_est.predict(X_train[outer_valid_idx])
        acc = accuracy_score(y_true=y_train[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)
    c += 1
In [4]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print('\nSVM Best parameters', gridcvs['SVM'].best_params_)
In [5]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['SVM']
best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))
print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
In [6]:
# Fitting a model to the whole dataset
# using the "best" algorithm and hyperparameter settings
best_clf = best_algo.best_estimator_
final_model = best_clf.fit(X, y)
In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v
There were a lot of neat changes introduced in scikit-learn 0.18, released on 28 Sep 2016, that make nested CV a lot more convenient.
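In particular, a GridSearchCV object can be passed directly to cross_val_score, and StratifiedKFold is now a reusable splitter object, so the inner and outer loops of nested CV reduce to a few lines. Below is a minimal sketch of that idiom; the SVC estimator and the small C grid are placeholders for illustration only, not the grids used in the full setup that follows in the next cell.

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from mlxtend.data import mnist_data

X, y = mnist_data()

inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Inner loop: hyperparameter tuning via grid search
# (GridSearchCV is itself an estimator, so it can be cross-validated)
gs = GridSearchCV(estimator=SVC(random_state=1),
                  param_grid={'C': [0.1, 1.0, 10.0]},  # placeholder grid
                  scoring='accuracy',
                  cv=inner_cv)

# Outer loop: performance estimate of the tuned model on held-out folds
nested_scores = cross_val_score(gs, X=X.astype(np.float32), y=y, cv=outer_cv)
print('Nested CV accuracy: %.2f%% +/- %.2f' %
      (nested_scores.mean() * 100, nested_scores.std() * 100))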
In [2]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.data import mnist_data
from sklearn.metrics import accuracy_score
# Loading and splitting the dataset
# Note that this is a small (stratified) subset
# of MNIST; it consists of 5000 samples only, that is,
# 10% of the original MNIST dataset
# http://yann.lecun.com/exdb/mnist/
X, y = mnist_data()
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     train_size=0.8,
                                                     random_state=1,
                                                     stratify=y)
# Initializing Classifiers
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          random_state=1)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = SVC(random_state=1)
# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('clf1', clf1)])
pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2', clf2)])
pipe4 = Pipeline([('std', StandardScaler()),
                  ('clf4', clf4)])
# Setting up the parameter grids
param_grid1 = [{'clf1__penalty': ['l2'],
                'clf1__C': np.power(10., np.arange(-4, 4))}]

param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),
                'clf2__p': [1, 2]}]

param_grid3 = [{'max_depth': list(range(1, 10)) + [None],
                'criterion': ['gini', 'entropy']}]

param_grid4 = [{'clf4__kernel': ['rbf'],
                'clf4__C': np.power(10., np.arange(-4, 4)),
                'clf4__gamma': np.power(10., np.arange(-5, 0))},
               {'clf4__kernel': ['linear'],
                'clf4__C': np.power(10., np.arange(-4, 4))}]
# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
for pgrid, est, name in zip((param_grid1, param_grid2,
                             param_grid3, param_grid4),
                            (pipe1, pipe2, clf3, pipe4),
                            ('Softmax', 'KNN', 'DTree', 'SVM')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv
In [3]:
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, gs_est in sorted(gridcvs.items()):
    nested_score = cross_val_score(gs_est,
                                   X=X_train,
                                   y=y_train,
                                   cv=outer_cv,
                                   n_jobs=1)
    print('%s | outer ACC %.2f%% +/- %.2f' %
          (name, nested_score.mean() * 100, nested_score.std() * 100))
In [4]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['SVM']
best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))
print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
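As in the pre-0.18 version above, the selected algorithm with its best hyperparameter settings can then be refit on the whole dataset. A minimal sketch mirroring cell [6] of the first version:

# Refit the best estimator found by the grid search on the complete dataset
# (same final step as in the pre-0.18 version above)
best_clf = best_algo.best_estimator_
final_model = best_clf.fit(X, y)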