In [1]:
# Python 2 notebook: true division everywhere (matches the later use of
# `print score` statements and dict.itervalues()).
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# NOTE(review): sklearn.grid_search and sklearn.cross_validation are the
# pre-0.18 module locations (removed in 0.20); a modern sklearn needs
# sklearn.model_selection instead.
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
import sys, math, time
# private functions
# NOTE(review): hardcoded absolute local path -- breaks on any other machine;
# consider a configurable project root.
sys.path.append('/home/george/Dropbox/MNIST/src')
from MNIST_utilities import load_all_MNIST, \
                            plot_confusion_matrix, \
                            print_imgs, \
                            GridSearchHeatmap, \
                            plot_learning_curve
%matplotlib inline
In [2]:
#%qtconsole
In [3]:
# Load the full MNIST train/test split; keep an untouched copy of the test
# images (testXoriginal) for later visualization.  The fifth return value
# (a training-set copy) is not needed here.
(trainX, trainY,
 testX, testY,
 _, testXoriginal) = load_all_MNIST(portion=1.0, create_X_copy=True)
In [4]:
# ... but scaling makes the test-set predictions a lot worse (for PCA)
#scaler = StandardScaler()
#trainX = scaler.fit_transform(trainX)
#testX = scaler.transform(testX)
In [5]:
t0 = time.time()

# Fit PCA once on the whole training set so we can inspect the cumulative
# explained-variance curve and choose a sensible search range for
# n_components.  (The pipeline below re-fits PCA per CV fold anyway.)
pca = PCA()
knn = KNeighborsClassifier(weights='distance', p=2)
pipe = Pipeline(steps=[('pca', pca), ('knn', knn)])
pca.fit(trainX)
var_explained = np.cumsum(pca.explained_variance_ratio_)

# Parameters of pipelines can be set using '__' separated parameter names.
# var_explained[k] is the variance explained by k+1 components, so add 1 to
# turn the argmax *indices* into component *counts* (bug fix: the old code
# passed the raw indices, undercounting n_components by one).
# Search every 5th n_components between ~75% and ~90% explained variance.
n_comp_lo = np.argmax(var_explained > 0.75) + 1
n_comp_hi = np.argmax(var_explained >= 0.90) + 1
search_grid = dict(pca__n_components = np.arange(n_comp_lo, n_comp_hi, 5),
                   knn__n_neighbors  = np.arange(1, 8+1))

# ----------------------------------------------------------------------------
# RandomizedSearchCV requires more grid points than sampling iterations;
# otherwise fall back to an exhaustive GridSearchCV over the full grid.
n_iter = 100
grid_points = 1
for value in search_grid.itervalues():
    grid_points *= len(value)
print("Total points in the search grid: {}".format(grid_points))

if grid_points <= n_iter:
    estimator = GridSearchCV(estimator = pipe, param_grid = search_grid,
                             cv = StratifiedKFold(y = trainY, n_folds = 5),
                             n_jobs=-1, pre_dispatch=10, verbose=1)
else:
    estimator = RandomizedSearchCV(estimator = pipe, param_distributions = search_grid, n_iter = n_iter,
                                   cv = StratifiedKFold(y = trainY, n_folds = 5),
                                   n_jobs=-1, pre_dispatch=10, verbose=1)
# ----------------------------------------------------------------------------

estimator.fit(trainX, trainY)
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))
In [6]:
# shows where the selected number of components lies
# along the continuum of the explained variance
# --------------------------------------------------
n_components_chosen = estimator.best_estimator_.named_steps['pca'].n_components

# var_explained[k] holds the cumulative variance explained by k+1 components,
# so an n-component model's figure lives at index n-1 (bug fix: the old code
# indexed at n, reporting the variance explained by n+1 components).
var_expl_chosen = var_explained[max(n_components_chosen - 1, 0)]

plt.figure(figsize=(10, 6))
plt.plot(var_explained, linewidth=2)
plt.xlabel('n_components chosen='+str(n_components_chosen)+ \
           '\npercent variance explained='+str(round(var_expl_chosen*100,0)))
plt.ylabel('explained_variance')
plt.ylim(0,1.1)
# vertical marker at the winning n_components
plt.axvline(n_components_chosen,
            linestyle=':',
            label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()
In [7]:
# shows how well our grid search selected parameter combinations
# --------------------------------------------------------------
# Heatmap of mean CV score over the (n_components x n_neighbors) grid.
# NOTE(review): GridSearchHeatmap comes from the private MNIST_utilities
# module -- presumably it reads estimator.grid_scores_; confirm it tolerates
# the sparse, randomly-sampled grid produced by RandomizedSearchCV.
GridSearchHeatmap(estimator, y_key='pca__n_components', x_key='knn__n_neighbors')
In [8]:
# what proportion of parameter combinations
# had an accuracy below 98% (anything 98% or below is not a contender)
# --------------------------------------------------------------------
mean_score_list = [score.mean_validation_score for score in estimator.grid_scores_]
print("\nProportion of random scores below 98%: {0:.2f}\n". \
format(sum(np.array(mean_score_list)<0.98)/len(mean_score_list)))
# what do the top 10 parameter combinations look like?
# ----------------------------------------------------
for score in sorted(estimator.grid_scores_, key=itemgetter(1), reverse=True)[:10]:
print score
In [9]:
# Evaluate the tuned pipeline on the held-out test set.
target_names = [str(digit) for digit in range(10)]
y_true = testY
y_pred = estimator.predict(testX)
#print(classification_report(y_true, y_pred, target_names=target_names))

cm = confusion_matrix(y_true, y_pred)
print(cm)

# accuracy = correctly classified (confusion-matrix diagonal) / test size
model_accuracy = cm.diagonal().sum() / len(testY)
model_misclass = 1 - model_accuracy
print("\nModel accuracy: {0}, model misclass rate: {1}".format(model_accuracy, model_misclass))

plot_confusion_matrix(cm, target_names)
In [10]:
# Display a 6x6 grid of original test images annotated with their actual vs.
# predicted labels, starting at a random position so misclassifications can
# be eyeballed.
# NOTE(review): np.random.randint is unseeded, so each run shows a different
# window of the test set -- fine for browsing, but not reproducible.
print_imgs(images = testXoriginal,
           actual_labels = y_true,
           predicted_labels = y_pred,
           # random start index, leaving room for a full 36-image grid
           starting_index = np.random.randint(0, high=testY.shape[0]-36, size=1)[0],
           size = 6)
In [11]:
t0 = time.time()

# Build a short "key=value" summary of the winning hyper-parameters for the
# plot title, wrapping to a new line every 3 entries.  Iterate in sorted key
# order so the title is deterministic (bug fix: plain dict iteration order
# is arbitrary in Python 2, so the title text could vary between runs).
parm_list = ""
for i, (param, param_val) in enumerate(sorted(estimator.best_params_.items())):
    if i % 3 == 0:
        parm_list += "\n"
    parm_list += param + "=" + str(param_val) + " "

# Learning curve for the best pipeline over 5 random 80/20 splits of the
# training data (seeded for reproducibility).
plot_learning_curve(estimator = estimator.best_estimator_,
                    title = "KNN PCA" + parm_list,
                    X = trainX,
                    y = trainY,
                    ylim = (0.85, 1.01),
                    cv = ShuffleSplit(n = trainX.shape[0],
                                      n_iter = 5,
                                      test_size = 0.2,
                                      random_state = 0),
                    n_jobs = -1)
plt.show()
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))
In [ ]: