KNN PCA


In [1]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt

from operator import itemgetter
from tabulate import tabulate

from sklearn.preprocessing    import StandardScaler
from sklearn.decomposition    import PCA 
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.pipeline         import Pipeline
from sklearn.grid_search      import GridSearchCV
from sklearn.grid_search      import RandomizedSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics          import confusion_matrix, classification_report

import sys, math, time

# private functions
sys.path.append('/home/george/Dropbox/MNIST/src') 
from MNIST_utilities import load_all_MNIST,        \
                            plot_confusion_matrix, \
                            print_imgs,            \
                            GridSearchHeatmap,     \
                            plot_learning_curve

%matplotlib inline

In [2]:
#%qtconsole

Load the MNIST data


In [3]:
# load MNIST here
trainX, trainY, testX, testY, _ , testXoriginal = \
            load_all_MNIST(portion=1.0, create_X_copy=True)


trainX shape: (120000, 784)
trainY shape: (120000,)

testX shape: (10000, 784)
testY shape: (10000, 1)
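
load_all_MNIST is a private helper (and the 120,000 training rows suggest an augmented training set). For readers without MNIST_utilities, a rough, hypothetical stand-in using the public loader available in scikit-learn versions of this era could look like the sketch below; the function name, the standard 60k/10k split, and the absence of augmentation are assumptions, not the author's code.

# Hypothetical stand-in for load_all_MNIST (no augmentation, standard 60k/10k split).
from sklearn.datasets import fetch_mldata

def load_mnist_public():
    mnist = fetch_mldata('MNIST original')        # data: (70000, 784), target: (70000,)
    X = mnist.data.astype(np.float64)
    y = mnist.target.astype(np.int32)
    trainX, testX = X[:60000], X[60000:]
    trainY, testY = y[:60000], y[60000:]
    return trainX, trainY, testX, testY

# trainX, trainY, testX, testY = load_mnist_public()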

Note that distance measures, such as the Euclidean distance KNN relies on, can be significantly affected by differences in variable scale


In [4]:
# ... but standardizing the pixels made the test-set predictions a lot worse (with PCA), so the scaler is left commented out
#scaler = StandardScaler()
#trainX = scaler.fit_transform(trainX)
#testX  = scaler.transform(testX)
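
A tiny toy illustration of the point above (nothing to do with MNIST): when one feature's range dwarfs another's, it dominates the Euclidean distance that KNN relies on, and standardizing removes that dominance. MNIST pixels already share a common 0-255 scale, which is part of why standardizing buys little here and, as noted, actually hurt the PCA+KNN test accuracy.

# Toy example: feature 2 spans a much larger range than feature 1,
# so it dominates the Euclidean distance until both are standardized.
a = np.array([[1.0, 100.0],
              [2.0, 900.0]])
print(np.linalg.norm(a[0] - a[1]))                # ~800.0, driven almost entirely by feature 2

a_scaled = StandardScaler().fit_transform(a)
print(np.linalg.norm(a_scaled[0] - a_scaled[1]))  # ~2.83, both features contribute equally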

Perform a grid search to find the best PCA & KNN parameter combination


In [5]:
t0 = time.time()

pca   = PCA()
knn   = KNeighborsClassifier(weights='distance', p=2)

pipe  = Pipeline(steps=[('pca', pca), ('knn', knn)])

pca.fit(trainX)
var_explained = np.cumsum(pca.explained_variance_ratio_)

# Parameters of pipelines can be set using '__'-separated parameter names:
#   try every 5th value of n_components between the component counts that explain ~75% and ~90% of the variance
search_grid = dict(pca__n_components = np.arange(np.argmax(var_explained>0.75),
                                                 np.argmax(var_explained>=0.90), 5),
                   knn__n_neighbors  = np.arange(1,8+1))

# ----------------------------------------------------------------------------
# RandomizedSearchCV is only useful when the grid has more points than n_iter;
# otherwise just search the whole grid exhaustively
n_iter      = 100
grid_points = 1
for value in search_grid.itervalues():
    grid_points *= len(value)
print("Total points in the search grid: {}".format(grid_points))

if grid_points <= n_iter:
    estimator = GridSearchCV(estimator = pipe, param_grid = search_grid,
                             cv        = StratifiedKFold(y = trainY, n_folds = 5),
                             n_jobs=-1, pre_dispatch=10, verbose=1)
    
else:
    estimator = RandomizedSearchCV(estimator = pipe, param_distributions = search_grid, n_iter = n_iter,
                                   cv        = StratifiedKFold(y = trainY, n_folds = 5),
                                   n_jobs=-1, pre_dispatch=10, verbose=1)
# ----------------------------------------------------------------------------


estimator.fit(trainX, trainY)

print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))


Total points in the search grid: 80
Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed: 35.9min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 161.6min
[Parallel(n_jobs=-1)]: Done 392 out of 400 | elapsed: 330.6min remaining:  6.7min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 336.0min finished
time in minutes 336.45

In [6]:
# shows where the selected number of components lies
# along the continuum of the explained variance
# --------------------------------------------------

n_components_chosen = estimator.best_estimator_.named_steps['pca'].n_components

# cumulative variance of the first n components sits at index n-1
var_expl_chosen     = var_explained[n_components_chosen - 1]


plt.figure(figsize=(10, 6))
plt.plot(var_explained, linewidth=2)
plt.xlabel('n_components chosen='+str(n_components_chosen)+ \
           '\npercent variance explained='+str(round(var_expl_chosen*100,0)))
plt.ylabel('explained_variance')
plt.ylim(0,1.1)

plt.axvline(n_components_chosen,
            linestyle=':', 
            label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()



In [7]:
# heatmap of mean cross-validation accuracy for every parameter combination in the search
# --------------------------------------------------------------

GridSearchHeatmap(estimator, y_key='pca__n_components', x_key='knn__n_neighbors')


The best parameters are {'pca__n_components': 50, 'knn__n_neighbors': 1}
with a score of 0.99, misclass of 0.0111
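
GridSearchHeatmap is another private helper. One hypothetical way such a heatmap could be built from the old-style grid_scores_ attribute (assuming an exhaustive GridSearchCV, as used here) is sketched below; the function name and styling are illustrative only.

# Hypothetical sketch of a grid-search heatmap: mean CV accuracy per parameter pair.
def grid_search_heatmap(est, y_key, x_key):
    y_vals = sorted({s.parameters[y_key] for s in est.grid_scores_})
    x_vals = sorted({s.parameters[x_key] for s in est.grid_scores_})
    scores = np.full((len(y_vals), len(x_vals)), np.nan)
    for s in est.grid_scores_:
        i = y_vals.index(s.parameters[y_key])
        j = x_vals.index(s.parameters[x_key])
        scores[i, j] = s.mean_validation_score
    plt.figure(figsize=(8, 6))
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot)
    plt.xticks(np.arange(len(x_vals)), x_vals)
    plt.yticks(np.arange(len(y_vals)), y_vals)
    plt.xlabel(x_key); plt.ylabel(y_key)
    cbar = plt.colorbar()
    cbar.set_label('mean CV accuracy')
    plt.show()

# grid_search_heatmap(estimator, y_key='pca__n_components', x_key='knn__n_neighbors')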

In [8]:
# what proportion of parameter combinations
# had a mean CV accuracy below 98%? (anything below 98% is not a contender)
# --------------------------------------------------------------------

mean_score_list = [score.mean_validation_score for score in estimator.grid_scores_]
print("\nProportion of random scores below 98%: {0:.2f}\n". \
      format(sum(np.array(mean_score_list)<0.98)/len(mean_score_list)))
    

# what do the top 10 parameter combinations look like?
# ----------------------------------------------------

for score in sorted(estimator.grid_scores_, key=itemgetter(1), reverse=True)[:10]:
    print score


Proportion of mean CV scores below 98%: 0.00

mean: 0.98886, std: 0.00051, params: {'pca__n_components': 50, 'knn__n_neighbors': 1}
mean: 0.98886, std: 0.00051, params: {'pca__n_components': 50, 'knn__n_neighbors': 2}
mean: 0.98875, std: 0.00057, params: {'pca__n_components': 65, 'knn__n_neighbors': 1}
mean: 0.98875, std: 0.00057, params: {'pca__n_components': 65, 'knn__n_neighbors': 2}
mean: 0.98862, std: 0.00038, params: {'pca__n_components': 75, 'knn__n_neighbors': 1}
mean: 0.98862, std: 0.00038, params: {'pca__n_components': 75, 'knn__n_neighbors': 2}
mean: 0.98861, std: 0.00059, params: {'pca__n_components': 60, 'knn__n_neighbors': 1}
mean: 0.98861, std: 0.00059, params: {'pca__n_components': 60, 'knn__n_neighbors': 2}
mean: 0.98857, std: 0.00051, params: {'pca__n_components': 70, 'knn__n_neighbors': 1}
mean: 0.98857, std: 0.00051, params: {'pca__n_components': 70, 'knn__n_neighbors': 2}

Predict the test data


In [9]:
target_names     = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

predicted_values = estimator.predict(testX)
y_true, y_pred   = testY, predicted_values

#print(classification_report(y_true, y_pred, target_names=target_names))

cm = confusion_matrix(y_true, y_pred)  

print(cm)
model_accuracy = sum(cm.diagonal())/len(testY)
model_misclass = 1 - model_accuracy
print("\nModel accuracy: {0}, model misclass rate: {1}".format(model_accuracy, model_misclass))

plot_confusion_matrix(cm, target_names)


[[ 973    1    0    0    0    2    2    2    0    0]
 [   1 1128    2    2    0    0    1    1    0    0]
 [   4    2 1004    3    1    0    2   13    3    0]
 [   2    0    3  982    0   10    0    4    6    3]
 [   0    1    1    0  962    0    3    1    0   14]
 [   3    1    1    4    1  870    3    1    4    4]
 [   4    4    1    0    1    1  947    0    0    0]
 [   2    5    7    1    0    0    0 1004    0    9]
 [   3    0    4    4    3    4    4    3  945    4]
 [   0    0    0    4   12    3    1    8    3  978]]

Model accuracy: 0.9793, model misclass rate: 0.0207
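
Row sums of the confusion matrix are the per-class test counts, so dividing the diagonal by them gives per-digit recall. A quick sketch (not part of the original analysis):

# Per-digit recall: correct predictions on the diagonal divided by row totals.
per_digit_recall = cm.diagonal().astype(np.float64) / cm.sum(axis=1)
for digit, recall in zip(target_names, per_digit_recall):
    print("digit {0}: recall {1:.4f}".format(digit, recall))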

In [10]:
print_imgs(images           = testXoriginal, 
           actual_labels    = y_true, 
           predicted_labels = y_pred,
           starting_index   = np.random.randint(0, high=testY.shape[0]-36, size=1)[0],
           size             = 6)


Learning Curves

  1. Do we fit the training data well? (the red training-score line stays flat, hugging 1.0)
  2. Does prediction improve with more data? (the green cross-validation line rises from left to right; see the sketch below for how such a curve can be computed)
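
plot_learning_curve is also a private helper. A minimal, hypothetical equivalent built on the old-style sklearn.learning_curve API (sklearn.model_selection.learning_curve in newer versions) might look like this; the function name, styling, and defaults are assumptions:

# Hypothetical minimal version of plot_learning_curve using the public API of this era.
from sklearn.learning_curve import learning_curve

def sketch_learning_curve(est, title, X, y, cv, ylim=None, n_jobs=-1):
    sizes, train_scores, test_scores = learning_curve(est, X, y, cv=cv, n_jobs=n_jobs)
    plt.figure(figsize=(10, 6))
    plt.plot(sizes, train_scores.mean(axis=1), 'o-', color='r', label='training score')
    plt.plot(sizes, test_scores.mean(axis=1),  'o-', color='g', label='cross-validation score')
    plt.title(title); plt.xlabel('training examples'); plt.ylabel('score')
    if ylim is not None: plt.ylim(*ylim)
    plt.legend(loc='best')
    plt.show()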

In [11]:
t0 = time.time()

parm_list = ""
for i, param in enumerate(estimator.best_params_):
    if i % 3 == 0: parm_list += "\n"
    param_val = estimator.best_params_[param]
    parm_list += param + "=" + str(param_val) + " "


plot_learning_curve(estimator = estimator.best_estimator_, 
                    title     = "KNN PCA" + parm_list, 
                    X         = trainX, 
                    y         = trainY, 
                    ylim      = (0.85, 1.01), 
                    cv        = ShuffleSplit(n            = trainX.shape[0], 
                                             n_iter       = 5, 
                                             test_size    = 0.2, 
                                             random_state = 0), 
                    n_jobs      = -1)

plt.show()

print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))


time in minutes 9.42

In [ ]: