In [ ]:
# Emit a short banner string so the notebook's output identifies itself.
print('ex01')

In [ ]:
import numpy as np
import pandas as pd

3.1 Problem description

Try to build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set. Hint: the KNeighborsClassifier works quite well for this task; you just need to find good hyperparameter values (try a grid search on the weights and n_neighbors hyperparameters).

Load the data


In [ ]:
from scipy.io import loadmat
# Read the MATLAB-format MNIST file from the local datasets directory.
# The result is indexed by string keys ('data', 'label') further down.
mnist = loadmat('./datasets/mnist-original.mat')

In [ ]:
# Display the loaded object to see which entries the file contains.
mnist

In [ ]:
# Pull the raw feature matrix and the label array out of the loaded file.
X = mnist['data']
y = mnist['label']

In [ ]:
# Transpose straight from the loaded data rather than from X itself, so that
# re-running this cell never flips the array back to its raw orientation.
# After this step axis 0 is the one sliced by the train/test split below
# (presumably one row per image -- confirm with the shape shown here).
X = mnist['data'].T
X.shape

In [ ]:
# Transpose straight from the loaded labels rather than from y itself, so that
# re-running this cell is idempotent instead of toggling the orientation.
# Later cells compare predictions against an (n, 1) column, so y is expected
# to come out with one label per row -- confirm with the shape shown here.
y = mnist['label'].T
y.shape

In [ ]:
# Check which container type the label array has after loading/transposing.
type(y)

In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

Split test and training data


In [ ]:
# The first 60000 rows form the training set; everything after is the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [ ]:
# Sanity check: number of training rows produced by the split.
len(X_train)

In [ ]:
# Shuffle the training rows with a fixed seed so that a fresh
# "Restart Kernel -> Run All" reproduces the same ordering and, with it,
# the same downstream scores. The same index is applied to X and y so that
# every row keeps its own label.
shuffle_rng = np.random.RandomState(42)
shuffle_index = shuffle_rng.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

3.2 Training a Random Forest Classifier as a baseline

A Random Forest Classifier is used for the baseline because it runs faster than a linear model.


In [ ]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model with default hyperparameters; random_state is pinned so the
# fitted forest is the same on every run.
forest_clf = RandomForestClassifier(random_state=42)

In [ ]:
# scikit-learn expects a 1-D target; y_train is stored as a column, so flatten
# it here to avoid a DataConversionWarning. ravel() on an already 1-D array is
# a no-op, so this stays correct if y_train is flattened upstream.
forest_clf.fit(X_train, y_train.ravel())

In [ ]:
# Predict a label for every row of the held-out test set.
forest_pred = forest_clf.predict(X_test)

In [ ]:
# Turn the predictions into a single column; -1 lets NumPy infer the row
# count instead of hard-coding the test-set size.
forest_pred = forest_pred.reshape(-1, 1)

In [ ]:
# Flatten both sides before comparing. Comparing an (n,) array against an
# (n, 1) array would broadcast into an n-by-n matrix and silently produce a
# meaningless score, so do not rely on both arrays sharing the same shape.
accuracy = np.mean(forest_pred.ravel() == y_test.ravel())
print(accuracy)

3.3 Training a KNeighborsClassifier with default settings

It seems we have to set n_jobs = -1 (use all CPU cores) so that prediction runs within a reasonable time.


In [ ]:
from sklearn.neighbors import KNeighborsClassifier

In [ ]:
# k-NN with default hyperparameters; n_jobs=-1 asks scikit-learn to run the
# neighbor searches on all available CPU cores.
knn_clf = KNeighborsClassifier(n_jobs=-1)

In [ ]:
# scikit-learn expects a 1-D target; y_train is stored as a column, so flatten
# it here to avoid a DataConversionWarning. ravel() on an already 1-D array is
# a no-op, so this stays correct if y_train is flattened upstream.
knn_clf.fit(X_train, y_train.ravel())

In [ ]:
# Smoke test on a single sample: slicing keeps the 2-D (1, n_features) shape
# that predict() expects.
knn_clf.predict(X_test[:1])

In [ ]:
# for i in range(1000):
#     knn_clf.predict([X_test[i]])

In [ ]:
# Predict a label for every row of the held-out test set (this is the slow
# step for k-NN, since each test row is compared against the training data).
knn_pred = knn_clf.predict(X_test)

In [ ]:
# Keep predictions as a single column; -1 lets NumPy infer the row count
# instead of hard-coding the test-set size.
knn_pred = knn_pred.reshape(-1, 1)
# Flatten both sides before comparing so that a shape mismatch can never
# broadcast into an n-by-n matrix and silently distort the score.
accuracy = np.mean(knn_pred.ravel() == y_test.ravel())
print(accuracy)

3.4 GridSearchCV


In [ ]:
from sklearn.model_selection import GridSearchCV

In [ ]:
# Search space for the k-NN grid search: 4 neighbor counts x 2 weighting
# schemes, with n_jobs held fixed at -1 for every candidate.
param_grid = [
    {
        'n_jobs': [-1],
        'n_neighbors': [3, 5, 11, 19],
        'weights': ['uniform', 'distance'],
    },
]

In [ ]:
# 3-fold cross-validated search over param_grid, scored by accuracy and
# parallelised across all available cores.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [ ]:
# Pass a 1-D target: with a column-shaped y_train scikit-learn would emit a
# DataConversionWarning for every candidate/fold fit in the search.
# ravel() on an already 1-D array is a no-op.
grid_search.fit(X_train, y_train.ravel())