In [ ]:
print('ex01')
In [ ]:
import numpy as np
import pandas as pd
Try to build a classifier for the MNIST dataset that achieves over 97% accuracy
on the test set. Hint: the KNeighborsClassifier
works quite well for this task;
you just need to find good hyperparameter values (try a grid search on the
weights and n_neighbors hyperparameters).
In [ ]:
from scipy.io import loadmat
# Load the MNIST dump from the local MATLAB-format file. The loaded object is
# indexed by the 'data' and 'label' keys in the next steps.
mnist = loadmat('./datasets/mnist-original.mat')
In [ ]:
mnist
In [ ]:
# Pull the feature array and the label array out of the loaded .mat contents.
X = mnist['data']
y = mnist['label']
In [ ]:
# Swap the axes of X (the train/test split later slices along the first axis),
# then display the resulting shape.
X = X.transpose()
X.shape
In [ ]:
# Swap the axes of y so it is sliced along the same axis as X, then display the
# resulting shape.
# NOTE(review): later cells reshape predictions to (10000, 1) before comparing
# with y_test, so y is presumably a column vector after this step -- confirm.
y = y.transpose()
y.shape
In [ ]:
type(y)
In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
In [ ]:
# The first 60000 rows form the training set; everything after is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
In [ ]:
len(X_train)
In [ ]:
# Shuffle the training set with a fixed seed so repeated runs of the notebook
# see the same ordering (matching the random_state=42 used for the random
# forest). A dedicated RandomState leaves NumPy's global random state untouched.
rng = np.random.RandomState(42)
shuffle_index = rng.permutation(len(X_train))
# Apply the same permutation to samples and labels so pairs stay aligned.
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
The reason for using a Random Forest classifier here is that it runs faster than a linear model.
In [ ]:
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier with a fixed random_state for repeatable results.
forest_clf = RandomForestClassifier(random_state=42)
In [ ]:
# Pass the labels as a flat 1-D array: scikit-learn expects y of shape
# (n_samples,) and emits a DataConversionWarning when given a column vector.
# ravel() is a no-op if y_train is already 1-D.
forest_clf.fit(X_train, y_train.ravel())
In [ ]:
# Predict a label for every sample in the held-out test set.
forest_pred = forest_clf.predict(X_test)
In [ ]:
# Turn the predictions into a single column so they can be compared
# element-wise with y_test; -1 lets NumPy infer the row count instead of
# hard-coding the test-set size.
forest_pred = forest_pred.reshape(-1, 1)
In [ ]:
# Fraction of test samples predicted correctly. Both arrays are flattened
# first so that an (n,) vs (n, 1) shape mismatch cannot silently broadcast into
# an (n, n) comparison and produce a wrong score.
accuracy = np.mean(np.ravel(forest_pred) == np.ravel(y_test))
print(accuracy)
It seems we have to set n_jobs=-1 (use all CPU cores)
so that the prediction runs within a reasonable time.
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
In [ ]:
# k-nearest-neighbors classifier with default hyperparameters; n_jobs=-1 asks
# scikit-learn to use all available CPU cores for the neighbor search.
knn_clf = KNeighborsClassifier(n_jobs=-1)
In [ ]:
# Pass the labels as a flat 1-D array: scikit-learn expects y of shape
# (n_samples,) and emits a DataConversionWarning when given a column vector.
# ravel() is a no-op if y_train is already 1-D.
knn_clf.fit(X_train, y_train.ravel())
In [ ]:
knn_clf.predict([X_test[0]])
In [ ]:
# for i in range(1000):
# knn_clf.predict([X_test[i]])
In [ ]:
# Predict a label for every sample in the held-out test set.
knn_pred = knn_clf.predict(X_test)
In [ ]:
# Keep the predictions as a single column (as before), letting NumPy infer the
# row count instead of hard-coding the test-set size.
knn_pred = knn_pred.reshape(-1, 1)
# Fraction of test samples predicted correctly. Both arrays are flattened
# first so that an (n,) vs (n, 1) shape mismatch cannot silently broadcast into
# an (n, n) comparison and produce a wrong score.
accuracy = np.mean(np.ravel(knn_pred) == np.ravel(y_test))
print(accuracy)
In [ ]:
from sklearn.model_selection import GridSearchCV
In [ ]:
# Search space for the k-NN grid search: four neighborhood sizes crossed with
# the two weighting schemes. n_jobs is pinned to a single value, so it is not
# actually searched over.
param_grid = [
    {
        'n_jobs': [-1],
        'n_neighbors': [3, 5, 11, 19],
        'weights': ['uniform', 'distance'],
    },
]
In [ ]:
# Exhaustive search over param_grid, scoring each candidate by accuracy with
# 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
In [ ]:
# Run the grid search on the training set. Labels are passed as a flat 1-D
# array so that each cross-validation fit does not emit a
# DataConversionWarning for a column-vector y; ravel() is a no-op if y_train is
# already 1-D.
grid_search.fit(X_train, y_train.ravel())