In [1]:
import numpy as np
from sklearn.datasets import fetch_mldata
import pandas as pd
%matplotlib notebook
In [2]:
# Download (or load from the local cache) the "MNIST original" dataset.
# NOTE(review): fetch_mldata was deprecated and later removed from
# scikit-learn; on a current install this call (and its import) will fail.
# Consider migrating to fetch_openml -- confirm against the installed version.
mnist = fetch_mldata("MNIST original")

# Features are divided by 255.0 (presumably 8-bit pixel intensities, giving
# values in [0, 1] -- TODO confirm); targets are taken as-is.
X_, y_ = mnist.data / 255.0, mnist.target
In [3]:
# Sanity check: show the dimensions of the feature matrix and label vector.
print(np.shape(X_), np.shape(y_))
In [4]:
# Random ordering of the sample indices, used below to shuffle the data
# before it is cut into train / cross-validation / test parts.
# A fixed seed on a local RandomState makes the split (and therefore every
# accuracy number further down) reproducible under Restart & Run All, without
# touching NumPy's global random state.
SEED = 42
rndperm = np.random.RandomState(SEED).permutation(X_.shape[0])
In [5]:
# Split the shuffled data 70% / 15% / 15% into train, cross-validation and
# test sets.
N_ = X_.shape[0]
train_end = int(0.7 * N_)   # first index past the training part
cv_end = int(0.85 * N_)     # first index past the cross-validation part

# Apply the permutation once. Fancy indexing copies the whole array, so
# repeating X_[rndperm] for every subset would build (and keep alive) three
# full-size copies instead of one.
X_shuffled = X_[rndperm]
y_shuffled = y_[rndperm].reshape(-1, 1)   # labels as an (N, 1) column

X_train, y_train = X_shuffled[:train_end], y_shuffled[:train_end]
X_cv, y_cv = X_shuffled[train_end:cv_end], y_shuffled[train_end:cv_end]
X_test, y_test = X_shuffled[cv_end:], y_shuffled[cv_end:]

print("Train: ", np.shape(X_train), " ", np.shape(y_train))
print("Cross-Validation: ", np.shape(X_cv), " ", np.shape(y_cv))
print("Test: ", np.shape(X_test), " ", np.shape(y_test))
In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
# Show the first 30 training samples as a 3 x 10 grid of grayscale images.
plt.gray()
n_rows, n_cols = 3, 10
fig = plt.figure(figsize=(16, 7))
for idx in range(n_rows * n_cols):
    # Subplot positions are 1-based.
    ax = fig.add_subplot(n_rows, n_cols, idx + 1)
    # Each sample is reshaped from a flat vector to a 28 x 28 image.
    image = X_train[idx].reshape((28, 28)).astype(float)
    ax.matshow(image)
    ax.axis("off")
plt.show()
In [11]:
from scipy import stats
class kNNModel:
    """Brute-force k-nearest-neighbour classifier.

    The model simply stores the training samples; prediction ranks all
    training samples by squared Euclidean distance to each query row and
    takes the most frequent label among the ``k`` closest ones.
    """

    def __init__(self, X, y, k=5):
        """Store the training data.

        Parameters
        ----------
        X : 2-D array, one training sample per row.
        y : array of training labels, indexable by sample position
            (an (N, 1) column or a flat (N,) vector both work).
        k : default number of neighbours used for voting.
        """
        self.k = k
        self.X = X
        self.y = y

    def distanceMatrix(self, T):
        """Return the squared Euclidean distances between T and the training set.

        Parameters
        ----------
        T : 2-D array, one query sample per row, same number of columns
            as the training data.

        Returns
        -------
        2-D array ``dm`` where ``dm[i, j]`` is the squared distance between
        query row ``i`` and training row ``j``.
        """
        # ||t - x||^2 = ||t||^2 - 2 t.x + ||x||^2, evaluated for all pairs at
        # once through broadcasting: (L, 1) - (L, M) + (1, M).
        X2 = np.sum(self.X**2, axis=1).reshape(-1, 1)
        T2 = np.sum(T**2, axis=1).reshape(-1, 1)
        TX = T.dot(self.X.T)
        return T2 - 2*TX + X2.T

    def predict(self, T, dm=None, k=None):
        """Predict a label for every row of T.

        Parameters
        ----------
        T  : 2-D array of query samples (only used when ``dm`` is None).
        dm : optional precomputed result of ``distanceMatrix(T)``; pass it to
             avoid recomputing distances when trying several values of ``k``.
        k  : number of neighbours; defaults to the value given at construction.

        Returns
        -------
        (L, 1) array with the most frequent label among the ``k`` nearest
        training samples of each query row (as computed by ``scipy.stats.mode``).
        """
        if dm is None:
            dm = self.distanceMatrix(T)
        if k is None:
            k = self.k
        # Column indices of the k smallest distances in each row.
        indices = np.argsort(dm, axis=1)[:, :k]
        # Vote along the neighbour axis; the reshape normalises the result to
        # a column whatever the dimensionality of self.y.
        return stats.mode(self.y[indices], axis=1).mode.reshape(-1, 1)

    def accuracy(self, T, l, dm=None, k=None):
        """Return the fraction of rows of T whose predicted label equals ``l``.

        Parameters
        ----------
        T  : 2-D array of query samples.
        l  : true labels, one per row of T; either a flat vector or a column.
        dm : optional precomputed distance matrix, forwarded to ``predict``.
        k  : optional neighbour count, forwarded to ``predict``.
        """
        p = self.predict(T, dm, k)
        # Force the labels into a column so they line up element-wise with the
        # (L, 1) predictions. Comparing against a flat (L,) vector would
        # broadcast to an (L, L) matrix and yield a meaningless accuracy.
        labels = np.asarray(l).reshape(-1, 1)
        return np.mean(p == labels)
In [18]:
# Build the classifier on the training split (default k = 6) and run a quick
# smoke test: accuracy on the first five cross-validation samples, letting
# the model compute the distances itself and overriding k to 2.
model = kNNModel(X_train, y_train, k=6)
accu = model.accuracy(X_cv[:5], y_cv[:5], dm=None, k=2)
print(accu)
In [20]:
# Sweep k = 1..9 on the first 1000 cross-validation samples.
Ks = list(range(1, 10))
X_eval, y_eval = X_cv[:1000], y_cv[:1000]

# The distances do not depend on k, so compute them once and reuse them for
# every value of k.
dm = model.distanceMatrix(X_eval)

accuL = []
for k in Ks:
    print("k =", k)
    accu = model.accuracy(X_eval, y_eval, dm=dm, k=k)
    accuL.append(accu)
print(accuL)
In [21]:
%matplotlib inline
# Cross-validation accuracy as a function of the number of neighbours.
# Uses the explicit figure/axes interface and labels the axes so the figure
# is readable on its own.
fig, ax = plt.subplots()
ax.plot(Ks, accuL)
ax.set_xlabel("k (number of neighbours)")
ax.set_ylabel("Cross-validation accuracy")
ax.set_title("kNN accuracy vs. k")
plt.show()