In [1]:
import numpy as np
import matplotlib.pylab as plt
import math
from scipy.stats import mode
%matplotlib inline
In [2]:
# NOTE(review): fetch_mldata was removed in scikit-learn 0.20 (mldata.org is
# defunct); consider sklearn.datasets.fetch_openml('mnist_784') as a
# replacement — TODO confirm the returned Bunch layout matches.
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
# Download/cache the MNIST digit data under ../data.
mnist = fetch_mldata('MNIST original', data_home='../data')
In [3]:
# Inspect the raw data matrix dimensions (samples x flattened pixels).
mnist.data.shape
Out[3]:
In [4]:
# Prepend a bias column of ones to the flattened pixel matrix.
X = np.concatenate([np.ones((mnist.data.shape[0], 1)), mnist.data], axis=1)
In [5]:
# Target digit labels, row-aligned with X.
Y = mnist.target
In [6]:
def display(x, label):
    """Render a flattened 28x28 image with `label` as the plot title."""
    grid = x.reshape(28, 28)
    plt.imshow(grid, cmap='gray')
    plt.title('{label}'.format(label=label))
    plt.show()
In [7]:
# Fix: the label argument was the literal string 'Y[0]' (and the wrong index);
# show sample 1 with its actual label Y[1]. Column 0 of X is the bias term,
# hence the [1:785] slice.
display(X[1][1:785], Y[1])
In [8]:
# Hold out 33% of the data for evaluation; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
In [9]:
# Random initial weights for a 10-class linear model (one row per class).
w = np.random.rand(10, X_train.shape[1])
# Fix: plain assignment (`w_orig = w`) aliases the same array, so any later
# in-place update of w would silently change "w_orig" too. Snapshot instead.
w_orig = w.copy()
In [10]:
# Sanity check: raw linear scores of one training sample under the random weights.
w.dot(X_train[30000])
Out[10]:
In [11]:
def manhattan_distance(s1, s2):
    """Row-wise L1 (Manhattan) distance between each row of s1 and s2."""
    return np.abs(s1 - s2).sum(axis=1)
In [12]:
class NearestNeighbour:
    """k-nearest-neighbour classifier over a memorized training set.

    Predicts the majority label among the k training samples closest to a
    query point under the given row-wise distance function.
    """

    def __init__(self, k, loss=None):
        # k: number of neighbours that vote on the prediction.
        # loss: row-wise distance function; defaults to manhattan_distance
        # (resolved here rather than in the signature so the default stays
        # backward-compatible while allowing explicit injection).
        self.k = k
        self.loss = manhattan_distance if loss is None else loss

    def train(self, X, Y):
        """Memorize the training samples and their labels."""
        self.X = X
        self.Y = Y

    def test(self, X):
        """Predict the label of a single sample X.

        Bug fix: the original indexed `losses.argsort()[self.k]`, selecting
        only the (k+1)-th closest sample — for k=1 that is the *second*
        nearest neighbour. Take the k smallest distances instead.
        """
        losses = self.loss(self.X, X)
        nearest_labels = self.Y[losses.argsort()[:self.k]]
        # Majority vote; ties resolve to the smallest label, matching
        # scipy.stats.mode, but without depending on scipy's keepdims
        # behaviour that changed across versions.
        values, counts = np.unique(nearest_labels, return_counts=True)
        return values[np.argmax(counts)]
In [13]:
# 1-nearest-neighbour classifier, "fitted" by memorizing the training split.
n = NearestNeighbour(1)
n.train(X_train, Y_train)
In [14]:
# Predict the label of the first held-out sample.
n.test(X_test[0])
Out[14]:
In [15]:
# Show the first test image with its true label for visual comparison
# against the prediction above (bias column sliced off).
display(X_test[0][1:785], "%s"%Y_test[0])
In [25]:
def test(X_test, Y_test):
count_failed = 0
for i in range(X_test.shape[0]):
if Y_test[i] != n.test(X_test[i]):
count_failed += 1
return (count_failed, num_test)
# Evaluate the 1-NN classifier on the held-out split and report results.
count_failed, num_tests = test(X_test, Y_test)
print("\n Results:")
print("Total: %s " % num_tests)
print("Failed: %s " % count_failed)
# Fix: this line prints the failure *rate*, not the failure count — the
# original reused the "Failed:" label, making the output ambiguous.
print("Failure rate: %s " % (1.0 * count_failed / num_tests))
In [24]:
Out[24]:
In [ ]: