In [1]:
import numpy as np
import pandas as pd

In [2]:
# from http://www.cs.toronto.edu/~kriz/cifar.html
def unpickle(filename):
    """Load and return the object stored in the pickle file `filename`.

    Parameters
    ----------
    filename : str
        Path to a pickle file; it is opened in binary mode.

    Returns
    -------
    object
        Whatever object the pickle stream deserialises to.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    # Prefer the C-accelerated module on Python 2; fall back to the standard
    # `pickle` module where `cPickle` does not exist (Python 3).
    # NOTE(review): on Python 3, pickles written by Python 2 may additionally
    # need an `encoding` argument to `load` -- confirm before porting.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    # SECURITY: unpickling can execute arbitrary code. Only use this on files
    # from a trusted source.
    # The context manager guarantees the handle is closed even if load() raises.
    with open(filename, 'rb') as fo:
        return pickle.load(fo)

In [3]:
# Load the five CIFAR-10 training batches and combine them into a single
# feature matrix (X_train) and label vector (y_train).
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory.
train_batch_template = '/Users/excalibur/Dropbox/1_Studies/1_STEM/ArtificialIntelligence/datasets/cifar-10-batches-py/data_batch_{0}'

batch_features = []
batch_labels = []
for i in range(1, 6):
    data_batch = unpickle(train_batch_template.format(i))
    batch_features.append(data_batch['data'])
    batch_labels.append(np.array(data_batch['labels']))

# Concatenate once at the end instead of re-stacking (and re-copying) the
# accumulated arrays on every iteration.
X_train = np.vstack(batch_features)
y_train = np.hstack(batch_labels)

# Single-argument print(...) produces the same output on Python 2 and 3.
print(X_train.shape)
print(y_train.shape)


(50000, 3072)
(50000,)

In [4]:
# Read the CIFAR-10 test batch and split it into features and labels.
test_batch_path = '/Users/excalibur/Dropbox/1_Studies/1_STEM/ArtificialIntelligence/datasets/cifar-10-batches-py/test_batch'
test_batch = unpickle(test_batch_path)
X_test, y_test = test_batch['data'], np.array(test_batch['labels'])

# Report the shape of each array, features first.
for split_array in (X_test, y_test):
    print(split_array.shape)


(10000, 3072)
(10000,)

$d_{1}(I_{1},I_{2}) = \sum\limits_{p} \lvert I_{1}^{p}-I_{2}^{p}\rvert$


In [5]:
class NearestNeighbor(object):
    """1-nearest-neighbour classifier using the L1 (sum of absolute
    differences) distance between rows.

    Attributes are plain references to the data passed to `train`, plus the
    predictions from the most recent `predict_SLOW` call.
    """

    def __init__(self):
        self.X_train = []   # training rows, set by train()
        self.y_train = []   # training labels, set by train()
        self.y_hat = []     # predictions from the latest predict call

    def train(self, X, y):
        """Memorise the training rows `X` and their labels `y` (no copy)."""
        self.X_train = X
        self.y_train = y

    ### CRAZY SLOW
    def predict_SLOW(self, X):
        """Predict a label for every row of `X`.

        For each query row, the L1 distance to every training row is computed
        and the label of the closest training row is taken. One full pass over
        the training set is made per query row.

        Parameters
        ----------
        X : 2-D array-like, one query per row, with the same number of
            columns as the training data.

        Returns
        -------
        numpy.ndarray
            Predicted labels (dtype of the training labels); also stored in
            `self.y_hat` for use by `score`.
        """
        X = np.asarray(X)
        train_X = np.asarray(self.X_train)
        train_y = np.asarray(self.y_train)

        # Unsigned integer data (e.g. uint8) wraps around on subtraction, which
        # silently corrupts the distances. Do the subtraction in the smallest
        # signed dtype that can hold the operands instead.
        diff_dtype = np.result_type(train_X.dtype, X.dtype)
        if diff_dtype.kind == 'u':
            diff_dtype = np.promote_types(diff_dtype, np.int8)

        self.y_hat = np.zeros(X.shape[0], dtype=train_y.dtype)
        for i in range(X.shape[0]):
            row_diffs = np.subtract(train_X, X[i, :], dtype=diff_dtype)
            L1_row_distances = np.sum(np.abs(row_diffs), axis=1)
            argmin_index = np.argmin(L1_row_distances)
            self.y_hat[i] = train_y[argmin_index]
            # Progress indicator; print(i) behaves the same on Python 2 and 3.
            if i % 100 == 0:
                print(i)
        return self.y_hat

    def predict(self, X):
        """Alias for `predict_SLOW`, so `nn.predict(X)` works as well."""
        return self.predict_SLOW(X)

    def score(self, y):
        """Return the fraction of the latest predictions that equal `y`."""
        return np.mean(self.y_hat == y)

In [6]:
#nn = NearestNeighbor()
#nn.train(X_train, y_train)
#y_hat = nn.predict(X_test)
#nn.score(y_test)

In [7]:
from sklearn import neighbors, datasets

In [14]:
# scikit-learn k-NN baseline that votes among the 10 nearest neighbours.
# NOTE: with default settings this classifier uses Euclidean distance
# (Minkowski, p=2), not the L1 distance defined earlier in this notebook.
clf = neighbors.KNeighborsClassifier(n_neighbors=10)

In [ ]:
# Fit on the training data, then predict labels for the test data. fit()
# returns the estimator itself, so the two calls can be chained; the
# predictions are the cell's displayed value.
clf.fit(X_train, y_train).predict(X_test)

In [13]:
# Mean accuracy of the fitted classifier on the test batch.
clf.score(X=X_test, y=y_test)


Out[13]:
0.16320000000000001