Testing Algorithms with the Digits Dataset


In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn

In [13]:
# Load the feature table and the label table. Neither file has a header row,
# so pandas assigns integer column names (0, 1, 2, ...).
df = pandas.read_csv("../data/digits/ex3data_X.csv", header=None)
y = pandas.read_csv("../data/digits/ex3data_y.csv", header=None)

# Preview the first rows of the feature table.
df.head()


Out[13]:
0 1 2 3 4 5 6 7 8 9 ... 390 391 392 393 394 395 396 397 398 399
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 400 columns


In [60]:
# Peek at the last five label rows.
y.tail(5)


Out[60]:
0
4995 9
4996 9
4997 9
4998 9
4999 9

In [8]:
# N = number of rows (samples), m = number of columns (features) in df.
N, m = len(df.index), len(df.columns)

print(N, m)


(5000, 400)

Split into Training and Test Sets


In [61]:
# Fix the global NumPy seed so the random split is reproducible.
np.random.seed(1234)

# 70% of the rows go to training. The sample size must be an integer:
# passing the float 0.7*N as `size` raises a TypeError on current NumPy.
n_train = int(0.7 * N)

# Draw the training row positions without replacement; every remaining
# position becomes a test row.
train_idx = np.random.choice(N, n_train, replace=False)
test_idx  = np.setdiff1d(np.arange(N), train_idx, assume_unique=True)

Xtrain = df.iloc[train_idx]
ytrain = y.iloc[train_idx]

Xtest = df.iloc[test_idx]
ytest = y.iloc[test_idx]

# Sanity check: the two subsets partition the full data set.
assert Xtrain.shape[0] == n_train
assert Xtrain.shape[0] + Xtest.shape[0] == N

ytrain.head()


Out[61]:
0
2706 5
2436 4
1201 2
1486 2
4286 8

In [23]:
# Build a k-d tree over the training rows for nearest-neighbour queries.
# leafsize=100 is passed explicitly instead of relying on the library default.
kdtree = scipy.spatial.KDTree(data=Xtrain, leafsize=100)

# Display the tree object's repr.
kdtree


Out[23]:
<scipy.spatial.kdtree.KDTree at 0x7fcc8c978fd0>

In [39]:
# Debug probe: print the training-label row at positional index 3179.
# NOTE(review): the recorded output shows the label value 10; presumably the
# digit 0 is encoded as class 10 in this data set -- confirm against the data docs.
print(ytrain.iloc[3179])


0    10
Name: 383, dtype: int64

In [48]:
# 3-nearest-neighbour classification: each test row gets the label that occurs
# most often among its three closest training rows.
n_test = Xtest.shape[0]
pred = np.empty(shape=n_test, dtype=int)

for row, features in enumerate(Xtest.values):
    # query() returns (distances, indices); only the neighbour indices are needed.
    _, neighbor_idx = kdtree.query(features, k=3)
    neighbor_labels = ytrain.iloc[neighbor_idx, 0].values
    # np.unique returns labels in sorted order, so on a tied vote argmax
    # picks the smallest label.
    labels, votes = np.unique(neighbor_labels, return_counts=True)
    pred[row] = labels[votes.argmax()]

# Distribution of predicted labels across the test set.
np.unique(pred, return_counts=True)


Out[48]:
(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([170, 138, 157, 136, 142, 169, 149, 134, 162, 143]))

In [67]:
# Test accuracy: the fraction of test rows whose predicted label equals the
# true label stored in column 0 of ytest.
is_correct = pred == ytest[0].values
is_correct.mean()


Out[67]:
0.94133333333333336

In [ ]: