In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
In [13]:
# Read the feature table and the label table; header=None tells pandas that
# neither comma-separated file has a header row.
df = pandas.read_csv("../data/digits/ex3data_X.csv", header=None)
y = pandas.read_csv("../data/digits/ex3data_y.csv", header=None)
df.head()
Out[13]:
In [60]:
# Peek at the last five rows of the label table.
y.tail(n=5)
Out[60]:
In [8]:
# Row count and column count of the feature table.
N = df.shape[0]
m = df.shape[1]
print(N, m)
In [61]:
# Reproducible 70/30 train/test split over row positions.
np.random.seed(1234)
train_fraction = 0.7
# np.random.choice requires an integer sample size; passing the float 0.7*N
# raises TypeError on current NumPy, so truncate explicitly.
n_train = int(train_fraction * N)
train_idx = np.random.choice(N, n_train, replace=False)
# Every position not drawn for training goes to the test set. Both inputs are
# duplicate-free (arange, and a draw without replacement), so
# assume_unique=True is safe.
test_idx = np.setdiff1d(np.arange(N), train_idx, assume_unique=True)
# Sanity check: the two index sets must partition all N rows.
assert len(train_idx) + len(test_idx) == N

Xtrain = df.iloc[train_idx]
ytrain = y.iloc[train_idx]
Xtest = df.iloc[test_idx]
ytest = y.iloc[test_idx]
ytrain.head()
Out[61]:
In [23]:
# Index the training rows in a KD-tree (leaf size 100) for nearest-neighbour
# queries.
kdtree = scipy.spatial.KDTree(data=Xtrain, leafsize=100)
kdtree
Out[23]:
In [39]:
# Spot-check the training label stored at one positional row.
row_position = 3179
print(ytrain.iloc[row_position])
In [48]:
# Classify every test row by majority vote among its k nearest training rows.
k_neighbors = 3

# Label column as a plain array, hoisted out of the loop. The KD-tree returns
# positions into Xtrain, and ytrain was sliced with the same train_idx, so
# those positions index the matching labels.
train_labels = ytrain.iloc[:, 0].values

# One batched query instead of one call per row: row i of neighbor_idx holds
# the positions of the k nearest training rows for test row i.
neighbor_idx = kdtree.query(Xtest.values, k=k_neighbors)[1]

pred = np.empty(shape=Xtest.shape[0], dtype=int)
for i, knear in enumerate(neighbor_idx):
    yuniq, ycounts = np.unique(train_labels[knear], return_counts=True)
    # argmax returns the first maximum and yuniq is sorted, so a tied vote
    # goes to the smallest label.
    pred[i] = yuniq[np.argmax(ycounts)]

# Distribution of predicted labels.
np.unique(pred, return_counts=True)
Out[48]:
In [67]:
# Test-set accuracy: fraction of predictions equal to the true label.
is_correct = pred == ytest[0].values
is_correct.sum() / float(len(ytest))
Out[67]:
In [ ]: