K-nearest neighbors (KNN) is one of the most basic classification algorithms and is a sensible first choice when you have no prior knowledge about the data.
It classifies a test sample by the majority label of its k closest training samples, where "closest" is typically measured by Euclidean distance.
We will use the digits dataset from sklearn to practice using KNN to predict the true value of handwritten digits.
Lecture notes for the theory and math are here: http://george1328.github.io/lecture_notes/KNN.pdf
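As a quick illustration of the distance at the heart of the algorithm: the Euclidean distance between two feature vectors a and b is sqrt(sum((a - b)^2)). A minimal sketch with two made-up 2-D points (not part of the original notebook):

In [ ]:
import numpy as np
# two hypothetical points; any feature vectors of equal length work the same way
a = np.array([1.0, 2.0])
b = np.array([4.0, 6.0])
print(np.sqrt(np.sum((a - b) ** 2)))  # 5.0
print(np.linalg.norm(a - b))          # same value, computed by numpy directly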
In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_digits
In [28]:
digits = load_digits()
In [29]:
digits.keys()
Out[29]:
In [30]:
print(digits.DESCR)
In [31]:
print(digits.data.shape)
print(digits.images.shape)
print(digits.target.shape)
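The 1797 rows of digits.data are just the 8x8 images flattened into 64-element vectors. A quick sanity check confirming this (not part of the original notebook):

In [ ]:
# the first flattened row should equal the first image unrolled to 64 values
np.allclose(digits.data[0], digits.images[0].ravel())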
In [32]:
#view some images
fig = plt.figure(figsize=(6, 6))  # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
In [33]:
X = digits.data
In [34]:
y = digits.target
In [35]:
from sklearn.model_selection import train_test_split
In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
In [37]:
from sklearn.neighbors import KNeighborsClassifier
In [38]:
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
In [39]:
model.score(X_test, y_test)
Out[39]:
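Beyond a single accuracy number, it helps to see which digits get mistaken for which. A minimal sketch using sklearn's confusion_matrix (not part of the original notebook):

In [ ]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
# rows are true digits, columns are predicted digits;
# off-diagonal entries show where the model confuses one digit for another
print(confusion_matrix(y_test, y_pred))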
In [40]:
from sklearn.model_selection import KFold

def cross_validate(X, y, classifier, k_fold):
    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(n_splits=k_fold, shuffle=True, random_state=0).split(X)
    k_score_total = 0
    # for each training and testing slice, fit the classifier and score the results
    for train_slice, test_slice in k_fold_indices:
        model = classifier(X[train_slice], y[train_slice])
        k_score = model.score(X[test_slice], y[test_slice])
        k_score_total += k_score
    # return the average accuracy across the folds
    return k_score_total / k_fold
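For comparison, sklearn ships an equivalent helper, cross_val_score, which returns the per-fold scores directly. A minimal sketch (not part of the original notebook; note that with cv=10 sklearn uses unshuffled stratified folds by default, so the numbers may differ slightly from the hand-rolled shuffled version above):

In [ ]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation; average the fold accuracies
scores = cross_val_score(KNeighborsClassifier(n_neighbors=3), X, y, cv=10)
print(scores.mean())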
In [41]:
knn_values = np.arange(2, 20)
knn_results = []
for k in knn_values:
    # passing the bound .fit method lets cross_validate refit the classifier on each fold
    knn_results.append(cross_validate(X, y, KNeighborsClassifier(k).fit, 10))
In [42]:
plt.plot(knn_values, knn_results)
plt.xlabel('n_neighbors (k)')
Out[42]:
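To pick the best k from the curve programmatically rather than by eye, a minimal sketch using only names defined in the cells above:

In [ ]:
# index of the highest mean cross-validation accuracy
best_k = knn_values[np.argmax(knn_results)]
print(best_k, max(knn_results))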