In [2]:
import random
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
In [3]:
def knn(X, dataSet, k):
    """Classify a single point by majority vote among its k nearest neighbours.

    Parameters
    ----------
    X : array-like holding one point (e.g. a one-row DataFrame or a 1-D
        sequence). Its values are matched to the columns of ``dataSet`` by
        position, not by column name.
    dataSet : pandas.DataFrame
        One row per labelled point; the row index carries the class labels
        (labels may repeat).
    k : int
        Number of nearest neighbours that vote. If ``k`` exceeds the number
        of rows, every row votes.

    Returns
    -------
    The label that occurs most often among the k nearest rows. When several
    labels are tied, the one whose closest member is nearest to ``X`` wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one labelled point")

    # Distance calculation: broadcasting the single query row against every
    # data row replaces the explicit tile() of the query point.
    points = np.asarray(dataSet, dtype=float)
    query = np.asarray(X, dtype=float).reshape(1, -1)
    diffMat = points - query
    distances = np.sqrt((diffMat ** 2).sum(axis=1))

    # Sort results. A stable sort keeps equidistant points in their original
    # row order, so the outcome is reproducible.
    nearest = np.argsort(distances, kind="mergesort")[:k]

    # Vote using the k lowest distances. max() returns the first maximal
    # element, and top_labels is ordered nearest-first, which gives the
    # documented tie-break.
    top_labels = [dataSet.index[i] for i in nearest]
    return max(top_labels, key=top_labels.count)
In [4]:
# Fix the seeds of both random sources used below so that re-running the
# notebook from a fresh kernel reproduces the same points and the same result.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Number of labelled points to generate
N = 100
# Derive the label list from N (cycling through the class names) so the index
# always has exactly one label per generated point.
CLASSES = ["A", "B", "C", "D"]
labels = [CLASSES[i % len(CLASSES)] for i in range(N)]
random.shuffle(labels)
#Generate random points: integer coordinates in [0, 100), indexed by class label
data = DataFrame(np.random.randint(0, 100, size=(N, 2)), columns = ["x", "y"], index = labels)
#Define a test point
test_point = DataFrame({"x": [15], "y": [30]})
In [5]:
#Invoke the knn function
# k is the number of nearest neighbours that vote on the label
k = 5
result_label = knn(test_point, data, k)
# Parenthesised single-argument print is valid on both Python 2 and Python 3.
print("***Result = %s***" % result_label)
In [6]:
# Plot the points
f = plt.figure()
ax = f.add_subplot(1, 1, 1)
ax.set_title("K-Nearest Neighbor")
ax.set_xlabel("x")
ax.set_ylabel("y")
# One scatter series per class label. .loc[label, column] selects the rows
# carrying that index label in a single step (replacing the removed .ix
# accessor and the chained indexing).
a = ax.scatter(data.loc["A", "x"], data.loc["A", "y"], c="Blue", s=75)
b = ax.scatter(data.loc["B", "x"], data.loc["B", "y"], c="Green", s=75)
c = ax.scatter(data.loc["C", "x"], data.loc["C", "y"], c="Red", s=75)
d = ax.scatter(data.loc["D", "x"], data.loc["D", "y"], c="Black", s=75)
# The point being classified, drawn larger so it stands out
point = ax.scatter(test_point["x"], test_point["y"], c="Yellow", s=100)
# Legend is anchored outside the axes (upper left of the anchor point); the
# trailing semicolon suppresses the Legend repr in the cell output.
ax.legend((a,b,c,d, point), ("A", "B", "C", "D", "test point"), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., scatterpoints=1);
Out[6]:
In [6]: