In [2]:
import random
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame

In [3]:
def knn(X, dataSet, k):
    dataSetSize = dataSet.shape[0]
    #distance calculation
    print (tile(X, (dataSetSize,1)))
    diffMat = tile(X, (dataSetSize,1)) - dataSet
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5

    #sort results
    distances.sort()
    
    #vote using k lowest distances
    top_labels = distances.head(k).index.tolist()
    label_counts = dict((i, top_labels.count(i)) for i in top_labels)
    return max(label_counts.iteritems(), key=operator.itemgetter(1))[0]

In [4]:
labels = ["A", "B", "C", "D"] * 25
random.shuffle(labels)

#Generate random points
N = 100
data = DataFrame(np.random.randint(0, 100, size=(N, 2)), columns = ["x", "y"], index = labels)

#Define a test point
test_point = DataFrame({"x": [15], "y": [30]})

In [5]:
#Invoke the knn function
k = 5
result_label = knn(test_point, data, k)
print "***Result = %s***" % result_label


[[15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]
 [15 30]]
***Result = D***

In [6]:
# Plot the points
f = plt.figure()
ax = f.add_subplot(1, 1, 1)
ax.set_title("K-Nearest Neighbor")
a = ax.scatter(data.ix["A"]["x"], data.ix["A"]["y"], c="Blue", s=75)
b = ax.scatter(data.ix["B"]["x"], data.ix["B"]["y"], c="Green", s=75)
c = ax.scatter(data.ix["C"]["x"], data.ix["C"]["y"], c="Red", s=75)
d = ax.scatter(data.ix["D"]["x"], data.ix["D"]["y"], c="Black", s=75)
point = ax.scatter(test_point["x"], test_point["y"], c="Yellow", s=100)
ax.legend((a,b,c,d, point), ("A", "B", "C", "D", "test point"), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., scatterpoints=1)


Out[6]:
<matplotlib.legend.Legend at 0x7fc438171910>

In [6]: