In [92]:
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from matplotlib import style
from collections import Counter
style.use('fivethirtyeight')
%matplotlib notebook
In [93]:
# Toy training data: two labelled groups of 2-D points. Each label is also
# passed to matplotlib as the colour for that group's points.
dataset = {
    'k': [[1, 2], [2, 3], [3, 1]],
    'r': [[6, 5], [7, 7], [8, 6]],
}
# The unlabelled point that will be classified later.
new_features = [5, 7]

# Draw every training point, one scatter call per point, in its group colour.
for label, points in dataset.items():
    for x, y in points:
        plt.scatter(x, y, s=100, color=label)

# Draw the unlabelled point without an explicit colour.
plt.scatter(new_features[0], new_features[1], s=100)
plt.show()
In [102]:
def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest points.

    The Euclidean distance (``np.linalg.norm`` of the difference) from
    ``predict`` to every feature vector in ``data`` is computed; the ``k``
    closest vectors each cast one vote for the group they belong to.

    Parameters
    ----------
    data : dict
        Maps each group label to a list of feature vectors in that group.
    predict : sequence of numbers
        Feature vector to classify.
    k : int, optional
        Number of nearest neighbours that get a vote (default 3).

    Returns
    -------
    tuple
        ``(label, confidence)``: ``label`` is the group with the most votes
        among the ``k`` nearest points, and ``confidence`` is that group's
        vote count divided by ``k``.

    Warns
    -----
    UserWarning
        When the number of groups in ``data`` is greater than or equal to
        ``k``.
    """
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    # One (distance, group) pair per training point.
    distances = [
        (np.linalg.norm(np.array(features) - target), group)
        for group in data
        for features in data[group]
    ]

    # Group labels of the k closest points; sorting the pairs orders by
    # distance first and falls back to the group label on exact ties.
    votes = [group for _, group in sorted(distances)[:k]]

    # Winning group and how many of the k votes it received.
    winner, winner_votes = Counter(votes).most_common(1)[0]

    # Confidence is the winner's share of the k votes.
    return winner, winner_votes / k
In [103]:
# Classify the unlabelled point using its 3 nearest neighbours.
result, confidence = k_nearest_neighbors(dataset, new_features, k=3)

# Redraw the training points, each in its group's colour.
for label in dataset:
    for point in dataset[label]:
        plt.scatter(point[0], point[1], s=100, color=label)

# Draw the classified point in the colour of the group it was assigned to.
plt.scatter(new_features[0], new_features[1], s=100, color=result)
plt.show()

# Share of the k votes that went to the winning group.
print(confidence)
In [ ]:
In [ ]: