In [2]:
""" This is a very slow intro to understanding and implementing
the K-means algorithm in Python.
"""
from collections import Counter
def raw_majority_vote(labels):
votes = Counter(labels)
winner, _ = votes.most_common(1)[0]
return winner
def majority_vote(labels):
"""assumes that labels are ordered from nearest to farthest"""
vote_counts = Counter(labels)
winner, winner_count = vote_counts.most_common(1)[0]
num_winners = len([count for count in vote_counts.values() if count == winner_count])
if num_winners == 1:
return winner
else:
return majority_vote(labels[:-1])
In [3]:
# Just copying all the vector stuff from chapter 4
from math import sqrt
def vector_add(v, w):
'''adds corresponding elements of vectors'''
return [v_i + w_i for v_i,w_i in zip(v, w)]
def vector_subtract(v, w):
'''subtract corresponding elements of vectors'''
return [v_i - w_i for v_i,w_i in zip(v, w)]
def vector_sum(vectors):
'''sum all corresponding elements'''
result = vectors[0]
for vector in vectors[1:]:
result = vector_add(result, vector)
return result
def vector_sum_fast(vectors):
'''sum all corresponding elements'''
return reduce(vector_add, vectors)
def scalar_multiply(c, vector):
'''c is the scalar'''
return [c * v_i for v_i in vector]
def vector_mean(vectors):
'''The ith element of the result is the mean of the ith element
of all the input vectors.'''
n = len(vectors)
return scalar_multiply(1/n, vector_sum(vectors))
def dot(v, w):
'''the sum of the product of the matching elements
of the input vectors'''
return sum(v_i * w_i for v_i,w_i in zip(v, w))
def sum_of_squares(v):
'''add the square of each element'''
return dot(v, v)
def magnitude(v):
'''Find the length of a vector in cartesian space'''
return sqrt(sum_of_squares(v))
def distance(v, w):
'''Find the distance between two vectors'''
return magnitude(vector_subtract(v, w))
In [5]:
def knn_classify(k, labeled_points, new_point):
"""each labeled point should be a pair: (point, label)"""
# order the labeled points from nearest to farthest
by_distance = sorted(labeled_points, key=lambda point, _: distance(point, new_point))
# find the labels for the k closest
k_nearest_labels = [label for _, label in by_distance[:k]]
# and let them vote
return majority_vote(k_nearest_labels)