In [2]:
""" This is a very slow intro to understanding and implementing
the K-means algorithm in Python.
"""
from collections import Counter


def raw_majority_vote(labels):
    votes = Counter(labels)
    winner, _ = votes.most_common(1)[0]
    return winner


def majority_vote(labels):
    """assumes that labels are ordered from nearest to farthest"""
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len([count for count in vote_counts.values() if count == winner_count])
    
    if num_winners == 1:
        return winner
    else:
        return majority_vote(labels[:-1])

In [3]:
# Just copying all the vector stuff from chapter 4
from math import sqrt


def vector_add(v, w):
    '''adds corresponding elements of vectors'''
    return [v_i + w_i for v_i,w_i in zip(v, w)]


def vector_subtract(v, w):
    '''subtract corresponding elements of vectors'''
    return [v_i - w_i for v_i,w_i in zip(v, w)]


def vector_sum(vectors):
    '''sum all corresponding elements'''
    result = vectors[0]
    for vector in vectors[1:]:
        result = vector_add(result, vector)

    return result


def vector_sum_fast(vectors):
    '''sum all corresponding elements'''
    return reduce(vector_add, vectors)


def scalar_multiply(c, vector):
    '''c is the scalar'''
    return [c * v_i for v_i in vector]


def vector_mean(vectors):
    '''The ith element of the result is the mean of the ith element
    of all the input vectors.'''
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))


def dot(v, w):
    '''the sum of the product of the matching elements
    of the input vectors'''
    return sum(v_i * w_i for v_i,w_i in zip(v, w))


def sum_of_squares(v):
    '''add the square of each element'''
    return dot(v, v)


def magnitude(v):
    '''Find the length of a vector in cartesian space'''
    return sqrt(sum_of_squares(v))


def distance(v, w):
    '''Find the distance between two vectors'''
    return magnitude(vector_subtract(v, w))

In [5]:
def knn_classify(k, labeled_points, new_point):
    """each labeled point should be a pair: (point, label)"""
    # order the labeled points from nearest to farthest
    by_distance = sorted(labeled_points, key=lambda point, _: distance(point, new_point))
    
    # find the labels for the k closest
    k_nearest_labels = [label for _, label in by_distance[:k]]
    
    # and let them vote
    return majority_vote(k_nearest_labels)