In [2]:
from __future__ import division
from lin_alg import *
import random
In [3]:
class KMeans:
"""k-means algo"""
def __init__(self, k):
self.k = k # number of clusters
self.means = None # means of clusters
def classify(self, input):
"""return the index of the cluster to closest to input"""
return min(range(self.k),
key = lambda i: squared_distance(input, self.means[i]))
def train(self, inputs):
# choose k rand points as initials
self.means = random.sample(inputs, self.k)
assignments = None
while True:
# Find new assignments
new_assignments = map(self.classify, inputs)
# If nothing changed we're good to go
if assignments == new_assignments:
return
# otherwise keep
assignments = new_assignments
# And compute new means based on assigments
for i in range(self.k):
# get points in cluster
i_points = [p for p,a in zip(inputs, assignments) if a == i]
# check for membership
if i_points:
self.means[i] = vector_mean(i_points)
In [4]:
inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]
In [5]:
random.seed(0)
clusterer = KMeans(2)
clusterer.train(inputs)
In [6]:
clusterer.means
Out[6]:
In [7]:
def squared_clustering_errors(inputs, k):
"""finds total square error for k"""
clusterer = KMeans(k)
clusterer.train(inputs)
means = clusterer.means
assignments = map(clusterer.classify, inputs)
return sum(squared_distance(input, means[cluster])
for input, cluster in zip(inputs, assignments))
In [8]:
ks = range(1, len(inputs) + 1)
errors = [squared_clustering_errors(inputs, k) for k in ks]
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')
In [10]:
plt.plot(ks, errors, '.')
Out[10]:
In [16]:
def is_leaf(cluster):
"""a cluster is a leaf if it has len 1"""
return len(cluster) == 1
def get_children(cluster):
"""returns children of the cluster if merged else exception"""
if is_leaf(cluster):
raise TypeError("a leaf cluster has no children")
else:
return cluster[1]
def get_values(cluster):
"""returns the value in the cluster (if leaf)
or all values in leaf clusters below"""
if is_leaf(cluster):
return cluster
else:
return [value
for child in get_children(cluster)
for value in get_values(child)]
def cluster_distance(cluster1, cluster2, distance_agg = min):
"""compute all pairwise distances btw clusters
and apply distance_agg to the list"""
return distance_agg([distance(input1, input2)
for input1 in get_values(cluster1)
for input2 in get_values(cluster2)])
def get_merge_order(cluster):
if is_leaf(cluster):
return float('inf')
else:
return cluster[0]
def bottom_up_cluster(inputs, distance_agg = min):
# we start with all leaf clusters (this is bottom up after all)
clusters = [(input,) for input in inputs]
# Don't stop until we have one cluster
while len(clusters) > 1:
# the two clusters we want to merge
# are the clusters that are closest without touching
c1, c2 = min([(cluster1, cluster2)
for i, cluster1 in enumerate(clusters)
for cluster2 in clusters[:i]],
key = lambda (x,y): cluster_distance(x, y, distance_agg))
# the above is really inefficient in distance calc
# we should instead "look up" the distance
# once we merge them we remove them from the list
clusters = [c for c in clusters if c != c1 and c != c2]
# merge them with order = # of clusters left (so that last merge is "0")
merged_cluster = (len(clusters), [c1, c2])
# append the merge
clusters.append(merged_cluster)
return clusters[0]
In [17]:
base_cluster = bottom_up_cluster(inputs)
In [18]:
base_cluster
Out[18]:
In [19]:
def generate_clusters(base_cluster, num_clusters):
clusters = [base_cluster]
# keep going till we have the desired number of clusters
while len(clusters) < num_clusters:
# choose the last-merge
next_cluster = min(clusters, key = get_merge_order)
# remove it from the list
clusters = [c for c in clusters if c != next_cluster]
# add its children to the list (this is an unmerge)
clusters.extend(get_children(next_cluster))
return clusters
In [24]:
three_clusters = [get_values(cluster)
for cluster in generate_clusters(base_cluster, 3)]
In [25]:
three_clusters
Out[25]:
In [ ]: