In [1]:
from linear_algebra import squared_distance, vector_mean, distance
import math, random
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
가장 간단한 군빚화 방법 중 하나는 군집의 개수 k를 미리 정해 두는 k-means 데이터와 데이터가 속한 군집의 중심점과의 거리의 제곱합을 최소화시키며 $S_1,...,S_k$까지의 군집을 구한다. n개의 데이터 포인터를 할당하는 방법은 다양하며, 최적의 군집을 찾는 것은 무척 어렵다.
반복 연산을 통해 군집을 찾는 방법
In [2]:
class KMeans:
"""performs k-means clustering"""
def __init__(self, k):
self.k = k # number of clusters
self.means = None # means of clusters
def classify(self, input):
"""return the index of the cluster closest to the input"""
return min(range(self.k),
key=lambda i: squared_distance(input, self.means[i]))
def train(self, inputs):
self.means = random.sample(inputs, self.k)
assignments = None
while True:
# Find new assignments
new_assignments = list(map(self.classify, inputs))
# If no assignments have changed, we're done.
if assignments == new_assignments:
return
# Otherwise keep the new assignments,
assignments = new_assignments
for i in range(self.k):
i_points = [p for p, a in zip(inputs, assignments) if a == i]
# avoid divide-by-zero if i_points is empty
if i_points:
self.means[i] = vector_mean(i_points)
In [3]:
def squared_clustering_errors(inputs, k):
"""finds the total squared error from k-means clustering the inputs"""
clusterer = KMeans(k)
clusterer.train(inputs)
means = clusterer.means
assignments = list(map(clusterer.classify, inputs))
return sum(squared_distance(input,means[cluster])
for input, cluster in zip(inputs, assignments))
def plot_squared_clustering_errors():
ks = range(1, len(inputs) + 1)
errors = [squared_clustering_errors(inputs, k) for k in ks]
plt.plot(ks, errors)
plt.xticks(ks)
plt.xlabel("k")
plt.ylabel("total squared error")
plt.show()
In [4]:
#
# using clustering to recolor an image
#
def recolor_image(input_file, k=5):
path_to_png_file = input_file
img = mpimg.imread(path_to_png_file)
pixels = [pixel for row in img for pixel in row]
clusterer = KMeans(k)
clusterer.train(pixels) # this might take a while
def recolor(pixel):
cluster = clusterer.classify(pixel) # index of the closest cluster
return clusterer.means[cluster] # mean of the closest cluster
new_img = [[recolor(pixel) for pixel in row]
for row in img]
plt.imshow(new_img)
plt.axis('off')
plt.show()
In [5]:
#
# hierarchical clustering
#
def is_leaf(cluster):
"""a cluster is a leaf if it has length 1"""
return len(cluster) == 1
def get_children(cluster):
"""returns the two children of this cluster if it's a merged cluster;
raises an exception if this is a leaf cluster"""
if is_leaf(cluster):
raise TypeError("a leaf cluster has no children")
else:
return cluster[1]
def get_values(cluster):
"""returns the value in this cluster (if it's a leaf cluster)
or all the values in the leaf clusters below it (if it's not)"""
if is_leaf(cluster):
return cluster # is already a 1-tuple containing value
else:
return [value
for child in get_children(cluster)
for value in get_values(child)]
def cluster_distance(cluster1, cluster2, distance_agg=min):
"""finds the aggregate distance between elements of cluster1
and elements of cluster2"""
return distance_agg([distance(input1, input2)
for input1 in get_values(cluster1)
for input2 in get_values(cluster2)])
def get_merge_order(cluster):
if is_leaf(cluster):
return float('inf')
else:
return cluster[0] # merge_order is first element of 2-tuple
def bottom_up_cluster(inputs, distance_agg=min):
# start with every input a leaf cluster / 1-tuple
clusters = [(input,) for input in inputs]
# as long as we have more than one cluster left...
while len(clusters) > 1:
# find the two closest clusters
c1, c2 = min([(cluster1, cluster2)
for i, cluster1 in enumerate(clusters)
for cluster2 in clusters[:i]],
key=lambda p: cluster_distance(p[0], p[1], distance_agg))
# remove them from the list of clusters
clusters = [c for c in clusters if c != c1 and c != c2]
# merge them, using merge_order = # of clusters left
merged_cluster = (len(clusters), [c1, c2])
# and add their merge
clusters.append(merged_cluster)
# when there's only one cluster left, return it
return clusters[0]
def generate_clusters(base_cluster, num_clusters):
# start with a list with just the base cluster
clusters = [base_cluster]
# as long as we don't have enough clusters yet...
while len(clusters) < num_clusters:
# choose the last-merged of our clusters
next_cluster = min(clusters, key=get_merge_order)
# remove it from the list
clusters = [c for c in clusters if c != next_cluster]
# and add its children to the list (i.e., unmerge it)
clusters.extend(get_children(next_cluster))
# once we have enough clusters...
return clusters
In [6]:
inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]
random.seed(0) # so you get the same results as me
clusterer = KMeans(3)
clusterer.train(inputs)
print("3-means:")
print(clusterer.means)
print()
In [7]:
plt.scatter([row for row, col in inputs[0:]], [col for row, col in inputs[0:]])
plt.margins(0.15, 0.3)
plt.title("User Locations")
plt.xlabel("blocks east of city center")
plt.ylabel("blocks north of city center")
plt.show()
In [8]:
random.seed(0)
clusterer = KMeans(2)
clusterer.train(inputs)
print("2-means:")
print(clusterer.means)
print()
In [35]:
print("errors as a function of k")
for k in range(1, len(inputs) + 1):
print(k, squared_clustering_errors(inputs, k))
print()
In [36]:
plot_squared_clustering_errors()
In [37]:
print("bottom up hierarchical clustering")
base_cluster = bottom_up_cluster(inputs)
print(base_cluster)
print()
In [38]:
print("three clusters, min:")
for cluster in generate_clusters(base_cluster, 3):
print(get_values(cluster))
print()
In [39]:
print("three clusters, max:")
base_cluster = bottom_up_cluster(inputs, max)
for cluster in generate_clusters(base_cluster, 3):
print(get_values(cluster))
print()
In [10]:
path_to_png_file = "./image/image.png"
plt.imshow(mpimg.imread(path_to_png_file))
plt.axis('off')
plt.show()
recolor_image(path_to_png_file)
In [66]:
base_cluster = bottom_up_cluster(inputs)
three_clusters = [get_values(cluster)
for cluster in generate_clusters(base_cluster, 3)]
for i, cluster, marker, color in zip([1, 2, 3],
three_clusters,
['D','o','*'],
['r','g','b']):
xs, ys = zip(*cluster) # magic unzipping trick
plt.scatter(xs, ys, color=color, marker=marker)
# put a number at the mean of the cluster
x, y = vector_mean(cluster)
plt.plot(x, y, marker='$' + str(i) + '$', color='black')
plt.margins(0.15, 0.3)
plt.title("User Locations -- 3 Bottom-Up Clusters, Min")
plt.xlabel("blocks east of city center")
plt.ylabel("blocks north of city center")
plt.show()
In [67]:
base_cluster = bottom_up_cluster(inputs, max)
three_clusters = [get_values(cluster)
for cluster in generate_clusters(base_cluster, 3)]
for i, cluster, marker, color in zip([1, 2, 3],
three_clusters,
['D','o','*'],
['r','g','b']):
xs, ys = zip(*cluster) # magic unzipping trick
plt.scatter(xs, ys, color=color, marker=marker)
# put a number at the mean of the cluster
x, y = vector_mean(cluster)
plt.plot(x, y, marker='$' + str(i) + '$', color='black')
plt.margins(0.15, 0.3)
plt.title("User Locations -- 3 Bottom-Up Clusters, Max")
plt.xlabel("blocks east of city center")
plt.ylabel("blocks north of city center")
plt.show()