In [14]:
import random
def make_data(n_points, n_clusters=2, dim=2, sigma=1):
    x = [[] for i in range(dim)]
    for i in range(n_clusters):
        for d in range(dim):
            x[d].extend([random.gauss(i*3,sigma) for j in range(n_points)])
    return x
#[x,y] = make_data(100, 3)
scatter(*make_data(75, 3))


Out[14]:
<matplotlib.collections.PathCollection at 0x10d7eb310>

In [24]:
def centroid(x):
    return [[sum(col)/float(len(x[0]))] for col in x]
ex1 = make_data(100, 1)
#print ex1
scatter(*ex1, color="green")
print centroid(ex1)
scatter(*centroid(ex1), color="red", marker="x")


[[-0.13568663211290832], [0.070437061579248339]]
Out[24]:
<matplotlib.collections.PathCollection at 0x10dcf0d50>

In [42]:
co = ["red", "orange", "yellow", "green", "blue", "black","brown"]
def guess_clusters(x, n_clusters):
    # req co list of identifiers
    for i in range(len(x[0])):
        return [ co[random.choice(range(n_clusters))] for i in range(len(x[0]))]
ex2 = make_data(50, 2)
scatter(*ex2)
mem_ex2 = guess_clusters(ex2,2)
#print mem_ex2
scatter(*ex2, color=mem_ex2)


Out[42]:
<matplotlib.collections.PathCollection at 0x10e24c6d0>

In [32]:
def select_members(x, membership, cluster):
    return [ [i for i,label in zip(dim, membership) if label == cluster] for dim in x ]
print mem_ex2.count("red")
scatter(*select_members(ex2, mem_ex2, "red"), color="red")
scatter(*centroid(select_members(ex2, mem_ex2, "red")), color="red", marker="x")


51
Out[32]:
<matplotlib.collections.PathCollection at 0x10e08b390>

In [33]:
print mem_ex2.count("orange")
scatter(*select_members(ex2, mem_ex2, "orange"), color="orange")
scatter(*centroid(select_members(ex2, mem_ex2, "orange")), color="orange", marker="x")


49
Out[33]:
<matplotlib.collections.PathCollection at 0x10e114e10>

In [30]:
import math
def distance(p1, p2):
    # odd... vectors are lists of lists with only 1 element in each dim
    return math.sqrt(sum([(i[0]-j[0])**2 for i,j in zip(p1, p2)]))
print distance([[-1],[-1]],[[2],[3]])


5.0

In [34]:
import sys
def reassign(x, centriods):
    membership = []
    for idx in range(len(x[0])):
        min_d = sys.maxint
        cluster = ""
        for c, vc in centriods.items():
            dist = distance(vc, [[t[idx]] for t in x])
            if dist < min_d:
                min_d = dist
                cluster = c
        membership.append(cluster)
    return membership

cent_ex2 = {i:centroid(select_members(ex2, mem_ex2, i)) for i in co[:2]}
mem_ex2 = reassign(ex2, cent_ex2)
scatter(*ex2, color=mem_ex2)
scatter(*cent_ex2["red"], color="red", marker="x")
scatter(*cent_ex2["orange"], color="orange", marker="x")


Out[34]:
<matplotlib.collections.PathCollection at 0x10e1afc10>

In [46]:
# function
def get_centroids(x, membership):
    return {i:centroid(select_members(x, membership, i)) for i in set(membership)}

# redifine with total distance measure
def reassign(x, centroids):
    membership, scores = [], {}
    # step through all the vectors
    for idx in range(len(x[0])):
        min_d, cluster = sys.maxint, None
        for c, vc in centroids.items():
            dist = distance(vc, [[t[idx]] for t in x])
            if dist < min_d:
                min_d = dist
                cluster = c
        scores[cluster] = min_d + scores.get(cluster, 0)
        membership.append(cluster)
    return membership, sum(scores.values())/float(len(x[0]))

def k_means(data, k):
    # start with random distribution
    membership = guess_clusters(data, k)
    score, last_score = 0.0, sys.maxint
    while abs(last_score - score) > 1e-7:
        last_score = score
        c = get_centroids(data, membership)
        membership, score = reassign(data, c)
        #print last_score - score
    return membership, c, score
        
ex3 = make_data(100, 7, sigma=1) 
mem_ex3, cl_ex3, s_ex3 = k_means(ex3, 7)
scatter(*ex3, color = mem_ex3)
for i, pt in cl_ex3.items():
    scatter(*pt, color=i, marker="x")



In [47]:
ex4 = make_data(200, 4) 
err = []
trial_ks = range(1,8)
for k in trial_ks:
    mem_ex4, cl_ex4, s_ex4 = k_means(ex4, k)
    err.append(s_ex4)
scatter(*ex4, color = mem_ex4)


Out[47]:
<matplotlib.collections.PathCollection at 0x10e394710>

In [48]:
plot(trial_ks, err)


Out[48]:
[<matplotlib.lines.Line2D at 0x10e361a90>]

In [10]: