# In [6]:
# %load kmeans(git).py
'''
Created on the 11th, Sep, 2017
@author : HAO Zhaojun
'''
from numpy import *
from random import *
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Function performing Lloyd's algorithm (k-means clustering)
def Kmeans(Dataset, k, d, maxite=10, verbose=True):
    """Cluster the points of ``Dataset`` into ``k`` groups with Lloyd's algorithm.

    The initial centroids are ``k`` distinct data points drawn at random.
    Each pass assigns every point to its nearest centroid (squared Euclidean
    distance, ties going to the lowest cluster index) and then moves every
    centroid to the mean of the points assigned to it.  The passes stop as
    soon as one of them leaves every label unchanged, or after ``maxite``
    passes.

    Args:
        Dataset: sequence of points, each point a sequence of ``d`` numbers.
        k: number of clusters; must be at least 1 and at most the number of
            distinct points in ``Dataset``.
        d: dimension every point must have.
        maxite: maximum number of assignment/update passes.
        verbose: when true (the default), print the initial centroids and the
            centroids and labels obtained after every pass.

    Returns:
        A tuple ``(Center, Label)``: ``Center`` is a list of ``k`` numpy float
        arrays of length ``d``, ``Label`` is a list of ``len(Dataset)`` ints
        where ``Label[j]`` is the cluster index of ``Dataset[j]``.

    Raises:
        ValueError: if ``k`` is smaller than 1, if a point does not have
            ``d`` coordinates, if the data cannot be converted to floats, or
            if ``Dataset`` holds fewer than ``k`` distinct points.
    """
    n = len(Dataset)
    if k < 1:
        raise ValueError("k must be at least 1")
    for point in Dataset:
        if len(point) != d:
            raise ValueError("Data points are not of the given dimension")
    try:
        # Work in floating point so that integer and real inputs behave alike.
        points = asarray(Dataset, dtype=float).reshape(n, d)
    except (TypeError, ValueError) as err:
        raise ValueError("Not all data points are numeric") from err

    # Index of the first occurrence of every distinct point: drawing the
    # initial centroids among these guarantees k different centroids and
    # makes the "not enough distinct points" case detectable up front.
    seen = set()
    distinct = []
    for idx, row in enumerate(points.tolist()):
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            distinct.append(idx)
    if len(distinct) < k:
        raise ValueError("Dataset contains fewer than k distinct points")

    Center = [points[idx].copy() for idx in sample(distinct, k)]
    if verbose:
        print("initial centers are :", Center)

    # Label[j] is the cluster currently assigned to the j-th point.
    Label = [0] * n
    for ite in range(maxite):
        if verbose:
            print(ite, "th iteration : ")
        # Squared distance from every point to every centroid, shape (n, k).
        diff = points[:, None, :] - asarray(Center)[None, :, :]
        nearest = argmin((diff ** 2).sum(axis=2), axis=1)
        change = int(count_nonzero(nearest != asarray(Label)))
        Label = nearest.tolist()

        # Update of centers; a cluster that lost all its points keeps its
        # previous centroid instead of dividing by zero.
        for clu in range(k):
            members = points[nearest == clu]
            if len(members):
                Center[clu] = members.sum(axis=0) / len(members)
        if verbose:
            print("new centers are :\n", Center)
            print("new label are : ", Label)

        if change == 0:
            # No label moved during this pass: the clustering is stable.
            break
    return (Center, Label)
## main function
# Demo run: five sample points in the plane, split into two clusters.
a, b, c, d, f = [2, 2], [1, 2], [1, 1], [0, 0], [3, 2]
Dataset = [a, b, c, d, f]
# Arguments are (data, number of clusters, dimension of each point).
res = Kmeans(Dataset, 2, 2)
print(res)
# In [ ]: