In [21]:
import random
import operator
import numpy as np 
from copy import deepcopy

class point:
    """A data sample: a feature vector plus the cluster id it is assigned to."""
    def __init__(self, listLine):
        # Materialize so a map() iterator (Python 3) becomes an indexable,
        # len()-able list; also copies a passed-in list defensively.
        self.feature = list(listLine)
        self.label = None  # cluster id, assigned later by kmeans()
class label:
    """A centroid: its feature vector plus the points currently assigned to it."""
    def __init__(self, listLine):
        # Copy so later centroid-mean updates never mutate the source
        # point's feature list; also accepts Python 3 map() iterators.
        self.feature = list(listLine)
        self.cluster = []  # points assigned to this centroid in the last pass

In [22]:
# Load data: one space-separated feature vector per line.
# NOTE(review): hardcoded absolute path — consider a configurable data dir.
filename = '/Users/wy/Desktop/kmean_data.txt'
points = []

with open(filename, 'r') as f:
    lines = f.readlines()

for line in lines:
    # list(...) is required on Python 3, where map() returns an iterator
    # that cannot be indexed or len()-ed.
    listLine = list(map(float, line.strip().split(' ')))
    # Initialize one point per input row.
    points.append(point(listLine))

# Number of feature dimensions — taken from the first row instead of relying
# on the for-loop variable leaking out of the loop (all rows assumed equal).
dimension = len(points[0].feature) if points else 0

# 初始化 label
def initialization_label(k,points):
    labels=[]
    # random以利於隨機初始label
    new_points = deepcopy(points)
    np.random.shuffle(new_points)  
    for index, line in enumerate(new_points[:k]):
        index = label(line.feature)
#         print(line.feature)
        labels.append(index) 
    return labels

In [3]:
# Display each point's index, assigned cluster label, and feature vector.
def show_points(points):
    """Print one line per point: '<i>:分類<label>\tfeature-<f0>\t<f1>...'."""
    for index, item in enumerate(points):
        pieces = [str(findex) + ':' + str(fitem) + '\t'
                  for findex, fitem in enumerate(item.feature)]
        t = "".join(pieces)
        print(str(index) + ':' + "分類" + str(item.label) + "\tfeature-" + str(t))
        
# Display each centroid's index and feature vector.
def show_labels(labels):
    """Print one line per centroid: 'label<i>\tfeature-<f0>\t<f1>...'."""
    for index, item in enumerate(labels):
        pieces = []
        for findex, fitem in enumerate(item.feature):
            pieces.append(str(findex) + ':' + str(fitem) + '\t')
        t = "".join(pieces)
        print("label" + str(index) + "\tfeature-" + str(t))

In [4]:
def kmeans(k, labels, points, num_basic_kmeans, dim=None):
    """Run one Lloyd iteration: assign every point to its nearest centroid,
    then move each centroid to the mean of its assigned points.

    k: kept for interface compatibility (cluster count is len(labels)).
    labels: centroid objects (mutated: .cluster rebuilt, .feature updated).
    points: point objects (mutated: .label assigned).
    num_basic_kmeans: offset added to numeric labels so successive
        bisecting runs don't reuse cluster ids.
    dim: feature dimensionality; defaults to the points' feature length
        (previously read from a module-level global of the same value).
    Returns (sse, labels) where sse is the sum of squared errors.
    """
    if dim is None:
        dim = len(points[0].feature) if points else 0
    # sse = sum of squared errors for this assignment pass
    sse = 0

    # Clear previous memberships so this pass starts fresh.
    for lab in labels:
        lab.cluster = []

    # Step 1: cluster assignment.
    for p in points:
        # Squared distance from this point to every centroid.
        dists = []
        for lab in labels:
            err = 0
            for c in range(dim):
                err += (lab.feature[c] - p.feature[c]) ** 2
            dists.append(err)
        # Hoisted: the original recomputed tp.index(min(tp)) four times here.
        best = dists.index(min(dists))
        p.label = best + num_basic_kmeans
        sse += dists[best]
        labels[best].cluster.append(p)

    # Step 2: move each non-empty centroid to the mean of its cluster.
    for lab in labels:
        if len(lab.cluster) != 0:
            for c in range(dim):
                total = 0
                for member in lab.cluster:
                    total += member.feature[c]
                lab.feature[c] = float(total) / float(len(lab.cluster))

    # labels is returned so basic_kmeans can iterate on it.
    return (sse, labels)

In [5]:
# Standard k-means: iterate Lloyd passes until the SSE stops changing.
def basic_kmeans(k, points, num_basic_kmeans, times):
    """Run k-means to convergence.

    points: point objects (mutated: .label assigned per pass).
    num_basic_kmeans: cluster-id offset forwarded to kmeans() (used by
        bisecting k-means so ids stay unique across splits).
    times: number of random restarts tried when choosing initial centroids.
    Returns (sse, labels) once two consecutive passes give the same SSE.
    """
    labels = optimization(k, points, times)
    # Seed the convergence test with two passes (unused locals removed:
    # the original kept a pass counter and a discarded first label list).
    psse, _ = kmeans(k, labels, points, num_basic_kmeans)
    sse, labelList = kmeans(k, labels, points, num_basic_kmeans)
    while psse != sse:
        psse = sse
        sse, labelList = kmeans(k, labels, points, num_basic_kmeans)
    return (sse, labelList)

In [6]:
# Cost function: SSE = sum of squared errors to the nearest centroid.
def costfunction(labels, points):
    """Return the sum over points of the squared distance to the
    nearest centroid in `labels`. Empty `points` yields 0."""
    if not points:
        return 0
    # Dimensionality from the data itself (previously a module-level global
    # of the same value; all rows assumed equal length).
    dim = len(points[0].feature)
    sse = 0
    for p in points:
        dists = []
        for lab in labels:
            err = 0
            for c in range(dim):
                err += (lab.feature[c] - p.feature[c]) ** 2
            dists.append(err)
        # min(dists) replaces the original tp[tp.index(min(tp))].
        sse += min(dists)
    return sse

In [7]:
# Bisecting k-means: repeatedly 2-way split the worst cluster.
def bisecting_Kmeans(points, nbk, times):
    """Grow to `nbk` clusters by starting with one 2-way split and then
    repeatedly splitting the cluster with the LARGEST SSE (standard
    bisecting rule). Prints cluster contents after each split, separated
    by '@@' lines. times = random restarts per split.
    """
    # First split: all points into two clusters (label offset 0).
    sse, labels = basic_kmeans(2, points, 0, times)
    show_points(labels[0].cluster)
    print('@@')
    show_points(labels[1].cluster)
    print('@@')
    minsseDict = {}
    clusterDict = {}
    # nbk clusters need nbk-1 binary splits; one is already done, so nbk-2
    # remain. (Loop variable renamed — the original shadowed parameter nbk.)
    for i in range(nbk - 2):
        sse0 = costfunction(labels, labels[0].cluster)
        minsseDict[i] = sse0
        clusterDict[i] = labels[0].cluster
        # keys i and i+2: each split adds two clusters, offset avoids overlap
        sse1 = costfunction(labels, labels[1].cluster)
        minsseDict[i + 2] = sse1
        clusterDict[i + 2] = labels[1].cluster
        # Pick the cluster with the largest SSE to split next.
        # .items() replaces Python-2-only .iteritems() (works on both).
        key = max(minsseDict.items(), key=operator.itemgetter(1))[0]
        tppoints = clusterDict[key]
        del minsseDict[key]
        del clusterDict[key]
        sse, labels = basic_kmeans(2, tppoints, i * 2 + 2, times)
        # Show the points that were just re-split.
        print('@@')
        show_points(tppoints)

In [8]:
# Use the first-pass SSE to pick a good set of initial centroids.
def optimization(k, points, times):
    """Draw `times` random initializations and return the one whose initial
    SSE is smallest; times == 0 means a single un-scored draw."""
    if times != 0:
        best_labels = None
        best_sse = None
        for _ in range(times):
            labels = initialization_label(k, points)
            sse = costfunction(labels, points)
            # Track the running minimum directly — the original rebuilt the
            # argmin over a growing list on every loop iteration.
            if best_sse is None or sse < best_sse:
                best_sse = sse
                best_labels = labels
    else:
        best_labels = initialization_label(k, points)
    return best_labels

In [9]:
def choose_k(points, k_range, times):
    """Choose k via the elbow method: run k-means for k = 1..k_range-1,
    collect SSEs, and return the k with the largest SSE drop.

    Prints the SSE list for inspection.
    """
    k_candidate = []
    # Sweep candidate values of k.
    for k in range(1, k_range):
        # NOTE: the original also called optimization() here, but its result
        # was discarded — basic_kmeans performs its own initialization.
        sse, labelList = basic_kmeans(k, points, 0, times)
        k_candidate.append(sse)
    # Two leading zeros align slope index i with cluster count k.
    sse_slopeList = [0, 0]
    print(k_candidate)
    for a in range(len(k_candidate) - 1):
        # Slope between consecutive k values (negative while SSE still drops).
        sse_slope = k_candidate[a + 1] - k_candidate[a]
        sse_slopeList.append(sse_slope)
    # The most negative slope marks the elbow.
    best_k = sse_slopeList.index(min(sse_slopeList))
    return best_k

In [10]:
# Run plain k-means with k=4 (10 random restarts) and display the assignments.
basic_kmeans(4,points,0,10)
show_points(points)


0:分類0	feature-0:0.1	1:0.1	
1:分類0	feature-0:0.1	1:0.2	
2:分類0	feature-0:0.2	1:0.1	
3:分類0	feature-0:0.2	1:0.2	
4:分類1	feature-0:10.1	1:0.1	
5:分類1	feature-0:10.1	1:0.2	
6:分類1	feature-0:10.2	1:0.2	
7:分類1	feature-0:10.2	1:0.1	
8:分類2	feature-0:0.1	1:10.1	
9:分類2	feature-0:0.2	1:10.1	
10:分類2	feature-0:0.1	1:10.2	
11:分類2	feature-0:0.2	1:10.2	
12:分類3	feature-0:10.1	1:10.1	
13:分類3	feature-0:10.1	1:10.2	
14:分類3	feature-0:10.2	1:10.2	
15:分類3	feature-0:10.2	1:10.1	

In [16]:
# Find the best k via the elbow method, scanning k = 1..5 with 10 restarts.
choose_k(points,6,10)


[800.0799999999999, 400.0799999999999, 200.07999999999993, 0.07999999999999972, 0.06999999999999972]
Out[16]:
2

In [23]:
# Run bisecting k-means to 4 clusters (10 restarts per split); prints each split.
bisecting_Kmeans(points,4,10)


0:分類0	feature-0:10.1	1:0.1	
1:分類0	feature-0:10.1	1:0.2	
2:分類0	feature-0:10.2	1:0.2	
3:分類0	feature-0:10.2	1:0.1	
4:分類0	feature-0:10.1	1:10.1	
5:分類0	feature-0:10.1	1:10.2	
6:分類0	feature-0:10.2	1:10.2	
7:分類0	feature-0:10.2	1:10.1	
@@
0:分類1	feature-0:0.1	1:0.1	
1:分類1	feature-0:0.1	1:0.2	
2:分類1	feature-0:0.2	1:0.1	
3:分類1	feature-0:0.2	1:0.2	
4:分類1	feature-0:0.1	1:10.1	
5:分類1	feature-0:0.2	1:10.1	
6:分類1	feature-0:0.1	1:10.2	
7:分類1	feature-0:0.2	1:10.2	
@@
@@
0:分類2	feature-0:10.1	1:0.1	
1:分類2	feature-0:10.1	1:0.2	
2:分類2	feature-0:10.2	1:0.2	
3:分類2	feature-0:10.2	1:0.1	
4:分類3	feature-0:10.1	1:10.1	
5:分類3	feature-0:10.1	1:10.2	
6:分類3	feature-0:10.2	1:10.2	
7:分類3	feature-0:10.2	1:10.1	
@@
0:分類4	feature-0:0.1	1:0.1	
1:分類4	feature-0:0.1	1:0.2	
2:分類4	feature-0:0.2	1:0.1	
3:分類4	feature-0:0.2	1:0.2	
4:分類5	feature-0:0.1	1:10.1	
5:分類5	feature-0:0.2	1:10.1	
6:分類5	feature-0:0.1	1:10.2	
7:分類5	feature-0:0.2	1:10.2	

In [24]:
show_points(points)  # final assignments after bisecting k-means


0:分類4	feature-0:0.1	1:0.1	
1:分類4	feature-0:0.1	1:0.2	
2:分類4	feature-0:0.2	1:0.1	
3:分類4	feature-0:0.2	1:0.2	
4:分類2	feature-0:10.1	1:0.1	
5:分類2	feature-0:10.1	1:0.2	
6:分類2	feature-0:10.2	1:0.2	
7:分類2	feature-0:10.2	1:0.1	
8:分類5	feature-0:0.1	1:10.1	
9:分類5	feature-0:0.2	1:10.1	
10:分類5	feature-0:0.1	1:10.2	
11:分類5	feature-0:0.2	1:10.2	
12:分類3	feature-0:10.1	1:10.1	
13:分類3	feature-0:10.1	1:10.2	
14:分類3	feature-0:10.2	1:10.2	
15:分類3	feature-0:10.2	1:10.1