In [1]:
import numpy as np
import scipy
import pandas
import treelib
import pyclust

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the sample points. The file is comma-separated, which is the
# default delimiter of read_csv.
df = pandas.read_csv('data/data_k5.csv')

# Preview the first three rows.
df.head(3)


Out[2]:
x y label
0 9.3716 -0.4233 4
1 10.1449 0.0706 4
2 -9.7006 4.7697 1

In [3]:
def plot_scatter(X, labels=None, title="Scatter Plot"):
    """Draw a 2-D scatter plot of ``X`` with one colour per distinct label.

    Parameters
    ----------
    X : array-like with at least two columns
        Points to plot. Column 0 is drawn on the horizontal axis and
        column 1 on the vertical axis.
    labels : array-like with one entry per row of ``X``, optional
        Group label of every point. When omitted, all points are treated
        as a single group.
    title : str
        Text shown above the axes.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly one entry per row of ``X``.

    Notes
    -----
    The palette holds five colours; with more than five distinct labels
    the colours repeat from the start instead of raising ``IndexError``.
    """
    X = np.asarray(X)

    if labels is None:
        labels = np.zeros(shape=X.shape[0], dtype=int)
    else:
        # A plain Python list would turn `labels == lab` into a scalar
        # comparison and silently plot nothing, so normalise to an array.
        labels = np.asarray(labels).ravel()

    if labels.shape[0] != X.shape[0]:
        raise ValueError(
            "labels must have one entry per row of X "
            "(got %d labels for %d rows)" % (labels.shape[0], X.shape[0])
        )

    palette = ['b', 'r', 'g', 'm', 'y']

    # A fresh figure per call, so repeated calls never draw onto a figure
    # left over from an earlier one.
    fig, ax = plt.subplots(figsize=(8, 6))

    for position, lab in enumerate(np.unique(labels)):
        members = labels == lab
        ax.scatter(
            X[members, 0],
            X[members, 1],
            # Wrap around the palette when there are more groups than colours.
            color=palette[position % len(palette)],
            marker='o',
            s=100,
            alpha=0.5,
        )

    plt.setp(ax.get_xticklabels(), rotation='horizontal', fontsize=16)
    plt.setp(ax.get_yticklabels(), rotation='vertical', fontsize=16)

    ax.set_xlabel('$x_1$', size=20)
    ax.set_ylabel('$x_2$', size=20)
    ax.set_title(title, size=20)

    plt.show()
    
# Try the helper on the loaded data: the first two columns are the
# coordinates and the third column supplies the colouring labels.
plot_scatter(
    df.iloc[:, :2].values,
    labels=df.iloc[:, 2].values,
    title="Scatter Plot: Original Labels",
)


KMeans Clustering

K = 5


In [4]:
# Fit a flat K-Means model with five clusters on the two coordinate columns.
# NOTE(review): no random seed is set in this notebook; if pyclust
# initialises its centres randomly, the output below may differ between
# runs -- confirm and seed if so.
km = pyclust.KMeans(n_clusters=5)
km.fit(df.iloc[:, :2].values)

# Show the centres stored on the fitted model.
print(km.centers_)

# Colour every point by the label K-Means assigned to it.
plot_scatter(
    df.iloc[:, :2].values,
    labels=km.labels_,
    title="Scatter Plot: K-Means",
)


[[ 9.94503684 -0.92278421]
 [-6.57869259  2.69908148]
 [ 7.01883636  0.71122727]
 [-9.41249706  5.21222647]
 [-9.25017241 -2.04355172]]

Bisecting K-Means


In [5]:
# Fit Bisecting K-Means with five clusters on the two coordinate columns.
# NOTE(review): no random seed is set in this notebook; if pyclust
# initialises randomly, the labels below may differ between runs --
# confirm and seed if so.
bkm = pyclust.BisectKMeans(n_clusters=5)
bkm.fit(df.iloc[:, :2].values)

# Show the label stored for every sample.
print(bkm.labels_)

# Colour every point by its Bisecting K-Means label.
plot_scatter(
    df.iloc[:, :2].values,
    labels=bkm.labels_,
    title="Scatter Plot: Bisecting K-Means",
)


[4 4 8 7 4 7 6 8 6 8 7 3 8 6 6 4 6 8 8 8 4 8 3 3 6 3 4 3 4 4 3 4 3 4 8 4 6
 4 3 6 3 6 6 4 3 8 6 4 6 4 4 8 6 4 3 3 8 4 4 4 3 3 8 8 4 7 7 6 3 4 3 7 6 4
 3 8 7 3 8 3 3 6 3 3 6 3 7 8 6 7 3 6 7 6 6 3 4 4 6 3 6 4 8 6 6 8 6 3 4 4 4
 6 8 3 4 6 6 7 4 3]

In [6]:
# Print the tree kept on the fitted model, drawn with plain ASCII connectors.
# Presumably `tree_` is a treelib Tree recording the successive splits -- TODO confirm.
bkm.tree_.show(line_type='ascii')


0
|-- 1
|   |-- 3
|   +-- 4
+-- 2
    |-- 5
    |   |-- 7
    |   +-- 8
    +-- 6

Cutting the tree structure

  • Cut the tree to get a clustering with a new value of n_clusters
    • bkm.cut(n_clusters=4)
  • It returns a tuple:
    • the first element is the array of new cluster memberships
    • the second element is a dictionary mapping each cluster label to its centroid

Example

bkm.cut(3)

(array([4, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 3,
        3, 2, 3, 4, 3, 4, 4, 3, 4, 3, 4, 2, 4, 2, 4, 3, 2, 3, 2, 2, 4, 3, 2,
        2, 4, 2, 4, 4, 2, 2, 4, 3, 3, 2, 4, 4, 4, 3, 3, 2, 2, 4, 2, 2, 2, 3,
        4, 3, 2, 2, 4, 3, 2, 2, 3, 2, 3, 3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2,
        2, 2, 2, 3, 4, 4, 2, 3, 2, 4, 2, 2, 2, 2, 2, 3, 4, 4, 4, 2, 2, 3, 4,
        2, 2, 2, 4, 3]),
 {2: [-8.1686500000000013, 4.1619483333333331],
  3: [-9.2501724137931021, -2.0435517241379313],
  4: [8.3429774193548365, -0.30114193548387092]})

In [7]:
# Cut the fitted tree down to two clusters and plot the resulting memberships
# (element 0 of the tuple returned by cut).
plot_scatter(
    df.iloc[:, :2].values,
    labels=bkm.cut(2)[0],
    title="Scatter Plot: Bisecting K-Means (2)",
)



In [8]:
# Cut the fitted tree down to three clusters and plot the resulting memberships
# (element 0 of the tuple returned by cut).
plot_scatter(
    df.iloc[:, :2].values,
    labels=bkm.cut(3)[0],
    title="Scatter Plot: Bisecting K-Means (3)",
)



In [9]:
# Cut the fitted tree down to four clusters and plot the resulting memberships
# (element 0 of the tuple returned by cut).
plot_scatter(
    df.iloc[:, :2].values,
    labels=bkm.cut(4)[0],
    title="Scatter Plot: Bisecting K-Means (4)",
)



In [ ]: