In [1]:
import numpy as np
import scipy
import pandas
import treelib
import pyclust

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

print("pyclust version: %s"%pyclust.__version__)


pyclust version: 0.0.5

In [2]:
def plot_scatter(X, labels=None, title="Scatter Plot"):
    
    labels = np.zeros(shape=X.shape[0], dtype=int) if labels is None else labels
    colors = ['b', 'r', 'g', 'm', 'y', 
              'darkkhaki', 'royalblue', 'aqua', 'lawngreen', 'sienna']
    col_dict = {}
    i = 0
    for lab in np.unique(labels):
        col_dict[lab] = colors[i]
        i += 1 
    
    fig1 = plt.figure(1, figsize=(10,6))
    ax = fig1.add_subplot(1, 1, 1)

    for i in np.unique(labels):
        indx = np.where(labels == i)[0]
        plt.scatter(X[indx,0], X[indx,1], color=col_dict[i], marker='o', s=100, alpha=0.5)

    plt.setp(ax.get_xticklabels(), rotation='horizontal', fontsize=16)
    plt.setp(ax.get_yticklabels(), rotation='vertical', fontsize=16)

    plt.xlabel('$x_1$', size=20)
    plt.ylabel('$x_2$', size=20)
    plt.title(title, size=20)

    plt.show()

In [3]:
nsamp = 40
da = np.random.multivariate_normal(mean=(-2,0), cov=[[0.12,0],[0,0.12]], size=nsamp)
ya = np.ones(shape=nsamp, dtype=int)

db = np.random.multivariate_normal(mean=(-3,1), cov=[[0.08,0.05],[0.05,0.08]], size=nsamp)
yb = 2*np.ones(shape=nsamp, dtype=int)

dc = np.random.multivariate_normal(mean=(-3,-1), cov=[[0.08,-0.05],[-0.05,0.08]], size=nsamp)
yc = 3*np.ones(shape=nsamp, dtype=int)

dd = np.random.multivariate_normal(mean=(-1,1), cov=[[0.08,-0.05],[-0.05,0.08]], size=nsamp)
yd = 4*np.ones(shape=nsamp, dtype=int)

de = np.random.multivariate_normal(mean=(-1,-1), cov=[[0.08,0.05],[0.05,0.08]], size=nsamp)
ye = 5*np.ones(shape=nsamp, dtype=int)

df = np.random.multivariate_normal(mean=(3,0.6), cov=[[0.05,0.0],[0.0,0.05]], size=nsamp)
yf = 6*np.ones(shape=nsamp, dtype=int)

dg = np.random.multivariate_normal(mean=(3,-0.6), cov=[[0.05,0.0],[0.0,0.05]], size=nsamp)
yg = 7*np.ones(shape=nsamp, dtype=int)


X = np.vstack((da, db, dc, dd, de, df, dg))
y = np.hstack((ya, yb, yc, yd, ye, yf, yg))
print(X.shape, X.shape)

plot_scatter(X=X, labels=None, title="Scatter Plot")


(280, 2) (280, 2)

In [4]:
km = pyclust.KMeans(n_clusters=5)

km.fit(X[:,0:2])

print(km.centers_)

plot_scatter(X[:,0:2], labels=km.labels_, title="Scatter Plot: K-Means")


[[-1.13460083 -0.85638685]
 [-1.04989029  0.8532586 ]
 [-2.8915434  -0.92484077]
 [-2.78717847  0.79721536]
 [ 2.99274955 -0.01290366]]
/home/vahid/anaconda/envs/py34/lib/python3.4/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)

In [5]:
bkm = pyclust.BisectKMeans(n_clusters=10)

bkm.fit(X[:,0:2])

print(bkm.labels_)

plot_scatter(X[:,0:2], labels=bkm.labels_, title="Scatter Plot: Bisecting K-Means")


Bisecting Step 1    : 0 [  51.07045976  253.60971881] [[ 2.99274955 -0.01290366]
 [-1.98245776 -0.02451446]]
Bisecting Step 2    : 2 [ 100.78309968   96.17639432] [[-2.8352682   0.00371891]
 [-1.09483874 -0.05390021]]
Bisecting Step 3    : 3 [ 16.84066346  35.47012966] [[-2.92965556 -0.97936517]
 [-2.76366399  0.74950684]]
Bisecting Step 4    : 4 [ 27.50676021  20.96054304] [[-1.12151778 -0.87333229]
 [-1.06588914  0.83527076]]
Bisecting Step 5    : 1 [ 10.62349642  10.02446349] [[ 3.02895381 -0.61823316]
 [ 2.95654529  0.59242585]]
Bisecting Step 6    : 6 [ 15.12758377   4.6157115 ] [[-2.99982208  1.00803731]
 [-2.23886824  0.17499468]]
Bisecting Step 7    : 7 [ 15.49669722   1.93785081] [[-0.9549116  -1.03971505]
 [-1.72735843 -0.26830409]]
Bisecting Step 8    : 8 [  1.48905062  13.93823554] [[-1.71488998  0.31422698]
 [-0.95231399  0.92645342]]
Bisecting Step 9    : 5 [ 4.90060982  6.0317567 ] [[-3.22599331 -0.82875769]
 [-2.68270744 -1.10487141]]
[12 14 12 15 12 15 15 15 12 12 14 12 12 12 12 14 14 18 17 12 14 18 15 12 18
 14 15 14 12 12 12 15 14 12 14 14 12 14 12 12 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 18 18 18 18 18 18 18 18 17 18 17 18 17 17 17 17 17 18 17 17
 17 18 18 17 17 18 18 18 18 17 17 18 17 17 18 17 17 18 17 18 16 16 16 16 16
 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
 16 16 16 16 16 16 16 16 16 16 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10  9  9  9  9  9  9  9  9  9  9
  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
  9  9  9  9  9]

In [6]:
for nclust in range(2,11):
    print(np.unique(bkm.cut(nclust)))
    plot_scatter(X[:,0:2], labels=bkm.cut(nclust), title="Scatter Plot: Bisecting K-Means (%d)"%nclust)


[1 2]
[1 3 4]
[1 4 5 6]
[1 5 6 7 8]
[ 5  6  7  8  9 10]
[ 5  7  8  9 10 11 12]
[ 5  8  9 10 11 12 13 14]
[ 5  9 10 11 12 13 14 15 16]
[ 9 10 11 12 13 14 15 16 17 18]