In [1]:
import numpy as np
import scipy
import pandas
import treelib
import pyclust
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Report which pyclust release the notebook was executed with.
version_banner = "pyclust version: %s" % pyclust.__version__
print(version_banner)
In [2]:
def plot_scatter(X, labels=None, title="Scatter Plot"):
    """Scatter-plot the first two columns of ``X``, one colour per distinct label.

    Parameters
    ----------
    X : array-like, shape (n_samples, >= 2)
        Points to draw. Column 0 goes on the horizontal axis and column 1 on
        the vertical axis; any further columns are ignored.
    labels : array-like of length n_samples, optional
        Group label for every row of ``X``. When ``None`` all points are put
        in a single group (label 0) and drawn in the first palette colour.
    title : str, optional
        Text shown above the axes.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Accept lists / nested sequences as well as ndarrays: the boolean mask
    # ``labels == lab`` below is only elementwise for an ndarray.
    X = np.asarray(X)
    if labels is None:
        labels = np.zeros(shape=X.shape[0], dtype=int)
    else:
        labels = np.asarray(labels)

    colors = ['b', 'r', 'g', 'm', 'y',
              'darkkhaki', 'royalblue', 'aqua', 'lawngreen', 'sienna']

    unique_labels = np.unique(labels)
    # Wrap around the palette so that more distinct labels than colours
    # re-uses colours instead of raising IndexError.
    col_dict = {lab: colors[pos % len(colors)]
                for pos, lab in enumerate(unique_labels)}

    fig1 = plt.figure(1, figsize=(10, 6))
    ax = fig1.add_subplot(1, 1, 1)

    for lab in unique_labels:
        indx = np.where(labels == lab)[0]
        ax.scatter(X[indx, 0], X[indx, 1], color=col_dict[lab],
                   marker='o', s=100, alpha=0.5)

    plt.setp(ax.get_xticklabels(), rotation='horizontal', fontsize=16)
    plt.setp(ax.get_yticklabels(), rotation='vertical', fontsize=16)
    ax.set_xlabel('$x_1$', size=20)
    ax.set_ylabel('$x_2$', size=20)
    ax.set_title(title, size=20)
    plt.show()
In [3]:
# Synthetic data set: seven 2-D Gaussian blobs with `nsamp` points each,
# labelled 1..7 in the order they are listed below.
nsamp = 40

# Seeded, private generator: "Restart & Run All" now regenerates the same
# points, and the global NumPy random state is left untouched.
rng = np.random.RandomState(0)

# (mean, covariance) of every blob, in the order a, b, c, d, e, f, g.
blob_params = [
    ((-2, 0),   [[0.12, 0], [0, 0.12]]),
    ((-3, 1),   [[0.08, 0.05], [0.05, 0.08]]),
    ((-3, -1),  [[0.08, -0.05], [-0.05, 0.08]]),
    ((-1, 1),   [[0.08, -0.05], [-0.05, 0.08]]),
    ((-1, -1),  [[0.08, 0.05], [0.05, 0.08]]),
    ((3, 0.6),  [[0.05, 0.0], [0.0, 0.05]]),
    ((3, -0.6), [[0.05, 0.0], [0.0, 0.05]]),
]

blobs = [rng.multivariate_normal(mean=mean, cov=cov, size=nsamp)
         for mean, cov in blob_params]
blob_labels = [k * np.ones(shape=nsamp, dtype=int)
               for k in range(1, len(blob_params) + 1)]

# Keep the per-blob names available for any cell that refers to them.
da, db, dc, dd, de, df, dg = blobs
ya, yb, yc, yd, ye, yf, yg = blob_labels

X = np.vstack(blobs)
y = np.hstack(blob_labels)

# Sanity check: one label per sample (previously printed X.shape twice).
print(X.shape, y.shape)
plot_scatter(X=X, labels=None, title="Scatter Plot")
In [4]:
# Fit pyclust's KMeans (n_clusters=5) on the first two columns of X, then
# print the fitted centers and plot the points coloured by assigned label.
features_2d = X[:, 0:2]
km = pyclust.KMeans(n_clusters=5)
km.fit(features_2d)
print(km.centers_)
plot_scatter(features_2d, labels=km.labels_, title="Scatter Plot: K-Means")
In [5]:
# Fit pyclust's BisectKMeans (n_clusters=10) on the first two columns of X,
# then print the per-sample labels and plot the points coloured by them.
features_2d = X[:, 0:2]
bkm = pyclust.BisectKMeans(n_clusters=10)
bkm.fit(features_2d)
print(bkm.labels_)
plot_scatter(features_2d, labels=bkm.labels_, title="Scatter Plot: Bisecting K-Means")
In [6]:
# Show the labelling obtained from the fitted bisecting model for every
# requested cluster count from 2 to 10.
for nclust in range(2, 11):
    # Query the cut once and reuse it for both the printout and the plot.
    # NOTE(review): assumes bkm.cut() is a side-effect-free query that returns
    # the same labels for the same argument -- confirm against pyclust.
    cut_labels = bkm.cut(nclust)
    print(np.unique(cut_labels))
    plot_scatter(X[:, 0:2], labels=cut_labels,
                 title="Scatter Plot: Bisecting K-Means (%d)" % nclust)