In [1]:
import numpy as np
import scipy
import pandas
import treelib
import pyclust
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Read the comma-separated sample file; later cells plot the first two columns
# and use the third column as the reference labels.
df = pandas.read_csv('data/data_k5.csv', sep=',')
df.head(n=3)
Out[2]:
In [3]:
def plot_scatter(X, labels=None, title="Scatter Plot"):
    """Draw a 2-D scatter plot of ``X`` with one colour per distinct label.

    Parameters
    ----------
    X : array-like
        Points to plot. Column 0 is drawn on the x-axis and column 1 on the
        y-axis; any further columns are ignored.
    labels : array-like or None, optional
        One label per row of ``X``. When ``None`` every point is placed in a
        single group labelled 0.
    title : str, optional
        Title shown above the axes.

    Notes
    -----
    The palette holds five colours. With more than five distinct labels the
    colours are reused cyclically rather than raising ``IndexError``.
    Each call creates a fresh figure and finishes with ``plt.show()``.
    """
    X = np.asarray(X)
    if labels is None:
        labels = np.zeros(shape=X.shape[0], dtype=int)
    else:
        # Coerce to an array so ``labels == lab`` is element-wise. A plain list
        # compared to a scalar is just ``False`` and no point would be drawn.
        labels = np.asarray(labels)

    colors = ['b', 'r', 'g', 'm', 'y']
    unique_labels = np.unique(labels)
    # Wrap around the palette so any number of clusters can be drawn.
    col_dict = {
        lab: colors[pos % len(colors)]
        for pos, lab in enumerate(unique_labels)
    }

    fig, ax = plt.subplots(figsize=(8, 6))

    for lab in unique_labels:
        indx = np.where(labels == lab)[0]
        ax.scatter(X[indx, 0], X[indx, 1],
                   color=col_dict[lab], marker='o', s=100, alpha=0.5)

    plt.setp(ax.get_xticklabels(), rotation='horizontal', fontsize=16)
    plt.setp(ax.get_yticklabels(), rotation='vertical', fontsize=16)
    ax.set_xlabel('$x_1$', size=20)
    ax.set_ylabel('$x_2$', size=20)
    ax.set_title(title, size=20)
    plt.show()
## Sanity check: draw the raw points coloured by the labels stored in the file.
xy_original = df.iloc[:, :2].values
labels_original = df.iloc[:, 2].values
plot_scatter(
    xy_original,
    labels=labels_original,
    title="Scatter Plot: Original Labels",
)
In [4]:
# Fix the seed so the printed centres and the plot are repeatable after a
# kernel restart.
# NOTE(review): assumes pyclust picks its initial centres from NumPy's global
# RNG -- confirm against the pyclust source; the call is harmless otherwise.
np.random.seed(0)

# The first two columns are the coordinates passed to the clusterer.
points = df.iloc[:, 0:2].values

km = pyclust.KMeans(n_clusters=5)
km.fit(points)
print(km.centers_)
plot_scatter(points, labels=km.labels_, title="Scatter Plot: K-Means")
In [5]:
# Fix the seed so the printed labels and the plot are repeatable after a
# kernel restart.
# NOTE(review): assumes pyclust picks its initial centres from NumPy's global
# RNG -- confirm against the pyclust source; the call is harmless otherwise.
np.random.seed(0)

# The first two columns are the coordinates passed to the clusterer.
points = df.iloc[:, 0:2].values

bkm = pyclust.BisectKMeans(n_clusters=5)
bkm.fit(points)
print(bkm.labels_)
plot_scatter(points, labels=bkm.labels_, title="Scatter Plot: Bisecting K-Means")
In [6]:
# Print the tree held by the fitted model as ASCII text (presumably the
# bisection hierarchy -- confirm against pyclust).
bkm.tree_.show(line_type='ascii')
# Jupyter echoes only a cell's final expression, so the result of this
# 4-cluster cut is not displayed here.
bkm.cut(n_clusters=4)
# Displayed result: a (label array, {label: [x1, x2]}) pair. The dict values
# are presumably the cluster centres -- TODO confirm against pyclust.
bkm.cut(3)
(array([4, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 3,
3, 2, 3, 4, 3, 4, 4, 3, 4, 3, 4, 2, 4, 2, 4, 3, 2, 3, 2, 2, 4, 3, 2,
2, 4, 2, 4, 4, 2, 2, 4, 3, 3, 2, 4, 4, 4, 3, 3, 2, 2, 4, 2, 2, 2, 3,
4, 3, 2, 2, 4, 3, 2, 2, 3, 2, 3, 3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2,
2, 2, 2, 3, 4, 4, 2, 3, 2, 4, 2, 2, 2, 2, 2, 3, 4, 4, 4, 2, 2, 3, 4,
2, 2, 2, 4, 3]),
{2: [-8.1686500000000013, 4.1619483333333331],
3: [-9.2501724137931021, -2.0435517241379313],
4: [8.3429774193548365, -0.30114193548387092]})
In [7]:
# Element 0 of the cut result holds the per-point labels for the 2-cluster cut.
labels_k2 = bkm.cut(2)[0]
plot_scatter(
    df.iloc[:, :2].values,
    labels=labels_k2,
    title="Scatter Plot: Bisecting K-Means (2)",
)
In [8]:
# Element 0 of the cut result holds the per-point labels for the 3-cluster cut.
labels_k3 = bkm.cut(3)[0]
plot_scatter(
    df.iloc[:, :2].values,
    labels=labels_k3,
    title="Scatter Plot: Bisecting K-Means (3)",
)
In [9]:
# Element 0 of the cut result holds the per-point labels for the 4-cluster cut.
labels_k4 = bkm.cut(4)[0]
plot_scatter(
    df.iloc[:, :2].values,
    labels=labels_k4,
    title="Scatter Plot: Bisecting K-Means (4)",
)
In [ ]: