In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import MyML.metrics.accuracy as acc
import seaborn as sns
import sklearn.cluster as skl_cluster
import scipy.cluster.hierarchy as hie
In [2]:
data_path = "/home/chiroptera/workspace/QCThesis/datasets/iris/iris.csv"
data = np.genfromtxt(data_path, delimiter=',', dtype=np.float32)
gt = np.empty(150, dtype=np.int32)
gt[:50] = 0
gt[50:100] = 1
gt[100:] = 2
In [3]:
sns.set_style("whitegrid")
fig_width = 8
fig_height = 6
markers=['.', '^', '*', 'd', 'v', 'o',
'<', '>', 's', 'p', '*',
'h', '+', 'D', 'd', '|', '_']
In [4]:
fig_feature_1 = 0
fig_feature_2 = 1
In [18]:
figRaw = plt.figure(figsize=(fig_width,fig_height))
sns.set_palette(sns.color_palette("deep", 6))
ax = figRaw.add_subplot(111)
ax.plot(data[:,fig_feature_1], data[:,fig_feature_2], '.')
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("Raw data")
ax.legend(loc="best")
ax.grid(False, which="both")
In [125]:
figGT = plt.figure(figsize=(fig_width,fig_height))
sns.set_palette(sns.color_palette("deep", 6))
mi = iter(markers)
ax = figGT.add_subplot(111)
for c in np.unique(gt):
idx = gt == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("Desired clustering")
ax.legend(loc="best")
ax.grid(False, which="both")
I ran the KMeans algorithm with 1 iteration with random initialization until I got a low accuracy score but whose centroids would converge to a good solution. Then I executed the KMeans algorithm until converce, iteration by iteration (saving centoids and labels of each in convergence_list), with the previous centroids as initialization.
In [323]:
convergence_list = list()
In [341]:
convergence_list.append((centroids.copy(), labels.copy()))
In [321]:
# KMeans cluster
kmeans_est = skl_cluster.KMeans(n_clusters=3, init='random', n_init=1, max_iter=1)
keans_res = kmeans_est.fit(data)
# accuracy
acc_est = acc.ConsistencyIndex(data.shape[0])
acc_est.score(gt, kmeans_est.labels_)
print "Accuracy of K-Means: {}".format(acc_est.accuracy)
centroids = keans_res.cluster_centers_
labels = keans_res.labels_
In [344]:
# KMeans cluster
kmeans_est = skl_cluster.KMeans(n_clusters=3, init=centroids, n_init=1, max_iter=1)
keans_res = kmeans_est.fit(data)
centroids = keans_res.cluster_centers_
# accuracy
acc_est = acc.ConsistencyIndex(data.shape[0])
acc_est.score(gt, kmeans_est.labels_)
print "Accuracy of K-Means: {}".format(acc_est.accuracy)
centroids = keans_res.cluster_centers_
labels = keans_res.labels_
In [326]:
figKM = plt.figure(figsize=(fig_width,fig_height))
sns.set_palette(sns.color_palette("deep", 6))
mi = iter(markers)
ax = figKM.add_subplot(111)
for c in np.unique(labels):
idx = labels == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.plot(centroids[:,fig_feature_1], centroids[:,fig_feature_2], 's')
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("K-Means clustering")
ax.legend(loc="best")
ax.grid(False, which="both")
In [348]:
figKM2 = plt.figure(figsize=(fig_width*2,fig_height*2))
sns.set_palette(sns.color_palette("deep", 6))
centroids = convergence_list[0][0]
labels = convergence_list[0][1]
mi = iter(markers)
ax = figKM2.add_subplot(221)
for c in np.unique(kmeans_est.labels_):
idx = kmeans_est.labels_ == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.plot(centroids[:,fig_feature_1], centroids[:,fig_feature_2], 's')
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("K-Means clustering, iter=1")
ax.legend(loc="best")
ax.grid(False, which="both")
centroids = convergence_list[1][0]
labels = convergence_list[1][1]
mi = iter(markers)
ax = figKM2.add_subplot(222)
for c in np.unique(kmeans_est.labels_):
idx = kmeans_est.labels_ == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.plot(centroids[:,fig_feature_1], centroids[:,fig_feature_2], 's')
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("K-Means clustering, iter=2")
ax.legend(loc="best")
ax.grid(False, which="both")
centroids = convergence_list[2][0]
labels = convergence_list[2][1]
mi = iter(markers)
ax = figKM2.add_subplot(223)
for c in np.unique(kmeans_est.labels_):
idx = kmeans_est.labels_ == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.plot(centroids[:,fig_feature_1], centroids[:,fig_feature_2], 's')
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("K-Means clustering, iter=3")
ax.legend(loc="best")
ax.grid(False, which="both")
centroids = convergence_list[-1][0]
labels = convergence_list[-1][1]
mi = iter(markers)
ax = figKM2.add_subplot(224)
for c in np.unique(kmeans_est.labels_):
idx = kmeans_est.labels_ == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.plot(centroids[:,fig_feature_1], centroids[:,fig_feature_2], 's')
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("K-Means clustering, converged")
ax.legend(loc="best")
ax.grid(False, which="both")
In [5]:
import scipy
In [6]:
import MyML.EAC.eac_new as eac
In [15]:
figDendrogram = plt.figure(figsize=(fig_width,fig_height))
plt.xlabel("Clusters")
plt.ylabel("Similarity between clusters")
plt.title("Single-Link dendrogram")
plt.legend(loc="best")
plt.grid(which="both", axis='x')
# pairwise distances in condensed format
pairwise = scipy.spatial.distance.pdist(data)
# cluster
Z = hie.linkage(pairwise, method='single', metric='euclidean')
dendrogram = hie.dendrogram(Z, p=35, truncate_mode='lastp', color_threshold=0.8)
sl_labels = eac.labels_from_Z(Z, 3)
# accuracy
acc_est = acc.ConsistencyIndex(data.shape[0])
acc_est.score(gt, sl_labels)
print "Accuracy of Single-Link: {}".format(acc_est.accuracy)
In [8]:
figSL = plt.figure(figsize=(fig_width,fig_height))
sns.set_palette(sns.color_palette("deep", 6))
mi = iter(markers)
ax = figSL.add_subplot(111)
for c in np.unique(sl_labels):
idx = sl_labels == c
ax.plot(data[idx,fig_feature_1], data[idx,fig_feature_2], mi.next())
ax.set_xlabel("First feature")
ax.set_ylabel("Second feature")
ax.set_title("Single-Link clustering")
ax.legend(loc="best")
ax.grid(False, which="both")
In [16]:
from MyML.helper.plotting import save_fig
img_folder = '/home/chiroptera/workspace/thesis_writing/rsc/clustering/'
save_fig(figDendrogram, '{}sl_dendrogram'.format(img_folder))
In [349]:
from MyML.helper.plotting import save_fig
img_folder = '/home/chiroptera/workspace/thesis_writing/rsc/clustering/'
save_fig(figRaw, '{}raw_data'.format(img_folder))
save_fig(figGT, '{}desired'.format(img_folder))
save_fig(figKM, '{}kmeans'.format(img_folder))
save_fig(figKM2, '{}kmeans_progress'.format(img_folder))
save_fig(figDendrogram, '{}sl_dendrogram'.format(img_folder))
save_fig(figSL, '{}sl'.format(img_folder))