In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

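make_blobs returns the sample matrix and the ground-truth blob index, which the clustering below never looks at. A trivial sanity check on the shapes (unexecuted sketch):

In [ ]:
# X: 150 samples with 2 features; y: generating blob index, unused below
print(X.shape)  # -> (150, 2)
print(y.shape)  # -> (150,)
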
In [5]:
plt.scatter(X[:, 0],
            X[:, 1],
            c='blue',
            marker='o',
            s=50)
plt.show()

[Figure: scatter plot of the 150 unlabeled samples]

In [8]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

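fit_predict is shorthand for calling fit and then reading the fitted labels_ attribute; the model can also assign unseen points to their nearest centroid. A minimal sketch (the two query points are made up):

In [ ]:
import numpy as np
# fit_predict(X) returns exactly what fit(X) stores in labels_
assert np.array_equal(y_km, km.labels_)
# assign two hypothetical new points to their nearest centroids
print(km.predict([[2.0, 1.0], [-1.5, 3.0]]))
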
In [9]:
plt.scatter(X[y_km==0, 0],
            X[y_km==0, 1],
            s=50,
            c='lightgreen',
            marker='s',
            label='cluster 1')
plt.scatter(X[y_km==1, 0],
            X[y_km==1, 1],
            s=50,
            c='orange',
            marker='o',
            label='cluster 2')
plt.scatter(X[y_km==2, 0],
            X[y_km==2, 1],
            s=50,
            c='lightblue',
            marker='v',
            label='cluster 3')
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            s=250,
            marker='*',
            c='red',
            label='centroids')
plt.legend()
plt.show()

[Figure: the three identified clusters with their centroids marked by red stars]

In [10]:
print('Distortion: %.2f' % km.inertia_)


Distortion: 72.48

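The distortion printed above is scikit-learn's inertia_, the within-cluster sum of squared Euclidean distances to the closest centroid. A short sketch recomputing it by hand:

In [ ]:
import numpy as np
# distance of every sample to its own cluster centre, squared and summed
sse = np.sum((X - km.cluster_centers_[y_km]) ** 2)
print('Recomputed distortion: %.2f' % sse)  # agrees with km.inertia_
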
In [11]:
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

[Figure: elbow plot of distortion vs. number of clusters; the elbow sits at k = 3]

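The elbow can be cross-checked numerically; a minimal sketch using the average silhouette score, which should peak near the natural number of clusters:

In [ ]:
from sklearn.metrics import silhouette_score
for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, init='k-means++',
                      n_init=10, random_state=0).fit_predict(X)
    print('k=%d  avg. silhouette: %.3f' % (k, silhouette_score(X, labels_k)))
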
In [16]:
km = KMeans(n_clusters=3,
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X,
                                     y_km,
                                     metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
colors = ['blue', 'lightblue', 'green']
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = colors[i]
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg,
            color='red',
            linestyle='--')
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()

[Figure: silhouette plot for k = 3; the red dashed line marks the average silhouette coefficient]

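A small follow-up using the arrays computed above: per-cluster mean silhouettes make it easy to spot a cluster that drags the average down.

In [ ]:
for c in cluster_labels:
    print('cluster %d: %.3f' % (c + 1, silhouette_vals[y_km == c].mean()))
print('overall  : %.3f' % silhouette_avg)
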
In [17]:
# Organizing clusters as a hierarchical tree
import pandas as pd
np.random.seed(123)
variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']
X = np.random.random_sample([5, 3])*10
df = pd.DataFrame(X, columns=variables, index=labels)
df


Out[17]:
             X         Y         Z
ID_0  6.964692  2.861393  2.268515
ID_1  5.513148  7.194690  4.231065
ID_2  9.807642  6.848297  4.809319
ID_3  3.921175  3.431780  7.290497
ID_4  4.385722  0.596779  3.980443

In [18]:
# Calculate distance matrix
from scipy.spatial.distance import pdist, squareform
row_dist = pd.DataFrame(squareform(
            pdist(df, metric='euclidean')),
            columns=labels, index=labels)
row_dist


Out[18]:
          ID_0      ID_1      ID_2      ID_3      ID_4
ID_0  0.000000  4.973534  5.516653  5.899885  3.835396
ID_1  4.973534  0.000000  4.347073  5.104311  6.698233
ID_2  5.516653  4.347073  0.000000  7.244262  8.316594
ID_3  5.899885  5.104311  7.244262  0.000000  4.382864
ID_4  3.835396  6.698233  8.316594  4.382864  0.000000

In [19]:
# Perform complete-linkage agglomerative clustering
from scipy.cluster.hierarchy import linkage
row_clusters = linkage(df.values,
                       method='complete',
                       metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1',
                      'row label 2',
                      'distance',
                      'no. of items in clust.'],
             index=['cluster %d' % (i + 1) for i in
                    range(row_clusters.shape[0])])


Out[19]:
           row label 1  row label 2  distance  no. of items in clust.
cluster 1          0.0          4.0  3.835396                     2.0
cluster 2          1.0          2.0  4.347073                     2.0
cluster 3          3.0          5.0  5.899885                     3.0
cluster 4          6.0          7.0  8.316594                     5.0

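One caveat: linkage expects either the raw observations or the condensed distance vector returned by pdist, not the squareform matrix built earlier; feeding it the square matrix would silently cluster distances of distances. A quick sketch confirming the two valid inputs agree:

In [ ]:
# same linkage matrix from the condensed distances as from the raw data
row_clusters_alt = linkage(pdist(df, metric='euclidean'),
                           method='complete')
print(np.allclose(row_clusters, row_clusters_alt))  # True
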
In [21]:
# Visualize the result as a dendrogram
from scipy.cluster.hierarchy import dendrogram
row_dendr = dendrogram(row_clusters,
                       labels=labels)
plt.tight_layout()
plt.ylabel('Euclidean distance')
plt.show()
# ID_0 and ID_4, followed by ID_1 and ID_2, are the most
# similar pairs according to the Euclidean distance metric

[Figure: dendrogram of the complete-linkage clustering]

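The tree can also be cut into flat cluster labels directly in SciPy; a sketch using fcluster with an assumed distance threshold of 5.0 (every merge below the threshold stays in one cluster):

In [ ]:
from scipy.cluster.hierarchy import fcluster
print(fcluster(row_clusters, t=5.0, criterion='distance'))
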
In [25]:
"""Attaching Dendrogram to heat map"""
fig = plt.figure(figsize=(8, 8), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
row_dendr = dendrogram(row_clusters, orientation='left')
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust,
                  interpolation='nearest',
                  cmap='hot_r')
axd.set_xticks([])
axd.set_yticks([])
for i in axd.spines.values():
    i.set_visible(False)
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))


Out[25]:
[<matplotlib.text.Text at 0x112c7fed0>,
 <matplotlib.text.Text at 0x112cb7d10>,
 <matplotlib.text.Text at 0x112861f50>,
 <matplotlib.text.Text at 0x1123905d0>,
 <matplotlib.text.Text at 0x112390d50>,
 <matplotlib.text.Text at 0x11284a710>]

In [29]:
# Agglomerative clustering via Scikit-learn
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)


Cluster labels: [0 1 1 0 0]

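Raising n_clusters simply cuts the same tree one merge lower; a sketch with three clusters, output not shown:

In [ ]:
ac3 = AgglomerativeClustering(n_clusters=3,
                              affinity='euclidean',
                              linkage='complete')
print('Cluster labels: %s' % ac3.fit_predict(X))
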
In [30]:
# DBSCAN Clustering
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=200,
                  noise=0.05,
                  random_state=0)
plt.scatter(X[:, 0], X[:, 1])
plt.show()

[Figure: scatter plot of the two half-moon shaped clusters]

In [34]:
# Compare k-means and agglomerative clustering on the half-moon data
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
km = KMeans(n_clusters=2,
            random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km==0, 0],
            X[y_km==0, 1],
            c='lightblue',
            marker='o',
            s=40,
            label='cluster 1')
ax1.scatter(X[y_km==1, 0],
            X[y_km==1, 1],
            c='red',
            marker='s',
            s=40,
            label='cluster 2')
ax1.set_title('K-means clustering')
ac = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac==0, 0],
            X[y_ac==0, 1],
            c='lightblue',
            marker='o',
            s=40,
            label='cluster 1')
ax2.scatter(X[y_ac==1, 0],
            X[y_ac==1, 1],
            c='red',
            marker='s',
            s=40,
            label='cluster 2')
ax2.set_title('Agglomerative clustering')
plt.legend()
plt.show()

[Figure: k-means (left) and agglomerative clustering (right); neither separates the two moons correctly]

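Because make_moons returned the true labels y, the visual impression can be quantified; a sketch using the adjusted Rand index, where 1.0 means perfect agreement:

In [ ]:
from sklearn.metrics import adjusted_rand_score
print('k-means       ARI: %.2f' % adjusted_rand_score(y, y_km))
print('agglomerative ARI: %.2f' % adjusted_rand_score(y, y_ac))
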
In [35]:
# Cluster using DBSCAN
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.2,
            min_samples=5,
            metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db==0, 0],
            X[y_db==0, 1],
            c='lightblue',
            marker='o',
            s=40,
            label='cluster 1')
plt.scatter(X[y_db==1, 0],
            X[y_db==1, 1],
            c='red',
            marker='s',
            s=40,
            label='cluster 2')
plt.legend()
plt.show()

[Figure: DBSCAN recovers the two half-moon clusters]

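DBSCAN marks low-density samples as noise with the label -1, which the two scatter calls above would silently skip; a quick check:

In [ ]:
print('labels found: %s' % np.unique(y_db))
print('noise points: %d' % np.sum(y_db == -1))
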
In [ ]: