In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn by the cells below.
plt.style.use('ggplot')
In [2]:
from sklearn.datasets import make_blobs

# Synthetic 2-feature data set: 150 samples generated around 3 centers.
# The fixed random_state makes the sample reproducible across runs.
X, y = make_blobs(
    n_samples=150, n_features=2, centers=3,
    cluster_std=0.5, shuffle=True, random_state=0,
)
In [5]:
# Show the unlabeled samples: first feature on x, second feature on y.
plt.scatter(X[:, 0], X[:, 1], c='blue', marker='o', s=50)
plt.show()
In [8]:
from sklearn.cluster import KMeans

# k-means with 3 clusters and randomly initialised centroids; n_init=10
# restarts are requested and random_state pins the result.
km = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)

# Fit on X and keep the cluster index assigned to every sample.
y_km = km.fit_predict(X)
In [9]:
# One scatter layer per k-means cluster: (cluster index, color, marker, legend text).
cluster_styles = (
    (0, 'lightgreen', 's', 'cluster 1'),
    (1, 'orange', 'o', 'cluster 2'),
    (2, 'lightblue', 'v', 'cluster 3'),
)
for cluster_idx, face_color, shape, legend_text in cluster_styles:
    members = y_km == cluster_idx
    plt.scatter(X[members, 0],
                X[members, 1],
                s=50,
                c=face_color,
                marker=shape,
                label=legend_text)

# Overlay the fitted cluster centers as large red stars.
centroids = km.cluster_centers_
plt.scatter(centroids[:, 0],
            centroids[:, 1],
            s=250,
            marker='*',
            c='red',
            label='centroids')

plt.legend()
plt.show()
In [10]:
# Report the fitted model's inertia_ attribute, labelled as the distortion.
print('Distortion: %.2f' % (km.inertia_,))
In [11]:
# Elbow plot: fit k-means for k = 1..10 and record the inertia of each fit.
cluster_range = range(1, 11)
distortions = []
for k in cluster_range:
    km = KMeans(n_clusters=k,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)

# Distortion as a function of the number of clusters.
plt.plot(cluster_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()
In [16]:
# Refit with 3 clusters; no `init` is passed, so the estimator's default
# initialisation is used. This replaces whatever model `km` held before.
km = KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
y_km = km.fit_predict(X)
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

# Distinct cluster ids present in the k-means assignment, and how many there are.
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]

# One silhouette value per sample, computed with the Euclidean metric.
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

colors = ['blue', 'lightblue', 'green']
yticks = []
y_ax_lower, y_ax_upper = 0, 0
for i, c in enumerate(cluster_labels):
    # Boolean indexing yields a copy, so sorting it leaves silhouette_vals untouched.
    c_silhouette_vals = np.sort(silhouette_vals[y_km == c])
    # Stack this cluster's bars directly above the previous cluster's bars.
    y_ax_upper = y_ax_lower + len(c_silhouette_vals)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=colors[i])
    # Put the cluster's tick at the middle of its band.
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower = y_ax_upper

# Dashed red line at the mean silhouette value over all samples.
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red', linestyle='--')

# Tick labels are 1-based (cluster id + 1).
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()
In [17]:
# Organizing clusters as a hierarchical tree: build a small labelled sample.
import pandas as pd

# Seed NumPy's global generator so the random sample is reproducible.
np.random.seed(123)

variables = ['X', 'Y', 'Z']
labels = ['ID_%d' % n for n in range(5)]

# 5 samples x 3 features, uniform draws scaled by 10.
X = 10 * np.random.random_sample((5, 3))
df = pd.DataFrame(data=X, index=labels, columns=variables)
df
Out[17]:
In [18]:
# Pairwise Euclidean distances between the rows of df, shown as a labelled
# square table.
from scipy.spatial.distance import pdist, squareform

condensed_dist = pdist(df, metric='euclidean')
row_dist = pd.DataFrame(squareform(condensed_dist),
                        index=labels,
                        columns=labels)
row_dist
Out[18]:
In [19]:
# Agglomerative clustering of the rows of df with complete linkage on
# Euclidean distances.
from scipy.cluster.hierarchy import linkage

row_clusters = linkage(df.values,
                       method='complete',
                       metric='euclidean')

# Display the linkage matrix as a table with one row per merge step.
# (Fixed the misspelled 'row lable 2' column header.)
pd.DataFrame(row_clusters,
             columns=['row label 1',
                      'row label 2',
                      'distance',
                      'no. of items in clust.'],
             index=['cluster %d' % (i + 1)
                    for i in range(row_clusters.shape[0])])
Out[19]:
In [21]:
# Visualise the linkage matrix as a dendrogram, with the sample IDs as leaf labels.
from scipy.cluster.hierarchy import dendrogram

row_dendr = dendrogram(row_clusters, labels=labels)
plt.tight_layout()
plt.ylabel('Euclidean distance')
plt.show()

# Reading the tree: ID_0 and ID_4 are the most similar pair under this
# distance metric, followed by ID_1 and ID_2.
In [25]:
"""Attaching Dendrogram to heat map"""
fig = plt.figure(figsize=(8, 8), facecolor='white')

# Left-hand axes: the row dendrogram, drawn sideways.
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
# Bind the result to row_dendr (the original bound it to the misspelled
# `row_rendr`), so the leaf order used below comes from this cell rather
# than from a variable left over from an earlier cell.
row_dendr = dendrogram(row_clusters, orientation='left')

# Reorder the rows to follow the dendrogram leaves. 'leaves' holds integer
# row positions, so use .iloc; DataFrame.ix no longer exists in pandas.
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

# Right-hand axes: heat map of the reordered data.
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust,
                  interpolation='nearest',
                  cmap='hot_r')

# Strip ticks and the frame from the dendrogram axes.
axd.set_xticks([])
axd.set_yticks([])
for spine in axd.spines.values():
    spine.set_visible(False)

fig.colorbar(cax)

# Pin one tick per column/row before labelling, instead of padding the
# label list with '' and relying on where the default locator puts ticks.
axm.set_xticks(range(df_rowclust.shape[1]))
axm.set_xticklabels(list(df_rowclust.columns))
axm.set_yticks(range(df_rowclust.shape[0]))
axm.set_yticklabels(list(df_rowclust.index))
plt.show()
Out[25]:
In [29]:
# Agglomerative clustering via scikit-learn
from sklearn.cluster import AgglomerativeClustering

# Euclidean distance is the estimator's default, so the distance keyword is
# omitted: the old `affinity=` spelling is rejected by current scikit-learn,
# while its replacement is unknown to older releases.
ac = AgglomerativeClustering(n_clusters=2,
                             linkage='complete')

# Keep the result under its own name: `labels` holds the ID_* strings that
# the distance-matrix and dendrogram cells use as row/column labels, and
# overwriting it would break those cells when they are re-run.
ac_labels = ac.fit_predict(X)
print('Cluster labels: %s' % ac_labels)
In [30]:
# DBSCAN clustering: start from a two-moons data set (200 samples, noise 0.05).
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

# Quick look at the raw samples.
plt.scatter(X[:, 0], X[:, 1])
plt.show()
In [34]:
# Cluster the moons with k-means and with complete-linkage agglomerative
# clustering, and show the two assignments side by side.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

# n_init is set explicitly, matching the other KMeans calls in this
# notebook, because the library default differs between releases.
km = KMeans(n_clusters=2,
            n_init=10,
            random_state=0)
y_km = km.fit_predict(X)

# Euclidean distance is the default; the `affinity=` keyword is rejected by
# current scikit-learn, so it is not passed.
ac = AgglomerativeClustering(n_clusters=2,
                             linkage='complete')
y_ac = ac.fit_predict(X)

# Same two scatter layers on each panel: (cluster index, color, marker, legend text).
cluster_styles = ((0, 'lightblue', 'o', 'cluster 1'),
                  (1, 'red', 's', 'cluster 2'))
panels = ((ax1, y_km, 'K-means clustering'),
          (ax2, y_ac, 'Agglomerative clustering'))
for ax, assigned, title in panels:
    for cluster_idx, face_color, shape, legend_text in cluster_styles:
        members = assigned == cluster_idx
        ax.scatter(X[members, 0],
                   X[members, 1],
                   c=face_color,
                   marker=shape,
                   s=40,
                   label=legend_text)
    ax.set_title(title)
    # Per-axes legend: plt.legend() would only annotate the last panel.
    ax.legend()

plt.show()
In [35]:
# Cluster using DBSCAN
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)

# Only samples labelled 0 or 1 are drawn; any other label is not plotted.
cluster_styles = ((0, 'lightblue', 'o', 'cluster 1'),
                  (1, 'red', 's', 'cluster 2'))
for cluster_idx, face_color, shape, legend_text in cluster_styles:
    members = y_db == cluster_idx
    plt.scatter(X[members, 0],
                X[members, 1],
                c=face_color,
                marker=shape,
                s=40,
                label=legend_text)

plt.legend()
plt.show()
In [ ]: