Clustering: Unsupervised Grouping of Data


In [ ]:
import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [ ]:
# Load the iris dataset bundle and unpack its feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

In [ ]:
# Show the dimensions of the feature matrix X.
print(X.shape)

In [ ]:
# Project the iris features onto their first two principal components so the
# samples can be drawn in a 2-D scatter plot.
# The projection is computed from `iris.data` rather than from `X` itself, so
# re-running this cell always starts from the original measurements instead of
# re-projecting an array that has already been reduced.
pca = PCA(n_components=2)
X = pca.fit_transform(iris.data)

Fit a simple KMeans clustering model on the iris dataset


In [ ]:
# Cluster with KMeans left at its default settings, then plot every sample
# coloured by its assigned cluster and mark each centroid with an 'x'.
km = KMeans().fit(X)
clusters = km.predict(X)
centers = km.cluster_centers_
plt.scatter(X[:, 0], X[:, 1], c=clusters, alpha=0.5)
plt.scatter(centers[:, 0], centers[:, 1], c=np.arange(km.n_clusters),
            marker='x', s=150, linewidth=3)

Q: What went wrong?


In [ ]:
# Re-fit with the number of clusters set explicitly to 3.
# KMeans starts from random centroids, so the seed is fixed to make the
# cluster ids (and therefore the plot colours and every later comparison
# against `y`) identical on each Restart & Run All.
km = KMeans(n_clusters=3, random_state=0)
km.fit(X)
clusters = km.predict(X)
# Samples coloured by assigned cluster; centroids drawn as large 'x' markers.
plt.scatter(X[:, 0], X[:, 1], c=clusters, alpha=0.5)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            c=np.arange(km.n_clusters), marker='x', s=150, linewidth=3)

In [ ]:
# `average` must be given explicitly for targets with more than two classes:
# current scikit-learn raises a ValueError for multiclass input under the
# default average='binary'. 'weighted' matches what older releases used for
# multiclass targets.
print("Clustering F1 Score: %f" % f1_score(y, clusters, average='weighted'))

Q: What went wrong?


In [ ]:
# Show the reference class labels that came with the dataset.
print(y)

In [ ]:
# Show the cluster id KMeans assigned to each sample.
print(clusters)

In [ ]:
# Cluster ids are arbitrary, so relabel each cluster with the class that occurs
# most often among its members. A hard-coded permutation is only correct for
# one particular KMeans initialisation; deriving the mapping from the data
# stays correct whichever ids the fit happens to produce.
# Note: two clusters can map to the same class if they share a majority class.
c_mapped = np.empty_like(clusters)
for cluster_id in np.unique(clusters):
    members = clusters == cluster_id
    c_mapped[members] = np.bincount(y[members]).argmax()

In [ ]:
# Score the relabelled clusters against the reference classes.
# `average` must be given explicitly for targets with more than two classes:
# current scikit-learn raises a ValueError for multiclass input under the
# default average='binary'.
print("Clustering F1 Score: %f" % f1_score(y, c_mapped, average='weighted'))

Always interpret results with caution!

Clustering as Data Compression: Vector Quantization


In [ ]:
# `face` now lives in scipy.datasets and has been removed from scipy.misc in
# recent SciPy releases; fall back to the old location on older installs.
# (scipy.datasets fetches the image on first use and needs the `pooch` package.)
try:
    from scipy.datasets import face
except ImportError:
    from scipy.misc import face

# Grayscale sample image, shown next to the histogram of its pixel intensities.
racoon = face(gray=True)
fig, ax = plt.subplots(nrows=1, ncols=2)
ax[0].imshow(racoon, cmap=plt.cm.gray)
ax[0].set_xticks([])
ax[0].set_yticks([])
# `normed=` is no longer accepted by hist; `density=True` is its replacement.
_ = ax[1].hist(racoon.reshape(-1, 1), bins=256,
               density=True, color='.5', edgecolor='.5')
plt.tight_layout()

In [ ]:
# Vector quantization: treat every pixel intensity as a 1-D sample, cluster the
# intensities into 5 groups, and replace each pixel by its cluster centre.
X = racoon.reshape(-1, 1)
km = KMeans(n_clusters=5)
km.fit(X)
values = km.cluster_centers_.ravel()
labels = km.labels_
# Index the centre values directly with the labels. Unlike np.choose this has
# no upper limit on the number of clusters, and reshape returns the image-shaped
# array without assigning to `.shape` in place.
rac_compressed = values[labels].reshape(racoon.shape)
fig, ax = plt.subplots(nrows=1, ncols=2)
ax[0].imshow(rac_compressed, cmap=plt.cm.gray)
ax[0].set_xticks([])
ax[0].set_yticks([])
# `normed=` is no longer accepted by hist; `density=True` is its replacement.
_ = ax[1].hist(rac_compressed.reshape(-1, 1), bins=256,
               density=True, color='.5', edgecolor='.5')
plt.tight_layout()

Overview of clustering methods in sklearn

Exercise: Apply KMeans clustering to scikit-learn's handwritten digits dataset (`load_digits`, a small MNIST-like set) and figure out which cluster corresponds to which digit

Hint: Try to visualize the average of all images that belong to one cluster


In [ ]:
# Load the digits dataset bundle and unpack its feature matrix and labels.
digits = load_digits()
X, y = digits.data, digits.target

In [ ]:
# enter code here