In this notebook, we demonstrate the construction of an "Elbow Plot" to select cluster size.
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
First we load the IRIS Dataset:
In [2]:
# Load seaborn's 'iris' example dataset.
iris_df = sns.load_dataset(name='iris')
In [3]:
# Preview the first five rows to check the column layout.
iris_df.head(n=5)
Out[3]:
This dataset is already ordered by species. We scramble the rows:
In [4]:
# Shuffle every row (frac=1 keeps all of them). The fixed random_state makes
# the permutation -- and therefore the positional train/test split drawn from
# it afterwards -- the same on every Restart & Run All.
iris_df = iris_df.sample(frac=1, random_state=0)
In [5]:
# Preview the first five rows again; after shuffling they should no longer
# all belong to one species.
iris_df.head(n=5)
Out[5]:
Next we set aside some test data for each label.
In [6]:
from sklearn.model_selection import train_test_split
# Separate the species column (the label) from the remaining columns (the
# features). Copies are taken so neither object shares data with iris_df.
labels = iris_df['species'].copy()
features = iris_df.drop('species', axis='columns').copy()

# Hold out 20 rows for testing. Stratifying on the label keeps the species mix
# of the held-out rows close to that of the full dataset, and the fixed seed
# makes the split repeatable for a given row order.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=20,
    stratify=labels,
    random_state=0,
)
Next we write a function that, given n, computes the cluster centers assuming n clusters and returns within-cluster sum of squared errors.
In [7]:
from sklearn.cluster import KMeans
def within_cluster_sse(n, train_f, test_f):
    """Fit k-means with ``n`` clusters on ``train_f`` and score it on ``test_f``.

    Each row of ``test_f`` is assigned to one of the fitted cluster centres by
    ``KMeans.predict``; the squared differences between the row and that
    centre are summed over all features and all rows.

    Parameters
    ----------
    n : int
        Number of clusters; must be greater than 1.
    train_f : pandas.DataFrame
        Features used to fit the cluster centres.
    test_f : pandas.DataFrame
        Features to score. Must carry the same column labels as ``train_f``;
        the column order may differ.

    Returns
    -------
    float
        Total within-cluster sum of squared errors of ``test_f``.

    Raises
    ------
    AssertionError
        If ``n <= 1`` or the two frames do not have the same column labels.
    """
    assert n > 1
    assert len(train_f.columns.symmetric_difference(test_f.columns)) == 0

    # The check above only guarantees the same *set* of columns. The fitted
    # centres are laid out in train_f's column order, so put the test columns
    # in that order before predicting and before subtracting the centres.
    test_f = test_f[train_f.columns]

    clusterer = KMeans(n_clusters=n, random_state=0)
    assignments = clusterer.fit(train_f).predict(test_f)

    # Indexing the (n, n_features) centre matrix with the per-row cluster ids
    # yields, for every test row, the centre it was assigned to.
    assigned_centers = clusterer.cluster_centers_[assignments]
    squared_errors = (test_f.values - assigned_centers) ** 2

    # Sum per row first, then across rows.
    return squared_errors.sum(axis=1).sum()
Finally, we call the above function for a range of values of n and plot the Within-cluster SSE values against n.
In [8]:
# Score every candidate cluster count from 2 through 20; the resulting Series
# is indexed by the cluster count.
sse_values = pd.Series(
    {k: within_cluster_sse(k, train_features, test_features) for k in range(2, 21)}
)
In [9]:
# Draw one bar per candidate cluster count. `rot` is the tick-label rotation
# in degrees and is given as a number: a string such as '0' is not a valid
# rotation for current matplotlib versions.
ax = sse_values.plot(kind='bar', title='Elbow Plot', rot=0)

# Label the axes. The return value is discarded (the trailing `;` also hides
# it from the cell output) instead of being bound to `labels`, which would
# overwrite the species labels created for the train/test split.
ax.set(xlabel='Number of Clusters', ylabel='Within Cluster SSE');
In [ ]: