In this example we cluster the petal measurements of the Iris data set with K-means and visualize the resulting clusters and their centers with Bokeh. As the ground truth (the species of each flower) is known here, the clusters found by K-means can be compared against the three real species.
Let's first import the necessary modules:
In [11]:
from sklearn.cluster import KMeans
import numpy as np
import bokeh.plotting
from bokeh.plotting import figure
Let's load our Iris data set
In [15]:
from sklearn import datasets

# Iris: 150 samples with four features each. Only the petal measurements are
# used here (petal length and petal width), which are the third and fourth
# values of every sample, i.e. every column from index 2 onwards.
PETAL_COLUMNS = slice(2, None)

iris = datasets.load_iris()
petal_data = iris.data[:, PETAL_COLUMNS]
Perform k-means clustering on the iris petal data
In [16]:
# there are only 3 iris flower groups: 'setosa', 'versicolor', 'virginica'
# There are only 3 iris flower groups ('setosa', 'versicolor', 'virginica'),
# so we ask k-means for exactly 3 clusters.
# K-means starts from randomly chosen centroids: fixing random_state makes the
# cluster numbering (and therefore the colours in the plot) identical on every
# run, and an explicit n_init keeps the number of restarts independent of the
# installed scikit-learn version's default.
kmean = KMeans(n_clusters=3, n_init=10, random_state=0)
kmean.fit(petal_data)
Out[16]:
Let's initialize Bokeh
In [13]:
# Route Bokeh output to the notebook so that show() renders plots inline
# in the output cell.
bokeh.plotting.output_notebook()
In [17]:
# initialize our bokeh plot
plot = figure(width=500, height=500, title='Iris Petals',
              x_axis_label="Petal Length", y_axis_label="Petal Width")

# Centroid (cluster center / group mean) of each cluster, as found by k-means.
# Column 0 is the petal length (x), column 1 the petal width (y).
clus_xs = [center[0] for center in kmean.cluster_centers_]
clus_ys = [center[1] for center in kmean.cluster_centers_]

# K-means numbers its clusters arbitrarily, so cluster 0 is NOT guaranteed to
# be 'setosa'. Name every cluster after the most common true species among the
# samples assigned to it instead of relying on the cluster index.
n_species = len(iris.target_names)
clus_names = []
for cluster_id in range(len(kmean.cluster_centers_)):
    member_species = iris.target[kmean.labels_ == cluster_id]
    # minlength keeps argmax well-defined even if a cluster has no members
    majority = np.bincount(member_species, minlength=n_species).argmax()
    clus_names.append(str(iris.target_names[majority]))

# the cluster center is marked by a circle, with a cross in it;
# the colour of each marker is chosen by cluster index
plot.circle_cross(x=clus_xs, y=clus_ys, size=40, fill_alpha=0, line_width=2,
                  color=['red', 'blue', 'purple'])
plot.text(text=clus_names, x=clus_xs, y=clus_ys, text_font_size='30pt')
Out[17]:
In [19]:
# Plot the petal length / width of every sample.
# The x / y values come from the original data; the k-means result tells us
# which cluster -- and therefore which colour -- each point belongs to.

# cluster index -> colour, in the same order as the centroid marker colours
cluster_colors = ['red', 'blue', 'purple']

# "labels_" holds the cluster index of every sample, in sample order
point_colors = [cluster_colors[label] for label in kmean.labels_]

# One vectorised glyph call draws all samples with a single renderer, instead
# of adding a separate renderer to the plot for each of the samples.
plot.circle(x=petal_data[:, 0], y=petal_data[:, 1], size=15, color=point_colors)

bokeh.plotting.show(plot)
Out[19]:
In [ ]: