K-means clustering

Authors

Ndèye Gagnessiry Ndiaye and Christin Seifert

License

This work is licensed under the Creative Commons Attribution 3.0 Unported License https://creativecommons.org/licenses/by/3.0/

This notebook:

  • introduces k-means clustering using features from the Iris flower dataset
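
The cells below use scikit-learn's KMeans; as a reference, here is a minimal NumPy sketch of the algorithm's two alternating steps (the function kmeans_sketch is illustrative only, is not the scikit-learn implementation, and assumes no cluster ends up empty):

In [ ]:
import numpy as np

def kmeans_sketch(points, init_centroids, max_iter=10):
    """Illustrative k-means (Lloyd's algorithm): alternate assignment and update."""
    points = np.asarray(points, dtype=float)
    centroids = np.asarray(init_centroids, dtype=float)
    labels = np.zeros(len(points), dtype=int)
    for _ in range(max_iter):
        # Assignment step: each point joins the cluster of its nearest centroid
        distances = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        # Update step: each centroid moves to the mean of its assigned points
        new_centroids = np.array([points[labels == k].mean(axis=0)
                                  for k in range(len(centroids))])
        if np.allclose(new_centroids, centroids):
            break  # centroids stopped moving: converged
        centroids = new_centroids
    return centroids, labels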

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import sklearn.metrics as sm

We load the Iris flower dataset. From the four measured features ('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'), two are selected for k-means clustering: 'SepalLength' and 'PetalLength'. Three fixed initial centroids are then used to fit k-means with max_iter = 1, 10, and 11, to observe how the centroids and cluster assignments evolve over the iterations.


In [75]:
from sklearn import datasets
iris = datasets.load_iris()
#iris.data
#iris.feature_names
iris.target
#iris.target_names


Out[75]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [41]:
x = pd.DataFrame(iris.data)
x.columns = ['SepalLength','SepalWidth','PetalLength','PetalWidth'] 

y = pd.DataFrame(iris.target)
y.columns = ['Targets']

iris = x[['SepalLength', 'PetalLength']]  # keep only the two selected features (reuses the name iris)

In [42]:
# Three fixed initial centroids in (SepalLength, PetalLength) space
X = np.array([[6.0, 5.0],
              [6.2, 5.2],
              [5.8, 4.8]])

# Fit k-means with these initial centroids for a single iteration
model_1 = KMeans(n_clusters=3, random_state=42, max_iter=1, n_init=1, init=X).fit(iris)
centroids_1 = model_1.cluster_centers_
labels_1 = model_1.labels_
print(centroids_1)
print(labels_1)


[[ 6.18421053  4.81052632]
 [ 6.82325581  5.63953488]
 [ 5.29090909  2.6125    ]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 0 2 2 0 0 0 2 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 1 0 1 1 1 1 0 1 1 1 0
 0 1 0 0 0 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0
 0 0]

In [43]:
# Same initial centroids, max_iter = 10
model_10 = KMeans(n_clusters=3, random_state=42, max_iter=10, n_init=1, init=X).fit(iris)
centroids_10 = model_10.cluster_centers_
labels_10 = model_10.labels_
print(centroids_10)
print(labels_10)


[[ 5.87413793  4.39310345]
 [ 6.83902439  5.67804878]
 [ 5.00784314  1.49411765]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 0]

In [44]:
# Same initial centroids, max_iter = 11 (results are identical to max_iter = 10,
# so the algorithm has converged; labelled MAX below)
model_11 = KMeans(n_clusters=3, random_state=42, max_iter=11, n_init=1, init=X).fit(iris)
centroids_max = model_11.cluster_centers_
labels_max = model_11.labels_
print(centroids_max)
print(labels_max)


[[ 5.87413793  4.39310345]
 [ 6.83902439  5.67804878]
 [ 5.00784314  1.49411765]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 0]

In [45]:
'''model_999 = KMeans(n_clusters=3, random_state=42, max_iter=999).fit(iris)
centroids_max = model_999.cluster_centers_
labels_max = model_999.labels_
print(centroids_max)
print(labels_max)'''


Out[45]:
'model_999 = KMeans(n_clusters=3, random_state=42, max_iter=999).fit(iris)\ncentroids_max = model_999.cluster_centers_\nlabels_max = model_999.labels_\nprint(centroids_max)\nprint(labels_max)'

The following plots show, for each setting (i.e. iter = 1, iter = 10, iter = MAX), the cluster centroids (blue) and the data points, with each cluster drawn in a different colour. The first panel shows the initial centroids.


In [53]:
# Set the size of the plot
plt.figure(figsize=(24,10))

# Create a colormap
colormap = np.array(['red', 'lime', 'black'])
#colormap = {0: 'r', 1: 'g', 2: 'b'}

# Plot Original
plt.subplot(1, 4, 1)
plt.scatter(x.SepalLength, x.PetalLength, c="K", s=40)
plt.scatter(X[:,0],X[:,1],  c="b")
plt.title('Initial centroids')

# Plot the Models Classifications
plt.subplot(1, 4, 2)
plt.scatter(iris.SepalLength, iris.PetalLength, c=colormap[labels_1], s=40)
plt.scatter(centroids_1[:,0],centroids_1[:,1],  c="b")
plt.title('K Mean Clustering(iter=1)')


plt.subplot(1, 4, 3)
plt.scatter(iris.SepalLength, iris.PetalLength, c=colormap[labels_10], s=40)
plt.scatter(centroids_10[:,0],centroids_10[:,1],  c="b")
plt.title('K Mean Clustering (iter=10)')
                                                           
plt.subplot(1, 4, 4)
plt.scatter(iris.SepalLength, iris.PetalLength, c=colormap[labels_max], s=40)
plt.scatter(centroids_max[:,0],centroids_max[:,1],  c="b")
plt.title('K Mean Clustering (iter= MAX)')

plt.show()


We compute the confusion matrices for each iteration and calculate the purity metric.
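
The purity used below matches each true class with the cluster that contains most of its samples, i.e. it sums the row-wise maxima of the confusion matrix:

$$\mathrm{purity} = \frac{1}{N}\sum_{i}\max_{j} n_{ij}$$

where $n_{ij}$ is the number of samples of true class $i$ assigned to cluster $j$ and $N = 150$ is the total number of samples.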


In [68]:
def confusion(y,labels):
    cm = sm.confusion_matrix(y, labels)
    return cm

In [69]:
# Confusion matrix (iter = 1): rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels_1)
pd.DataFrame(data, set_list, cluster_list)


Out[69]:
            c1  c2  c3
setosa       0   0  50
versicolor  41   0   9
virginica   19  31   0

In [70]:
# Confusion matrix (iter = 10): rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels_10)
pd.DataFrame(data, set_list, cluster_list)


Out[70]:
            c1  c2  c3
setosa       0   0  50
versicolor  45   4   1
virginica   13  37   0

In [71]:
# Confusion matrix (iter = MAX): rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels_max)
pd.DataFrame(data, set_list, cluster_list)


Out[71]:
            c1  c2  c3
setosa       0   0  50
versicolor  45   4   1
virginica   13  37   0

In [72]:
# Calculate the purity of a confusion matrix: sum the row-wise maxima
# (the best-matching cluster for each true class) and divide by the
# total number of samples (150 for Iris)
def Purity(cm):
    S = 0
    for row in cm:
        S += max(row)
    return S / 150

metric_list = ["iter= 1", "iter= 10", "iter= MAX"]
set_list = ["Purity metric"]
data = np.array([Purity(confusion(y, labels_1)),Purity(confusion(y, labels_10)),Purity(confusion(y, labels_max))])
pd.DataFrame(data,metric_list, set_list)


Out[72]:
           Purity metric
iter= 1         0.813333
iter= 10        0.880000
iter= MAX       0.880000

We now use all four measured features ('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth') and run k-means for different values of k (k = 2, 3, 4, 6), this time without a fixed random state. For each k we compute the confusion matrix and calculate the purity.
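
The cells below run one value of k at a time; the same experiment can be written compactly as a loop (a sketch reusing the x, y, confusion and Purity definitions from above; results vary between runs because no random state is fixed):

In [ ]:
for k in [2, 3, 4, 6]:
    model = KMeans(n_clusters=k).fit(x)              # no fixed random_state
    purity_k = Purity(confusion(y, model.labels_))   # purity of the k-cluster solution
    print("Purity(k=%d) = %f" % (k, purity_k))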


In [87]:
# k = 2, no fixed random_state
model = KMeans(n_clusters=2).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)

# Confusion matrix: rows = true species, columns = clusters
# (only two clusters are produced, so column c3 stays empty)
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 6.30103093  2.88659794  4.95876289  1.69587629]
 [ 5.00566038  3.36037736  1.56226415  0.28867925]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Out[87]:
            c1  c2  c3
setosa       0  50   0
versicolor  47   3   0
virginica   50   0   0

In [88]:
print ("Purity(k=2)= %f " % Purity(confusion(y, labels)))


Purity(k=2)= 0.980000 

In [89]:
# k = 3, no fixed random_state
model = KMeans(n_clusters=3).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)

# Confusion matrix: rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 5.9016129   2.7483871   4.39354839  1.43387097]
 [ 5.006       3.418       1.464       0.244     ]
 [ 6.85        3.07368421  5.74210526  2.07105263]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
Out[89]:
            c1  c2  c3
setosa       0  50   0
versicolor  48   0   2
virginica   14   0  36

In [90]:
print ("Purity(k=3)= %f " % Purity(confusion(y, labels)))


Purity(k=3)= 0.893333 

In [77]:
# k = 4, no fixed random_state
model = KMeans(n_clusters=4).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)


# Confusion matrix: rows = true species, columns = clusters
# (4 cluster labels make the matrix 4x4, so one row stays empty)
set_list = ["setosa", "versicolor", "virginica", "undefined"]
cluster_list = ["c1", "c2", "c3", "c4"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 5.006       3.418       1.464       0.244     ]
 [ 6.23658537  2.85853659  4.80731707  1.62195122]
 [ 5.52962963  2.62222222  3.94074074  1.21851852]
 [ 6.9125      3.1         5.846875    2.13125   ]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 1 2 1 2 1 2 2 2 2 1 2 1 1 2 1 2 1 2 1 1
 1 1 1 1 1 2 2 2 2 1 2 1 1 1 2 2 2 1 2 2 2 2 2 1 2 2 3 1 3 3 3 3 2 3 3 3 1
 1 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 1 3 3 3 1 3 3 3 1 3 3 3 1 1
 3 1]
Out[77]:
            c1  c2  c3  c4
setosa      50   0   0   0
versicolor   0  24  26   0
virginica    0  17   1  32
undefined    0   0   0   0

In [414]:
print ("Purity(k=4)= %f " % Purity(confusion(y, labels)))


Purity(k=4)= 0.726667 

In [86]:
# k = 6, no fixed random_state
model = KMeans(n_clusters=6).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)

# Confusion matrix: rows = true species, columns = clusters
# (6 cluster labels make the matrix 6x6, so three rows stay empty)
set_list = ["setosa", "versicolor", "virginica", "undefined_1", "undefined_2", "undefined_3"]
cluster_list = ["c1", "c2", "c3", "c4", "c5", "c6"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 6.52916667  3.05833333  5.50833333  2.1625    ]
 [ 5.25555556  3.67037037  1.5037037   0.28888889]
 [ 5.508       2.6         3.908       1.204     ]
 [ 7.475       3.125       6.3         2.05      ]
 [ 6.20769231  2.85384615  4.74615385  1.56410256]
 [ 4.71304348  3.12173913  1.4173913   0.19130435]]
[1 5 5 5 1 1 5 1 5 5 1 5 5 5 1 1 1 1 1 1 1 1 5 1 5 5 1 1 1 5 5 1 1 1 5 5 1
 5 5 1 1 5 5 1 1 5 1 5 1 5 4 4 4 2 4 4 4 2 4 2 2 4 2 4 2 4 4 2 4 2 4 2 4 4
 4 4 4 4 4 2 2 2 2 4 2 4 4 4 2 2 2 4 2 2 2 2 2 4 2 2 0 4 3 0 0 3 2 3 0 3 0
 0 0 4 0 0 0 3 3 4 0 4 3 4 0 3 4 4 0 3 3 3 0 4 4 3 0 0 4 0 0 0 4 0 0 0 4 0
 0 4]
Out[86]:
             c1  c2  c3  c4  c5  c6
setosa        0  27   0   0   0  23
versicolor    0   0  24   0  26   0
virginica    24   0   1  12  13   0
undefined_1   0   0   0   0   0   0
undefined_2   0   0   0   0   0   0
undefined_3   0   0   0   0   0   0

In [416]:
print ("Purity(k=6)= %f " % Purity(confusion(y, labels)))


Purity(k=6)= 0.526667