K-means clustering

Authors

Ndèye Gagnessiry Ndiaye and Christin Seifert

License

This work is licensed under the Creative Commons Attribution 3.0 Unported License https://creativecommons.org/licenses/by/3.0/

This notebook:

  • introduces k-means clustering using features from the Iris flower dataset
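
The cells below use scikit-learn's KMeans; as a reference, here is a minimal NumPy sketch of the algorithm's two alternating steps (the function kmeans_sketch is illustrative only, is not the scikit-learn implementation, and assumes no cluster ends up empty):

In [ ]:
import numpy as np

def kmeans_sketch(points, init_centroids, max_iter=10):
    """Illustrative k-means (Lloyd's algorithm): alternate assignment and update."""
    points = np.asarray(points, dtype=float)
    centroids = np.asarray(init_centroids, dtype=float)
    labels = np.zeros(len(points), dtype=int)
    for _ in range(max_iter):
        # Assignment step: each point joins the cluster of its nearest centroid
        distances = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        # Update step: each centroid moves to the mean of its assigned points
        new_centroids = np.array([points[labels == k].mean(axis=0)
                                  for k in range(len(centroids))])
        if np.allclose(new_centroids, centroids):
            break  # centroids stopped moving: converged
        centroids = new_centroids
    return centroids, labels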

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import sklearn.metrics as sm

We load the Iris flower dataset. From the four measured features ('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'), two are selected for k-means clustering: 'SepalLength' and 'PetalLength'. Three fixed initial centroids are then used to fit k-means with max_iter = 1, 10, and 11, to observe how the centroids and cluster assignments evolve over the iterations.


In [75]:
from sklearn import datasets
iris = datasets.load_iris()
#iris.data
#iris.feature_names
iris.target
#iris.target_names


Out[75]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [41]:
x = pd.DataFrame(iris.data)
x.columns = ['SepalLength','SepalWidth','PetalLength','PetalWidth'] 

y = pd.DataFrame(iris.target)
y.columns = ['Targets']

iris = x[['SepalLength', 'PetalLength']]  # keep only the two selected features (reuses the name iris)

In [42]:
# Three fixed initial centroids in (SepalLength, PetalLength) space
X = np.array([[6.0, 5.0],
              [6.2, 5.2],
              [5.8, 4.8]])

# Fit k-means with these initial centroids for a single iteration
model_1 = KMeans(n_clusters=3, random_state=42, max_iter=1, n_init=1, init=X).fit(iris)
centroids_1 = model_1.cluster_centers_
labels_1 = model_1.labels_
print(centroids_1)
print(labels_1)


[[ 6.18421053  4.81052632]
 [ 6.82325581  5.63953488]
 [ 5.29090909  2.6125    ]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 0 2 2 0 0 0 2 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 1 0 1 1 1 1 0 1 1 1 0
 0 1 0 0 0 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0
 0 0]

In [43]:
# Same initial centroids, max_iter = 10
model_10 = KMeans(n_clusters=3, random_state=42, max_iter=10, n_init=1, init=X).fit(iris)
centroids_10 = model_10.cluster_centers_
labels_10 = model_10.labels_
print(centroids_10)
print(labels_10)


[[ 5.87413793  4.39310345]
 [ 6.83902439  5.67804878]
 [ 5.00784314  1.49411765]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 0]

In [44]:
# Same initial centroids, max_iter = 11 (results are identical to max_iter = 10,
# so the algorithm has converged; labelled MAX below)
model_11 = KMeans(n_clusters=3, random_state=42, max_iter=11, n_init=1, init=X).fit(iris)
centroids_max = model_11.cluster_centers_
labels_max = model_11.labels_
print(centroids_max)
print(labels_max)


[[ 5.87413793  4.39310345]
 [ 6.83902439  5.67804878]
 [ 5.00784314  1.49411765]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 0]

In [45]:
'''model_999 = KMeans(n_clusters=3, random_state=42, max_iter=999).fit(iris)
centroids_max = model_999.cluster_centers_
labels_max = model_999.labels_
print(centroids_max)
print(labels_max)'''


Out[45]:
'model_999 = KMeans(n_clusters=3, random_state=42, max_iter=999).fit(iris)\ncentroids_max = model_999.cluster_centers_\nlabels_max = model_999.labels_\nprint(centroids_max)\nprint(labels_max)'

The following plots show, for each setting (i.e. iter = 1, iter = 10, iter = MAX), the cluster centroids (blue) and the data points, with each cluster drawn in a different colour. The first panel shows the initial centroids.


In [53]:
# Set the size of the plot
plt.figure(figsize=(24,10))

# Create a colormap
colormap = np.array(['red', 'lime', 'black'])
#colormap = {0: 'r', 1: 'g', 2: 'b'}

# Plot Original
plt.subplot(1, 4, 1)
plt.scatter(x.SepalLength, x.PetalLength, c="K", s=40)
plt.scatter(X[:,0],X[:,1],  c="b")
plt.title('Initial centroids')

# Plot the Models Classifications
plt.subplot(1, 4, 2)
plt.scatter(iris.SepalLength, iris.PetalLength, c=colormap[labels_1], s=40)
plt.scatter(centroids_1[:,0],centroids_1[:,1],  c="b")
plt.title('K Mean Clustering(iter=1)')


plt.subplot(1, 4, 3)
plt.scatter(iris.SepalLength, iris.PetalLength, c=colormap[labels_10], s=40)
plt.scatter(centroids_10[:,0],centroids_10[:,1],  c="b")
plt.title('K Mean Clustering (iter=10)')
                                                           
plt.subplot(1, 4, 4)
plt.scatter(iris.SepalLength, iris.PetalLength, c=colormap[labels_max], s=40)
plt.scatter(centroids_max[:,0],centroids_max[:,1],  c="b")
plt.title('K Mean Clustering (iter= MAX)')

plt.show()


We compute the confusion matrices for each iteration and calculate the purity metric.
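
The purity used below matches each true class with the cluster that contains most of its samples, i.e. it sums the row-wise maxima of the confusion matrix:

$$\mathrm{purity} = \frac{1}{N}\sum_{i}\max_{j} n_{ij}$$

where $n_{ij}$ is the number of samples of true class $i$ assigned to cluster $j$ and $N = 150$ is the total number of samples.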


In [68]:
def confusion(y,labels):
    cm = sm.confusion_matrix(y, labels)
    return cm

In [69]:
# Confusion matrix (iter = 1): rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels_1)
pd.DataFrame(data, set_list, cluster_list)


Out[69]:
            c1  c2  c3
setosa       0   0  50
versicolor  41   0   9
virginica   19  31   0

In [70]:
# Confusion matrix (iter = 10): rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels_10)
pd.DataFrame(data, set_list, cluster_list)


Out[70]:
            c1  c2  c3
setosa       0   0  50
versicolor  45   4   1
virginica   13  37   0

In [71]:
# Confusion matrix (iter = MAX): rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels_max)
pd.DataFrame(data, set_list, cluster_list)


Out[71]:
            c1  c2  c3
setosa       0   0  50
versicolor  45   4   1
virginica   13  37   0

In [72]:
# Calculate the purity of a confusion matrix: sum the row-wise maxima
# (the best-matching cluster for each true class) and divide by the
# total number of samples (150 for Iris)
def Purity(cm):
    S = 0
    for row in cm:
        S += max(row)
    return S / 150

metric_list = ["iter= 1", "iter= 10", "iter= MAX"]
set_list = ["Purity metric"]
data = np.array([Purity(confusion(y, labels_1)),Purity(confusion(y, labels_10)),Purity(confusion(y, labels_max))])
pd.DataFrame(data,metric_list, set_list)


Out[72]:
           Purity metric
iter= 1         0.813333
iter= 10        0.880000
iter= MAX       0.880000

We now use all four measured features ('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth') and run k-means for different values of k (k = 2, 3, 4, 6), this time without a fixed random state. For each k we compute the confusion matrix and calculate the purity.
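
The cells below run one value of k at a time; the same experiment can be written compactly as a loop (a sketch reusing the x, y, confusion and Purity definitions from above; results vary between runs because no random state is fixed):

In [ ]:
for k in [2, 3, 4, 6]:
    model = KMeans(n_clusters=k).fit(x)              # no fixed random_state
    purity_k = Purity(confusion(y, model.labels_))   # purity of the k-cluster solution
    print("Purity(k=%d) = %f" % (k, purity_k))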


In [87]:
# k = 2, no fixed random_state
model = KMeans(n_clusters=2).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)

# Confusion matrix: rows = true species, columns = clusters
# (only two clusters are produced, so column c3 stays empty)
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 6.30103093  2.88659794  4.95876289  1.69587629]
 [ 5.00566038  3.36037736  1.56226415  0.28867925]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Out[87]:
            c1  c2  c3
setosa       0  50   0
versicolor  47   3   0
virginica   50   0   0

In [88]:
print ("Purity(k=2)= %f " % Purity(confusion(y, labels)))


Purity(k=2)= 0.980000 

In [89]:
# k = 3, no fixed random_state
model = KMeans(n_clusters=3).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)

# Confusion matrix: rows = true species, columns = clusters
set_list = ["setosa", "versicolor", "virginica"]
cluster_list = ["c1", "c2", "c3"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 5.9016129   2.7483871   4.39354839  1.43387097]
 [ 5.006       3.418       1.464       0.244     ]
 [ 6.85        3.07368421  5.74210526  2.07105263]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
Out[89]:
            c1  c2  c3
setosa       0  50   0
versicolor  48   0   2
virginica   14   0  36

In [90]:
print ("Purity(k=3)= %f " % Purity(confusion(y, labels)))


Purity(k=3)= 0.893333 

In [77]:
# k = 4, no fixed random_state
model = KMeans(n_clusters=4).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)


# Confusion matrix: rows = true species, columns = clusters
# (4 cluster labels make the matrix 4x4, so one row stays empty)
set_list = ["setosa", "versicolor", "virginica", "undefined"]
cluster_list = ["c1", "c2", "c3", "c4"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 5.006       3.418       1.464       0.244     ]
 [ 6.23658537  2.85853659  4.80731707  1.62195122]
 [ 5.52962963  2.62222222  3.94074074  1.21851852]
 [ 6.9125      3.1         5.846875    2.13125   ]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 1 2 1 2 1 2 2 2 2 1 2 1 1 2 1 2 1 2 1 1
 1 1 1 1 1 2 2 2 2 1 2 1 1 1 2 2 2 1 2 2 2 2 2 1 2 2 3 1 3 3 3 3 2 3 3 3 1
 1 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 1 3 3 3 1 3 3 3 1 3 3 3 1 1
 3 1]
Out[77]:
            c1  c2  c3  c4
setosa      50   0   0   0
versicolor   0  24  26   0
virginica    0  17   1  32
undefined    0   0   0   0

In [414]:
print ("Purity(k=4)= %f " % Purity(confusion(y, labels)))


Purity(k=4)= 0.726667 

In [86]:
# k = 6, no fixed random_state
model = KMeans(n_clusters=6).fit(x)
centroids = model.cluster_centers_
labels = model.labels_
print(centroids)
print(labels)

# Confusion matrix: rows = true species, columns = clusters
# (6 cluster labels make the matrix 6x6, so three rows stay empty)
set_list = ["setosa", "versicolor", "virginica", "undefined_1", "undefined_2", "undefined_3"]
cluster_list = ["c1", "c2", "c3", "c4", "c5", "c6"]
data = confusion(y, labels)
pd.DataFrame(data, set_list, cluster_list)


[[ 6.52916667  3.05833333  5.50833333  2.1625    ]
 [ 5.25555556  3.67037037  1.5037037   0.28888889]
 [ 5.508       2.6         3.908       1.204     ]
 [ 7.475       3.125       6.3         2.05      ]
 [ 6.20769231  2.85384615  4.74615385  1.56410256]
 [ 4.71304348  3.12173913  1.4173913   0.19130435]]
[1 5 5 5 1 1 5 1 5 5 1 5 5 5 1 1 1 1 1 1 1 1 5 1 5 5 1 1 1 5 5 1 1 1 5 5 1
 5 5 1 1 5 5 1 1 5 1 5 1 5 4 4 4 2 4 4 4 2 4 2 2 4 2 4 2 4 4 2 4 2 4 2 4 4
 4 4 4 4 4 2 2 2 2 4 2 4 4 4 2 2 2 4 2 2 2 2 2 4 2 2 0 4 3 0 0 3 2 3 0 3 0
 0 0 4 0 0 0 3 3 4 0 4 3 4 0 3 4 4 0 3 3 3 0 4 4 3 0 0 4 0 0 0 4 0 0 0 4 0
 0 4]
Out[86]:
             c1  c2  c3  c4  c5  c6
setosa        0  27   0   0   0  23
versicolor    0   0  24   0  26   0
virginica    24   0   1  12  13   0
undefined_1   0   0   0   0   0   0
undefined_2   0   0   0   0   0   0
undefined_3   0   0   0   0   0   0

In [416]:
print ("Purity(k=6)= %f " % Purity(confusion(y, labels)))


Purity(k=6)= 0.526667