Here we propose a framework for the K-means method applied to the MNIST database. The algorithm we use is Lloyd's algorithm (closely related to Voronoi diagrams). We will study the performance of this algorithm and its "online" behaviour, to understand how the clusters are computed.
Let us briefly describe how this algorithm works. It is an iterative algorithm that, given an initial estimate of the centroids $\mu_i$, progresses by repeating two steps:
Step 1 Each point is assigned to the cluster whose centroid gives the smallest energy, i.e. the smallest distance in Euclidean space. In the Euclidean case, we can think of this step as the assignment of each point to the nearest centroid.
Step 2 For each cluster, a new centroid is calculated: $$ \mu_i = \text{arg min}_{\hat \mu} \sum_{x_j \in S_i} \lvert \lvert x_j - \hat \mu \lvert \lvert^2 $$ The function we want to minimize is differentiable, and setting its derivative to zero gives: $$ \mu_i = \frac{1}{\lvert S_i \lvert} \sum_{x_j \in S_i} x_j $$
This algorithm converges to a local minimum.
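As a quick numerical sanity check of Step 2 (an illustrative sketch, separate from the implementation below), we can verify on a toy cluster that the mean has a lower within-cluster energy than nearby candidate centroids:

import numpy as np

S_i = np.array([[0., 0.], [2., 0.], [1., 3.]])   # a toy cluster of three points
mu = S_i.mean(axis=0)                            # the closed-form minimizer

def energy(mu_hat):
    # within-cluster energy: sum of squared distances to the candidate centroid
    return np.sum(np.linalg.norm(S_i - mu_hat, axis=1) ** 2)

print(energy(mu))                  # energy at the mean
print(energy(mu + [0.1, 0.0]))     # any perturbed candidate gives a larger energy
print(energy(mu - [0.0, 0.2]))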
We are going to create Gaussian blobs, with various standard deviations and input-space dimensions. The idea is to see how the algorithm performs in such conditions.
In [1]:
import numpy as np

def random_sample(X, k):
    '''
    Take a random sample of k points of X to generate the initial centroids
    '''
    return X[np.random.choice(X.shape[0], k, replace=False), :]

def pairwise_distances_argmin(X, y):
    '''
    Return, for each point of X, the index of the closest centroid in y
    '''
    indices = np.empty(X.shape[0], dtype=np.intp)
    for i in range(len(X)):
        indices[i] = np.linalg.norm(X[i, np.newaxis] - y, axis=1).argmin()
    return indices
In [2]:
def kmeans_iteration(X, m, distance=pairwise_distances_argmin):
    '''
    One iteration of Lloyd's algorithm
    '''
    clusters = distance(X, m)
    centroids = np.empty(m.shape)
    for i in range(len(m)):
        centroids[i] = np.mean(X[clusters == i], axis=0)
    return centroids, clusters

def kmeans(X, k, m=None, distance=pairwise_distances_argmin):
    '''
    Run K-means until we are close to convergence
    '''
    if m is None:
        m = random_sample(X, k)
    while True:
        new_m, clusters = kmeans_iteration(X, m, distance)
        if np.isclose(m, new_m).all():
            break
        m = new_m
    return new_m, clusters
In [3]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
import seaborn as sns

plt.style.use('ggplot')
sns.set_style('whitegrid')  # graphic style used in Jupyter with Python 3
rc('figure', figsize=(6, 4))
cmap = cm.get_cmap('rainbow')  # colors for the clusters

def plot_clusters(X, m, clusters):
    '''
    Plot all the points in the clusters with their respective colors
    '''
    k = len(m)
    for i in range(k):
        group = X[clusters == i]
        plt.scatter(group[:,0], group[:,1], marker='.', color=cmap(i / k))
        plt.scatter(m[i,0], m[i,1], marker='s', lw=2, color=cmap(i / k), edgecolor='k')
    plt.show()
In [4]:
def classify_cluster(clusters, centroids, X):
    """
    Assigns a digit label to each cluster.
    X is a list of labelled datapoints (label, vector) and clusters gives the cluster index of each point.
    IMPORTANT: this function depends on clusters and centroids being in the same order.
    """
    labelled_centroids = []
    clusters_list = {c: [] for c in range(len(centroids))}
    for i in range(len(clusters)):
        clusters_list[clusters[i]].append((X[i][0], X[i][1]))
    clusters_final = list(clusters_list.values())
    for i in range(len(clusters_final)):
        labels = list(map(lambda x: x[0], clusters_final[i]))
        # pick the most common label
        most_common = max(set(labels), key=labels.count)
        centroid = (most_common, centroids[i])
        labelled_centroids.append(centroid)
    return labelled_centroids

def classify_digit(digit, labelled_centroids):
    """
    given an unlabelled digit represented by a vector and a list of
    labelled centroids [(label,vector)], determine the closest centroid
    and thus classify the digit.
    """
    mindistance = float("inf")
    for (label, centroid) in labelled_centroids:
        distance = np.linalg.norm(centroid - digit)
        if distance < mindistance:
            mindistance = distance
            closest_centroid_label = label
    return closest_centroid_label

def get_error_rate(digits, labelled_centroids):
    """
    classifies a list of labelled digits. returns the error rate.
    """
    classified_incorrect = 0
    for (label, digit) in digits:
        classified_label = classify_digit(digit, labelled_centroids)
        if classified_label != label:
            classified_incorrect += 1
    error_rate = classified_incorrect / float(len(digits))
    return error_rate
In [5]:
from sklearn.datasets import make_blobs
%matplotlib inline

k = 6       # Number of centers
sigma = 3   # Std deviation
X, label = make_blobs(n_samples=2000, centers=k, cluster_std=sigma)
m, clusters = kmeans(X, k)
plt.title("Results of Lloyd's algorithm for {} clusters, with standard deviation {}".format(k, sigma))
plot_clusters(X, m, clusters)

# Without cluster classification
plt.plot(X[:,0], X[:,1],
         linestyle='', marker='.',
         color=cmap(0.2), markeredgecolor=cmap(0.25))
plt.title('Blobs for {} clusters, with standard deviation {}, without identification'.format(k, sigma))
plt.show()
Due to the high variance, the individual blobs are hard to recognize by eye. Nevertheless, when applying the K-means algorithm, the centroids are computed so as to separate the data into 6 clusters of roughly equal size.
In order to obtain a better understanding of the algorithm, we will now study the 'online' algorithm, with some animations.
In [6]:
import matplotlib.animation as animation

class KMeansAnimation:
    def __init__(self, fig, ax, X, m=None, k=2):
        self.X = X
        self.fig = fig
        self.m = m if m is not None else random_sample(X, k)
        # We have to call plot for each cluster and its centroid
        # because we want to distinguish the clusters by color
        # and draw the centroid with a different marker
        self.clusters, self.centroids = [], []
        for i in range(k):
            color = cmap(i / k)
            self.clusters.append(
                ax.plot([], [],
                        linestyle='', marker='.',
                        markeredgecolor=color, color=color)[0]
            )
            self.centroids.append(
                ax.plot([], [],
                        linestyle='', marker='s',
                        markeredgewidth=1,
                        markersize=8, color=color, markeredgecolor='k')[0]
            )

    def update(self, t):
        self.m, clusters = kmeans_iteration(self.X, self.m)
        self.fig.suptitle('n = {}, centers = {} – Iteration {}'.format(
            len(self.X), len(self.m), t + 1)
        )
        # To update the plot, we simply call set_data on the saved artists
        for i in range(len(self.m)):
            group = self.X[clusters == i]
            self.clusters[i].set_data(group.T)
            self.centroids[i].set_data(self.m[i])
        return self.clusters + self.centroids
In [7]:
from IPython.display import HTML

def make_animation(X, k, m=None, frames=20):
    '''
    Display the animation in the Jupyter notebook
    '''
    fig = plt.figure(figsize=(6, 4))
    (xmin, ymin), (xmax, ymax) = np.min(X, axis=0), np.max(X, axis=0)
    ax = plt.axes(xlim=(xmin, xmax), ylim=(ymin, ymax))
    control = KMeansAnimation(fig, ax, X, m=m, k=k)
    anim = animation.FuncAnimation(
        fig, control.update,
        frames=frames, interval=700, blit=True,
    )
    # Necessary, otherwise the notebook will display the final figure
    # along with the animation
    plt.close(fig)
    return HTML(anim.to_html5_video())
In [8]:
k = 6       # Number of blobs
sigma = 3   # Std deviation
X, _ = make_blobs(n_samples=2000, centers=k, cluster_std=sigma)
print("Results of Lloyd's algorithm for {} blobs, with standard deviation {}, using fewer centers than blobs".format(k, sigma))
make_animation(X, k=4, frames=20)
Out[8]:
To better see the algorithm in action, we generate more blobs than clusters: the data come from 6 centers but we run K-means with only 4 centroids.
As observed above, even though the blobs are not completely recognizable, the centers of the clusters keep moving until they reach an equilibrium.
In [9]:
k = 6
sigma = 2.2
X, _ = make_blobs(n_samples=2000, centers=k, cluster_std=sigma)
m = np.array([[0, 2.5], [3, 11], [4, 10], [5.5, 2]])
print("Results of Lloyd's algorithm for {} blobs, with standard deviation {}, with close centroids at time 0".format(k, sigma))
make_animation(X, k=4, m=m, frames=20)
Out[9]:
If we choose centroids that are close to each other at time $t=0$, we observe a strong displacement of the centers. This is a tougher problem for the algorithm, and we can follow the journey of the orange centroid: since the algorithm tends to put distance between the centroids, if two centroids start too close, one of them will eventually be pushed farther away.
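Since Lloyd's algorithm only converges to a local minimum, different random initializations can also end up in configurations of different quality. Here is a minimal sketch (assuming the kmeans helper and the make_blobs import defined above) that compares the final within-cluster energy over a few random restarts:

def total_energy(X, m, clusters):
    # sum of squared distances of each point to its assigned centroid
    return sum(np.sum((X[clusters == i] - m[i]) ** 2) for i in range(len(m)))

X_test, _ = make_blobs(n_samples=2000, centers=6, cluster_std=2.2)
for run in range(5):
    m_run, clusters_run = kmeans(X_test, 4)   # a new random initialization each run
    print('run {}: final energy = {:.1f}'.format(run, total_energy(X_test, m_run, clusters_run)))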
To finish with these animations, let us observe what happens when running the algorithm on a uniformly distributed population:
In [10]:
X_dense = np.random.rand(2000, 2)
make_animation(X_dense, k=11)
Out[10]:
As we can see, Lloyd's algorithm tends to partition the space into clusters of uniform area. After running the algorithm on a dense space, it should be clear that Lloyd's algorithm is closely related to Voronoi diagrams. In fact, at each iteration we are effectively computing the Voronoi tessellation with respect to the centroids.
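To make this connection concrete, here is a small check (a sketch assuming scipy is available, reusing X_dense and the kmeans helper from above) that the final cluster assignment coincides with nearest-centroid, i.e. Voronoi-cell, membership:

from scipy.spatial import cKDTree

m_unif, clusters_unif = kmeans(X_dense, k=11)
# for each point, the index of the nearest centroid identifies its Voronoi cell
_, voronoi_cell = cKDTree(m_unif).query(X_dense)
print(np.all(voronoi_cell == clusters_unif))   # expected: True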
So far we have only described how the algorithm works. We will now study how it performs with respect to several parameters, by computing the error rate once the iterations have finished.
In [32]:
# Define the lists of variables: dimensions, standard deviation
list_sigma = np.arange(1.5, 6, 0.1)
list_dimensions = np.arange(2, 50, 1)
list_error_rates = np.zeros(len(list_dimensions))
k = 3      # Number of centers
sigma = 6
for i, dim in enumerate(list_dimensions):
    X, label = make_blobs(n_samples=2000, centers=k, n_features=dim, cluster_std=sigma)
    X_final = list(zip(label, X))
    m, clusters = kmeans(X, k)
    labelled_centroids = classify_cluster(clusters, m, X_final)
    error_rate = get_error_rate(X_final, labelled_centroids)
    list_error_rates[i] = error_rate

# Graphic plot
x_axis = list_dimensions
y_axis = list_error_rates
plt.figure()
plt.title("Error rate as a function of the dimension for {} centers and a standard deviation of {}".format(k, sigma))
plt.scatter(x_axis, y_axis)
plt.xlabel("Dimension of the input space")
plt.ylabel("Error Rate")
plt.show()
On this graph we have plotted the error rate obtained as a function of the input-space dimension when we try to classify 3 Gaussian clusters. Note that we chose a high standard deviation in order to see an evolution of the error rate.
As with the perceptron, we observe that increasing the dimension of the input space decreases the number of errors we make. Indeed, a higher dimension gives the algorithm more coordinates with which to separate the classes: the high standard deviation is compensated for by a high-dimensional input space.
There are other parameters to vary, so we now turn to 3-D plots to observe all the dependencies.
In [11]:
# Define the lists of variables: dimensions, standard deviation and number of centers
list_sigma = np.arange(4, 12, 2)
list_dimensions = np.arange(2, 50, 5)
list_centers = np.arange(2, 50, 5)
list_error_rates = np.zeros((len(list_dimensions), len(list_centers), len(list_sigma)))  # Full results
for k, centers in enumerate(list_centers):
    for j, sigma in enumerate(list_sigma):
        for i, dim in enumerate(list_dimensions):
            X, label = make_blobs(n_samples=2000, centers=centers, n_features=dim, cluster_std=sigma)
            X_final = list(zip(label, X))
            m, clusters = kmeans(X, centers)
            labelled_centroids = classify_cluster(clusters, m, X_final)
            error_rate = get_error_rate(X_final, labelled_centroids)
            list_error_rates[i, k, j] = error_rate
In [12]:
# Graphic packages
import plotly as plotly  # Plotting package, really good for 3D and interactive graphs
plotly.tools.set_credentials_file(username='Kazoo', api_key='fVOEm7jiacAQuzw7juG0')  # Just to reinitialize the API key
from plotly.graph_objs import *
import plotly.plotly as py
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)  # Allows plotting the graphs in the notebook
In [13]:
# Plot of the full data
z1 = list_error_rates[:,:,0]
z2 = list_error_rates[:,:,1] + 2
z3 = list_error_rates[:,:,2] + 4
z4 = list_error_rates[:,:,3] + 6
init_notebook_mode(connected=True)
layout = Layout(dict(title='Error rates of the K-means Algorithms',
                     scene=dict(
                         yaxis=dict(
                             title='Dimension',
                             ticklen=7,
                             zeroline=False,
                             gridwidth=2,
                             ticktext=list_dimensions,
                             tickvals=np.arange(0, len(list_dimensions), 1),),
                         xaxis=dict(
                             title='Number of centers',
                             ticklen=7,
                             gridwidth=2,
                             ticktext=list_centers,
                             tickvals=np.arange(0, len(list_centers), 1),
                         ),
                         zaxis=dict(
                             title='Standard deviation',
                             ticklen=7,
                             gridwidth=2,
                             ticktext=list_sigma,
                             tickvals=np.arange(0, len(list_sigma), 1),
                         ),
                     ),
                     showlegend=True))
data = Data([
    dict(z=z1, type='surface', name='Sigma = ' + str(list_sigma[0]), colorbar=dict(
        title="Error rate at the end")),
    dict(z=z2, showscale=False, opacity=1, type='surface', name='Sigma = ' + str(list_sigma[1])),
    dict(z=z3, showscale=False, opacity=1, type='surface', name='Sigma = ' + str(list_sigma[2])),
    dict(z=z4, showscale=False, opacity=1, type='surface', name='Sigma = ' + str(list_sigma[3])),
])
fig = dict(data=data,
           layout=layout
           )
py.iplot(fig, filename='multiple-surfaces-error-rates')
Out[13]:
This graph gives us a lot of information. First of all, each surface corresponds to a different value of the standard deviation used when generating the blobs.
In each case the error rate goes from almost 1 to 0 as the number of centers and the dimension vary. It is important to note that the total number of points is fixed at 2000. As before, increasing the dimension decreases the error rate. We can now add that increasing the number of clusters increases the error rate. The explanation is simple: because of the high variance and the fixed number of points, if we increase the number of clusters they become smaller and, given the standard deviation, they mix together.
Finally, we naturally observe that a high standard deviation gives our algorithm a harder problem to solve, which is logical because all the points are more mixed together.
First of all we are going to reduce the dimension of the data, both to be able to plot it and to highlight the difficulties that come with real datasets.
In [14]:
import random
from base64 import b64decode
from json import loads
import numpy as np
import matplotlib.pyplot as plt
# set matplotlib to display all plots inline with the notebook
%matplotlib inline
In [15]:
def parse(x):
    """
    Parse a line of the digits file into a tuple of
    (labelled digit, numpy array of the vector representation of the digit)
    """
    digit = loads(x)
    array = np.fromstring(b64decode(digit["data"]), dtype=np.ubyte)
    array = array.astype(np.float64)
    return (digit["label"], array)
In [16]:
# Read in the digits file. digits is a list of 60,000 tuples,
# each containing a labelled digit and its vector representation.
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

with open("data/digits.base64.json", "r") as f:
    digits = list(map(parse, f.readlines()))
data = []
for i in range(int(len(digits) * 0.75)):
    data.append(digits[i][1])
data = scale(data)  # scale the data
reduced_data = PCA(n_components=2).fit_transform(data)  # Reduced data, to plot it in 2D
# Graphics stuff
print('PCA-reduced MNIST data and K-means algorithm for 10 clusters')
make_animation(reduced_data, k=10, frames=20)
Out[16]:
We see that our algorithm is able to find clusters in the data. However, we are not sure whether this clustering is accurate or whether the centroids are essentially arbitrary. To answer this question, the next plot colors each point by the true label of its digit instead of by cluster.
In [123]:
label_digit = list()
for i in range(len(reduced_data)):
    label_digit.append(digits[i][0])
training = zip(label_digit, reduced_data)
plt.figure(1, figsize=(8, 6))
colormap = np.array(['r', 'g', 'b', 'y', 'm', 'k', 'c', 'crimson', 'orange', 'pink'])
for i in range(10):  # scatter plot with a legend entry per digit
    group0 = list(); group1 = list()
    for k in range(len(reduced_data)):
        if label_digit[k] == i:
            group0.append(reduced_data[k, 0])
            group1.append(reduced_data[k, 1])
    plt.scatter(group0, group1, s=10, c=colormap[i], label=i)
plt.title('Reduced MNIST data')
plt.legend(loc=9, bbox_to_anchor=(1.3, 0.6), ncol=3)
plt.show()
When we plot the points with their true labels, the real clusters are all mixed up and do not correspond to the ones we observed previously. We can therefore conclude that our algorithm does not really work on the PCA-reduced MNIST data.
The reason is that real datasets are noisy, and the PCA reduction discards much of the information we had; that is why the K-means algorithm does not perform very well here.
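To quantify how much information the projection discards, here is a small check (a sketch reusing the scaled data and the PCA import from the cell above) printing the fraction of variance kept by the first two principal components:

# illustrative check of how much variance the 2-D projection retains
pca = PCA(n_components=2).fit(data)
print('variance explained by 2 components: {:.1%}'.format(pca.explained_variance_ratio_.sum()))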
We are now going to apply our algorithm to the full digit pictures of the MNIST dataset. We hope the results will be better than with PCA, since no information is lost through preprocessing.
In [43]:
# Load all the data
with open("data/digits.base64.json", "r") as f:
    digits = list(map(parse, f.readlines()))
# Pick a ratio for splitting the digits list into a training and a validation set. A classic choice is 0.25
ratio = int(len(digits) * 0.25)
validation = digits[:ratio]
training = digits[ratio:]
validation_data = np.zeros((ratio, 784))
validation_label = np.zeros((ratio, 1))
for i in range(ratio):
    validation_data[i] = digits[i][1]
    validation_label[i] = digits[i][0]
training_data = np.zeros((len(digits) - ratio, 784))
training_label = np.zeros((len(digits) - ratio, 1))
for i in range(len(digits) - ratio):
    training_data[i] = digits[i + ratio][1]
    training_label[i] = digits[i + ratio][0]
In [44]:
def display_digit(digit, labeled=True, title=""):
    """
    Graphically displays a 784x1 vector, representing a digit
    """
    if labeled:
        digit = digit[1]
    image = digit
    plt.figure()
    fig = plt.imshow(image.reshape(28, 28))
    fig.set_cmap('gray_r')
    fig.axes.get_xaxis().set_visible(False)
    fig.axes.get_yaxis().set_visible(False)
    if title != "":
        plt.title("Inferred label: " + str(title))
In [45]:
def classify_digit(digit, labelled_centroids):
    """
    given an unlabelled digit represented by a vector and a list of
    labelled centroids [(label,vector)], determine the closest centroid
    and thus classify the digit.
    """
    mindistance = float("inf")
    for (label, centroid) in labelled_centroids:
        distance = np.linalg.norm(centroid - digit)
        if distance < mindistance:
            mindistance = distance
            closest_centroid_label = label
    return closest_centroid_label

def get_error_rate(digits, labelled_centroids):
    """
    classifies a list of labelled digits. returns the error rate.
    """
    classified_incorrect = 0
    for (label, digit) in digits:
        classified_label = classify_digit(digit, labelled_centroids)
        if classified_label != label:
            classified_incorrect += 1
    error_rate = classified_incorrect / float(len(digits))
    return error_rate
In [46]:
def assign_labels_to_centroids(clusters, centroids):
    """
    Assigns a digit label to each cluster.
    The cluster assignments refer to the training digits, i.e. digits[ratio:].
    IMPORTANT: this function depends on clusters and centroids being in the same order.
    """
    labelled_centroids = []
    clusters_list = {c: [] for c in range(len(centroids))}
    for i in range(len(clusters)):
        clusters_list[clusters[i]].append((digits[i + ratio][0], digits[i + ratio][1]))
    clusters_final = list(clusters_list.values())
    for i in range(len(clusters_final)):
        labels = list(map(lambda x: x[0], clusters_final[i]))
        # pick the most common label
        most_common = max(set(labels), key=labels.count)
        centroid = (most_common, centroids[i])
        labelled_centroids.append(centroid)
    return labelled_centroids
In [47]:
k=10
m, clusters = kmeans(training_data, k)
labelled_centroids = assign_labels_to_centroids(clusters,m)
error_rate = get_error_rate(validation, labelled_centroids)
In [51]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

print('Total error rate : {}'.format(error_rate))

def print_results(validation, labelled_centroids):
    for i in range(10):
        print('-----------------------------------------')
        print(bcolors.BOLD + 'Classification of the digit : ' + str(i) + ' Using ' + str(k) + ' clusters' + bcolors.ENDC)
        label_list = []
        frequency = {x: 0 for x in range(10)}
        for (label, digit) in validation:
            inferred_label = classify_digit(digit, labelled_centroids)
            if inferred_label == i:
                label_list.append(digit)
                frequency[label] += 1
        print(bcolors.BOLD + "Digit in the cluster    Frequency" + bcolors.ENDC)
        for d in range(len(frequency)):
            print(bcolors.BOLD + str(d) + "                       " + str(frequency[d]) + bcolors.ENDC)

print_results(validation, labelled_centroids)
As we can see, when classifying the data with 10 clusters we achieve about a 60% success rate. This is not that bad, but if we look further into the results we observe an interesting phenomenon. For some digits, like 2, the results are really good: most of the digits assigned to the corresponding cluster are correct. For other digits, like 5, the results are really poor.
Overall, this algorithm gives quite fluctuating results; its quality strongly depends on which digit we are trying to classify.
In the first two parts of this work we have seen the limits of the K-means algorithm and how to apply it to real data. However, each time we had to choose the number of clusters ourselves. We can therefore naturally ask: how does the error rate depend on the number of clusters, and how should we choose it?
In [178]:
error_rates = {x: None for x in range(5, 20)}
list_centroids = list()
max_iter = 20
for k in range(5, 20):
    m, clusters = kmeans(training_data, k)
    labelled_centroids = assign_labels_to_centroids(clusters, m)
    error_rate = get_error_rate(validation, labelled_centroids)
    error_rates[k] = error_rate
    list_centroids.append(m)

# Show the error rates
x_axis = sorted(error_rates.keys())
y_axis = [error_rates[key] for key in x_axis]
plt.figure()
plt.title("Error Rate with respect to the Number of Clusters")
plt.scatter(x_axis, y_axis)
plt.xlabel("Number of Clusters")
plt.ylabel("Error Rate")
plt.show()
This plot highlights an interesting result: when the number of clusters used for the classification increases, the error rate decreases (even though there seems to be a plateau around 10).
Indeed, with more clusters than actual categories, the algorithm can follow the contours of the classes more accurately and therefore lower its error rate. Each digit can now be represented by several clusters, which makes it easier to recognize the characteristic variations of that digit. See the sketch below for a direct illustration.
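One way to see this directly is to count how many clusters end up assigned to each digit. A small sketch, reusing labelled_centroids, which still holds the labelling from the last value of k in the loop above:

from collections import Counter

# several clusters typically share the same digit label when k > 10
print(Counter(label for label, _ in labelled_centroids))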
In [181]:
import pickle as pickle

with open('data/donnees_kmeans', 'wb') as fichier:
    my_pickler = pickle.Pickler(fichier)
    my_pickler.dump(error_rates)
    my_pickler.dump(list_centroids)
    readme = ['centroids', 'erreurs']
    my_pickler.dump(readme)
In order to choose the number of clusters and address this unsupervised-learning problem, we are going to present the gap statistic, a method developed by Tibshirani, Walther and Hastie in 2001.
Let $W_k$ be the total energy of the clusters when $X$ is partitioned into $k$ clusters: $$ W_k = \sum_{i=1}^{k} \sum_{x_j \in S_i} \lvert \lvert x_j - \mu_i \lvert \lvert^2$$ The idea is to compare $\log W_k$ with a null reference distribution of our data, i.e. a distribution with no obvious clustering. In their article they claim that the optimal number of clusters is the value of $k$ for which $\log W_k$ falls the farthest below the reference curve. This information is contained in the quantity: $$ \text{Gap}_n (k) = E_n\{\log W_k^{\star}\} - \log W_k$$ where $E_n$ denotes an expectation under a sample of size $n$ from the reference distribution. The estimate $\hat k$ will be the value of $k$ that maximizes this quantity.
The implementation of this method is the following:
Step 1 Cluster the data for each candidate $k$ and compute the total energy $W_k$ of each partition.
Step 2 Generate $N$ reference data sets using Monte-Carlo sampling and compute the total energy $W_{i,k}$ for each one. The gap is then: $$ \text{Gap}(k) = \frac{1}{N} \sum_{i=1}^{N} \log W_{i,k} - \log W_k$$ It is important to note that we need to account for the simulation error. If $\sigma_k$ denotes the standard deviation of the log-energies of the reference sets, then this error is: $$ e_k = \sigma_k \sqrt{1 + \frac{1}{N}}$$ Finally, the optimal number of clusters $\hat k$ is the smallest $k$ such that $\text{Gap}(k) - \text{Gap}(k+1) + e_{k+1} \geq 0$.
Let us first apply this method to the Gaussian distributions of the first part; afterwards we will see what happens on the MNIST data (PCA-reduced only, because of the cost of the Monte-Carlo sampling).
In [21]:
def Wk(X, centroids, clusters):
    """
    Total energy of the clustering
    """
    return np.sum([np.linalg.norm(X[i] - centroids[clusters[i]]) ** 2
                   for i in range(len(X))])

def monte_carlo(X, xmin, xmax, ymin, ymax, n=None):
    # n is the sample size
    # Monte-Carlo sampling of a uniform reference distribution in 2-D
    n = n if n is not None else len(X)
    xs = np.random.uniform(xmin, xmax, size=(n, 1))
    ys = np.random.uniform(ymin, ymax, size=(n, 1))
    return np.concatenate([xs, ys], axis=1)

def gap_stats(X, K=8, B=10, n=None):
    """
    Generate all the statistics on the gaps.
    It follows directly the method described in the paper.
    """
    (xmin, ymin), (xmax, ymax) = np.min(X, axis=0), np.max(X, axis=0)
    ks = np.arange(1, K + 1)
    # Generate B Monte-Carlo samples (uniform) from the bounding box of X
    samples = [monte_carlo(X, xmin, xmax, ymin, ymax, n) for _ in range(B)]
    # Total energy of X for each k
    Wks = np.empty(K)
    # Mean total energy of the samples for each k
    sample_Wks = np.empty(K)
    # Corrected standard deviation for each k
    sk = np.empty(K)
    for k in ks:
        Wks[k - 1] = np.log(Wk(X, *kmeans(X, k)))
        # Total energy for each sample
        current_Wks = np.empty(B)
        for i in range(B):
            sample = samples[i]
            current_Wks[i] = np.log(Wk(sample, *kmeans(sample, k)))
        sample_Wks[k - 1] = current_Wks.mean()
        sk[k - 1] = np.sqrt(((current_Wks - sample_Wks[k - 1]) ** 2).mean())
    # Correction factor
    sk *= np.sqrt(1 + 1 / B)
    gaps = sample_Wks - Wks
    return ks, Wks, sample_Wks, gaps, sk

import matplotlib.ticker as ticker

def gaps_info(X, ks, Wks, sample_Wks, gaps, sk):
    """
    All the plotting options for describing the statistics.
    Some help for the plotting from: http://signal-to-noise.xyz/kmeans.html
    """
    fig, axes = plt.subplots(2, 2, figsize=(8, 7))
    axes[0,0].plot(X[:,0], X[:,1],
                   linestyle='', marker='.',
                   color=cmap(0.2), markeredgecolor=cmap(0.25))
    line1, = axes[0,1].plot(ks, Wks, marker='.', markersize=10)
    line2, = axes[0,1].plot(ks, sample_Wks, marker='.', markersize=10)
    axes[0,1].legend(
        (line1, line2),
        (r'$\log W_k$', r'$\frac{1}{B}\sum_{b = 1}^B\,\log W_{kb}^*$')
    )
    axes[1,0].plot(ks, gaps, marker='.', markersize=10)
    axes[1,0].set_ylabel('$\operatorname{Gap}(k)$')
    gaps_diff = gaps[:-1] - gaps[1:] + sk[1:]
    barlist = axes[1,1].bar(ks[:-1], gaps_diff,
                            width=0.5, align='center')
    barlist[np.argmax(gaps_diff > 0)].set_color(sns.xkcd_rgb['pale red'])
    axes[1,1].set_ylabel('$\operatorname{Gap}(k) -'
                         ' \operatorname{Gap}(k + 1) + e_{k + 1}$')
    for (i, j) in ((0, 1), (1, 0), (1, 1)):
        axes[i,j].set_xlabel('Clusters')
        axes[i,j].xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    fig.tight_layout(w_pad=3)
    plt.show()
In [18]:
X, _ = make_blobs(n_samples=1000, centers=3, cluster_std=1)
ks, Wks, sample_Wks, gaps, sk = gap_stats(X, K=5)
gaps_info(X, ks, Wks, sample_Wks, gaps, sk)
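The figure shows the decision quantity, and we can also read off $\hat k$ directly. Here is a small helper (an illustrative sketch applying the rule stated above to the statistics returned by gap_stats):

def optimal_k(ks, gaps, sk):
    # smallest k such that Gap(k) - Gap(k+1) + e_{k+1} >= 0
    for i in range(len(ks) - 1):
        if gaps[i] - gaps[i + 1] + sk[i + 1] >= 0:
            return ks[i]
    return ks[-1]

print('estimated number of clusters:', optimal_k(ks, gaps, sk))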
Here we generated blobs with 3 centers, and the algorithm indeed confirms that $k=3$ is the optimal value. But this simulation was done with a small standard deviation; what happens when we increase it?
In [19]:
X, _ = make_blobs(n_samples=1000, centers=3, cluster_std=2.5)
ks, Wks, sample_Wks, gaps, sk = gap_stats(X, K=5)
gaps_info(X, ks, Wks, sample_Wks, gaps, sk)
Here the clusters are close and slightly fused. Looking at them, it seems that there are only 2 clusters, even though we actually generated the data from 3 centers. And indeed the algorithm gives an optimal value of $k=2$. This highlights the fact that the method for finding the optimal number of clusters is, like the K-means algorithm itself, sensitive to the noise in the data. We could explore what happens with more clusters, but instead we will look at the result of this method on the PCA-reduced MNIST data.
In [23]:
from time import time
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
import seaborn as sns

data = []
for i in range(int(len(digits) * 0.25)):
    data.append(digits[i][1])
data = scale(data)
# Even if the results aren't as good with PCA, for this first implementation it is useful to stay with it,
# otherwise we would need to run the Monte-Carlo sampling in 784 dimensions
#reduced_data = PCA(n_components=2).fit_transform(data)  # Reduced data, to plot it in 2D
ks, Wks, sample_Wks, gaps, sk = gap_stats(reduced_data, K=15)
gaps_info(reduced_data, ks, Wks, sample_Wks, gaps, sk)
For the MNIST data, the data are so noisy that the algorithm gives $k=1$ as the optimal number of clusters. This is consistent with the poor K-means results on the PCA-reduced data and highlights the fact that the PCA preprocessing removes too much of the information in the dataset.
To finish this report on the K-means algorithm, we are going to look at another kind of limitation: the shape of the clusters we are trying to identify. K-means performs well when the clusters are linearly separable.
Let's generate some ring-shaped clusters and observe the behavior of the algorithm.
In [24]:
plt.figure(2)
plt.scatter(np.random.normal(size=1000),np.random.normal(size=1000))
r=np.random.normal(loc=5,scale=0.2,size=1000)
theta=np.random.uniform(0,2*np.pi,size=1000)
plt.scatter(r*np.cos(theta),r*np.sin(theta))
plt.show()
In [25]:
data1 = np.random.normal(size=(1000, 2))
data2x = r * np.cos(theta)
data2y = r * np.sin(theta)
data2 = np.zeros((1000, 2))
for i in range(1000):
    data2[i][0] = data2x[i]
    data2[i][1] = data2y[i]
data = np.concatenate((data1, data2), axis=0)
np.random.shuffle(data)
make_animation(data, k=2, frames=20)
Out[25]:
The clusters we obtain are not the ones we expect. To solve this problem there are two solutions:
- Choose an appropriate kernel in order to work in a higher-dimensional space
- Choose a better representation of the data
We are going to work with polar coordinates to solve the previous problem.
In [26]:
class KMeansAnimation_polar:
    def __init__(self, fig, ax, X, m=None, k=2):
        self.X = X
        self.fig = fig
        self.m = m
        # We have to call plot for each cluster and its centroid
        # because we want to distinguish the clusters by color
        # and draw the centroid with a different marker
        self.clusters, self.centroids = [], []
        for i in range(k):
            color = cmap(i / k)
            self.clusters.append(
                ax.plot([], [],
                        linestyle='', marker='.',
                        markeredgecolor=color, color=color)[0]
            )
            self.centroids.append(
                ax.plot([], [],
                        linestyle='', marker='s',
                        markeredgewidth=1,
                        markersize=8, color=color, markeredgecolor='k')[0]
            )

    def update(self, t):
        self.m, clusters = kmeans_iteration(self.X, self.m)
        self.fig.suptitle('n = {}, centers = {} – Iteration {}'.format(
            len(self.X), len(self.m), t + 1)
        )
        # To update the plot, we convert back to Cartesian coordinates
        # and call set_data on the saved artists
        for i in range(len(self.m)):
            group = self.X[clusters == i]
            for l in range(len(group)):
                group[l] = pol2cart(group[l][1], group[l][0])
            self.clusters[i].set_data(group.T)
            self.centroids[i].set_data(pol2cart(self.m[i][1], self.m[i][0]))
        return self.clusters + self.centroids
In [27]:
def make_animation_polar(X, k, m=None, frames=20):
    '''
    Display the animation in the Jupyter notebook (data given in polar coordinates)
    '''
    fig = plt.figure(figsize=(6, 4))
    ax = plt.axes(xlim=(-5, 5), ylim=(-5, 5))
    control = KMeansAnimation_polar(fig, ax, X, m=m, k=k)
    anim = animation.FuncAnimation(
        fig, control.update,
        frames=frames, interval=700, blit=True,
    )
    # Necessary, otherwise the notebook will display the final figure
    # along with the animation
    plt.close(fig)
    return HTML(anim.to_html5_video())
In [28]:
r = np.random.normal(loc=4, scale=0.3, size=1000)
theta = np.random.uniform(0, 2 * np.pi, size=1000)
data2pol = np.zeros((1000, 2))

def z2polar(z):
    return (np.abs(z), np.angle(z))

def pol2cart(rho, phi):
    x = rho * np.cos(phi)
    y = rho * np.sin(phi)
    return np.array([x, y])

data1 = np.random.normal(loc=(0, 0), scale=0.8, size=(1000, 2))
data1pol = np.zeros((1000, 2))
for i in range(1000):
    z, angle = z2polar(data1[i][0] + 1j * data1[i][1])
    data1pol[i][1] = z
    data1pol[i][0] = angle
    data2pol[i][1] = r[i]
    data2pol[i][0] = theta[i]
data = np.concatenate((data1pol, data2pol), axis=0)
np.random.shuffle(data)
m = np.array([[0, 2.5], [0, 3]])
make_animation_polar(data, k=2, m=m, frames=20)
Out[28]:
With polar coordinates the K-means algorithm works and we obtain the correct clusters.
To conclude, we have shown how the K-means algorithm works and performs, together with some of its limits, which remind us that we need to be careful when using it, since the solutions it finds are only local minima.