In [1]:
import numpy as np
def random_sample(X, k, rng=None):
    """Draw ``k`` distinct entries of ``X`` (rows, for a 2-D array) at random.

    Parameters
    ----------
    X : numpy.ndarray
        Array to sample from; sampling happens along its first axis.
    k : int
        Number of entries to draw, without replacement.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` still controls the outcome.

    Returns
    -------
    numpy.ndarray
        A new array holding the ``k`` selected entries, in the order drawn.

    Raises
    ------
    ValueError
        Raised by ``choice`` when ``k`` exceeds ``X.shape[0]``.
    """
    chooser = np.random if rng is None else rng
    picked = chooser.choice(X.shape[0], k, replace=False)
    # Index only along the first axis so arrays of any dimensionality work.
    return X[picked]
def pairwise_distances_argmin(X, y):
    """Return, for every row of ``X``, the index of the closest row of ``y``.

    Closeness is the Euclidean norm of the row difference. When several rows
    of ``y`` are equally close, the lowest index wins (``argmin`` semantics).

    Parameters
    ----------
    X : array-like, 2-D
        Query points, one per row.
    y : array-like, 2-D
        Candidate points, one per row, with as many columns as ``X``.

    Returns
    -------
    numpy.ndarray
        Integer array (dtype ``np.intp``) with one entry per row of ``X``.

    Notes
    -----
    All distances are computed in one broadcast operation, which allocates a
    temporary of shape ``(len(X), len(y), n_columns)``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] == 0:
        # Nothing to assign; mirror the dtype of the non-empty result.
        return np.empty(0, dtype=np.intp)
    # deltas[i, j] is the difference vector between X[i] and y[j].
    deltas = X[:, np.newaxis, :] - y[np.newaxis, :, :]
    distances = np.linalg.norm(deltas, axis=2)
    return distances.argmin(axis=1)
# Tiny sanity check: match each row of X to the index of its nearest row in y.
X = np.array([[0, 0], [5, 5]])
y = np.array([[1, 1], [6, 6]])
# Bare trailing expression so the notebook displays the result as the cell
# output; for these inputs the nearest indices are [0, 1].
pairwise_distances_argmin(X, y)
Out[1]:
In [2]:
def kmeans_iteration(X, m):
    """Run one assignment/update step of k-means.

    Parameters
    ----------
    X : numpy.ndarray
        Data points, one per row.
    m : numpy.ndarray
        Current centroids, one per row.

    Returns
    -------
    centroids : numpy.ndarray
        Float array shaped like ``m``. Row ``i`` is the mean of the points
        assigned to centroid ``i``; a centroid that attracted no points keeps
        its previous position.
    clusters : numpy.ndarray
        For every row of ``X``, the index of the centroid in ``m`` it was
        assigned to.
    """
    clusters = pairwise_distances_argmin(X, m)
    # Start from a float copy of the old centroids. Taking the mean of an
    # empty cluster would yield NaN, and a NaN centroid corrupts every later
    # distance computation, so empty clusters simply stay where they were.
    centroids = np.array(m, dtype=float)
    for i in range(len(m)):
        members = X[clusters == i]
        if len(members):
            centroids[i] = members.mean(axis=0)
    return centroids, clusters
def kmeans(X, k, max_iter=300):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    The starting centroids are ``k`` rows of ``X`` picked by
    ``random_sample``. Iteration stops as soon as the centroids no longer
    move (within ``np.allclose`` tolerance) or after ``max_iter`` steps,
    whichever comes first.

    Parameters
    ----------
    X : numpy.ndarray
        Data points, one per row.
    k : int
        Number of clusters.
    max_iter : int, optional
        Upper bound on the number of iterations. Without a bound the loop
        would never end if the centroids never settle (for instance once a
        centroid becomes NaN, since NaN never compares as close).

    Returns
    -------
    centroids : numpy.ndarray
        Centroids produced by the last iteration.
    clusters : numpy.ndarray
        Centroid index assigned to every row of ``X`` in the last iteration.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")
    m = random_sample(X, k)
    clusters = None
    for _ in range(max_iter):
        new_m, clusters = kmeans_iteration(X, m)
        # Same comparison as np.isclose(m, new_m).all(), default tolerances.
        converged = np.allclose(m, new_m)
        m = new_m
        if converged:
            break
    return m, clusters
In [10]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
import seaborn as sns
# Global plotting defaults for the notebook.
plt.style.use('ggplot')
sns.set_style('whitegrid')
rc('figure', figsize=(6, 4))
# Shared colormap used to give every cluster its own colour.
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in
# 3.9; `plt.get_cmap` returns the same colormap and exists in all versions.
cmap = plt.get_cmap('rainbow')
def plot_clusters(X, m, clusters):
    """Scatter-plot every cluster and its centroid, one colour per cluster.

    Parameters
    ----------
    X : numpy.ndarray
        Data points; the first two columns are used as plot coordinates.
    m : numpy.ndarray
        Centroids, one per row; drawn with a star marker.
    clusters : numpy.ndarray
        Cluster index for every row of ``X``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    n_clusters = len(m)
    for label in range(n_clusters):
        # Points and centroid of one cluster share a colour from the colormap.
        shade = cmap(label / n_clusters)
        members = X[clusters == label]
        xs, ys = members[:, 0], members[:, 1]
        plt.scatter(xs, ys, marker='.', color=shade)
        centre_x, centre_y = m[label, 0], m[label, 1]
        plt.scatter(centre_x, centre_y, marker='*', color=shade)
    plt.show()
In [11]:
from sklearn.datasets import make_blobs
# Fix the seeds so that a fresh "Restart & Run All" reproduces this figure.
X, _ = make_blobs(n_samples=2000, centers=4, cluster_std=2, random_state=0)
# kmeans picks its starting centroids through NumPy's global random state.
np.random.seed(0)
m, clusters = kmeans(X, 4)
plot_clusters(X, m, clusters)
In [16]:
import matplotlib.animation as animation
class KMeansAnimation:
    """Frame callback holder that animates successive k-means iterations.

    One pair of line artists is created per cluster on ``ax``: a dot-marker
    artist for the member points and a large round marker for the centroid.
    ``update`` is meant to be passed to ``FuncAnimation`` as the frame
    function.

    Parameters
    ----------
    fig : matplotlib figure
        Figure whose suptitle shows the iteration counter.
    ax : matplotlib axes
        Axes the artists are created on.
    X : numpy.ndarray
        Data points; the first two columns are plotted.
    m : numpy.ndarray, optional
        Starting centroids, one per row. When given, the number of clusters
        is taken from it; otherwise ``k`` rows of ``X`` are drawn at random.
    k : int, optional
        Number of clusters, used only when ``m`` is omitted.
    """

    def __init__(self, fig, ax, X, m=None, k=2):
        self.X = X
        self.fig = fig
        self.m = m if m is not None else random_sample(X, k)
        # Explicit centroids define the cluster count. Creating artists for
        # `k` instead would make `update` index past the artist lists
        # whenever len(m) > k.
        k = len(self.m)
        # _history[t] caches the (centroids, labels) pair displayed on frame
        # t, computed starting from _initial_m.
        self._initial_m = self.m
        self._history = []
        # We have to call plot for each cluster and its centroid
        # because we want to distinguish the clusters by color
        # and draw the centroid with a different marker
        self.clusters, self.centroids = [], []
        for i in range(k):
            color = cmap(i / k)
            self.clusters.append(
                ax.plot([], [],
                        linestyle='', marker='.',
                        markeredgecolor=color, color=color)[0]
            )
            self.centroids.append(
                ax.plot([], [],
                        linestyle='', marker='o',
                        markeredgewidth=0.5,
                        markersize=10, color=color)[0]
            )

    def update(self, t):
        """Draw frame ``t`` (a non-negative integer) and return the artists.

        Frame ``t`` always shows the state after ``t + 1`` iterations from
        the starting centroids. Results are cached per frame index because
        ``FuncAnimation`` calls the frame function with the first frame one
        extra time for its initial draw when no ``init_func`` is given;
        advancing the state on every call would make the displayed iteration
        number lag behind the real one.
        """
        while len(self._history) <= t:
            previous = self._history[-1][0] if self._history else self._initial_m
            self._history.append(kmeans_iteration(self.X, previous))
        self.m, clusters = self._history[t]
        self.fig.suptitle(u'n = {}, k = {} – Iteration {}'.format(
            len(self.X), len(self.m), t + 1)
        )
        # To update the plot, we simply call set_data on the saved artists
        for i in range(len(self.m)):
            group = self.X[clusters == i]
            self.clusters[i].set_data(group[:, 0], group[:, 1])
            # A centroid is a single point, but set_data needs sequences:
            # recent Matplotlib raises RuntimeError for scalar x/y.
            self.centroids[i].set_data([self.m[i, 0]], [self.m[i, 1]])
        return self.clusters + self.centroids
In [17]:
from IPython.display import HTML
def make_animation(X, k, m=None, frames=20):
    """Render a k-means run on ``X`` as an HTML5 video for the notebook.

    Parameters
    ----------
    X : numpy.ndarray
        Two-column data; its per-column minimum and maximum become the axis
        limits.
    k : int
        Number of clusters, forwarded to ``KMeansAnimation``.
    m : numpy.ndarray, optional
        Starting centroids, forwarded to ``KMeansAnimation``.
    frames : int, optional
        Number of animation frames.

    Returns
    -------
    IPython.display.HTML
        Wrapper around the video markup produced by ``to_html5_video``.
    """
    fig = plt.figure(figsize=(6, 4))
    x_low, y_low = np.min(X, axis=0)
    x_high, y_high = np.max(X, axis=0)
    axes = plt.axes(xlim=(x_low, x_high), ylim=(y_low, y_high))
    driver = KMeansAnimation(fig, axes, X, m=m, k=k)
    movie = animation.FuncAnimation(
        fig,
        driver.update,
        frames=frames,
        interval=700,
        blit=True,
    )
    # Close the figure first; otherwise the notebook would show the final
    # static figure in addition to the animation.
    plt.close(fig)
    video_markup = movie.to_html5_video()
    return HTML(video_markup)
In [18]:
from sklearn.datasets import make_blobs
# Seed the generator so every run draws the same point cloud.
# NOTE(review): the animation cell further down starts from hand-picked
# centroids; confirm they still sit inside this seeded data set.
X, _ = make_blobs(n_samples=2000, centers=6, cluster_std=2.2, random_state=0)
# Plot the raw, unclustered points.
plt.plot(X[:,0], X[:,1],
         linestyle='', marker='.',
         color=cmap(0.2), markeredgecolor=cmap(0.25))
plt.show()
In [19]:
#%matplotlib inline
# Hand-picked starting centroids, one row per cluster; passing them skips the
# random draw that KMeansAnimation would otherwise perform.
m = np.array([[0, 2.5], [3, 11], [4, 10], [5.5, 2]])
# The returned HTML object is the last expression of the cell, so the
# notebook renders it as the cell output.
make_animation(X, m=m, k=4, frames=20)
Out[19]:
In [ ]: