In [1]:
import numpy as np

def random_sample(X, k):
    """Return k distinct rows of X, drawn uniformly at random.

    X is indexed as a 2-D array (rows are the items being sampled). The
    draw uses NumPy's global random state, so seed it with np.random.seed
    for repeatable results. Sampling is without replacement, hence k must
    not exceed the number of rows.
    """
    n_rows = X.shape[0]
    picked_rows = np.random.choice(n_rows, k, replace=False)
    return X[picked_rows, :]

def pairwise_distances_argmin(X, y):
    """For every row of X, return the index of the closest row of y.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Points to assign.
    y : ndarray of shape (k, d)
        Candidate points (e.g. centroids).

    Returns
    -------
    ndarray of shape (n,), dtype np.intp
        indices[i] is the row of y with the smallest Euclidean distance to
        X[i]. Ties go to the lowest index, as with ndarray.argmin.

    Notes
    -----
    The distances are computed in one broadcast operation instead of a
    Python loop over the rows of X. This builds a temporary array of shape
    (n, k, d), which is the price paid for avoiding the interpreter loop.
    """
    # (n, 1, d) - (k, d) broadcasts to (n, k, d): one difference vector per
    # (point, candidate) pair.
    deltas = X[:, np.newaxis, :] - y
    distances = np.linalg.norm(deltas, axis=2)
    return distances.argmin(axis=1).astype(np.intp, copy=False)

# Quick sanity check: each row of X should be paired with the index of the
# nearest row of y. The call is left as the last expression so the notebook
# displays its result.
X = np.array([[0, 0],
              [5, 5]])
y = np.array([[1, 1],
              [6, 6]])
pairwise_distances_argmin(X, y)


Out[1]:
array([0, 1])

In [2]:
def kmeans_iteration(X, m):
    """Run one k-means step: assign points to centroids, then recompute them.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Data points.
    m : ndarray of shape (k, d)
        Current centroids.

    Returns
    -------
    centroids : float ndarray with the shape of m
        Mean of the points assigned to each centroid.
    clusters : ndarray of shape (n,)
        Index of the centroid each point was assigned to.

    A centroid that attracts no points keeps its previous position. Taking
    the mean of an empty selection would yield NaN, and a NaN centroid is
    then picked by argmin for every point, which wrecks all later
    iterations.
    """
    clusters = pairwise_distances_argmin(X, m)
    # Start from a float copy of the old centroids so that empty clusters
    # simply stay where they were.
    centroids = np.array(m, dtype=float)
    for i in range(len(m)):
        members = X[clusters == i]
        if len(members) > 0:
            centroids[i] = np.mean(members, axis=0)
    return centroids, clusters

def kmeans(X, k):
    """Cluster the rows of X into k groups with k-means.

    The initial centroids are k distinct rows of X chosen by random_sample.
    kmeans_iteration is then repeated until no centroid coordinate changes
    (element-wise np.isclose with its default tolerances).

    Returns
    -------
    (centroids, clusters) as produced by the final kmeans_iteration call.

    Raises
    ------
    ValueError
        If a centroid becomes NaN (for instance because X contains NaN or
        a cluster lost all of its points). np.isclose(nan, nan) is False,
        so without this check the loop below could never terminate.
    """
    m = random_sample(X, k)
    while True:
        new_m, clusters = kmeans_iteration(X, m)
        if np.isnan(new_m).any():
            raise ValueError(
                'k-means produced a NaN centroid (empty cluster or NaN in '
                'the data); cannot converge'
            )
        if np.isclose(m, new_m).all():
            break
        m = new_m
    return new_m, clusters

In [10]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
import seaborn as sns

# Global plot styling shared by every figure in this notebook.
plt.style.use('ggplot')
sns.set_style('whitegrid')
rc('figure', figsize=(6, 4))
# Colormap used to give each cluster its own color. pyplot.get_cmap is used
# because matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and
# removed in 3.9.
cmap = plt.get_cmap('rainbow')

def plot_clusters(X, m, clusters):
    """Scatter-plot clustered 2-D points together with their centroids.

    X holds the points (only columns 0 and 1 are drawn), m the centroids
    and clusters the centroid index of every point. Each cluster gets its
    own color from the module-level cmap; points are drawn as dots and the
    centroid as a star in the same color. The figure is shown immediately.
    """
    k = len(m)
    for label, centroid in enumerate(m):
        shade = cmap(label / k)
        members = X[clusters == label]
        plt.scatter(members[:, 0], members[:, 1], marker='.', color=shade)
        plt.scatter(centroid[0], centroid[1], marker='*', color=shade)
    plt.show()

In [11]:
from sklearn.datasets import make_blobs

# Pin both sources of randomness so that "Restart & Run All" reproduces the
# same figure: random_state fixes the generated blobs, and np.random.seed
# fixes the initial centroids that kmeans draws through random_sample.
X, _ = make_blobs(n_samples=2000, centers=4, cluster_std=2, random_state=0)
np.random.seed(0)
m, clusters = kmeans(X, 4)
plot_clusters(X, m, clusters)



In [16]:
import matplotlib.animation as animation

class KMeansAnimation:
    """State holder for an animation that performs one k-means step per frame.

    Parameters
    ----------
    fig : matplotlib figure
        Figure whose suptitle is updated on every frame.
    ax : matplotlib axes
        Axes the cluster points and centroids are drawn on.
    X : ndarray of shape (n, 2)
        Data points.
    m : ndarray of shape (k, 2), optional
        Initial centroids. When omitted, k rows of X are drawn with
        random_sample.
    k : int
        Number of clusters; only used when m is not given. When m is
        given, the number of clusters is len(m).
    """

    def __init__(self, fig, ax, X, m=None, k=2):
        self.X = X
        self.fig = fig
        self.m = m if m is not None else random_sample(X, k)
        # Size the artists from the centroids actually in use: with an
        # explicit m whose length differs from k, update() would otherwise
        # index past the end of these lists.
        n_clusters = len(self.m)
        # We have to call plot for each cluster and its centroid
        # because we want to distinguish the clusters by color
        # and draw the centroid with a different marker
        self.clusters, self.centroids = [], []
        for i in range(n_clusters):
            color = cmap(i / n_clusters)
            self.clusters.append(
                ax.plot([], [],
                        linestyle='', marker='.',
                        markeredgecolor=color, color=color)[0]
            )
            self.centroids.append(
                ax.plot([], [],
                        linestyle='', marker='o',
                        markeredgewidth=0.5,
                        markersize=10, color=color)[0]
            )

    def update(self, t):
        """Advance k-means by one iteration and redraw frame t.

        Returns the list of all line artists, which FuncAnimation needs
        when blitting is enabled.
        """
        self.m, clusters = kmeans_iteration(self.X, self.m)
        self.fig.suptitle(u'n = {}, k = {} – Iteration {}'.format(
                len(self.X), len(self.m), t + 1)
        )
        # To update the plot, we simply call set_data on the saved line
        # artists
        for i in range(len(self.m)):
            group = self.X[clusters == i]
            self.clusters[i].set_data(group[:, 0], group[:, 1])
            # A centroid is a single point; wrap its coordinates in lists
            # because recent Matplotlib versions reject scalar x/y data.
            cx, cy = self.m[i]
            self.centroids[i].set_data([cx], [cy])
        return self.clusters + self.centroids

In [17]:
from IPython.display import HTML

def make_animation(X, k, m=None, frames=20):
    """Build an HTML5 video showing `frames` k-means iterations on X.

    X is treated as 2-D data: its per-column minima and maxima become the
    axis limits. m optionally supplies the starting centroids; otherwise
    KMeansAnimation chooses k of them. Returns an IPython HTML object that
    embeds the rendered video.
    """
    fig = plt.figure(figsize=(6, 4))

    lower = np.min(X, axis=0)
    upper = np.max(X, axis=0)
    xmin, ymin = lower
    xmax, ymax = upper
    ax = plt.axes(xlim=(xmin, xmax), ylim=(ymin, ymax))

    control = KMeansAnimation(fig, ax, X, m=m, k=k)
    anim = animation.FuncAnimation(
        fig,
        control.update,
        frames=frames,
        interval=700,
        blit=True,
    )

    # Close the figure first; otherwise the notebook would render the
    # final static figure in addition to the animation.
    plt.close(fig)
    video_markup = anim.to_html5_video()
    return HTML(video_markup)

In [18]:
from sklearn.datasets import make_blobs

# random_state pins the generated blobs so that the hand-picked starting
# centroids used for the animation always meet the same data on a fresh
# "Restart & Run All".
X, _ = make_blobs(n_samples=2000, centers=6, cluster_std=2.2, random_state=0)

# Show the raw, unclustered points.
plt.plot(X[:,0], X[:,1],
         linestyle='', marker='.',
         color=cmap(0.2), markeredgecolor=cmap(0.25))
plt.show()



In [19]:
# Hand-picked starting centroids; k is taken from their count so the two
# arguments always agree.
m = np.array([
    [0, 2.5],
    [3, 11],
    [4, 10],
    [5.5, 2],
])
make_animation(X, k=len(m), m=m, frames=20)


Out[19]:

In [ ]: