In [1]:
%matplotlib inline

In [3]:
# KMeans is not ideal for a lot of data due to complexity.
# MiniBatch KMeans is a faster implementation of KMeans
# KMeans is computationally expensive. (Problem is NP-hard)
# MiniBatch takes many subsamples and gives close approximations

In [4]:
from sklearn.datasets import make_blobs
blobs, labels = make_blobs(int(1e6), 3)

In [5]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [6]:
kmeans = KMeans(n_clusters=3)
minibatch = MiniBatchKMeans(n_clusters=3)

In [7]:
%time kmeans.fit(blobs)


CPU times: user 2.43 s, sys: 404 ms, total: 2.83 s
Wall time: 2.47 s
Out[7]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [8]:
%time minibatch.fit(blobs)


CPU times: user 1.11 s, sys: 22.2 ms, total: 1.14 s
Wall time: 1.14 s
Out[8]:
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=3,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [9]:
kmeans.cluster_centers_[0]


Out[9]:
array([-2.27167309,  8.84453084, -4.16034777])

In [10]:
minibatch.cluster_centers_[0]


Out[10]:
array([-5.53530281, -6.88961024, -2.40821803])

In [11]:
# determine how far apart the centers are

In [12]:
from sklearn.metrics import pairwise
pairwise.pairwise_distances(kmeans.cluster_centers_[0],
                           minibatch.cluster_centers_[0])


Out[12]:
array([[ 16.16429502]])

In [13]:
import numpy as np

In [14]:
np.diag(pairwise.pairwise_distances(kmeans.cluster_centers_,
                                   minibatch.cluster_centers_))


Out[14]:
array([ 16.16429502,  16.17278273,   0.07439217])

In [15]:
# batch_size parameter determines how large the batches should be.
minibatch = MiniBatchKMeans(batch_size=len(blobs))

In [16]:
%time minibatch.fit(blobs)


CPU times: user 14 s, sys: 1.44 s, total: 15.5 s
Wall time: 14.5 s
Out[16]:
MiniBatchKMeans(batch_size=1000000, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [ ]: