In [1]:
# Ask IPython to render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this section — confirm this is still needed.
%matplotlib inline
In [3]:
# Standard KMeans scales poorly to large datasets: every iteration touches
# every sample, and finding the optimal clustering is NP-hard in general.
# MiniBatchKMeans is a faster variant of KMeans: it updates the centers from
# many small random subsamples (mini-batches) instead of the full dataset,
# and yields a close approximation of the KMeans result.
In [4]:
from sklearn.datasets import make_blobs

# Generate one million 3-dimensional points around three centers.
# Keyword arguments make it explicit that the original positional `3` was
# n_features (not the number of clusters); centers=3 spells out the default.
# random_state pins the data so every Restart & Run All sees the same blobs.
blobs, labels = make_blobs(n_samples=1_000_000, n_features=3, centers=3,
                           random_state=0)
In [5]:
from sklearn.cluster import KMeans, MiniBatchKMeans
In [6]:
# Both estimators use randomized initialisation (and MiniBatchKMeans also
# samples its batches randomly); seed them so the fitted centers and the
# distance comparisons further down are reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=0)
minibatch = MiniBatchKMeans(n_clusters=3, random_state=0)
In [7]:
# Time a single full KMeans fit on all of `blobs`; %time reports CPU and wall time.
%time kmeans.fit(blobs)
Out[7]:
In [8]:
# Time the MiniBatchKMeans fit on the same data for comparison with the KMeans timing.
%time minibatch.fit(blobs)
Out[8]:
In [9]:
# Show the first center found by KMeans (row 0 of cluster_centers_).
# Cluster indices are arbitrary, so row 0 here need not be the same blob
# as row 0 of another fitted model.
kmeans.cluster_centers_[0]
Out[9]:
In [10]:
# Show the first center found by MiniBatchKMeans (row 0 of cluster_centers_).
# Cluster indices are arbitrary, so this row need not correspond to
# row 0 of the KMeans centers.
minibatch.cluster_centers_[0]
Out[10]:
In [11]:
# determine how far apart the centers are
In [12]:
from sklearn.metrics import pairwise

# pairwise_distances requires 2-D inputs of shape (n_samples, n_features);
# indexing with [0] yields a 1-D vector and raises "Expected 2D array" on
# current scikit-learn, so slice with [:1] to keep a (1, n_features) row.
# Cluster indices are arbitrary between independent fits, so compare the
# first KMeans center against *all* MiniBatch centers and keep the nearest
# one instead of assuming index 0 matches index 0. keepdims preserves the
# (1, 1) result shape of the original call.
pairwise.pairwise_distances(kmeans.cluster_centers_[:1],
                            minibatch.cluster_centers_).min(axis=1, keepdims=True)
Out[12]:
In [13]:
import numpy as np
In [14]:
# Entry (i, j) of the distance matrix is the distance between KMeans center i
# and MiniBatch center j. The two fits number their clusters independently,
# so the diagonal (i == j) can pair unrelated centers. Taking the row-wise
# minimum reports, for each KMeans center, the distance to its nearest
# MiniBatch center; the result is still one value per KMeans center.
pairwise.pairwise_distances(kmeans.cluster_centers_,
                            minibatch.cluster_centers_).min(axis=1)
Out[14]:
In [15]:
# batch_size sets how many samples are drawn for each mini-batch update;
# len(blobs) makes every batch as large as the whole dataset.
# n_clusters=3 matches the models fitted earlier (omitting it falls back to
# the estimator default of 8 clusters, which makes the timing incomparable),
# and random_state keeps the run reproducible.
minibatch = MiniBatchKMeans(n_clusters=3, batch_size=len(blobs), random_state=0)
In [16]:
# Time the fit of the MiniBatchKMeans configured with batch_size=len(blobs),
# to compare against the earlier timings.
%time minibatch.fit(blobs)
Out[16]:
In [ ]: