In [1]:
%matplotlib inline

In [2]:
from sklearn.datasets import make_blobs

In [3]:
blobs, classes = make_blobs(500, centers=3)

In [6]:
import matplotlib.pyplot as plt
import numpy as np

In [7]:
f, ax = plt.subplots(figsize=(7.5, 7.5))
rgb = np.array(['r', 'g', 'b'])
ax.scatter(blobs[:, 0], blobs[:, 1], color=rgb[classes])
ax.set_title('Blobs')


Out[7]:
<matplotlib.text.Text at 0x1152fdbd0>

In [8]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(blobs)


Out[8]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [9]:
kmeans.cluster_centers_


Out[9]:
array([[-3.12979654,  3.70842794],
       [-8.6620017 , -5.81939495],
       [-6.62181699,  0.97328685]])

In [11]:
f, ax = plt.subplots(figsize=(7.5, 7.5))
ax.scatter(blobs[:, 0], blobs[:, 1], color=rgb[classes])
ax.scatter(kmeans.cluster_centers_[:, 0],
          kmeans.cluster_centers_[:, 1], marker='*', s=250,
          color='black', label='Center')
ax.set_title('Blobs')
ax.legend(loc='best')


Out[11]:
<matplotlib.legend.Legend at 0x116010510>

In [13]:
kmeans.labels_[:5] # labels_ returns the expected results


Out[13]:
array([2, 0, 1, 0, 2], dtype=int32)

In [15]:
# these classes should be the same as the labels_ except with
# the values potentially swapped (in this case 2 and 0)
# the value doesn't matter but that all the assigned values
# are in the same class.
classes[:5]


Out[15]:
array([0, 2, 1, 2, 0])

In [18]:
# transform method shows the distance between each point 
# and each centroid
kmeans.transform(blobs)[:5]


Out[18]:
array([[  6.05360088,   5.48742522,   1.68793427],
       [  1.15669074,  11.18727508,   5.07090102],
       [ 11.02944851,   1.15315452,   7.39995026],
       [  0.55439017,  11.12022548,   4.74550109],
       [  2.68709988,   9.21078836,   2.13878687]])

In [19]:
# kmeans works by minimizing the within-cluster sum of
# square distance from the mean. It does this by setting
# a pre-specified number of clusters, K, and then alternating
# between assigning each observation to the nearest cluster
# then updating each centroid by calculating the mean of
# each observation assigned to this cluster.

In [ ]: