In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
from sklearn.datasets import make_blobs
In [3]:
# Generate 500 synthetic points around 3 Gaussian centers, keeping the
# true class label of each point. Fix random_state so the data (and every
# figure/result below) is reproducible under Restart Kernel -> Run All.
blobs, classes = make_blobs(500, centers=3, random_state=0)
In [6]:
import matplotlib.pyplot as plt
import numpy as np
In [7]:
# Scatter the raw blobs, coloring each point by its true class label
# (fancy-indexing the color array with the integer labels).
f, ax = plt.subplots(figsize=(7.5, 7.5))
rgb = np.array(['r', 'g', 'b'])
x, y = blobs[:, 0], blobs[:, 1]
ax.scatter(x, y, color=rgb[classes])
ax.set_title('Blobs')
Out[7]:
In [8]:
from sklearn.cluster import KMeans

# Fit k-means with the known number of clusters. random_state pins the
# (otherwise stochastic) centroid initialization so labels_ and
# cluster_centers_ are reproducible across runs; n_init=10 makes the
# number of restarts explicit (sklearn's default changed to 'auto' and
# warns when left unset).
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
kmeans.fit(blobs)
Out[8]:
In [9]:
# The fitted centroid coordinates — one row per cluster.
kmeans.cluster_centers_
Out[9]:
In [11]:
# Overlay the learned centroids (black stars) on the true-class scatter.
f, ax = plt.subplots(figsize=(7.5, 7.5))
x, y = blobs[:, 0], blobs[:, 1]
ax.scatter(x, y, color=rgb[classes])
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], marker='*', s=250,
           color='black', label='Center')
ax.set_title('Blobs')
ax.legend(loc='best')
Out[11]:
In [13]:
# labels_ holds the cluster index k-means assigned to each sample;
# peek at the first five.
kmeans.labels_[:5]
Out[13]:
In [15]:
# The true classes should agree with labels_ up to a permutation of
# the label values (here 2 and 0 may be swapped). The numeric value
# of a label is arbitrary — what matters is that the same points end
# up grouped together.
classes[:5]
Out[15]:
In [18]:
# transform() returns, for every sample, its distance to each fitted
# centroid (one column per cluster); show the first five rows.
distances = kmeans.transform(blobs)
distances[:5]
Out[18]:
In [19]:
# kmeans works by minimizing the within-cluster sum of
# squared distances from each cluster's mean. Given a
# pre-specified number of clusters, K, it alternates between
# assigning each observation to its nearest centroid and
# updating each centroid to the mean of the observations
# currently assigned to it.
In [ ]: