In [1]:
# Demonstrate the K-Means algorithm

In [1]:
import matplotlib.pyplot as plt
import numpy as np  # used throughout (np.asarray, np.random, np.vstack) but was never imported
# sklearn.datasets.samples_generator was deprecated in 0.22 and removed in 0.24;
# make_blobs is importable directly from sklearn.datasets.
from sklearn.datasets import make_blobs

In [3]:
# Build a two-component Gaussian mixture: 300 points near the origin and
# 500 points near (12, 5), stacked into a single (800, 2) data set.
# NOTE(review): no random seed is set here, so each run produces different data.
n1, n2 = 300, 500

mean1 = np.asarray([0, 0])
# NOTE(review): det([[8, 10], [10, 6]]) = -52 < 0, so this matrix is not a valid
# (positive semi-definite) covariance; numpy will warn about it.
cov1 = np.array([[8, 10], [10, 6]])
X1 = np.random.multivariate_normal(mean1, cov1, n1)

mean2 = np.asarray([12, 5])
cov2 = np.array([[5., -5], [-5, 10]])
X2 = np.random.multivariate_normal(mean2, cov2, n2)

X = np.vstack((X1, X2))

# Label the first component +1 and the second -1.
y = np.ones(n1 + n2)
y[n1:] = -1

n_centers = 2

In [43]:
n_centers = 3

# Replace the hand-built mixture above with sklearn's blob generator.
# random_state is fixed, so this cell is reproducible across runs.
X, y = make_blobs(
    n_samples=1000,
    centers=n_centers,
    n_features=2,
    cluster_std=0.7,
    random_state=0,
)

In [4]:
# reload() stopped being a builtin in Python 3; it lives in importlib now.
from importlib import reload

import kmeans
reload(kmeans)  # pick up edits to kmeans.py without restarting the kernel
y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
print(centers)


+++ Iter 6 change less than 0.0
[[ -0.23126861  -0.35876341]
 [ 11.80751504   4.89665035]]

In [5]:
# Plot how the K-means objective decreases over the iterations.
fig, ax = plt.subplots()
fig.suptitle("Change in K-means objective value")
ax.plot(obj_val_seq, 'b-', marker='*')
ax.set_xlabel("Iteration")
ax.set_ylabel("Objective value")


Out[5]:
<matplotlib.text.Text at 0x2e55a50>

In [7]:
from itertools import cycle
colors = cycle('bgrcmyk')  # cycle() already repeats; no need to tile the string

# Twice as wide, to accommodate the two side-by-side panels.
fig = plt.figure(figsize=plt.figaspect(0.5))

# Left panel: data colored by true label, plus initial and final centers.
# NOTE(review): assumes y holds labels 0..n_centers-1 (the make_blobs cell);
# the manual-mixture cell labels points +1/-1 and would not plot correctly here.
ax = fig.add_subplot(121)
ax.set_title("Data with true labels and final centers")
for k, color in zip(range(n_centers), colors):
    ax.plot(X[y == k, 0], X[y == k, 1], color + '.')

# Valid only because init_centers always uses the same random seed.
initial_centers = kmeans.init_centers(X, n_centers, 2)

# Initial centers (magenta) ...
for x in initial_centers:
    ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)

# ... and final centers (cyan).
for x in centers:
    ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)

# Right panel: the same data colored by the K-means assignment.
ax = fig.add_subplot(122)
ax.set_title("Data with final assignments")
for k, color in zip(range(n_centers), colors):
    ax.plot(X[y_pred == k, 0], X[y_pred == k, 1], color + '.')

fig.tight_layout()
# fig.show() only works with an attached GUI event loop; plt.show() is reliable
# in scripts and no-ops harmlessly inline in notebooks.
plt.show()