In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn.cluster import KMeans
In [2]:
# Six sample points written as (x, y) pairs, then split into per-axis lists.
points = [(1, 2), (5, 8), (1.5, 1.8), (8, 8), (1, 0.6), (9, 11)]
x, y = (list(axis) for axis in zip(*points))
# Quick look at the raw, unclustered data.
plt.scatter(x, y)
Out[2]:
In [20]:
# Stack the two coordinate lists side by side: one row per sample, columns are (x, y).
# Equivalent to: X = np.array([[1,2], [5,8], [1.5,1.8], [8,8], [1,0.6], [9,11]])
X = np.column_stack((x, y))
In [27]:
# Flat clustering into 2 clusters.
# random_state pins the centroid initialisation so that the fitted centroids and,
# in particular, which cluster is labelled 0 vs. 1 are the same on every run.
# n_init is set explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
# Fit the estimator to the sample matrix X (the trailing ';' hides the estimator repr).
kmeans.fit(X);
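K-means groups the points so that the within-cluster variance (the sum of squared distances from each point to its cluster's centroid) is as small as possible; it implicitly assumes clusters of roughly equal variance. A centroid is the center of a cluster, i.e. the mean of the points assigned to it.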
In [28]:
# Pull the results off the fitted estimator in one step:
#   centroids - the cluster centres (the μ's)
#   labels    - the cluster index the KMeans algorithm assigned to each sample
centroids, labels = kmeans.cluster_centers_, kmeans.labels_
In [29]:
# Show the centroid coordinates, then the per-sample labels, one per line.
print(centroids, labels, sep="\n")
In [32]:
# matplotlib format strings, indexed by cluster label: green dots / red dots
colors = ["g.", "r."]
# visualize data points according to cluster
for point, label in zip(X, labels):
    print("coordinate:", point, "label:", label)
    plt.plot(point[0], point[1], colors[label], markersize=10)
# scatter plot the centroids; the high zorder keeps the markers on top of the data points
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=150, linewidths=5, zorder=10)
plt.show()
Special syntax for indexing NumPy arrays: centroids[:, n] selects the nth column across all (:) rows of centroids. s is the marker size. zorder is the background/foreground drawing order: the higher the value, the closer to the foreground the artist is drawn.
Now, what if we force the algorithm to have 3 clusters?
In [34]:
# Repeat the experiment, forcing the algorithm to find 3 clusters.
n_clusters = 3
# random_state makes the centroids and the label numbering (hence the colours)
# reproducible; n_init is explicit because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
# One matplotlib format string per cluster label: green / red / cyan dots.
colors = ["g.", "r.", "c."]
# Fail early with a clear message instead of an IndexError in the middle of plotting.
assert len(colors) >= n_clusters, "add a colour for every cluster"
for point, label in zip(X, labels):
    print("coordinate:", point, "label:", label)
    plt.plot(point[0], point[1], colors[label], markersize=10)
# zorder=1 here is lower than the 10 used for the 2-cluster plot, so the centroid
# markers are no longer forced on top of the data points.
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=150, linewidths=5, zorder=1)
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: