In [1]:
%matplotlib inline
In [4]:
from sklearn import datasets
from sklearn import cluster
import matplotlib.pyplot as plt
In [5]:
# Generate the synthetic dataset: 1000 samples scattered around 3 centers,
# along with the index of the center each sample was drawn from.
# random_state pins the draw so a fresh Restart & Run All yields the same data.
blobs, ground_truth = datasets.make_blobs(1000, centers=3,
                                          cluster_std=1.75,
                                          random_state=0)
In [8]:
# Scatter the generated points, colored by the cluster they were drawn from.
f, ax = plt.subplots(figsize=(10, 7.5))
colors = ['r', 'g', 'b']  # one color per ground-truth cluster id
for i, color in enumerate(colors):
    # Boolean mask selects the rows belonging to ground-truth cluster i.
    p = blobs[ground_truth == i]
    ax.scatter(p[:, 0], p[:, 1], c=color,
               label='Cluster {}'.format(i))
ax.set_title('Cluster with Ground Truth')
# Label the axes so the figure stands on its own when skimmed.
ax.set_xlabel('Feature 0')
ax.set_ylabel('Feature 1')
ax.legend(loc='best')
Out[8]:
In [9]:
# Fit k-means with k=3, matching the number of centers used to generate blobs.
# n_init is given explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default, and random_state makes the fit
# (and therefore the arbitrary cluster numbering) repeatable across reruns.
kmeans = cluster.KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(blobs)
Out[9]:
In [10]:
# Display the centroid coordinates learned by the fit (one row per cluster).
kmeans.cluster_centers_
Out[10]:
In [11]:
# Redraw the ground-truth scatter and overlay the centroids found by k-means.
f, ax = plt.subplots(figsize=(10, 7))
for i, color in enumerate(colors):
    # Boolean mask selects the rows belonging to ground-truth cluster i.
    p = blobs[ground_truth == i]
    ax.scatter(p[:, 0], p[:, 1], c=color,
               label='Cluster {}'.format(i))
# The stars are the fitted k-means centroids, not the true generating centers.
ax.scatter(kmeans.cluster_centers_[:, 0],
           kmeans.cluster_centers_[:, 1], s=100, color='black',
           label='Centers', marker='*')
# Title names the centers for what they are: the k-means estimates.
ax.set_title('Ground Truth Clusters with KMeans Centers')
ax.set_xlabel('Feature 0')
ax.set_ylabel('Feature 1')
ax.legend(loc='best')
Out[11]:
In [12]:
# For each ground-truth cluster, print the fraction of its points whose
# k-means label equals the ground-truth id. k-means numbers its clusters
# arbitrarily, so low values here can simply mean the ids are permuted.
for i in range(3):
    # The whole expression must sit inside print(...): print is a function in
    # Python 3, so print(a)[b] would subscript its return value (None).
    print((kmeans.labels_ == ground_truth)[ground_truth == i].astype(int).mean())
In [13]:
# Re-express the ground truth in the k-means label space so the two labelings
# can be compared directly. k-means assigns cluster ids arbitrarily, so instead
# of hard-coding one particular permutation, each true cluster is mapped to the
# k-means label that the majority of its points received.
new_ground_truth = ground_truth.copy()
for i in range(3):
    in_cluster = ground_truth == i          # points drawn from true cluster i
    assigned = kmeans.labels_[in_cluster]   # k-means labels those points got
    # Pick the k-means id with the most votes; ties resolve to the lowest id.
    new_ground_truth[in_cluster] = max(range(3),
                                       key=lambda k: (assigned == k).sum())
In [15]:
# For each ground-truth cluster, print the fraction of its points whose
# k-means label agrees with the relabeled ground truth.
for i in range(3):
    # The whole expression must sit inside print(...): print is a function in
    # Python 3, so print(a)[b] would subscript its return value (None).
    print((kmeans.labels_ == new_ground_truth)[ground_truth == i].astype(int).mean())
In [ ]: