In [ ]:
import time
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from msmbuilder import cluster
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# Seed NumPy's global random number generator so repeated runs draw the
# same random numbers.
np.random.seed(0)
In [ ]:
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times.
# Each dataset is stored as a 2-tuple that is later unpacked as ``X, y``.
n_samples = 2500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)

# Three Gaussian blobs with different centers and spreads, built by hand
# instead of:
# blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
# randn() needs integer dimensions, so use floor division: plain ``/`` is
# true division on Python 3 and would pass a float. As a consequence the
# blobs dataset holds 3 * (n_samples // 3) points.
n_per_blob = n_samples // 3
blobs = np.concatenate([
    np.array([[0, -1]]) + 0.5 * np.random.randn(n_per_blob, 2),
    np.array([[5, 0]]) + 0.1 * np.random.randn(n_per_blob, 2),
    np.array([[0, 5]]) + 2.0 * np.random.randn(n_per_blob, 2),
]), None

# Uniform random points with no cluster structure; None fills the label
# slot so the tuple unpacks like the other datasets.
no_structure = np.random.rand(n_samples, 2), None

# One single-letter matplotlib color code per cluster index; the palette is
# tiled so that indexing by cluster label works for many clusters.
colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
colors = np.hstack([colors] * 20)
In [ ]:
# Draw a grid of scatter plots: one row per dataset, one column per
# clustering algorithm, each panel annotated with the fit time.
plt.figure(figsize=(14.5, 9.5))
plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

plot_num = 1  # 1-based index of the next subplot in the 4 x 6 grid
for i_dataset, dataset in enumerate([noisy_circles, noisy_moons, blobs,
                                     no_structure]):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # create clustering estimators (fresh instances for every dataset)
    kcenters = cluster.KCenters(n_clusters=3)
    rs = cluster.RegularSpatial(d_min=2)
    kmeans = cluster.KMeans(n_clusters=3)
    kmeans2 = cluster.MiniBatchKMeans(n_clusters=3)
    kmedoids = cluster.KMedoids(n_clusters=3)
    kmedoids2 = cluster.MiniBatchKMedoids(n_clusters=3)

    for name, algorithm in [
            ('KCenters', kcenters),
            ('RegularSpatial', rs),
            ('KMeans', kmeans),
            ('MiniBatchKMeans', kmeans2),
            ('KMedoids', kmedoids),
            ('MiniBatchKMedoids', kmedoids2)
    ]:
        # predict cluster memberships; only the fit() call is timed
        t0 = time.time()
        # The data is passed as a one-element list, so per-sequence results
        # are read back with index [0] below.
        algorithm.fit([X])
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            # Builtin int instead of the np.int alias, which was removed
            # from NumPy in 1.24.
            y_pred = algorithm.labels_[0].astype(int)
        else:
            y_pred = algorithm.predict([X])[0]

        # plot
        plt.subplot(4, 6, plot_num)
        if i_dataset == 0:
            # Algorithm names are shown as column titles on the first row
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=6)

        if hasattr(algorithm, 'cluster_centers_'):
            # Overlay the cluster centers as larger markers
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)].tolist()
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        # Fit time in the lower-right corner; lstrip('0') drops the leading
        # zero so e.g. "0.12s" is rendered as ".12s".
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=20,
                 horizontalalignment='right')
        plot_num += 1
In [ ]: