In [1]:
%matplotlib inline
In [2]:
import numpy as np
from sklearn import datasets
# Fix NumPy's global seed so repeated top-to-bottom runs start from the same state.
np.random.seed(0)

# ------------------------------------------------------------------
# Toy datasets. The sample count is large enough to show how the
# algorithms scale, yet small enough to keep running times short.
# ------------------------------------------------------------------
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
# Uniform random points with no labels (the second tuple element is None).
no_structure = (np.random.rand(n_samples, 2), None)

# Anisotropically distributed data: blobs pushed through a linear map.
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = X.dot(transformation)
aniso = (X_aniso, y)

# Blobs whose clusters have different standard deviations.
varied = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state,
)
In [3]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
def plot_dataset(X, y_pred=None, fname=None):
    """Scatter-plot a 2-D dataset, colouring each point by its cluster label.

    Parameters
    ----------
    X : array-like
        Data to plot. It is standardized with ``StandardScaler`` and its
        first two columns are used as the x and y coordinates.
    y_pred : sequence of int, optional
        Cluster label per sample. Non-negative labels index a repeating
        nine-colour palette; ``-1`` selects the trailing black entry and is
        therefore drawn as noise. When omitted, a single label ``0`` is used
        so every point gets the first palette colour.
    fname : str, optional
        If truthy, the figure is also written to this path via
        ``plt.savefig``.

    Raises
    ------
    ValueError
        If ``y_pred`` is an empty sequence (no maximum label exists).
    """
    if y_pred is None:
        # Built per call instead of using a shared mutable ``[0]`` default.
        labels = np.zeros(1, dtype=int)
    else:
        # The palette is indexed with the labels, which requires integers;
        # casting here lets callers pass label arrays of any numeric dtype.
        labels = np.asarray(y_pred).astype(int)

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    palette = ['#377eb8', '#ff7f00', '#4daf4a',
               '#f781bf', '#a65628', '#984ea3',
               '#999999', '#e41a1c', '#dede00']
    # One colour per label 0..max, cycling through the palette when there
    # are more labels than colours.
    n_colors = int(labels.max()) + 1
    # last color is black to properly display label -1 as noise (black)
    colors = np.array(list(islice(cycle(palette), n_colors)) + ['#000000'])

    plt.figure(figsize=(10, 10))
    plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[labels])
    # Fixed limits and hidden ticks keep plots of different datasets comparable.
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())
    if fname:
        plt.savefig(fname)
In [4]:
# plt.scatter?
In [5]:
# Make the blobs dataset the current (X, y) pair and plot the raw points
# (no labels passed, so plot_dataset falls back to its single default colour).
X, y = blobs
# plot_dataset(X, fname='blobs.png')
plot_dataset(X)
In [6]:
from sklearn.cluster import KMeans
# Fit k-means with three clusters on the current X. The bare `clf.fit(X)`
# is the last expression, so its return value is what the cell displays.
clf = KMeans(n_clusters=3)
clf.fit(X)
Out[6]:
In [7]:
# Ask the fitted estimator for a cluster label per sample in X and display the result.
y_pred = clf.predict(X)
y_pred
Out[7]:
In [8]:
# Re-plot the current X, now coloured by the predicted cluster labels.
# plot_dataset(X, y_pred, fname='blobs_kmeans_3')
plot_dataset(X, y_pred)
In [9]:
# Repeat the clustering of the current X, this time asking for ten clusters.
# Alternative estimator kept for reference:
# from sklearn.cluster import MiniBatchKMeans
# clf = MiniBatchKMeans(n_clusters=10)
clf = KMeans(n_clusters=10)
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred = clf.fit(X).predict(X)
# plot_dataset(X, y_pred, fname='blobs_kmeans_10')
plot_dataset(X, y_pred)
In [10]:
# Switch the current (X, y) pair to the noisy-circles dataset and plot the raw points.
X, y = noisy_circles
# plot_dataset(X, fname='noisy_circles.png')
plot_dataset(X)
In [11]:
# Cluster the current X into two groups with k-means and plot the assignment.
clf = KMeans(n_clusters=2)
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred = clf.fit(X).predict(X)
# plot_dataset(X, y_pred, fname='noisy_circles_kmeans.png')
plot_dataset(X, y_pred)
In [12]:
# Switch the current (X, y) pair to the uniform random points (y is None here)
# and plot them.
X, y = no_structure
# plot_dataset(X, fname='no_structure.png')
plot_dataset(X)
In [13]:
# Cluster the current X into three groups with k-means and plot the assignment.
clf = KMeans(n_clusters=3)
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred = clf.fit(X).predict(X)
# plot_dataset(X, y_pred, fname='no_structure_kmeans.png')
plot_dataset(X, y_pred)
In [14]:
# DBSCAN?
In [15]:
# Back to the blobs dataset as the current (X, y) pair; plot the raw points again.
X, y = blobs
plot_dataset(X)
In [16]:
from sklearn.cluster import DBSCAN
# Fit DBSCAN with its default parameters on the current X. The bare
# `clf.fit(X)` is the last expression, so its return value is the cell output.
clf = DBSCAN()
clf.fit(X)
Out[16]:
In [17]:
# Read the per-sample labels from the fitted estimator and cast them with the
# builtin `int`. `np.int` was only an alias for it and has been removed from
# NumPy, so the old spelling raises AttributeError on current releases.
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='blobs_dbscan')
plot_dataset(X, y_pred)
In [18]:
# Switch the current (X, y) pair to the uniform random points and plot them.
X, y = no_structure
plot_dataset(X)
In [19]:
# Refit whatever estimator `clf` currently holds on the new X.
# NOTE(review): when cells run top to bottom this is the default DBSCAN()
# created earlier — confirm, since no estimator is constructed in this cell.
clf.fit(X)
Out[19]:
In [20]:
# Read the per-sample labels from the fitted estimator and cast them with the
# builtin `int`. `np.int` was only an alias for it and has been removed from
# NumPy, so the old spelling raises AttributeError on current releases.
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='no_structure_dbscan')
plot_dataset(X, y_pred)
In [21]:
# Switch the current (X, y) pair to the noisy-circles dataset and plot the raw points.
X, y = noisy_circles
plot_dataset(X)
In [22]:
# Fit DBSCAN with eps=0.1 on the current X and plot the resulting labels.
clf = DBSCAN(eps=0.1)
clf.fit(X)
# Cast with the builtin `int`: `np.int` was only an alias for it and has been
# removed from NumPy, so the old spelling raises AttributeError there.
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='noisy_circles_dbscan')
plot_dataset(X, y_pred)
In [23]:
# Fit DBSCAN again on the current X with a smaller eps (0.05) and plot the labels.
clf = DBSCAN(eps=0.05)
clf.fit(X)
# Cast with the builtin `int`: `np.int` was only an alias for it and has been
# removed from NumPy, so the old spelling raises AttributeError there.
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='noisy_circles_dbscan')
plot_dataset(X, y_pred)
In [24]:
# Labels assigned by the most recent fit of `clf`.
labels = clf.labels_
# Count the distinct labels, dropping -1 because it marks noise rather than a cluster.
n_clusters = len(set(labels) - {-1})
n_clusters
Out[24]:
In [25]:
# Display the raw label array for inspection.
labels
Out[25]:
In [26]:
# True when at least one sample carries the noise label -1.
contains_noise = -1 in labels
contains_noise
Out[26]:
In [27]:
# Back to the blobs dataset as the current (X, y) pair; plot the raw points.
X, y = blobs
plot_dataset(X)
In [38]:
# eps: the maximum distance between two points for them to still count as
#      being in the same neighborhood
# min_samples: the minimum number of samples needed to form a cluster
#      (not passed here, so the library default applies)
clf = DBSCAN(eps=1.5)
# clf = DBSCAN()
clf.fit(X)
Out[38]:
In [39]:
# DBSCAN?
In [40]:
# Collapse the fitted labels to the set of distinct values and display it.
unique_labels = set(clf.labels_)
unique_labels
Out[40]:
In [41]:
# Read the per-sample labels from the fitted estimator and cast them with the
# builtin `int`. `np.int` was only an alias for it and has been removed from
# NumPy, so the old spelling raises AttributeError on current releases.
y_pred = clf.labels_.astype(int)
plot_dataset(X, y_pred)
In [32]:
# Count the distinct labels in y_pred, dropping -1 because it marks noise
# rather than a cluster.
n_clusters = len(set(y_pred) - {-1})
n_clusters
Out[32]:
In [42]:
# True when the latest fit assigned the noise label -1 to at least one sample.
contains_noise = -1 in clf.labels_
contains_noise
Out[42]:
In [43]:
from sklearn.metrics import silhouette_score
# Score the current labelling of X; the value is the cell output.
# NOTE(review): y_pred is passed unfiltered — if it contains the -1 noise
# label, that label is presumably scored like a cluster of its own; confirm.
silhouette_score(X, y_pred)
Out[43]:
In [44]:
# Silhouette score: 1 is the best value, -1 the worst; values near 0 indicate overlapping clusters
# silhouette_score?
In [45]:
# `calinski_harabaz_score` was a misspelled name that scikit-learn later
# removed. Prefer the corrected `calinski_harabasz_score` and fall back to
# the old spelling only on releases that predate the rename.
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    from sklearn.metrics import calinski_harabaz_score as calinski_harabasz_score
# Score the current labelling of X; the value is the cell output.
calinski_harabasz_score(X, y_pred)
Out[45]:
In [37]:
# calinski_harabaz_score?