In [1]:
    
%matplotlib inline
    
In [2]:
    
import numpy as np
from sklearn import datasets
np.random.seed(0)
# ============
# Generate datasets. We choose a size large enough to show how the
# algorithms scale, but small enough to keep running times reasonable.
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)
    
In [3]:
    
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
def plot_dataset(X, y_pred=[0], fname=None):
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # append black as the last color so label -1 (noise) is plotted in black
    base_colors = ['#377eb8', '#ff7f00', '#4daf4a',
                   '#f781bf', '#a65628', '#984ea3',
                   '#999999', '#e41a1c', '#dede00']
    colors = np.append(
        np.array(list(islice(cycle(base_colors), int(max(y_pred) + 1)))),
        ['#000000'])
    plt.figure(figsize=(10, 10))
    plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())
    if fname:
        plt.savefig(fname)
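
In [ ]:

# Sketch (not in the original notebook) of why the color lookup above
# handles noise: NumPy indexing with -1 wraps around to the last
# element, which is the appended black.
demo_colors = np.array(['#377eb8', '#ff7f00', '#000000'])
demo_colors[np.array([0, 1, -1])]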
    
In [4]:
    
# plt.scatter?
    
In [5]:
    
X, y = blobs
# plot_dataset(X, fname='blobs.png')
plot_dataset(X)
    
    
In [6]:
    
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=3)
clf.fit(X)
    
    Out[6]:
In [7]:
    
y_pred = clf.predict(X)
y_pred
    
    Out[7]:
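In [ ]:

# A quick look at what KMeans fitted (standard scikit-learn attributes):
# the three centroids, and the inertia (within-cluster sum of squares)
# that k-means minimizes.
print(clf.cluster_centers_)
print(clf.inertia_)
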
In [8]:
    
# plot_dataset(X, y_pred, fname='blobs_kmeans_3')
plot_dataset(X, y_pred)
    
    
In [9]:
    
# from sklearn.cluster import MiniBatchKMeans
# clf = MiniBatchKMeans(n_clusters=10)
clf = KMeans(n_clusters=10)
clf.fit(X)
y_pred = clf.predict(X)
# plot_dataset(X, y_pred, fname='blobs_kmeans_10')
plot_dataset(X, y_pred)
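
In [ ]:

# Sketch: MiniBatchKMeans (commented out above) is a drop-in, faster
# approximation of KMeans for larger datasets.
from sklearn.cluster import MiniBatchKMeans
mbk = MiniBatchKMeans(n_clusters=10)
plot_dataset(X, mbk.fit_predict(X))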
    
    
In [10]:
    
X, y = noisy_circles
# plot_dataset(X, fname='noisy_circles.png')
plot_dataset(X)
    
    
In [11]:
    
clf = KMeans(n_clusters=2)
clf.fit(X)
y_pred = clf.predict(X)
# plot_dataset(X, y_pred, fname='noisy_circles_kmeans.png')
plot_dataset(X, y_pred)
    
    
In [12]:
    
X, y = no_structure
# plot_dataset(X, fname='no_structure.png')
plot_dataset(X)
    
    
In [13]:
    
clf = KMeans(n_clusters=3)
clf.fit(X)
y_pred = clf.predict(X)
# plot_dataset(X, y_pred, fname='no_structure_kmeans.png')
plot_dataset(X, y_pred)
    
    
In [14]:
    
# DBSCAN?
    
In [15]:
    
X, y = blobs
plot_dataset(X)
    
    
In [16]:
    
from sklearn.cluster import DBSCAN
clf = DBSCAN()  # default parameters: eps=0.5, min_samples=5
clf.fit(X)
    
    Out[16]:
In [17]:
    
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='blobs_dbscan')
plot_dataset(X, y_pred)
    
    
In [18]:
    
X, y = no_structure
plot_dataset(X)
    
    
In [19]:
    
clf.fit(X)
    
    Out[19]:
In [20]:
    
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='no_structure_dbscan')
plot_dataset(X, y_pred)
    
    
In [21]:
    
X, y = noisy_circles
plot_dataset(X)
    
    
In [22]:
    
clf = DBSCAN(eps=0.1)
clf.fit(X)
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='noisy_circles_dbscan')
plot_dataset(X, y_pred)
    
    
In [23]:
    
clf = DBSCAN(eps=0.05)
clf.fit(X)
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='noisy_circles_dbscan')
plot_dataset(X, y_pred)
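
In [ ]:

# Sketch of a common heuristic for picking eps: plot each point's
# distance to its k-th nearest neighbour in sorted order and look for
# the "knee" of the curve (k is usually set near min_samples).
from sklearn.neighbors import NearestNeighbors
dists, _ = NearestNeighbors(n_neighbors=5).fit(X).kneighbors(X)
plt.plot(np.sort(dists[:, -1]))
plt.ylabel('distance to 5th nearest neighbour')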
    
    
In [24]:
    
labels = clf.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_clusters
    
    Out[24]:
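In [ ]:

# Sketch: alongside the cluster count, it is useful to count how many
# points DBSCAN left as noise (label -1).
int(np.sum(labels == -1))
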
In [25]:
    
labels
    
    Out[25]:
In [26]:
    
contains_noise = -1 in labels
contains_noise
    
    Out[26]:
In [27]:
    
X, y = blobs
plot_dataset(X)
    
    
In [38]:
    
# eps: the maximum distance between two samples for one to be
# considered in the neighborhood of the other
# min_samples: the minimum number of samples in a neighborhood
# for a point to be considered a core point
clf = DBSCAN(eps=1.5)
# clf = DBSCAN()
clf.fit(X)
    
    Out[38]:
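In [ ]:

# Sketch: besides eps, min_samples controls how conservative DBSCAN is.
# Raising it from the default of 5 makes sparse points more likely to
# end up as noise (-1). clf_strict is just an illustrative name.
clf_strict = DBSCAN(eps=1.5, min_samples=20)
clf_strict.fit(X)
set(clf_strict.labels_)
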
In [39]:
    
# DBSCAN?
    
In [40]:
    
unique_labels = set(clf.labels_)
unique_labels
    
    Out[40]:
In [41]:
    
y_pred = clf.labels_.astype(int)
plot_dataset(X, y_pred)
    
    
In [32]:
    
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
n_clusters
    
    Out[32]:
In [42]:
    
contains_noise = -1 in clf.labels_
contains_noise
    
    Out[42]:
In [43]:
    
from sklearn.metrics import silhouette_score
silhouette_score(X, y_pred)
    
    Out[43]:
In [44]:
    
# silhouette score: +1 is best, -1 is worst, values near 0 mean overlapping clusters
# silhouette_score?
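
In [ ]:

# Sketch: the silhouette score can also guide the choice of k for
# KMeans -- fit several candidate values on the same data and compare
# (higher is better).
for k in (2, 3, 4, 5):
    labels_k = KMeans(n_clusters=k).fit_predict(X)
    print(k, silhouette_score(X, labels_k))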
    
In [45]:
    
from sklearn.metrics import calinski_harabasz_score
calinski_harabasz_score(X, y_pred)  # higher is better
    
    Out[45]:
In [37]:
    
# calinski_harabasz_score?