In [1]:
%matplotlib inline

In [2]:
import numpy as np
from sklearn import datasets

np.random.seed(0)

# ============
# Generate datasets. We choose a size big enough to see the scalability
# of the algorithms, but not so big that running times get too long
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)

In [3]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

def plot_dataset(X, y_pred=None, fname=None):
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # default to a single cluster when no labels are given
    # (avoids a mutable default argument)
    if y_pred is None:
        y_pred = np.zeros(len(X), dtype=int)

    # append black as the last color so that label -1 (noise) is drawn in black
    colors = np.append(np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                     '#f781bf', '#a65628', '#984ea3',
                                     '#999999', '#e41a1c', '#dede00']),
                              int(max(y_pred) + 1)))), ['#000000'])
    plt.figure(figsize=(10, 10))

    plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())
    if fname:
        plt.savefig(fname)
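
A quick usage sketch for the helper (not part of the original run): preview the anisotropic blobs generated above, colored by their ground-truth labels.

In [ ]:
# Usage sketch: X_aniso and y come from the dataset-generation cell above.
plot_dataset(X_aniso, y)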

In [4]:
# plt.scatter?

In [5]:
X, y = blobs
# plot_dataset(X, fname='blobs.png')
plot_dataset(X)


K-Means


In [6]:
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=3)
clf.fit(X)


Out[6]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [7]:
y_pred = clf.predict(X)
y_pred


Out[7]:
array([1, 1, 1, ..., 1, 0, 0])
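
As a quick check (a sketch using standard scikit-learn attributes), the fitted model exposes the learned centroids and the within-cluster sum of squares (inertia) that k-means minimizes.

In [ ]:
# Sketch: inspect the fitted centroids and the inertia (within-cluster
# sum of squared distances to the closest centroid).
print(clf.cluster_centers_)
print(clf.inertia_)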

In [8]:
# plot_dataset(X, y_pred, fname='blobs_kmeans_3')
plot_dataset(X, y_pred)



In [9]:
# from sklearn.cluster import MiniBatchKMeans
# clf = MiniBatchKMeans(n_clusters=10)

clf = KMeans(n_clusters=10)
clf.fit(X)
y_pred = clf.predict(X)
# plot_dataset(X, y_pred, fname='blobs_kmeans_10')
plot_dataset(X, y_pred)
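
Choosing n_clusters by eye is fragile. A common heuristic, sketched below (not part of the original notebook), is the elbow method: fit k-means for a range of k and look for the bend in the inertia curve.

In [ ]:
# Elbow-method sketch: inertia always decreases as k grows, so look for
# the "elbow" where the decrease levels off.
inertias = [KMeans(n_clusters=k).fit(X).inertia_ for k in range(1, 11)]
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel('n_clusters')
plt.ylabel('inertia')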



In [10]:
X, y = noisy_circles
# plot_dataset(X, fname='noisy_circles.png')
plot_dataset(X)



In [11]:
clf = KMeans(n_clusters=2)
clf.fit(X)
y_pred = clf.predict(X)
# plot_dataset(X, y_pred, fname='noisy_circles_kmeans.png')
plot_dataset(X, y_pred)



In [12]:
X, y = no_structure
# plot_dataset(X, fname='no_structure.png')
plot_dataset(X)



In [13]:
clf = KMeans(n_clusters=3)
clf.fit(X)
y_pred = clf.predict(X)
# plot_dataset(X, y_pred, fname='no_structure_kmeans.png')
plot_dataset(X, y_pred)


DBSCAN


In [14]:
# DBSCAN?

In [15]:
X, y = blobs
plot_dataset(X)



In [16]:
from sklearn.cluster import DBSCAN

clf = DBSCAN()
clf.fit(X)


Out[16]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=1, p=None)

In [17]:
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='blobs_dbscan')
plot_dataset(X, y_pred)
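
Besides labels_, the fitted estimator records which points were dense enough to act as core samples (a sketch using standard DBSCAN attributes).

In [ ]:
# Sketch: core samples are points with at least min_samples neighbors
# within eps; border and noise points are not included here.
len(clf.core_sample_indices_)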



In [18]:
X, y = no_structure
plot_dataset(X)



In [19]:
clf.fit(X)


Out[19]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=1, p=None)

In [20]:
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='no_structure_dbscan')
plot_dataset(X, y_pred)



In [21]:
X, y = noisy_circles
plot_dataset(X)



In [22]:
clf = DBSCAN(eps=0.1)
clf.fit(X)
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='noisy_circles_dbscan')
plot_dataset(X, y_pred)
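
How should eps be chosen? A widely used heuristic, sketched below under the assumption that min_samples keeps its default of 5, is the k-distance plot: sort each point's distance to the 5th point of its neighborhood and look for the knee, which suggests a reasonable eps.

In [ ]:
# k-distance sketch for choosing eps (heuristic, not in the original run).
# With min_samples=5, a core point needs 5 points (itself included)
# within eps, so plot the sorted distance to the 5th point.
from sklearn.neighbors import NearestNeighbors

dists, _ = NearestNeighbors(n_neighbors=5).fit(X).kneighbors(X)
plt.plot(np.sort(dists[:, -1]))
plt.ylabel('distance to 5th point of the neighborhood')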


Metrics


In [23]:
clf = DBSCAN(eps=0.05)
clf.fit(X)
y_pred = clf.labels_.astype(int)
# plot_dataset(X, y_pred, fname='noisy_circles_dbscan')
plot_dataset(X, y_pred)



In [24]:
labels = clf.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_clusters


Out[24]:
19
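
Noise points can be counted directly, since DBSCAN marks them with label -1 (a short sketch).

In [ ]:
# Sketch: count the points DBSCAN labeled as noise.
n_noise = list(labels).count(-1)
n_noise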

In [25]:
labels


Out[25]:
array([ 0,  1,  2, ...,  2,  8, -1], dtype=int64)

In [26]:
contains_noise = -1 in labels
contains_noise


Out[26]:
True

In [27]:
X, y = blobs
plot_dataset(X)



In [38]:
# eps is the maximum distance between two points for them to be considered neighbors
# min_samples is the minimum number of samples in a point's neighborhood for it to be a core point
clf = DBSCAN(eps=1.5)
# clf = DBSCAN()
clf.fit(X)


Out[38]:
DBSCAN(algorithm='auto', eps=1.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=1, p=None)

In [39]:
# DBSCAN?

In [40]:
unique_labels = set(clf.labels_)
unique_labels


Out[40]:
{0, 1, 2}

In [41]:
y_pred = clf.labels_.astype(int)
plot_dataset(X, y_pred)



In [32]:
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
n_clusters


Out[32]:
3

In [42]:
contains_noise = -1 in clf.labels_
contains_noise


Out[42]:
False

In [43]:
from sklearn.metrics import silhouette_score
silhouette_score(X, y_pred)


Out[43]:
0.8290743874701529
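
The same score is also available per sample (a sketch using silhouette_samples from the same module); low values mark points sitting near a cluster boundary.

In [ ]:
# Sketch: per-sample silhouette values for the clustering above.
from sklearn.metrics import silhouette_samples

sample_scores = silhouette_samples(X, y_pred)
sample_scores.min(), sample_scores.mean()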

In [44]:
# close to 1 is best, -1 is worst; values near 0 indicate overlapping clusters
# silhouette_score?

In [45]:
from sklearn.metrics import calinski_harabasz_score
calinski_harabasz_score(X, y_pred)


Out[45]:
37203.36303934007

In [37]:
# calinski_harabasz_score?
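
For comparison, newer scikit-learn versions (0.20+) also ship the Davies-Bouldin index, where lower is better and 0 is the best possible score (a sketch, not part of the original notebook).

In [ ]:
# Sketch: Davies-Bouldin index of the same clustering; lower is better.
from sklearn.metrics import davies_bouldin_score

davies_bouldin_score(X, y_pred)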