In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt

KMeans

Visualizations on synthetic datasets

DBSCAN

Original paper

Visualizations on synthetic datasets

Comparison



In [2]:

    
import numpy

from sklearn.datasets import make_circles, make_moons
from sklearn.preprocessing import scale

# Two concentric circles (labels 0 and 1). random_state pins the sampling so
# that every Restart & Run All reproduces the same points and figures.
X_circle, y_circle = make_circles(n_samples=500, noise=.05, factor=.2, random_state=0)

# Two interleaving half-moons, also seeded for reproducibility.
X_moon, y_moon = make_moons(n_samples=300, noise=.07, random_state=0)
# Translate the moons away from the circles and shift their labels by 2 so
# they do not collide with the circle labels.
X_moon, y_moon = X_moon + 1.3, y_moon + 2

# Stack both shapes into one dataset of points with matching labels.
X = numpy.vstack((X_circle, X_moon))
y = numpy.concatenate((y_circle, y_moon))

# Standardise the combined features with sklearn.preprocessing.scale.
X = scale(X)



In [3]:

    
def plot_labeled_clusters(points, labels):
    """Plot 2-D points on the current pyplot axes, one colour per label.

    Parameters
    ----------
    points : array-like
        Coordinates to draw; column 0 is used as x and column 1 as y.
    labels : array-like
        One label per point. The label ``-1`` is treated as noise: it is
        drawn in black and is not counted in the title.

    The figure title reports the number of distinct non-noise labels.
    """
    points = numpy.asarray(points)
    # With a plain list, ``labels == label`` would be a single bool instead of
    # a per-point mask, so normalise to an ndarray first.
    labels = numpy.asarray(labels)

    # numpy.unique returns the labels sorted, which makes the label -> colour
    # assignment deterministic (iteration order of a set is arbitrary).
    unique_labels = numpy.unique(labels)
    colors = plt.cm.Spectral(numpy.linspace(0, 1, len(unique_labels)))

    for label, color in zip(unique_labels, colors):
        # Pass the colour as a tuple rather than an ndarray row: matplotlib
        # compares the new face colour with the stored one using ``!=``, and
        # an ndarray operand triggers the elementwise-comparison FutureWarning.
        face_color = 'k' if label == -1 else tuple(color)

        xy = points[labels == label]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=face_color)

    # Noise (-1) is not a cluster, so leave it out of the reported count.
    noise_label_count = int(numpy.count_nonzero(unique_labels == -1))
    cluster_count = len(unique_labels) - noise_label_count
    plt.title('Number of clusters: {}'.format(cluster_count))



In [4]:

    
# Reference picture: colour the points by the labels produced by the dataset
# generators, to compare against the clustering results plotted later.
plot_labeled_clusters(X, y)









    



C:\Python35\lib\site-packages\matplotlib\lines.py:1145: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if self._markerfacecolor != fc:



In [5]:

    
from sklearn.cluster import KMeans

# Give KMeans the true number of groups, taken from the generator labels.
n_clusters = len(set(y))
# KMeans starts from random centroids; fixing random_state makes the
# resulting assignment (and the figure below) reproducible across runs.
labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(X)

plot_labeled_clusters(X, labels)









    



C:\Python35\lib\site-packages\matplotlib\lines.py:1145: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if self._markerfacecolor != fc:



In [6]:

    
from sklearn.cluster import DBSCAN

# Fit DBSCAN on the scaled points and read back the per-point cluster labels
# (the plotting helper draws the label -1 in black as noise).
labels = DBSCAN(min_samples=10, eps=.2).fit(X).labels_

plot_labeled_clusters(points=X, labels=labels)









    



C:\Python35\lib\site-packages\matplotlib\lines.py:1145: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if self._markerfacecolor != fc:

Parameter Estimation



In [7]:

    
from sklearn.neighbors import kneighbors_graph

# Neighbourhood size to probe.
n_neighbors = 10
# Row i of the graph holds the distances from point i to its n_neighbors
# nearest neighbours; every other entry is 0. toarray() gives a regular
# ndarray instead of the discouraged numpy.matrix returned by todense().
graph = kneighbors_graph(X, n_neighbors, mode='distance', metric='euclidean').toarray()

counts = []
radiuses = numpy.arange(0, 1., .001)
for radius in radiuses:
    # Stored neighbour distances that fall strictly inside the radius
    # (entries equal to 0 are "no edge" and are excluded).
    neighbors_in_radius = ((graph < radius) & (graph > 0))
    # Sum along each ROW (axis=1): that is the number of point i's own
    # neighbours inside the radius. Summing columns (axis=0) would instead
    # count how often a point appears in other points' neighbour lists.
    count = (neighbors_in_radius.sum(axis=1) >= n_neighbors).sum()
    counts.append(count)

plt.xlabel('Number of points with at least {} neighbors in neighborhood'.format(n_neighbors))
plt.ylabel('Neighborhood radius')
plt.plot(counts, radiuses);

KMeans

DBSCAN

Comparison

Parameter Estimation

Scikit-Learn