In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
import numpy
from sklearn.datasets import make_circles, make_moons
from sklearn.preprocessing import scale
X_circle, y_circle = make_circles(n_samples=500, noise=.05, factor=.2)
X_moon, y_moon = make_moons(n_samples=300, noise=.07)
X_moon, y_moon = X_moon + 1.3, y_moon + 2
X = numpy.vstack((X_circle, X_moon))
y = numpy.concatenate((y_circle, y_moon))
X = scale(X)
In [3]:
def plot_labeled_clusters(points, labels):
unique_labels = set(labels)
label_count = len(unique_labels)
colors = plt.cm.Spectral(numpy.linspace(0, 1, label_count))
for label, color in zip(unique_labels, colors):
if label == -1:
color = 'k'
label_count -= 1
xy = points[labels == label]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color)
plt.title('Number of clusters: {}'.format(label_count))
In [4]:
plot_labeled_clusters(X, y)
In [5]:
from sklearn.cluster import KMeans
n_clusters = len(set(y))
labels = KMeans(n_clusters=n_clusters).fit_predict(X)
plot_labeled_clusters(X, labels)
In [6]:
from sklearn.cluster import DBSCAN
labels = DBSCAN(eps=.2, min_samples=10).fit_predict(X)
plot_labeled_clusters(X, labels)
In [7]:
from sklearn.neighbors import kneighbors_graph
n_neighbors = 10
graph = kneighbors_graph(X, n_neighbors, mode='distance', metric='euclidean').todense()
counts = []
radiuses = numpy.arange(0, 1., .001)
for radius in radiuses:
neighbors_in_radius = ((graph < radius) & (graph > 0))
count = (neighbors_in_radius.sum(axis=0) >= n_neighbors).sum()
counts.append(count)
plt.xlabel('Number of points with at least {} neighbors in neighborhood'.format(n_neighbors))
plt.ylabel('Neighborhood radius')
plt.plot(counts, radiuses);