HDBSCAN



In [87]:

    
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as data
import pandas as pd
%matplotlib inline
# Global seaborn look for every figure in the notebook: large 'poster'
# scaling, plain white background, and seaborn colours bound to the
# single-letter matplotlib colour codes.
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

# Shared keyword arguments for the scatter plots below.
plot_kwds = dict(alpha=0.5, s=80, linewidths=0)



In [88]:

    
# Synthetic test set: 50 points from make_moons stacked on top of 50 points
# from two Gaussian blobs, giving a (100, 2) array.
# The generators are seeded so the data -- and every clustering result derived
# from it below -- is identical after a kernel restart and full re-run.
moons, _ = data.make_moons(n_samples=50, noise=0.05, random_state=42)
blobs, _ = data.make_blobs(n_samples=50,
                           centers=[(-0.75, 2.25), (1.0, 2.0)],
                           cluster_std=0.25,
                           random_state=42)
test_data = np.vstack([moons, blobs])
plt.scatter(test_data.T[0], test_data.T[1], color='b', **plot_kwds)









    Out[88]:





<matplotlib.collections.PathCollection at 0x7f3f092ebed0>



In [89]:

    
%time
import hdbscan

clusterer = hdbscan.HDBSCAN(algorithm='best', 
                            alpha=1.0, 
                            approx_min_span_tree=True,
                            core_dist_n_jobs = 4,
                            gen_min_span_tree=True, 
                            leaf_size=40,
                            metric='euclidean', 
                            min_cluster_size=5, 
                            min_samples=None, 
                            p=None)
clusterer.fit(test_data)









    



CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.06 µs






    Out[89]:





HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, core_dist_n_jobs=4, gen_min_span_tree=True,
    leaf_size=40, memory=Memory(cachedir=None), metric='euclidean',
    min_cluster_size=5, min_samples=None, p=None)



In [90]:

    
# Peek at the first five points: membership probability, cluster label and
# outlier score. The single-argument call form of print behaves the same on
# Python 2 and Python 3, whereas the print statement is a SyntaxError on 3.
print(clusterer.probabilities_[0:5])
print(clusterer.labels_[0:5])
print(clusterer.outlier_scores_[0:5])









    



[ 1.  1.  1.  1.  1.]
[1 1 1 1 1]
[ 0.          0.          0.          0.19214433  0.07532065]



In [91]:

    
# Distribution (with rug marks) of the outlier scores, restricted to the
# finite entries so that inf/NaN values do not reach the plot.
finite_mask = np.isfinite(clusterer.outlier_scores_)
sns.distplot(clusterer.outlier_scores_[finite_mask], rug=True)









    Out[91]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3f08aa3f10>



In [92]:

    
# Points whose outlier score lies above the 90th percentile are treated as
# outliers and drawn in red over a faint gray plot of the whole data set.
outlier_scores = clusterer.outlier_scores_
threshold = pd.Series(outlier_scores).quantile(0.9)
outliers = np.flatnonzero(outlier_scores > threshold)

marker_style = dict(s=50, linewidth=0)
plt.scatter(*test_data.T, c='gray', alpha=0.25, **marker_style)
plt.scatter(*test_data[outliers].T, c='red', alpha=0.5, **marker_style)









    Out[92]:





<matplotlib.collections.PathCollection at 0x7f3f08d11190>



In [93]:

    
# Draw the minimum spanning tree built during the fit.
mst_plot_options = dict(edge_cmap='viridis',
                        edge_alpha=0.6,
                        node_size=80,
                        edge_linewidth=2)
clusterer.minimum_spanning_tree_.plot(**mst_plot_options)









    Out[93]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3f08cc6c90>



In [94]:

    
# Draw the single linkage tree, coloured with viridis and with a colour bar.
linkage_tree = clusterer.single_linkage_tree_
linkage_tree.plot(cmap='viridis', colorbar=True)









    Out[94]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3f08c4b2d0>



In [95]:

    
# Draw the condensed tree with its default plot options.
condensed_tree = clusterer.condensed_tree_
condensed_tree.plot()









    Out[95]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3f087f7f50>



In [96]:

    
# Same condensed tree, now with the selected clusters marked using the
# current seaborn palette.
selection_colors = sns.color_palette()
clusterer.condensed_tree_.plot(select_clusters=True,
                               selection_palette=selection_colors)









    Out[96]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3f083bc550>



In [97]:

    
# Colour every point by its cluster label, desaturating the colour by the
# point's membership probability; points with a negative label are drawn in
# flat mid-gray.
palette = sns.color_palette()
# Index modulo the palette length so that a run producing more clusters than
# there are palette entries cycles through the colours instead of raising
# IndexError.
cluster_colors = [sns.desaturate(palette[col % len(palette)], sat)
                  if col >= 0 else (0.5, 0.5, 0.5) for col, sat in
                  zip(clusterer.labels_, clusterer.probabilities_)]
plt.scatter(test_data.T[0], test_data.T[1], c=cluster_colors, **plot_kwds)









    Out[97]:





<matplotlib.collections.PathCollection at 0x7f3ef3fd4e90>

Selecting Minimum Cluster Size



In [98]:

    
from sklearn.manifold import TSNE

# t-SNE is stochastic; fixing random_state makes the embedding (and the
# figure) reproducible across kernel restarts.
projection = TSNE(perplexity=50,
                  learning_rate=200,
                  random_state=42).fit_transform(test_data)
plt.scatter(*projection.T, **plot_kwds)









    Out[98]:





<matplotlib.collections.PathCollection at 0x7f3ef3f178d0>



In [106]:

    
def plot_hdbscan_tsne(test_size):
    """Fit HDBSCAN on ``test_data`` and scatter-plot the coloured points.

    Parameters
    ----------
    test_size : int
        Value passed to HDBSCAN as ``min_cluster_size``.

    Notes
    -----
    Reads the notebook-level ``test_data`` and ``plot_kwds``. Each point is
    coloured by its cluster label and desaturated by its membership
    probability; points with a negative label are drawn in mid-gray.

    NOTE(review): despite the ``_tsne`` suffix, the scatter uses the raw
    ``test_data`` coordinates, not the t-SNE ``projection`` -- confirm which
    one is intended.
    """
    clusterer = hdbscan.HDBSCAN(algorithm='best',
                                alpha=1.0,
                                approx_min_span_tree=True,
                                core_dist_n_jobs=4,
                                gen_min_span_tree=True,
                                leaf_size=40,
                                metric='euclidean',
                                min_cluster_size=test_size,
                                min_samples=None,
                                p=None).fit(test_data)

    palette = sns.color_palette()
    # Index modulo the palette length: a small ``test_size`` can yield more
    # clusters than there are palette entries, which would otherwise raise
    # IndexError.
    cluster_colors = [sns.desaturate(palette[col % len(palette)], sat)
                      if col >= 0 else (0.5, 0.5, 0.5) for col, sat in
                      zip(clusterer.labels_, clusterer.probabilities_)]

    plt.scatter(test_data.T[0], test_data.T[1], c=cluster_colors, **plot_kwds)



In [107]:

    
# Cluster and plot with a minimum cluster size of 5.
plot_hdbscan_tsne(test_size=5)



In [105]:

    
# NOTE(review): this cell repeats the previous cell's call with the same
# argument, and its execution count (In [105]) is older than the cell above
# (In [107]). Presumably a different min_cluster_size was meant here for
# comparison -- confirm, or delete the cell.
plot_hdbscan_tsne(5)



In [ ]: