notebook.community

Edit and run



In [1]:

    
# Project helpers for Planet Four markings and the HDBSCAN library under test.
from planet4 import markings
import hdbscan
# `plt`, `np` and `pd` are used by many cells below but were never imported in
# this notebook; import them here so it survives Restart & Run All.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Shared marker styling for the scatter plots below.
plot_kwds = {'alpha' : 0.8, 's' : 50, 'linewidths':0}



In [2]:

    
import seaborn as sns
# Global seaborn configuration applied to every figure below:
# 'notebook' plotting context, 'white' style, and seaborn's color codes.
sns.set_context('notebook')
sns.set_style('white')
sns.set_color_codes()



In [ ]:

    
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline



In [25]:

    
from sklearn.datasets import make_blobs



In [91]:

    
# Synthetic test set: 100 two-feature samples around 3 centers, generated
# with a fixed random_state so the same points come back on every run.
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=3.0, center_box=(-10.0, 10.0),
                  shuffle=True, random_state=42)



In [93]:

    
# First look at the raw blobs, before any clustering.
plt.scatter(X.T[0], X.T[1], **plot_kwds)









    Out[93]:





<matplotlib.collections.PathCollection at 0x1171f8cf8>



In [94]:

    
# Cluster the blobs; groups smaller than 15 points are not accepted as clusters.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=15,
).fit(X)

# One palette colour per cluster label; noise (negative label) becomes gray.
color_palette = sns.color_palette('Paired', 12)
val = 0.5
cluster_colors = [
    (val, val, val) if label < 0 else color_palette[label]
    for label in clusterer.labels_
]
# Desaturate each point's colour by its membership probability.
cluster_member_colors = [
    sns.desaturate(color, strength)
    for color, strength in zip(cluster_colors, clusterer.probabilities_)
]



In [95]:

    
# Same points, now coloured by cluster and membership probability.
plt.scatter(X.T[0], X.T[1],
            c=cluster_member_colors, s=50, linewidth=0, alpha=1)









    Out[95]:





<matplotlib.collections.PathCollection at 0x1173000b8>



In [97]:

    
# Distribution of the per-point membership probabilities (50 bins); the
# trailing semicolon suppresses the textual return value.
plt.hist(clusterer.probabilities_, bins=50);



In [98]:

    
# Distribution of the outlier scores, with a rug mark per point.
# Non-finite scores are filtered out before plotting.
sns.distplot(
    clusterer.outlier_scores_[np.isfinite(clusterer.outlier_scores_)],
    rug=True)









    



/Users/klay6683/miniconda3/envs/stable/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[98]:





<matplotlib.axes._subplots.AxesSubplot at 0x117629b00>



In [143]:

    
# Outlier cut-off: the 0.80 quantile of the outlier scores, so roughly the
# highest-scoring fifth of the points is flagged below.
threshold = pd.Series(clusterer.outlier_scores_).quantile(0.80)



In [144]:

    
# Indices of the points whose outlier score exceeds the cut-off.
outliers = np.flatnonzero(clusterer.outlier_scores_ > threshold)



In [145]:

    
# Display the indices flagged as outliers.
outliers









    Out[145]:





array([ 0,  3,  9, 13, 29, 34, 38, 39, 40, 54, 57, 64, 66, 76, 78, 85, 87,
       88, 93, 94])



In [139]:

    
# Indices of the points HDBSCAN labelled as noise (negative label).
np.flatnonzero(clusterer.labels_ < 0)









    Out[139]:





array([ 0,  1,  3, 23, 24, 27, 29, 33, 34, 39, 40, 46, 52, 58, 59, 64, 65,
       66, 78, 83, 86, 89, 91, 93, 94, 96])



In [140]:

    
# Indices of the points with zero membership probability, for comparison
# with the noise-labelled indices.
np.flatnonzero(clusterer.probabilities_ == 0)









    Out[140]:





array([ 0,  1,  3, 23, 24, 27, 29, 33, 34, 39, 40, 46, 52, 58, 59, 64, 65,
       66, 78, 83, 86, 89, 91, 93, 94, 96])



In [141]:

    
# Point 9 is flagged in `outliers` but is not among the noise-labelled
# points; check how strongly it belongs to its cluster.
clusterer.probabilities_[9]









    Out[141]:





0.80810517548902239



In [132]:

    
# Distribution of the membership probabilities, with a rug mark per point.
sns.distplot(clusterer.probabilities_, rug=True)









    



/Users/klay6683/miniconda3/envs/stable/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[132]:





<matplotlib.axes._subplots.AxesSubplot at 0x117c0b8d0>



In [146]:

    
# All points in gray, with the score-based outliers drawn on top in red.
plt.scatter(X.T[0], X.T[1], s=50, lw=0, c='gray', alpha=0.5)
plt.scatter(X[outliers, 0], X[outliers, 1],
            s=50, lw=0, c='red', alpha=0.75)









    Out[146]:





<matplotlib.collections.PathCollection at 0x11858d2b0>






    Out[146]:





<matplotlib.collections.PathCollection at 0x11858ddd8>



In [135]:

    
# Distinct cluster labels that were assigned.
set(clusterer.labels_)









    Out[135]:





{-1, 0, 1}



In [ ]:



In [ ]:



In [3]:

    
# Short image IDs used as test cases in the cells below.
ids = ['b89', '139', 'dch', 'bvc', 'pbr', '1at', '1dr', '1fe', 'br5', 'ek1']
# ImageID object for the first test image.
p4id = markings.ImageID(ids[0], scope='planet4')



In [4]:

    
# Plot the blotch markings of the selected image, with centers requested.
p4id.plot_blotches(with_center=True)



In [5]:

    
# Plot the fan markings of the selected image, with centers requested.
p4id.plot_fans(with_center=True)



In [7]:

    
import logging

# Show DEBUG-level messages from the planet4.hdbscan logger in the notebook.
logger = logging.getLogger('planet4.hdbscan')
logger.setLevel(logging.DEBUG)
# Loggers are process-wide singletons, so blindly calling addHandler() on
# every run of this cell stacks handlers and duplicates each log line.
# Reuse the plain StreamHandler from a previous run when there is one.
ch = next((h for h in logger.handlers
           if type(h) is logging.StreamHandler), None)
if ch is None:
    ch = logging.StreamHandler()
    logger.addHandler(ch)



In [5]:

    
from planet4.hdbscan import (HDBScanner, post_processing, 
                             parameter_scan, plot_results)
from hdbscan import HDBSCAN



In [6]:

    
# Column sets passed as `cols` to the parameter scans below.
basecols = ['x', 'y']
# Blotches: position plus both radii, optionally with the y_angle column.
blotchcols = basecols + ['radius_1', 'radius_2']
blotchcols_all = blotchcols + ['y_angle']
# Fans: position plus both angle columns, optionally with the spread.
fancols = basecols + ['x_angle', 'y_angle']
fancols_all = fancols + ['spread']



In [6]:

    
# Select the image to inspect and build its ImageID object.
# (A bare `current_id` line used to sit between these two statements; only
# the last expression of a cell is displayed, so it had no effect.)
current_id = ids[0]
p4id = markings.ImageID(current_id, scope='planet4')



In [7]:

    
# Plot the blotch markings of the currently selected image.
p4id.plot_blotches()



In [7]:

    
# Display the list of test image IDs.
ids









    Out[7]:





['b89', '139', 'dch', 'bvc', 'pbr', '1at', '1dr', '1fe', 'br5', 'ek1']



In [11]:

    
# Run the blotch parameter scan for every test image and both `factor` values.
for id_ in ids:
    for cols in [blotchcols_all]:
        for factor in [0.1, 0.15]:
            # `scale` used to be printed here as well, but this cell never
            # defines it: it only worked with a value left in the kernel by
            # the fan loop, and raises NameError on a fresh kernel.
            print(id_, cols, factor)
            parameter_scan(id_, 'blotch',
                           cols=cols,
                           factor=factor)
            # Close the figures of this run before starting the next one.
            plt.close('all')









    



b89 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
b89 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
139 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
139 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
dch ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
dch ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
bvc ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
bvc ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
pbr ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
pbr ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
1at ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
1at ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
1dr ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
1dr ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
1fe ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
1fe ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
br5 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
br5 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False
ek1 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.1 False
ek1 ['x', 'y', 'radius_1', 'radius_2', 'y_angle'] 0.15 False



In [ ]:

    
# Run the fan parameter scan over every combination of image, column set,
# `only_core` and `do_scale`, in the same order as four nested loops.
for id_, cols, core, scale in (
        (image, columns, core_flag, scale_flag)
        for image in ids
        for columns in (fancols, fancols_all)
        for core_flag in (True, False)
        for scale_flag in (True, False)):
    print(id_, cols, core, scale)
    parameter_scan(id_, 'fan', cols=cols, only_core=core, do_scale=scale)
    # Close the figures of this run before starting the next one.
    plt.close('all')









    



b89 ['x', 'y', 'x_angle', 'y_angle'] True True
b89 ['x', 'y', 'x_angle', 'y_angle'] True False



In [16]:

    
# Plot the single-linkage tree of the fitted clusterer in a new figure.
# NOTE(review): the saved output of this cell is a NameError because it was
# executed while `clusterer` did not exist in the kernel; run the notebook
# top to bottom so the fitting cell runs first.
plt.figure()
clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-5d804e83eaf8> in <module>()
      1 plt.figure()
----> 2 clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

NameError: name 'clusterer' is not defined



In [23]:

    
# Plot the condensed tree of the fitted clusterer in a new figure.
plt.figure()
clusterer.condensed_tree_.plot()









    Out[23]:





<matplotlib.figure.Figure at 0x11328e320>






    Out[23]:





<matplotlib.axes._subplots.AxesSubplot at 0x11629f630>



In [24]:

    
# Condensed tree again, this time with the selected clusters marked using
# the current seaborn palette.
plt.figure()
clusterer.condensed_tree_.plot(
    select_clusters=True,
    selection_palette=sns.color_palette(),
)









    Out[24]:





<matplotlib.figure.Figure at 0x1161db240>






    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x116f38b38>



In [148]:

    
from planet4.hdbscan import plot_results



In [ ]:

    
# NOTE(review): called without arguments and never executed (no saved
# output); the signature of planet4.hdbscan.plot_results is not visible
# here -- confirm which arguments it requires.
plot_results()



In [49]:

    
# Overlay the clustered points, coloured by membership strength, on the image
# subframe together with the blotch markings.
# np.asarray() accepts both a DataFrame and a plain ndarray; the previous
# `X.as_matrix()` exists only on older pandas objects and fails for the
# ndarray returned by make_blobs.
test_data = np.asarray(X)
fig, ax = plt.subplots()
# NOTE(review): `imid` is not created anywhere in this notebook -- presumably
# it is the ImageID object held in `p4id`; confirm before re-running.
imid.show_subframe(ax=ax)
ax.scatter(test_data.T[0], test_data.T[1], c=cluster_member_colors, **plot_kwds)
imid.plot_blotches(ax=ax)









    Out[49]:





<matplotlib.collections.PathCollection at 0x116afc8d0>



In [44]:

    
# Overlay the clustered points, coloured by cluster label only, on the image
# subframe together with the blotch markings.
# np.asarray() accepts both a DataFrame and a plain ndarray; the previous
# `X.as_matrix()` exists only on older pandas objects and fails for the
# ndarray returned by make_blobs.
test_data = np.asarray(X)
fig, ax = plt.subplots()
# NOTE(review): `imid` is not created anywhere in this notebook -- presumably
# it is the ImageID object held in `p4id`; confirm before re-running.
imid.show_subframe(ax=ax)
ax.scatter(test_data.T[0], test_data.T[1], c=cluster_colors, **plot_kwds)
imid.plot_blotches(ax=ax)









    Out[44]:





<matplotlib.collections.PathCollection at 0x118355a90>



In [26]:

    
# Minimum membership probability for a point to count as a core member.
proba_cut=0.75

labels = clusterer.labels_
# Boolean mask over all samples: True where the probability exceeds the cut.
core_samples_mask = np.zeros_like(clusterer.labels_, dtype=bool)
core_samples_mask[clusterer.probabilities_ > proba_cut] = True
unique_labels = set(labels)
# The noise label (-1) is not a cluster.
n_clusters = len(unique_labels) - (1 if -1 in labels else 0)
reduced_data = []  # one boolean core-member mask per cluster
n_rejected = 0

# loop over unique labels.
for label in unique_labels:
    # boolean masks over all samples, not index lists
    class_member_mask = (labels == label)
    cluster_members = (class_member_mask & core_samples_mask)
    # treat noise
    if label == -1:
        # Count the noise-labelled points. len() of a boolean mask is the
        # total number of samples, which is what was stored here before.
        n_rejected = int(np.count_nonzero(class_member_mask))
    # if label is a cluster member:
    else:
        reduced_data.append(cluster_members)



In [27]:

    
from planet4 import markings
from scipy.stats import circmean

# Columns to average and the marking class to build from each average.
cols = markings.Fan.to_average
Marking = markings.Fan

# NOTE(review): `fans` is not defined anywhere in this notebook -- presumably
# the fan markings table of the current image; confirm its origin.
data = fans

mean_markings = []
# Each entry of `reduced_data` is a boolean mask selecting one cluster's
# core members.
for cluster_members in reduced_data:
    clusterdata = data.loc[cluster_members, cols]
    meandata = clusterdata.mean()
    # Angles wrap around, so replace the arithmetic mean with the circular
    # mean (computed in radians, stored in degrees). Item assignment is used
    # because attribute assignment on a Series is ambiguous.
    meandata['angle'] = np.rad2deg(
        circmean(np.deg2rad(clusterdata.angle)))
    cluster = Marking(meandata, scope='planet4')
    # storing n_members into the object for later. Count the selected rows:
    # len() of the boolean mask is the total number of samples, not the
    # cluster size.
    cluster.n_members = int(np.count_nonzero(cluster_members))
    # storing this saved marker for later in ClusteringManager
    cluster.saved = False
    mean_markings.append(cluster)



In [26]:

    
# Display the averaged marking objects, one per cluster.
mean_markings









    Out[26]:





[base: [ 368.   483.4]
 armlength: 60.46394682644806
 arm1: [ 324.98334441  440.90933976]
 arm2: [ 400.38108478  432.33774178], base: [ 430.32963392  463.92777846]
 armlength: 72.13719945416335
 arm1: [ 374.13636523  418.69402891]
 arm2: [ 458.213927    397.39777211], base: [ 502.904759    472.28571429]
 armlength: 48.26017249273475
 arm1: [ 474.40949863  433.33620356]
 arm2: [ 518.18050208  426.50694276], base: [ 441.19777629  444.86333313]
 armlength: 64.80054691720133
 arm1: [ 399.83083329  394.98461113]
 arm2: [ 467.69619577  385.7283634 ]]



In [ ]: