notebook.community

Edit and run



In [6]:

    
import sys
sys.path.append('../scripts/')
from tpe import TPE



In [4]:

    
from scipy.cluster import hierarchy
from scipy.spatial import distance
from scipy.spatial.distance import squareform,pdist



In [2]:

    
from sklearn import datasets
data = datasets.load_digits()
X = data.data
Y = data.target



In [20]:

    
import numpy as np
import numpy.random as npr
n = 1000
npr.seed(0)
rand = npr.rand(len(X))
ind = sorted(np.arange(len(X)),key=lambda i:rand[i])
X_s = X[ind][:n]
Y_s = Y[ind][:n]



In [ ]:



In [ ]:



In [42]:

    
c = hierarchy.linkage(X_s,'single')



In [13]:

    
from time import time



In [14]:

    
X_ = []
t = time()
for i in range(10):
    tpe = TPE()
    X_.append(tpe.fit_transform(X_s))
    print(i)
    print(time() - t)









    



0
25.589579105377197
1
51.85299801826477
2
79.2054591178894
3
105.24459218978882
4
131.55915212631226
5
157.11906719207764
6
183.43748307228088
7
209.51415920257568
8
236.66364312171936
9
262.7641382217407



In [22]:

    
import matplotlib.pyplot as plt
%matplotlib inline



In [19]:

    
def plot(X,Y,title='Embedding'):
    plt.scatter(X[:,0],X[:,1],c=Y,linewidths=0)
    plt.title(title)



In [24]:

    
for x in X_:
    plot(x,Y_s)
    plt.figure()









    












    












    












    












    












    












    












    












    












    












    





<matplotlib.figure.Figure at 0x10baa3748>



In [15]:

    
def compare_pairwise_distances(X1,X2):
    return np.sum((pdist(X1) - pdist(X2))**2)



In [25]:

    
C = [hierarchy.linkage(x,'single') for x in X_]



In [33]:

    
D = [hierarchy.cophenet(c) for c in C]



In [43]:

    
D_true = hierarchy.cophenet(c)



In [35]:

    
for d in D:
    plt.imshow(squareform(d),interpolation='none',cmap='Blues')
    plt.figure()









    












    












    












    












    












    












    












    












    












    












    





<matplotlib.figure.Figure at 0x10aa53c50>



In [38]:

    
cophenetic_inconsistency = np.zeros((len(D),len(D)))
for i in range(len(D)):
    for j in range(len(D)):
        cophenetic_inconsistency[i,j] = np.sum((D[i]-D[j])**2)
plt.imshow(cophenetic_inconsistency,interpolation='none',cmap='Blues')
plt.colorbar()









    Out[38]:





<matplotlib.colorbar.Colorbar at 0x10bb0f4e0>



In [65]:

    
cophenetic_inaccuracy = np.zeros(len(D))
for i in range(len(D)):
    cophenetic_inaccuracy[i] = np.sum((D[i]-D_true*(D[i].mean() / D_true.mean()))**2)
#plt.imshow(cophenetic_inconsistency,interpolation='none',cmap='Blues')
#plt.colorbar()
plt.bar(range(len(D)),cophenetic_inaccuracy/len(D_true))









    Out[65]:





<Container object of 10 artists>



In [64]:

    
D[0],D_true*(D[0].mean() / D_true.mean())









    Out[64]:





(array([ 1.49872455,  1.49872455,  3.1576942 , ...,  2.01262438,
         0.71845327,  2.01262438]),
 array([ 1.67964019,  1.67964019,  1.95003938, ...,  1.67964019,
         1.46830927,  1.67964019]))



In [55]:

    
# how well does it preserve the relative cophenetic distances?
coph_order = np.array(sorted(np.arange(len(D_true)),key=lambda i:D_true[i]))



In [58]:

    
def order_of_distances(pairwise_distances):
    return np.array(sorted(np.arange(len(pairwise_distances)),
                           key=lambda i:pairwise_distances[i]))



In [87]:

    
sl_preservation = np.array([spearmanr(D_true,d) for d in D])
print(sl_preservation)









    



[[ 0.80497668  0.        ]
 [ 0.89727265  0.        ]
 [ 0.7945124   0.        ]
 [ 0.78215153  0.        ]
 [ 0.75372019  0.        ]
 [ 0.87580903  0.        ]
 [ 0.66437276  0.        ]
 [ 0.81000696  0.        ]
 [ 0.77369214  0.        ]
 [ 0.84767451  0.        ]]



In [89]:

    
sl_preservation.mean(0)[0],sl_preservation.std(0)[0],









    Out[89]:





(0.80041888353949964, 0.062595801447962354)



In [80]:

    
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_s)



In [81]:

    
d_pca = hierarchy.cophenet(hierarchy.linkage(X_pca,'single'))



In [82]:

    
spearmanr(D_true,d_pca)









    Out[82]:





(-0.0016329806155463997, 0.24845423056222249)



In [83]:

    
from sklearn.manifold import TSNE
tsne = TSNE()
X_tsne = tsne.fit_transform(X_s)
d_tsne = hierarchy.cophenet(hierarchy.linkage(X_tsne,'single'))



In [84]:

    
spearmanr(D_true,d_tsne)









    Out[84]:





(0.27196255133439384, 0.0)



In [85]:

    
linkage = 'complete'
d_true = hierarchy.cophenet(hierarchy.linkage(X_s,linkage))
d_tsne = hierarchy.cophenet(hierarchy.linkage(X_tsne,linkage))
spearmanr(d_true,d_tsne)









    Out[85]:





(0.25316209355552, 0.0)



In [86]:

    
d_true = hierarchy.cophenet(hierarchy.linkage(X_s,linkage))
d_single = hierarchy.cophenet(hierarchy.linkage(X_[0],linkage))
spearmanr(d_true,d_single)









    Out[86]:





(0.13019556211901576, 0.0)



In [66]:

    
spearmanr(D[1],D[0])









    Out[66]:





(0.70295505944230774, 0.0)



In [71]:

    
np.array(D).shape









    Out[71]:





(10, 499500)



In [73]:

    
spearmanr(pdist(X_s),pdist(X_[0]))









    Out[73]:





(0.11175886055285705, 0.0)



In [ ]:



In [72]:









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-72-d05178017768> in <module>()
----> 1 spearmanr(D)

/Users/joshuafass/anaconda/lib/python3.4/site-packages/scipy/stats/stats.py in spearmanr(a, b, axis)
   2736     """
   2737     a, axisout = _chk_asarray(a, axis)
-> 2738     ar = np.apply_along_axis(rankdata,axisout,a)
   2739 
   2740     br = None

/Users/joshuafass/anaconda/lib/python3.4/site-packages/numpy/lib/shape_base.py in apply_along_axis(func1d, axis, arr, *args, **kwargs)
    126                 n -= 1
    127             i.put(indlist, ind)
--> 128             res = func1d(arr[tuple(i.tolist())], *args, **kwargs)
    129             outarr[tuple(i.tolist())] = res
    130             k += 1

KeyboardInterrupt:



In [62]:

    
spearmanr(npr.rand(len(D_true)),npr.rand(len(D_true)))









    Out[62]:





(-0.001165309677910792, 0.41017569073697491)



In [ ]:

    
from sklearn.decomposition import KernelPCA



In [ ]:



In [ ]:



In [57]:

    
from scipy.stats import spearmanr
order_of_distances(D_true),order_of_distances(D[0])



In [30]:

    
hierarchy.cophenet(C[0])









    Out[30]:





(499500,)



In [27]:

    
C[0]









    Out[27]:





array([[  4.37000000e+02,   4.86000000e+02,   1.80252320e-01,
          2.00000000e+00],
       [  1.27000000e+02,   9.12000000e+02,   1.85243762e-01,
          2.00000000e+00],
       [  8.90000000e+02,   9.18000000e+02,   1.90887479e-01,
          2.00000000e+00],
       ..., 
       [  3.50000000e+01,   1.99500000e+03,   4.22645481e+00,
          9.98000000e+02],
       [  3.05000000e+02,   1.99600000e+03,   4.37952780e+00,
          9.99000000e+02],
       [  5.97000000e+02,   1.99700000e+03,   4.59068060e+00,
          1.00000000e+03]])



In [ ]:

    
hierarchy.linkage()



In [18]:

    
compare_pairwise_distances(X_[0],X_[1])









    Out[18]:





52076478.112204485



In [ ]:



In [ ]:

    
# compare the hierarchical clusterings induced by each of several embeddings



In [ ]: