In [6]:
import sys
sys.path.append('../scripts/')
from tpe import TPE

In [4]:
from scipy.cluster import hierarchy
from scipy.spatial import distance
from scipy.spatial.distance import squareform,pdist

In [2]:
from sklearn import datasets
data = datasets.load_digits()
X = data.data
Y = data.target

In [20]:
import numpy as np
import numpy.random as npr
n = 1000
npr.seed(0)
rand = npr.rand(len(X))
ind = sorted(np.arange(len(X)),key=lambda i:rand[i])
X_s = X[ind][:n]
Y_s = Y[ind][:n]

In [ ]:


In [ ]:


In [42]:
c = hierarchy.linkage(X_s,'single')

In [13]:
from time import time

In [14]:
X_ = []
t = time()
for i in range(10):
    tpe = TPE()
    X_.append(tpe.fit_transform(X_s))
    print(i)
    print(time() - t)


0
25.589579105377197
1
51.85299801826477
2
79.2054591178894
3
105.24459218978882
4
131.55915212631226
5
157.11906719207764
6
183.43748307228088
7
209.51415920257568
8
236.66364312171936
9
262.7641382217407

In [22]:
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
def plot(X,Y,title='Embedding'):
    plt.scatter(X[:,0],X[:,1],c=Y,linewidths=0)
    plt.title(title)

In [24]:
for x in X_:
    plot(x,Y_s)
    plt.figure()


<matplotlib.figure.Figure at 0x10baa3748>

In [15]:
def compare_pairwise_distances(X1,X2):
    return np.sum((pdist(X1) - pdist(X2))**2)

In [25]:
C = [hierarchy.linkage(x,'single') for x in X_]

In [33]:
D = [hierarchy.cophenet(c) for c in C]

In [43]:
D_true = hierarchy.cophenet(c)

In [35]:
for d in D:
    plt.imshow(squareform(d),interpolation='none',cmap='Blues')
    plt.figure()


<matplotlib.figure.Figure at 0x10aa53c50>

In [38]:
cophenetic_inconsistency = np.zeros((len(D),len(D)))
for i in range(len(D)):
    for j in range(len(D)):
        cophenetic_inconsistency[i,j] = np.sum((D[i]-D[j])**2)
plt.imshow(cophenetic_inconsistency,interpolation='none',cmap='Blues')
plt.colorbar()


Out[38]:
<matplotlib.colorbar.Colorbar at 0x10bb0f4e0>

In [65]:
cophenetic_inaccuracy = np.zeros(len(D))
for i in range(len(D)):
    cophenetic_inaccuracy[i] = np.sum((D[i]-D_true*(D[i].mean() / D_true.mean()))**2)
#plt.imshow(cophenetic_inconsistency,interpolation='none',cmap='Blues')
#plt.colorbar()
plt.bar(range(len(D)),cophenetic_inaccuracy/len(D_true))


Out[65]:
<Container object of 10 artists>

In [64]:
D[0],D_true*(D[0].mean() / D_true.mean())


Out[64]:
(array([ 1.49872455,  1.49872455,  3.1576942 , ...,  2.01262438,
         0.71845327,  2.01262438]),
 array([ 1.67964019,  1.67964019,  1.95003938, ...,  1.67964019,
         1.46830927,  1.67964019]))

In [55]:
# how well does it preserve the relative cophenetic distances?
coph_order = np.array(sorted(np.arange(len(D_true)),key=lambda i:D_true[i]))

In [58]:
def order_of_distances(pairwise_distances):
    return np.array(sorted(np.arange(len(pairwise_distances)),
                           key=lambda i:pairwise_distances[i]))

In [87]:
sl_preservation = np.array([spearmanr(D_true,d) for d in D])
print(sl_preservation)


[[ 0.80497668  0.        ]
 [ 0.89727265  0.        ]
 [ 0.7945124   0.        ]
 [ 0.78215153  0.        ]
 [ 0.75372019  0.        ]
 [ 0.87580903  0.        ]
 [ 0.66437276  0.        ]
 [ 0.81000696  0.        ]
 [ 0.77369214  0.        ]
 [ 0.84767451  0.        ]]

In [89]:
sl_preservation.mean(0)[0],sl_preservation.std(0)[0],


Out[89]:
(0.80041888353949964, 0.062595801447962354)

In [80]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_s)

In [81]:
d_pca = hierarchy.cophenet(hierarchy.linkage(X_pca,'single'))

In [82]:
spearmanr(D_true,d_pca)


Out[82]:
(-0.0016329806155463997, 0.24845423056222249)

In [83]:
from sklearn.manifold import TSNE
tsne = TSNE()
X_tsne = tsne.fit_transform(X_s)
d_tsne = hierarchy.cophenet(hierarchy.linkage(X_tsne,'single'))

In [84]:
spearmanr(D_true,d_tsne)


Out[84]:
(0.27196255133439384, 0.0)

In [85]:
linkage = 'complete'
d_true = hierarchy.cophenet(hierarchy.linkage(X_s,linkage))
d_tsne = hierarchy.cophenet(hierarchy.linkage(X_tsne,linkage))
spearmanr(d_true,d_tsne)


Out[85]:
(0.25316209355552, 0.0)

In [86]:
d_true = hierarchy.cophenet(hierarchy.linkage(X_s,linkage))
d_single = hierarchy.cophenet(hierarchy.linkage(X_[0],linkage))
spearmanr(d_true,d_single)


Out[86]:
(0.13019556211901576, 0.0)

In [66]:
spearmanr(D[1],D[0])


Out[66]:
(0.70295505944230774, 0.0)

In [71]:
np.array(D).shape


Out[71]:
(10, 499500)

In [73]:
spearmanr(pdist(X_s),pdist(X_[0]))


Out[73]:
(0.11175886055285705, 0.0)

In [ ]:


In [72]:



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-72-d05178017768> in <module>()
----> 1 spearmanr(D)

/Users/joshuafass/anaconda/lib/python3.4/site-packages/scipy/stats/stats.py in spearmanr(a, b, axis)
   2736     """
   2737     a, axisout = _chk_asarray(a, axis)
-> 2738     ar = np.apply_along_axis(rankdata,axisout,a)
   2739 
   2740     br = None

/Users/joshuafass/anaconda/lib/python3.4/site-packages/numpy/lib/shape_base.py in apply_along_axis(func1d, axis, arr, *args, **kwargs)
    126                 n -= 1
    127             i.put(indlist, ind)
--> 128             res = func1d(arr[tuple(i.tolist())], *args, **kwargs)
    129             outarr[tuple(i.tolist())] = res
    130             k += 1

KeyboardInterrupt: 

In [62]:
spearmanr(npr.rand(len(D_true)),npr.rand(len(D_true)))


Out[62]:
(-0.001165309677910792, 0.41017569073697491)

In [ ]:
from sklearn.decomposition import KernelPCA

In [ ]:


In [ ]:


In [57]:
from scipy.stats import spearmanr
order_of_distances(D_true),order_of_distances(D[0])

In [30]:
hierarchy.cophenet(C[0])


Out[30]:
(499500,)

In [27]:
C[0]


Out[27]:
array([[  4.37000000e+02,   4.86000000e+02,   1.80252320e-01,
          2.00000000e+00],
       [  1.27000000e+02,   9.12000000e+02,   1.85243762e-01,
          2.00000000e+00],
       [  8.90000000e+02,   9.18000000e+02,   1.90887479e-01,
          2.00000000e+00],
       ..., 
       [  3.50000000e+01,   1.99500000e+03,   4.22645481e+00,
          9.98000000e+02],
       [  3.05000000e+02,   1.99600000e+03,   4.37952780e+00,
          9.99000000e+02],
       [  5.97000000e+02,   1.99700000e+03,   4.59068060e+00,
          1.00000000e+03]])

In [ ]:
hierarchy.linkage()

In [18]:
compare_pairwise_distances(X_[0],X_[1])


Out[18]:
52076478.112204485

In [ ]:


In [ ]:
# compare the hierarchical clusterings induced by each of several embeddings

In [ ]: