In [6]:
import sys
sys.path.append('../scripts/')
from tpe import TPE
In [4]:
from scipy.cluster import hierarchy
from scipy.spatial import distance
from scipy.spatial.distance import squareform,pdist
In [2]:
from sklearn import datasets
# Load scikit-learn's digits dataset; X holds the feature rows, Y the targets.
data = datasets.load_digits()
X, Y = data.data, data.target
In [20]:
import numpy as np
import numpy.random as npr
# Draw a reproducible random subsample of n rows, keeping features and targets aligned.
n = 1000
npr.seed(0)
rand = npr.rand(len(X))
# Ordering the row indices by an independent uniform key yields a random permutation.
# mergesort is stable, so equal keys keep their original index order.
ind = np.argsort(rand, kind='mergesort')
# Slice the permutation first so only n rows are copied.
X_s = X[ind[:n]]
Y_s = Y[ind[:n]]
In [ ]:
In [ ]:
In [42]:
# Single-linkage hierarchical clustering of the subsample X_s; the linkage matrix is kept in `c`.
c = hierarchy.linkage(X_s,'single')
In [13]:
from time import time
In [14]:
# Fit 10 TPE embeddings of the subsample, collecting them in X_ and timing the batch.
# NOTE(review): indentation was lost in this export; the elapsed-time print is assumed
# to come after the loop (total wall time) — confirm against the notebook.
X_ = []
t = time()
for i in range(10):
    tpe = TPE()
    embedding = tpe.fit_transform(X_s)
    X_.append(embedding)
    print(i)
print(time() - t)
In [22]:
import matplotlib.pyplot as plt
%matplotlib inline
In [19]:
def plot(X,Y,title='Embedding'):
    """Scatter the first two columns of X, coloured by Y, via pyplot.

    X     : array supporting ``X[:, k]`` indexing; column 0 is drawn on the
            horizontal axis and column 1 on the vertical axis.
    Y     : per-point values handed to matplotlib as the colour argument ``c``.
    title : text used for the plot title.
    """
    horizontal, vertical = X[:,0], X[:,1]
    plt.scatter(horizontal, vertical, c=Y, linewidths=0)
    plt.title(title)
In [24]:
# One scatter plot per embedding. The figure is opened *before* drawing so that
# no empty figure is left over after the last embedding has been plotted.
for x in X_:
    plt.figure()
    plot(x,Y_s)
In [15]:
def compare_pairwise_distances(X1,X2):
    """Sum of squared differences between the pairwise distances of X1 and X2.

    X1, X2 : collections of points (one row per point) accepted by
             ``scipy.spatial.distance.pdist``; both must contain the same
             number of points so that their condensed distance vectors
             line up entry by entry.

    Returns the scalar ``sum((pdist(X1) - pdist(X2))**2)``.

    Raises ValueError when the two inputs yield condensed distance vectors
    of different lengths. Without this check a two-point input (a single
    pairwise distance) would silently broadcast against a larger one.
    """
    d1 = pdist(X1)
    d2 = pdist(X2)
    if d1.shape != d2.shape:
        raise ValueError(
            'X1 and X2 must contain the same number of points '
            '(got %d and %d pairwise distances)' % (d1.shape[0], d2.shape[0]))
    return np.sum((d1 - d2)**2)
In [25]:
# Single-linkage clustering of every embedding stored in X_.
C = [hierarchy.linkage(embedded, 'single') for embedded in X_]
In [33]:
# Condensed cophenetic distances for each linkage in C.
# The loop variable is deliberately not called `c`: under Python 2 a list-comprehension
# variable leaks into the enclosing scope and would overwrite the module-level linkage `c`.
D = [hierarchy.cophenet(Z) for Z in C]
In [43]:
# Cophenetic distances for the linkage `c`; later cells use this as the reference ("true") vector.
D_true = hierarchy.cophenet(c)
In [35]:
# Show each run's cophenetic distances as a square matrix image. The figure is opened
# before drawing so that no empty figure is left over after the last run.
for d in D:
    plt.figure()
    plt.imshow(squareform(d),interpolation='none',cmap='Blues')
In [38]:
# Sum of squared differences between the cophenetic-distance vectors of every pair of runs.
cophenetic_inconsistency = np.zeros((len(D),len(D)))
for i, d_i in enumerate(D):
    for j, d_j in enumerate(D):
        cophenetic_inconsistency[i,j] = np.sum((d_i - d_j)**2)
plt.imshow(cophenetic_inconsistency,interpolation='none',cmap='Blues')
plt.colorbar()
Out[38]:
In [65]:
# Squared error between each run's cophenetic distances and the reference D_true,
# after rescaling D_true so that both vectors have the same mean.
true_mean = D_true.mean()  # loop-invariant, so computed once
cophenetic_inaccuracy = np.zeros(len(D))
for i in range(len(D)):
    rescaled_true = D_true*(D[i].mean() / true_mean)
    cophenetic_inaccuracy[i] = np.sum((D[i]-rescaled_true)**2)
# Dividing by the number of entries in D_true turns each sum into a mean squared error.
plt.bar(range(len(D)),cophenetic_inaccuracy/len(D_true))
Out[65]:
In [64]:
# Show run 0's cophenetic distances next to D_true rescaled to the same mean.
D[0],D_true*(D[0].mean() / D_true.mean())
Out[64]:
In [55]:
# how well does it preserve the relative cophenetic distances?
# Indices of D_true in ascending order of distance. mergesort is stable, so tied
# distances keep their original index order.
coph_order = np.argsort(D_true, kind='mergesort')
In [58]:
def order_of_distances(pairwise_distances):
    """Return the indices that sort ``pairwise_distances`` in ascending order.

    pairwise_distances : 1-D sequence or array of distances (for example a
                         condensed distance vector).

    Returns an integer index array ``order`` such that
    ``pairwise_distances[order[k]]`` is the k-th smallest entry. The sort is
    stable: equal distances keep their original relative order. An empty
    input gives an empty integer array.
    """
    # Vectorised argsort instead of a Python-level sort with a key function;
    # mergesort is NumPy's stable sort kind.
    return np.argsort(np.asarray(pairwise_distances), kind='mergesort')
In [87]:
# Imported here, at first use, so the cell also runs on a fresh kernel executed top to bottom.
from scipy.stats import spearmanr

# Spearman rank correlation between the reference cophenetic distances and each run's.
# Each row holds what spearmanr returns for one run: (correlation, p-value).
sl_preservation = np.array([spearmanr(D_true,d) for d in D])
print(sl_preservation)
In [89]:
# Mean and standard deviation over the rows of sl_preservation, taken for column 0
# (the correlation value that spearmanr returns first).
sl_preservation.mean(0)[0],sl_preservation.std(0)[0],
Out[89]:
In [80]:
from sklearn.decomposition import PCA
# Two-component PCA embedding of the subsample.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_s)
In [81]:
# Cophenetic distances of the single-linkage tree built on the PCA embedding.
d_pca = hierarchy.cophenet(hierarchy.linkage(X_pca,'single'))
In [82]:
# Rank correlation between the reference cophenetic distances and those of the PCA embedding.
spearmanr(D_true,d_pca)
Out[82]:
In [83]:
from sklearn.manifold import TSNE
# t-SNE embedding of the subsample. random_state is fixed so that re-running this cell
# gives the same embedding regardless of how much of the global RNG stream was consumed.
tsne = TSNE(random_state=0)
X_tsne = tsne.fit_transform(X_s)
# Cophenetic distances of the single-linkage tree built on the t-SNE embedding.
d_tsne = hierarchy.cophenet(hierarchy.linkage(X_tsne,'single'))
In [84]:
# Rank correlation between the reference cophenetic distances and those held in d_tsne.
spearmanr(D_true,d_tsne)
Out[84]:
In [85]:
# Complete-linkage comparison: subsample X_s versus its t-SNE embedding.
# NOTE(review): this rebinds d_tsne, which earlier held the 'single'-linkage result.
linkage = 'complete'
Z_true = hierarchy.linkage(X_s,linkage)
Z_tsne = hierarchy.linkage(X_tsne,linkage)
d_true = hierarchy.cophenet(Z_true)
d_tsne = hierarchy.cophenet(Z_tsne)
spearmanr(d_true,d_tsne)
Out[85]:
In [86]:
# Comparison of the subsample X_s with the first embedding in X_, clustered with the
# method currently stored in the module-level variable `linkage`.
# NOTE(review): despite its name, d_single is computed with whatever `linkage` holds
# ('complete' where it is assigned in this notebook) — confirm this is intended.
first_embedding = X_[0]
d_true = hierarchy.cophenet(hierarchy.linkage(X_s,linkage))
d_single = hierarchy.cophenet(hierarchy.linkage(first_embedding,linkage))
spearmanr(d_true,d_single)
Out[86]:
In [66]:
# Rank correlation between the cophenetic distances of two runs (indices 1 and 0).
spearmanr(D[1],D[0])
Out[66]:
In [71]:
# Shape of the per-run cophenetic-distance vectors stacked into one array.
np.array(D).shape
Out[71]:
In [73]:
# Rank correlation of raw pairwise distances: subsample X_s versus the first embedding in X_.
spearmanr(pdist(X_s),pdist(X_[0]))
Out[73]:
In [ ]:
In [72]:
In [62]:
# Rank correlation of two independently drawn uniform random vectors, each as long as D_true.
spearmanr(npr.rand(len(D_true)),npr.rand(len(D_true)))
Out[62]:
In [ ]:
from sklearn.decomposition import KernelPCA
In [ ]:
In [ ]:
In [57]:
from scipy.stats import spearmanr
# Sorting permutations of the reference distances and of run 0's distances, shown together.
order_of_distances(D_true),order_of_distances(D[0])
In [30]:
# Cophenetic distances for the first linkage in C.
hierarchy.cophenet(C[0])
Out[30]:
In [27]:
# First linkage matrix in C.
C[0]
Out[27]:
In [ ]:
# Calling hierarchy.linkage() without its required observations argument raises TypeError
# and stops a top-to-bottom run. Display the function's documentation instead.
# NOTE(review): assumes this scratch cell was meant to inspect the signature — confirm or delete.
help(hierarchy.linkage)
In [18]:
# Sum of squared differences between the pairwise distances of the first two embeddings in X_.
compare_pairwise_distances(X_[0],X_[1])
Out[18]:
In [ ]:
In [ ]:
# compare the hierarchical clusterings induced by each of several embeddings
In [ ]: