In [68]:
import sys
sys.path.append('../scripts/')
In [285]:
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
%matplotlib inline
plt.rcParams['font.family']='Serif'
In [2]:
npr.seed(0)  # fix the RNG so the notebook is reproducible
In [3]:
# Two well-separated Gaussian blobs in 3-D (second blob offset by 5 on every axis).
X = np.vstack((npr.randn(1000,3),
               npr.randn(1000,3)+5))
In [ ]:
# NOTE(review): np.vstack(([])) raises ValueError ("need at least one array to
# concatenate").  This cell was never executed (In [ ]) and looks like leftover
# scratch -- candidate for deletion.
X = np.vstack(([]))
In [423]:
import itertools
dim=3
radius=1
# All 2**dim corners of a hypercube: itertools.product enumerates every +/-1
# sign pattern, and the affine map (i*radius+radius)/2 sends -1 -> 0 and
# +1 -> radius, so corners lie on {0, radius}^dim.
corners = [(np.array(i)*radius+radius)/2 for i in itertools.product([-1,1],repeat=dim)]
In [424]:
def coord_str(array):
    """Format a numeric vector as a tuple-like string, e.g. [1.0, 0.0] -> "(1, 0)".

    Each component is truncated to int before formatting.
    """
    parts = (str(int(component)) for component in array)
    return "({0})".format(", ".join(parts))
In [425]:
from scipy.spatial.distance import pdist,squareform
# Heat map of pairwise Euclidean distances between the 2**dim hypercube corners.
plt.imshow(squareform(pdist(corners)),interpolation='none',cmap='Blues')
plt.colorbar()
#plt.axis('off')
# Label each row/column with the corner's integer coordinates.
plt.yticks(np.arange(len(corners)),[coord_str(c) for c in corners])
plt.xticks(np.arange(len(corners)),[coord_str(c) for c in corners],rotation=30)
Out[425]:
In [419]:
# Save a distance-matrix image for hypercubes of growing dimension.  There are
# 2**d corners, so the matrix is 2**d x 2**d (d=11 already gives 2048x2048).
for d in range(1,12):
    corners = [np.array(i) for i in itertools.product([-1,1],repeat=d)]
    plt.imshow(squareform(pdist(corners)),interpolation='none')
    plt.colorbar()
    #plt.yticks(np.arange(len(corners)),[str(c) for c in corners])
    plt.title('{0}D hypercube'.format(d))
    plt.savefig('../figures/hypercube-d={0}.jpg'.format(d),dpi=300)
    plt.close()  # close each figure so the loop doesn't accumulate memory
In [400]:
# Same figures, unlabeled and in the 'Blues' colormap (presentation versions).
for d in range(1,13):
    corners = [np.array(i) for i in itertools.product([-1,1],repeat=d)]
    plt.imshow(squareform(pdist(corners)),interpolation='none',cmap='Blues')
    plt.axis('off')
    #plt.colorbar()
    #plt.yticks(np.arange(len(corners)),[str(c) for c in corners])
    #plt.title('{0}D hypercube'.format(d))
    plt.savefig('../figures/blue-hypercube-nolabel-d={0}.jpg'.format(d),dpi=300)
    plt.close()
In [ ]:
In [182]:
# Distribution of pairwise distances between the 8192 corners of a 13-D hypercube.
corners = [np.array(i) for i in itertools.product([-1,1],repeat=13)]
plt.hist(pdist(corners),bins=10);
In [183]:
# Corners differ in k of the 13 coordinates, so the distances 2*sqrt(k) take at
# most 13 distinct values.
len(set(pdist(corners)))
Out[183]:
In [ ]:
print(corners[0])
In [279]:
# Gaussian cluster of n points around every corner; Y labels each cluster 0..len(corners)-1.
# NOTE(review): under Restart-and-Run-All this fails -- `corners` here is the
# 13-D set from the cell above while `dim` is still 3, so npr.randn(n,dim)+c
# broadcasts (n,3) against (13,).  The execution counts (In [279] vs In [182])
# show this cell originally ran against different, earlier state.
n=100
X = np.vstack([npr.randn(n,dim) + c for c in corners])
Y = np.hstack([np.ones(n)*r for r in range(len(corners))])
X.shape,Y.shape
Out[279]:
In [240]:
def generate_hypercube(dim=3, radius=4, n_tot=100):
    """Sample isotropic Gaussian clusters centered on the corners of a hypercube.

    Parameters
    ----------
    dim : dimensionality of the hypercube (there are 2**dim corners/clusters).
    radius : corners are placed at {-radius, +radius}**dim.
    n_tot : target total number of points; each corner gets
        n_tot // 2**dim points, so the actual total can be slightly smaller.

    Returns
    -------
    X : (n * 2**dim, dim) array of unit-variance Gaussian samples around the corners.
    Y : matching 1-D float array of cluster labels, 0 .. 2**dim - 1, in blocks.
    """
    corners = [np.array(i) * radius for i in itertools.product([-1, 1], repeat=dim)]
    # Floor division keeps the arithmetic exact; int(n_tot / len(corners)) goes
    # through float division, which can round incorrectly for large n_tot.
    n = n_tot // len(corners)
    X = np.vstack([npr.randn(n, dim) + c for c in corners])
    Y = np.hstack([np.ones(n) * r for r in range(len(corners))])
    return X, Y
In [318]:
# 16 clusters on the corners of a 4-D hypercube (1400 points -> 87 per corner).
X,Y = generate_hypercube(4,n_tot=1400)
In [319]:
# View the first three of the four dimensions.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0],X[:,1],X[:,2],c=Y,linewidths=0,alpha=0.5)
In [335]:
# ... and just the first two raw dimensions.
plt.scatter(X[:,0],X[:,1],c=Y,linewidths=0,alpha=0.5)
In [321]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X)
In [322]:
pca.explained_variance_ratio_
In [323]:
# 2-D linear projection onto the top two principal components.
X_ = pca.transform(X)[:,:2]
In [359]:
plt.scatter(X_[:,0],X_[:,1],c=Y,linewidths=0,alpha=0.5)
# Hide the tick labels.  The label* arguments take booleans; the string 'off'
# was deprecated and then removed in matplotlib 3.0.
plt.tick_params(labelleft=False,
                labelbottom=False)  # labels along the bottom edge are off
In [445]:
# Compare PCA / TPE / t-SNE 2-D embeddings of hypercube data for growing
# dimensionality: one row per d, one column per algorithm.
# NOTE(review): TPE and TSNE are imported further down in the notebook
# (In [303] / In [311]); under Restart-and-Run-All this cell fails unless those
# imports are moved above it.
ds = list(range(3,7))
#algs = []
results = dict()
num_algs = 3
for i,d in enumerate(ds):
    X,Y = generate_hypercube(d,n_tot=1400)
    # --- PCA ---
    pca = PCA()
    pca.fit(X)
    X_ = pca.transform(X)[:,:2]
    plt.subplot(len(ds),num_algs,(num_algs*i)+1)
    #plt.title('PCA: {0}D hypercube'.format(d))
    if i == 0:
        plt.title('PCA')
    plt.scatter(X_[:,0],X_[:,1],c=Y,linewidths=0,alpha=0.5,s=3)
    # Booleans, not 'off': string values for the label* args were removed in matplotlib 3.0.
    plt.tick_params(labelleft=False,labelbottom=False)
    #plt.xlabel('PC1')
    #plt.ylabel('PC2')
    plt.ylabel('{0}D'.format(d))
    #X_tpe = X_tsne = X_
    # --- TPE (complete-linkage variant) ---
    tpe = TPE('complete')
    X_tpe = tpe.fit_transform(X)
    plt.subplot(len(ds),num_algs,(num_algs*i)+2)
    if i == 0:
        plt.title('TPE')
    #plt.title('TPE: {0}D hypercube'.format(d))
    plt.scatter(X_tpe[:,0],X_tpe[:,1],c=Y,linewidths=0,alpha=0.5,s=3)
    plt.tick_params(labelleft=False,labelbottom=False)
    # --- t-SNE ---
    tsne = TSNE()
    X_tsne = tsne.fit_transform(X)
    plt.subplot(len(ds),num_algs,(num_algs*i)+3)
    if i == 0:
        plt.title('t-SNE')
    plt.scatter(X_tsne[:,0],X_tsne[:,1],c=Y,linewidths=0,alpha=0.5,s=3)
    plt.tick_params(labelleft=False,labelbottom=False)
    results[d] = (X_,X_tpe,X_tsne)
plt.tight_layout()
plt.savefig('blah-complete.pdf')
In [446]:
# 1-NN label-consistency score for each stored embedding (PCA, TPE, t-SNE).
# NOTE(review): one_nn_class_baseline is defined further down (In [299]);
# under Restart-and-Run-All this cell fails unless that definition runs first.
for d in results:
    for a in results[d]:
        # Regenerate labels only: generate_hypercube's Y depends just on d and
        # n_tot (deterministic block layout), so it matches the stored embeddings
        # even though the X draw is fresh and discarded.
        _,Y = generate_hypercube(d,n_tot=1400)
        print(one_nn_class_baseline(a,Y))
    print()
In [482]:
# Per-cluster centroids in the original 6-D space and in each 2-D embedding.
d = 6
X_pca,X_tpe,X_tsne = results[d]
# Fresh draw of X,Y; Y's block structure is deterministic, so the label masks
# line up with the stored embeddings even though X itself is a new sample.
X,Y = generate_hypercube(d,n_tot=1400)
# NOTE(review): this relies on set(Y) iterating in the same order in all four
# comprehensions -- true in CPython for identical contents, but np.unique(Y)
# would be sorted and unambiguous.
cluster_centers = np.array([np.mean(X[Y==y],0) for y in set(Y)])
cluster_centers_pca = np.array([np.mean(X_pca[Y==y],0) for y in set(Y)])
cluster_centers_tpe = np.array([np.mean(X_tpe[Y==y],0) for y in set(Y)])
cluster_centers_tsne = np.array([np.mean(X_tsne[Y==y],0) for y in set(Y)])
In [499]:
# The three scatter blocks below were copy-pasted with only the data and the
# label text changed; factor them into one helper + three calls.
def plot_center_dists(embedded_centers, name):
    """Scatter embedded vs. ground-truth inter-cluster distances, then open a new figure.

    Reads `cluster_centers` (the 6-D ground-truth centroids) from the notebook namespace.
    """
    plt.scatter(pdist(embedded_centers),pdist(cluster_centers),alpha=0.5,linewidths=0)
    plt.xlabel("Inter-cluster distances in {0} embedding".format(name))
    plt.ylabel("Ground-truth inter-cluster distances")
    plt.title(name)
    plt.figure()

plot_center_dists(cluster_centers_pca, 'PCA')
plot_center_dists(cluster_centers_tpe, 'Approx. TPE')
plot_center_dists(cluster_centers_tsne, 't-SNE')
Out[499]:
In [501]:
from sklearn.metrics import r2_score
# How well does each embedding preserve inter-cluster distances?
# R^2 of embedded pairwise centroid distances against the 6-D ground truth.
r2_score(pdist(cluster_centers),pdist(cluster_centers_pca))
Out[501]:
In [502]:
r2_score(pdist(cluster_centers),pdist(cluster_centers_tpe))
Out[502]:
In [503]:
r2_score(pdist(cluster_centers),pdist(cluster_centers_tsne))
Out[503]:
In [487]:
# Four copies of the same imshow block, collapsed into a loop.  The original
# opened a new figure after every image except the last; preserve that.
center_sets = [(cluster_centers, 'Ground-truth'),
               (cluster_centers_pca, 'PCA'),
               (cluster_centers_tpe, 'TPE'),
               (cluster_centers_tsne, 't-SNE')]
for i, (centers, name) in enumerate(center_sets):
    plt.imshow(squareform(pdist(centers)),interpolation='none',cmap='Blues')
    plt.title('{0} inter-cluster distances'.format(name))
    if i < len(center_sets) - 1:
        plt.figure()
Out[487]:
In [508]:
# Baseline: how well CAN 2-D preserve the 6-D inter-cluster distances?
# MDS optimizes a 2-D configuration directly against the precomputed distances.
from sklearn.manifold import MDS
mds = MDS(dissimilarity='precomputed')
mds_centers = mds.fit_transform(squareform(pdist(cluster_centers)))
In [509]:
r2_score(pdist(cluster_centers),pdist(mds_centers))
Out[509]:
In [510]:
plt.imshow(squareform(pdist(mds_centers)),interpolation='none',cmap='Blues')
plt.title('Directly optimized inter-cluster distances')
Out[510]:
In [513]:
plt.scatter(pdist(mds_centers),pdist(cluster_centers),alpha=0.5,linewidths=0)
plt.xlabel("Directly optimized inter-cluster distances (2D)")
plt.ylabel("Ground-truth inter-cluster distances (6D)")
plt.figure()
Out[513]:
In [340]:
# Scratch sanity checks below (cluster counts, namespace state).
for i,d in enumerate(ds):
    print(i,d)
In [331]:
len(set(Y))
Out[331]:
In [332]:
2**6
Out[332]:
In [329]:
2**7
Out[329]:
In [297]:
X_.shape,Y.shape
Out[297]:
In [298]:
# 2-D FastICA projection of X, colored by cluster label.
from sklearn.decomposition import FastICA
ica = FastICA(n_components=2)
X_ica = ica.fit_transform(X)
plt.scatter(X_ica[:,0],X_ica[:,1],c=Y,linewidths=0,alpha=0.5)
Out[298]:
In [299]:
from sklearn import neighbors
def one_nn_baseline(X,Y):
    """Fraction of points whose nearest neighbor is the same in X and in Y.

    X and Y are two point sets with matching row order (e.g. original data and
    its embedding).  kneighbors_graph(., 2) is queried so that, after discarding
    a possible self-index, at least one true nearest neighbor remains per row.
    """
    one_nn_X = neighbors.kneighbors_graph(X,2)
    one_nn_Y = neighbors.kneighbors_graph(Y,2)
    sames = 0
    for i in range(len(X)):
        # Drop index i itself if present in row i's neighbor list, then take
        # the first remaining column index as the nearest neighbor.
        neighbor_X = one_nn_X[i].indices[one_nn_X[i].indices!=i][0]
        neighbor_Y = one_nn_Y[i].indices[one_nn_Y[i].indices!=i][0]
        if neighbor_X == neighbor_Y:
            sames+=1
    return 1.0*sames / len(X)
def one_nn_class_baseline(X,Y):
    """Leave-one-out 1-NN classification accuracy of labels Y on points X.

    Each point is "predicted" to have the label of its nearest non-self
    neighbor; returns the fraction of correct predictions.
    """
    one_nn = neighbors.kneighbors_graph(X,2)
    inds = np.zeros(len(X),dtype=int)
    for i in range(len(X)):
        inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
    preds = Y[inds]  # predicted label = label of nearest non-self neighbor
    return 1.0*sum(preds==Y) / len(Y)
In [300]:
# Neighbor preservation and 1-NN accuracy of the linear projections.
one_nn_baseline(X_,X),one_nn_class_baseline(X_,Y)
Out[300]:
In [301]:
one_nn_class_baseline(X_ica,Y)
Out[301]:
In [302]:
# example, if we have clusters centered on corners of a cube, then there's no linear projection that preserves all the clusters
In [303]:
from tpe import TPE  # project-local module (see sys.path.append('../scripts/') at the top)
In [304]:
tpe = TPE()
In [305]:
X_tpe = tpe.fit_transform(X)
In [306]:
plt.scatter(X_tpe[:,0],X_tpe[:,1],c=Y,linewidths=0,alpha=0.5)
Out[306]:
In [307]:
one_nn_class_baseline(X_tpe,Y)
Out[307]:
In [308]:
# Nonlinear baselines: Isomap, LLE, t-SNE, kernel PCA.
from sklearn.manifold import Isomap
iso = Isomap()
X_iso = iso.fit_transform(X)
plt.scatter(X_iso[:,0],X_iso[:,1],c=Y,linewidths=0,alpha=0.5)
Out[308]:
In [309]:
one_nn_class_baseline(X_iso,Y)
Out[309]:
In [310]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding()
X_lle = lle.fit_transform(X)
plt.scatter(X_lle[:,0],X_lle[:,1],c=Y,linewidths=0,alpha=0.5)
Out[310]:
In [311]:
from sklearn.manifold import TSNE
tsne = TSNE()
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:,0],X_tsne[:,1],c=Y,linewidths=0,alpha=0.5)
Out[311]:
In [ ]:
In [312]:
one_nn_class_baseline(X_tsne,Y)
Out[312]:
In [313]:
from sklearn.decomposition import KernelPCA
In [314]:
# RBF kernel PCA; gamma chosen by eye.
kpca = KernelPCA(n_components=2,kernel='rbf',gamma=0.05)
X_kpca = kpca.fit_transform(X)
In [315]:
plt.scatter(X_kpca[:,0],X_kpca[:,1],c=Y,linewidths=0,alpha=0.5)
Out[315]:
In [316]:
one_nn_class_baseline(X_kpca,Y)
Out[316]:
In [317]:
# Larger problem: 8-D hypercube (256 clusters), 2000 points.
X_l,Y_l = generate_hypercube(8,n_tot=2000)
kpca = KernelPCA(n_components=2,kernel='rbf',gamma=0.06)
X_kpca = kpca.fit_transform(X_l)
plt.scatter(X_kpca[:,0],X_kpca[:,1],c=Y_l,linewidths=0,alpha=0.5)
Out[317]:
In [ ]:
# what if the clusters are not isotropic
In [427]:
from scipy.cluster import hierarchy
In [438]:
# Corners on {0, radius}^6 for the hierarchical-clustering experiments below.
dim=6
radius=1
corners = [(np.array(i)*radius+radius)/2 for i in itertools.product([-1,1],repeat=dim)]
In [453]:
# dendrogram expects a linkage matrix, not raw points: hierarchy.to_tree takes a
# linkage matrix and returns a ClusterNode, which dendrogram cannot draw.  Use a
# complete-linkage matrix (consistent with Z = hierarchy.complete(corners) below).
hierarchy.dendrogram(hierarchy.complete(corners));
Out[453]:
In [454]:
# Complete-linkage hierarchical clustering of the corner set.
Z = hierarchy.complete(corners)
In [456]:
l = hierarchy.to_tree(Z)  # root ClusterNode of the hierarchy
In [515]:
# Sanity check on a classic manifold: the swiss roll.
from sklearn.datasets import make_swiss_roll
In [514]:
tpe = TPE()
In [516]:
X,Y = make_swiss_roll(1000)
In [517]:
X_ = tpe.fit_transform(X)
In [518]:
plt.scatter(X_[:,0],X_[:,1],c=Y,linewidths=0,alpha=0.5)
Out[518]:
In [519]:
# Ward-linkage variant of TPE on the same data.
tpe = TPE('ward')
X_ = tpe.fit_transform(X)
plt.scatter(X_[:,0],X_[:,1],c=Y,linewidths=0,alpha=0.5)
Out[519]:
In [520]:
tsne = TSNE()
X_ = tsne.fit_transform(X)
plt.scatter(X_[:,0],X_[:,1],c=Y,linewidths=0,alpha=0.5)
Out[520]:
In [521]:
# The original 3-D swiss roll, for reference.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0],X[:,1],X[:,2],c=Y,linewidths=0,alpha=0.5)
Out[521]:
In [ ]: