Question: how many connected components does the k-nearest neighbor graph have for a collection of handwritten digit images?


In [1]:
from sklearn import neighbors

In [2]:
from sklearn import datasets

In [3]:
import pylab as pl
%matplotlib inline
pl.rcParams['font.family'] = 'serif'

In [4]:
data = datasets.load_digits()

In [5]:
X = data.data
Y = data.target

In [6]:
from sklearn.decomposition import PCA
pca = PCA(n_components=64,whiten=True)

In [7]:
import networkx as nx
print(nx.__version__)


1.9.1

In [8]:
X_ = pca.fit_transform(X)

In [9]:
epsneighborhood = neighbors.graph.radius_neighbors_graph(X_,9.0)
G = nx.Graph(epsneighborhood)
nx.number_connected_components(G)


Out[9]:
38

In [10]:
kneighbors = neighbors.graph.kneighbors_graph(X_,3)#,mode='distance')
G = nx.Graph(kneighbors)
nx.number_connected_components(G)


Out[10]:
4

Okay, interesting: the 3-nearest-neighbor graph has only 4 connected components.


In [11]:
cc = [c for c in nx.connected_components(G)]

In [12]:
for c in cc:
    print(Y[c])


[0 1 2 ..., 8 9 8]
[7 4 7 7]
[4 4 4 4 4 4 4 4]
[4 4 4]

In [13]:
pl.imshow(data.images[cc[0][0]],cmap='gray')


Out[13]:
<matplotlib.image.AxesImage at 0x11077e390>

In [14]:
sum(kneighbors.toarray()[1])


Out[14]:
3.0

What do the locally linear maps between tangent spaces look like for embeddings learned by Isomap or LLE (or LDA), for example?
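
To make this concrete (this is what the pinv-based cells further down compute): around each point $x_i$, look for the linear map $C_i$ that best carries input-space displacements to embedding-space displacements,

$$C_i = \arg\min_{C}\sum_j \|(y_j - y_i) - C\,(x_j - x_i)\|^2 = \Delta Y^T (\Delta X^{+})^T,$$

where the rows of $\Delta X$ and $\Delta Y$ are the displacements $x_j - x_i$ and $y_j - y_i$ (below, first over all other points, then over just the $k$ nearest neighbors).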


In [15]:
from sklearn.lda import LDA

In [16]:
lda = LDA(n_components=2)

In [17]:
pca = PCA(n_components=8)
X_ = pca.fit_transform(X)
X_.shape


Out[17]:
(1797, 8)

How well can we do with a linear embedding in the best case, when we know the class memberships ahead of time?


In [18]:
lda.fit(X_,Y)


Out[18]:
LDA(n_components=2, priors=None)

In [19]:
lda_map = lda.transform(X_)

In [20]:
pl.scatter(lda_map[:,0],lda_map[:,1],c=Y,linewidth=0)
pl.axis('off')
pl.colorbar();



In [21]:
lo_d = lda_map[1] - lda_map[0]
lo_d


Out[21]:
array([-3.35368058, -6.87986537])

In [22]:
hi_d = X_[1] - X_[0]
hi_d


Out[22]:
array([ -9.21707775,  42.04358244, -13.90256066,  27.90785313,
       -13.02507156,   0.95503663,  -1.12660948,   7.16940677])

In [23]:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5)
iso.fit(X_)
iso_map = iso.transform(X_)

In [24]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=5)
lle.fit(X_)
lle_map = lle.transform(X_)
lle_map.shape


Out[24]:
(1797, 2)

In [26]:
import numpy as np
import numpy.random as npr
y_dif_iso = np.zeros((len(X_),len(lo_d)))
y_dif_lda = np.zeros((len(X_),len(lo_d)))
x_dif = np.zeros((len(X_),len(hi_d)))
i = 0
for j in range(len(X_)):
    if j != i:
        y_dif_lda[j] = lda_map[j] - lda_map[i]
        y_dif_iso[j] = iso_map[j] - iso_map[i]
        x_dif[j] = X_[j] - X_[i]

In [27]:
x_dif_pinv = np.linalg.pinv(x_dif)
x_dif_pinv.shape


Out[27]:
(8, 1797)

In [28]:
C_lda = y_dif_lda.T.dot(x_dif_pinv.T)
C_lda.shape


Out[28]:
(2, 8)

In [29]:
C_iso = y_dif_iso.T.dot(x_dif_pinv.T)
C_iso.shape


Out[29]:
(2, 8)

In [30]:
C_lda.dot(x_dif.T).shape


Out[30]:
(2, 1797)

In [31]:
# works well for a globally linear mapping
np.mean(abs(y_dif_lda - C_lda.dot(x_dif.T).T),0)


Out[31]:
array([  1.26785601e-15,   3.36194429e-15])

In [32]:
# doesn't work so well for a highly nonlinear mapping
np.mean(abs(y_dif_iso - C_iso.dot(x_dif.T).T),0)


Out[32]:
array([ 16.1095651 ,  11.06908156])

In [33]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
pl.title("Isomap")


Out[33]:
<matplotlib.text.Text at 0x114ab7b50>

In [34]:
pl.scatter(lle_map[:,0],lle_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();


Hmm, that doesn't look useful at all.
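
One thing that might be worth trying here (a sketch, not run): LLE's 'modified' variant, which is designed to address some of standard LLE's conditioning problems.


In [ ]:
# Sketch (not run): modified LLE as a possibly more stable alternative
lle_mod = LocallyLinearEmbedding(n_neighbors=10, method='modified')
lle_mod_map = lle_mod.fit_transform(X_)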


In [35]:
%timeit x_dif_pinv = np.linalg.pinv(x_dif)


1000 loops, best of 3: 394 µs per loop

In [36]:
x_dif.shape


Out[36]:
(1797, 8)

In [38]:
import numpy as np
import numpy.random as npr
#C = np.ones((len(lo_d),len(hi_d)))
C = npr.randn(len(lo_d),len(hi_d))
C.shape


Out[38]:
(2, 8)

In [39]:
C.dot(x_dif.T)


Out[39]:
array([[  0.        ,  28.68731001,  17.44784321, ...,  37.1547794 ,
         15.76435884,  46.14608673],
       [  0.        ,  39.57420734,  52.91959838, ...,  47.46539675,
        -12.70389009,  22.08310674]])

In [40]:
hi_d.shape


Out[40]:
(8,)

So that's cool. That's the map for one point, evaluated w.r.t. all other points in the dataset. Next up: compute it w.r.t. only the $k$-nearest neighbors.


In [41]:
k=10
knearest = neighbors.kneighbors_graph(X_,k).toarray()
knearest = knearest - np.diag(np.ones(len(X_))) # knearest[i,i] = 0

In [42]:
knearest_sparse = neighbors.kneighbors_graph(X_,k)

In [43]:
row = knearest_sparse.getrow(0)

In [44]:
row.indices


Out[44]:
array([   0, 1365, 1029,  877, 1167,  464,  546,  458, 1541,  516], dtype=int32)

In [45]:
knearest_sparse[0].indices


Out[45]:
array([   0, 1365, 1029,  877, 1167,  464,  546,  458, 1541,  516], dtype=int32)

In [46]:
knearest[0].sum()


Out[46]:
9.0

In [47]:
i = 0
for ind,j in enumerate(knearest_sparse[i].indices):
    if j != i:
        print(ind,j)


(1, 1365)
(2, 1029)
(3, 877)
(4, 1167)
(5, 464)
(6, 546)
(7, 458)
(8, 1541)
(9, 516)

In [48]:
def construct_C(X,Y,k=10):
    ''' Estimate a local linear map C_i at each point by least squares over its neighborhood.
    X = high-dimensional feature-vectors,
    Y = low-dimensional feature-vectors,
    k = number of neighbors to consider'''
    d_x = X.shape[1]
    d_y = Y.shape[1]
    knearest_sparse = neighbors.kneighbors_graph(X,k)
    error = np.empty(len(X),dtype=object)
    C = np.empty(len(X),dtype=object)
    for i in range(len(X)):
        y_dif = np.zeros((k,d_y))
        x_dif = np.zeros((k,d_x))
        for ind,j in enumerate(knearest_sparse[i].indices):
            if j != i: # skip the self-neighbor; its displacement row stays zero
                y_dif[ind] = Y[j] - Y[i]
                x_dif[ind] = X[j] - X[i]

        # least-squares fit of the local linear map: C_ = argmin ||y_dif - x_dif C^T||
        x_dif_pinv = np.linalg.pinv(x_dif)
        C_ = y_dif.T.dot(x_dif_pinv.T)

        error[i] = y_dif - C_.dot(x_dif.T).T
        C[i] = C_

    return C,error

In [49]:
%%timeit
knearest_sparse = neighbors.kneighbors_graph(X,15)


1 loops, best of 3: 304 ms per loop

In [50]:
C,error = construct_C(X_,iso_map,k=25)

In [51]:
%%timeit
C,error = construct_C(X_,iso_map,k=25)


1 loops, best of 3: 672 ms per loop

About half of the computational cost here is in computing the neighborhood graph in the input space; we only need to do that once, though, for future reference...
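
A minimal sketch of that reuse, with the sparse k-NN graph passed in rather than rebuilt on every call (not run here; construct_C_precomputed is a hypothetical variant of construct_C above):


In [ ]:
# Sketch (not run): a hypothetical variant of construct_C that reuses a precomputed k-NN graph
def construct_C_precomputed(X, Y, knearest_sparse):
    C = np.empty(len(X), dtype=object)
    error = np.empty(len(X), dtype=object)
    for i in range(len(X)):
        inds = [j for j in knearest_sparse[i].indices if j != i]  # drop the self-neighbor
        x_dif = X[inds] - X[i]   # neighbor displacements in the input space
        y_dif = Y[inds] - Y[i]   # neighbor displacements in the embedding
        C_ = y_dif.T.dot(np.linalg.pinv(x_dif).T)   # least-squares local linear map
        error[i] = y_dif - C_.dot(x_dif.T).T
        C[i] = C_
    return C, error

knn_graph = neighbors.kneighbors_graph(X_, 25)   # computed once, reused across calls
#C, error = construct_C_precomputed(X_, iso_map, knn_graph)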


In [52]:
C[0].shape


Out[52]:
(2, 8)

In [53]:
average_error = np.array([np.mean(np.sqrt(sum(abs(e)**2))) for e in error])

In [54]:
error_stdev = np.array([np.std(np.sqrt(sum(abs(e)**2))) for e in error])

Hmm, so in an Isomap embedding the largest deviations from the locally linear approximation occur for points far from their cluster cores. Interesting.


In [55]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=np.log(average_error),
           alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
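
A rough quantitative check on that claim (a sketch, not run; the variable names below are throwaway): correlate each point's distance to its class centroid in the Isomap embedding with its log reconstruction error.


In [ ]:
# Sketch (not run): does reconstruction error grow with distance from the class centroid?
centroids = np.array([iso_map[Y == label].mean(axis=0) for label in np.unique(Y)])
dist_to_centroid = np.linalg.norm(iso_map - centroids[Y], axis=1)
print(np.corrcoef(dist_to_centroid, np.log(average_error))[0, 1])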



In [56]:
pl.hist(average_error,bins=50);
pl.xlabel('Local reconstruction error')
pl.ylabel('Count')


Out[56]:
<matplotlib.text.Text at 0x114bf2590>

In [57]:
pl.hist(np.log(average_error),bins=50);
pl.xlabel('log(local reconstruction error)')
pl.ylabel('Count')


Out[57]:
<matplotlib.text.Text at 0x1146eba90>

In [58]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=error_stdev,
           alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();



In [59]:
pl.hist(error_stdev,bins=50);


Here I've plotted the points at the locations given by LDA, colored by the "strain" they felt in the Isomap embedding.


In [60]:
pl.scatter(lda_map[:,0],lda_map[:,1],c=np.log(average_error),alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();


Hmm, how similar are all the maps in C?


In [61]:
map_dim = C[0].shape[0]*C[0].shape[1]
c = C[0].reshape((map_dim,1))

In [62]:
maps = np.hstack([c.reshape((map_dim,1)) for c in C]).T
maps.shape


Out[62]:
(1797, 16)

In [63]:
maps_rand = npr.randn(*maps.shape)

In [64]:
pca = PCA()
pca.fit(maps)

pca_r = PCA()
pca_r.fit(maps_rand)


Out[64]:
PCA(copy=True, n_components=None, whiten=False)

In [65]:
# comparison to random expectation
pl.plot(range(1,maps.shape[1]+1),pca.explained_variance_ratio_,label='observed')
pl.plot(range(1,maps.shape[1]+1),pca_r.explained_variance_ratio_,label='random')
pl.ylim((0,max(pca.explained_variance_ratio_)))
pl.legend(loc='best')
pl.xlim((1,maps.shape[1]+1))
pl.xlabel('Component')
pl.ylabel('Explained variance ratio')


Out[65]:
<matplotlib.text.Text at 0x114b23a90>

Now this is getting meta: embedding the map vectors themselves, to visualize them...


In [66]:
iso = Isomap(n_neighbors=5)
iso.fit(maps)
maps_map = iso.transform(maps)

In [67]:
pl.scatter(PCA(2).fit_transform(maps)[:,0],PCA(2).fit_transform(maps)[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();



In [68]:
pl.scatter(maps_map[:,0],maps_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();


Hmm, instead of examining these only for maps generated by existing algorithms, let's also try the case where we have a "ground-truth" low-dimensional map.
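
One easy way to set that up (a sketch, not run here, using sklearn's make_swiss_roll): generate a manifold whose 2-D coordinates are known exactly and feed those to construct_C in place of a learned embedding.


In [ ]:
# Sketch (not run): a dataset with a known low-dimensional parameterization
from sklearn.datasets import make_swiss_roll
X_roll, t = make_swiss_roll(n_samples=1000, noise=0.05)
true_coords = np.column_stack([t, X_roll[:,1]])   # the roll's intrinsic (angle, height) coordinates
C_roll, error_roll = construct_C(X_roll, true_coords, k=10)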

Hmm, it would be neat to examine how well locally linear approximations hold up for representations learned by autoencoders over time.

Hmm, I wonder what would happen if you used a stack of nonlinear projections to gradually reduce dimensionality. Let's try it with spectral embedding!


In [69]:
X_ = PCA().fit_transform(X)
X_.shape


Out[69]:
(1797, 64)

In [70]:
from sklearn.manifold import SpectralEmbedding

In [71]:
se = SpectralEmbedding(n_neighbors=48)

In [72]:
proj = se.fit_transform(X_)
se.n_neighbors_


Out[72]:
48

In [74]:
pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();



In [76]:
def one_nn_baseline(X,Y):
    '''Fraction of points whose nearest neighbor (excluding the point itself)
    shares their label: a crude measure of embedding quality.'''
    one_nn = neighbors.kneighbors_graph(X,2)
    inds = np.zeros(len(X),dtype=int)
    for i in range(len(X)):
        # the 2-NN graph includes the point itself; keep the other neighbor
        inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
    preds = Y[inds]
    return 1.0*sum(preds==Y) / len(Y)

In [77]:
one_nn_baseline(proj,Y)


Out[77]:
0.7128547579298832

In [78]:
dim = X_.shape[1]
dim


Out[78]:
64

In [79]:
proj = X_.copy()

In [80]:
projs = []
projs.append((dim,proj))

In [81]:
# alternate scale?
scale_neighbors = lambda x : int(6**(1+(x / 31.0)))

In [82]:
pl.plot(range(2,dim),[scale_neighbors(x) for x in range(2,dim)])


Out[82]:
[<matplotlib.lines.Line2D at 0x1107b38d0>]

In [83]:
step = 10
target = 2
while dim > target:
    comp = max(dim - step,target)
    #se = SpectralEmbedding(comp,n_neighbors=3*dim)
    se = SpectralEmbedding(comp,n_neighbors=scale_neighbors(dim))
    projs.append((dim,se.fit_transform(projs[-1][1])))
    dim -= step
    print(dim) # note: prints the already-decremented dim, hence the -6 on the last pass


54
44
34
24
14
4
-6
/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/sklearn/manifold/spectral_embedding_.py:226: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn("Graph is not fully connected, spectral embedding"

In [84]:
for dim,proj in projs:
    pl.figure()
    pl.title("Input dimensionality: {0}".format(dim))
    pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
    pl.axis('off')
    pl.colorbar();



In [85]:
def iterative_spectral_embedding(X,step=10,target=2,dim_mult=3):
    dim = X.shape[1]
    projs = []
    projs.append(X.copy())
    while dim > target:
        comp = max(dim - step,target)
        # not sure how to select the number of neighbors here:
        # a large k works well when the input dimensionality is high
        # but fails for low input dimensionality, and vice-versa
        # for a small k
        se = SpectralEmbedding(comp,n_neighbors=dim_mult*dim)
        projs.append(se.fit_transform(projs[-1]))
        dim -= step
        print(dim)
    return projs

In [86]:
def plot_projs(projs):
    for proj in projs:
        pl.figure()
        pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
        pl.axis('off')
        pl.colorbar();

In [87]:
plot_projs(iterative_spectral_embedding(X_))


54
44
34
24
14
4
-6

In [98]:
plot_projs(iterative_spectral_embedding(X_,dim_mult=2))


54
44
34
24
14
4
-6

In [89]:
plot_projs(iterative_spectral_embedding(X_,dim_mult=5))


54
44
34
24
14
4
-6

In [90]:
cray_projs = iterative_spectral_embedding(X_,step=24,dim_mult=2)
plot_projs(cray_projs)


40
16
-8

In [91]:
projs_24_3 = iterative_spectral_embedding(X_,step=24,dim_mult=3)
plot_projs(projs_24_3)


40
16
-8

In [92]:
one_nn_baseline(projs_24_3[-1],Y)


Out[92]:
0.8491930996104619

In [93]:
projs_24_4 = iterative_spectral_embedding(X_,step=24,dim_mult=4)
plot_projs(projs_24_4)


40
16
-8

In [94]:
one_nn_baseline(projs_24_4[-1],Y)


Out[94]:
0.805230940456316

In [95]:
projs = iterative_spectral_embedding(X_,step=24,dim_mult=5)
plot_projs(projs)


40
16
-8

In [96]:
one_nn_baseline(lda_map,Y)


Out[96]:
0.557039510294936

In [97]:
one_nn_baseline(PCA(2).fit_transform(X),Y)


Out[97]:
0.5870895937673901

In [99]:
from sklearn.decomposition import kernel_pca
kpca = kernel_pca.KernelPCA(2,kernel='sigmoid')
one_nn_baseline(kpca.fit_transform(X),Y)


Out[99]:
0.1580411797440178

In [100]:
one_nn_baseline(iso_map,Y)


Out[100]:
0.800222593210907

In [101]:
se_map = SpectralEmbedding().fit_transform(X_)

In [102]:
one_nn_baseline(se_map,Y)


Out[102]:
0.6755703951029494

In [103]:
one_nn_baseline(cray_projs[-1],Y)


Out[103]:
0.8603227601558152

In [104]:
one_nn_baseline(projs[-1],Y)


Out[104]:
0.7746243739565943

In [105]:
one_nn_baseline(projs[0][:,:2],Y)


Out[105]:
0.5870895937673901

In [106]:
kpca = kernel_pca.KernelPCA()

In [107]:
kpca.fit(X[:,:2])


Out[107]:
KernelPCA(alpha=1.0, coef0=1, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None,
     remove_zero_eig=False, tol=0)

In [108]:
kpca.fit(X[:,:20])


Out[108]:
KernelPCA(alpha=1.0, coef0=1, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None,
     remove_zero_eig=False, tol=0)

In [109]:
def iterative_kpca(X,step=10,target=2,dim_mult=3):
    dim = X.shape[1]
    projs = []
    projs.append(X.copy())
    while dim > target:
        comp = max(dim - step,target)
        # note: passes dim (the current dimensionality) rather than comp as n_components
        kpca = kernel_pca.KernelPCA(dim,'poly')#'rbf',gamma=1.0/dim)
        projs.append(kpca.fit_transform(projs[-1]))
        dim -= step
        print(dim)
    return projs

In [110]:
projs_k = iterative_kpca(X,step=16)


48
32
16
0

In [111]:
[one_nn_baseline(proj[:,:2],Y) for proj in projs_k]


Out[111]:
[0.11908736783528102,
 0.5453533667223149,
 0.335559265442404,
 0.25153032832498606,
 0.24318308291597107]

In [112]:
plot_projs(iterative_spectral_embedding(X_,step=24,dim_mult=6))


40
16
-8

In [113]:
plot_projs(iterative_spectral_embedding(X_,step=16,dim_mult=5))


48
32
16
0

And now, some Bayesian manifold learning tinkering.
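
The cells below re-derive local maps $C_i$ on a 200-point subset (PCA to 8 dimensions, then an Isomap embedding), and then assemble the pieces of a Gaussian prior over them: $U = \sum_i C_i C_i^T$, a Laplacian-like matrix $L = kI - G$ built from the $k$-NN graph, $\Omega^{-1} = L \otimes I_2$, and a prior covariance $\Omega \otimes U$ over the vectorized maps; the likelihood precision is set to $V^{-1} = \gamma I_2$ with $\gamma = 0.1$.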


In [114]:
X_reduced = PCA(8).fit_transform(X)[:200,:]
X_reduced.shape


Out[114]:
(200, 8)

In [115]:
Y_reduced = Isomap().fit_transform(X_reduced)
Y_reduced.shape


Out[115]:
(200, 2)

In [116]:
C,error = construct_C(X_reduced,Y_reduced,k=10)

In [117]:
np.hstack(C).shape


Out[117]:
(2, 1600)

In [118]:
C_ = np.hstack(C)

In [119]:
U = C_.dot(C_.T)
U.shape


Out[119]:
(2, 2)

In [120]:
k = 15
G = neighbors.kneighbors_graph(X_reduced,k).toarray()
L = np.eye(len(G))*k - G
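
One caveat: kneighbors_graph isn't symmetric in general, and its diagonal here is 1, so L above isn't quite the usual combinatorial graph Laplacian. If that matters downstream, a symmetrized version is easy to build (sketch, not run):


In [ ]:
# Sketch (not run): a symmetrized graph Laplacian from the same k-NN graph
W = G.copy()
np.fill_diagonal(W, 0)         # drop the self-loops on the diagonal
W = np.maximum(W, W.T)         # i ~ j if either is among the other's k nearest neighbors
L_sym = np.diag(W.sum(axis=1)) - W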

In [121]:
I = np.eye(2)

In [122]:
from scipy import linalg

In [123]:
omega_inv = linalg.kron(L,I)

In [124]:
omega_inv.shape


Out[124]:
(400, 400)

In [125]:
omega = np.linalg.inv(omega_inv)

In [126]:
covariance_c_prior = linalg.kron(omega,U)
covariance_c_prior.shape


Out[126]:
(800, 800)

In [127]:
# likelihood
# etc.

In [128]:
d_x = 8
d_y = 2

In [129]:
gamma = 0.1
V_inv = np.eye(d_y)*gamma
V = np.linalg.inv(V_inv)
V.shape


Out[129]:
(2, 2)

In [130]:
G.shape


Out[130]:
(200, 200)

In [131]:
i = 0
j = 0
n = len(X_reduced)

A_inv = np.empty((n,n),dtype=object)

from time import time
t0 = time()

for i in range(n):
    if i%25 == 0:
        print(i)
    for j in range(n):
        if i == j:
            A_inv[i,j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
                     C[j].T.dot(V_inv).dot(C[j]))

        if i != j:
            A_inv[i,j] = np.zeros((d_x,d_x))
            for k in range(n):
                A_inv[i,j] += G[i,k]*(C[k].T.dot(V_inv).dot(C[k]) +
                                  C[i].T.dot(V_inv).dot(C[i]))
t1 = time()
print(t1 - t0)


0
25
50
75
100
125
150
175
63.2417168617

In [132]:
def compute_A_inv(X,C,G):
    n = len(X)
    A_inv = np.empty((n,n),dtype=object)

    t0 = time()

    for i in range(n):
        if i%10 == 0:
            print(i)
        for j in range(n):
            if i == j:
                A_inv[i,j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
                         C[j].T.dot(V_inv).dot(C[j]))

            if i != j:
                A_inv[i,j] = np.zeros((d_x,d_x))
                for k in range(n):
                    A_inv[i,j] += G[i,k]*(C[k].T.dot(V_inv).dot(C[k]) +
                                      C[i].T.dot(V_inv).dot(C[i]))
    t1 = time()
    print(t1 - t0)
    return A_inv

Constructing this matrix is extremely slow, but it could be trivially parallelized.
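
A minimal sketch of that parallelization, assuming joblib is importable (compute_A_inv_row is a hypothetical per-row helper, not defined above). Note that, as written in the cell above, the i != j entries of a row don't actually depend on j:


In [ ]:
# Sketch (not run): build A_inv one row at a time, in parallel with joblib
from joblib import Parallel, delayed   # or: from sklearn.externals.joblib import Parallel, delayed

def compute_A_inv_row(i, C, G, V_inv, d_x):
    '''One row of A_inv, mirroring the double loop above.'''
    n = len(C)
    row = np.empty(n, dtype=object)
    for j in range(n):
        if i == j:
            row[j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
                           C[j].T.dot(V_inv).dot(C[j]))
        else:
            row[j] = np.zeros((d_x, d_x))
            for k in range(n):
                row[j] += G[i, k] * (C[k].T.dot(V_inv).dot(C[k]) +
                                     C[i].T.dot(V_inv).dot(C[i]))
    return row

#rows = Parallel(n_jobs=-1)(delayed(compute_A_inv_row)(i, C, G, V_inv, d_x)
#                           for i in range(len(X_reduced)))
#A_inv = np.vstack(rows)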


In [133]:
b = np.empty(n,dtype=object)

In [134]:
for i in range(n):
    b[i] = np.zeros(d_x)
    for j in range(n):
        b[i] -= G[i,j]*(C[j]+C[i]).T.dot(V_inv).dot(Y_reduced[j] - Y_reduced[i])

In [136]:
k = 0

C[k].T.dot(V_inv).dot(C[k]).shape


Out[136]:
(8, 8)

In [139]:
# A_inv_ij in R^{d_x * d_x}
A_inv[i,j].shape


Out[139]:
(8, 8)

In [638]:
# E step

In [639]:
# M step

In [147]:
X_.shape


Out[147]:
(1797, 64)

Kernel approximation


In [142]:
from sklearn import kernel_approximation
from sklearn.kernel_approximation import RBFSampler

In [144]:
nystrom = kernel_approximation.Nystroem()

In [149]:
%timeit nystrom.fit(X_)


100 loops, best of 3: 3.34 ms per loop

In [150]:
nystrom.fit(X_)


Out[150]:
Nystroem(coef0=1, degree=3, gamma=None, kernel='rbf', kernel_params=None,
     n_components=100, random_state=None)

In [151]:
nystrom.transform(X_).shape


Out[151]:
(1797, 100)
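
To tie this back to the earlier comparisons, a natural follow-up (sketch, not run) is to score the Nystroem feature map with the same 1-NN baseline used for the embeddings above.


In [ ]:
# Sketch (not run): 1-NN label agreement on a 2-D PCA of the Nystroem features
nystrom_features = nystrom.transform(X_)
print(one_nn_baseline(PCA(2).fit_transform(nystrom_features), Y))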

In [ ]: