Question: how many connected components does the k-nearest neighbor graph have for a collection of handwritten digit images?


In [1]:
from sklearn import neighbors

In [2]:
from sklearn import datasets

In [3]:
import pylab as pl
%matplotlib inline
pl.rcParams['font.family'] = 'serif'

In [4]:
data = datasets.load_digits()

In [5]:
X = data.data
Y = data.target

In [6]:
from sklearn.decomposition import PCA
pca = PCA(n_components=64,whiten=True)

In [7]:
import networkx as nx
print(nx.__version__)


1.9.1

In [8]:
X_ = pca.fit_transform(X)

In [9]:
epsneighborhood = neighbors.graph.radius_neighbors_graph(X_,9.0)
G = nx.Graph(epsneighborhood)
nx.number_connected_components(G)


Out[9]:
38

In [10]:
kneighbors = neighbors.graph.kneighbors_graph(X_,3)#,mode='distance')
G = nx.Graph(kneighbors)
nx.number_connected_components(G)


Out[10]:
4

Okay, interesting: the 3-nearest-neighbor graph has only 4 connected components.


In [11]:
cc = [c for c in nx.connected_components(G)]

In [12]:
for c in cc:
    print(Y[c])


[0 1 2 ..., 8 9 8]
[7 4 7 7]
[4 4 4 4 4 4 4 4]
[4 4 4]

In [13]:
pl.imshow(data.images[cc[0][0]],cmap='gray')


Out[13]:
<matplotlib.image.AxesImage at 0x11077e390>

In [14]:
sum(kneighbors.toarray()[1])


Out[14]:
3.0

What do the locally linear maps between tangent spaces look like for embeddings learned by Isomap or LLE (or LDA), for example?
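
To make this concrete (this is what the pinv-based cells further down compute): around each point $x_i$, look for the linear map $C_i$ that best carries input-space displacements to embedding-space displacements,

$$C_i = \arg\min_{C}\sum_j \|(y_j - y_i) - C\,(x_j - x_i)\|^2 = \Delta Y^T (\Delta X^{+})^T,$$

where the rows of $\Delta X$ and $\Delta Y$ are the displacements $x_j - x_i$ and $y_j - y_i$ (below, first over all other points, then over just the $k$ nearest neighbors).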


In [15]:
from sklearn.lda import LDA

In [16]:
lda = LDA(n_components=2)

In [17]:
pca = PCA(n_components=8)
X_ = pca.fit_transform(X)
X_.shape


Out[17]:
(1797, 8)

How well can we do with a linear embedding in the best case, when we know the class memberships ahead of time?


In [18]:
lda.fit(X_,Y)


Out[18]:
LDA(n_components=2, priors=None)

In [19]:
lda_map = lda.transform(X_)

In [20]:
pl.scatter(lda_map[:,0],lda_map[:,1],c=Y,linewidth=0)
pl.axis('off')
pl.colorbar();



In [21]:
lo_d = lda_map[1] - lda_map[0]
lo_d


Out[21]:
array([-3.35368058, -6.87986537])

In [22]:
hi_d = X_[1] - X_[0]
hi_d


Out[22]:
array([ -9.21707775,  42.04358244, -13.90256066,  27.90785313,
       -13.02507156,   0.95503663,  -1.12660948,   7.16940677])

In [23]:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5)
iso.fit(X_)
iso_map = iso.transform(X_)

In [24]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=5)
lle.fit(X_)
lle_map = lle.transform(X_)
lle_map.shape


Out[24]:
(1797, 2)

In [26]:
import numpy as np
import numpy.random as npr
y_dif_iso = np.zeros((len(X_),len(lo_d)))
y_dif_lda = np.zeros((len(X_),len(lo_d)))
x_dif = np.zeros((len(X_),len(hi_d)))
i = 0
for j in range(len(X_)):
    if j != i:
        y_dif_lda[j] = lda_map[j] - lda_map[i]
        y_dif_iso[j] = iso_map[j] - iso_map[i]
        x_dif[j] = X_[j] - X_[i]

In [27]:
x_dif_pinv = np.linalg.pinv(x_dif)
x_dif_pinv.shape


Out[27]:
(8, 1797)

In [28]:
C_lda = y_dif_lda.T.dot(x_dif_pinv.T)
C_lda.shape


Out[28]:
(2, 8)

In [29]:
C_iso = y_dif_iso.T.dot(x_dif_pinv.T)
C_iso.shape


Out[29]:
(2, 8)

In [30]:
C_lda.dot(x_dif.T).shape


Out[30]:
(2, 1797)

In [31]:
# works well for a globally linear mapping
np.mean(abs(y_dif_lda - C_lda.dot(x_dif.T).T),0)


Out[31]:
array([  1.26785601e-15,   3.36194429e-15])

In [32]:
# doesn't work so well for a highly nonlinear mapping
np.mean(abs(y_dif_iso - C_iso.dot(x_dif.T).T),0)


Out[32]:
array([ 16.1095651 ,  11.06908156])

In [33]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
pl.title("Isomap")


Out[33]:
<matplotlib.text.Text at 0x114ab7b50>

In [34]:
pl.scatter(lle_map[:,0],lle_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();


Hmm, that doesn't look useful at all.
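
One thing that might be worth trying here (a sketch, not run): LLE's 'modified' variant, which is designed to address some of standard LLE's conditioning problems.


In [ ]:
# Sketch (not run): modified LLE as a possibly more stable alternative
lle_mod = LocallyLinearEmbedding(n_neighbors=10, method='modified')
lle_mod_map = lle_mod.fit_transform(X_)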


In [35]:
%timeit x_dif_pinv = np.linalg.pinv(x_dif)


1000 loops, best of 3: 394 µs per loop

In [36]:
x_dif.shape


Out[36]:
(1797, 8)

In [38]:
import numpy as np
import numpy.random as npr
#C = np.ones((len(lo_d),len(hi_d)))
C = npr.randn(len(lo_d),len(hi_d))
C.shape


Out[38]:
(2, 8)

In [39]:
C.dot(x_dif.T)


Out[39]:
array([[  0.        ,  28.68731001,  17.44784321, ...,  37.1547794 ,
         15.76435884,  46.14608673],
       [  0.        ,  39.57420734,  52.91959838, ...,  47.46539675,
        -12.70389009,  22.08310674]])

In [40]:
hi_d.shape


Out[40]:
(8,)

So that's cool. That's the map for one point, evaluated w.r.t. all other points in the dataset. Next up: compute it w.r.t. only the $k$-nearest neighbors.


In [41]:
k=10
knearest = neighbors.kneighbors_graph(X_,k).toarray()
knearest = knearest - np.diag(np.ones(len(X_))) # knearest[i,i] = 0

In [42]:
knearest_sparse = neighbors.kneighbors_graph(X_,k)

In [43]:
row = knearest_sparse.getrow(0)

In [44]:
row.indices


Out[44]:
array([   0, 1365, 1029,  877, 1167,  464,  546,  458, 1541,  516], dtype=int32)

In [45]:
knearest_sparse[0].indices


Out[45]:
array([   0, 1365, 1029,  877, 1167,  464,  546,  458, 1541,  516], dtype=int32)

In [46]:
knearest[0].sum()


Out[46]:
9.0

In [47]:
i = 0
for ind,j in enumerate(knearest_sparse[i].indices):
    if j != i:
        print(ind,j)


(1, 1365)
(2, 1029)
(3, 877)
(4, 1167)
(5, 464)
(6, 546)
(7, 458)
(8, 1541)
(9, 516)

In [48]:
def construct_C(X,Y,k=10):
    ''' Estimate a local linear map C_i at each point by least squares over its neighborhood.
    X = high-dimensional feature-vectors,
    Y = low-dimensional feature-vectors,
    k = number of neighbors to consider'''
    d_x = X.shape[1]
    d_y = Y.shape[1]
    knearest_sparse = neighbors.kneighbors_graph(X,k)
    error = np.empty(len(X),dtype=object)
    C = np.empty(len(X),dtype=object)
    for i in range(len(X)):
        y_dif = np.zeros((k,d_y))
        x_dif = np.zeros((k,d_x))
        for ind,j in enumerate(knearest_sparse[i].indices):
            if j != i: # skip the self-neighbor; its displacement row stays zero
                y_dif[ind] = Y[j] - Y[i]
                x_dif[ind] = X[j] - X[i]

        # least-squares fit of the local linear map: C_ = argmin ||y_dif - x_dif C^T||
        x_dif_pinv = np.linalg.pinv(x_dif)
        C_ = y_dif.T.dot(x_dif_pinv.T)

        error[i] = y_dif - C_.dot(x_dif.T).T
        C[i] = C_

    return C,error

In [49]:
%%timeit
knearest_sparse = neighbors.kneighbors_graph(X,15)


1 loops, best of 3: 304 ms per loop

In [50]:
C,error = construct_C(X_,iso_map,k=25)

In [51]:
%%timeit
C,error = construct_C(X_,iso_map,k=25)


1 loops, best of 3: 672 ms per loop

About half of the computational cost here is in computing the neighborhood graph in the input space; we only need to do that once, though, for future reference...
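
A minimal sketch of that reuse, with the sparse k-NN graph passed in rather than rebuilt on every call (not run here; construct_C_precomputed is a hypothetical variant of construct_C above):


In [ ]:
# Sketch (not run): a hypothetical variant of construct_C that reuses a precomputed k-NN graph
def construct_C_precomputed(X, Y, knearest_sparse):
    C = np.empty(len(X), dtype=object)
    error = np.empty(len(X), dtype=object)
    for i in range(len(X)):
        inds = [j for j in knearest_sparse[i].indices if j != i]  # drop the self-neighbor
        x_dif = X[inds] - X[i]   # neighbor displacements in the input space
        y_dif = Y[inds] - Y[i]   # neighbor displacements in the embedding
        C_ = y_dif.T.dot(np.linalg.pinv(x_dif).T)   # least-squares local linear map
        error[i] = y_dif - C_.dot(x_dif.T).T
        C[i] = C_
    return C, error

knn_graph = neighbors.kneighbors_graph(X_, 25)   # computed once, reused across calls
#C, error = construct_C_precomputed(X_, iso_map, knn_graph)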


In [52]:
C[0].shape


Out[52]:
(2, 8)

In [53]:
average_error = np.array([np.mean(np.sqrt(sum(abs(e)**2))) for e in error])

In [54]:
error_stdev = np.array([np.std(np.sqrt(sum(abs(e)**2))) for e in error])

Hmm, so in an Isomap embedding the largest deviations from the locally linear approximation occur for points far from their cluster cores. Interesting.


In [55]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=np.log(average_error),
           alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
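
A rough quantitative check on that claim (a sketch, not run; the variable names below are throwaway): correlate each point's distance to its class centroid in the Isomap embedding with its log reconstruction error.


In [ ]:
# Sketch (not run): does reconstruction error grow with distance from the class centroid?
centroids = np.array([iso_map[Y == label].mean(axis=0) for label in np.unique(Y)])
dist_to_centroid = np.linalg.norm(iso_map - centroids[Y], axis=1)
print(np.corrcoef(dist_to_centroid, np.log(average_error))[0, 1])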



In [56]:
pl.hist(average_error,bins=50);
pl.xlabel('Local reconstruction error')
pl.ylabel('Count')


Out[56]:
<matplotlib.text.Text at 0x114bf2590>

In [57]:
pl.hist(np.log(average_error),bins=50);
pl.xlabel('log(local reconstruction error)')
pl.ylabel('Count')


Out[57]:
<matplotlib.text.Text at 0x1146eba90>

In [58]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=error_stdev,
           alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();



In [59]:
pl.hist(error_stdev,bins=50);


Here I've plotted the points at the locations given by LDA, colored by the "strain" they felt in the Isomap embedding.


In [60]:
pl.scatter(lda_map[:,0],lda_map[:,1],c=np.log(average_error),alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();


Hmm, how similar are all the maps in C?


In [61]:
map_dim = C[0].shape[0]*C[0].shape[1]
c = C[0].reshape((map_dim,1))

In [62]:
maps = np.hstack([c.reshape((map_dim,1)) for c in C]).T
maps.shape


Out[62]:
(1797, 16)

In [63]:
maps_rand = npr.randn(*maps.shape)

In [64]:
pca = PCA()
pca.fit(maps)

pca_r = PCA()
pca_r.fit(maps_rand)


Out[64]:
PCA(copy=True, n_components=None, whiten=False)

In [65]:
# comparison to random expectation
pl.plot(range(1,maps.shape[1]+1),pca.explained_variance_ratio_,label='observed')
pl.plot(range(1,maps.shape[1]+1),pca_r.explained_variance_ratio_,label='random')
pl.ylim((0,max(pca.explained_variance_ratio_)))
pl.legend(loc='best')
pl.xlim((1,maps.shape[1]+1))
pl.xlabel('Component')
pl.ylabel('Explained variance ratio')


Out[65]:
<matplotlib.text.Text at 0x114b23a90>

Now this is getting meta: embedding the map vectors themselves, to visualize them...


In [66]:
iso = Isomap(n_neighbors=5)
iso.fit(maps)
maps_map = iso.transform(maps)

In [67]:
pl.scatter(PCA(2).fit_transform(maps)[:,0],PCA(2).fit_transform(maps)[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();



In [68]:
pl.scatter(maps_map[:,0],maps_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();


Hmm, instead of examining these only for maps generated by existing algorithms, let's also try the case where we have a "ground-truth" low-dimensional map.
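
One easy way to set that up (a sketch, not run here, using sklearn's make_swiss_roll): generate a manifold whose 2-D coordinates are known exactly and feed those to construct_C in place of a learned embedding.


In [ ]:
# Sketch (not run): a dataset with a known low-dimensional parameterization
from sklearn.datasets import make_swiss_roll
X_roll, t = make_swiss_roll(n_samples=1000, noise=0.05)
true_coords = np.column_stack([t, X_roll[:,1]])   # the roll's intrinsic (angle, height) coordinates
C_roll, error_roll = construct_C(X_roll, true_coords, k=10)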

Hmm, it would be neat to examine how well locally linear approximations hold up for representations learned by autoencoders over time.

Hmm, I wonder what would happen if you used a stack of nonlinear projections to gradually reduce dimensionality. Let's try it with spectral embedding!


In [69]:
X_ = PCA().fit_transform(X)
X_.shape


Out[69]:
(1797, 64)

In [70]:
from sklearn.manifold import SpectralEmbedding

In [71]:
se = SpectralEmbedding(n_neighbors=48)

In [72]:
proj = se.fit_transform(X_)
se.n_neighbors_


Out[72]:
48

In [74]:
pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();



In [76]:
def one_nn_baseline(X,Y):
    '''Fraction of points whose nearest neighbor (excluding the point itself)
    shares their label: a crude measure of embedding quality.'''
    one_nn = neighbors.kneighbors_graph(X,2)
    inds = np.zeros(len(X),dtype=int)
    for i in range(len(X)):
        # the 2-NN graph includes the point itself; keep the other neighbor
        inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
    preds = Y[inds]
    return 1.0*sum(preds==Y) / len(Y)

In [77]:
one_nn_baseline(proj,Y)


Out[77]:
0.7128547579298832

In [78]:
dim = X_.shape[1]
dim


Out[78]:
64

In [79]:
proj = X_.copy()

In [80]:
projs = []
projs.append((dim,proj))

In [81]:
# alternate scale?
scale_neighbors = lambda x : int(6**(1+(x / 31.0)))

In [82]:
pl.plot(range(2,dim),[scale_neighbors(x) for x in range(2,dim)])


Out[82]:
[<matplotlib.lines.Line2D at 0x1107b38d0>]

In [83]:
step = 10
target = 2
while dim > target:
    comp = max(dim - step,target)
    #se = SpectralEmbedding(comp,n_neighbors=3*dim)
    se = SpectralEmbedding(comp,n_neighbors=scale_neighbors(dim))
    projs.append((dim,se.fit_transform(projs[-1][1])))
    dim -= step
    print(dim) # note: prints the already-decremented dim, hence the -6 on the last pass


54
44
34
24
14
4
-6
/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/sklearn/manifold/spectral_embedding_.py:226: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn("Graph is not fully connected, spectral embedding"

In [84]:
for dim,proj in projs:
    pl.figure()
    pl.title("Input dimensionality: {0}".format(dim))
    pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
    pl.axis('off')
    pl.colorbar();



In [85]:
def iterative_spectral_embedding(X,step=10,target=2,dim_mult=3):
    dim = X.shape[1]
    projs = []
    projs.append(X.copy())
    while dim > target:
        comp = max(dim - step,target)
        # not sure how to select the number of neighbors here:
        # a large k works well when the input dimensionality is high
        # but fails for low input dimensionality, and vice-versa
        # for a small k
        se = SpectralEmbedding(comp,n_neighbors=dim_mult*dim)
        projs.append(se.fit_transform(projs[-1]))
        dim -= step
        print(dim)
    return projs

In [86]:
def plot_projs(projs):
    for proj in projs:
        pl.figure()
        pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
        pl.axis('off')
        pl.colorbar();

In [87]:
plot_projs(iterative_spectral_embedding(X_))


54
44
34
24
14
4
-6

In [98]:
plot_projs(iterative_spectral_embedding(X_,dim_mult=2))


54
44
34
24
14
4
-6

In [89]:
plot_projs(iterative_spectral_embedding(X_,dim_mult=5))


54
44
34
24
14
4
-6

In [90]:
cray_projs = iterative_spectral_embedding(X_,step=24,dim_mult=2)
plot_projs(cray_projs)


40
16
-8

In [91]:
projs_24_3 = iterative_spectral_embedding(X_,step=24,dim_mult=3)
plot_projs(projs_24_3)


40
16
-8

In [92]:
one_nn_baseline(projs_24_3[-1],Y)


Out[92]:
0.8491930996104619

In [93]:
projs_24_4 = iterative_spectral_embedding(X_,step=24,dim_mult=4)
plot_projs(projs_24_4)


40
16
-8

In [94]:
one_nn_baseline(projs_24_4[-1],Y)


Out[94]:
0.805230940456316

In [95]:
projs = iterative_spectral_embedding(X_,step=24,dim_mult=5)
plot_projs(projs)


40
16
-8

In [96]:
one_nn_baseline(lda_map,Y)


Out[96]:
0.557039510294936

In [97]:
one_nn_baseline(PCA(2).fit_transform(X),Y)


Out[97]:
0.5870895937673901

In [99]:
from sklearn.decomposition import kernel_pca
kpca = kernel_pca.KernelPCA(2,kernel='sigmoid')
one_nn_baseline(kpca.fit_transform(X),Y)


Out[99]:
0.1580411797440178

In [100]:
one_nn_baseline(iso_map,Y)


Out[100]:
0.800222593210907

In [101]:
se_map = SpectralEmbedding().fit_transform(X_)

In [102]:
one_nn_baseline(se_map,Y)


Out[102]:
0.6755703951029494

In [103]:
one_nn_baseline(cray_projs[-1],Y)


Out[103]:
0.8603227601558152

In [104]:
one_nn_baseline(projs[-1],Y)


Out[104]:
0.7746243739565943

In [105]:
one_nn_baseline(projs[0][:,:2],Y)


Out[105]:
0.5870895937673901

In [106]:
kpca = kernel_pca.KernelPCA()

In [107]:
kpca.fit(X[:,:2])


Out[107]:
KernelPCA(alpha=1.0, coef0=1, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None,
     remove_zero_eig=False, tol=0)

In [108]:
kpca.fit(X[:,:20])


Out[108]:
KernelPCA(alpha=1.0, coef0=1, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None,
     remove_zero_eig=False, tol=0)

In [109]:
def iterative_kpca(X,step=10,target=2,dim_mult=3):
    dim = X.shape[1]
    projs = []
    projs.append(X.copy())
    while dim > target:
        comp = max(dim - step,target)
        # note: passes dim (the current dimensionality) rather than comp as n_components
        kpca = kernel_pca.KernelPCA(dim,'poly')#'rbf',gamma=1.0/dim)
        projs.append(kpca.fit_transform(projs[-1]))
        dim -= step
        print(dim)
    return projs

In [110]:
projs_k = iterative_kpca(X,step=16)


48
32
16
0

In [111]:
[one_nn_baseline(proj[:,:2],Y) for proj in projs_k]


Out[111]:
[0.11908736783528102,
 0.5453533667223149,
 0.335559265442404,
 0.25153032832498606,
 0.24318308291597107]

In [112]:
plot_projs(iterative_spectral_embedding(X_,step=24,dim_mult=6))


40
16
-8

In [113]:
plot_projs(iterative_spectral_embedding(X_,step=16,dim_mult=5))


48
32
16
0

And now, some Bayesian manifold learning tinkering.
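
The cells below re-derive local maps $C_i$ on a 200-point subset (PCA to 8 dimensions, then an Isomap embedding), and then assemble the pieces of a Gaussian prior over them: $U = \sum_i C_i C_i^T$, a Laplacian-like matrix $L = kI - G$ built from the $k$-NN graph, $\Omega^{-1} = L \otimes I_2$, and a prior covariance $\Omega \otimes U$ over the vectorized maps; the likelihood precision is set to $V^{-1} = \gamma I_2$ with $\gamma = 0.1$.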


In [114]:
X_reduced = PCA(8).fit_transform(X)[:200,:]
X_reduced.shape


Out[114]:
(200, 8)

In [115]:
Y_reduced = Isomap().fit_transform(X_reduced)
Y_reduced.shape


Out[115]:
(200, 2)

In [116]:
C,error = construct_C(X_reduced,Y_reduced,k=10)

In [117]:
np.hstack(C).shape


Out[117]:
(2, 1600)

In [118]:
C_ = np.hstack(C)

In [119]:
U = C_.dot(C_.T)
U.shape


Out[119]:
(2, 2)

In [120]:
k = 15
G = neighbors.kneighbors_graph(X_reduced,k).toarray()
L = np.eye(len(G))*k - G
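
One caveat: kneighbors_graph isn't symmetric in general, and its diagonal here is 1, so L above isn't quite the usual combinatorial graph Laplacian. If that matters downstream, a symmetrized version is easy to build (sketch, not run):


In [ ]:
# Sketch (not run): a symmetrized graph Laplacian from the same k-NN graph
W = G.copy()
np.fill_diagonal(W, 0)         # drop the self-loops on the diagonal
W = np.maximum(W, W.T)         # i ~ j if either is among the other's k nearest neighbors
L_sym = np.diag(W.sum(axis=1)) - W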

In [121]:
I = np.eye(2)

In [122]:
from scipy import linalg

In [123]:
omega_inv = linalg.kron(L,I)

In [124]:
omega_inv.shape


Out[124]:
(400, 400)

In [125]:
omega = np.linalg.inv(omega_inv)

In [126]:
covariance_c_prior = linalg.kron(omega,U)
covariance_c_prior.shape


Out[126]:
(800, 800)

In [127]:
# likelihood
# etc.

In [128]:
d_x = 8
d_y = 2

In [129]:
gamma = 0.1
V_inv = np.eye(d_y)*gamma
V = np.linalg.inv(V_inv)
V.shape


Out[129]:
(2, 2)

In [130]:
G.shape


Out[130]:
(200, 200)

In [131]:
i = 0
j = 0
n = len(X_reduced)

A_inv = np.empty((n,n),dtype=object)

from time import time
t0 = time()

for i in range(n):
    if i%25 == 0:
        print(i)
    for j in range(n):
        if i == j:
            A_inv[i,j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
                     C[j].T.dot(V_inv).dot(C[j]))

        if i != j:
            A_inv[i,j] = np.zeros((d_x,d_x))
            for k in range(n):
                A_inv[i,j] += G[i,k]*(C[k].T.dot(V_inv).dot(C[k]) +
                                  C[i].T.dot(V_inv).dot(C[i]))
t1 = time()
print(t1 - t0)


0
25
50
75
100
125
150
175
63.2417168617

In [132]:
def compute_A_inv(X,C,G):
    n = len(X)
    A_inv = np.empty((n,n),dtype=object)

    t0 = time()

    for i in range(n):
        if i%10 == 0:
            print(i)
        for j in range(n):
            if i == j:
                A_inv[i,j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
                         C[j].T.dot(V_inv).dot(C[j]))

            if i != j:
                A_inv[i,j] = np.zeros((d_x,d_x))
                for k in range(n):
                    A_inv[i,j] += G[i,k]*(C[k].T.dot(V_inv).dot(C[k]) +
                                      C[i].T.dot(V_inv).dot(C[i]))
    t1 = time()
    print(t1 - t0)
    return A_inv

Constructing this matrix is extremely slow, but it could be trivially parallelized.
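
A minimal sketch of that parallelization, assuming joblib is importable (compute_A_inv_row is a hypothetical per-row helper, not defined above). Note that, as written in the cell above, the i != j entries of a row don't actually depend on j:


In [ ]:
# Sketch (not run): build A_inv one row at a time, in parallel with joblib
from joblib import Parallel, delayed   # or: from sklearn.externals.joblib import Parallel, delayed

def compute_A_inv_row(i, C, G, V_inv, d_x):
    '''One row of A_inv, mirroring the double loop above.'''
    n = len(C)
    row = np.empty(n, dtype=object)
    for j in range(n):
        if i == j:
            row[j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
                           C[j].T.dot(V_inv).dot(C[j]))
        else:
            row[j] = np.zeros((d_x, d_x))
            for k in range(n):
                row[j] += G[i, k] * (C[k].T.dot(V_inv).dot(C[k]) +
                                     C[i].T.dot(V_inv).dot(C[i]))
    return row

#rows = Parallel(n_jobs=-1)(delayed(compute_A_inv_row)(i, C, G, V_inv, d_x)
#                           for i in range(len(X_reduced)))
#A_inv = np.vstack(rows)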


In [133]:
b = np.empty(n,dtype=object)

In [134]:
for i in range(n):
    b[i] = np.zeros(d_x)
    for j in range(n):
        b[i] -= G[i,j]*(C[j]+C[i]).T.dot(V_inv).dot(Y_reduced[j] - Y_reduced[i])

In [136]:
k = 0

C[k].T.dot(V_inv).dot(C[k]).shape


Out[136]:
(8, 8)

In [139]:
# A_inv_ij in R^{d_x * d_x}
A_inv[i,j].shape


Out[139]:
(8, 8)

In [638]:
# E step

In [639]:
# M step

In [147]:
X_.shape


Out[147]:
(1797, 64)

Kernel approximation


In [142]:
from sklearn import kernel_approximation
from sklearn.kernel_approximation import RBFSampler

In [144]:
nystrom = kernel_approximation.Nystroem()

In [149]:
%timeit nystrom.fit(X_)


100 loops, best of 3: 3.34 ms per loop

In [150]:
nystrom.fit(X_)


Out[150]:
Nystroem(coef0=1, degree=3, gamma=None, kernel='rbf', kernel_params=None,
     n_components=100, random_state=None)

In [151]:
nystrom.transform(X_).shape


Out[151]:
(1797, 100)
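
To tie this back to the earlier comparisons, a natural follow-up (sketch, not run) is to score the Nystroem feature map with the same 1-NN baseline used for the embeddings above.


In [ ]:
# Sketch (not run): 1-NN label agreement on a 2-D PCA of the Nystroem features
nystrom_features = nystrom.transform(X_)
print(one_nn_baseline(PCA(2).fit_transform(nystrom_features), Y))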

In [ ]: