As noted in the Nyström for DPP section, the Fast-DPP algorithm doesn't come with theoretical guarantees, but it can still be used in practice since it demonstrates fast mixing in certain instances.
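For intuition about what "mixing" refers to here: samplers in this family run a Markov chain over size-k subsets, swapping one element at a time. Below is a minimal sketch of a plain Metropolis swap chain targeting P(S) ∝ det(L_S); it is not the Fast-DPP algorithm itself, and the function name and step count are illustrative. Real implementations also update the determinants incrementally rather than recomputing them; the sketch favors clarity over speed.

import numpy as np

def kdpp_swap_chain(L, k, n_steps=5000, rng=None):
    # Metropolis chain over size-k subsets of a PSD kernel L:
    # propose swapping one item in S for one outside it, and accept
    # with probability min(1, det(L_T) / det(L_S)).
    rng = np.random.default_rng(rng)
    n = L.shape[0]
    S = list(rng.choice(n, size=k, replace=False))
    det_S = np.linalg.det(L[np.ix_(S, S)])
    for _ in range(n_steps):
        i = int(rng.integers(k))   # slot in S to swap out
        v = int(rng.integers(n))   # candidate from the ground set
        if v in S:
            continue               # lazy step: proposal already in S
        T = S.copy()
        T[i] = v
        det_T = np.linalg.det(L[np.ix_(T, T)])
        if det_S <= 0 or rng.random() * det_S < det_T:
            S, det_S = T, det_T
    return S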


In [31]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.kernel_approximation import Nystroem

In [34]:
X = np.random.normal(size=(1000, 1000))  # i.i.d. Gaussian data
nyst = Nystroem()  # RBF kernel map with n_components=100 landmarks by default
nyst.fit(X.T)
X_rbf = nyst.transform(X.T)  # (1000, 100) approximate feature map
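Under the hood, Nystroem selects a subset of landmark points (100 by default) and builds an explicit feature map whose inner products approximate the full RBF kernel, i.e. X_rbf @ X_rbf.T is a rank-100 approximation of rbf_kernel(X.T). A quick illustrative check of that claim:

K_exact = rbf_kernel(X.T)
K_approx = X_rbf @ X_rbf.T   # rank-100 Nyström approximation of K_exact
print(np.abs(K_exact - K_approx).max())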

In [35]:
pca = PCA(n_components=None)  # keep all components so the full spectrum is available
pca.fit(X_rbf)


Out[35]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [36]:
pca.explained_variance_ratio_


Out[36]:
array([ 0.01257521,  0.01240583,  0.01225821,  0.01215681,  0.01209248,
        0.01201319,  0.01195963,  0.01186326,  0.01177649,  0.01168349,
        0.01166537,  0.01155385,  0.01153902,  0.01148566,  0.01145203,
        0.0113615 ,  0.01126469,  0.01121406,  0.01119955,  0.01113806,
        0.01111245,  0.01107618,  0.01101184,  0.01097707,  0.01092166,
        0.01089678,  0.01089163,  0.0108535 ,  0.010835  ,  0.01079418,
        0.01072059,  0.01068718,  0.01064874,  0.01059367,  0.01056046,
        0.01052685,  0.01047922,  0.01044396,  0.01038056,  0.01035531,
        0.01032861,  0.01028064,  0.01022585,  0.01019853,  0.01013273,
        0.01010609,  0.01009476,  0.01003283,  0.00994139,  0.00992571,
        0.00988597,  0.00986796,  0.00984521,  0.00983416,  0.00980064,
        0.00972654,  0.00969255,  0.00967289,  0.00966347,  0.00962928,
        0.00958922,  0.00957161,  0.00952659,  0.00951837,  0.00948758,
        0.00946558,  0.00944315,  0.00940335,  0.00934535,  0.0092755 ,
        0.00923621,  0.00921121,  0.00919117,  0.00914248,  0.0091107 ,
        0.00908861,  0.00905253,  0.00900653,  0.00899219,  0.00896295,
        0.00894812,  0.00883755,  0.00882469,  0.00881042,  0.00877359,
        0.00875806,  0.00872919,  0.00870989,  0.00864089,  0.00857137,
        0.00855303,  0.00853481,  0.00849406,  0.00846107,  0.00839687,
        0.00830433,  0.00829033,  0.00825582,  0.00816268,  0.00301129])
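Note how flat the spectrum is: each of the 100 components explains roughly 1% of the variance, which is what we'd expect for isotropic Gaussian data pushed through the rank-100 feature map.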

In [37]:
# zero-based index of the first component at which the cumulative
# explained variance exceeds 1 - alpha (so the component count is this + 1)
alpha = 0.05
np.min(np.argwhere(np.cumsum(pca.explained_variance_ratio_) > (1 - alpha)))


Out[37]:
93

In [38]:
def get_num_components(L, alpha=0.05):
    """
    L is the kernel matrix over the features in the DPP (or a Nystroem
    feature map approximating it); alpha is the acceptable fraction of
    unexplained variance. Returns how many principal components are
    needed to explain at least 1 - alpha of the variance.
    """
    pca = PCA(n_components=None)
    pca.fit(L)
    # argwhere returns zero-based indices, so add one to get a count
    return np.min(np.argwhere(np.cumsum(pca.explained_variance_ratio_) > (1 - alpha))) + 1
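As a side note, when L really is the square PSD kernel matrix, PCA (which centers the columns before its SVD) is a slightly indirect route to the spectrum; one could read the eigenvalues off directly. A hedged alternative sketch, assuming L is symmetric PSD (the helper name is ours, its answer will differ slightly from the PCA version because of the centering, and it does not apply to the rectangular Nyström feature map):

def get_num_components_eig(L, alpha=0.05):
    # eigenvalues of a symmetric PSD kernel, sorted descending
    evals = np.linalg.eigvalsh(L)[::-1]
    ratios = np.cumsum(evals) / evals.sum()
    # first index where the cumulative spectral mass exceeds 1 - alpha, plus one
    return int(np.argmax(ratios > (1 - alpha))) + 1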

In [40]:
%%time
get_num_components(rbf_kernel(X.T))


Wall time: 916 ms
Out[40]:
875

In [41]:
%%time
get_num_components(nyst.fit_transform(X.T))


Wall time: 41 ms
Out[41]:
94
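The gap between 875 and 94 is mostly about rank, not accuracy: the Nyström map has only 100 columns (Nystroem's default n_components), so its PCA can never need more than 100 components, while the exact 1000×1000 RBF kernel of i.i.d. Gaussian data spreads its variance across nearly all directions, hence the 875. A quick shape check makes the point:

print(nyst.transform(X.T).shape)   # (1000, 100): at most 100 components
print(rbf_kernel(X.T).shape)       # (1000, 1000)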