As noted in the Nystrom for DPP section, the Fast-DPP algorithm doesn't have theoretical guarantees, but it can still be used since it demonstrates fast mixing in certain instances.
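For reference, the Nystrom step approximates the full DPP kernel L from a small set of landmark items; a minimal sketch of that low-rank reconstruction is below (the function name and landmark count are illustrative, not part of this notebook):

# Illustrative Nystrom sketch of a DPP kernel (not the notebook's code):
# pick m landmark items, form the cross-kernel C and landmark kernel W,
# and reconstruct L ≈ C @ pinv(W) @ C.T
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def nystrom_kernel(data, m=100, seed=None):
    """Rank-m Nystrom approximation of the RBF kernel over the rows of data."""
    rng = np.random.default_rng(seed)
    idx = rng.choice(data.shape[0], size=m, replace=False)  # landmark items
    C = rbf_kernel(data, data[idx])                          # n x m cross-kernel
    W = rbf_kernel(data[idx], data[idx])                     # m x m landmark kernel
    return C @ np.linalg.pinv(W) @ C.T                       # low-rank approximation of L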
In [31]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.kernel_approximation import Nystroem
In [34]:
# 1000 random items, each with 1000 features
X = np.random.normal(size=(1000, 1000))
# Nystroem builds an explicit feature map approximating the RBF kernel
nyst = Nystroem()
nyst.fit(X.T)
X_rbf = nyst.transform(X.T)
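Since the Nystroem features are an explicit map, their inner products should approximate the exact RBF kernel; a quick optional sanity check along those lines (it assumes the variables from the cell above and is not in the original notebook):

# Optional check: Nystroem feature inner products vs. the exact RBF kernel
L_exact = rbf_kernel(X.T)
L_approx = X_rbf @ X_rbf.T
print(np.abs(L_exact - L_approx).max())  # rough worst-case approximation error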
In [35]:
# full PCA on the Nystroem features to inspect the explained-variance spectrum
pca = PCA(n_components=None)
pca.fit(X_rbf)
Out[35]:
In [36]:
pca.explained_variance_ratio_
Out[36]:
In [37]:
# find the smallest number of components whose cumulative explained variance exceeds 1 - alpha
alpha = 0.05
np.min(np.argwhere(np.cumsum(pca.explained_variance_ratio_) > (1-alpha)))
Out[37]:
In [38]:
def get_num_components(L, alpha=0.05):
    """
    L is the kernel for the features in DPP
    alpha represents acceptable error
    """
    pca = PCA(n_components=None)
    pca.fit(L)
    # add one for zero indexing
    return np.min(np.argwhere(np.cumsum(pca.explained_variance_ratio_) > (1-alpha))) + 1
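One way this feeds back into the approximation (illustrative only, not in the original notebook): the returned count can be reused as the Nystroem rank, so the feature map retains roughly a (1 - alpha) share of the kernel's variance.

# Illustrative only: reuse the component count as the Nystroem rank
k = get_num_components(rbf_kernel(X.T), alpha=0.05)
nyst_k = Nystroem(n_components=k)
X_rbf_k = nyst_k.fit_transform(X.T)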
In [40]:
%%time
# exact 1000 x 1000 RBF kernel
get_num_components(rbf_kernel(X.T))
Out[40]:
In [41]:
%%time
# Nystroem feature matrix (low-rank approximation)
get_num_components(nyst.fit_transform(X.T))
Out[41]: