This notebook is to test the implementation of HSIC using finite-dimensional feature maps. FiniteFeatureHSIC class.
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import fsic.util as util
import fsic.data as data
import fsic.feature as fea
import fsic.kernel as kernel
import fsic.indtest as it
import fsic.glo as glo
import scipy.stats as stats
In [ ]:
# font options
font = {
#'family' : 'normal',
#'weight' : 'bold',
'size' : 16
}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)
#matplotlib.rc('text', usetex=True)
#matplotlib.rcParams['text.latex.preamble']=[r"\usepackage{amsmath}"]
In [ ]:
def get_quad_psfunc():
"""
Return a PairedSource to generate y = x^2 + Gaussian noise.
"""
px = lambda n: np.random.rand(n, 1)*8 - 4
f = lambda x: 0.2*x**2 + np.random.randn(x.shape[0], 1)
return data.PSFunc(f, px)
In [ ]:
# paired source
alpha = 0.05
n = 2000
seed = 18
dx = 10
dy = 5
ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
#ps = data.PS2DUnifRotate(angle=np.pi/3)
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])
#ps = data.PS2DSinFreq(freq=2)
pdata = ps.sample(n, seed=seed)
#tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)
In [ ]:
# get the median distances
X, Y = pdata.xy()
medx = util.meddistance(X, subsample=1000)
medy = util.meddistance(Y, subsample=1000)
sigmax2 = medx**2
sigmay2 = medy**2
feature_pairs = 50
n_simulate = 5000
fmx = fea.RFFKGauss(sigmax2, n_features=feature_pairs, seed=seed+1)
fmy = fea.RFFKGauss(sigmay2, n_features=feature_pairs, seed=seed+2)
ffhsic = it.FiniteFeatureHSIC(fmx, fmy, n_simulate=n_simulate, alpha=alpha, seed=seed+89)
ffhsic_result = ffhsic.perform_test(pdata)
ffhsic_result
In [ ]:
n_permute = 500
n_simulate = 500
Zx = fmx.gen_features(X)
Zy = fmy.gen_features(Y)
list_perm = it.FiniteFeatureHSIC.list_permute(X, Y, fmx, fmy, n_permute=n_permute, seed=100)
list_spectral, eigx, eigy = it.FiniteFeatureHSIC.list_permute_spectral(Zx, Zy,
n_simulate=n_simulate, seed=119)
In [ ]:
freq_p, edge_p = np.histogram(list_perm)
freq_s, edge_s = np.histogram(list_spectral)
nfreq_p = freq_p/float(np.sum(freq_p))
nfreq_s = freq_s/float(np.sum(freq_s))
np.abs(nfreq_p-nfreq_s)
In [ ]:
# histogram
plt.figure(figsize=(10, 5))
#plt.hist(list_perm, color='blue', alpha=0.7, normed=True, bins=20)
plt.hist(list_spectral, color='red', alpha=0.5, normed=True, bins=20, label='Spectral')
plt.hist(list_perm, color='blue', alpha=0.5, normed=True, bins=20, label='Permutation')
plt.legend()
In [ ]:
k = kernel.KGauss(sigmax2)
l = kernel.KGauss(sigmay2)
n = X.shape[0]
H = np.eye(n) - np.ones((n, n))/float(n)
K = k.eval(X, X)
L = l.eval(Y, Y)
HKH = H.dot(K).dot(H)
HLH = H.dot(L).dot(H)
In [ ]:
full_eigx, _ = np.linalg.eig(HKH)
full_eigx = np.real(full_eigx)
full_eigy, _ = np.linalg.eig(HLH)
full_eigy = np.real(full_eigy)
# sort decreasingly
full_eigx = -np.sort(-full_eigx)/n
full_eigy = -np.sort(-full_eigy)/n
In [ ]:
# compare the product of eigenvalues to the full kernel matrix case
full_eig = np.outer(full_eigx, full_eigy).reshape(-1)
finite_eig = np.outer(eigx, eigy).reshape(-1)
lim = min(len(full_eig), len(finite_eig), 10)
plt.plot(finite_eig[:lim], 'bo-', label='finite-dim kernel')
plt.plot(full_eig[:lim], 'go-', label='full kernel matrix')
plt.legend()
plt.title('Product of eigenvalues')
In [ ]: