This notebook is to test the implementation of the Randomized Dependence Coefficient test.
In [ ]:
    
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import fsic.util as util
import fsic.data as data
import fsic.feature as fea
import fsic.kernel as kernel
import fsic.indtest as it
import fsic.glo as glo
import scipy.stats as stats
    
In [ ]:
    
# font options
font = {
    #'family' : 'normal',
    #'weight' : 'bold',
    'size'   : 16
}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)
#matplotlib.rc('text', usetex=True)
#matplotlib.rcParams['text.latex.preamble']=[r"\usepackage{amsmath}"]
    
In [ ]:
    
def get_quad_psfunc():
    """
    Return a PairedSource to generate y = x^2 + Gaussian noise.
    """
    px = lambda n: np.random.rand(n, 1)*8 - 4
    f = lambda x: 0.2*x**2 + np.random.randn(x.shape[0], 1)
    return data.PSFunc(f, px)
    
In [ ]:
    
# paired source 
alpha = 0.05
n = 800
seed = 17
dx = 10 
dy = 5
#ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])
ps = data.PS2DSinFreq(freq=2)
pdata = ps.sample(n, seed=seed)
#tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)
    
In [ ]:
    
# get the median distances 
X, Y = pdata.xy()
# copula transform to both X and Y
cop_map = fea.MarginalCDFMap() 
Xcdf = cop_map.gen_features(X)
Ycdf = cop_map.gen_features(Y)
medx = util.meddistance(Xcdf, subsample=1000)
medy = util.meddistance(Ycdf, subsample=1000)
sigmax2 = medx**2
sigmay2 = medy**2
feature_pairs = 50
fmx = fea.RFFKGauss(sigmax2, n_features=feature_pairs, seed=seed+1)
fmy = fea.RFFKGauss(sigmay2, n_features=feature_pairs, seed=seed+2)
rdc = it.RDC(fmx, fmy, alpha=alpha)
rdc.perform_test(pdata)
    
In [ ]:
    
# random Fourier features 
Xrff = fmx.gen_features(Xcdf)
Yrff = fmy.gen_features(Ycdf)
# CCA 
evals, Vx, Vy = util.cca(Xrff, Yrff)
# Barlett approximation 
bartlett_stat = -(n-1-0.5*(Xrff.shape[1]+Yrff.shape[1]+1))*np.sum(np.log(1-evals**2))
    
In [ ]:
    
evals
    
In [ ]:
    
np.log(1-evals**2)
    
In [ ]:
    
def get_rff_maps(pdata, n_features, seed=2893):
    
    X, Y = pdata.xy()
    
    # copula transform to both X and Y
    cop_map = fea.MarginalCDFMap() 
    Xcdf = cop_map.gen_features(X)
    Ycdf = cop_map.gen_features(Y)
    medx = util.meddistance(Xcdf, subsample=1000)
    medy = util.meddistance(Ycdf, subsample=1000)
    sigmax2 = medx**2
    sigmay2 = medy**2
    fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=seed+1)
    fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=seed+2)
    return fmx, fmy
    
In [ ]:
    
# permutations
#ps = data.PS2DUnifRotate(angle=np.pi*0)
#ps = data.PS2DSinFreq(freq=1)
#ps = data.PSUnifRotateNoise(angle=np.pi*0, noise_dim=8)
ps = data.PSIndSameGauss(dx=2, dy=3)
dx = ps.dx()
dy = ps.dy()
seed = 90
repeats = 500
n = 800
n_features = 10
arr_stats = np.zeros(repeats)
# RFF maps
pdata = ps.sample(n, seed=repeats)
fmx, fmy = get_rff_maps(pdata, n_features=n_features, seed=repeats+1)
rdc = it.RDC(fmx, fmy, alpha=alpha)
for r in range(repeats):
    if r%50==0:
        print 'Starting repetition: %d'%(r+1)
    pdata = ps.sample(n, seed=r+297+repeats)
    stat = rdc.compute_stat(pdata)
    arr_stats[r] = stat
    
In [ ]:
    
# plot null distribution
chi2_df = fmx.num_features()*fmy.num_features()
#chi2_df = 9
#chi2_df = n_features
dom_end = max(np.max(arr_stats), stats.chi2.isf(0.01, df=chi2_df))
dom_start = min(np.min(arr_stats), stats.chi2.isf(0.99, df=chi2_df))
dom = np.linspace(dom_start, dom_end, 700)
chi2_den = stats.chi2.pdf(dom, df=chi2_df)
plt.hist(arr_stats, normed=True, bins=20)
plt.plot(dom, chi2_den, '-')
    
In [ ]:
    
cdf_map = fea.MarginalCDFMap()
cdf_map.gen_features(Y)
    
In [ ]:
    
    
In [ ]: