This notebook is to test the implementation of QuadHSIC.



In [ ]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import fsic.util as util
import fsic.data as data
import fsic.kernel as kernel
import fsic.indtest as it
import fsic.glo as glo
import scipy.stats as stats



In [ ]:

    
# font options
font = {
    #'family' : 'normal',
    #'weight' : 'bold',
    'size'   : 16
}

plt.rc('font', **font)
plt.rc('lines', linewidth=2)
#matplotlib.rc('text', usetex=True)
#matplotlib.rcParams['text.latex.preamble']=[r"\usepackage{amsmath}"]



In [ ]:

    
def get_quad_psfunc():
    """
    Return a PairedSource to generate y = x^2 + Gaussian noise.
    """
    px = lambda n: np.random.rand(n, 1)*8 - 4
    f = lambda x: 0.2*x**2 + np.random.randn(x.shape[0], 1)
    return data.PSFunc(f, px)



In [ ]:

    
# paired source 
alpha = 0.05
n = 800
seed = 15

dx = 10 
dy = 5
ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])

pdata = ps.sample(n, seed=seed)
tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)



In [ ]:

    
def kl_median(pdata):
    """
    Get two Gaussian kernels constructed with the median heuristic.
    """
    xtr, ytr = pdata.xy()
    dx = xtr.shape[1]
    dy = ytr.shape[1]
    medx2 = util.meddistance(xtr, subsample=1000)**2
    medy2 = util.meddistance(ytr, subsample=1000)**2
    k = kernel.KGauss(medx2)
    l = kernel.KGauss(medy2)
    return k, l



In [ ]:

    
# number of test locations
k, l = kl_median(tr)

# perform test
n_permute = 20
qhsic = it.QuadHSIC(k, l, n_permute=n_permute, alpha=alpha)
qhsic.perform_test(te)

Check type-I errors, power



In [ ]:

    
alpha = 0.05
n = 800
n_permute = 100
repeats = 100
# data
ps = data.PSIndSameGauss(dx=2, dy=3)
pdata = ps.sample(n, seed=398)

#ps = get_quad_psfunc()
#pdata = ps.sample(n, seed=938)

tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=11)
k, l = kl_median(tr)
# the test 

qhsic = it.QuadHSIC(k, l, n_permute=n_permute, alpha=alpha)



In [ ]:

    
nte = 400
all_results = []
for r in range(repeats):
    if (r+1)%10==0:
        print 'starting trial: %d'%(r+1)
    te = ps.sample(nte, seed=r+2389)
    test_result = qhsic.perform_test(te)
    all_results.append(test_result)



In [ ]:

    
pvalues = np.array([result['pvalue'] for result in all_results])
stats = np.array([result['test_stat'] for result in all_results])
prob_reject = np.mean(pvalues < alpha)
print 'prob(reject H0) = %.4g'%prob_reject

Permutation time



In [ ]:

    
n_permute = 200
pdata = ps.sample(4000, seed=29)
X, Y = pdata.xy()
#with util.ContextTimer() as t1:
#    arr_hsic_naive = it.QuadHSIC._list_permute_generic(X, Y, k, l, n_permute)
#print 'naive permutations took %.3g s'%t1.secs

with util.ContextTimer() as t2:
    arr_hsic = it.QuadHSIC.list_permute(X, Y, k, l, n_permute)
print 'permutations with precomputed K,L took %.3g'%t2.secs



In [ ]: