Two-sample test with linear mmd


In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import sys

In [ ]:
# sample source 
m = 2000
dim = 6
alpha = 0.01
seed = 22
n = m
#ss = data.SSGaussMeanDiff(dim, my=0)
ss = data.SSSameGauss(d=dim)
#ss = data.SSGaussVarDiff(dim)
#ss = data.SSBlobs()
tst_data = ss.sample(m, seed=seed)
tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

In [ ]:
# choose the best kernel that maximizes the test power 
med = util.meddistance(tr.stack_xy())
widths = [ (med*f) for f in 2.0**np.linspace(-1, 4, 20)]
list_kernels = [kernel.KGauss( w**2 ) for w in widths]

besti, powers = tst.LinearMMDTest.grid_search_kernel(tr, list_kernels, alpha)

In [ ]:
plt.plot(widths, powers, 'o-')
plt.xlabel('Gaussian width')
plt.ylabel('test power')
plt.title('median distance = %.3g. Best width: %.3g'%(med, widths[besti]) )

In [ ]:
# The actual test 
best_ker = list_kernels[besti]
lin_mmd_test = tst.LinearMMDTest(best_ker, alpha)
test_result = lin_mmd_test.perform_test(te)
test_result

Test type-1 error


In [ ]:
rep = 500
te_size = 5000
alpha = 0.05
lin_mmd_test = tst.LinearMMDTest(best_ker, alpha)
pvals = np.zeros(rep)
for r in range(rep):
    te = ss.sample(te_size, seed=r+34000)
    test_result = lin_mmd_test.perform_test(te)
    pvals[r] = test_result['pvalue']
  
# plot
type1 = np.sum(pvals < alpha)/float(rep)
print('type1: %.4g'%type1)

In [ ]: