A notebook to demonstrate how to use the UME (unnormalized mean embeddings) test.
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import autograd.numpy as np
import matplotlib
import matplotlib.pyplot as plt
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
In [ ]:
# font options
font = {
#'family' : 'normal',
#'weight' : 'bold',
'size' : 18
}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
In [ ]:
# sample source
n = 600
dim = 1
seed = 17
ss = data.SSGaussMeanDiff(dim, my=1.0)
#ss = data.SSGaussVarDiff(dim)
#ss = data.SSSameGauss(dim)
# ss = data.SSBlobs()
dim = ss.dim()
dat = ss.sample(n, seed=seed)
xy = dat.stack_xy()
# tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
Draw a random set of test locations, and create a Gaussian kernel using the median heuristic.
In [ ]:
# number of test locations
J = 3
V = util.fit_gaussian_draw(xy, J, seed=seed+1)
med = util.meddistance(xy, subsample=1000)
k = kernel.KGauss(sigma2=med**2)
Test with the chosen parameters.
In [ ]:
# significance level
alpha = 0.01
ume = tst.UMETest(V, k, n_simulate=2000, alpha=alpha)
ume.perform_test(dat)
In [ ]:
# sample source
n = 800
dim = 2
seed = 18
ss = data.SSGaussMeanDiff(dim, my=1.0)
# ss = data.SSGaussVarDiff(dim)
# ss = data.SSSameGauss(dim)
# ss = data.SSBlobs()
dim = ss.dim()
dat = ss.sample(n, seed=seed)
tr, te = dat.split_tr_te(tr_proportion=0.5, seed=10)
xy_tr = tr.stack_xy()
In [ ]:
# initialize test_locs by drawing the a Gaussian fitted to the data
# number of test locations
J = 2
V0 = util.fit_gaussian_draw(xy_tr, J, seed=seed+7)
med = util.meddistance(xy_tr, subsample=1000)
gwidth0 = med**2
V_opt, gw2_opt, opt_info = tst.GaussUMETest.optimize_locs_width(tr, V0, gwidth0, reg=1e-3,
max_iter=100, tol_fun=1e-6, disp=False, locs_bounds_frac=100,
gwidth_lb=None, gwidth_ub=None)
display(opt_info)
In [ ]:
# perform the test using the optimized parameters on the test set
alpha = 0.01
ume_opt = tst.GaussUMETest(V_opt, gw2_opt, n_simulate=2000, alpha=alpha)
ume_opt.perform_test(te)
In [ ]:
xtr, ytr = tr.xy()
plt.figure(figsize=(8, 5))
plt.plot(xtr[:, 0], xtr[:, 1], 'bo', alpha=0.6, label='X')
plt.plot(ytr[:, 0], ytr[:, 1], 'ro', alpha=0.6, label='Y')
for j in range(J):
v = V_opt[j, :]
plt.plot(v[0], v[1], 'k*', markersize=30, label='Test locations' if j==0 else '')
plt.legend()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# create a problem where H0 is true (P=Q)
seed = 278
dim = 4
n = 500
# ss = data.SSGaussMeanDiff(d=dim, my=0.0)
# ss = data.SSGaussVarDiff(d=dim)
ss_h1 = data.SSGaussMeanDiff(d=dim, my=1.0)
ss = data.SSNullResample(ss_h1.sample(5000, seed=seed+93).stack_xy())
dat = ss.sample(n, seed=8)
xy = dat.stack_xy()
In [ ]:
# create some hyperparameters of the test
J = 5
V = util.fit_gaussian_draw(xy, J, seed=seed+82)
med = util.meddistance(xy, subsample=1000)
k = kernel.KGauss(sigma2=med**2)
ume = tst.UMETest(V, k, n_simulate=4000, seed=seed+3)
In [ ]:
# number of times to create a new problem (draw new samples)
trials = 1000
null_stats = np.zeros(trials)
for t in range(trials):
dat = ss.sample(n, seed=seed+t+1)
null_stats[t] = ume.compute_stat(dat)
# use the data in the last trial to perform test
results = ume.perform_test(dat, return_simulated_stats=True)
sim_stats = results['sim_stats']
display(results)
In [ ]:
# histogram of the null stats
plt.figure(figsize=(8, 5))
plt.hist(null_stats, label='Empirical ground truth', alpha=0.7, bins=15, normed=True);
plt.hist(sim_stats, label='Asymptotic', alpha=0.7, bins=15, normed=True)
plt.legend()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: