This notebook investigates the test power vs. the number of test locations J in an incremental way. Specifically, we conjectured that the test power using $\mathcal{T}$, the set of $J$ locations should not be higher than the test power obtained by using $\mathcal{T} \cup \{t_{J+1}\}$


In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.ex.exglobal as exglo
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import freqopttest.plot as plot
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import sys

In [ ]:
# font options
font = {
    #'family' : 'normal',
    #'weight' : 'bold',
    'size'   : 18
}

plt.rc('font', **font)
plt.rc('lines', linewidth=2)

In [ ]:
# sample source 
n = 500
dim = 30
seed = 13
#ss = data.SSGaussMeanDiff(dim, my=0.5)
ss = data.SSGaussVarDiff(dim)
#ss = data.SSSameGauss(dim)
#ss = data.SSBlobs()
dim = ss.dim()
tst_data = ss.sample(n, seed=seed)
tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=seed+82)

J = 2
alpha = 0.01
T = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=seed+1)
#T = np.random.randn(J, dim)

In [ ]:
med = util.meddistance(tr.stack_xy(), 800)
list_gwidth = np.hstack( ( (med**2) *(2.0**np.linspace(-5, 5, 30) ) ) )
list_gwidth.sort()
besti, powers = tst.MeanEmbeddingTest.grid_search_gwidth(tr, T, list_gwidth, alpha)

In [ ]:
# test with the best Gaussian with 
best_width = list_gwidth[besti]
met_grid = tst.MeanEmbeddingTest(T, best_width, alpha)
met_grid.perform_test(te)

$\hat{\lambda}_n$ vs $J$


In [ ]:
def draw_t(tst_data, seed=None):
    # Fit one Gaussian to the X,Y data. 
    if seed is not None:
        rand_state = np.random.get_state()
        np.random.seed(seed)

    xy = tst_data.stack_xy()
    # fit a Gaussian to each of X, Y
    m = np.mean(xy, 0)
    cov = np.cov(xy.T)
    t = np.random.multivariate_normal(m, cov, 1)
    
    # reset the seed back
    if seed is not None:
        np.random.set_state(rand_state)
    return t

In [ ]:
def simulate_stats_trajectory(T):
    Tn = T
    # add one new test location at a time.
    trials = 30
    test_stats = np.zeros(trials)
    for i in range(trials):
        # draw new location
        t = draw_t(tr)
        Tn = np.vstack((Tn, t))
        met = tst.MeanEmbeddingTest(Tn, best_width, alpha)
        tresult = met.perform_test(te)
        test_stats[i] = tresult['test_stat']
    return test_stats, Tn

for rep in range(6):
    test_stats, Tn = simulate_stats_trajectory(T)
    plt.plot(np.arange(len(T), len(Tn)), test_stats)
    print('stats increasing: %s', np.all(np.diff(test_stats)>=0) )
plt.xlabel('$J$')
plt.title('$\hat{\lambda}_n$ as J increases')

p-values vs J


In [ ]:
# plot p-value. 
for r in range(6):
    test_stats, Tn = simulate_stats_trajectory(T)
    Js = np.arange(len(T), len(Tn))
    pvals = [stats.chi2.sf(s, df=J) for s, J in zip(test_stats, Js)]
    plt.plot(Js, pvals)
plt.xlabel('$J$')
plt.title('p-values as J increases')

test threshold vs J


In [ ]:
Js = range(1, 30)
alphas = [1e-6, 0.005, 0.01, 0.05, 0.1]


for i, al in enumerate(alphas):
    threshs = [stats.chi2.isf(al, df=J) for J in Js ]        
    plt.plot(Js, threshs, '-', label='$\\alpha = %.3g$'%(al) )
plt.xlabel('J')
plt.ylabel('$T_\\alpha$')
plt.legend(loc='best')

The test threshold $T_\alpha$ seems to increase approximately linearly with respect to $J$ for any value of $\alpha$. The slope is roughly constant for all $\alpha$.

Test power vs. J: 2d Gaussian mean diff problem

For this example, we will consider a 2d Gaussian example where both P, Q are Gaussian with unit variance. P has mean [0, 0] and Q has mean [0, 1]. We will consider two ways to add test locations. Firstly we will add test locations in regions which reveal the difference of P, Q. Then, we will add test locations in uninformative regions to show that more locations dot necessarily increase the test power.


In [ ]:
# sample source 
n = 1000
d = 2
seed = 13
np.random.seed(seed)
ss = data.SSGaussMeanDiff(d, my=1.0)

J = 2
alpha = 0.01

In [ ]:
def eval_test_locs(T, ss, n, rep, seed_start=1, alpha=0.01):
    """Return a empirical test power"""
    rejs = np.zeros(rep)
    dat = ss.sample(1000, seed=298)
    gwidth2 = util.meddistance(dat.stack_xy())**2
    for r in range(rep):
        te = ss.sample(n, seed=seed_start+r)
        met = tst.MeanEmbeddingTest(T, gwidth2, alpha)
        result = met.perform_test(te)
        h0_rejected = result['h0_rejected']
        rejs[r] = h0_rejected
        print('rep %d: rej: %s'%(r, h0_rejected))
    power = np.mean(rejs)
    return power

In [ ]:
# define a set of locations
#mid = np.zeros(d)

#T_1 = mid[np.newaxis, :]
#T_2 = np.vstack((T_1, np.hstack((np.zeros(d-1), 20)) ))
#T_3 = np.vstack((T_2, np.hstack((np.zeros(d-1), 40)) ))

T = np.random.randn(270, d)

In [ ]:
eval_test_locs(T, ss, n=n, rep=100, seed_start=1, alpha=alpha)

In [ ]:
# plot one instance of the data in 2d.
te = ss.sample(n, seed=seed)
X, Y = te.xy()
plt.plot(X[:, 0], X[:, 1], 'ob')
plt.plot(Y[:, 0], Y[:, 1], 'or')

In [ ]:


In [ ]:


In [ ]: