This notebook investigates the stability of the learned test location. We consider a 1D problem in which P is a wide uniform distribution, and Q is a mixture of two uniforms: a narrow, tall component (a small bump with mixture weight w) and a wide component that coincides with P, so that the mass of Q strongly overlaps with that of P.
The idea is that if the sample size n is low, we will have very few to no points from the narrow component of Q, and the learned location will sit at the wide component. If n is high, the learned location should move to the narrow component, where the difference between P and Q is larger.
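Throughout, $\hat{\lambda}_n$ denotes the statistic computed by tst.METest with a fixed test location. Assuming METest implements the standard mean-embeddings (ME) statistic, with a single location $t$ and kernel $k$ it reduces to
$$\hat{\lambda}_n = \frac{n\,\bar{z}^2}{s^2}, \qquad z_i = k(x_i, t) - k(y_i, t),$$
where $\bar{z}$ and $s^2$ are the sample mean and variance of $z_1, \dots, z_n$. A location $t$ at which $P$ and $Q$ differ clearly gives a large $\hat{\lambda}_n$.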
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.ex.exglobal as exglo
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import freqopttest.plot as plot
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import scipy.signal as sig
import sys
In [ ]:
# font options
font = {
#'family' : 'normal',
#'weight' : 'bold',
'size' : 18
}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)
In [ ]:
class SSMix2Unif(data.SampleSource):
    """
    1-d problem.
    P: U(m_p - w_p/2, m_p + w_p/2), a uniform distribution with mean m_p
    and width w_p.
    Q: w*U(m_q - w_q/2, m_q + w_q/2) + (1-w)*p(x), where p(x) is the
    density of P. That is, Q adds a narrow bump of mixture weight w at m_q.
    """
    def __init__(self, w, mp, wp, mq, wq):
        if not (w >= 0 and w <= 1):
            raise RuntimeError('w must be in [0, 1]')
        self.w = w
        self.mp = mp
        self.wp = wp
        self.mq = mq
        self.wq = wq

    def dim(self):
        return 1

    def sample(self, n, seed):
        rstate = np.random.get_state()
        np.random.seed(seed)
        w = self.w
        mp = self.mp
        wp = self.wp
        mq = self.mq
        wq = self.wq
        # component indicator: 0 = narrow bump (prob w), 1 = wide component
        disc_var = stats.rv_discrete(values=([0, 1], [w, 1-w]))
        ind = disc_var.rvs(size=n)
        ind0 = ind == 0
        ind1 = ind == 1
        # draw from Q: narrow bump at mq, wide component identical to P
        ys = stats.uniform.rvs(loc=mq-wq/2.0, scale=wq, size=np.sum(ind0))
        yb = stats.uniform.rvs(loc=mp-wp/2.0, scale=wp, size=np.sum(ind1))
        y = np.hstack((ys, yb))
        y = y[:, np.newaxis]
        # draw from P
        x = stats.uniform.rvs(loc=mp-wp/2.0, scale=wp, size=n)
        x = x[:, np.newaxis]
        np.random.set_state(rstate)
        return data.TSTData(x, y, label='mix2unif')
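A minimal sanity check of the sampler (assuming, as used below, that TSTData exposes the two samples as X and Y): with w = 0.4, about 40% of the Y points should land in the narrow bump at m_q.
In [ ]:
# sanity check: the fraction of Y points in the narrow bump should be close to w
check_ss = SSMix2Unif(w=0.4, mp=25, wp=5, mq=0, wq=0.2)
check_data = check_ss.sample(2000, seed=0)
print('fraction of Y in the bump: %.3f (expect about 0.4)'
      % np.mean(np.abs(check_data.Y[:, 0]) <= 0.1))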
In [ ]:
# sample source
n = 4000
alpha = 0.01
w = 0.4
seed = 43
prob_params = {'w': w, 'mp': 25, 'wp': 5, 'mq': 0, 'wq': 0.2}
ss = SSMix2Unif(**prob_params)
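To see the geometry of the problem, a quick sketch of the exact densities implied by prob_params (computed directly from the definitions in SSMix2Unif, no sampling):
In [ ]:
# exact densities of P and Q on a grid
xs = np.linspace(-2, 30, 2000)
p_den = stats.uniform.pdf(xs, loc=prob_params['mp'] - prob_params['wp']/2.0,
                          scale=prob_params['wp'])
q_den = (w * stats.uniform.pdf(xs, loc=prob_params['mq'] - prob_params['wq']/2.0,
                               scale=prob_params['wq'])
         + (1 - w) * p_den)
plt.plot(xs, p_den, label='p(x)')
plt.plot(xs, q_den, label='q(x)')
plt.xlabel('x')
plt.legend(loc='best')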
In [ ]:
tst_data = ss.sample(n, seed=seed)
tr = ss.sample(n//2, seed=seed)
te = ss.sample(n//2, seed=seed+1)
#tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
nte = te.X.shape[0]
In [ ]:
xtr, ytr = tr.xy()
xytr = tr.stack_xy()
bins = np.linspace(np.min(xytr), np.max(xytr), 30)
# `normed` was removed from plt.hist in Matplotlib 3.x; use `density` instead
density_hist = True
plt.hist(xtr, bins, label='X', alpha=0.5, density=density_hist)
plt.hist(ytr, bins, label='Y', alpha=0.5, density=density_hist)
plt.legend(loc='best')
In [ ]:
alpha = 0.01
nte = 400
rep = 400
In [ ]:
# repeat many trials to see the value of the statistic at each candidate location
def two_locations_test_results(nte, k):
    """
    k: an instance of Kernel
    Run `rep` trials. In each trial, draw a fresh test set and run the ME
    test twice: once with the location fixed at the small bump (mq), and
    once at the big bump (mp).
    """
    Tp_results = []
    Tq_results = []
    shift_seed = 1000
    for r in range(shift_seed, shift_seed + rep):
        te = ss.sample(nte, seed=r + 1)
        # test location at the small bump
        Tq = np.array([[prob_params['mq']]])
        # test location at the big bump
        Tp = np.array([[prob_params['mp']]])
        # actual tests
        q_met = tst.METest(Tq, k, alpha)
        Tq_results.append(q_met.perform_test(te))
        p_met = tst.METest(Tp, k, alpha)
        Tp_results.append(p_met.perform_test(te))
    return Tp_results, Tq_results

def prob_tq_better(nte, k):
    """
    k: an instance of Kernel
    Return (fraction of trials in which the location at the small bump gives
    a higher statistic, statistics at mp, statistics at mq).
    """
    Tp_results, Tq_results = two_locations_test_results(nte, k)
    tp_lambs = np.array([r['test_stat'] for r in Tp_results])
    tq_lambs = np.array([r['test_stat'] for r in Tq_results])
    n_small_high = np.sum(tq_lambs > tp_lambs)
    return float(n_small_high) / rep, tp_lambs, tq_lambs

def plot_tup_stats(tup_ntes):
    ps_small_better = np.array([t[0] for t in tup_ntes])
    plt.plot(ntes, ps_small_better, 'ob-')
    plt.xlabel('nte')
    plt.ylabel(r'p(loc at small bump gives high $\hat{\lambda}_n$)')

    plt.figure()
    tp_lamb_means = np.array([np.mean(t[1]) for t in tup_ntes])
    tp_lamb_stds = np.array([np.std(t[1]) for t in tup_ntes])
    tq_lamb_means = np.array([np.mean(t[2]) for t in tup_ntes])
    tq_lamb_stds = np.array([np.std(t[2]) for t in tup_ntes])
    print('mq is the mean of the small bump')
    plt.errorbar(ntes, tp_lamb_means, tp_lamb_stds,
                 label=r'$\hat{\mathbb{E}}[\hat{\lambda}_n \mid \mathrm{at }\,\, m_p]$')
    plt.errorbar(ntes, tq_lamb_means, tq_lamb_stds,
                 label=r'$\hat{\mathbb{E}}[\hat{\lambda}_n \mid \mathrm{at }\,\, m_q]$')
    plt.xlabel('test sample size')
    plt.ylabel(r'$\hat{\lambda}_n$')
    plt.legend(loc='best')
In [ ]:
ntes = np.linspace(5, 200, num=7, dtype=np.int64)
#ntes = np.array([ 25, 50, 75, 100, 150, 200, 250])
tri_width = 3
ktri = kernel.KTriangle(tri_width)
ktri_tup_ntes = [prob_tq_better(nte, ktri) for nte in ntes]
plot_tup_stats(ktri_tup_ntes)
In [ ]:
tp_lamb_means = np.array([np.mean(t[1]) for t in ktri_tup_ntes])
tp_lamb_stds = np.array([np.std(t[1]) for t in ktri_tup_ntes])
tq_lamb_means = np.array([np.mean(t[2]) for t in ktri_tup_ntes])
tq_lamb_stds = np.array([np.std(t[2]) for t in ktri_tup_ntes])
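The summaries above are easier to read printed per sample size (using the ntes grid from the triangle-kernel cell):
In [ ]:
# print mean +- std of the statistic at each location, per test sample size
for i, nt in enumerate(ntes):
    print('nte=%3d: at mp: %6.2f +- %5.2f, at mq: %6.2f +- %5.2f' %
          (nt, tp_lamb_means[i], tp_lamb_stds[i], tq_lamb_means[i], tq_lamb_stds[i]))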
Here we fix the test location at either the left (small) bump or the right (big) bump and compare the objective function values.
In [ ]:
alpha = 0.01
nte = 400
rep = 300
In [ ]:
#ntes = np.linspace(50, 300, num=7, dtype=np.int64)
ntes = np.array([ 25, 50, 75, 100, 150, 200, 250])
#gwidth0 = 1
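# Gaussian kernel width from the median heuristic: squared median pairwise distance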
med_data = ss.sample(1000, seed=10)
gwidth0 = util.meddistance(med_data.stack_xy(), subsample=1000)**2
kgauss = kernel.KGauss(gwidth0)
tup_ntes = [prob_tq_better(nte, kgauss) for nte in ntes]
In [ ]:
plot_tup_stats(tup_ntes)
In [ ]:
dom = np.linspace(-3, 3, 200)
for order in [0, 1, 2]:
bspline_vals = sig.bspline(dom, order)
plt.plot(dom, bspline_vals, label='order=%d'%order)
plt.legend()
plt.title('B-spline')
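scipy.signal.bspline was deprecated in SciPy 1.11 and removed in 1.13. A sketch of the same plot via scipy.interpolate.BSpline.basis_element (assuming a reasonably recent SciPy):
In [ ]:
# equivalent centered B-splines from scipy.interpolate
from scipy.interpolate import BSpline
plt.figure()
for order in [0, 1, 2]:
    # uniform knots centered at 0; the spline degree is len(knots) - 2 = order
    knots = np.arange(order + 2) - (order + 1) / 2.0
    b = BSpline.basis_element(knots, extrapolate=False)
    # basis_element returns nan outside its support; map those to 0
    plt.plot(dom, np.nan_to_num(b(dom)), label='order=%d' % order)
plt.legend()
plt.title('B-spline (scipy.interpolate)')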