This notebook investigates the stability of the learned test location. We consider a 1D problem in which P is a wide uniform distribution, and Q is a mixture of two uniforms: a narrow, tall component (a small bump with mixture weight w) and a wide component that coincides with P, so that the mass of Q strongly overlaps with that of P.
The idea is that if the sample size n is low, we will have very few to no points from the narrow component of Q, and the learned location will sit at the wide component. If n is high, the learned location should move to the narrow component, where the difference between P and Q is larger.
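Throughout, $\hat{\lambda}_n$ denotes the statistic computed by tst.METest with a fixed test location. Assuming METest implements the standard mean-embeddings (ME) statistic, with a single location $t$ and kernel $k$ it reduces to
$$\hat{\lambda}_n = \frac{n\,\bar{z}^2}{s^2}, \qquad z_i = k(x_i, t) - k(y_i, t),$$
where $\bar{z}$ and $s^2$ are the sample mean and variance of $z_1, \dots, z_n$. A location $t$ at which $P$ and $Q$ differ clearly gives a large $\hat{\lambda}_n$.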
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.ex.exglobal as exglo
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import freqopttest.plot as plot
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import scipy.signal as sig
import sys
In [ ]:
# font options
font = {
#'family' : 'normal',
#'weight' : 'bold',
'size' : 18
}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)
In [ ]:
class SSMix2Unif(data.SampleSource):
    """
    1-d problem.
    P: U(m_p - w_p/2, m_p + w_p/2), a uniform distribution with mean m_p
    and width w_p.
    Q: w*U(m_q - w_q/2, m_q + w_q/2) + (1-w)*p(x), where p(x) is the
    density of P. That is, Q adds a narrow bump of mixture weight w at m_q.
    """
    def __init__(self, w, mp, wp, mq, wq):
        if not (w >= 0 and w <= 1):
            raise RuntimeError('w must be in [0, 1]')
        self.w = w
        self.mp = mp
        self.wp = wp
        self.mq = mq
        self.wq = wq

    def dim(self):
        return 1

    def sample(self, n, seed):
        rstate = np.random.get_state()
        np.random.seed(seed)
        w = self.w
        mp = self.mp
        wp = self.wp
        mq = self.mq
        wq = self.wq
        # component indicator: 0 = narrow bump (prob w), 1 = wide component
        disc_var = stats.rv_discrete(values=([0, 1], [w, 1-w]))
        ind = disc_var.rvs(size=n)
        ind0 = ind == 0
        ind1 = ind == 1
        # draw from Q: narrow bump at mq, wide component identical to P
        ys = stats.uniform.rvs(loc=mq-wq/2.0, scale=wq, size=np.sum(ind0))
        yb = stats.uniform.rvs(loc=mp-wp/2.0, scale=wp, size=np.sum(ind1))
        y = np.hstack((ys, yb))
        y = y[:, np.newaxis]
        # draw from P
        x = stats.uniform.rvs(loc=mp-wp/2.0, scale=wp, size=n)
        x = x[:, np.newaxis]
        np.random.set_state(rstate)
        return data.TSTData(x, y, label='mix2unif')
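A minimal sanity check of the sampler (assuming, as used below, that TSTData exposes the two samples as X and Y): with w = 0.4, about 40% of the Y points should land in the narrow bump at m_q.
In [ ]:
# sanity check: the fraction of Y points in the narrow bump should be close to w
check_ss = SSMix2Unif(w=0.4, mp=25, wp=5, mq=0, wq=0.2)
check_data = check_ss.sample(2000, seed=0)
print('fraction of Y in the bump: %.3f (expect about 0.4)'
      % np.mean(np.abs(check_data.Y[:, 0]) <= 0.1))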
In [ ]:
# sample source
n = 4000
alpha = 0.01
w = 0.4
seed = 43
prob_params = {'w': w, 'mp': 25, 'wp': 5, 'mq': 0, 'wq': 0.2}
ss = SSMix2Unif(**prob_params)
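To see the geometry of the problem, a quick sketch of the exact densities implied by prob_params (computed directly from the definitions in SSMix2Unif, no sampling):
In [ ]:
# exact densities of P and Q on a grid
xs = np.linspace(-2, 30, 2000)
p_den = stats.uniform.pdf(xs, loc=prob_params['mp'] - prob_params['wp']/2.0,
                          scale=prob_params['wp'])
q_den = (w * stats.uniform.pdf(xs, loc=prob_params['mq'] - prob_params['wq']/2.0,
                               scale=prob_params['wq'])
         + (1 - w) * p_den)
plt.plot(xs, p_den, label='p(x)')
plt.plot(xs, q_den, label='q(x)')
plt.xlabel('x')
plt.legend(loc='best')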
In [ ]:
tst_data = ss.sample(n, seed=seed)
tr = ss.sample(n//2, seed=seed)
te = ss.sample(n//2, seed=seed+1)
#tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
nte = te.X.shape[0]
In [ ]:
xtr, ytr = tr.xy()
xytr = tr.stack_xy()
bins = np.linspace(np.min(xytr), np.max(xytr), 30)
# `normed` was removed from plt.hist in Matplotlib 3.x; use `density` instead
density_hist = True
plt.hist(xtr, bins, label='X', alpha=0.5, density=density_hist)
plt.hist(ytr, bins, label='Y', alpha=0.5, density=density_hist)
plt.legend(loc='best')
In [ ]:
alpha = 0.01
nte = 400
rep = 400
In [ ]:
# repeat many trials to see the value of the statistic at each candidate location
def two_locations_test_results(nte, k):
    """
    k: an instance of Kernel
    Run `rep` trials. In each trial, draw a fresh test set and run the ME
    test twice: once with the location fixed at the small bump (mq), and
    once at the big bump (mp).
    """
    Tp_results = []
    Tq_results = []
    shift_seed = 1000
    for r in range(shift_seed, shift_seed + rep):
        te = ss.sample(nte, seed=r + 1)
        # test location at the small bump
        Tq = np.array([[prob_params['mq']]])
        # test location at the big bump
        Tp = np.array([[prob_params['mp']]])
        # actual tests
        q_met = tst.METest(Tq, k, alpha)
        Tq_results.append(q_met.perform_test(te))
        p_met = tst.METest(Tp, k, alpha)
        Tp_results.append(p_met.perform_test(te))
    return Tp_results, Tq_results

def prob_tq_better(nte, k):
    """
    k: an instance of Kernel
    Return (fraction of trials in which the location at the small bump gives
    a higher statistic, statistics at mp, statistics at mq).
    """
    Tp_results, Tq_results = two_locations_test_results(nte, k)
    tp_lambs = np.array([r['test_stat'] for r in Tp_results])
    tq_lambs = np.array([r['test_stat'] for r in Tq_results])
    n_small_high = np.sum(tq_lambs > tp_lambs)
    return float(n_small_high) / rep, tp_lambs, tq_lambs

def plot_tup_stats(tup_ntes):
    ps_small_better = np.array([t[0] for t in tup_ntes])
    plt.plot(ntes, ps_small_better, 'ob-')
    plt.xlabel('nte')
    plt.ylabel(r'p(loc at small bump gives high $\hat{\lambda}_n$)')

    plt.figure()
    tp_lamb_means = np.array([np.mean(t[1]) for t in tup_ntes])
    tp_lamb_stds = np.array([np.std(t[1]) for t in tup_ntes])
    tq_lamb_means = np.array([np.mean(t[2]) for t in tup_ntes])
    tq_lamb_stds = np.array([np.std(t[2]) for t in tup_ntes])
    print('mq is the mean of the small bump')
    plt.errorbar(ntes, tp_lamb_means, tp_lamb_stds,
                 label=r'$\hat{\mathbb{E}}[\hat{\lambda}_n \mid \mathrm{at }\,\, m_p]$')
    plt.errorbar(ntes, tq_lamb_means, tq_lamb_stds,
                 label=r'$\hat{\mathbb{E}}[\hat{\lambda}_n \mid \mathrm{at }\,\, m_q]$')
    plt.xlabel('test sample size')
    plt.ylabel(r'$\hat{\lambda}_n$')
    plt.legend(loc='best')
In [ ]:
ntes = np.linspace(5, 200, num=7, dtype=np.int64)
#ntes = np.array([ 25, 50, 75, 100, 150, 200, 250])
tri_width = 3
ktri = kernel.KTriangle(tri_width)
ktri_tup_ntes = [prob_tq_better(nte, ktri) for nte in ntes]
plot_tup_stats(ktri_tup_ntes)
In [ ]:
tp_lamb_means = np.array([np.mean(t[1]) for t in ktri_tup_ntes])
tp_lamb_stds = np.array([np.std(t[1]) for t in ktri_tup_ntes])
tq_lamb_means = np.array([np.mean(t[2]) for t in ktri_tup_ntes])
tq_lamb_stds = np.array([np.std(t[2]) for t in ktri_tup_ntes])
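The summaries above are easier to read printed per sample size (using the ntes grid from the triangle-kernel cell):
In [ ]:
# print mean +- std of the statistic at each location, per test sample size
for i, nt in enumerate(ntes):
    print('nte=%3d: at mp: %6.2f +- %5.2f, at mq: %6.2f +- %5.2f' %
          (nt, tp_lamb_means[i], tp_lamb_stds[i], tq_lamb_means[i], tq_lamb_stds[i]))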
Here we fix the test location at either the left (small) bump or the right (big) bump and compare the objective function values.
In [ ]:
alpha = 0.01
nte = 400
rep = 300
In [ ]:
#ntes = np.linspace(50, 300, num=7, dtype=np.int64)
ntes = np.array([ 25, 50, 75, 100, 150, 200, 250])
#gwidth0 = 1
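# Gaussian kernel width from the median heuristic: squared median pairwise distance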
med_data = ss.sample(1000, seed=10)
gwidth0 = util.meddistance(med_data.stack_xy(), subsample=1000)**2
kgauss = kernel.KGauss(gwidth0)
tup_ntes = [prob_tq_better(nte, kgauss) for nte in ntes]
In [ ]:
plot_tup_stats(tup_ntes)
In [ ]:
dom = np.linspace(-3, 3, 200)
for order in [0, 1, 2]:
bspline_vals = sig.bspline(dom, order)
plt.plot(dom, bspline_vals, label='order=%d'%order)
plt.legend()
plt.title('B-spline')
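scipy.signal.bspline was deprecated in SciPy 1.11 and removed in 1.13. A sketch of the same plot via scipy.interpolate.BSpline.basis_element (assuming a reasonably recent SciPy):
In [ ]:
# equivalent centered B-splines from scipy.interpolate
from scipy.interpolate import BSpline
plt.figure()
for order in [0, 1, 2]:
    # uniform knots centered at 0; the spline degree is len(knots) - 2 = order
    knots = np.arange(order + 2) - (order + 1) / 2.0
    b = BSpline.basis_element(knots, extrapolate=False)
    # basis_element returns nan outside its support; map those to 0
    plt.plot(dom, np.nan_to_num(b(dom)), label='order=%d' % order)
plt.legend()
plt.title('B-spline (scipy.interpolate)')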