The first notebook to test FSIC idea. Likely to be deprecated in the near future. Created on 16 June 2016.


In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import fsic.util as util
import fsic.data as data
import fsic.kernel as kernel
import fsic.indtest as it
import fsic.glo as glo
import scipy.stats as stats

In [ ]:
# font options
font = {
    #'family' : 'normal',
    #'weight' : 'bold',
    'size'   : 15
}

plt.rc('font', **font)
plt.rc('lines', linewidth=2)

In [ ]:
def get_quad_psfunc():
    """
    Return a PairedSource to generate y = x^2 + Gaussian noise.
    """
    px = lambda n: np.random.rand(n, 1)*8 - 4
    f = lambda x: 0.2*x**2 + np.random.randn(x.shape[0], 1)
    return data.PSFunc(f, px)

def kl_kgauss_median(pdata, med_factor=1):
    """
    Get two Gaussian kernels constructed with the median heuristic.
    """
    xtr, ytr = pdata.xy()
    dx = xtr.shape[1]
    dy = ytr.shape[1]
    medx2 = util.meddistance(xtr, subsample=1000)**2
    medy2 = util.meddistance(ytr, subsample=1000)**2
    k = kernel.KGauss(medx2*med_factor)
    l = kernel.KGauss(medy2*med_factor)
    return k, l

In [ ]:
# paired source 
alpha = 0.01
n = 1000
dx = 100
dy = 100
seed = 393
#ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
ps = data.PS2DSinFreq(freq=5)
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])

pdata = ps.sample(n, seed=seed)
X, Y = pdata.xy()
#tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)

In [ ]:
# data and parameters
#xtr, ytr = tr.xy()
#xte, yte = te.xy()
k, l = kl_kgauss_median(pdata)
# number of test locations
J = 5
dx = pdata.dx()
dy = pdata.dy()
#V = np.random.randn(J, dx)
#W = np.random.randn(J, dy)
V, W = it.GaussNFSIC.init_locs_joint_randn(pdata, J, seed=seed+2)
#V, W = it.GaussNFSIC.init_locs_2randn(pdata, J, seed=seed+2)

In [ ]:
# perform test
nfsic = it.NFSIC(k, l, V, W, alpha=alpha, reg=1e-5)
nfsic_result = nfsic.perform_test(pdata)
nfsic_result

Null distribution


In [ ]:
# permute and compute the statistic many times
n_permute = 1000
with util.ContextTimer() as t1:
    test_stats = nfsic.list_permute(X, Y, k, l, V, W, n_permute=n_permute, reg='auto')
    #test_stats = nfsic._list_permute_naive(X, Y, k, l, V, W, n_permute=n_permute)

sim_pval = np.mean(test_stats > nfsic_result['test_stat'])
asymp_pval = nfsic_result['pvalue']

print 'p-value by permutations: %.3g'%sim_pval
print 'p-value by asymptotic chi-square: %.3g'%asymp_pval
print 'permutation took: %.3f s'%t1.secs

In [ ]:
dom = np.linspace(1e-1, np.max(test_stats), 500)
chi2den = stats.chi2.pdf(dom, df=J)
nc2den = stats.ncx2.pdf(dom, df=J, nc=np.mean(test_stats))
# histogram
plt.figure(figsize=(10,4))
plt.hist(test_stats, alpha=0.5, bins=20, normed=True, label='Stats under $H_0$')
plt.plot(dom, chi2den, label=r'$\chi^2(J)$')
#plt.plot(dom, nc2den, label=r'$\chi^2(J, \lambda)$')
plt.legend()
plt.title('FSIC. $J=%d$'%J)

In [ ]:
# plot empirical cdf
sorted_h0_stats = np.sort(test_stats)
normed_ranks = np.arange(len(sorted_h0_stats))/float(len(sorted_h0_stats))
plt.plot(sorted_h0_stats, normed_ranks, label='Simulated ECDF')

cdfs = stats.chi2.cdf(dom, df=J)
plt.plot(dom, cdfs, label='$\chi^2(J)$ CDF')
plt.legend()

In [ ]:
# diff CDF
diff_cdf = normed_ranks - stats.chi2.cdf(sorted_h0_stats, df=J)
plt.plot(sorted_h0_stats, diff_cdf)
plt.title('Diff in the CDFs')
plt.grid(True)

Permutation test


In [ ]:
#nfsic_perm = it.NFSIC(k, l, V, W, alpha=alpha, reg=1e-6, n_permute=500)
st = nfsic.compute_stat(pdata)
pval = np.mean(test_stats > st)

print 'stat: %.3f'%st
print 'p-value: %.3f'%pval

Test power

Simulate from a toy example and try to compute the test power.


In [ ]:
reps = 1000
n = 1000
J = 10
alpha = 0.05
# None = use aymptotics
n_permute = None
#n_permute = 200

ps = data.PSIndSameGauss(dx=20, dy=20)
k, l = kl_kgauss_median(ps.sample(1000, seed=2198), med_factor=1.0)
with util.NumpySeedContext(seed=23):
    V = np.random.randn(J, ps.dx())
    W = np.random.randn(J, ps.dy())


test_results = []
for r in range(reps):
    pdata = ps.sample(n, seed=r)
    pdata2 = ps.sample(300, seed=r+66)
    #with util.NumpySeedContext(seed=23):
    #    V = np.random.randn(J, ps.dx())
    #    W = np.random.randn(J, ps.dy())
    
    V, W = it.GaussNFSIC.init_locs_joint_subset(pdata2, J, seed=r+1)
    #V, W = it.GaussNFSIC.init_locs_2randn(pdata, J, seed=r+3)
        
    #k, l = kl_kgauss_median(pdata, med_factor=1.0)
    nfsic = it.NFSIC(k, l, V, W, alpha=alpha, reg='auto', n_permute=n_permute, seed=89)
    result = nfsic.perform_test(pdata)
    test_results.append(result)

In [ ]:
rejs = [r['h0_rejected'] for r in test_results]
rep_stats = [r['test_stat'] for r in test_results]
thresh = stats.chi2.isf(alpha, df=J)

power = np.mean(rejs)
print 'power: %g'%power

In [ ]:
# histogram
dom = np.linspace(max(1e-1, np.min(rep_stats)), np.max(rep_stats), 600)
chi2_den = stats.chi2.pdf(dom, df=J)

plt.figure(figsize=(10, 5))
plt.hist(rep_stats, bins=20, alpha=0.5, label='Repeated trials', normed=True)
plt.plot(dom, chi2_den, '-', label=r'$\chi^2(%d)$'%J)
plt.legend()

When two locations are very close


In [ ]:
# 2d data
#ps = data.PSIndSameGauss(dx=1, dy=1)
ps = data.PS2DSinFreq(freq=1)
pdata = ps.sample(n=700, seed=8)
X, Y = pdata.xy()

In [ ]:
k, l = kl_kgauss_median(pdata, med_factor=0.85)
reg = 1e-6
W = np.array([[-1.7], [-1.7]])
v0_cand = np.hstack( (np.linspace(-4, 4, 500), [0, -1] ))
v0_cand.sort()
nfsics = np.zeros(len(v0_cand))
for i, v0 in enumerate(v0_cand):
    V = np.array([[v0], [-1]])
    nfsic = it.NFSIC(k, l, V, W, alpha=0.05, reg=reg)
    nfsics[i] = nfsic.compute_stat(pdata)

In [ ]:
# plot

#plt.figure(figsize=(7, 2))
ax1 = plt.subplot(2, 1, 1)
plt.locator_params(nbins=5)

plt.plot(X[:, 0], Y[:, 0], 'k.', label='Sample', alpha=0.8)
plt.plot(v0_cand, np.ones(len(v0_cand))*W[0], 'g-', linewidth=3, label='$\mathbf{t}_2$ trajectory')
plt.plot(V[1], W[1], 'r*', markersize=23, label=r'$\mathbf{t}_1$')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.xlim([-np.pi, np.pi])
plt.ylim([-np.pi, np.pi])
plt.legend(numpoints=1, ncol=3, fontsize=16, 
          # bbox_to_anchor=(1.02, 1.9)
            bbox_to_anchor=(1.02, 1.5)
          )
plt.setp(ax1.get_xticklabels(), visible=False)


# values
plt.subplot(2, 1, 2, sharex=ax1)
plt.locator_params(nbins=5)

plt.plot(v0_cand, nfsics, 'b-', label='$\hat{\lambda}_n(\mathbf{t}_1, \mathbf{t}_2)$')
#plt.title('V: [[v0], [%.1f]], W: %s'%(V[1], W))
plt.xlabel(r'$\mathbf{t}_2$')
plt.ylabel('$\hat{\lambda}_n(\mathbf{t}_1, \mathbf{t}_2)$')
plt.ylim([np.min(nfsics)-10, np.max(nfsics)+10])
plt.xlim([-np.pi, np.pi])
#plt.legend(numpoints=1, ncol=3, loc='lower right')
#plt.gca().get_yaxis().set_visible(False)
plt.savefig('redundant_locs.pdf', bbox_inches='tight')

Test power when J is high


In [ ]:
# paired source 
alpha = 0.01
n = 1000
dx = 100
dy = 100
seed = 393
#ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
ps = data.PS2DSinFreq(freq=2)
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])

pdata = ps.sample(n, seed=seed)
X, Y = pdata.xy()
k, l = kl_kgauss_median(pdata)
tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)

In [ ]:
def test_power(ps, nte, J, reps):
    rejs = np.zeros(reps)
    for r in range(reps):
        te = ps.sample(nte, seed=r+9827)
        tr = ps.sample(nte, seed=r+27)
        V, W = it.GaussNFSIC.init_locs_2randn(tr, J, seed=r+2)
        #V, W = it.GaussNFSIC.init_locs_joint_randn(tr, J, seed=r+2)
        #V, W = it.GaussNFSIC.init_locs_marginals_subset(tr, J, seed=r+2)
        #V, W = it.GaussNFSIC.init_locs_joint_subset(tr, J, seed=r+2)
        nfsic = it.NFSIC(k, l, V, W, alpha=alpha, reg='auto')
        try:
            test_result = nfsic.perform_test(te)
            rejs[r] = test_result['h0_rejected']
        except:
            rejs[r] = False
    return np.mean(rejs)

In [ ]:
nte = 800
reps = 500

Js = range(1, 600+3, 50) + [10, 20, 30, 40]
Js = np.sort(np.array(Js))
#Js = np.logspace(0, 2.6, 10).astype(np.int64)
Js_pow = np.zeros(len(Js))
test_results = np.zeros(len(Js), dtype=np.object)

for i, J in enumerate(Js):
    tpow = test_power(ps, nte, J, reps)
    Js_pow[i] = tpow

In [ ]:
#plt.semilogx(Js, Js_pow, 'bo-')
plt.plot(Js, Js_pow, 'bo-')
plt.xlim([np.min(Js), np.max(Js)])
plt.ylim([np.min(Js_pow), 1])
plt.xlabel('J')
plt.ylabel('Test power')
plt.grid()

fname = 'pow_vs_J.pdf'
plt.savefig(fname, bbox_inches='tight')

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: