The first notebook to test FSIC idea. Likely to be deprecated in the near future. Created on 16 June 2016.
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import fsic.util as util
import fsic.data as data
import fsic.kernel as kernel
import fsic.indtest as it
import fsic.glo as glo
import scipy.stats as stats
In [ ]:
# font options
font = {
#'family' : 'normal',
#'weight' : 'bold',
'size' : 15
}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)
In [ ]:
def get_quad_psfunc():
"""
Return a PairedSource to generate y = x^2 + Gaussian noise.
"""
px = lambda n: np.random.rand(n, 1)*8 - 4
f = lambda x: 0.2*x**2 + np.random.randn(x.shape[0], 1)
return data.PSFunc(f, px)
def kl_kgauss_median(pdata, med_factor=1):
"""
Get two Gaussian kernels constructed with the median heuristic.
"""
xtr, ytr = pdata.xy()
dx = xtr.shape[1]
dy = ytr.shape[1]
medx2 = util.meddistance(xtr, subsample=1000)**2
medy2 = util.meddistance(ytr, subsample=1000)**2
k = kernel.KGauss(medx2*med_factor)
l = kernel.KGauss(medy2*med_factor)
return k, l
In [ ]:
# paired source
alpha = 0.01
n = 1000
dx = 100
dy = 100
seed = 393
#ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
ps = data.PS2DSinFreq(freq=5)
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])
pdata = ps.sample(n, seed=seed)
X, Y = pdata.xy()
#tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)
In [ ]:
# data and parameters
#xtr, ytr = tr.xy()
#xte, yte = te.xy()
k, l = kl_kgauss_median(pdata)
# number of test locations
J = 5
dx = pdata.dx()
dy = pdata.dy()
#V = np.random.randn(J, dx)
#W = np.random.randn(J, dy)
V, W = it.GaussNFSIC.init_locs_joint_randn(pdata, J, seed=seed+2)
#V, W = it.GaussNFSIC.init_locs_2randn(pdata, J, seed=seed+2)
In [ ]:
# perform test
nfsic = it.NFSIC(k, l, V, W, alpha=alpha, reg=1e-5)
nfsic_result = nfsic.perform_test(pdata)
nfsic_result
In [ ]:
# permute and compute the statistic many times
n_permute = 1000
with util.ContextTimer() as t1:
test_stats = nfsic.list_permute(X, Y, k, l, V, W, n_permute=n_permute, reg='auto')
#test_stats = nfsic._list_permute_naive(X, Y, k, l, V, W, n_permute=n_permute)
sim_pval = np.mean(test_stats > nfsic_result['test_stat'])
asymp_pval = nfsic_result['pvalue']
print 'p-value by permutations: %.3g'%sim_pval
print 'p-value by asymptotic chi-square: %.3g'%asymp_pval
print 'permutation took: %.3f s'%t1.secs
In [ ]:
dom = np.linspace(1e-1, np.max(test_stats), 500)
chi2den = stats.chi2.pdf(dom, df=J)
nc2den = stats.ncx2.pdf(dom, df=J, nc=np.mean(test_stats))
# histogram
plt.figure(figsize=(10,4))
plt.hist(test_stats, alpha=0.5, bins=20, normed=True, label='Stats under $H_0$')
plt.plot(dom, chi2den, label=r'$\chi^2(J)$')
#plt.plot(dom, nc2den, label=r'$\chi^2(J, \lambda)$')
plt.legend()
plt.title('FSIC. $J=%d$'%J)
In [ ]:
# plot empirical cdf
sorted_h0_stats = np.sort(test_stats)
normed_ranks = np.arange(len(sorted_h0_stats))/float(len(sorted_h0_stats))
plt.plot(sorted_h0_stats, normed_ranks, label='Simulated ECDF')
cdfs = stats.chi2.cdf(dom, df=J)
plt.plot(dom, cdfs, label='$\chi^2(J)$ CDF')
plt.legend()
In [ ]:
# diff CDF
diff_cdf = normed_ranks - stats.chi2.cdf(sorted_h0_stats, df=J)
plt.plot(sorted_h0_stats, diff_cdf)
plt.title('Diff in the CDFs')
plt.grid(True)
In [ ]:
#nfsic_perm = it.NFSIC(k, l, V, W, alpha=alpha, reg=1e-6, n_permute=500)
st = nfsic.compute_stat(pdata)
pval = np.mean(test_stats > st)
print 'stat: %.3f'%st
print 'p-value: %.3f'%pval
In [ ]:
reps = 1000
n = 1000
J = 10
alpha = 0.05
# None = use aymptotics
n_permute = None
#n_permute = 200
ps = data.PSIndSameGauss(dx=20, dy=20)
k, l = kl_kgauss_median(ps.sample(1000, seed=2198), med_factor=1.0)
with util.NumpySeedContext(seed=23):
V = np.random.randn(J, ps.dx())
W = np.random.randn(J, ps.dy())
test_results = []
for r in range(reps):
pdata = ps.sample(n, seed=r)
pdata2 = ps.sample(300, seed=r+66)
#with util.NumpySeedContext(seed=23):
# V = np.random.randn(J, ps.dx())
# W = np.random.randn(J, ps.dy())
V, W = it.GaussNFSIC.init_locs_joint_subset(pdata2, J, seed=r+1)
#V, W = it.GaussNFSIC.init_locs_2randn(pdata, J, seed=r+3)
#k, l = kl_kgauss_median(pdata, med_factor=1.0)
nfsic = it.NFSIC(k, l, V, W, alpha=alpha, reg='auto', n_permute=n_permute, seed=89)
result = nfsic.perform_test(pdata)
test_results.append(result)
In [ ]:
rejs = [r['h0_rejected'] for r in test_results]
rep_stats = [r['test_stat'] for r in test_results]
thresh = stats.chi2.isf(alpha, df=J)
power = np.mean(rejs)
print 'power: %g'%power
In [ ]:
# histogram
dom = np.linspace(max(1e-1, np.min(rep_stats)), np.max(rep_stats), 600)
chi2_den = stats.chi2.pdf(dom, df=J)
plt.figure(figsize=(10, 5))
plt.hist(rep_stats, bins=20, alpha=0.5, label='Repeated trials', normed=True)
plt.plot(dom, chi2_den, '-', label=r'$\chi^2(%d)$'%J)
plt.legend()
In [ ]:
# 2d data
#ps = data.PSIndSameGauss(dx=1, dy=1)
ps = data.PS2DSinFreq(freq=1)
pdata = ps.sample(n=700, seed=8)
X, Y = pdata.xy()
In [ ]:
k, l = kl_kgauss_median(pdata, med_factor=0.85)
reg = 1e-6
W = np.array([[-1.7], [-1.7]])
v0_cand = np.hstack( (np.linspace(-4, 4, 500), [0, -1] ))
v0_cand.sort()
nfsics = np.zeros(len(v0_cand))
for i, v0 in enumerate(v0_cand):
V = np.array([[v0], [-1]])
nfsic = it.NFSIC(k, l, V, W, alpha=0.05, reg=reg)
nfsics[i] = nfsic.compute_stat(pdata)
In [ ]:
# plot
#plt.figure(figsize=(7, 2))
ax1 = plt.subplot(2, 1, 1)
plt.locator_params(nbins=5)
plt.plot(X[:, 0], Y[:, 0], 'k.', label='Sample', alpha=0.8)
plt.plot(v0_cand, np.ones(len(v0_cand))*W[0], 'g-', linewidth=3, label='$\mathbf{t}_2$ trajectory')
plt.plot(V[1], W[1], 'r*', markersize=23, label=r'$\mathbf{t}_1$')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.xlim([-np.pi, np.pi])
plt.ylim([-np.pi, np.pi])
plt.legend(numpoints=1, ncol=3, fontsize=16,
# bbox_to_anchor=(1.02, 1.9)
bbox_to_anchor=(1.02, 1.5)
)
plt.setp(ax1.get_xticklabels(), visible=False)
# values
plt.subplot(2, 1, 2, sharex=ax1)
plt.locator_params(nbins=5)
plt.plot(v0_cand, nfsics, 'b-', label='$\hat{\lambda}_n(\mathbf{t}_1, \mathbf{t}_2)$')
#plt.title('V: [[v0], [%.1f]], W: %s'%(V[1], W))
plt.xlabel(r'$\mathbf{t}_2$')
plt.ylabel('$\hat{\lambda}_n(\mathbf{t}_1, \mathbf{t}_2)$')
plt.ylim([np.min(nfsics)-10, np.max(nfsics)+10])
plt.xlim([-np.pi, np.pi])
#plt.legend(numpoints=1, ncol=3, loc='lower right')
#plt.gca().get_yaxis().set_visible(False)
plt.savefig('redundant_locs.pdf', bbox_inches='tight')
In [ ]:
# paired source
alpha = 0.01
n = 1000
dx = 100
dy = 100
seed = 393
#ps = data.PSIndSameGauss(dx, dy)
#ps = get_quad_psfunc()
ps = data.PS2DSinFreq(freq=2)
#ps = data.PSIndUnif(xlb=[0, 3], xub=[1, 10], ylb=[-5, 5], yub=[8, 10])
pdata = ps.sample(n, seed=seed)
X, Y = pdata.xy()
k, l = kl_kgauss_median(pdata)
tr, te = pdata.split_tr_te(tr_proportion=0.5, seed=10)
In [ ]:
def test_power(ps, nte, J, reps):
rejs = np.zeros(reps)
for r in range(reps):
te = ps.sample(nte, seed=r+9827)
tr = ps.sample(nte, seed=r+27)
V, W = it.GaussNFSIC.init_locs_2randn(tr, J, seed=r+2)
#V, W = it.GaussNFSIC.init_locs_joint_randn(tr, J, seed=r+2)
#V, W = it.GaussNFSIC.init_locs_marginals_subset(tr, J, seed=r+2)
#V, W = it.GaussNFSIC.init_locs_joint_subset(tr, J, seed=r+2)
nfsic = it.NFSIC(k, l, V, W, alpha=alpha, reg='auto')
try:
test_result = nfsic.perform_test(te)
rejs[r] = test_result['h0_rejected']
except:
rejs[r] = False
return np.mean(rejs)
In [ ]:
nte = 800
reps = 500
Js = range(1, 600+3, 50) + [10, 20, 30, 40]
Js = np.sort(np.array(Js))
#Js = np.logspace(0, 2.6, 10).astype(np.int64)
Js_pow = np.zeros(len(Js))
test_results = np.zeros(len(Js), dtype=np.object)
for i, J in enumerate(Js):
tpow = test_power(ps, nte, J, reps)
Js_pow[i] = tpow
In [ ]:
#plt.semilogx(Js, Js_pow, 'bo-')
plt.plot(Js, Js_pow, 'bo-')
plt.xlim([np.min(Js), np.max(Js)])
plt.ylim([np.min(Js_pow), 1])
plt.xlabel('J')
plt.ylabel('Test power')
plt.grid()
fname = 'pow_vs_J.pdf'
plt.savefig(fname, bbox_inches='tight')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: