In [3]:
from collections import Counter
from itertools import product
import numpy as np
from scipy import stats
from history_diagnostics import targetspace

In [2]:
%pylab


Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['product']
`%matplotlib` prevents importing * from pylab and numpy

In [4]:
def evaluate_bootstrap_cdfs(metric, gen_func, n_bs_ref=200, n_test=500):
    ref_sample = gen_func()
    metric_sample = np.fromiter((metric(np.random.choice(ref_sample, len(ref_sample))) for _ in range(n_bs_ref)),
                          dtype=float, count=n_bs_ref)
    cdf = targetspace.MetricCDF(metric_sample)
    p_better = np.asarray([cdf(metric(gen_func())) for _ in range(n_test)])
    return p_better, cdf

def evaluate_bootstrap_cdfs_2(metric, gen_func, n_bs_ref=200, n_test=500):
    p_better = []
    for _ in range(n_test):
        ref_sample = gen_func()
        test_sample = gen_func()
        metric_sample = np.fromiter((metric(np.random.choice(ref_sample, len(ref_sample))) for _ in range(n_bs_ref)),
                                    dtype=float, count=n_bs_ref)
        cdf = targetspace.MetricCDF(metric_sample)
        p_better.append(cdf(metric(test_sample)))
    return np.asarray(p_better)

In [7]:
def plot_p_better_cdfs_Normal(n_bs_ref, n_test=1000):
    figure()
    suptitle("Normal(0, sig, n) with n_bs_ref={}".format(n_bs_ref))
    plot([0,1], [0,1], "r--", lw=2)
    bins = linspace(0, 1, 101)
    def metric(s):
        return s.sum() / len(s)
    samp_size_max_exp = 5
    for sig_exp in range(0, samp_size_max_exp):
        sig = 10**(-sig_exp)
        for samp_size_exp in range(0, samp_size_max_exp+1):
            samp_size = 10**samp_size_exp
            print(sig, samp_size)
            samp_gen = lambda: np.random.randn(samp_size) * sig
            p_better = evaluate_bootstrap_cdfs_2(metric, samp_gen, n_bs_ref, n_test)
            hist(p_better, bins, cumulative=True, normed=True, histtype="step",
                 label="sig=10^-{} n=10^{}".format(sig_exp, samp_size_exp))
    legend(loc='best')

In [8]:
plot_p_better_cdfs_Normal(100)


1 1
1 10
1 100
1 1000
1 10000
1 100000
0.1 1
0.1 10
0.1 100
0.1 1000
0.1 10000
0.1 100000
0.01 1
0.01 10
0.01 100
0.01 1000
0.01 10000
0.01 100000
0.001 1
0.001 10
0.001 100
0.001 1000
0.001 10000
0.001 100000
0.0001 1
0.0001 10
0.0001 100
0.0001 1000
0.0001 10000
0.0001 100000