In [39]:
from collections import Counter
from itertools import product
import numpy as np
from scipy import stats
from history_diagnostics import targetspace
In [9]:
%pylab
In [35]:
def evaluate_bootstrap_cdfs(metric, gen_func, n_bs_ref=200, n_test=500):
    # Build the metric's bootstrap distribution from a single reference sample,
    # then score n_test fresh samples against that one fixed CDF.
    ref_sample = gen_func()
    metric_sample = np.fromiter((metric(np.random.choice(ref_sample, len(ref_sample)))
                                 for _ in range(n_bs_ref)),
                                dtype=float, count=n_bs_ref)
    cdf = targetspace.MetricCDF(metric_sample)
    p_better = np.asarray([cdf(metric(gen_func())) for _ in range(n_test)])
    return p_better, cdf
def evaluate_bootstrap_cdfs_2(metric, gen_func, n_bs_ref=200, n_test=500):
    # Variant: draw a fresh reference/test pair for every trial, so the
    # resulting p_better values are independent across trials.
    p_better = []
    for _ in range(n_test):
        ref_sample = gen_func()
        test_sample = gen_func()
        metric_sample = np.fromiter((metric(np.random.choice(ref_sample, len(ref_sample)))
                                     for _ in range(n_bs_ref)),
                                    dtype=float, count=n_bs_ref)
        cdf = targetspace.MetricCDF(metric_sample)
        p_better.append(cdf(metric(test_sample)))
    return np.asarray(p_better)
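The two helpers differ in how the reference is handled: the first bootstraps one fixed reference sample and scores every test draw against the same CDF, while the second draws a fresh reference/test pair per trial. For comparison, the fixed-reference variant can be exercised on the same Binomial setup as below (an illustrative sketch; its p_better values all share one reference sample and are therefore correlated across trials):
In [ ]:
# Fixed-reference variant on the same setup; returns the p_better values
# and the single bootstrap CDF they were scored against.
p_better_fixed, ref_cdf = evaluate_bootstrap_cdfs(lambda s: s.sum() / len(s),
                                                  lambda: np.random.binomial(1, 0.1, 1000),
                                                  n_bs_ref=100,
                                                  n_test=1000)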
In [36]:
p_better = evaluate_bootstrap_cdfs_2(lambda s: s.sum() / len(s),
                                     lambda: np.random.binomial(1, 0.1, 1000),
                                     n_bs_ref=100,
                                     n_test=1000)
In [20]:
print(1 / len(p_better))
for significance in (0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5):
    print("{:.3f} {}".format(significance, ((1 - p_better) <= significance).sum() / len(p_better)))
In [37]:
figure()
hist(p_better, linspace(0, 1, 101), normed=True, cumulative=True)
plot([0,1], [0,1], 'r--', lw=4)
Out[37]:
In [58]:
def plot_p_better_cdfs_Binomial(n_bs_ref, n_test=1000):
    figure()
    suptitle("Binomial(1, p, n) with n_bs_ref={}".format(n_bs_ref))
    plot([0, 1], [0, 1], "r--", lw=2)
    bins = linspace(0, 1, 101)

    def metric(s):
        return s.sum() / len(s)

    samp_size_max_exp = 5
    for p_exp in range(1, samp_size_max_exp):
        p = 10**(-p_exp)
        for samp_size_exp in range(p_exp + 1, samp_size_max_exp + 1):
            samp_size = 10**samp_size_exp
            print(p, samp_size)
            samp_gen = lambda: np.random.binomial(1, p, size=samp_size)
            p_better = evaluate_bootstrap_cdfs_2(metric, samp_gen, n_bs_ref, n_test)
            hist(p_better, bins, cumulative=True, normed=True, histtype="step",
                 label="p=10^-{} n=10^{}".format(p_exp, samp_size_exp))
    legend(loc='best')
In [59]:
plot_p_better_cdfs_Binomial(500)
In [49]:
plot([0,1], [0,1], "r--")
xlabel("P(better)")
ylabel("cumulative frequency")
Out[49]:
In [50]:
f = gcf()
In [52]:
t = f.texts[0]
In [54]:
t.set_text("Binomial(1, p, n) n_bs_ref=100")
In [33]:
cdf = targetspace.MetricCDF(np.arange(5))
In [38]:
cdf.index(0), cdf(0)
Out[38]:
In [4]:
np.searchsorted(np.arange(5), 0, side='right')
Out[4]:
In [24]:
import importlib
In [32]:
import history_diagnostics.stats
importlib.reload(history_diagnostics.stats)
importlib.reload(targetspace)
Out[32]:
In [29]:
cdf2 = history_diagnostics.stats.MetricCDF(np.arange(5))
cdf2.index(0), cdf2(0)
Out[29]:
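The two checks above are consistent with MetricCDF being an empirical CDF built on np.searchsorted(..., side='right'). A minimal sketch with those assumed semantics (the actual history_diagnostics implementation may differ):
In [ ]:
# Empirical-CDF sketch, assuming MetricCDF-like semantics: index() counts
# how many reference values are <= x, and calling the object returns that
# count as a fraction of the sample size.
class EmpiricalCDF:
    def __init__(self, sample):
        self.sample = np.sort(np.asarray(sample, dtype=float))

    def index(self, x):
        return np.searchsorted(self.sample, x, side="right")

    def __call__(self, x):
        return self.index(x) / len(self.sample)

ecdf = EmpiricalCDF(np.arange(5))
ecdf.index(0), ecdf(0)  # expected: (1, 0.2)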
Define test samples
In [2]:
ref = targetspace.Sample.generate(0, 1, 1000,
    performance=("float", lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
    error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
    freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
    rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
same = targetspace.Sample.generate(0, 1, 1000,
    performance=("float", lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
    error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
    freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
    rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
How often are two i.i.d. samples marked as distinct, and which metric is responsible?
The expected false-positive probability for OneMinusMaxMixin is
$$
1 - P[\text{all metric p-values above significance}]
\\ = 1 - P[\text{p-value} > \mathit{significance}]^{n_\mathit{metrics}}
\\ = 1 - (1 - \mathit{significance})^{n_\mathit{metrics}}.
$$
In [10]:
n_tests = 10000
n_metrics = 4
significance = 0.02
t = (np.random.rand(n_tests, n_metrics) < significance).any(axis=1)
print(t.sum() / n_tests, 1 - (1-significance)**n_metrics)
In [3]:
metric_generators = dict(
    performance=("float", lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
    error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
    freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
    rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
In [36]:
@targetspace.metric_directions("upper", "lower", "lower", "upper", "upper")
@targetspace.metric_ranges((0, None), (0, None), (0, 1), (0, 1), (0, 1))
def calc_context(sample):
    n_requests = len(sample)
    if n_requests == 0:
        return 0, 0, 0, 0, 0
    return (n_requests / sample.duration,
            np.median(sample.requests.performance),
            sample.requests.error.sum() / n_requests,
            sample.requests.freq_evt.sum() / n_requests,
            sample.requests.rare_evt.sum() / n_requests)

class RecordingOneMinusMaxMixin(targetspace.OneMinusMaxMixin):
    # Record which metric produced the maximum, i.e. which metric
    # drives each combined score.
    def combine_evaluated_metrics(self, evaluated):
        i_max = evaluated.argmax()
        self.max_metrics.append(i_max)
        return evaluated[i_max]

class TestTargetSpace(RecordingOneMinusMaxMixin, targetspace.TargetSpace):
    def __init__(self, sample):
        self.max_metrics = []
        super().__init__(calc_context, sample)
In [40]:
def run_test(n_requests, n_tests):
    # One null comparison per iteration: two i.i.d. samples from the same
    # generators, one used as reference, one located against it.
    locs = np.empty(n_tests)
    max_metrics = []
    for i in range(n_tests):
        print(".", end="")
        if (i + 1) % 100 == 0:
            print()
        a = targetspace.Sample.generate(0, 1, n_requests, **metric_generators)
        b = targetspace.Sample.generate(0, 1, n_requests, **metric_generators)
        ts = TestTargetSpace(a)
        ts.calibrate(1.0, 200, 0.01)
        locs[i] = ts.locate(b)
        max_metrics.append(ts.max_metrics[-1])
    return locs, max_metrics
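The call that produced locs and max_metrics is not shown in this transcript; a hypothetical invocation consistent with the cells below (the parameter values are assumptions):
In [ ]:
# Hypothetical call; n_requests and n_tests values are assumed.
locs, max_metrics = run_test(1000, 500)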
In [39]:
Counter(max_metrics)
Out[39]:
In [41]:
max_metrics = np.array(max_metrics)
In [47]:
for i in range(5):
    idx = max_metrics == i
    print(i, idx.sum(), np.percentile(locs[idx], (100, 99, 95, 90, 75, 50)))
In [163]:
hist(p_better, linspace(0, 1, 101), cumulative=True, normed=True)
plot([0, 1], [0, 1], "r--", lw=2)
Out[163]:
In [15]:
sum(locs <= 0.05) / locs.size, 1 - 0.95 ** 5
Out[15]:
In [55]:
np.random.binomial?
np.random.binomial(1, 0.05, 100)
Out[55]:
In [3]:
worse_performance = targetspace.Sample.generate(0, 1, 1000,
    performance=("float", lambda times: np.abs(stats.norm.rvs(loc=4.5 + 0.5 * 0.54, scale=0.54, size=times.size))),
    error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
    freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
    rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
worse_error_02 = targetspace.Sample.generate(0, 1, 1000,
    performance=("float", lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
    error=(bool, lambda times: stats.binom.rvs(1, 0.05 * 1.2, size=times.size)),
    freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
    rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
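A sketch of how these degraded samples could be scored against the reference, reusing the calibration parameters from run_test above (assumed here, since the original locate calls are not shown):
In [ ]:
# Score the degraded samples against the i.i.d. reference sample.
ts = TestTargetSpace(ref)
ts.calibrate(1.0, 200, 0.01)
ts.locate(worse_performance), ts.locate(worse_error_02)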