In [39]:
from collections import Counter
from itertools import product
import numpy as np
from scipy import stats
from history_diagnostics import targetspace

In [9]:
%pylab


Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib

Evaluate the bootstrap CDF method


In [35]:
def evaluate_bootstrap_cdfs(metric, gen_func, n_bs_ref=200, n_test=500):
    # Build one bootstrap CDF from a single reference sample, then evaluate
    # many fresh samples from the same generator against it.
    ref_sample = gen_func()
    metric_sample = np.fromiter((metric(np.random.choice(ref_sample, len(ref_sample))) for _ in range(n_bs_ref)),
                                dtype=float, count=n_bs_ref)
    cdf = targetspace.MetricCDF(metric_sample)
    p_better = np.asarray([cdf(metric(gen_func())) for _ in range(n_test)])
    return p_better, cdf

def evaluate_bootstrap_cdfs_2(metric, gen_func, n_bs_ref=200, n_test=500):
    # Variant: draw a fresh reference sample (and hence a fresh bootstrap
    # CDF) for every test sample, removing the dependence on one reference.
    p_better = []
    for _ in range(n_test):
        ref_sample = gen_func()
        test_sample = gen_func()
        metric_sample = np.fromiter((metric(np.random.choice(ref_sample, len(ref_sample))) for _ in range(n_bs_ref)),
                                    dtype=float, count=n_bs_ref)
        cdf = targetspace.MetricCDF(metric_sample)
        p_better.append(cdf(metric(test_sample)))
    return np.asarray(p_better)

In [36]:
p_better = evaluate_bootstrap_cdfs_2(lambda s: s.sum() / len(s),
                                     lambda: np.random.binomial(1, 0.1, 1000),
                                     n_bs_ref=100,
                                     n_test=1000)
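
A complementary check (a sketch, not part of the original run): for a well-calibrated bootstrap CDF, p_better should be uniform on [0, 1], and a Kolmogorov-Smirnov test condenses the deviation into a single number.

In [ ]:
# Sketch: quantify how far p_better is from Uniform(0, 1).
stats.kstest(p_better, "uniform")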

In [20]:
print(1/len(p_better))
for significance in (0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5):
    print("{:.3f} {}".format(significance, ((1-p_better) <= significance).sum() / len(p_better)))


0.001
0.010 0.06
0.020 0.08
0.030 0.098
0.040 0.122
0.050 0.133
0.100 0.192
0.200 0.296
0.300 0.364
0.400 0.447
0.500 0.514
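
The observed rates at small significance clearly exceed the nominal values; a sketch of the Monte-Carlo standard errors confirms the excess is far outside sampling noise.

In [ ]:
# Sketch: binomial standard error on the observed rates above (1000 tests);
# e.g. at significance 0.01 the observed 0.06 is ~16 standard errors high.
for significance in (0.01, 0.05, 0.1):
    se = np.sqrt(significance * (1 - significance) / len(p_better))
    print("{:.3f} nominal, MC standard error {:.4f}".format(significance, se))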

In [37]:
figure()
hist(p_better, linspace(0, 1, 101), normed=True, cumulative=True)
plot([0,1], [0,1], 'r--', lw=4)


Out[37]:
[<matplotlib.lines.Line2D at 0x7f1a3913d828>]

In [58]:
def plot_p_better_cdfs_Binomial(n_bs_ref, n_test=1000):
    figure()
    suptitle("Binomial(1, p, n) with n_bs_ref={}".format(n_bs_ref))
    plot([0,1], [0,1], "r--", lw=2)
    bins = linspace(0, 1, 101)
    def metric(s):
        return s.sum() / len(s)
    samp_size_max_exp = 5
    for p_exp in range(1, samp_size_max_exp):
        p = 10**(-p_exp)
        for samp_size_exp in range(p_exp+1, samp_size_max_exp+1):
            samp_size = 10**samp_size_exp
            print(p, samp_size)
            samp_gen = lambda: np.random.binomial(1, p, size=samp_size)
            p_better = evaluate_bootstrap_cdfs_2(metric, samp_gen, n_bs_ref, n_test)
            hist(p_better, bins, cumulative=True, normed=True, histtype="step",
                 label="p=10^-{} n=10^{}".format(p_exp, samp_size_exp))
    legend(loc='best')
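
Note: the nested loops only visit (p, n) combinations with at least n * p = 10 expected successes per sample, which is exactly the grid printed below.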

In [59]:
plot_p_better_cdfs_Binomial(500)


0.1 100
0.1 1000
0.1 10000
0.1 100000
0.01 1000
0.01 10000
0.01 100000
0.001 10000
0.001 100000
0.0001 100000

In [49]:
plot([0,1], [0,1], "r--")
xlabel("P(better)")
ylabel("cumulative frequency")


Out[49]:
<matplotlib.text.Text at 0x7f1a2bf4ee10>

In [50]:
f = gcf()

In [52]:
t = f.texts[0]

In [54]:
t.set_text("Binomial(1, p, n)  n_bs_ref=100")

In [33]:
cdf = targetspace.MetricCDF(np.arange(5))

In [38]:
cdf.index(0), cdf(0)


Out[38]:
(1, 0.80000000000000004)

In [4]:
np.searchsorted(np.arange(5), 0, side='right')


Out[4]:
1
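
The two results above are consistent with MetricCDF being a right-sided searchsorted over the sorted metric sample. A sketch of that presumed relation (an assumption about the implementation, not confirmed here):

In [ ]:
# Assumption: MetricCDF(x) ~ 1 - searchsorted(sorted_sample, x, side='right') / len(sample).
# For np.arange(5) and x = 0 this reproduces index 1 and 1 - 1/5 = 0.8,
# matching the MetricCDF outputs above.
sample = np.sort(np.arange(5))
idx = np.searchsorted(sample, 0, side="right")
idx, 1 - idx / len(sample)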

In [24]:
import importlib

In [32]:
import history_diagnostics.stats
importlib.reload(history_diagnostics.stats)
importlib.reload(targetspace)


Out[32]:
<module 'history_diagnostics.targetspace' from '/home/eike/fdx/work-packages/history-diagnostics/history_diagnostics/targetspace.py'>

In [29]:
cdf2 = history_diagnostics.stats.MetricCDF(np.arange(5))
cdf2.index(0), cdf2(0)


Out[29]:
(1, 0.80000000000000004)

Define test samples


In [2]:
ref = targetspace.Sample.generate(0, 1, 1000,
                                  performance=("float",
                                               lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
                                  error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
                                  freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
                                  rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
same = targetspace.Sample.generate(0, 1, 1000,
                                   performance=("float",
                                                lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
                                   error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
                                   freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
                                   rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))

Test recall

How often are two i.i.d. samples marked as distinct, and which metric is responsible?

The expected false-positive probability for OneMinusMaxMixin is $$ 1 - P[ \text{all metric p-values above significance} ] \\ = 1 - P[ \text{p-value} > \mathit{significance} ]^{n_\mathit{metrics}} \\ = 1 - (1-\mathit{significance})^{n_\mathit{metrics}} $$.

Test of above formula:


In [10]:
n_tests = 10000
n_metrics = 4
significance = 0.02
t = (np.random.rand(n_tests, n_metrics) < significance).any(axis=1)
print(t.sum() / n_tests, 1 - (1-significance)**n_metrics)


0.0766 0.07763184000000012
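
The Monte-Carlo estimate (0.0766) agrees with the closed-form value (0.0776) within sampling error.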

In [3]:
metric_generators = dict(performance=("float", lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
                         error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
                         freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
                         rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))

Test with realistic data


In [36]:
@targetspace.metric_directions("upper", "lower", "lower", "upper", "upper")
@targetspace.metric_ranges((0, None), (0, None), (0, 1), (0, 1), (0, 1))
def calc_context(sample):
    # The five metrics: request rate, median performance, error rate,
    # freq_evt rate, and rare_evt rate.
    n_requests = len(sample)
    if n_requests == 0:
        return 0, 0, 0, 0, 0

    return (n_requests / sample.duration,
            np.median(sample.requests.performance),
            sample.requests.error.sum() / n_requests,
            sample.requests.freq_evt.sum() / n_requests,
            sample.requests.rare_evt.sum() / n_requests)

class RecordingOneMinusMaxMixin(targetspace.OneMinusMaxMixin):
    # Record which metric produced the maximum, i.e. which metric is
    # responsible for each combined score.
    def combine_evaluated_metrics(self, evaluated):
        i_max = evaluated.argmax()
        self.max_metrics.append(i_max)
        return evaluated[i_max]

class TestTargetSpace(RecordingOneMinusMaxMixin, targetspace.TargetSpace):
    def __init__(self, sample):
        self.max_metrics = []
        super().__init__(calc_context, sample)

In [40]:
def run_test(n_requests, n_tests):
    locs = np.empty(n_tests)
    max_metrics = []
    for i in range(n_tests):
        print(".", end="")
        if (i + 1) % 100 == 0:
            print()
        a = targetspace.Sample.generate(0, 1, n_requests, **metric_generators)
        b = targetspace.Sample.generate(0, 1, n_requests, **metric_generators)
        ts = TestTargetSpace(a)
        ts.calibrate(1.0, 200, 0.01)
        locs[i] = ts.locate(b)
        max_metrics.append(ts.max_metrics[-1])
    return locs, max_metrics
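
The invocation itself is not recorded in this transcript; a sketch of the presumed call (both arguments assumed; the Counter below sums to 1000 tests):

In [ ]:
# Presumed invocation (parameters assumed, not recorded in the transcript).
locs, max_metrics = run_test(1000, 1000)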

In [39]:
Counter(max_metrics)


Out[39]:
Counter({1: 302, 2: 280, 3: 233, 4: 122, 0: 63})

In [41]:
max_metrics = np.array(max_metrics)

In [47]:
for i in range(5):
    idx = max_metrics == i
    print(i, idx.sum(), np.percentile(locs[idx], (100, 99, 95, 90, 75, 50)))


0 63 [0.56000000000000005, 0.54760000000000009, 0.53499999999999992, 0.52800000000000002, 0.51000000000000001, 0.48499999999999999]
1 302 [1.0, 1.0, 1.0, 1.0, 0.98499999999999999, 0.93999999999999995]
2 280 [1.0, 1.0, 1.0, 1.0, 0.98999999999999999, 0.93500000000000005]
3 233 [1.0, 1.0, 1.0, 0.99499999999999988, 0.97999999999999998, 0.92500000000000004]
4 122 [0.97999999999999998, 0.97894999999999999, 0.96499999999999997, 0.95499999999999996, 0.91500000000000004, 0.83749999999999991]
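
Reading the table: when the request-rate metric (0) dominates the combined score, the location never exceeds 0.56, while the performance and event-rate metrics (1-4) routinely push it to or near 1.0.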

In [163]:
hist(p_better, linspace(0, 1, 101), cumulative=True, normed=True)
plot([0,1], [0,1], "r--", lw=2)


Out[163]:
[<matplotlib.lines.Line2D at 0x7f0431e117f0>]

In [81]:
s, ss


Out[81]:
(array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]))

In [15]:
sum(locs <= 0.05) / locs.size, 1 - 0.95 ** 5


Out[15]:
(0.39000000000000001, 0.22621906250000023)
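
The observed false-positive rate (0.39) is well above the 1 - 0.95**5 ≈ 0.226 expected for five independent uniform p-values, consistent with the miscalibrated per-metric bootstrap CDFs seen above.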

In [55]:
np.random.binomial?
np.random.binomial(1, 0.05, 100)


Out[55]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [3]:
worse_performance = targetspace.Sample.generate(0, 1, 1000,
                                   performance=("float",
                                                lambda times: np.abs(stats.norm.rvs(loc=4.5 + 0.5 * 0.54, scale=0.54, size=times.size))),
                                   error=(bool, lambda times: stats.binom.rvs(1, 0.05, size=times.size)),
                                   freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
                                   rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
worse_error_02 = targetspace.Sample.generate(0, 1, 1000,
                                   performance=("float",
                                                lambda times: np.abs(stats.norm.rvs(loc=4.5, scale=0.54, size=times.size))),
                                   error=(bool, lambda times: stats.binom.rvs(1, 0.05 * 1.2, size=times.size)),
                                   freq_evt=(bool, lambda times: stats.binom.rvs(1, 0.2, size=times.size)),
                                   rare_evt=(bool, lambda times: stats.binom.rvs(1, 0.01, size=times.size)))
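
The degraded samples are not exercised further in this transcript; a sketch of a follow-up, using only the calls seen above:

In [ ]:
# Sketch (assumed follow-up): calibrate on the unchanged reference,
# then locate the degraded samples.
ts = TestTargetSpace(ref)
ts.calibrate(1.0, 200, 0.01)
ts.locate(worse_performance), ts.locate(worse_error_02)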
