In [19]:
import numpy as np
import pandas as pd
import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint
In [20]:
data = pd.read_csv('banner_click_stat.txt', header = None, sep = '\t')
data.columns = ['banner_a', 'banner_b']
In [21]:
data.head()
Out[21]:
In [22]:
data.describe()
Out[22]:
In [23]:
conf_interval_banner_a = proportion_confint(sum(data.banner_a),
data.shape[0],
method = 'wilson')
conf_interval_banner_b = proportion_confint(sum(data.banner_b),
data.shape[0],
method = 'wilson')
In [24]:
print ('interval for banner a [%f, %f]' % conf_interval_banner_a)
print ('interval for banner b [%f, %f]' % conf_interval_banner_b)
$X_1$ | $X_2$ | |
---|---|---|
1 | a | b |
0 | c | d |
$\sum$ | $n_1$ | $n_2$ |
In [25]:
def proportions_confint_diff_ind(sample1, sample2, alpha = 0.05):
z = scipy.stats.norm.ppf(1 - alpha / 2.)
p1 = float(sum(sample1)) / len(sample1)
p2 = float(sum(sample2)) / len(sample2)
left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
return (left_boundary, right_boundary)
In [26]:
print ("confidence interval: [%f, %f]" % proportions_confint_diff_ind(data.banner_a, data.banner_b))
$X_1$ \ $X_2$ | 1 | 0 | $\sum$ |
---|---|---|---|
1 | e | f | e + f |
0 | g | h | g + h |
$\sum$ | e + g | f + h | n |
In [38]:
def proportions_confint_diff_rel(sample1, sample2, alpha = 0.05):
z = scipy.stats.norm.ppf(1 - alpha / 2.)
sample = zip(sample1, sample2)
#n = len(sample)
#test_data = list(sample)
n = len(sample1)
f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
left_boundary = float(f - g) / n - z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
right_boundary = float(f - g) / n + z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
return (left_boundary, right_boundary)
In [40]:
print ("confidence interval: [%f, %f]" % proportions_confint_diff_rel(data.banner_a, data.banner_b))