In [1]:
import numpy as np
import pandas as pd
import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint
In [2]:
data = pd.read_csv('banner_click_stat.txt', header = None, sep = '\t')
data.columns = ['banner_a', 'banner_b']
In [3]:
data.head()
Out[3]:
In [4]:
data.describe()
Out[4]:
In [5]:
conf_interval_banner_a = proportion_confint(sum(data.banner_a),
data.shape[0],
method = 'wilson')
conf_interval_banner_b = proportion_confint(sum(data.banner_b),
data.shape[0],
method = 'wilson')
In [6]:
print '95%% confidence interval for a click probability, banner a: [%f, %f]' % conf_interval_banner_a
print '95%% confidence interval for a click probability, banner b [%f, %f]' % conf_interval_banner_b
| $X_1$ | $X_2$ | |
|---|---|---|
| 1 | a | b |
| 0 | c | d |
| $\sum$ | $n_1$ | $n_2$ |
In [7]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
z = scipy.stats.norm.ppf(1 - alpha / 2.)
p1 = float(sum(sample1)) / len(sample1)
p2 = float(sum(sample2)) / len(sample2)
left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
return (left_boundary, right_boundary)
In [8]:
def proportions_diff_z_stat_ind(sample1, sample2):
n1 = len(sample1)
n2 = len(sample2)
p1 = float(sum(sample1)) / n1
p2 = float(sum(sample2)) / n2
P = float(p1*n1 + p2*n2) / (n1 + n2)
return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))
In [9]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
if alternative not in ('two-sided', 'less', 'greater'):
raise ValueError("alternative not recognized\n"
"should be 'two-sided', 'less' or 'greater'")
if alternative == 'two-sided':
return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
if alternative == 'less':
return scipy.stats.norm.cdf(z_stat)
if alternative == 'greater':
return 1 - scipy.stats.norm.cdf(z_stat)
In [10]:
print "95%% confidence interval for a difference between proportions: [%f, %f]" %\
proportions_diff_confint_ind(data.banner_a, data.banner_b)
In [11]:
print "p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(data.banner_a, data.banner_b))
In [12]:
print "p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(data.banner_a, data.banner_b), 'less')
| $X_1$ \ $X_2$ | 1 | 0 | $\sum$ |
|---|---|---|---|
| 1 | e | f | e + f |
| 0 | g | h | g + h |
| $\sum$ | e + g | f + h | n |
In [13]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
z = scipy.stats.norm.ppf(1 - alpha / 2.)
sample = zip(sample1, sample2)
n = len(sample)
f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
left_boundary = float(f - g) / n - z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
right_boundary = float(f - g) / n + z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
return (left_boundary, right_boundary)
In [14]:
def proportions_diff_z_stat_rel(sample1, sample2):
sample = zip(sample1, sample2)
n = len(sample)
f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )
In [15]:
print "95%% confidence interval for a difference between proportions: [%f, %f]" \
% proportions_diff_confint_rel(data.banner_a, data.banner_b)
In [22]:
print "p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_rel(data.banner_a, data.banner_b))
In [23]:
print "p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_rel(data.banner_a, data.banner_b), 'less')