P_control is the conversion rate of the control group, and P_experiment is the conversion rate of the experiment group. d is the detectable effect D_min, indicating the minimum change that is practically significant to the business. For example, with a baseline conversion rate of 0.20 and D_min = 0.01, the business would only act on the change if conversion moves beyond 0.21 (or below 0.19).
In [119]:
import pandas as pd
import math
import numpy as np
from scipy.stats import norm
In [6]:
# Control group
control_df = pd.read_csv('ab_control.csv')
control_df.head()
Out[6]:
In [9]:
# Experiment group
experiment_df = pd.read_csv('ab_experiment.csv')
experiment_df.head()
Out[9]:
In [10]:
control_df.describe()
Out[10]:
In [11]:
experiment_df.describe()
Out[11]:
GConversion = enrolled/clicks
Retention = paid/enrolled
NConversion = paid/clicks
In [28]:
baseline = {'Cookies': 40000, 'Clicks': 3200, 'Enrollments': 660, 'CTP': 0.08,
'GConversion': 0.20625, 'Retention': 0.53, 'NConversion': 0.109313}
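As a quick consistency check on the baseline numbers (a sketch; the payment count is not given directly, so it is backed out from Retention and from NConversion, and the two routes should agree):

print(baseline['Clicks'] / baseline['Cookies'])        # 0.08, matches CTP
print(baseline['Enrollments'] / baseline['Clicks'])    # 0.20625, matches GConversion
print(baseline['Enrollments'] * baseline['Retention']) # ~349.8 payments implied by Retention
print(baseline['NConversion'] * baseline['Clicks'])    # ~349.8 payments implied by NConversion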
std = sqrt(p_hat*(1-p_hat)/n)
p_hat: baseline probability of the event to occur
n: sample size
In [29]:
sample_baseline = baseline.copy()
sample_baseline['Cookies'] = 5000 # assume sample size is 5000
# scale clicks and enrollments down to the 5000-cookie sample
sample_baseline['Clicks'] = baseline['Clicks'] * 5000/baseline['Cookies']
sample_baseline['Enrollments'] = baseline['Enrollments'] * 5000/baseline['Cookies']
sample_baseline
Out[29]:
In [33]:
def get_binomial_std(p_hat, n):
    """
    p_hat: baseline probability of the event to occur
    n: sample size
    return: the standard deviation
    """
    std = round(math.sqrt(p_hat*(1-p_hat)/n), 4)
    return std
In [34]:
gross_conversion = {}
gross_conversion['d_min'] = 0.01
gross_conversion['p_hat'] = sample_baseline['GConversion']
gross_conversion['n'] = sample_baseline['Clicks']
gross_conversion['std'] = get_binomial_std(gross_conversion['p_hat'],
gross_conversion['n'])
gross_conversion
Out[34]:
In [37]:
retention = {}
retention['d_min'] = 0.01
retention['p_hat'] = sample_baseline['Retention']
retention['n'] = sample_baseline['Enrollments']
retention['std'] = get_binomial_std(retention['p_hat'],
retention['n'])
retention
Out[37]:
In [54]:
net_conversion = {}
net_conversion['d_min'] = 0.0075
net_conversion['p_hat'] = sample_baseline['NConversion']
net_conversion['n'] = sample_baseline['Clicks']
net_conversion['std'] = get_binomial_std(net_conversion['p_hat'],
net_conversion['n'])
net_conversion
Out[54]:
P_control is the conversion rate of the control group, P_experiment is the conversion rate of the experiment group, and d is the detectable effect. The sample size needed per group is:

n = pow(Z_(1-α/2) * std1 + Z_(1-β) * std2, 2)/pow(d, 2)

Z_(1-α/2) is the Z score for 1-α/2, where α is the probability of a Type I error.
Z_(1-β) is the Z score for 1-β (the power), where β is the probability of a Type II error.
std1 = sqrt(2*p*(1-p))
std2 = sqrt(p*(1-p) + (p+d)*(1-(p+d)))
p is the baseline conversion rate (the p_hat from above), and d is the detectable effect (the d_min from above).
This is the online calculator for sample size: https://www.evanmiller.org/ab-testing/sample-size.html
In [42]:
def get_z_score(prob):
    # inverse CDF of the standard normal: z such that P(Z <= z) = prob
    return norm.ppf(prob)

def get_stds(p, d):
    # std1: standard deviation under H0 (both groups at the baseline rate p)
    std1 = math.sqrt(2*p*(1-p))
    # std2: standard deviation under H1 (control at p, experiment at p + d)
    std2 = math.sqrt(p*(1-p) + (p+d)*(1-(p+d)))
    return [std1, std2]

def get_sample_size(std_lst, alpha, beta, d):
    # required sample size per group for the given error rates and effect size
    n = pow(get_z_score(1-alpha/2)*std_lst[0] + get_z_score(1-beta)*std_lst[1], 2)/pow(d, 2)
    return n
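A quick spot-check of these helpers against the online calculator linked above (a sketch using the Gross Conversion baseline p = 0.20625 and d_min = 0.01; the result should land near the ~25,835 clicks per group the calculator reports for these inputs):

print(round(get_sample_size(get_stds(0.20625, 0.01), 0.05, 0.2, 0.01)))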
In [44]:
alpha = 0.05 # probability of Type I error
beta = 0.2 # probability of Type II error; power = 1 - beta = 0.8
In [49]:
gross_conversion['sample_size'] = round(get_sample_size(get_stds(gross_conversion['p_hat'],
gross_conversion['d_min']), alpha, beta,
gross_conversion['d_min']))
# convert required clicks into pageviews via the clicks-per-pageview rate (n/5000), doubled for the 2 groups
gross_conversion['page_views'] = 2*round(gross_conversion['sample_size']/(gross_conversion['n']/5000))
gross_conversion
Out[49]:
In [52]:
retention['sample_size'] = round(get_sample_size(get_stds(retention['p_hat'],
retention['d_min']), alpha, beta,
retention['d_min']))
retention['page_views'] = 2*round(retention['sample_size']/(retention['n']/5000))
retention
Out[52]:
In [55]:
net_conversion['sample_size'] = round(get_sample_size(get_stds(net_conversion['p_hat'],
net_conversion['d_min']), alpha, beta,
net_conversion['d_min']))
net_conversion['page_views'] = 2*round(net_conversion['sample_size']/(net_conversion['n']/5000))
net_conversion
Out[55]:
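Putting the three traffic requirements side by side makes the trade-off explicit (a sketch using the dicts built above):

for name, metric in [('Gross Conversion', gross_conversion),
                     ('Retention', retention),
                     ('Net Conversion', net_conversion)]:
    print(name, 'needs', metric['page_views'], 'pageviews')

Retention's pageview requirement dwarfs the other two, which is why it is dropped as an evaluation metric later on.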
In [56]:
control_df.head()
Out[56]:
In [57]:
experiment_df.head()
Out[57]:
In [60]:
p = 0.5 # each group is expected to get half of the pageviews
alpha = 0.05
In [62]:
def get_std(p, total_size):
    # standard error of a binomial proportion
    std = math.sqrt(p*(1-p)/total_size)
    return std

def get_marginOferror(std, alpha):
    # margin of error at confidence level 1 - alpha
    me = round(get_z_score(1-alpha/2)*std, 4)
    return me
The sanity check verifies that p_hat = control group pageviews/both groups' pageviews is not significantly different from p = 0.5.

ME = Z_(1-α/2) * std
CI = [p_hat - ME, p_hat + ME]
In [61]:
control_pageviews = control_df['Pageviews'].sum()
experiment_pageviews = experiment_df['Pageviews'].sum()
print(control_pageviews, experiment_pageviews)
In [67]:
total_pageviews = control_pageviews + experiment_pageviews
p_hat = control_pageviews/(total_pageviews)
std = get_std(p, total_pageviews)
me = get_marginOferror(std, alpha)
print('If ' + str(p) +' is within [' + str(round(p_hat - me, 4)) + ', ' + str(round(p_hat + me, 4)) + '], then the difference is expected.')
In [90]:
control_clicks = control_df['Clicks'].sum()
experiment_clicks = experiment_df['Clicks'].sum()
print(control_clicks, experiment_clicks)
In [91]:
total_clicks = control_clicks + experiment_clicks
p_hat = control_clicks/(total_clicks)
std = get_std(p, total_clicks)
me = get_marginOferror(std, alpha)
print('If ' + str(p) +' is within [' + str(round(p_hat - me, 4)) + ', ' + str(round(p_hat + me, 4)) + '], then the difference is expected.')
p_pool = (experiment_clicks + control_clicks)/(experiment_pageviews + control_pageviews)
std_pool = sqrt(p_pool*(1-p_pool)*(1/experiment_pageviews + 1/control_pageviews))
In [92]:
control_ctp = control_clicks/control_pageviews
experiment_ctp = experiment_clicks/experiment_pageviews
p_pool = (control_clicks + experiment_clicks)/(control_pageviews + experiment_pageviews)
std_pool = math.sqrt(p_pool*(1-p_pool)*(1/experiment_pageviews + 1/control_pageviews))
me = get_marginOferror(std_pool, alpha)
diff = round(experiment_ctp - control_ctp, 4)
print('If ' + str(diff) +' is within [' + str(round(0 - me, 4)) + ', ' + str(round(0 + me, 4)) + '], then the difference is expected.')
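Equivalently, the same pooled quantities can be read as a two-proportion z-test (a sketch; diff and std_pool come from the cell above):

z = diff/std_pool
p_value = 2*(1 - norm.cdf(abs(z)))
print(round(z, 4), round(p_value, 4)) # a p-value above alpha means the CTP difference looks like chance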
Similar to the sanity check above, this step checks the differences in the evaluation metrics between the 2 groups, to see whether each difference is statistically significant (compared against [0 - ME, 0 + ME]) and practically significant (compared against [D_min - ME, D_min + ME]).
As Step 2 found, Gross Conversion and Net Conversion can serve as the evaluation metrics, while Retention cannot, purely because of real-world limitations in data collection: the number of pageviews Retention requires (computed above) is impractical to collect.
Evaluation Metrics
GConversion = enrolled/clicks
NConversion = paid/clicks
In [81]:
print(control_df.isnull().sum())
print()
print(experiment_df.isnull().sum())
The method here is almost the same as what's used in "Compare CTP" above.
Observation
In [93]:
control_clicks = control_df['Clicks'].loc[control_df['Enrollments'].notnull()].sum()
experiment_clicks = experiment_df['Clicks'].loc[experiment_df['Enrollments'].notnull()].sum()
print('Clicks', control_clicks, experiment_clicks)
control_enrolls = control_df['Enrollments'].sum()
experiment_enrolls = experiment_df['Enrollments'].sum()
print('Enrollments', control_enrolls, experiment_enrolls)
control_GC = control_enrolls/control_clicks
experiment_GC = experiment_enrolls/experiment_clicks
print('Gross Conversion', control_GC, experiment_GC)
In [94]:
p_pool = (control_enrolls + experiment_enrolls)/(control_clicks + experiment_clicks)
std_pool = math.sqrt(p_pool*(1-p_pool)*(1/control_clicks + 1/experiment_clicks))
me = get_marginOferror(std_pool, alpha)
print(p_pool, std_pool, me)
In [97]:
# Statistical significance
GC_diff = round(experiment_GC - control_GC, 4)
print('If ' + str(GC_diff) +' is within [' + str(round(0 - me, 4)) + ', ' + str(round(0 + me, 4)) + '], then the difference is expected, and the change is not significant.')
In [100]:
# Practical significance
d_min = gross_conversion['d_min']
print('If ' + str(GC_diff) +' is within [' + str(round(d_min - me, 4)) + ', ' + str(round(d_min + me, 4)) + '], then the change is not practically significant.')
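An equivalent reading of the two checks above (a sketch; GC_diff, me, and d_min come from the cells above) is to build the confidence interval around the observed difference and see where it sits relative to 0 and to ±d_min:

ci = (round(GC_diff - me, 4), round(GC_diff + me, 4))
print('95% CI for the Gross Conversion difference:', ci)
print('statistically significant:', not (ci[0] <= 0 <= ci[1]))
print('practically significant:', ci[1] < -d_min or ci[0] > d_min)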
In [105]:
control_clicks = control_df['Clicks'].loc[control_df['Payments'].notnull()].sum()
experiment_clicks = experiment_df['Clicks'].loc[experiment_df['Payments'].notnull()].sum()
print('Clicks', control_clicks, experiment_clicks)
control_paid = control_df['Payments'].sum()
experiment_paid = experiment_df['Payments'].sum()
print('Payments', control_paid, experiment_paid)
control_NC = control_paid/control_clicks
experiment_NC = experiment_paid/experiment_clicks
print('Net Conversion', control_NC, experiment_NC)
In [106]:
p_pool = (control_paid + experiment_paid)/(control_clicks + experiment_clicks)
std_pool = math.sqrt(p_pool*(1-p_pool)*(1/control_clicks + 1/experiment_clicks))
me = get_marginOferror(std_pool, alpha)
print(p_pool, std_pool, me)
In [107]:
# Statistical significance
NC_diff = round(experiment_NC - control_NC, 4)
print('If ' + str(NC_diff) +' is within [' + str(round(0 - me, 4)) + ', ' + str(round(0 + me, 4)) + '], then the difference is expected, and the change is not significant.')
In [108]:
# Practical significance
d_min = net_conversion['d_min']
print('If ' + str(NC_diff) +' is within [' + str(round(d_min - me, 4)) + ', ' + str(round(d_min + me, 4)) + '], then the change is not practically significant.')
prob(x successes) = (n!/(x! * (n-x)!)) * pow(p, x) * pow(1-p, n-x), with p = 0.5 under the null hypothesis that a day-by-day increase is as likely as a decrease.
The p-value is the sum of prob over every count from 0 up to the observed number of successes, doubled for a two-sided test. When the p-value is smaller than alpha, the change is significant.
In [110]:
control_experiment_df = control_df.join(experiment_df, lsuffix='_control', rsuffix='_experiment')
print(control_experiment_df.shape)
control_experiment_df.head()
Out[110]:
In [111]:
control_experiment_df.isnull().sum()
Out[111]:
In [113]:
control_experiment_df.dropna(inplace=True)
print(control_experiment_df.shape)
control_experiment_df.isnull().sum()
Out[113]:
In [122]:
# If it's "success", assign 1, otherwise 0
control_experiment_df['GC_increase'] = np.where(
control_experiment_df['Enrollments_experiment']/control_experiment_df['Clicks_experiment'] \
> control_experiment_df['Enrollments_control']/control_experiment_df['Clicks_control'], 1, 0)
control_experiment_df['NC_increase'] = np.where(
control_experiment_df['Payments_experiment']/control_experiment_df['Clicks_experiment'] \
> control_experiment_df['Payments_control']/control_experiment_df['Clicks_control'], 1, 0)
control_experiment_df[['GC_increase', 'NC_increase']].head()
Out[122]:
In [126]:
print(control_experiment_df['GC_increase'].value_counts())
print(control_experiment_df['NC_increase'].value_counts())
In [143]:
GC_success_ct = control_experiment_df['GC_increase'].value_counts()[1]
NC_success_ct = control_experiment_df['NC_increase'].value_counts()[1]
print(GC_success_ct, NC_success_ct)
In [144]:
p = 0.5
alpha = 0.05
n = control_experiment_df.shape[0]
print(n)
In [152]:
def get_probability(x, n):
    # exact binomial probability of x successes in n trials (p = 0.5 from above)
    prob = round(math.factorial(n)/(math.factorial(x)*math.factorial(n-x))*pow(p,x)*pow(1-p, n-x), 4)
    return prob

def get_p_value(x, n):
    # two-sided p-value: twice the probability of x or fewer successes
    p_value = 0
    for i in range(0, x+1):
        p_value += get_probability(i, n)
    return round(p_value*2, 4)
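As a cross-check on the hand-rolled sign test (a sketch assuming SciPy >= 1.7, where binomtest replaced the deprecated binom_test; with p = 0.5 the binomial is symmetric, so doubling the lower tail matches SciPy's two-sided p-value whenever the success count is below n/2, up to the rounding inside get_probability):

from scipy.stats import binomtest
print(binomtest(int(GC_success_ct), n=n, p=0.5).pvalue)
print(binomtest(int(NC_success_ct), n=n, p=0.5).pvalue)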
In [153]:
print ("GC Change is significant if", get_p_value(GC_success_ct,n), "is smaller than", alpha)
print ("NC Change is significant if", get_p_value(NC_success_ct,n), "is smaller than", alpha)