In [1]:
from simlogs import *
from IPython.display import Image

def get_pval(df_a, df_b, metric):
    # Collapse to one value per user before testing
    set_a = df_a.groupby('user_id')[metric].mean().values
    set_b = df_b.groupby('user_id')[metric].mean().values
    _, _, _, pval = wald_test(set_a, set_b)
    return pval

def click_report(df_a, df_b):
    print 'Impression level control average: %0.3f' % df_a.click.mean()
    print 'Impression level treatment average: %0.3f' % df_b.click.mean()
    print 'Lift: %0.2f' % ((df_b.click.mean() - df_a.click.mean()) / df_a.click.mean())
    print ''
    user_a = df_a.groupby('user_id').click.mean().mean()
    user_b = df_b.groupby('user_id').click.mean().mean()
    print 'User level control click average: %0.3f' % user_a
    print 'User level treatment click average: %0.3f' % user_b
    print 'Lift: %0.2f' % ((user_b - user_a) / user_a)
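get_pval leans on wald_test, which is imported from simlogs and not shown here. As a rough stand-in (an assumption about its return signature, not the actual simlogs implementation), a two-sample Wald/z-test on the per-user means might look like:

# Hypothetical stand-in for simlogs.wald_test -- signature assumed from the unpacking above
import numpy as np
from scipy import stats

def wald_test(set_a, set_b):
    """ Two-sample Wald (z) test on per-user means; returns (mean_a, mean_b, z, pval) """
    mean_a, mean_b = np.mean(set_a), np.mean(set_b)
    se = np.sqrt(np.var(set_a, ddof=1) / len(set_a) + np.var(set_b, ddof=1) / len(set_b))
    z = (mean_b - mean_a) / se
    pval = 2 * (1 - stats.norm.cdf(abs(z)))  # two-sided
    return mean_a, mean_b, z, pval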
Almost Equal?
Chris Harland :: Data Scientist :: Context Relevant :: @cdubhland
(work done while at Microsoft)
I claim unit testing is a place we can all "agree"
Well...we do
It is how we (as in humans) establish causality
In [306]:
Image(filename='bloodletting.jpg', width = 300)
Out[306]:
The Burns Archive - Burns Archive via Newsweek, 2.4.2011
In [307]:
Image(filename='stiff_cover.jpg')
Out[307]:
Experimentation helps us find the truth in crazy situations
In [406]:
Image(filename='A-B_testing.png')
Out[406]:
In [407]:
Image(filename='real_testing.jpg')
Out[407]:
Homework Machine - A Light in the Attic - Shel Silverstein
In [315]:
Image(filename='ab_flame_1.png', width = 700)
Out[315]:
In [316]:
Image(filename='ab_flame_2.png', width = 700)
Out[316]:
A/B testing simultaneously:
In [409]:
Image(filename='modified_ab_test.png')
Out[409]:
Common stumbling blocks:
It's possible to avoid some "pitfalls"
It's critical to know your platform works, because users are wacky
In [6]:
# code mock up
def test_bucket_split(self):
    # Ensure that user bucketing created two equivalent groups
    for metric in self.important_metrics:
        assert abs(self.group_a[metric] - self.group_b[metric]) < self.tolerance
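The mock assumes it lives on a test class that already holds the two groups. A hypothetical unittest.TestCase wrapper (important_metrics, group_a, group_b, and tolerance are names taken from the mock, not from any real test framework of mine) might look like:

import unittest

class BucketSplitTest(unittest.TestCase):
    def setUp(self):
        # In practice these would come from your bucketing system / simulated logs
        self.important_metrics = ['click', 'conversion']
        self.group_a = {'click': 0.101, 'conversion': 0.049}
        self.group_b = {'click': 0.099, 'conversion': 0.051}
        self.tolerance = 0.01

    def test_bucket_split(self):
        # Ensure that user bucketing created two equivalent groups
        for metric in self.important_metrics:
            assert abs(self.group_a[metric] - self.group_b[metric]) < self.tolerance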
But where do we get these magical groups?
Skip the users and just get to the log lines
Anatomy of a log line:
1. Human visits
2. Human has choice (often influenced by treatment...you hope)
3. Human makes choice
4. (Optional) Human repeats 2 and 3 additional times
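As a hypothetical sketch of how those four steps land in a log, using the column names defined later in the notebook (the values are made up for illustration):

# Hypothetical rows for one user's session -- illustrative only
# [timestamp, user_id, impression, click, conversion]
example_session = [
    ['2015-07-25 10:02:13AM', 'user_42', 1, 0, 0],  # visited, saw the choice, passed
    ['2015-07-25 10:07:55AM', 'user_42', 1, 1, 0],  # came back, clicked, did not convert
    ['2015-07-25 10:14:02AM', 'user_42', 1, 1, 1],  # repeated, clicked, converted
]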
Logs are generated by a process
1. Present a choice (a probability distribution; computers know what these are)
2. Draw from that distribution (nice...a computer can do this)
3. Given the draw, present a second choice (another probability distribution, possibly different)
4. Draw again (hey, a computer can do this too)
5. Repeat (oh you bet a computer can do this)
It's a simple process, but it captures the essence of log generation
The layering of draws and the choice of distributions inject flexibility and complexity
In [410]:
# if you like python 2.7 you can high five me @cdubhland
# if you are stunned by my lack of commitment to python 3
# you can send complaints to @joelgrus
from __future__ import division
from scipy import stats
import numpy as np

def get_bernoulli_trial(p, n = 1):
    """ return a bernoulli trial of success or failure with probability p """
    return stats.bernoulli.rvs(p = p, size = n)
In [416]:
p = 0.5
n_trials = 10000
print 'Expected p ~ %0.2f and obtained p = %0.2f' % \
    (p, np.mean(get_bernoulli_trial(p, n_trials)))
# We expect result to be near p
But all decisions aren't this simple =/
In [417]:
# We can make the probability of success a random variable
def get_beta_result(a, b, n = 1):
    """ takes a draw from beta(a,b) used to simulate random rates """
    return stats.beta.rvs(a, b, size = n)

# We can model a collection of user behaviors
def get_expon_result(mu, _lambda, n = 1):
    """ takes a draw from an exponential(mu, lambda) """
    return stats.expon.rvs(mu, _lambda, size = n)

# We can model the collective results of many choices
def get_exp_result(n, p, size = 1):
    """ return the outcome of n bernoulli trials with probability p """
    return stats.binom.rvs(n = n, p = p, size = size)

# Maybe the users visit at different frequencies
def gen_user_visit_freq(n_users = 100, _lambda = 2):
    """ return the total number of visits in a set time delta for the number of given users """
    return stats.poisson.rvs(mu = _lambda, size = n_users)
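As a quick sanity check, each helper can be exercised on its own (the parameter values below are arbitrary, picked only for illustration):

# Illustrative draws from each helper -- parameter values are arbitrary
print 'Beta(2, 5) rates:          ', get_beta_result(2, 5, n = 3)
print 'Exponential gap times:     ', get_expon_result(0, 300, n = 3)
print 'Binomial(20, 0.1) clicks:  ', get_exp_result(20, 0.1, size = 3)
print 'Poisson(2) visits per user:', gen_user_visit_freq(n_users = 5, _lambda = 2)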
Timestamp, user_id, impression, click, conversion
Timestamps can be drawn from a distribution of average gap times or assigned sequentially
In [357]:
def gen_impression(p_imp = 1.0, p_click = 0.5, p_convert = 0.5):
    """ This function generates an impression, click, conversion based
    on probabilities defined by the input parameters """
    impression = get_bernoulli_trial(p_imp)[0]
    # Note: to speed this up we could draw all trials at once
    # and post process the results to make the outcomes conditional
    if impression == 1:
        did_click = get_bernoulli_trial(p_click)[0]
        # For now we assume only those that click can convert
        if did_click == 1:
            did_convert = get_bernoulli_trial(p_convert)[0]
        else:
            # Optionally this could be a bernoulli with a different p
            # (i.e. the base rate)
            did_convert = 0
        imp_arr = [impression, did_click, did_convert]
        return imp_arr
    else:
        return None
In [358]:
[gen_impression() for _ in range(10)]
Out[358]:
In [359]:
import datetime

def gen_log_line(uid, t_current = datetime.datetime.now(), p_imp = 1.0, p_click = 0.5, p_convert = 0.5):
    """ Get a log line for the given user and return with timestamp
    and impression info """
    imp = gen_impression(p_imp, p_click, p_convert)
    if imp is None:
        return None
    else:
        # add a random t_delta
        delta_sec = stats.norm.rvs(loc = 300, scale = 100)
        t_ = t_current + datetime.timedelta(0, delta_sec)
        timestamp = t_.strftime('%Y-%m-%d %I:%M:%S%p')
        log_line = [timestamp, uid] + imp
        return log_line, t_
In [360]:
gen_log_line('Trey Causey')[0]
Out[360]:
In [422]:
import hashlib
import pandas as pd

def create_hash_id(user, salt):
    """ returns a sha1 hash of user string combined with salt string """
    return hashlib.sha1(salt + '_' + repr(user)).hexdigest()

col_names = ['timestamp', 'user_id', 'impression', 'click', 'conversion']
user_hash = create_hash_id('Trey Causey', 'Spurs always let you down')
single_log = [gen_log_line(user_hash)[0] for x in range(10)]
pd.DataFrame(single_log, columns = col_names).sort('timestamp') \
    .reset_index(drop = True).head()
Out[422]:
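The salted hash is also the natural thing to bucket on. A hypothetical deterministic A/B assignment built on the same hash (an illustration, not the bucketing system used in the talk) might look like:

# Hypothetical bucketing on the salted hash -- deterministic per (user, salt)
def assign_bucket(user, salt, n_buckets = 2):
    """ map a user to one of n_buckets using the sha1 hash above """
    return int(create_hash_id(user, salt), 16) % n_buckets

print assign_bucket('Trey Causey', 'experiment_42')  # same user + salt -> same bucket every time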
In general the formula is: P(impression, click, conversion) = P(impression) x P(click | impression) x P(conversion | click)
The simplicity is deceptive
(this is Bayesian stat testing backwards)
Find the "unknown" parameters
Simulated logs are noisy instantiations of your supplied parameters
In other words, you put a number into the function and it spits out a ton of hand-wavy examples
Your task (well, the dev's task) is to recover that parameter (within reason)
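A minimal sketch of that recovery idea, using gen_impression from above (the tolerance of 0.02 is an arbitrary choice, "within reason" is up to you): feed in a known p_click and check that the estimate lands close to it.

# Sketch of parameter recovery: put a known rate in, expect to get it back (roughly)
known_p_click = 0.3
rows = [gen_impression(p_click = known_p_click, p_convert = 0.1) for _ in range(20000)]
recovered = np.mean([r[1] for r in rows if r is not None])
assert abs(recovered - known_p_click) < 0.02  # tolerance is a judgment call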
In [3]:
Image(filename='AA.png')
Out[3]:
In [10]:
# code mock up
def test_bucket_split(df_a, df_b, metric):
    # Ensure that user bucketing created two equivalent groups
    assert get_pval(df_a, df_b, metric) > 0.05

## Want to "recover" p > 0.05
In [6]:
p_click_control = 0.1
p_convert_control = 0.1
n_users = 1000
n_rows = 10000

# Going to hand wave this function
df_a = simulate_log_vectorized(n_users = n_users,
                               n_rows = n_rows,
                               p_click = p_click_control,
                               p_convert = p_convert_control,
                               strict = False)
df_b = simulate_log_vectorized(n_users = n_users,
                               n_rows = n_rows,
                               p_click = p_click_control,
                               p_convert = p_convert_control,
                               strict = False)
In [7]:
df_a.head(3)
Out[7]:
In [8]:
df_b.head(3)
Out[8]:
In [11]:
test_bucket_split(df_a, df_b, 'click')
Why am I so wary of random number generators?
In [488]:
Image(filename = 'reagan.jpg', width = 500)
Out[488]:
In [12]:
# We often test many metrics
def add_metrics(n_metrics, df_a, df_b):
    for i in range(n_metrics):
        p = np.random.rand()
        # same p for both groups...should be equal
        df_a.loc[:, 'metric_%d' % i] = get_bernoulli_trial(p = p, n = len(df_a))
        df_b.loc[:, 'metric_%d' % i] = get_bernoulli_trial(p = p, n = len(df_b))
    return df_a, df_b

# We can make a factory of fails
def aa_fail_o_tron(df_a, df_b, n_metrics):
    # add some metrics to the pile
    df_a_mod, df_b_mod = add_metrics(n_metrics, df_a, df_b)
    # Check that all metrics come back not significant
    for i in range(n_metrics):
        test_bucket_split(df_a_mod, df_b_mod, 'metric_%d' % i)
In [14]:
aa_fail_o_tron(df_a, df_b, 10) # won't always fail
In [15]:
# But how often does it fail?
def count_dem_fails(df_a, df_b, n_metrics):
    pvals = []
    # add some metrics to the pile
    df_a_mod, df_b_mod = add_metrics(n_metrics, df_a, df_b)
    # Check that all metrics come back not significant
    for i in range(n_metrics):
        pvals.append(get_pval(df_a_mod, df_b_mod, 'metric_%d' % i))
    return sum([pval < 0.05 for pval in pvals])
In [19]:
print [count_dem_fails(df_a, df_b, 2) for _ in range(10)]
print [count_dem_fails(df_a, df_b, 5) for _ in range(10)]
print [count_dem_fails(df_a, df_b, 20) for _ in range(10)]
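With alpha = 0.05 and (roughly) independent metrics, false positives are expected: about 0.05 * n_metrics of them on average, and the chance of at least one grows quickly. A quick back-of-the-envelope check:

# Expected false positives and family-wise error rate at alpha = 0.05
alpha = 0.05
for n_metrics in [2, 5, 20]:
    print 'n = %2d: expect ~%0.2f failures, P(at least one) = %0.2f' % \
        (n_metrics, alpha * n_metrics, 1 - (1 - alpha) ** n_metrics)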
In [20]:
def bonferroni_correction(pval, n_metrics):
    # Simply make it harder to fail by lowering the significance threshold
    return pval / n_metrics
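For example, with 20 metrics the per-metric threshold drops from 0.05 to 0.0025:

print bonferroni_correction(0.05, 20)  # 0.0025 -- each metric must clear a stricter bar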
In [23]:
def test_bucket_split(df_a, df_b, metric, n_metrics = 1):
    # Ensure that user bucketing created two equivalent groups
    assert get_pval(df_a, df_b, metric) > bonferroni_correction(0.05, n_metrics)

def aa_fail_o_tron(df_a, df_b, n_metrics):
    # add some metrics to the pile
    df_a_mod, df_b_mod = add_metrics(n_metrics, df_a, df_b)
    # Check that all metrics come back not significant
    for i in range(n_metrics):
        test_bucket_split(df_a_mod, df_b_mod, 'metric_%d' % i, n_metrics)
In [24]:
aa_fail_o_tron(df_a, df_b, 10) # this passes like a lot
In [25]:
[aa_fail_o_tron(df_a, df_b, 10) for _ in range(100)]
It's okay though...this is expected
I see a lot of this:
df.click.mean()
Don't do that
In [474]:
df_heavy_users = simulate_log_vectorized(n_users = 10,
                                         n_rows = 50000,
                                         p_click = 0.8,
                                         p_convert = 0.1,
                                         strict = False)
df_light_users = simulate_log_vectorized(n_users = 1000,
                                         n_rows = 10000,
                                         p_click = 0.1,
                                         p_convert = 0.1,
                                         strict = False)
df_users = pd.concat([df_heavy_users, df_light_users])
In [475]:
print 'Impression level click average: %0.3f' % df_users.click.mean()
print 'User level click average: %0.3f' % df_users.groupby('user_id').click.mean().mean()
In [482]:
df_heavy_users_moved = simulate_log_vectorized(n_users = 10,
                                               n_rows = 50000,
                                               p_click = 0.88,
                                               p_convert = 0.1,
                                               strict = False)
df_users_moved = pd.concat([df_heavy_users_moved, df_light_users])
In [483]:
print 'Impression level click average: %0.3f' % df_users_moved.click.mean()
print 'User level click average: %0.3f' % df_users_moved.groupby('user_id').click.mean().mean()
In [484]:
click_report(df_users, df_users_moved)
In [485]:
df_heavy_users = simulate_log_vectorized(n_users = 10,
                                         n_rows = 50000,
                                         p_click = 0.8,
                                         p_convert = 0.1,
                                         strict = False)
df_stubborn_light_users = simulate_log_vectorized(n_users = 900,
                                                  n_rows = 9000,
                                                  p_click = 0.1,
                                                  p_convert = 0.1,
                                                  strict = False)
df_cooperative_light_users = simulate_log_vectorized(n_users = 100,
                                                     n_rows = 1000,
                                                     p_click = 0.11,
                                                     p_convert = 0.1,
                                                     strict = False)
df_users_le_sigh = pd.concat([df_heavy_users,
                              df_stubborn_light_users,
                              df_cooperative_light_users])
In [486]:
click_report(df_users, df_users_le_sigh)
Impression level rollups aren't sensitive enough =/
Unit test for sensitivity (too much or too little)
Avoid making ship mistakes
In [1]:
# code mock up
def correct_rollup(df, injected_lift, ratio):
    # ratio - fraction of users affected by injected_lift
    # calculate the user level and impression level lifts
    user_lift = user_level_lift(df)
    imp_lift = impression_level_lift(df)
    # Example case: ratio < 1.0 and injected_lift >= 0.10
    assert user_lift < imp_lift
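correct_rollup hand-waves its two helpers. Minimal sketches, assuming the log frame carries a 'group' column marking control vs. treatment alongside the same click column used above (both assumptions for illustration, not the actual helpers):

# Hypothetical helpers for the mock above -- assume df.group is in {'control', 'treatment'}
def impression_level_lift(df):
    """ lift computed over raw impressions (heavy users dominate) """
    ctrl = df[df.group == 'control'].click.mean()
    treat = df[df.group == 'treatment'].click.mean()
    return (treat - ctrl) / ctrl

def user_level_lift(df):
    """ lift computed over per-user click rates (each user counts once) """
    ctrl = df[df.group == 'control'].groupby('user_id').click.mean().mean()
    treat = df[df.group == 'treatment'].groupby('user_id').click.mean().mean()
    return (treat - ctrl) / ctrl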