Setup + Utils


In [1]:
import pyspark.sql.functions as F
from pyspark.sql import Row
import pyspark.sql.types as T
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from collections import Counter
from IPython.display import Markdown, display
from moztelemetry import Dataset
from pyspark.sql.window import Window

sc.setLogLevel("INFO")
sns.set(style='whitegrid')
% matplotlib inline


/mnt/anaconda2/lib/python2.7/site-packages/matplotlib/__init__.py:878: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [2]:
def md(md_text):
    """ Print Markdown text so that it renders correctly in the cell output. """
    display(Markdown(md_text))
    
    
def date_plus_x_days(date, x):
    new_date = dt.datetime.strptime(date, '%Y%m%d') + dt.timedelta(days=x)
    return new_date.strftime('%Y%m%d')

def _date_diff(d1, d2):
    if d1 and d2:
        try:
            d1, d2 = [pd.to_datetime(i, format="%Y%m%d") for i in (d1, d2)]
            return (d1 - d2).days
        except ValueError:
            pass
    return None
    
def get_install_rates_disco(d, grouper=['branch']):
    # how many clients had 1+ install per branch
    installs = (
        d.filter("ping_type = 'install'")
         .filter("addon_id not like '%mozilla%'")
         .filter("src = 'about:addons'")
    )
    
    distinct_clients_that_installed = (
         installs
         .groupby(grouper)
         .agg(F.countDistinct('client_id')
               .alias("distinct_clients_that_installed"))
         .toPandas()
    )

    # installs per client
    installs_per_client = (
        installs
         .groupby(grouper + ['client_id'])
         .agg(F.countDistinct('addon_id')
               .alias("n_installs_per_client"))
         .groupby(grouper)
         .agg(F.avg("n_installs_per_client").alias("avg_num_installs_per_client"))
         .toPandas()
    )
    
    # total distinct clients
    total_counts = (
        d.select(["client_id"] + grouper)
         .filter("discopane_loaded = true")
         .distinct()
         .groupby(grouper).count().toPandas()
    )
    
    
    total_counts.columns = grouper +  ['total_distinct_clients']

    # join for getting proportions
    m = (
        pd.merge(
            pd.merge(distinct_clients_that_installed, 
                     installs_per_client, on=grouper), 
            total_counts, on=grouper)
    )
    
    m['prob_at_least_one_install'] = m.distinct_clients_that_installed / m.total_distinct_clients
    m['src'] = 'about:addons'
    return m

def get_install_rates_AMO_total(d, grouper=['branch'], saw_treatment=True):
     # how many clients had 1+ install per branch
        
    
    treatment_indicator = (
         d.groupby("client_id")
          .agg(F.max(bool2int('discopane_loaded')).alias('saw_treatment'))
    )
    
    d_group = d.join(treatment_indicator
                     .filter(treatment_indicator.saw_treatment == int(saw_treatment)),
                     on='client_id')
    
    if saw_treatment == False:
        # comparison by branch irrelevant 
        # if the sample never navigated to discopane
        d_group = d_group.withColumn("branch", F.lit("All"))
    
    installs = (
        d_group.filter("ping_type = 'install'")
         .filter("addon_id not like '%mozilla%'")
         .filter("src = 'AMO'")
    )
    
    
    distinct_clients_that_installed = (
         installs
         .groupby(grouper)
         .agg(F.countDistinct('client_id')
               .alias("distinct_clients_that_installed"))
         .toPandas()
    )

    installs_per_client = (
        installs
         .groupby(grouper + ['client_id'])
         .agg(F.countDistinct('addon_id')
               .alias("n_installs_per_client"))
         .groupby(grouper)
         .agg(F.avg("n_installs_per_client").alias("avg_num_installs_per_client"))
         .toPandas()
    )
    
    # total distinct clients
    total_counts = (
        d_group.select(["client_id"] + grouper)
         .distinct()
         .groupby(grouper).count().toPandas()
    )
    
    
    total_counts.columns = grouper +  ['total_distinct_clients']

    # join for getting proportions
    m = (
        pd.merge(
            pd.merge(distinct_clients_that_installed, 
                     installs_per_client, on=grouper), 
            total_counts, on=grouper)
    )
    
    m['prob_at_least_one_install'] = m.distinct_clients_that_installed / m.total_distinct_clients
    m['src'] = 'AMO'
    return m

def get_visitation_rates(d, grouper=['branch']):
    visits = (
        d.filter("ping_type = 'disco-pane-loaded'")
         .filter("dwell_time > 0")
         .groupby(grouper).count().toPandas()
    )
    visits.columns = grouper +  ["total_visits"]

    total_counts = (
        d.select(["client_id"] + grouper)
         .filter("discopane_loaded = true")
          .distinct()
         .groupby(grouper).count().toPandas()
    )
    total_counts.columns = grouper +  ['total_distinct_clients']


    mv = pd.merge(visits, total_counts, on=grouper)
    mv['visits_per_client'] = mv.total_visits / mv.total_distinct_clients
    return mv

def get_uninstall_rates(d, grouper=['branch']):
    first_install = (
    d.filter("ping_type = 'install'")
     .filter("src = 'about:addons'")
         .filter("addon_id not like '%mozilla%'")
         .groupby("client_id").agg(F.min("submission_date_s3").alias("first_install_date"))
    ).cache()

    fields = ['client_id', 'addon_id'] + grouper

    installs = (
        d.filter("ping_type = 'install'")
         .filter("src = 'about:addons'")
         .filter("addon_id not like '%mozilla%'")
         .select(fields)
         .distinct()
    ).cache()


    installs_and_final_addons = (
        installs.groupby(["client_id"] + grouper)
                .agg(F.collect_set("addon_id").alias("installs"))
        .join(first_install, on='client_id')
        .join(d.groupby("client_id")
               .agg(F.max("submission_date_s3").alias("submission_date_s3"))
               .join(d.select("client_id", "submission_date_s3", F.explode("current_addons").alias("addon_id"))
                       .filter("addon_id not like '%mozilla%'"),
                    on=['client_id', 'submission_date_s3'])
               .withColumnRenamed("submission_date_s3", "last_ping")
               .groupby("client_id", "last_ping")
               .agg(F.collect_set("addon_id").alias("retained")),
              on='client_id')      
    )

    uninstalls = (
     installs_and_final_addons
        .filter("last_ping > first_install_date")
        .withColumn('not_retained', set_diff("installs", "retained")) 
        .select(grouper + [F.explode("not_retained")]) 
        .groupby(grouper)
        .agg(F.count(F.lit(1)).alias("n_installed"))
        .join(
          installs_and_final_addons
            .select(grouper + [F.explode("installs")]) 
            .groupby(grouper)
            .agg(F.count(F.lit(1)).alias("n_total")),
         on=grouper)
    ).toPandas()


    uninstalls['n_retained'] = uninstalls.n_total - uninstalls.n_installed
    uninstalls['addon_retention_rate'] = uninstalls.n_retained / uninstalls.n_total
    return uninstalls

def _format_creation_date(d):
    d = d.split('.')[0]
    return long(re.sub("[-T:]", '', d))

format_creation_date = F.udf(_format_creation_date, T.LongType())
set_diff = F.udf(lambda x, y: list(set(x) - set(y)), T.ArrayType(T.StringType()))
date_diff = F.udf(_date_diff, T.LongType())
branch_counts = lambda x: x.groupby('branch').agg(F.countDistinct("client_id")).toPandas()
bool2int = F.udf(lambda x: 1 if x else 0, T.IntegerType())

Data Loading


In [3]:
S3_PATH = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/"
START_DATE = '20180312'
RELOAD_MS = False

data = sqlContext.read.parquet(S3_PATH + 'data/')
exp_data = data.filter("submission_date_s3 >= '{}'".format(START_DATE))
N = exp_data.count()
print "n pings:", N
print "n clients:", exp_data.select("client_id").distinct().count()

branch_counts(exp_data)


n pings: 8961698
n clients: 3493614
Out[3]:
branch count(DISTINCT client_id)
0 ensemble-taar 1164434
1 control 1165835
2 linear-taar 1163444

In [4]:
exp_data.printSchema()


root
 |-- addon_id: string (nullable = true)
 |-- clicked_button: boolean (nullable = true)
 |-- client_id: string (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- current_addons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- discopane_loaded: boolean (nullable = true)
 |-- dwell_time: long (nullable = true)
 |-- locale: string (nullable = true)
 |-- ping_type: string (nullable = true)
 |-- saw_popup: boolean (nullable = true)
 |-- src: string (nullable = true)
 |-- start_time_utc: long (nullable = true)
 |-- submission_date_s3: string (nullable = true)
 |-- branch: string (nullable = true)

Cleaning

We must verify our assumptions and requirements about the experiment data before performing any analysis. Any clients that do not conform to these assumptions will be excluded. The code below continuously adds non-conforming client_ids to a blacklist.

Assumption: Each client has a profile_creation_date within 21 days of their first experiment ping, and each client has not submitted a main ping > 21 days before their first experimental ping


In [5]:
if not RELOAD_MS:
    # load pre aggregated data, generated from 'else' statement
    # as it is quite expensive
    ms_min_dates = (sqlContext.read
                    .parquet("s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/ms-min-dates/"))
    blacklist_clients = (sqlContext.read
                         .parquet("s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/blacklist-clients/")
                         .select('client_id').distinct())
    print "loading pre-aggregated datasets"
else:
    # load main_summary a month before study start date
    ms = (
        sqlContext.read.option("mergeSchema", True)
                  .parquet("s3://telemetry-parquet/main_summary/v4")
                  .filter("submission_Date_s3 > '{}'".format(date_plus_x_days(START_DATE, -30)))
                  .filter("submission_date_s3 <= '{}'".format('20180423'))
                  .filter("app_name = 'Firefox'")
                  .filter("normalized_channel = 'release'")
                  .select("client_id", 
                          "profile_creation_date", 
                          'locale', 'profile_creation_date', "submission_date_s3", 
                          "sample_id", "active_addons", "apple_model_id", "memory_mb")
    )

    # clients with different locales, apple_model_id (null for PC)
    # or memory_mb are excluded. Potentially ignores clients
    # with new hardware.
    blacklist_clients = (
        ms.select("client_id", 'locale', 'apple_model_id', 'memory_mb').distinct()
          .groupby('client_id').agg(F.count(F.lit(1)).alias('count'))
    ).filter("count > 1").select("client_id", F.lit(1).alias("blacklist"))

    # get the min submission_date_s3 and min PCD (should be constant)
    # per client
    ms_min_dates = (
        ms.groupBy("client_id")
          .agg(F.min('profile_creation_date').alias('min_pcd'),
               F.min('submission_date_s3').alias("min_submission_date"))
    )
    
    # write to s3 for faster iterations moving forward
    (ms_min_dates
     .repartition(50)
     .write
     .mode("overwrite")
     .parquet("s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/ms-min-dates/"))

    # write to s3 for faster iterations moving forward
    (blacklist_clients
     .repartition(10)
     .write
     .mode("overwrite")
     .parquet("s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/blacklist-clients/"))

    
# join with blacklist clients, and filter clients
# with multiple locales, hardware ids and/or memory
joined = (
    exp_data.join(ms_min_dates, on='client_id', how='left')
        .withColumn("min_pcd", 
                    F.from_unixtime(F.col("min_pcd") * 60 * 60 * 24, 'yyyyMMdd'))
).cache()


loading pre-aggregated datasets

In [6]:
min_start_dates = joined.groupBy("client_id").agg(F.min("start_time_utc").alias("min_start_date"))

agg = (
    joined.join(min_start_dates, on="client_id")
            .withColumn("days_before_first_ping", date_diff("min_submission_date", "min_pcd"))
            .withColumn("days_before_exp_start", date_diff(F.lit(START_DATE), "min_submission_date"))
)

days_before = agg.groupby("days_before_exp_start").count().toPandas()
days_before['days_before_exp_start'] *= -1

plt.rcParams['figure.figsize'] = (10, 5)
days_before.sort_values("days_before_exp_start").plot.bar(x="days_before_exp_start")
plt.title("Distribution of min(submission_date_s3) - Experiment Start Date in Days")
plt.xlabel("Days from Start Date (0 = Start Date)")


Out[6]:
<matplotlib.text.Text at 0x7faa33cfb9d0>

In [7]:
# no client should have a ping or pcd > 21 days from the experiment start date
# validating normandy targetting critera
# join back with exp_data and blacklist since this 
# the current blacklist considers all of main_summary, not just the exp data
blacklist_clients = (
    agg.filter("min_submission_date < '{}'".format(date_plus_x_days(START_DATE, -21)))
       .filter("min_pcd < '{}'".format(date_plus_x_days(START_DATE, -21)))
       .select("client_id")
       .join(exp_data.select("client_id"), on='client_id', how='inner')
       .join(blacklist_clients, on='client_id', how='inner')
       .select("client_id").distinct()
)


bl_n = blacklist_clients.count()
print "{} clients were added to the blacklist".format(bl_n)


13181 clients were added to the blacklist

Assumption: clients that installed from disco-pane have a true value for discopane_loaded


In [8]:
w = Window.partitionBy("client_id").orderBy("creation_date_formatted")
cum = (F.sum('dwell_time').over(w))

cum_dwell = (exp_data.withColumn("creation_date_formatted", format_creation_date("creation_date"))
         .withColumn("dwell_time_cum", cum)
         .sort('client_id', 'creation_date_formatted')
         .select("client_id", 'creation_date_formatted', 
                 'creation_date', 'discopane_loaded', 'dwell_time_cum'))


# override discopane_loaded if dwell_time_cum is non-zero
# since a client should by definition have loaded the page
# if they dwelled for more than 0 seconds. Leave discopane_loaded
# untouched if it is true.
exp_data = (
    exp_data.join(cum_dwell.select("client_id", 
                               'creation_date', 
                               'dwell_time_cum'),
                  on=['client_id', 'creation_date'])
            .withColumn("discopane_loaded", F.col('discopane_loaded') | (F.col('dwell_time_cum') > 0))
)

# initialize blacklist
# as clients shown to have installed from
# discopane without ever loading it (~400 pings, so small)
blacklist_clients = blacklist_clients.unionAll(
    exp_data.filter("ping_type='install' and src='about:addons' and discopane_loaded = false")
    .select("client_id").distinct()
)


# print blacklist additions and update
new_bl_n = blacklist_clients.count()
print "{} clients were added to the blacklist".format(new_bl_n - bl_n)
new_bl = new_bl_n


309 clients were added to the blacklist

Assumption: Each client belongs to only one branch


In [9]:
# count the number of distinct branches per client
multiple_cohort_clients = (
    exp_data.groupBy("client_id")
        .agg(F.collect_set("branch").alias("branches"))
        .withColumn("n_branches", F.size("branches"))
        .filter("n_branches > 1")
)


blacklist_clients = (
    blacklist_clients.unionAll(multiple_cohort_clients.select("client_id").distinct())
)

# print blacklist additions and update
new_bl_n = blacklist_clients.count()
print "{} clients were added to the blacklist".format(new_bl_n - bl_n)
new_bl = new_bl_n


385 clients were added to the blacklist

Requirement: No Firefox Pioneers


In [10]:
pioneers = (
    agg.select("client_id", F.explode("current_addons").alias("addon_id)"))
     .filter("addon_id like 'pioneer-enrollment%'")
     .select("client_id")
     .distinct()
)

blacklist_clients = blacklist_clients.unionAll(pioneers).distinct()

# print blacklist additions and update
new_bl_n = blacklist_clients.count()
print "{} clients were added to the blacklist".format(new_bl_n - bl_n)
new_bl = new_bl_n


66753 clients were added to the blacklist

Black List and Whitelist Filtering


In [11]:
%%bash
# loading whitelists/blacklists generated here: 
# https://github.com/mozilla/taar/blob/master/analysis/TAARLogMunge.ipynb
aws s3 cp s3://net-mozaws-prod-us-west-2-pipeline-analysis/failed_dynamo_clients.csv .
aws s3 cp s3://net-mozaws-prod-us-west-2-pipeline-analysis/clients_served_linear.csv .
aws s3 cp s3://net-mozaws-prod-us-west-2-pipeline-analysis/clients_served_ensemble.csv .


download: s3://net-mozaws-prod-us-west-2-pipeline-analysis/failed_dynamo_clients.csv to ./failed_dynamo_clients.csv
download: s3://net-mozaws-prod-us-west-2-pipeline-analysis/clients_served_linear.csv to ./clients_served_linear.csv
download: s3://net-mozaws-prod-us-west-2-pipeline-analysis/clients_served_ensemble.csv to ./clients_served_ensemble.csv

In [12]:
load_list = (lambda x: sc.parallelize(list(pd.read_json(x)[0]))
                          .map(lambda x: Row(client_id=x))
                          .toDF())
def filter_bl(d, bl):
    return (
        d.join(bl.select("client_id", F.lit(1).alias("in_bl")),
               on="client_id", how='left')
         .filter("in_bl is null")
    )

In [13]:
# load whitelist and convert to SparkDF
linear_whitelist = load_list('clients_served_linear.csv').distinct()
ensemble_whitelist =  load_list('clients_served_ensemble.csv').distinct()
blacklist_clients = blacklist_clients.unionAll(load_list('failed_dynamo_clients.csv')).distinct()

print "Clients in the linear whitelist:", linear_whitelist.count()
print "Clients in the ensemble whitelist:", ensemble_whitelist.count()
print "Clients in the blacklist:", blacklist_clients.count()

# filter out blacklist from master data
exp_data_bl = filter_bl(exp_data, blacklist_clients)

linear_filtered  = exp_data_bl.filter("branch = 'linear-taar'").join(linear_whitelist, on='client_id')
ensemble_filtered = exp_data_bl.filter("branch = 'ensemble-taar'").join(ensemble_whitelist, on='client_id')
control = exp_data_bl.filter("branch = 'control'")

cols=linear_filtered.columns

d_bl_wl = (
    linear_filtered
    .unionAll(ensemble_filtered.select(cols))
    .unionAll(control.select(cols))
)

d_bl_wl = d_bl_wl.withColumn('is_en_US', d_bl_wl.locale == 'en-US').cache()


Clients in the linear whitelist: 175911
Clients in the ensemble whitelist: 175321
Clients in the blacklist: 101491

In [14]:
# save cleaned dataset to s3 for analysis outside of this notebook
(d_bl_wl.repartition(10).write.parquet(S3_PATH + 'cleaned_data/', 
                       mode="overwrite"))

Data Aggregation for Analysis

Installation rates by branch / source


In [14]:
m = pd.concat([get_install_rates_disco(d_bl_wl),
               get_install_rates_AMO_total(d_bl_wl)])
m.to_csv("installs.csv", index=False)
m.sort_values(['src', 'branch'])
m


Out[14]:
branch distinct_clients_that_installed avg_num_installs_per_client total_distinct_clients prob_at_least_one_install src
0 ensemble-taar 18066 1.342909 108381 0.166690 about:addons
1 control 22191 1.410617 139913 0.158606 about:addons
2 linear-taar 15403 1.423749 108071 0.142527 about:addons
0 ensemble-taar 34194 1.696292 108381 0.315498 AMO
1 control 44164 1.703446 139913 0.315653 AMO
2 linear-taar 34020 1.704997 108071 0.314793 AMO

In [15]:
%%bash -s "$S3_PATH"
aws s3 cp installs.csv $1taar-installs.csv


upload: ./installs.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar-installs.csv

Installation rates by branch / source / en-US vs. Non en-US


In [16]:
men = pd.concat([get_install_rates_disco(d_bl_wl, grouper=['branch', 'is_en_US']),
               get_install_rates_AMO_total(d_bl_wl, grouper=['branch', 'is_en_US'])])
men.to_csv("en-us-installs.csv", index=False)
men.sort_values(['src', 'branch'])
men


Out[16]:
branch is_en_US distinct_clients_that_installed avg_num_installs_per_client total_distinct_clients prob_at_least_one_install src
0 ensemble-taar True 8099 1.358563 52522 0.154202 about:addons
1 linear-taar True 7270 1.445117 52196 0.139283 about:addons
2 control False 11945 1.392884 72687 0.164335 about:addons
3 control True 10280 1.428502 68058 0.151048 about:addons
4 ensemble-taar False 10001 1.326967 56583 0.176749 about:addons
5 linear-taar False 8167 1.401004 56587 0.144326 about:addons
0 ensemble-taar True 17330 1.736699 52855 0.327878 AMO
1 linear-taar True 16775 1.743666 52534 0.319317 AMO
2 control False 22286 1.650049 74197 0.300363 AMO
3 control True 21981 1.749920 68486 0.320956 AMO
4 ensemble-taar False 16944 1.647368 57764 0.293331 AMO
5 linear-taar False 17343 1.658537 57831 0.299891 AMO

In [17]:
%%bash -s "$S3_PATH"
aws s3 cp en-us-installs.csv $1taar-en-us-installs.csv


upload: ./en-us-installs.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar-en-us-installs.csv

Installation rates by branch / source / all locales


In [18]:
ml = pd.concat([get_install_rates_disco(d_bl_wl, grouper=['branch', 'locale']),
               get_install_rates_AMO_total(d_bl_wl, grouper=['branch', 'locale'])])
ml.to_csv("locale-installs.csv", index=False)
ml.sort_values(['src', 'branch'])
ml


Out[18]:
branch locale distinct_clients_that_installed avg_num_installs_per_client total_distinct_clients prob_at_least_one_install src
0 linear-taar it 293 1.494881 2353 0.124522 about:addons
1 control ru 1989 1.365510 11078 0.179545 about:addons
2 linear-taar de 1348 1.324184 10012 0.134638 about:addons
3 ensemble-taar fr 1422 1.310127 7890 0.180228 about:addons
4 control sr 25 1.920000 146 0.171233 about:addons
5 linear-taar el 42 1.333333 247 0.170040 about:addons
6 ensemble-taar tr 266 1.439850 1199 0.221852 about:addons
7 ensemble-taar id 306 1.366013 1420 0.215493 about:addons
8 linear-taar es-ES 943 1.458112 6247 0.150952 about:addons
9 control fa 1 1.000000 3 0.333333 about:addons
10 ensemble-taar es-MX 2 1.500000 17 0.117647 about:addons
11 control da 32 1.312500 241 0.132780 about:addons
12 control es-CL 1 1.000000 4 0.250000 about:addons
13 linear-taar he 1 2.000000 1 1.000000 about:addons
14 ensemble-taar zh-CN 3 1.000000 30 0.100000 about:addons
15 control zh-CN 13 1.538462 52 0.250000 about:addons
16 ensemble-taar cs 154 1.285714 675 0.228148 about:addons
17 control ca 1 1.000000 4 0.250000 about:addons
18 control uk 74 1.405405 335 0.220896 about:addons
19 linear-taar fr 1125 1.433778 7652 0.147020 about:addons
20 ensemble-taar da 71 1.281690 246 0.288618 about:addons
21 linear-taar hu 115 1.513043 842 0.136580 about:addons
22 control en-US 10280 1.428502 68058 0.151048 about:addons
23 linear-taar ro 55 1.800000 339 0.162242 about:addons
24 control de 1845 1.294851 12424 0.148503 about:addons
25 linear-taar ja 172 1.482558 1570 0.109554 about:addons
26 control ja 305 1.301639 2113 0.144345 about:addons
27 control lt 1 3.000000 4 0.250000 about:addons
28 ensemble-taar zh-TW 137 1.197080 858 0.159674 about:addons
29 control fi 30 1.166667 322 0.093168 about:addons
... ... ... ... ... ... ... ...
82 control it 889 1.674916 2986 0.297723 AMO
83 ensemble-taar de 3142 1.550605 10188 0.308402 AMO
84 ensemble-taar ro 103 1.864078 348 0.295977 AMO
85 ensemble-taar en-US 17330 1.736699 52855 0.327878 AMO
86 control hu 341 1.668622 1023 0.333333 AMO
87 control tr 509 1.864440 1608 0.316542 AMO
88 ensemble-taar hu 246 1.873984 862 0.285383 AMO
89 control bg 77 1.805195 288 0.267361 AMO
90 linear-taar id 401 1.615960 1374 0.291849 AMO
91 linear-taar pl 1137 1.656113 3567 0.318755 AMO
92 ensemble-taar fi 51 1.470588 282 0.180851 AMO
93 control zh-TW 375 1.853333 1054 0.355787 AMO
94 linear-taar ar 187 1.786096 902 0.207317 AMO
95 ensemble-taar es-AR 5 1.600000 12 0.416667 AMO
96 ensemble-taar en-GB 803 1.676214 2474 0.324576 AMO
97 ensemble-taar sr 27 2.037037 106 0.254717 AMO
98 control ko 2 3.000000 3 0.666667 AMO
99 linear-taar en-GB 804 1.676617 2507 0.320702 AMO
100 control es-MX 4 2.250000 18 0.222222 AMO
101 control ro 117 1.743590 395 0.296203 AMO
102 linear-taar uk 92 1.706522 246 0.373984 AMO
103 ensemble-taar sk 67 2.000000 254 0.263780 AMO
104 linear-taar ko 2 4.000000 5 0.400000 AMO
105 ensemble-taar ar 177 2.000000 950 0.186316 AMO
106 control en-GB 1066 1.694184 3236 0.329419 AMO
107 control el 101 1.782178 306 0.330065 AMO
108 ensemble-taar ca 1 2.000000 3 0.333333 AMO
109 control sk 82 1.853659 330 0.248485 AMO
110 linear-taar sr 41 1.560976 125 0.328000 AMO
111 control es-AR 5 1.600000 11 0.454545 AMO

213 rows × 7 columns


In [19]:
%%bash -s "$S3_PATH"
aws s3 cp locale-installs.csv $1taar-locale-installs.csv


upload: ./locale-installs.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar-locale-installs.csv

Visitation rate to about:addons by branch


In [20]:
mv = get_visitation_rates(d_bl_wl)
mv.to_csv("visits.csv", index=False)
mv


Out[20]:
branch total_visits total_distinct_clients visits_per_client
0 ensemble-taar 132066 108381 1.218535
1 control 148664 139913 1.062546
2 linear-taar 128991 108071 1.193576

Visitation rate to about:addons by branch / en-US vs. Non en-US


In [21]:
mev = get_visitation_rates(d_bl_wl, grouper=['branch', 'is_en_US'])
mev.to_csv("en-us-visits.csv", index=False)
mev


Out[21]:
branch is_en_US total_visits total_distinct_clients visits_per_client
0 ensemble-taar True 63356 52522 1.206275
1 linear-taar True 60953 52196 1.167771
2 control False 77861 72687 1.071182
3 control True 70803 68058 1.040333
4 ensemble-taar False 68710 56583 1.214322
5 linear-taar False 68038 56587 1.202361

Visitation rate to about:addons by branch / locale


In [22]:
mvl = get_visitation_rates(d_bl_wl, ['branch', 'locale'])
mvl.to_csv("locale-visits.csv", index=False)
mvl


Out[22]:
branch locale total_visits total_distinct_clients visits_per_client
0 control nb-NO 2 2 1.000000
1 linear-taar it 2830 2353 1.202720
2 control ru 11356 11078 1.025095
3 linear-taar de 12177 10012 1.216241
4 ensemble-taar fr 9074 7890 1.150063
5 control sr 195 146 1.335616
6 ensemble-taar tr 1409 1199 1.175146
7 linear-taar el 304 247 1.230769
8 ensemble-taar id 2123 1420 1.495070
9 linear-taar es-ES 8412 6247 1.346566
10 control fa 4 3 1.333333
11 ensemble-taar es-MX 16 17 0.941176
12 control da 323 241 1.340249
13 control es-CL 1 4 0.250000
14 linear-taar he 4 1 4.000000
15 ensemble-taar zh-CN 26 30 0.866667
16 control zh-CN 54 52 1.038462
17 ensemble-taar cs 805 675 1.192593
18 control ca 3 4 0.750000
19 control uk 379 335 1.131343
20 linear-taar es-AR 6 6 1.000000
21 linear-taar fr 8763 7652 1.145191
22 ensemble-taar da 303 246 1.231707
23 linear-taar hu 1035 842 1.229216
24 control en-US 70803 68058 1.040333
25 linear-taar ro 361 339 1.064897
26 linear-taar sv-SE 7 4 1.750000
27 linear-taar ja 2298 1570 1.463694
28 control de 13555 12424 1.091033
29 ensemble-taar nb-NO 1 1 1.000000
... ... ... ... ... ...
93 control tr 1552 1601 0.969394
94 control hu 1167 1011 1.154303
95 ensemble-taar hu 1101 853 1.290739
96 control bg 336 284 1.183099
97 linear-taar id 1522 1362 1.117474
98 ensemble-taar fi 332 276 1.202899
99 linear-taar pl 4089 3519 1.161978
100 control zh-TW 1194 1037 1.151398
101 linear-taar ar 999 881 1.133939
102 ensemble-taar es-AR 16 12 1.333333
103 ensemble-taar en-GB 2870 2402 1.194838
104 ensemble-taar sr 156 104 1.500000
105 control ko 9 3 3.000000
106 linear-taar en-GB 2933 2435 1.204517
107 control es-MX 18 18 1.000000
108 ensemble-taar th 1 3 0.333333
109 control ro 389 390 0.997436
110 linear-taar uk 262 236 1.110169
111 ensemble-taar sk 318 249 1.277108
112 control sl 1 1 1.000000
113 linear-taar ko 7 5 1.400000
114 ensemble-taar ar 1105 934 1.183084
115 control en-GB 3338 3138 1.063735
116 control el 383 300 1.276667
117 ensemble-taar ca 3 3 1.000000
118 control sk 367 326 1.125767
119 linear-taar th 5 3 1.666667
120 linear-taar sr 175 125 1.400000
121 control es-AR 14 11 1.272727
122 ensemble-taar ko 3 3 1.000000

123 rows × 5 columns


In [23]:
%%bash -s "$S3_PATH"
aws s3 cp visits.csv $1taar-visits.csv
aws s3 cp en-us-visits.csv $1taar-en-us-visits.csv
aws s3 cp locale-visits.csv $1taar-locale-visits.csv


upload: ./visits.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar-visits.csv
upload: ./en-us-visits.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar-en-us-visits.csv
upload: ./locale-visits.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/taar-locale-visits.csv

Add-on installations per Day/Branch


In [24]:
daily = pd.concat([
    get_install_rates_AMO_total(d_bl_wl, grouper=['branch', 'submission_date_s3']),
    get_install_rates_disco(d_bl_wl, grouper=['branch', 'submission_date_s3'])
])

enrollment = (
    d_bl_wl.filter("ping_type='init'")
    .groupby("submission_date_s3")
    .agg(F.countDistinct("client_id").alias("count")).toPandas()
)

daily.to_csv('daily_installs.csv', index=False)
enrollment.to_csv("enrollment.csv", index=False)

In [25]:
dd = daily[daily.src=='about:addons']
fig, ax = plt.subplots()
for g, data in dd.groupby('branch'):
    data.sort_values("submission_date_s3").plot(x='submission_date_s3', y='prob_at_least_one_install', ax=ax, label=g)
plt.show()



In [26]:
%%bash -s "$S3_PATH"
aws s3 cp daily_installs.csv $1daily_installs.csv
aws s3 cp enrollment.csv $1enrollment.csv


upload: ./daily_installs.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/daily_installs.csv
upload: ./enrollment.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/enrollment.csv

Add-on installations per Day/Branch/en-US locale


In [27]:
daily_en_us = pd.concat([
    get_install_rates_AMO_total(d_bl_wl, grouper=['branch','is_en_US', 'submission_date_s3']),
    get_install_rates_disco(d_bl_wl, grouper=['branch', 'is_en_US', 'submission_date_s3'])
])

enrollment_en_us = (
    d_bl_wl.filter("ping_type='init'")
    .groupby("submission_date_s3", 'is_en_US')
    .agg(F.countDistinct("client_id").alias("count")).toPandas()
)

daily_en_us.to_csv('daily_installs_en_us.csv', index=False)
enrollment_en_us.to_csv("enrollment_en_us.csv", index=False)

In [28]:
dd = daily_en_us[daily_en_us.src=='about:addons']

for en_us, data in dd.groupby("is_en_US"):
    fig, ax = plt.subplots()
    for g, data1 in data.groupby('branch'):
        (data1.sort_values("submission_date_s3")
         .plot(title="en_US:{}".format(en_us), 
               x='submission_date_s3', 
               y='prob_at_least_one_install', 
               ax=ax, label=g))
plt.show()



In [29]:
%%bash -s "$S3_PATH"
aws s3 cp daily_installs_en_us.csv $1daily_installs_en_us.csv
aws s3 cp enrollment_en_us.csv $1enrollment_en_us.csv


upload: ./daily_installs_en_us.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/daily_installs_en_us.csv
upload: ./enrollment_en_us.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/enrollment_en_us.csv

Add-on installations per Day/Branch/ Locale


In [30]:
daily_locale = pd.concat([
    get_install_rates_AMO_total(d_bl_wl, grouper=['branch','locale', 'submission_date_s3']),
    get_install_rates_disco(d_bl_wl, grouper=['branch', 'locale', 'submission_date_s3'])
])

enrollment_locale = (
    d_bl_wl.filter("ping_type='init'")
    .groupby("submission_date_s3", 'is_en_US')
    .agg(F.countDistinct("client_id").alias("count")).toPandas()
)

daily_locale.to_csv('daily_installs_locale.csv', index=False)
enrollment_locale.to_csv("enrollment_locale.csv", index=False)

In [31]:
# uncomment to plot for each locale, some locales
# have incomplete data

# dd = daily_locale[daily_locale.src=='about:addons']

# for locale, data in dd.groupby("locale"):
#     fig, ax = plt.subplots()
#     for g, data1 in data.groupby('branch'):
#         (data1.sort_values("submission_date_s3")
#          .plot(title="Locale:{}".format(locale), 
#                x='submission_date_s3', 
#                y='prob_at_least_one_install', 
#                ax=ax, label=g))
# plt.show()

In [32]:
%%bash -s "$S3_PATH"
aws s3 cp daily_installs_locale.csv $1daily_installs_locale.csv
aws s3 cp enrollment_locale.csv $1enrollment_locale.csv


upload: ./daily_installs_locale.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/daily_installs_locale.csv
upload: ./enrollment_locale.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/enrollment_locale.csv

Old UI vs New UI click through rate

Load pings from portion of the experiment with the old UI (excluded in all prior cells)


In [33]:
def collapse_fields(x):
    """
    Collapsed nested field names 
    and returns a flattened object as a 
    PySpark Row to prepare for DataFrame 
    conversion
    """
    if x is None:
        x = {}
    data = x.get("payload", {}).get("data").get("attributes", {})
    addons= x.get("environment", {}).get("addons", {}).get("activeAddons", {})
    result = Row(
        client_id=x.get("clientId"),
        locale=x.get("environment", {}).get("settings", {}).get("locale"),
        branch=x.get("payload", {}).get("branch"),
        addon_id=data.get("addon_id"),
        clicked_button=data.get("clickedButton"))
    return result

old_ui = (
    Dataset.from_source("telemetry")
           .where(docType="shield-study-addon")
           .where(submissionDate=lambda x: x < START_DATE and x >= '20180305')
           .records(sc)
           .filter(lambda x: x.get("payload", {}).get("study_name") == "TAARExperimentV2")
           .filter(lambda x: x.get("payload", {}).get("addon_version") == "1.0.11")
           .filter(lambda x: x.get("payload", {}).get("testing") == False)
).map(collapse_fields).toDF()


old_clicked = old_ui.filter("clicked_button=true").select('client_id').distinct().count()
old_total = old_ui.select("client_id").distinct().count()

new_clicked = exp_data.filter("clicked_button=true").select('client_id').distinct().count()
new_total = exp_data.select("client_id").distinct().count()


fetching 3315.84379MB in 35981 files...

In [34]:
ctr = pd.DataFrame([[old_clicked, old_total], [new_clicked, new_total]])
ctr['branch'] = ['old', 'new']
ctr.columns = ['n_clicked', 'n_total', 'branch']

ctr['prob_click'] = ctr.n_clicked / ctr.n_total
ctr.to_csv("ctr.csv", index=False)
ctr


Out[34]:
n_clicked n_total branch prob_click
0 53001 1705341 old 0.031079
1 115258 3493614 new 0.032991

In [35]:
%%bash -s "$S3_PATH"
aws s3 cp ctr.csv $1ctr.csv


upload: ./ctr.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/ctr.csv

Old UI vs New UI click through rate by en-US / Non en-US


In [36]:
old_ui = old_ui.withColumn("is_en_US", old_ui.locale == 'en-US')
exp_data = exp_data.withColumn("is_en_US", exp_data.locale == 'en-US')


old_ctr = (
    old_ui.filter("clicked_button = true")
          .groupby("is_en_US")
          .agg(F.countDistinct("client_id").alias("n_clicked"))
    .join(
    old_ui.groupby("is_en_US")
          .agg(F.countDistinct("client_id").alias("n_total")),
    on='is_en_US')
).toPandas()

old_ctr['branch'] = 'old'

new_ctr = (
    exp_data.filter("clicked_button = true")
          .groupby("is_en_US")
          .agg(F.countDistinct("client_id").alias("n_clicked"))
    .join(
    exp_data.groupby("is_en_US")
          .agg(F.countDistinct("client_id").alias("n_total")),
    on='is_en_US')
).toPandas()

new_ctr['branch'] = 'new'

ctr_en_us = pd.concat([old_ctr, new_ctr])
ctr_en_us['prob_click'] = ctr_en_us.n_clicked / ctr_en_us.n_total
ctr_en_us.to_csv("ctr-en-us.csv", index=False)
ctr_en_us


Out[36]:
is_en_US n_clicked n_total branch prob_click
0 True 25259 817055 old 0.030915
1 False 27855 890253 old 0.031289
0 True 58070 1709745 new 0.033964
1 False 58044 1803092 new 0.032191

In [37]:
%%bash -s "$S3_PATH"

aws s3 cp ctr-en-us.csv $1ctr-en-us.csv


upload: ./ctr-en-us.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/ctr-en-us.csv

Old UI vs New UI click through rate by locale


In [38]:
old_ctr = (
    old_ui.filter("clicked_button = true")
          .groupby("locale")
          .agg(F.countDistinct("client_id").alias("n_clicked"))
    .join(
    old_ui.groupby("locale")
          .agg(F.countDistinct("client_id").alias("n_total")),
    on='locale')
).toPandas()

old_ctr['branch'] = 'old'

new_ctr = (
    exp_data.filter("clicked_button = true")
          .groupby("locale")
          .agg(F.countDistinct("client_id").alias("n_clicked"))
    .join(
    exp_data.groupby("locale")
          .agg(F.countDistinct("client_id").alias("n_total")),
    on='locale')
).toPandas()

new_ctr['branch'] = 'new'

In [39]:
ctr_locale = pd.concat([old_ctr, new_ctr])
ctr_locale['prob_click'] = ctr_locale.n_clicked / ctr_locale.n_total
ctr_locale.to_csv("ctr-locale.csv", index=False)
ctr_locale


Out[39]:
locale n_clicked n_total branch prob_click
0 vi 133 2356 old 0.056452
1 ro 224 5895 old 0.037998
2 sl 1 3 old 0.333333
3 es-ES 3860 103745 old 0.037207
4 lv 1 1 old 1.000000
5 pl 1492 61235 old 0.024365
6 sk 170 3845 old 0.044213
7 ko 279 6580 old 0.042401
8 ms 2 42 old 0.047619
9 uk 92 3458 old 0.026605
10 cs 385 11597 old 0.033198
11 sr 82 1878 old 0.043663
12 es-MX 3 51 old 0.058824
13 tr 622 14734 old 0.042215
14 de 4285 177708 old 0.024113
15 hr 1 3 old 0.333333
16 eu 1 1 old 1.000000
17 el 161 3534 old 0.045557
18 it 1260 40374 old 0.031208
19 ar 637 7876 old 0.080879
20 nl 514 17284 old 0.029738
21 zh-CN 4 65 old 0.061538
22 hu 456 13425 old 0.033966
23 en-GB 1132 37295 old 0.030353
24 ru 3957 92472 old 0.042791
25 th 3 12 old 0.250000
26 fa 1 9 old 0.111111
27 bg 177 3341 old 0.052978
28 pt-PT 1 20 old 0.050000
29 pt-BR 1904 104785 old 0.018171
... ... ... ... ... ...
16 de 8289 354731 new 0.023367
17 hi-IN 1 4 new 0.250000
18 el 295 6744 new 0.043743
19 it 2460 83633 new 0.029414
20 ar 1519 19060 new 0.079696
21 nl 964 34500 new 0.027942
22 sv-SE 2 28 new 0.071429
23 zh-CN 13 238 new 0.054622
24 km 1 2 new 0.500000
25 hu 850 24959 new 0.034056
26 ca 2 26 new 0.076923
27 en-GB 2317 75917 new 0.030520
28 ast 1 1 new 1.000000
29 ru 7973 189188 new 0.042143
30 th 3 32 new 0.093750
31 bn-IN 1 1 new 1.000000
32 lt 1 10 new 0.100000
33 fa 2 20 new 0.100000
34 bg 437 7207 new 0.060635
35 pt-PT 4 51 new 0.078431
36 pt-BR 4299 215791 new 0.019922
37 fr 7945 250222 new 0.031752
38 ja 1439 40303 new 0.035705
39 en-US 58070 1709745 new 0.033964
40 id 1580 33968 new 0.046514
41 zh-TW 1240 17890 new 0.069312
42 da 307 6788 new 0.045227
43 fi 284 10631 new 0.026714
44 he 1 6 new 0.166667
45 nb-NO 1 11 new 0.090909

84 rows × 5 columns


In [40]:
%%bash -s "$S3_PATH"


aws s3 cp ctr-locale.csv $1ctr-locale.csv


upload: ./ctr-locale.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/ctr-locale.csv

Add-on Retention by branch


In [41]:
ret = get_uninstall_rates(d_bl_wl)
ret.to_csv("addon-retention.csv", index=False)
ret


Out[41]:
branch n_installed n_total n_retained addon_retention_rate
0 ensemble-taar 1734 23942 22208 0.927575
1 control 2042 30956 28914 0.934035
2 linear-taar 1778 21640 19862 0.917837

Add-on Retention by branch / en-US vs. Non en-US


In [42]:
reten= get_uninstall_rates(d_bl_wl, grouper=['branch', 'is_en_US'])
reten.to_csv("addon-en-us-retention.csv", index=False)
reten


Out[42]:
branch is_en_US n_installed n_total n_retained addon_retention_rate
0 ensemble-taar True 769 10863 10094 0.929209
1 linear-taar True 864 10368 9504 0.916667
2 control False 1042 16461 15419 0.936699
3 control True 1005 14515 13510 0.930761
4 ensemble-taar False 965 13092 12127 0.926291
5 linear-taar False 914 11290 10376 0.919043

Add-on Retention by branch / locale


In [43]:
retl = get_uninstall_rates(d_bl_wl, grouper=['branch', 'locale'])
retl.to_csv("addon-locale-retention.csv", index=False)
retl


Out[43]:
branch locale n_installed n_total n_retained addon_retention_rate
0 linear-taar it 52 437 385 0.881007
1 control ru 193 2694 2501 0.928359
2 linear-taar de 120 1768 1648 0.932127
3 control sr 12 48 36 0.750000
4 ensemble-taar fr 139 1829 1690 0.924002
5 ensemble-taar tr 28 375 347 0.925333
6 linear-taar el 5 56 51 0.910714
7 ensemble-taar id 31 400 369 0.922500
8 linear-taar es-ES 75 1351 1276 0.944486
9 ensemble-taar cs 8 197 189 0.959391
10 control uk 16 104 88 0.846154
11 linear-taar fr 144 1591 1447 0.909491
12 ensemble-taar da 2 91 89 0.978022
13 linear-taar hu 16 173 157 0.907514
14 control en-US 1005 14515 13510 0.930761
15 linear-taar ro 7 99 92 0.929293
16 control de 135 2364 2229 0.942893
17 linear-taar ja 33 250 217 0.868000
18 control ja 37 390 353 0.905128
19 ensemble-taar zh-TW 10 163 153 0.938650
20 ensemble-taar pt-BR 77 1053 976 0.926876
21 linear-taar bg 3 56 53 0.946429
22 control ar 24 358 334 0.932961
23 control fr 116 2551 2435 0.954528
24 ensemble-taar es-CL 2 2 0 0.000000
25 linear-taar vi 7 32 25 0.781250
26 linear-taar cs 10 109 99 0.908257
27 ensemble-taar ru 207 1950 1743 0.893846
28 ensemble-taar it 42 624 582 0.932692
29 linear-taar pt-BR 56 865 809 0.935260
... ... ... ... ... ... ...
50 linear-taar da 3 35 32 0.914286
51 ensemble-taar nl 17 203 186 0.916256
52 ensemble-taar bg 3 58 55 0.948276
53 control nl 11 249 238 0.955823
54 linear-taar sk 4 45 41 0.911111
55 control it 42 668 626 0.937126
56 ensemble-taar de 145 2212 2067 0.934448
57 ensemble-taar ro 1 100 99 0.990000
58 ensemble-taar en-US 769 10863 10094 0.929209
59 control hu 21 236 215 0.911017
60 control tr 38 372 334 0.897849
61 ensemble-taar hu 14 266 252 0.947368
62 control bg 9 71 62 0.873239
63 linear-taar id 23 264 241 0.912879
64 ensemble-taar fi 1 58 57 0.982759
65 linear-taar pl 42 581 539 0.927711
66 control zh-TW 10 260 250 0.961538
67 linear-taar ar 14 150 136 0.906667
68 ensemble-taar en-GB 42 514 472 0.918288
69 ensemble-taar sr 2 34 32 0.941176
70 linear-taar en-GB 49 580 531 0.915517
71 control ro 8 136 128 0.941176
72 linear-taar uk 6 43 37 0.860465
73 ensemble-taar sk 8 81 73 0.901235
74 ensemble-taar ar 17 283 266 0.939929
75 control en-GB 54 630 576 0.914286
76 control el 5 73 68 0.931507
77 control sk 2 66 64 0.969697
78 linear-taar sr 5 24 19 0.791667
79 control es-AR 1 4 3 0.750000

80 rows × 6 columns


In [44]:
%%bash -s "$S3_PATH"

aws s3 cp addon-retention.csv $1addon-retention.csv
aws s3 cp addon-en-us-retention.csv $1addon-en-us-retention.csv
aws s3 cp addon-locale-retention.csv $1addon-locale-retention.csv


upload: ./addon-retention.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/addon-retention.csv
upload: ./addon-en-us-retention.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/addon-en-us-retention.csv
upload: ./addon-locale-retention.csv to s3://net-mozaws-prod-us-west-2-pipeline-analysis/taarv2/addon-locale-retention.csv

Add-on Installation Diversity by branch and source


In [45]:
unique_installs = (
    d_bl_wl.filter("ping_type = 'install'")
           .select("branch", 
                   "src", 
                   F.explode("current_addons").alias("addon_id"))
           .groupby("branch", "src")
           .agg(F.countDistinct("addon_id").alias("n_unique_addons"))
).toPandas()


total_installs = (
 d_bl_wl.filter("ping_type = 'install'")
           .select("branch", 
                   "src", 
                   F.explode("current_addons").alias("addon_id"))
           .groupby("branch", "src")
           .agg(F.count(F.lit(1)).alias("n_total_addons"))
).toPandas()

In [46]:
m = (pd.merge(unique_installs,
          total_installs, on=['branch', 'src'])
    )

m['scaled'] = m.n_unique_addons / m.n_total_addons


m.sort_values('scaled', ascending=False)


Out[46]:
branch src n_unique_addons n_total_addons scaled
4 linear-taar about:addons 2669 122817 0.021732
1 ensemble-taar about:addons 2639 122329 0.021573
2 control about:addons 2990 157856 0.018941
3 ensemble-taar other 2791 155041 0.018002
8 linear-taar other 2715 159970 0.016972
0 ensemble-taar AMO 3856 268948 0.014337
6 linear-taar AMO 3829 269332 0.014217
7 control AMO 4427 422554 0.010477
5 control other 3748 543238 0.006899

Add-on Installation Diversity by branch and source and en-US Locale


In [54]:
unique_installs = (
    d_bl_wl.filter("ping_type = 'install'")
           .select("branch", 
                   "src", "is_en_us",
                   F.explode("current_addons").alias("addon_id"))
           .groupby("branch", "src", 'is_en_us')
           .agg(F.countDistinct("addon_id").alias("n_unique_addons"))
).toPandas()


total_installs = (
 d_bl_wl.filter("ping_type = 'install'")
           .select("branch", "is_en_us",
                   "src",
                   F.explode("current_addons").alias("addon_id"))
           .groupby("branch", "src", 'is_en_us')
           .agg(F.count(F.lit(1)).alias("n_total_addons"))
).toPandas()

In [55]:
m = (pd.merge(unique_installs,
          total_installs, on=['branch', 'src', 'is_en_us'])
    )

m['scaled'] = m.n_unique_addons / m.n_total_addons


m.sort_values('scaled', ascending=False)


Out[55]:
branch src is_en_us n_unique_addons n_total_addons scaled
2 ensemble-taar about:addons True 1950 56358 0.034600
15 linear-taar about:addons True 2001 59908 0.033401
12 linear-taar about:addons False 1861 62909 0.029582
4 control about:addons True 2143 73913 0.028994
11 ensemble-taar about:addons False 1860 65971 0.028194
9 ensemble-taar other True 2173 77978 0.027867
3 linear-taar other True 2178 78587 0.027715
14 control about:addons False 2192 83943 0.026113
5 ensemble-taar other False 1857 77063 0.024097
8 ensemble-taar AMO False 2772 128240 0.021616
1 linear-taar other False 1756 81383 0.021577
0 linear-taar AMO True 2967 138461 0.021428
17 linear-taar AMO False 2796 130871 0.021365
16 ensemble-taar AMO True 2994 140708 0.021278
7 control AMO False 3235 200089 0.016168
13 control AMO True 3505 222465 0.015755
6 control other True 3028 272016 0.011132
10 control other False 2579 271222 0.009509

In [ ]: