In [3]:
'''
MIT License

Copyright (c) 2016 Parvez Ahammad, Qingzhu (Clark) Gao, Prasenjit Dey

Permission is hereby granted, free of charge, to any person obtaining a copy
of this dataset, associated software, documentation files and analysis scripts (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.pyplot import show
from scipy.stats import chi2_contingency
import scipy.stats as scp
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline

import io
import nbformat

def execute_notebook(nbfile):
    """Run every code cell of `nbfile` in the current IPython kernel.

    Used to pull helper functions (pick_one, percentage_match,
    chiSquare_association) from the companion helper notebook into this
    notebook's namespace.
    """
    with io.open(nbfile) as f:
        # `nbformat.current` was removed from nbformat long ago; use the
        # modern reader and upconvert to the v4 schema, where cells are a
        # flat `nb.cells` list and the code text lives in `cell.source`.
        nb = nbformat.read(f, as_version=4)
    ip = get_ipython()
    for cell in nb.cells:
        if cell.cell_type != 'code':
            continue
        ip.run_cell(cell.source)

execute_notebook("sp_analysis_helper.ipynb")

In [4]:
# Summary of the data, numbers of valid/complete/incomplete sessions
# Valid sessions are those that accurately labeled at least 4 out of 5 honeypot pairs

# Session-level summary counts for SpeedPerception phase 1.
num_total_sessions = 5444
num_complete_sessions = 2914          # fixed typo: was num_comlete_sessions
num_valid_sessions = 2772             # complete sessions that passed the honeypot check
num_invalid_sessions = num_complete_sessions - num_valid_sessions
num_incomplete_sessions = 2530
total_vote = 77482
valid_vote = num_valid_sessions * 21  # 21 counted votes per valid session

print('%d Total sessions' % num_total_sessions)
print('%d Complete sessions' % num_complete_sessions)
print('%d Valid sessions' % num_valid_sessions)
# Reuse the stored difference instead of recomputing it inline.
print('%d Invalid sessions' % num_invalid_sessions)
print('%d Incomplete' % num_incomplete_sessions)
print('%d Total votes' % total_vote)
print('%d Total valid votes' % valid_vote)

# Pie chart of valid / invalid / incomplete session counts.
fig = plt.figure(figsize=(8, 8))
fig.suptitle("SpeedPerception Data of %d sessions"
             % (num_complete_sessions + num_incomplete_sessions), fontsize=20)

fig1 = plt.pie(
    [num_valid_sessions, num_invalid_sessions, num_incomplete_sessions],
    labels=['Valid', 'Invalid', 'Incomplete'],
    colors=['darkkhaki', '#f5deb3', '#ffa07a'],
    startangle=90,
    shadow=True,
    explode=(0, 0, 0.04),
    labeldistance=0.8,
    autopct='%1.f%%'
)


5444 Total sessions
2914 Complete sessions
2772 Valid sessions
142 Invalid sessions
2530 Incomplete
77482 Total votes
58212 Total valid votes

In [5]:
# Pie chart: share of valid vs. invalid votes over all recorded votes.
vote_counts = [valid_vote, total_vote - valid_vote]
vote_labels = ['Valid Votes', 'Invalid Votes']

fig = plt.figure(figsize=(8, 8))
fig.suptitle("SpeedPerception Data of %d votes" % total_vote, fontsize=20)

fig1 = plt.pie(
    vote_counts,
    labels=vote_labels,
    colors=['darkkhaki', '#ffa07a'],
    startangle=90,
    shadow=True,
    explode=(0, 0.04),
    labeldistance=1.1,
    autopct='%1.f%%'
)



In [6]:
# Load the per-vote records for valid sessions (16 recorded votes/session).
df_valid = pd.read_csv('./data/sp_phase1_csv3.csv')
n_valid_votes = len(df_valid)
print('There are %d valid votes' % n_valid_votes)
print('There are %d valid sessions' % (n_valid_votes / 16.))
df_valid.head(3)


There are 44352 valid votes
There are 2772 valid sessions
Out[6]:
sessionID pairID TimeToClick_InMS userAgent vote
0 58aPyrqJm6zGYK6eJ 5786da3020cfb49feadbbe6c 5643.0 Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ... left
1 58aPyrqJm6zGYK6eJ 5786da3020cfb49feadbbe6d 4561.0 Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ... right
2 58aPyrqJm6zGYK6eJ 5786da2f20cfb49feadbbe5a 5484.0 Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ... equal

In [7]:
# Bar chart: number of valid votes cast on each of the video pairs.
pair_count = df_valid.groupby(['pairID']).agg(['count'])
votes_per_pair = pd.Series(pair_count['vote']['count'].values)

plt.figure(figsize=(20, 6))
ax = votes_per_pair.plot(kind='bar')
ax.set(xticklabels=[], xlabel='160 Video Pairs', ylabel='Number of Votes')


Out[7]:
[<matplotlib.text.Text at 0x11bdf7250>,
 [],
 <matplotlib.text.Text at 0x11bddadd0>]

In [8]:
# Compute the majority vote for each pair: tally equal/left/right counts
# and take the most-voted option, encoded as 0=equal, 1=left, 2=right.
max_vote = {}
for id_ in df_valid['pairID'].unique():
    one_pair = df_valid.loc[df_valid['pairID'] == id_]
    tmp = one_pair.groupby(['vote']).count()['sessionID']
    # Make sure all three vote categories exist, even with zero votes.
    for v in ['equal', 'left', 'right']:
        if v not in tmp.index.values:
            tmp.loc[v] = 0
    # .loc replaces the deprecated .ix indexer (removed in pandas >= 1.0).
    count0, count1, count2 = tmp.loc['equal'], tmp.loc['left'], tmp.loc['right']
    # Ascending sort by count; the last entry is the majority pick.
    # The stable sort preserves the original left/right/equal tie-break order.
    new_l = sorted([(count1, 'pick1'), (count2, 'pick2'), (count0, 'pick0')],
                   key=lambda x: x[0])
    max_vote[id_] = int(new_l[2][1][-1])

# Attach the majority pick to every vote row, then encode votes numerically.
df_valid['majority_pick'] = [max_vote[_id] for _id in df_valid['pairID'].ravel()]
mapping = {'equal': 0, 'left': 1, 'right': 2}
df_valid = df_valid.replace({'vote': mapping})
df_valid.head(3)


Out[8]:
sessionID pairID TimeToClick_InMS userAgent vote majority_pick
0 58aPyrqJm6zGYK6eJ 5786da3020cfb49feadbbe6c 5643.0 Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ... 1 1
1 58aPyrqJm6zGYK6eJ 5786da3020cfb49feadbbe6d 4561.0 Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ... 2 2
2 58aPyrqJm6zGYK6eJ 5786da2f20cfb49feadbbe5a 5484.0 Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ... 0 2

In [9]:
# Side-by-side bars: share of equal/left/right among the per-pair majority
# picks vs. the full population of individual votes.
majority_picks = list(max_vote.values())   # list(...) works on Py2 and Py3
population_votes = df_valid['vote'].tolist()
n_pairs = float(len(max_vote))
n_votes = float(len(df_valid))

mVote_count = [majority_picks.count(i) / n_pairs for i in range(3)]
pVote_count = [population_votes.count(i) / n_votes for i in range(3)]

vote_count = pd.DataFrame({
        'vote_count': mVote_count + pVote_count,
        'vote': ['equal', 'left', 'right'] * 2,
        'type': ['majority'] * 3 + ['population'] * 3
    })

plt.figure(figsize=(12, 6))
plt.title('Vote Count Distribution', size=20)
ax = sns.barplot(x="type", y="vote_count", hue="vote", data=vote_count,
                 palette="muted", alpha=0.8)
# Annotate each bar with its height (fraction of votes).
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + 0.1, height + 0.005, '%1.2f' % height)
ax.set(xlabel='', ylabel='percentage', ylim=(0, 1))
plt.legend(loc='best')
sns.despine(left=False, bottom=False)



In [10]:
# Load per-video performance metrics; keep only the metric columns
# (prefixed with '_') plus the video uid used for joining.
metric_columns = [
    '_TTFB',
    '_firstPaint',
    '_render',
    '_domContentLoadedEventEnd',
    '_SpeedIndex',
    '_PSI',
    '_loadTime',
    '_lastVisualChange',
    '_visualComplete',
]
df_perf = pd.read_csv('./data/sp_phase1_csv1.csv')
df_perf = df_perf[metric_columns + ['uid']]

# Pair table: which video uid was shown on the left / right of each pair.
df_pair = pd.read_csv('./data/sp_phase1_csv2.csv')
df_pair.head(3)


Out[10]:
pairID uid_left uid_right
0 5786da3020cfb49feadbbe6c 160605_JY_2PB 160605_X9_2DH
1 5786da3020cfb49feadbbe6d 160605_X9_2DH 160605_1W_2ZH
2 5786da2f20cfb49feadbbe5a 160605_TF_2MS 160605_HA_2PK

In [11]:
# Merge per-video performance metrics into the pair table, then derive,
# for each metric, which side the metric says is faster (via pick_one,
# defined in the helper notebook).

# Index metric rows by uid once (keeping the first row per uid, matching
# the original first-match `.values[0]` lookup) instead of re-scanning
# df_perf for every pair: O(pairs) instead of O(pairs * rows).
perf_by_uid = df_perf.drop_duplicates(subset='uid', keep='first').set_index('uid')

for col in df_perf.columns.tolist()[:-1]:      # all metric columns; 'uid' is last
    df_pair[col[1:] + "_%d" % 1] = df_pair['uid_left'].map(perf_by_uid[col])
    df_pair[col[1:] + "_%d" % 2] = df_pair['uid_right'].map(perf_by_uid[col])

pair_perf = pd.merge(df_valid, df_pair, on=['pairID'])

d_diff = {}
# Columns 8..25 are the metric_1 / metric_2 columns appended above.
pair_perf_m = pair_perf.iloc[:, range(8, 26)]

for col in df_perf.columns.tolist()[:-1]:
    l1 = pair_perf_m[col[1:] + '_1'].values
    l2 = pair_perf_m[col[1:] + '_2'].values
    # pick_one comes from the helper notebook; presumably it returns one
    # pick (0/1/2) per row -- confirm against sp_analysis_helper.ipynb.
    d_diff[col[1:] + "_pick"] = pick_one(l1, l2)

perf_pick = pd.DataFrame(d_diff)
df_valid_perf = pd.concat([pair_perf[['pairID', 'vote', 'majority_pick']], perf_pick], axis=1)
df_pair.head(3)


Out[11]:
pairID uid_left uid_right TTFB_1 TTFB_2 firstPaint_1 firstPaint_2 render_1 render_2 domContentLoadedEventEnd_1 ... SpeedIndex_1 SpeedIndex_2 PSI_1 PSI_2 loadTime_1 loadTime_2 lastVisualChange_1 lastVisualChange_2 visualComplete_1 visualComplete_2
0 5786da3020cfb49feadbbe6c 160605_JY_2PB 160605_X9_2DH 178 1338 1478 2782 1696 2896 2085 ... 3419 4704 5067 4777 3924 7011 8396 7996 8400 8000
1 5786da3020cfb49feadbbe6d 160605_X9_2DH 160605_1W_2ZH 1338 280 2782 2191 2896 2296 3542 ... 4704 3661 4777 4059 7011 3392 7996 13308 8000 8400
2 5786da2f20cfb49feadbbe5a 160605_TF_2MS 160605_HA_2PK 97 611 1976 1622 2096 1296 2533 ... 3048 3178 3001 4200 3625 5026 8095 7996 8100 8000

3 rows × 21 columns


In [12]:
# Percentage match of each performance based pick to population pick
# (i.e. against every individual vote row). `percentage_match` is defined
# in the helper notebook; `target1` is also reused as a label in the
# chi-square cells below.
target1 = 'Population'
percentage_match(df_valid_perf, target1)



In [13]:
# Percentage match of each performance-based pick to the majority pick.
# One row per pair suffices here: majority_pick and the metric picks are
# duplicated across every vote row of the same pair.
unique_pair_perf = df_valid_perf.drop_duplicates(['pairID'], keep='first')
target2 = 'Majority'   # fixed typo: was 'Majrotiy'
percentage_match(unique_pair_perf, target2)



In [14]:
# Cramer's V: association between picks based on chi-square test
# statistics, with respect to the population vote.
# Map numeric picks back to category labels for the contingency tables.
label_map = {0: 'equal', 1: 'left', 2: 'right'}   # fixed typo: was 'euqal'
d_cat = {}
for col in df_valid_perf.columns.tolist():
    # NOTE(review): for non-pick columns such as 'pairID' this lookup
    # yields None -- presumably chiSquare_association only uses the pick
    # columns; confirm against the helper notebook.
    # list(map(...)) keeps the Python 2 list semantics under Python 3.
    d_cat[col] = list(map(label_map.get, df_valid_perf[col]))
chiSquare_association(d_cat, target1)



In [15]:
# WRT to majority
# Same chi-square / Cramer's V association as above, now measured against
# the majority pick (reuses d_cat built in the previous cell).
chiSquare_association(d_cat, target2)



In [16]:
# Histogram of each performance metric across all videos, with the median
# marked by a dashed red line.
col_m = df_perf.columns.tolist()[:-1]      # metric columns ('uid' is last)
cmap = plt.cm.prism
colors = cmap(np.linspace(0., 1., 2))
fig = plt.figure(figsize=(20, 10))

chart_count = 0
for chart_count, metric in enumerate(col_m, start=1):
    data = df_perf[metric]
    ax = fig.add_subplot(3, 3, chart_count)
    ax.set_title('%s median = %d' % (metric[1:], data.median()))
    ax.hist(data, bins=50)
    ax.axvline(data.median(), color='red', linestyle='dashed', linewidth=2)



In [17]:
import operator

pair_perf_copy = pair_perf.copy()
# Scale observed click times before comparing them to metric timestamps.
# NOTE(review): the provenance of this exact constant is not shown here
# (presumably GIF playback was slowed by this factor) -- confirm.
GIF_DURATION_STRETCH_FACTOR = 1.09412199387
offset = 0

pair_perf_copy['TimeToClick_InMS'] = GIF_DURATION_STRETCH_FACTOR * pair_perf_copy['TimeToClick_InMS']

# Median of each visual metric, sorted ascending, to order the charts
# along the page-load timeline.
visual_metrics_median = {}
for metric in col_m:
    visual_metrics_median[metric[1:]] = df_perf[metric].median()
median_sorted_metrics = sorted(visual_metrics_median.items(), key=operator.itemgetter(1))
print(median_sorted_metrics)

sorted_visual_metrics = [name for name, _ in median_sorted_metrics]
print('************** Timing Analysis ****************')
### 3.1 : Count of votes made before/after/in-between the visual metrics of a pair,
### shown as one pie chart per metric.
cmap = plt.cm.prism
colors = cmap(np.linspace(0., 1., 2))
fig = plt.figure(figsize=(20, 10))

before_both_metrics = []
after_both_metrics = []
in_between_metrics = []
n_rows = float(len(pair_perf_copy))
# Adjusted click time, hoisted out of the loop (loop-invariant).
click_time = pair_perf_copy['TimeToClick_InMS'] - offset

for chart_count, metric in enumerate(sorted_visual_metrics, start=1):
    left_metric = pair_perf_copy[metric + '_%d' % 1]
    right_metric = pair_perf_copy[metric + '_%d' % 2]

    # Boolean-mask sums instead of the fragile `filtered.count()[0]`,
    # which counted non-null values of the first column rather than rows
    # (and relies on positional Series indexing, deprecated in pandas).
    timings_before_both_metrics = int(((click_time < left_metric) & (click_time < right_metric)).sum())
    timings_after_both_metrics = int(((click_time > left_metric) & (click_time > right_metric)).sum())
    timings_in_between_metrics = len(pair_perf_copy) - timings_before_both_metrics - timings_after_both_metrics

    before_both_metrics.append(timings_before_both_metrics / n_rows)
    after_both_metrics.append(timings_after_both_metrics / n_rows)
    in_between_metrics.append(timings_in_between_metrics / n_rows)

    ax = fig.add_subplot(3, 3, chart_count)
    ax.set_title(metric)
    ax.pie(
        [timings_before_both_metrics, timings_after_both_metrics, timings_in_between_metrics],
        labels=['Before', 'After', 'In Between'],
        colors=['darksalmon', 'darkkhaki', 'darkturquoise'],
        startangle=200,
        shadow=True,
        explode=(0, 0.01, 0.01),
        autopct='%1.1f%%',
        labeldistance=1.1
    )
    ax.axis('equal')


[('TTFB', 373.0), ('firstPaint', 1651.0), ('render', 1896.0), ('domContentLoadedEventEnd', 2645.0), ('SpeedIndex', 4165.0), ('PSI', 4283.0), ('loadTime', 6024.0), ('visualComplete', 9000.0), ('lastVisualChange', 9196.0)]
************** Timing Analysis ****************

In [18]:
# Point plot of the before/after/in-between click-time fractions along the
# metric timeline (metrics ordered by ascending median).
fig = plt.figure(figsize=(12, 6))
n_metrics = len(sorted_visual_metrics)   # removed unused `total_metrics` list
ttd_timeline = {
    'percentage': in_between_metrics + after_both_metrics + before_both_metrics,
    'click_time': ['in_between_metrics'] * n_metrics +
                  ['after_both_metrics'] * n_metrics +
                  ['before_both_metrics'] * n_metrics,
    'metrics_timeline': sorted_visual_metrics * 3
}
ttd_df = pd.DataFrame(ttd_timeline)
sns.pointplot(x='metrics_timeline', y='percentage', hue='click_time', data=ttd_df,
              palette={'in_between_metrics': 'darkturquoise',
                       'after_both_metrics': 'darkkhaki',
                       'before_both_metrics': 'darksalmon'})
plt.ylabel('percentage')
plt.title('TimeToClick_pointplot')
plt.xticks(rotation=30)


Out[18]:
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), <a list of 9 Text xticklabel objects>)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: