In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime
from __future__ import division

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Length, in days, of the build_id window to analyze. The window ends 2 days
# before the current date, so it starts (TimeWindow + 2) days ago.
TimeWindow = 14

# Additional keyword filters passed straight through to get_pings().
Filters = {
    'app': 'Firefox',
    
    # We sample 0.5% of pings. For testing, it is better to use a small number
    # here (like 0.001) to speed up processing time.
    'fraction': 0.005,
    
    # Optionally restrict pings to a single channel.
    # 'channel': 'beta',
}

# Vendor ID strings (lowercase hex) as they are compared against an adapter's
# 'vendorID' field. AMD and ATI intentionally map to the same ID.
VendorIDs = {
    'Intel': '0x8086',
    'NVIDIA': '0x10de',
    'AMD': '0x1002',
    'ATI': '0x1002'
}

In [3]:
###############################
# This section gathers pings. #
###############################

def run_get_pings():
    """Fetch the pings whose build_id falls inside the configured window.

    The window ends 2 days before the current local time and spans
    TimeWindow days, i.e. it starts (TimeWindow + 2) days ago. Both
    endpoints are formatted as YYYYMMDD strings and passed to get_pings()
    as the build_id range, together with the keyword filters in Filters.

    Returns:
        Whatever get_pings() returns for `sc` (presumably the SparkContext
        supplied by the notebook environment -- it is not defined here).
    """
    def fmt_date(d):
        return d.strftime("%Y%m%d")

    # Read the clock once so both endpoints derive from the same instant;
    # two separate now() calls could straddle midnight and skew the window.
    now = datetime.datetime.now()
    window_end = now - datetime.timedelta(days=2)
    window_start = window_end - datetime.timedelta(days=TimeWindow)
    return get_pings(sc,
                     build_id=(fmt_date(window_start), fmt_date(window_end)),
                     **Filters)

# Fetch the pings selected by TimeWindow and Filters.
raw_pings = run_get_pings()

In [4]:
######################################################################
# This section takes the raw ping list, then formats and filters it. #
######################################################################

# Project every raw ping onto the few fields this analysis reads (client id,
# build version, OS details, graphics adapters), then pass the projection to
# get_one_ping_per_client. To see what the resulting entries look like,
# execute "unique_pings.take(1)".
unique_pings = get_one_ping_per_client(
    get_pings_properties(
        raw_pings,
        [
            "clientId",
            "environment/build/version",
            "environment/system/os/name",
            "environment/system/os/version",
            "environment/system/os/servicePackMajor",
            "environment/system/os/servicePackMinor",
            "environment/system/gfx/adapters",
        ],
    )
)

In [5]:
# We add two extra steps. The first rewrites the ping to have some
# information more easily accessible (like the primary adapter),
# and the second step removes any pings that don't have adapter
# information.
def rewrite_ping(p):
    """Promote the primary graphics adapter to top-level keys of the ping.

    Args:
        p: Ping dictionary. It is modified in place.

    Returns:
        None when the ping's 'environment/system/gfx/adapters' entry is
        missing or empty. Otherwise `p` itself, with:
          * p['adapter'] set to the first listed adapter, and
          * p['driverVersion'] set to a list of the integer components of
            the adapter's dotted driver version. The key is only added when
            the adapter reports a version with at least one numeric
            component, so "'driverVersion' in p" means "version is known".
    """
    adapters = p.get('environment/system/gfx/adapters', None)
    if not adapters:
        return None
    adapter = adapters[0]

    p['adapter'] = adapter

    # .get() so an adapter record without the key is treated like a None
    # version instead of raising KeyError.
    version_string = adapter.get('driverVersion')
    if version_string:
        # Non-numeric components are dropped (best effort).
        components = [int(n) for n in version_string.split('.') if n.isdigit()]
        # An empty list would later compare as version 0.0.0.0 and be
        # reported as an outdated driver; leave the key unset instead.
        if components:
            p['driverVersion'] = components
    return p

def filter_ping(p):
    """Return True only for pings that carry an 'adapter' entry.

    rewrite_ping returns None for pings without adapter information, so None
    must be rejected explicitly: evaluating "'adapter' in None" raises
    TypeError.
    """
    return p is not None and 'adapter' in p

# Annotate each ping with its primary adapter, drop the ones that have none,
# and cache the result because every later cell reads from it.
pings = (
    unique_pings
    .map(rewrite_ping)
    .filter(filter_ping)
    .cache()
)

In [ ]:
# Observe the format of a single rewritten ping. This may take some time
# since it has to execute the pipeline.
pings.take(1)

In [6]:
# Tally every ping that survived rewriting and filtering; each remaining ping
# is reported as one session.
TotalSessions = pings.count()
session_report = 'Number of sessions: {0}'
print(session_report.format(TotalSessions))


Number of sessions: 150596

In [7]:
##############################################
# Helper function to compare version tuples. #
##############################################
def compare_version_tuples(v1, v2):
    """Compare two version sequences component by component.

    The shorter sequence is padded with zeros, so (1,) and (1, 0) are equal.

    Args:
        v1: Sequence of integers (tuple or list), e.g. (8, 15, 10, 2622).
        v2: Sequence of integers to compare against.

    Returns:
        A negative number if v1 < v2, a positive number if v1 > v2, and 0 if
        they are equal. The magnitude is the difference of the first pair of
        components that differ.
    """
    width = max(len(v1), len(v2))
    # Pad explicitly rather than indexing with xrange(), which only exists on
    # Python 2; this form behaves the same on Python 2 and Python 3.
    padded1 = tuple(v1) + (0,) * (width - len(v1))
    padded2 = tuple(v2) + (0,) * (width - len(v2))
    for x1, x2 in zip(padded1, padded2):
        if x1 != x2:
            return x1 - x2
    return 0

# Tests: ordering, equality, and zero-padding of the shorter version on
# either side of the comparison.
assert compare_version_tuples((1, 0), (1, 1)) < 0
assert compare_version_tuples((1, 1), (1, 0)) > 0
assert compare_version_tuples((1, 1), (1, 1)) == 0
assert compare_version_tuples((1,), (1, 0)) == 0
# Mirror of the previous case: the longer version comes first.
assert compare_version_tuples((1, 0), (1,)) == 0
assert compare_version_tuples((1, 0), (2, 5)) < 0

In [8]:
# Sample filter #1 - how many people are using Intel devices
# with a driver less than 8.15.10.2622? (bug 1175366).
# Drivers strictly older than this version are the ones being counted.
BadVersion = (8, 15, 10, 2622)

def sample_filter_1(p):
    """Match pings whose adapter is Intel with a known driver below BadVersion.

    Pings from other vendors, or without a 'driverVersion' entry, never match.
    """
    is_intel = p['adapter']['vendorID'] == VendorIDs['Intel']
    if is_intel and 'driverVersion' in p:
        return compare_version_tuples(p['driverVersion'], BadVersion) < 0
    return False

sample_result_1 = pings.filter(sample_filter_1)

# Evaluate each count exactly once and reuse it for both the printed number
# and the percentage.
matched_count_1 = sample_result_1.count()
total_count_1 = pings.count()
# Report 0% instead of raising ZeroDivisionError when no pings were collected.
# float() keeps this a true division even without "from __future__ import
# division".
if total_count_1:
    matched_percent_1 = (float(matched_count_1) / total_count_1) * 100
else:
    matched_percent_1 = 0.0
print('{0} out of {1} sessions matched. ({2:.2f}%)'.format(
    matched_count_1,
    total_count_1,
    matched_percent_1))


48990 out of 150596 sessions matched. (32.53%)

In [9]:
# Sample filter #2 - how many users have either devices:
#   0x8086, 0x2e32 - Intel G41 express graphics
#   0x8086, 0x2a02 - Intel GM965, Intel X3100
# See bug 1116812.
#
# Note that vendor and deviceID hex digits are lowercase.
def sample_filter_2(p):
    """Match pings whose adapter is an Intel device with ID 0x2e32 or 0x2a02."""
    adapter = p['adapter']
    return (adapter['vendorID'] == VendorIDs['Intel']
            and adapter['deviceID'] in ('0x2e32', '0x2a02'))

sample_result_2 = pings.filter(sample_filter_2)

# Evaluate each count exactly once and reuse it for both the printed number
# and the percentage.
matched_count_2 = sample_result_2.count()
total_count_2 = pings.count()
# Report 0% instead of raising ZeroDivisionError when no pings were collected.
# float() keeps this a true division even without "from __future__ import
# division".
if total_count_2:
    matched_percent_2 = (float(matched_count_2) / total_count_2) * 100
else:
    matched_percent_2 = 0.0
print('{0} out of {1} sessions matched. ({2:.2f}%)'.format(
    matched_count_2,
    total_count_2,
    matched_percent_2))


10964 out of 150596 sessions matched. (7.28%)

In [ ]: