To measure traffic on English Wikipedia from January 2008 through September 2017, collect data from two different API endpoints: the legacy Pagecounts API and the Pageviews API.
In [1]:
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize
import copy
from datetime import datetime
import plotly
import plotly.graph_objs as go
from plotly import tools
plotly.__version__
plotly.offline.init_notebook_mode(connected=True)
headers={'User-Agent' : 'https://github.com/mbzhuang', 'From' : 'mbzhuang@uw.edu'}
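The headers defined above only take effect if they are passed with each request. Below is a minimal sketch of a reusable download helper, assuming the imports and `headers` from the cell above; the name `fetch_json` is illustrative and not part of the original notebook:
def fetch_json(endpoint, params, outfile_name):
    # str.format() silently ignores keys that do not appear in the URL template,
    # so one params dict can be reused across both APIs.
    api_call = requests.get(endpoint.format(**params), headers=headers)
    api_call.raise_for_status()  # fail loudly on HTTP errors
    response = api_call.json()
    with open(outfile_name, 'w') as outfile:
        json.dump(response, outfile)
    return response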
Get pagecounts of en.wikipedia.org through the desktop site.
In [2]:
endpoint = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access}/{granularity}/{start}/{end}'
params = {'project' : 'en.wikipedia.org',
          'access' : 'desktop-site',
          'agent' : 'user',
          'granularity' : 'monthly',
          'start' : '2008010100',
          'end' : '2016080100'}
# Note: the legacy Pagecounts URL template has no {agent} placeholder, so that key is ignored by format().
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
with open('pagecounts_desktop-site_200801-201607.json', 'w') as outfile:
    json.dump(response, outfile)
Get pagecounts of en.wikipedia.org through mobile.
In [3]:
params = {'project' : 'en.wikipedia.org',
          'access' : 'mobile-site',
          'agent' : 'user',
          'granularity' : 'monthly',
          'start' : '2008010100',
          'end' : '2016080100'}
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
with open('pagecounts_mobile-site_200801-201607.json', 'w') as outfile:
    json.dump(response, outfile)
Get page view counts of en.wikipedia.org through desktop, using the Pageviews API.
In [4]:
endpoint = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
params = {'project' : 'en.wikipedia.org',
          'access' : 'desktop',
          'agent' : 'user',
          'granularity' : 'monthly',
          'start' : '2015070100',
          'end' : '2017100100'}
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
with open('pageviews_desktop-site_201507-201709.json', 'w') as outfile:
    json.dump(response, outfile)
Get page view counts of en.wikipedia.org through mobile-web and mobile-app.
In [5]:
params = {'project' : 'en.wikipedia.org',
          'access' : 'mobile-web',
          'agent' : 'user',
          'granularity' : 'monthly',
          'start' : '2015070100',
          'end' : '2017100100'}
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
with open('pageviews_mobile-web_201507-201709.json', 'w') as outfile:
    json.dump(response, outfile)

params = {'project' : 'en.wikipedia.org',
          'access' : 'mobile-app',
          'agent' : 'user',
          'granularity' : 'monthly',
          'start' : '2015070100',
          'end' : '2017100100'}
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
with open('pageviews_mobile-app_201507-201709.json', 'w') as outfile:
    json.dump(response, outfile)
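The five download cells above repeat the same pattern, so they could also be driven by a short loop. A sketch using the hypothetical `fetch_json` helper introduced earlier, shown here for the two mobile pageview requests:
pageviews_endpoint = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
for access in ['mobile-web', 'mobile-app']:
    params = {'project': 'en.wikipedia.org',
              'access': access,
              'agent': 'user',
              'granularity': 'monthly',
              'start': '2015070100',
              'end': '2017100100'}
    # File names mirror the ones used in the cells above.
    fetch_json(pageviews_endpoint, params,
               'pageviews_{}_201507-201709.json'.format(access))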
Define a function that loads data from a JSON file into a dataframe, then use it to load the five JSON files created in step 1.
In [6]:
def json_to_DataFrame(json_file):
    with open(json_file, 'r') as infile:
        json_content = json.load(infile)
    DataFrame = json_normalize(json_content['items'])
    return DataFrame

df_pagecounts_desktop = json_to_DataFrame("pagecounts_desktop-site_200801-201607.json")
df_pagecounts_mobile = json_to_DataFrame("pagecounts_mobile-site_200801-201607.json")
df_pageviews_desktop = json_to_DataFrame("pageviews_desktop-site_201507-201709.json")
df_pageviews_mobile_web = json_to_DataFrame("pageviews_mobile-web_201507-201709.json")
df_pageviews_mobile_app = json_to_DataFrame("pageviews_mobile-app_201507-201709.json")
Take a look at the dataframes.
In [7]:
df_pagecounts_desktop.head()
Out[7]:
In [8]:
df_pagecounts_mobile.head()
Out[8]:
In [9]:
df_pageviews_desktop.head()
Out[9]:
In [10]:
df_pageviews_mobile_web.head()
Out[10]:
In [11]:
df_pageviews_mobile_app.head()
Out[11]:
Combine mobile-app and mobile-web views into a new dataframe, df_pageviews_mobile. Since both dataframes have the same dimensions and are sorted on timestamp, we can simply add the two views columns to get total mobile views.
In [12]:
df_pageviews_mobile = copy.copy(df_pageviews_mobile_web)
df_pageviews_mobile.views = df_pageviews_mobile.views + df_pageviews_mobile_app.views
df_pageviews_mobile.head()
Out[12]:
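The element-wise addition above relies on the two mobile frames having identical, aligned timestamps. A small check of that assumption, plus a timestamp-keyed merge as a more defensive alternative (a sketch, not part of the original notebook):
# Verify that both frames cover exactly the same months, in the same order.
assert (df_pageviews_mobile_web.timestamp.values ==
        df_pageviews_mobile_app.timestamp.values).all()

# Defensive alternative: join on timestamp and sum the two views columns.
merged = pd.merge(df_pageviews_mobile_web, df_pageviews_mobile_app,
                  on='timestamp', suffixes=('_web', '_app'))
merged['views'] = merged.views_web + merged.views_app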
Merge the four dataframes on timestamp and reshape the result into the final table.
In [13]:
# Merge the desktop and mobile pagecounts dataframes.
df_pagecounts = pd.merge(df_pagecounts_desktop, df_pagecounts_mobile, on='timestamp', how='left')
# Merge the desktop and mobile pageviews dataframes.
df_pageviews = pd.merge(df_pageviews_desktop, df_pageviews_mobile, on='timestamp')
# Subset the merged dataframes and rename column names
df_pagecounts = df_pagecounts[['timestamp', 'count_x', 'count_y']]
df_pagecounts.columns = ['timestamp', 'pagecount_desktop_views', 'pagecount_mobile_views']
df_pageviews = df_pageviews[['timestamp', 'views_x', 'views_y']]
df_pageviews.columns = ['timestamp', 'pageview_desktop_views', 'pageview_mobile_views']
# Finally, merge the pagecount and pageview dataframes. An outer merge keeps months
# that appear in only one source (pagecounts end in 2016-07, pageviews start in 2015-07).
df_total = pd.merge(df_pagecounts, df_pageviews, on='timestamp', how='outer')
df_total = df_total.sort_values('timestamp')
# Fill the missing values with 0
df_total = df_total.fillna(value = 0)
# Create two new columns, 'pagecount_all_views' and 'pageview_all_views', each the sum
# of the corresponding desktop and mobile counts.
df_total['pagecount_all_views'] = df_total.pagecount_desktop_views + df_total.pagecount_mobile_views
df_total['pageview_all_views'] = df_total.pageview_desktop_views + df_total.pageview_mobile_views
# Create two new columns, 'year' and 'month', that are both based on the column 'timestamp'
df_total.timestamp = pd.to_datetime(df_total.timestamp, format='%Y%m%d%H')
df_total['year'] = df_total['timestamp'].dt.year
df_total['month'] = df_total['timestamp'].dt.month
# Do subsetting to get the final dataframe
df_total = df_total[['year', 'month',
'pagecount_all_views', 'pagecount_desktop_views', 'pagecount_mobile_views',
'pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views']]
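A few quick sanity checks on the combined table (a sketch; the expected date range assumes the outer merge above, which keeps the pageview months after July 2016):
# Each 'all_views' column should equal the sum of its desktop and mobile parts.
assert (df_total.pagecount_all_views ==
        df_total.pagecount_desktop_views + df_total.pagecount_mobile_views).all()
assert (df_total.pageview_all_views ==
        df_total.pageview_desktop_views + df_total.pageview_mobile_views).all()
# The table should run from January 2008 through September 2017.
print(df_total.year.min(), df_total.year.max(), len(df_total))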
In [14]:
df_total.head()
Out[14]:
Save the df_total dataframe to a CSV file (tab-separated).
In [15]:
df_total.to_csv("en-wikipedia_traffic_200801-201709.csv", sep='\t', index=False)
Read the CSV file created in step 2 into a dataframe.
In [16]:
df = pd.read_csv("en-wikipedia_traffic_200801-201709.csv", sep='\t')
Create a 'date' column that combines year, month, and day; it will serve as the x-axis when plotting.
In [17]:
df['day'] = '01'
df['date']= df.apply(lambda x:datetime.strptime("{0} {1} {2}".format(x['year'],x['month'], x['day']), "%Y %m %d"),axis=1)
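An equivalent, slightly more direct construction assembles the dates from the year and month columns in one call (a sketch; the apply/strptime version above produces the same result):
df['date'] = pd.to_datetime(dict(year=df.year, month=df.month, day=1))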
Use Plotly to create an interactive visualization to present the page view information on English Wikipedia.
In [18]:
trace1 = go.Scatter(
    x = df[df.pagecount_all_views != 0].date,
    y = df[df.pagecount_all_views != 0].pagecount_all_views,
    name = 'Total (Pagecount definition)',
    line = dict(color = 'black', dash = 'dash')
)
trace2 = go.Scatter(
    x = df[df.pagecount_desktop_views != 0].date,
    y = df[df.pagecount_desktop_views != 0].pagecount_desktop_views,
    name = 'Main site (Pagecount definition)',
    line = dict(color = 'green', dash = 'dash')
)
trace3 = go.Scatter(
    x = df[df.pagecount_mobile_views != 0].date,
    y = df[df.pagecount_mobile_views != 0].pagecount_mobile_views,
    name = 'Mobile (Pagecount definition)',
    line = dict(color = 'blue', dash = 'dash')
)
trace4 = go.Scatter(
    x = df[df.pageview_all_views != 0].date,
    y = df[df.pageview_all_views != 0].pageview_all_views,
    name = 'Total (Pageview definition)',
    line = dict(color = 'black')
)
trace5 = go.Scatter(
    x = df[df.pageview_desktop_views != 0].date,
    y = df[df.pageview_desktop_views != 0].pageview_desktop_views,
    name = 'Main site (Pageview definition)',
    line = dict(color = 'green')
)
trace6 = go.Scatter(
    x = df[df.pageview_mobile_views != 0].date,
    y = df[df.pageview_mobile_views != 0].pageview_mobile_views,
    name = 'Mobile (Pageview definition)',
    line = dict(color = 'blue')
)
data = [trace1, trace2, trace3, trace4, trace5, trace6]
layout = dict(title = 'Page Views on English Wikipedia',
              height = 600, width = 1000,
              xaxis = dict(title = ''),
              yaxis = dict(title = 'Page views', range = [0, 12000000000]),
              legend = dict(orientation = "h")
)
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='PageViewsonEnglishWikipedia')
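To keep a shareable copy of the interactive chart outside the notebook, the same figure can also be written to a standalone HTML file (a sketch; the output file name is illustrative):
plotly.offline.plot(fig, filename='en-wikipedia_traffic_200801-201709.html', auto_open=False)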