In this assignment, I analyzed traffic on English Wikipedia from 2008 to 2017 by building a visualization from Wikimedia's Analytics/AQS APIs. The documentation in this Jupyter notebook starts with API pulls to obtain the raw data, followed by a series of data-processing steps, and ends with a visualization of the Wikipedia traffic.
In [2]:
# import all python libraries needed in this analysis
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
from datetime import datetime
In [57]:
# Identify myself to the Wikimedia API (contact info per API etiquette).
headers = {
    'User-Agent': 'https://github.com/jasonfeiwang',
    'From': 'fwang16@uw.edu',
}
In [5]:
# Template URL for the legacy Pagecounts endpoint (monthly desktop/mobile counts).
endpoint = ('https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/'
            'aggregate/{project}/{access}/{granularity}/{start}/{end}')
In [6]:
# Pull monthly desktop pagecounts (legacy API) for English Wikipedia, 2008-01 .. 2016-07.
params = {'project': 'en.wikipedia.org',
          'access': 'desktop-site',
          'granularity': 'monthly',
          'start': '2008010100',
          # use the first day of the following month to ensure a full month of data is collected
          'end': '2016080100'
          }
# FIX: the identifying `headers` defined above were never sent with the request.
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
# save the retrieved data as a json file
with open('pagecounts_desktop-site_200801-201607.json', 'w') as outfile:
    json.dump(response, outfile)
In [7]:
# Pull monthly mobile pagecounts (legacy API) for English Wikipedia, 2008-01 .. 2016-07.
params = {'project': 'en.wikipedia.org',
          'access': 'mobile-site',
          'granularity': 'monthly',
          'start': '2008010100',
          # use the first day of the following month to ensure a full month of data is collected
          'end': '2016080100'
          }
# FIX: the identifying `headers` defined above were never sent with the request.
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
# save the retrieved data as a json file
with open('pagecounts_mobile-site_200801-201607.json', 'w') as outfile:
    json.dump(response, outfile)
In [8]:
# Template URL for the newer Pageviews endpoint (supports filtering out crawler traffic).
endpoint = ('https://wikimedia.org/api/rest_v1/metrics/pageviews/'
            'aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}')
In [9]:
# Pull monthly desktop pageviews (user traffic only, no crawlers), 2015-07 .. 2017-09.
params = {'project': 'en.wikipedia.org',
          'access': 'desktop',
          'agent': 'user',
          'granularity': 'monthly',
          'start': '2015070100',
          # use the first day of the following month to ensure a full month of data is collected
          'end': '2017100100'
          }
# FIX: the identifying `headers` defined above were never sent with the request.
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
# save the retrieved data as a json file
with open('pageviews_desktop_201507-201709.json', 'w') as outfile:
    json.dump(response, outfile)
In [10]:
# Pull monthly mobile-web pageviews (user traffic only), 2015-07 .. 2017-09.
params = {'project': 'en.wikipedia.org',
          'access': 'mobile-web',
          'agent': 'user',
          'granularity': 'monthly',
          'start': '2015070100',
          # use the first day of the following month to ensure a full month of data is collected
          'end': '2017100100'
          }
# FIX: the identifying `headers` defined above were never sent with the request.
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
# save the retrieved data as a json file
with open('pageviews_mobile-web_201507-201709.json', 'w') as outfile:
    json.dump(response, outfile)
In [11]:
# Pull monthly mobile-app pageviews (user traffic only), 2015-07 .. 2017-09.
params = {'project': 'en.wikipedia.org',
          'access': 'mobile-app',
          'agent': 'user',
          'granularity': 'monthly',
          'start': '2015070100',
          # use the first day of the following month to ensure a full month of data is collected
          'end': '2017100100'
          }
# FIX: the identifying `headers` defined above were never sent with the request.
api_call = requests.get(endpoint.format(**params), headers=headers)
response = api_call.json()
# save the retrieved data as a json file
with open('pageviews_mobile-app_201507-201709.json', 'w') as outfile:
    json.dump(response, outfile)
In [ ]:
#### sample code from "https://stackoverflow.com/questions/21104592/json-to-pandas-dataframe"
# Load the saved JSON and flatten its 'items' list into a DataFrame.
with open("pageviews_mobile-app_201507-201709.json", 'r') as f:
    json_content = json.load(f)
# FIX: pandas.io.json.json_normalize was deprecated in 0.25 and removed in 2.0;
# use the top-level pd.json_normalize instead.
df_pageviews_mobile_app = pd.json_normalize(json_content['items'])
In [3]:
# Load the saved mobile-web pageviews JSON and flatten 'items' into a DataFrame.
with open("pageviews_mobile-web_201507-201709.json", 'r') as f:
    json_content = json.load(f)
# FIX: use pd.json_normalize (pandas.io.json.json_normalize was removed in pandas 2.0)
df_pageviews_mobile_web = pd.json_normalize(json_content['items'])
In [4]:
# Load the saved desktop pageviews JSON and flatten 'items' into a DataFrame.
with open("pageviews_desktop_201507-201709.json", 'r') as f:
    json_content = json.load(f)
# FIX: use pd.json_normalize (pandas.io.json.json_normalize was removed in pandas 2.0)
df_pageviews_desktop = pd.json_normalize(json_content['items'])
In [5]:
# Load the saved legacy desktop pagecounts JSON and flatten 'items' into a DataFrame.
with open("pagecounts_desktop-site_200801-201607.json", 'r') as f:
    json_content = json.load(f)
# FIX: use pd.json_normalize (pandas.io.json.json_normalize was removed in pandas 2.0)
df_pagecounts_desktop = pd.json_normalize(json_content['items'])
In [6]:
# Load the saved legacy mobile pagecounts JSON and flatten 'items' into a DataFrame.
with open("pagecounts_mobile-site_200801-201607.json", 'r') as f:
    json_content = json.load(f)
# FIX: use pd.json_normalize (pandas.io.json.json_normalize was removed in pandas 2.0)
df_pagecounts_mobile = pd.json_normalize(json_content['items'])
In [7]:
# Combine the two mobile sources (app + web) into one mobile pageviews series:
# inner-join on timestamp, then sum the two views columns.
# FIX: .ix was deprecated in pandas 0.20 and removed in 1.0 -> use .loc;
# DataFrame.drop's positional axis argument is deprecated -> pass axis=1 explicitly.
df_pageviews_mobile = pd.merge(df_pageviews_mobile_app,
                               df_pageviews_mobile_web.loc[:, ['timestamp', 'views']],
                               on='timestamp', how='inner')
# views_x = app views, views_y = web views (suffixes added by the merge)
df_pageviews_mobile['views'] = df_pageviews_mobile.views_x + df_pageviews_mobile.views_y
df_pageviews_mobile = df_pageviews_mobile.drop(['views_x', 'views_y'], axis=1)
In [8]:
# Split the YYYYMMDDHH timestamp string into year and month columns.
df_pageviews_mobile['year'] = df_pageviews_mobile['timestamp'].str[:4]
df_pageviews_mobile['month'] = df_pageviews_mobile['timestamp'].str[4:6]
In [9]:
# Split the YYYYMMDDHH timestamp string into year and month columns.
df_pageviews_desktop['year'] = df_pageviews_desktop['timestamp'].str[:4]
df_pageviews_desktop['month'] = df_pageviews_desktop['timestamp'].str[4:6]
In [10]:
# Split the YYYYMMDDHH timestamp string into year and month columns.
df_pagecounts_mobile['year'] = df_pagecounts_mobile['timestamp'].str[:4]
df_pagecounts_mobile['month'] = df_pagecounts_mobile['timestamp'].str[4:6]
In [11]:
# Split the YYYYMMDDHH timestamp string into year and month columns.
df_pagecounts_desktop['year'] = df_pagecounts_desktop['timestamp'].str[:4]
df_pagecounts_desktop['month'] = df_pagecounts_desktop['timestamp'].str[4:6]
In [12]:
# Join mobile and desktop pageview series on (year, month); the outer join keeps
# months that appear in only one of the two sources.
# FIX: .ix was removed in pandas 1.0 -> use .loc for label-based column selection.
df_pageviews = pd.merge(df_pageviews_mobile.loc[:, ['year', 'month', 'views']],
                        df_pageviews_desktop.loc[:, ['year', 'month', 'views']],
                        on=['year', 'month'], how='outer')
# views_x came from the mobile frame (left), views_y from desktop (right)
df_pageviews.columns = ['year', 'month', 'pageview_mobile_views', 'pageview_desktop_views']
In [14]:
# Join mobile and desktop legacy pagecount series on (year, month).
# FIX: .ix was removed in pandas 1.0 -> use .loc for label-based column selection.
df_pagecounts = pd.merge(df_pagecounts_mobile.loc[:, ['year', 'month', 'count']],
                         df_pagecounts_desktop.loc[:, ['year', 'month', 'count']],
                         on=['year', 'month'], how='outer')
# count_x came from the mobile frame (left), count_y from desktop (right)
df_pagecounts.columns = ['year', 'month', 'pagecount_mobile_views', 'pagecount_desktop_views']
In [16]:
# Combine the new pageview data with the legacy pagecount data into one table;
# an outer join preserves months covered by only one of the two APIs.
df = df_pageviews.merge(df_pagecounts, on=['year', 'month'], how='outer')
In [17]:
# Months covered by only one API get 0 for the missing series.
df = df.fillna(value=0)
# Per-month totals (desktop + mobile) for each measurement era.
df['pagecount_all_views'] = df['pagecount_mobile_views'] + df['pagecount_desktop_views']
df['pageview_all_views'] = df['pageview_mobile_views'] + df['pageview_desktop_views']
# Fix the final column order and sort chronologically.
output_columns = ['year', 'month',
                  'pagecount_all_views', 'pagecount_desktop_views', 'pagecount_mobile_views',
                  'pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views']
df = df[output_columns]
df = df.sort_values(by=['year', 'month'], ascending=[True, True])
In [20]:
# Persist the combined table (tab-separated) for the visualization step.
df.to_csv(path_or_buf="en-wikipedia_traffic_200801-201709.csv", sep='\t')
In [3]:
# Reload the combined table; the saved index comes back as an extra
# 'Unnamed: 0' column, which we discard.
df = pd.read_csv("en-wikipedia_traffic_200801-201709.csv", sep='\t')
df = df.drop(columns=['Unnamed: 0'])
In [4]:
# Build a proper datetime column (first day of each month) for plotting.
df['day'] = '01'
# FIX: .ix was removed in pandas 1.0 -> use .loc; to_datetime assembles a date
# from the year/month/day component columns.
df['time'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']])
In [5]:
#### code from https://stackoverflow.com/questions/18697417/not-plotting-zero-in-matplotlib-or-change-zero-to-none-python
def zero_to_nan(values):
    """Return a copy of *values* with every 0 replaced by float('nan')."""
    cleaned = []
    for v in values:
        cleaned.append(float('nan') if v == 0 else v)
    return cleaned
# Replace the filled-in zeros with NaN so matplotlib leaves gaps in the plot
# for months with no data instead of drawing a drop to zero.
traffic_columns = ['pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views',
                   'pagecount_all_views', 'pagecount_desktop_views', 'pagecount_mobile_views']
for column in traffic_columns:
    df[column] = zero_to_nan(df[column])
In [18]:
# NOTE(review): pyplot is deleted here and re-imported in the next cell --
# presumably a notebook trick to reset plotting state before re-running the
# figure cell; plt.rcdefaults() would be a cleaner way to reset rcParams.
del plt
In [19]:
import matplotlib.pyplot as plt
In [21]:
# set figure size
plt.rcParams["figure.figsize"] = [20, 8]
# Plot 6 series: 3 dashed lines for the legacy pagecount data (old definition,
# includes crawlers) and 3 solid lines for the new pageview data (users only).
# Values are scaled to millions. Unused `lineN` variables and dead
# commented-out rcParams tweaks were removed.
fig, ax = plt.subplots()
ax.plot(df.time, df.pagecount_desktop_views / 1000000, '--', linewidth=2,
        label='_nolegend_', color='green')
ax.plot(df.time, df.pagecount_mobile_views / 1000000, '--', linewidth=2,
        label='_nolegend_', color='blue')
ax.plot(df.time, df.pagecount_all_views / 1000000, '--', linewidth=2,
        label='_nolegend_', color='black')
ax.plot(df.time, df.pageview_desktop_views / 1000000, '-', linewidth=2,
        label='main site', color='green')
ax.plot(df.time, df.pageview_mobile_views / 1000000, '-', linewidth=2,
        label='mobile site', color='blue')
ax.plot(df.time, df.pageview_all_views / 1000000, '-', linewidth=2,
        label='total', color='black')
# add vertical lines to emphasize the beginning of each year
# (every 12th row; assumes the series starts in January -- here it starts 2008-01)
xcoords = df.time.iloc[::12]
for xc in xcoords:
    plt.axvline(x=xc, color='black', linewidth=1)
# set the range of the x and y axes
ax.set_xlim([df.time.iloc[0], df.time.iloc[-1]])
ax.set_ylim([0, 12000])
# add title and footnote (typo fixed: "defition" -> "definition")
ax.set_title("Page Views on English Wikipedia (x 1000,000)", size=20)
ax.text(0.1, -0.15, 'May 2015: a new pageview definition took effect, which eliminated all crawler traffic. Dashed lines mark old definition.',
        verticalalignment='bottom', horizontalalignment='left',
        transform=ax.transAxes,
        color='red', fontsize=15)
# specify legend position and font size
ax.legend(loc='upper left', prop={'size': 15})
# specify x and y tick label font size
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.grid(True)
plt.show()