In [11]:
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import matplotlib.dates as mdates
import datetime
The provided sample API request is used to load data from the Pageviews and Pagecounts APIs. Data is collected from both APIs, and the raw results are saved into five JSON files.
In [2]:
# Legacy Pagecounts API: monthly desktop-site traffic, Jan 2008 - Jul 2016.
endpoint1 = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access}/{granularity}/{start}/{end}'
headers = {'User-Agent': 'https://github.com/jingyany', 'From': 'jingyany@uw.edu'}
params1 = {'project' : 'en.wikipedia.org',
'access' : 'desktop-site',
'granularity' : 'monthly',
'start' : '2008010100',
'end' : '2016080100'#use the first day of the following month to ensure a full month of data is collected
}
# Bug fix: `headers` was constructed but never sent with the request;
# pass it so the call identifies its source per Wikimedia API etiquette.
api_call1 = requests.get(endpoint1.format(**params1), headers=headers)
response1 = api_call1.json()
print(response1)
In [3]:
# Persist the raw desktop pagecount response to disk; the context manager
# guarantees the file is closed even if serialization fails.
with open("pagecounts_desktop-site_200801-201607.json", "w") as f:
    json.dump(response1, f)
In [4]:
# Legacy Pagecounts API: monthly mobile-site traffic, Jan 2008 - Jul 2016.
endpoint2 = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access}/{granularity}/{start}/{end}'
headers = {'User-Agent': 'https://github.com/jingyany', 'From': 'jingyany@uw.edu'}
params2 = {'project' : 'en.wikipedia.org',
'access' : 'mobile-site',
'granularity' : 'monthly',
'start' : '2008010100',
'end' : '2016080100'#use the first day of the following month to ensure a full month of data is collected
}
# Bug fix: pass the `headers` dict, which was previously built but unused.
api_call2 = requests.get(endpoint2.format(**params2), headers=headers)
response2 = api_call2.json()
print(response2)
In [6]:
# Persist the raw mobile pagecount response to disk.
with open("pagecounts_mobile-site_200801-201607.json", "w") as f:
    json.dump(response2, f)
In [13]:
# Pageviews API: monthly desktop traffic by real users (agent=user filters
# out crawler/spider traffic), Jul 2015 - Sep 2017.
endpoint3 = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
headers = {'User-Agent': 'https://github.com/jingyany', 'From': 'jingyany@uw.edu'}
params3 = {'project' : 'en.wikipedia.org',
'access' : 'desktop',
'agent' : 'user',
'granularity' : 'monthly',
'start' : '2015070100',
'end' : '2017100100'#use the first day of the following month to ensure a full month of data is collected
}
# Bug fix: pass the `headers` dict, which was previously built but unused.
api_call3 = requests.get(endpoint3.format(**params3), headers=headers)
response3 = api_call3.json()
print(response3)
In [14]:
# Persist the raw desktop pageview response to disk.
with open("pageviews_desktop_201507-201709.json", "w") as f:
    json.dump(response3, f)
In [15]:
# Pageviews API: monthly mobile-web traffic by real users, Jul 2015 - Sep 2017.
endpoint4 = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
headers = {'User-Agent': 'https://github.com/jingyany', 'From': 'jingyany@uw.edu'}
params4 = {'project' : 'en.wikipedia.org',
'access' : 'mobile-web',
'agent' : 'user',
'granularity' : 'monthly',
'start' : '2015070100',
'end' : '2017100100'#use the first day of the following month to ensure a full month of data is collected
}
# Bug fix: pass the `headers` dict, which was previously built but unused.
api_call4 = requests.get(endpoint4.format(**params4), headers=headers)
response4 = api_call4.json()
print(response4)
In [16]:
# Persist the raw mobile-web pageview response to disk.
with open("pageviews_mobile-web_201507-201709.json", "w") as f:
    json.dump(response4, f)
In [17]:
# Pageviews API: monthly mobile-app traffic by real users, Jul 2015 - Sep 2017.
endpoint5 = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
headers = {'User-Agent': 'https://github.com/jingyany', 'From': 'jingyany@uw.edu'}
params5 = {'project' : 'en.wikipedia.org',
'access' : 'mobile-app',
'agent' : 'user',
'granularity' : 'monthly',
'start' : '2015070100',
'end' : '2017100100'#use the first day of the following month to ensure a full month of data is collected
}
# Bug fix: pass the `headers` dict, which was previously built but unused.
api_call5 = requests.get(endpoint5.format(**params5), headers=headers)
response5 = api_call5.json()
print(response5)
In [18]:
# Persist the raw mobile-app pageview response to disk.
with open("pageviews_mobile-app_201507-201709.json", "w") as f:
    json.dump(response5, f)
In [13]:
# Reload the raw mobile-web pageview JSON saved in the acquisition step.
mw_path = 'pageviews_mobile-web_201507-201709.json'
pv_mw = pd.read_json(mw_path, orient='columns')
pv_mw.head()
Out[13]:
In [14]:
# Flatten the nested 'items' records (one dict per month) into a
# DataFrame via a JSON round-trip, then transpose so rows are months.
items_json = pv_mw['items'].to_json()
pv_mw = pd.read_json(items_json).T
pv_mw.head()
Out[14]:
In [15]:
# Reload the raw mobile-app pageview JSON saved in the acquisition step.
ma_path = 'pageviews_mobile-app_201507-201709.json'
pv_ma = pd.read_json(ma_path, orient='columns')
pv_ma.head()
Out[15]:
In [16]:
# Flatten the nested 'items' records into a months-as-rows DataFrame.
items_json = pv_ma['items'].to_json()
pv_ma = pd.read_json(items_json).T
pv_ma.head()
Out[16]:
In [17]:
# Fold mobile-app views into pv_mw so its 'views' column holds the
# combined mobile total.
# NOTE(review): this overwrites pv_mw['views'] in place, so re-running
# this cell alone would double-count the app views — re-run from the
# load cells above instead.
pv_mw['views'] = pv_mw['views'].add(pv_ma['views'])
pv_mw.head()
Out[17]:
In [18]:
# Reload the desktop pageview JSON and flatten its nested 'items' records.
d_path = 'pageviews_desktop_201507-201709.json'
pv_d = pd.read_json(d_path, orient='columns')
pv_d = pd.read_json(pv_d['items'].to_json()).T
pv_d.head()
Out[18]:
In [19]:
# Assemble the pageview table: one row per month with total, desktop,
# and mobile (web + app, folded into pv_mw above) view counts.
pv = pd.DataFrame({
    'timestamp': pv_ma['timestamp'],
    'pageview_all_views': pv_mw['views'] + pv_d['views'],
    'pageview_desktop_views': pv_d['views'],
    'pageview_mobile_views': pv_mw['views'],
})
pv.head()
Out[19]:
Third, the desktop and mobile pagecounts are loaded from their JSON files. Since the time ranges of the desktop and mobile pagecounts differ, I use pandas' merge function to outer-join the two dataframes. Before merging the pagecount dataframe with the pageviews dataframe, all cells with NaN values in the pagecount dataframe should be filled with 0.
In [20]:
# Reload the desktop pagecount JSON and flatten its nested 'items' records.
pc_d_path = 'pagecounts_desktop-site_200801-201607.json'
pc_d = pd.read_json(pc_d_path, orient='columns')
pc_d = pd.read_json(pc_d['items'].to_json()).T
pc_d.head()
Out[20]:
In [21]:
# Reload the mobile pagecount JSON and flatten its nested 'items' records.
pc_m_path = 'pagecounts_mobile-site_200801-201607.json'
pc_m = pd.read_json(pc_m_path, orient='columns')
pc_m = pd.read_json(pc_m['items'].to_json()).T
pc_m.head()
Out[21]:
In [22]:
# Outer-join mobile and desktop pagecounts on timestamp: the two series
# cover different date ranges, so neither side can be dropped.
pc_temp = pc_m.merge(pc_d, how='outer', on='timestamp')
pc_temp.head()
Out[22]:
In [23]:
# Months present in only one series got NaN from the outer join; treat
# missing traffic as zero before summing the two count columns.
pc_temp = pc_temp.fillna(0)
pc_temp.head()
Out[23]:
In [24]:
# After the merge, count_x comes from the left frame (pc_m, mobile) and
# count_y from the right frame (pc_d, desktop); map them into the
# report's pagecount column scheme.
pc = pd.DataFrame({
    'timestamp': pc_temp['timestamp'],
    'pagecount_all_views': pc_temp['count_y'] + pc_temp['count_x'],
    'pagecount_desktop_views': pc_temp['count_y'],
    'pagecount_mobile_views': pc_temp['count_x'],
})
pc.head()
Out[24]:
In [25]:
# Redundant preview of the pagecount table (already displayed above).
pc.head()
Out[25]:
In [26]:
# Outer-join pagecounts and pageviews; months outside either API's range
# get NaN, which is zeroed for the final report.
all_temp = pc.merge(pv, how='outer', on='timestamp')
all_temp = all_temp.fillna(0)
all_temp.head()
Out[26]:
In [27]:
# Timestamps are YYYYMMDDHH strings (e.g. 2008010100). Bug fix: the
# original format '%Y%m%d%S' mislabelled the trailing hour field as
# seconds — it only parsed correctly because the hour is always 00.
all_temp['timestamp'] = pd.to_datetime(all_temp['timestamp'], format = '%Y%m%d%H')
all_temp['year'] = pd.DatetimeIndex(all_temp['timestamp']).year
all_temp['month'] = pd.DatetimeIndex(all_temp['timestamp']).month
all_temp.head()
Out[27]:
In [28]:
# Shape the merged table into the deliverable's column layout; months
# are rendered as zero-padded two-digit strings (e.g. '03').
final = pd.DataFrame({
    'year': all_temp['year'],
    'month': all_temp['month'].map("{:02}".format),
    'pagecount_all_views': all_temp['pagecount_all_views'],
    'pagecount_desktop_views': all_temp['pagecount_desktop_views'],
    'pagecount_mobile_views': all_temp['pagecount_mobile_views'],
    'pageview_all_views': all_temp['pageview_all_views'],
    'pageview_desktop_views': all_temp['pageview_desktop_views'],
    'pageview_mobile_views': all_temp['pageview_mobile_views'],
})
In [ ]:
# Bug fix: DataFrame.sort() was deprecated in pandas 0.17 and removed in
# 0.20; sort_values is the supported replacement with the same semantics.
final = final.sort_values(by=['year', 'month'], ascending=[True, True])
final.head()
In [30]:
# index=False keeps the meaningless row index out of the CSV, so the
# reload in the plotting step doesn't pick up an 'Unnamed: 0' column.
final.to_csv('en-wikipedia_traffic_200801-201709.csv', index=False)
Load data from the CSV file generated in step 2. Plot six variables — mobile pagecount, desktop pagecount, all pagecount, mobile pageview, desktop pageview, and all pageview — with a distinct color and legend entry for each.
In [45]:
# Reload the final table for plotting. Zeros mark months with no data
# from that API; converting them to NaN makes matplotlib leave gaps
# instead of drawing dips to zero.
df = pd.read_csv('en-wikipedia_traffic_200801-201709.csv')
df = df.replace(0, np.nan)
df['year'] = df['year'].astype(str)
df['month'] = df['month'].apply("{:02}".format)
df.head()
Out[45]:
In [111]:
# Rebuild a datetime axis from the year/month string columns.
date = pd.to_datetime(df.year + df.month, format = '%Y%m')
years = mdates.YearLocator()
months = mdates.MonthLocator()
fmt = mdates.DateFormatter('%Y')
fig, ax = plt.subplots()
# Bug fix: the lines carried no `label=`, so plt.legend() below rendered
# an empty legend box ("No handles with labels found" warning).
# Solid lines = legacy pagecounts, dashed lines = pageviews.
ax.plot(date, df['pagecount_mobile_views'], color = 'blue', alpha=0.7, label='pagecount mobile')
ax.plot(date, df['pagecount_desktop_views'], color = 'green', alpha=0.7, label='pagecount desktop')
ax.plot(date, df['pagecount_all_views'], color = 'purple', alpha=0.7, label='pagecount all')
ax.plot(date, df['pageview_all_views'], color = 'red', linestyle = '--', alpha=0.7, label='pageview all')
ax.plot(date, df['pageview_mobile_views'], color = 'blue', linestyle = '--', alpha=0.7, label='pageview mobile')
ax.plot(date, df['pageview_desktop_views'], color = 'purple', linestyle = '--', alpha=0.7, label='pageview desktop')
# Set the ticks' format: year labels on major ticks, minor ticks per month.
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(fmt)
ax.xaxis.set_minor_locator(months)
# Pad the x-axis out to whole calendar years on both ends.
datemin = datetime.date(date.min().year, 1, 1)
datemax = datetime.date(date.max().year + 1, 1, 1)
ax.set_xlim(datemin, datemax)
fig.autofmt_xdate()
# Set the plot's format
fig.set_size_inches(16, 11)
plt.legend(loc=4, bbox_to_anchor=(0.25, 0.75), prop={'size': 12})
fig.suptitle('Page Views on English Wikipedia', fontsize=25, x = 0.5, y = 0.92)
ax.xaxis.label.set_size(15)
ax.yaxis.label.set_size(15)
plt.show()
In [112]:
# Export the figure rendered above to a PNG for the repository.
fig.savefig('page-views-english-wikipedia.png')