In [21]:
import pandas as pd
import requests
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
In [22]:
def get_data(url, access, file_name, timeout=30):
    """
    Download monthly English Wikipedia traffic from a REST endpoint and
    save the raw JSON response to disk.

    The endpoint template is filled in with a fixed parameter set chosen by
    the dataset named in the url: templates containing 'pageviews' receive
    the 'access' and 'agent' fields, templates containing 'pagecounts'
    receive the 'access-site' field.

    Args:
        url (str): endpoint template containing ``{placeholder}`` fields
        access (str): value substituted for 'access' (pageviews) or
            'access-site' (pagecounts)
        file_name (str): path of the JSON file to write
        timeout (float, optional): seconds to wait for the server before
            giving up; defaults to 30

    Raises:
        ValueError: if the url names neither 'pageviews' nor 'pagecounts'
        requests.HTTPError: if the server answers with an error status
    """
    if 'pageviews' in url:
        params = {'project' : 'en.wikipedia.org',
                  'access' : access,
                  'agent' : 'user',
                  'granularity' : 'monthly',
                  'start' : '2008080100',
                  'end' : '2017100100'}
    elif 'pagecounts' in url:
        params = {'project' : 'en.wikipedia.org',
                  'access-site' : access,
                  'granularity' : 'monthly',
                  'start' : '2008010100',
                  'end' : '2016080100'}
    else:
        # Without this guard `params` would be unbound and the failure
        # would surface as an unrelated-looking UnboundLocalError.
        raise ValueError(
            "url must point to a 'pageviews' or 'pagecounts' endpoint: %r" % (url,))

    api_call = requests.get(url.format(**params), timeout=timeout)
    # Fail loudly on HTTP errors instead of saving an error payload that
    # would only break later, when the file is parsed.
    api_call.raise_for_status()
    response = api_call.json()

    with open(file_name, 'w') as f:
        json.dump(response, f)
In [23]:
# Fetch monthly desktop traffic from the pageviews endpoint and store the raw JSON.
file_name = 'pageviews_desktop_201507-201709.json'
url = 'http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
get_data(url=url, access='desktop', file_name=file_name)
In [24]:
# Fetch monthly mobile-web traffic from the pageviews endpoint and store the raw JSON.
file_name = 'pageviews_mobile-web_201507-201709.json'
url = 'http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
get_data(url=url, access='mobile-web', file_name=file_name)
In [25]:
# Fetch monthly mobile-app traffic from the pageviews endpoint and store the raw JSON.
file_name = 'pageviews_mobile-app_201507-201709.json'
url = 'http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
get_data(url=url, access='mobile-app', file_name=file_name)
In [26]:
# Fetch monthly desktop-site traffic from the legacy pagecounts endpoint and store the raw JSON.
file_name = 'pagecounts_desktop-site_200801-201607.json'
url = 'http://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
get_data(url=url, access='desktop-site', file_name=file_name)
In [27]:
# Fetch monthly mobile-site traffic from the legacy pagecounts endpoint and store the raw JSON.
file_name = 'pagecounts_mobile-site_200801-201607.json'
url = 'http://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
get_data(url=url, access='mobile-site', file_name=file_name)
In [28]:
def convert_json_to_dataframe(file_name):
    """
    Load a saved API response and reshape it into a monthly data frame.

    The base name of the file is split on '_' and its second piece is used
    as the access label in the value column's name. For a file whose name
    contains 'pageviews' the 'views' field is kept and renamed to
    'pageview_<label>_views'; for 'pagecounts' the 'count' field is kept
    and renamed to 'pagecount_<label>_views'. The 'timestamp' field is
    split into 'year' (first four characters) and 'month' (next two) and
    then removed.

    Args:
        file_name (str): path to a JSON file holding an 'items' list

    Returns:
        pandas.DataFrame: for pageviews/pagecounts files, the columns are
        the renamed value column, 'year' and 'month', in that order
    """
    import os  # local import: only needed to strip directories from the path

    with open(file_name) as json_file:
        json_data = json.load(json_file)

    # Use the base name so an underscore in a directory name cannot shift
    # which piece is taken as the access label.
    col_name = os.path.basename(file_name).split('_')[1]
    data_frame = pd.DataFrame(json_data['items'])

    if 'pageviews' in file_name:
        value_col = 'views'
        new_name = 'pageview_' + col_name + '_views'
    elif 'pagecounts' in file_name:
        value_col = 'count'
        new_name = 'pagecount_' + col_name + '_views'
    else:
        value_col = None

    if value_col is not None:
        # Select by name, not by position: the column order of a frame
        # built from dicts depends on the pandas version, so positional
        # drops can discard 'timestamp' instead of the metadata fields.
        data_frame = data_frame[[value_col, 'timestamp']].rename(
            columns={value_col: new_name})

    data_frame['year'] = data_frame['timestamp'].str[:4]
    data_frame['month'] = data_frame['timestamp'].str[4:6]
    return data_frame.drop('timestamp', axis=1)
In [29]:
# Turn every saved API response into its own data frame, in one pass.
(mobile_web,
 mobile_app,
 desktop,
 desktop_site,
 mobile_site) = [
    convert_json_to_dataframe(saved_file)
    for saved_file in (
        'pageviews_mobile-web_201507-201709.json',
        'pageviews_mobile-app_201507-201709.json',
        'pageviews_desktop_201507-201709.json',
        'pagecounts_desktop-site_200801-201607.json',
        'pagecounts_mobile-site_200801-201607.json',
    )
]
In [30]:
# Combine mobile-web and mobile-app pageviews into one mobile total per month.
total_mobile = pd.merge(mobile_web, mobile_app, how='outer', on=['year', 'month'])
# The outer merge leaves NaN where a month exists in only one source; treat
# that as zero so the total is not lost to NaN propagation.
total_mobile['pageview_mobile_views'] = total_mobile['pageview_mobile-web_views'].add(
    total_mobile['pageview_mobile-app_views'], fill_value=0)
# Drop the two inputs by name rather than by position, so the result does
# not depend on the column order of the merged frame.
total_mobile = total_mobile.drop(
    ['pageview_mobile-web_views', 'pageview_mobile-app_views'], axis=1)
total_mobile.head()
Out[30]:
In [31]:
# Join mobile and desktop pageviews and add the overall monthly total.
pageview = pd.merge(total_mobile, desktop, how='outer', on=['year', 'month'])
# A month missing from one side is NaN after the outer merge; count it as
# zero so the overall total still reflects the side that has data.
pageview['pageview_all_views'] = pageview['pageview_mobile_views'].add(
    pageview['pageview_desktop_views'], fill_value=0)
pageview.head()
Out[31]:
In [32]:
# Join the two legacy pagecount sources, shorten their column names and
# replace the gaps left by the outer merge with zero before totalling.
pagecount = (
    pd.merge(desktop_site, mobile_site, how='outer', on=['year', 'month'])
    .rename(columns={'pagecount_mobile-site_views': 'pagecount_mobile_views',
                     'pagecount_desktop-site_views': 'pagecount_desktop_views'})
    .fillna(0)
)
pagecount['pagecount_all_views'] = (
    pagecount['pagecount_mobile_views'] + pagecount['pagecount_desktop_views']
)
pagecount.head()
Out[32]:
In [33]:
# Merge legacy pagecounts with pageviews into one table, one row per month.
combine = pd.merge(pagecount, pageview, how='outer', on=['year', 'month'])
# Months covered by only one dataset are NaN after the merge; store them as 0.
# Cast to an explicit 64-bit integer: plain `int` can resolve to a 32-bit
# type on some platforms, which is too small for monthly totals above 2**31.
combine = combine.fillna(0).astype('int64')
combine = combine[["year", "month", "pagecount_all_views", "pagecount_desktop_views", "pagecount_mobile_views",
                   "pageview_all_views", "pageview_desktop_views", "pageview_mobile_views"]]
combine.head()
Out[33]:
In [34]:
# Persist the combined monthly table; the row index is not written.
combine.to_csv(
    'en-wikipedia_traffic_200801-201709.csv',
    index=False,
    encoding='utf-8',
)
In [35]:
scale = 1000000

# Build the x axis from the table's own year/month columns (month-end dates)
# instead of a hard-coded range, so it always matches the rows being plotted.
x = pd.DatetimeIndex(pd.to_datetime(
    combine["year"].astype(str) + "-" + combine["month"].astype(str).str.zfill(2),
    format="%Y-%m",
)) + pd.offsets.MonthEnd(0)

# Zeros in the table stand for "no data for this month"; mask them to NaN
# so the lines stop where the data stops instead of dropping to zero.
traffic = combine.where(combine != 0)
y_1 = traffic["pageview_desktop_views"] / scale
y_2 = traffic["pageview_mobile_views"] / scale
y_3 = traffic["pageview_all_views"] / scale
y_4 = traffic["pagecount_desktop_views"] / scale
y_5 = traffic["pagecount_mobile_views"] / scale
y_6 = traffic["pagecount_all_views"] / scale

fig, ax = plt.subplots(figsize=(18, 12))

# plot pageviews data (solid lines)
ax.plot(x, y_1, label="pageviews main site", color="black")
ax.plot(x, y_2, label="pageviews mobile site", color="green")
ax.plot(x, y_3, label="pageviews total", color="blue")

# plot pagecounts data (dashed lines, same colour per access type)
ax.plot(x, y_4, linestyle="--", label="pagecounts main site", color="black")
ax.plot(x, y_5, linestyle="--", label="pagecounts mobile site", color="green")
ax.plot(x, y_6, linestyle="--", label="pagecounts total", color="blue")

ax.legend(loc='upper left', prop={'size': 18})
ax.set_xlabel("Year", fontsize=24)
ax.set_ylabel("Wikipedia Page Views (x 1,000,000)", fontsize=24)
ax.tick_params(axis='both', labelsize=18)
ax.set_title("Page views Traffic on English Wikipedia (x 1,000,000)", fontsize=24)

# save the plot
fig.savefig("en-wikipedia_traffic.png")
In [ ]: