In [21]:
    
import pandas as pd
import requests
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
    
In [22]:
    
def get_data(url, access, file_name):
    """
    Download monthly English Wikipedia traffic from the Wikimedia REST API
    and save the JSON response to a file.

    The dataset is detected from the URL: a URL containing 'pageviews' is
    filled in with the Pageviews parameters (key 'access', agent 'user'),
    a URL containing 'pagecounts' with the legacy Pagecounts parameters
    (key 'access-site').

    Args:
        url (str): endpoint template with str.format placeholders
            (e.g. '{project}', '{granularity}', '{start}', '{end}')
        access (str): value substituted for '{access}' (pageviews) or
            '{access-site}' (pagecounts)
        file_name (str): path of the JSON file to write

    Raises:
        ValueError: if url contains neither 'pageviews' nor 'pagecounts'
        requests.HTTPError: if the API answers with an error status code
    """
    if 'pageviews' in url:
        params = {'project' : 'en.wikipedia.org',
                  'access' : access,
                  'agent' : 'user',
                  'granularity' : 'monthly',
                  'start' : '2008080100',
                  'end' : '2017100100'}
    elif 'pagecounts' in url:
        params = {'project' : 'en.wikipedia.org',
                  'access-site' : access,
                  'granularity' : 'monthly',
                  'start' : '2008010100',
                  'end' : '2016080100'}
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `params` further down.
        raise ValueError(
            "url must contain 'pageviews' or 'pagecounts', got: " + repr(url))

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    api_call = requests.get(url.format(**params), timeout=30)
    # Do not save an API error payload as if it were traffic data.
    api_call.raise_for_status()
    response = api_call.json()

    with open(file_name, 'w') as f:
        json.dump(response, f)
    
In [23]:
    
# Pageviews API, 'desktop' access: save the raw response as JSON.
file_name = 'pageviews_desktop_201507-201709.json'
url = ('http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
       '{project}/{access}/{agent}/{granularity}/{start}/{end}')
get_data(url=url, access='desktop', file_name=file_name)
    
In [24]:
    
# Pageviews API, 'mobile-web' access: save the raw response as JSON.
file_name = 'pageviews_mobile-web_201507-201709.json'
url = ('http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
       '{project}/{access}/{agent}/{granularity}/{start}/{end}')
get_data(url=url, access='mobile-web', file_name=file_name)
    
In [25]:
    
# Pageviews API, 'mobile-app' access: save the raw response as JSON.
file_name = 'pageviews_mobile-app_201507-201709.json'
url = ('http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
       '{project}/{access}/{agent}/{granularity}/{start}/{end}')
get_data(url=url, access='mobile-app', file_name=file_name)
    
In [26]:
    
# Legacy Pagecounts API, 'desktop-site': save the raw response as JSON.
file_name = 'pagecounts_desktop-site_200801-201607.json'
url = ('http://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
       '{project}/{access-site}/{granularity}/{start}/{end}')
get_data(url=url, access='desktop-site', file_name=file_name)
    
In [27]:
    
# Legacy Pagecounts API, 'mobile-site': save the raw response as JSON.
file_name = 'pagecounts_mobile-site_200801-201607.json'
url = ('http://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
       '{project}/{access-site}/{granularity}/{start}/{end}')
get_data(url=url, access='mobile-site', file_name=file_name)
    
In [28]:
    
def convert_json_to_dataframe(file_name):
    """
    Load a saved API response and reshape it into a monthly traffic table.

    The records under the 'items' key of the JSON file become the rows.
    For a file name containing 'pageviews' the 'views' column is kept and
    renamed to 'pageview_<access>_views'; for a file name containing
    'pagecounts' the 'count' column is kept and renamed to
    'pagecount_<access>_views'. <access> is the second '_'-separated part
    of file_name. The 'timestamp' string is split into 'year' (first four
    characters) and 'month' (next two) and then removed.

    Args:
        file_name (str): path of the JSON file to read

    Returns:
        pandas.DataFrame: for pageviews/pagecounts files, the columns
        [<renamed views column>, 'year', 'month']; for any other file name
        all original columns except 'timestamp', plus 'year' and 'month'
    """
    with open(file_name) as json_file:
        json_data = json.load(json_file)

    col_name = file_name.split('_')[1]
    data_frame = pd.DataFrame(json_data['items'])

    # Pick the columns by name, not by position: the column order of a frame
    # built from a list of dicts differs between pandas versions (sorted
    # alphabetically in old releases, insertion order in newer ones).
    if 'pageviews' in file_name:
        data_frame = data_frame[['views', 'timestamp']].rename(
            columns={'views': 'pageview_' + col_name + '_views'})
    elif 'pagecounts' in file_name:
        data_frame = data_frame[['count', 'timestamp']].rename(
            columns={'count': 'pagecount_' + col_name + '_views'})

    # Timestamps are strings; the leading characters are year and month.
    data_frame['year'] = data_frame['timestamp'].str[:4]
    data_frame['month'] = data_frame['timestamp'].str[4:6]
    return data_frame.drop(columns='timestamp')
    
In [29]:
    
# Turn each saved API response into a DataFrame (same order as the names).
mobile_web, mobile_app, desktop, desktop_site, mobile_site = map(
    convert_json_to_dataframe,
    [
        'pageviews_mobile-web_201507-201709.json',
        'pageviews_mobile-app_201507-201709.json',
        'pageviews_desktop_201507-201709.json',
        'pagecounts_desktop-site_200801-201607.json',
        'pagecounts_mobile-site_200801-201607.json',
    ],
)
    
In [30]:
    
# Combine mobile-web and mobile-app pageviews into one mobile total per month.
total_mobile = pd.merge(mobile_web, mobile_app, how='outer', on=['year', 'month'])
# The outer join leaves NaN for a month present in only one source;
# fill_value=0 treats the missing side as zero instead of making the
# whole total NaN.
total_mobile['pageview_mobile_views'] = total_mobile['pageview_mobile-web_views'].add(
    total_mobile['pageview_mobile-app_views'], fill_value=0)
# Drop the per-source columns by name so the result does not depend on
# column position.
total_mobile = total_mobile.drop(
    columns=['pageview_mobile-web_views', 'pageview_mobile-app_views'])
total_mobile.head()
    
    Out[30]:
In [31]:
    
# Join mobile and desktop pageviews month by month and add the overall total.
pageview = pd.merge(total_mobile, desktop, how='outer', on=['year', 'month'])
# fill_value=0 keeps the total meaningful for a month that exists in only
# one of the two inputs (the outer join puts NaN on the missing side).
pageview['pageview_all_views'] = pageview['pageview_mobile_views'].add(
    pageview['pageview_desktop_views'], fill_value=0)
pageview.head()
    
    Out[31]:
In [32]:
    
# Legacy pagecounts: join desktop-site and mobile-site counts per month,
# shorten the column names and replace missing values with 0.
pagecount = (
    pd.merge(desktop_site, mobile_site, on=['year', 'month'], how='outer')
    .rename(columns={
        'pagecount_mobile-site_views': 'pagecount_mobile_views',
        'pagecount_desktop-site_views': 'pagecount_desktop_views',
    })
    .fillna(0)
)
# Total = mobile + desktop (both already NaN-free after the fill above).
pagecount['pagecount_all_views'] = (
    pagecount['pagecount_mobile_views'] + pagecount['pagecount_desktop_views']
)
pagecount.head()
    
    Out[32]:
In [33]:
    
# Final table: pagecounts and pageviews side by side, one row per month.
# Missing values become 0 and every column is cast to int before the
# columns are put into their final order.
combine = (
    pd.merge(pagecount, pageview, on=['year', 'month'], how='outer')
    .fillna(0)
    .astype(int)
    [[
        "year",
        "month",
        "pagecount_all_views",
        "pagecount_desktop_views",
        "pagecount_mobile_views",
        "pageview_all_views",
        "pageview_desktop_views",
        "pageview_mobile_views",
    ]]
)
combine.head()
    
    Out[33]:
In [34]:
    
# Write the combined monthly table to CSV (UTF-8, without the index column).
combine.to_csv(
    path_or_buf='en-wikipedia_traffic_200801-201709.csv',
    index=False,
    encoding='utf-8',
)
    
In [35]:
    
scale = 1000000

# Build the x axis from the table itself so it always has exactly one point
# per row (a hard-coded date range only matches the data by coincidence).
x = pd.to_datetime(
    combine["year"].astype(str) + "-" + combine["month"].astype(str).str.zfill(2),
    format="%Y-%m",
)

# Missing months were filled with 0 in `combine`; mask them so the lines
# stop where the data stops instead of dropping to zero traffic.
view_columns = [name for name in combine.columns if name.endswith("_views")]
in_millions = combine[view_columns].mask(combine[view_columns] == 0) / scale

y_1 = in_millions["pageview_desktop_views"]
y_2 = in_millions["pageview_mobile_views"]
y_3 = in_millions["pageview_all_views"]
y_4 = in_millions["pagecount_desktop_views"]
y_5 = in_millions["pagecount_mobile_views"]
y_6 = in_millions["pagecount_all_views"]

fig, ax = plt.subplots(figsize=(18, 12))
# plot pageviews data (solid lines)
ax.plot(x, y_1, label="pageviews main site", color="black")
ax.plot(x, y_2, label="pageviews mobile site", color="green")
ax.plot(x, y_3, label="pageviews total", color="blue")
# plot pagecounts data (dashed lines, same colours as the matching pageviews)
ax.plot(x, y_4, linestyle="--", label="pagecounts main site", color="black")
ax.plot(x, y_5, linestyle="--", label="pagecounts mobile site", color="green")
ax.plot(x, y_6, linestyle="--", label="pagecounts total", color="blue")
ax.legend(loc='upper left', prop={'size': 18})
ax.set_xlabel("Year", fontsize=24)
ax.set_ylabel("Wikipedia Page Views (x 1,000,000)", fontsize=24)
ax.tick_params(axis="both", labelsize=18)
ax.set_title("Page views Traffic on English Wikipedia (x 1,000,000)", fontsize=24)
# save the plot
fig.savefig("en-wikipedia_traffic.png")
    
    
In [ ]: