In [21]:
import pandas as pd
import requests
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

Step 1: Data acquisition

A function to get Pageviews Data & Pagecounts Data by using the API endpoint


In [22]:
def get_data(url, access, file_name):
    """
    Query a Wikimedia analytics API endpoint and save the JSON
    response to disk.

    The parameter dict is selected from the endpoint URL: the
    Pageviews API uses the key 'access', while the legacy
    Pagecounts API uses 'access-site'.

    Args:
        url (str): API endpoint URL template with {placeholders}
        access (str): value for the 'access'/'access-site' parameter
        file_name (str): path of the JSON file to write

    Raises:
        ValueError: if the URL is neither a pageviews nor a
            pagecounts endpoint.
        requests.HTTPError: if the API call returns an error status.
    """
    if 'pageviews' in url:
        # Pageviews data starts in July 2015; the start date now matches
        # the 201507-201709 range encoded in the output file names
        # (the previous '2008080100' predates the API's earliest data).
        params = {'project': 'en.wikipedia.org',
                  'access': access,
                  'agent': 'user',
                  'granularity': 'monthly',
                  'start': '2015070100',
                  'end': '2017100100'}
    elif 'pagecounts' in url:
        params = {'project': 'en.wikipedia.org',
                  'access-site': access,
                  'granularity': 'monthly',
                  'start': '2008010100',
                  'end': '2016080100'}
    else:
        # Previously an unrecognized URL left `params` unbound and the
        # function died with a confusing NameError; fail fast instead.
        raise ValueError('url must be a pageviews or pagecounts endpoint')

    api_call = requests.get(url.format(**params))
    # Surface HTTP failures clearly instead of a JSON decode error.
    api_call.raise_for_status()

    with open(file_name, 'w') as f:
        json.dump(api_call.json(), f)

1.1 Get the Pageviews Data, desktop


In [23]:
url = 'http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
file_name = 'pageviews_desktop_201507-201709.json'
get_data(url, 'desktop', file_name)

1.2 Get the Pageviews Data, mobile-web


In [24]:
url = 'http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
file_name = 'pageviews_mobile-web_201507-201709.json'
get_data(url, 'mobile-web', file_name)

1.3 Get the Pageviews Data, mobile-app


In [25]:
url = 'http://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'
file_name = 'pageviews_mobile-app_201507-201709.json'
get_data(url, 'mobile-app', file_name)

1.4 Get the Pagecounts Data, desktop-site


In [26]:
url = 'http://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
file_name = 'pagecounts_desktop-site_200801-201607.json'
get_data(url, 'desktop-site', file_name)

1.5 Get the Pagecounts Data, mobile-site


In [27]:
url = 'http://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
file_name = 'pagecounts_mobile-site_200801-201607.json'
get_data(url, 'mobile-site', file_name)

Step 2: Data processing

A function to convert a JSON file into a data frame


In [28]:
def convert_json_to_dataframe(file_name):
    """
    Load a saved Wikimedia API response and reshape it into a tidy
    data frame with one row per month.

    The traffic column ('views' or 'count') is renamed after the access
    type encoded in the file name, and the 'YYYYMMDDHH' timestamp is
    split into separate 'year' and 'month' string columns.

    Args:
        file_name (str): name of the JSON file to read; expected to look
            like '<api>_<access>_<range>.json'

    Returns:
        pandas.DataFrame: columns ['<api>_<access>_views', 'year', 'month']
    """
    with open(file_name) as json_file:
        json_data = json.load(json_file)

    col_name = file_name.split('_')[1]
    data_frame = pd.DataFrame(json_data['items'])

    # Select the needed columns by NAME. The previous positional drops
    # (columns[[0, 1, 2, 3]] etc.) relied on old pandas versions sorting
    # dict keys alphabetically; with modern insertion-ordered columns the
    # pagecounts branch would drop 'timestamp' and crash below.
    if 'pageviews' in file_name:
        data_frame = data_frame[['timestamp', 'views']].rename(
            columns={'views': 'pageview_' + col_name + '_views'})
    elif 'pagecounts' in file_name:
        data_frame = data_frame[['timestamp', 'count']].rename(
            columns={'count': 'pagecount_' + col_name + '_views'})

    # 'YYYYMMDDHH' -> 'YYYY' and 'MM' (kept as zero-padded strings).
    data_frame['year'] = data_frame['timestamp'].str[:4]
    data_frame['month'] = data_frame['timestamp'].str[4:6]

    return data_frame.drop('timestamp', axis=1)

Convert all JSON datasets to data frames by calling the function above.


In [29]:
mobile_web = convert_json_to_dataframe('pageviews_mobile-web_201507-201709.json')
mobile_app = convert_json_to_dataframe('pageviews_mobile-app_201507-201709.json')
desktop = convert_json_to_dataframe('pageviews_desktop_201507-201709.json')
desktop_site = convert_json_to_dataframe('pagecounts_desktop-site_200801-201607.json')
mobile_site = convert_json_to_dataframe('pagecounts_mobile-site_200801-201607.json')

Merge the monthly values for mobile-app and mobile-web by the attributes year and month for the pageview data sets. Add a new column called pageview_mobile_views by summing the monthly values for mobile-app and mobile-web. Drop the pageview_mobile-web_views and pageview_mobile-app_views columns afterward.


In [30]:
# Combine the mobile-web and mobile-app pageview series into a single
# monthly mobile total, then drop the two component columns by name.
component_cols = ['pageview_mobile-web_views', 'pageview_mobile-app_views']
total_mobile = mobile_web.merge(mobile_app, how='outer', on=['year', 'month'])
total_mobile['pageview_mobile_views'] = (total_mobile['pageview_mobile-web_views']
                                         + total_mobile['pageview_mobile-app_views'])
total_mobile = total_mobile.drop(columns=component_cols)
total_mobile.head()


Out[30]:
year month pageview_mobile_views
0 2015 07 3288755294
1 2015 08 3302333038
2 2015 09 3170203333
3 2015 10 3268499132
4 2015 11 3236601070

Merge the monthly values for mobile and desktop by the attributes year and month for the pageview dataset. Add a new column called pageview_all_views by summing pageview_mobile_views and pageview_desktop_views.


In [31]:
# Total pageviews per (year, month) = mobile + desktop.
pageview = total_mobile.merge(desktop, how='outer', on=['year', 'month'])
pageview = pageview.assign(
    pageview_all_views=pageview['pageview_mobile_views'] + pageview['pageview_desktop_views'])
pageview.head()


Out[31]:
year month pageview_mobile_views pageview_desktop_views pageview_all_views
0 2015 07 3288755294 4376666686 7665421980
1 2015 08 3302333038 4332482183 7634815221
2 2015 09 3170203333 4485491704 7655695037
3 2015 10 3268499132 4477532755 7746031887
4 2015 11 3236601070 4287720220 7524321290

Merge the monthly values for desktop-site and mobile-site by the attributes year and month. Replace all NaN values with 0. Add a new column called pagecount_all_views by summing pagecount_mobile_views and pagecount_desktop_views.


In [32]:
# Merge the two legacy pagecount series; months missing from either
# source (e.g. early years with no mobile counts) are zero-filled
# before computing the combined total.
pagecount = (desktop_site
             .merge(mobile_site, how='outer', on=['year', 'month'])
             .rename(columns={'pagecount_mobile-site_views': 'pagecount_mobile_views',
                              'pagecount_desktop-site_views': 'pagecount_desktop_views'})
             .fillna(0))
pagecount['pagecount_all_views'] = (pagecount['pagecount_mobile_views']
                                    + pagecount['pagecount_desktop_views'])
pagecount.head()


Out[32]:
pagecount_desktop_views year month pagecount_mobile_views pagecount_all_views
0 4930902570 2008 01 0.0 4.930903e+09
1 4818393763 2008 02 0.0 4.818394e+09
2 4955405809 2008 03 0.0 4.955406e+09
3 5159162183 2008 04 0.0 5.159162e+09
4 5584691092 2008 05 0.0 5.584691e+09

Merge the pagecount dataset and the pageview dataset by the attributes year and month. Replace all NaN values with 0 and reorder the columns.


In [33]:
# Join the pagecount and pageview eras, zero-fill months absent from
# either source, cast everything to int (note: this turns the string
# 'month' values like '07' into plain integers), and fix column order.
ordered_cols = ["year", "month",
                "pagecount_all_views", "pagecount_desktop_views", "pagecount_mobile_views",
                "pageview_all_views", "pageview_desktop_views", "pageview_mobile_views"]
combine = pd.merge(pagecount, pageview, how='outer', on=['year', 'month'])
combine = combine.fillna(0).astype(int)[ordered_cols]
combine.head()


Out[33]:
year month pagecount_all_views pagecount_desktop_views pagecount_mobile_views pageview_all_views pageview_desktop_views pageview_mobile_views
0 2008 1 4930902570 4930902570 0 0 0 0
1 2008 2 4818393763 4818393763 0 0 0 0
2 2008 3 4955405809 4955405809 0 0 0 0
3 2008 4 5159162183 5159162183 0 0 0 0
4 2008 5 5584691092 5584691092 0 0 0 0

Last step, output the data frame to a CSV file


In [34]:
# Persist the combined monthly traffic table; the index is dropped so
# the CSV contains exactly the eight selected data columns.
combine.to_csv('en-wikipedia_traffic_200801-201709.csv', encoding='utf-8', index=False)

Step 3: Analysis

Extract the x-axis and y-axis values from the final dataset. Then divide every pageview and pagecount value by 1,000,000 in order to scale the y-axis. Plot each line for the values from Jan 2008 to Sep 2017 and set the properties of the plot, such as the labels for the x and y axes, the title, the legend, etc. Finally, save the plot to the directory.


In [35]:
# Scale raw counts to millions so the y-axis stays readable.
scale = 1000000

# One month-end tick per row of `combine` (Jan 2008 .. Sep 2017).
x = pd.date_range(start='2008-01', end='2017-10', freq='M')

# (column, label, linestyle, color) for each series. The six previously
# copy-pasted plt.plot calls are replaced by one data-driven loop:
# pageview series are solid, legacy pagecount series dashed, with
# matching colors pairing up the desktop/mobile/total lines.
series = [
    ("pageview_desktop_views", "pageviews main site", "-", "black"),
    ("pageview_mobile_views", "pageviews mobile site", "-", "green"),
    ("pageview_all_views", "pageviews total", "-", "blue"),
    ("pagecount_desktop_views", "pagecounts main site", "--", "black"),
    ("pagecount_mobile_views", "pagecounts mobile site", "--", "green"),
    ("pagecount_all_views", "pagecounts total", "--", "blue"),
]

fig = plt.figure(figsize=(18, 12))
for column, label, line_style, color in series:
    plt.plot(x, combine[column] / scale, linestyle=line_style, label=label, color=color)

plt.legend(loc='upper left', prop={'size': 18})
plt.xlabel("Year", fontsize=24)
plt.xticks(fontsize=18)
plt.ylabel("Wikipedia Page Views (x 1,000,000)", fontsize=24)
plt.yticks(fontsize=18)
plt.title("Page views Traffic on English Wikipedia (x 1,000,000)", fontsize=24)

# save the plot
fig.savefig("en-wikipedia_traffic.png")



In [ ]: