English Wikipedia page views, 2008 - 2017

Overview

References: Note that the Legacy Pagecounts API has a slightly different schema than the Pageviews API shown here. This sample API request would get you all pageviews by web crawlers on the mobile website for English Wikipedia during the month of September 2017.

This step creates the initial variables that will be required for each REST API call

Please replace the path 'C:/AN/HCDS/*' in the code below with the corresponding path on your local machine.


In [78]:
import requests
import json

# URL template for the Wikimedia Pageviews REST API. The {placeholders} are
# filled in for each request with endpoint.format(**params).
endpoint = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

# HTTP headers sent with every API request.
headers = {
    'User-Agent': 'https://github.com/amitabhnag',
    'From': 'amnag@uw.edu',
}

In [79]:
#DataAcquisition
# Monthly English Wikipedia pageviews by users on the mobile website.
params = {
    'project': 'en.wikipedia.org',
    'access': 'mobile-web',
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2017100100',
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=30)
# Stop on an HTTP error status instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)

# Keep the raw response on disk so later steps can run without calling the API again.
with open('C:/AN/HCDS/pageviews_mobile-web_201507-201709.json', 'w') as f:
    json.dump(response, f)


{'items': [{'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015070100', 'views': 3179131148}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015080100', 'views': 3192663889}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015090100', 'views': 3073981649}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015100100', 'views': 3173975355}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015110100', 'views': 3142247145}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015120100', 'views': 3276836351}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016010100', 'views': 3611404079}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016020100', 'views': 3242448142}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016030100', 'views': 3288785117}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016040100', 'views': 3177044999}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016050100', 'views': 3296294723}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016060100', 'views': 3257882479}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016070100', 'views': 3395175122}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': 
'2016080100', 'views': 3418646794}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016090100', 'views': 3310247842}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016100100', 'views': 3442109005}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016110100', 'views': 3507421156}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016120100', 'views': 3647567822}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017010100', 'views': 4020148351}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017020100', 'views': 3522702265}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017030100', 'views': 3719395296}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017040100', 'views': 3524571150}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017050100', 'views': 3567882051}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017060100', 'views': 3404097346}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017070100', 'views': 3600941034}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017080100', 'views': 3502234506}, {'project': 'en.wikipedia', 'access': 'mobile-web', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017090100', 'views': 3416989181}]}

In [80]:
# Monthly English Wikipedia pageviews by users on the mobile app.
params = {
    'project': 'en.wikipedia.org',
    'access': 'mobile-app',
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2017100100',
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=30)
# Stop on an HTTP error status instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)

# Keep the raw response on disk so later steps can run without calling the API again.
with open('C:/AN/HCDS/pageviews_mobile-app_201507-201709.json', 'w') as f:
    json.dump(response, f)


{'items': [{'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015070100', 'views': 109624146}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015080100', 'views': 109669149}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015090100', 'views': 96221684}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015100100', 'views': 94523777}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015110100', 'views': 94353925}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015120100', 'views': 99438956}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016010100', 'views': 106432767}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016020100', 'views': 92414130}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016030100', 'views': 97899074}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016040100', 'views': 81719003}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016050100', 'views': 98738513}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016060100', 'views': 96908466}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016070100', 'views': 101398640}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016080100', 'views': 
97172509}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016090100', 'views': 83037939}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016100100', 'views': 67174886}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016110100', 'views': 83623769}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016120100', 'views': 128976033}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017010100', 'views': 211813191}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017020100', 'views': 189059134}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017030100', 'views': 184098693}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017040100', 'views': 115051969}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017050100', 'views': 118805669}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017060100', 'views': 115285847}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017070100', 'views': 124118219}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017080100', 'views': 119171796}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017090100', 'views': 114615188}]}

In [81]:
# Monthly English Wikipedia pageviews by users on the desktop site.
params = {
    'project': 'en.wikipedia.org',
    'access': 'desktop',
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2017100100',
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=30)
# Stop on an HTTP error status instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)

# Keep the raw response on disk so later steps can run without calling the API again.
with open('C:/AN/HCDS/pageviews_desktop_201507-201709.json', 'w') as f:
    json.dump(response, f)


{'items': [{'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015070100', 'views': 4376666686}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015080100', 'views': 4332482183}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015090100', 'views': 4485491704}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015100100', 'views': 4477532755}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015110100', 'views': 4287720220}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015120100', 'views': 4100012037}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016010100', 'views': 4436179457}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016020100', 'views': 4250997185}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016030100', 'views': 4286590426}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016040100', 'views': 4149383857}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016050100', 'views': 4191778094}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016060100', 'views': 3888839711}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016070100', 'views': 4337865827}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016080100', 'views': 4695046216}, {'project': 
'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016090100', 'views': 4135006498}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016100100', 'views': 4361737690}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016110100', 'views': 4392068236}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016120100', 'views': 4209608578}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017010100', 'views': 4521980398}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017020100', 'views': 4026702163}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017030100', 'views': 4319971902}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017040100', 'views': 3951456992}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017050100', 'views': 4187870579}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017060100', 'views': 3604550997}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017070100', 'views': 3565444544}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017080100', 'views': 3575572313}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2017090100', 'views': 3547447892}]}

In [82]:
# URL template for the legacy Pagecounts REST API. NOTE: this rebinds
# `endpoint`, so requests formatted after this point target the legacy API
# and need an 'access-site' key instead of 'access'/'agent'.
endpoint = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
    '{project}/{access-site}/{granularity}/{start}/{end}'
)

# HTTP headers sent with every API request.
headers = {
    'User-Agent': 'https://github.com/amitabhnag',
    'From': 'amnag@uw.edu',
}

In [83]:
# Monthly English Wikipedia legacy pagecounts for the mobile site.
params = {
    'project': 'en.wikipedia.org',
    'access-site': 'mobile-site',
    'granularity': 'monthly',
    'start': '2008010100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2016080100',
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=30)
# Stop on an HTTP error status instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)

# Keep the raw response on disk so later steps can run without calling the API again.
with open('C:/AN/HCDS/pagecounts_mobile-site_200801-201607.json', 'w') as f:
    json.dump(response, f)


{'items': [{'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2014100100', 'count': 3091546685}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2014110100', 'count': 3027489668}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2014120100', 'count': 3278950021}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015010100', 'count': 3485302091}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015020100', 'count': 3091534479}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015030100', 'count': 3330832588}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015040100', 'count': 3222089917}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015050100', 'count': 3334069483}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015060100', 'count': 3038162463}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015070100', 'count': 3254472695}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015080100', 'count': 3268487582}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015090100', 'count': 3172429827}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015100100', 'count': 3246082505}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015110100', 'count': 3218234512}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2015120100', 'count': 
3387411863}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016010100', 'count': 3739628742}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016020100', 'count': 3333231392}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016030100', 'count': 3419853636}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016040100', 'count': 3301385124}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016050100', 'count': 3418435805}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016060100', 'count': 3372618063}, {'project': 'en.wikipedia', 'access-site': 'mobile-site', 'granularity': 'monthly', 'timestamp': '2016070100', 'count': 3500661121}]}

In [84]:
# Monthly English Wikipedia legacy pagecounts for the desktop site.
params = {
    'project': 'en.wikipedia.org',
    'access-site': 'desktop-site',
    'granularity': 'monthly',
    'start': '2008010100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2016080100',
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=30)
# Stop on an HTTP error status instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)

# Keep the raw response on disk so later steps can run without calling the API again.
with open('C:/AN/HCDS/pagecounts_desktop-site_200801-201607.json', 'w') as f:
    json.dump(response, f)


  File "<ipython-input-84-e52c5fe297b6>", line 13
    #    json.dump(response, f)
                               ^
SyntaxError: unexpected EOF while parsing

In [85]:
#Step 2- Data Processing

def _load_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    with open(path, 'r') as source:
        return json.load(source)


# Read back the five raw API responses written during data acquisition.
pageviews_mobile_web_201507_201709 = _load_json('C:/AN/HCDS/pageviews_mobile-web_201507-201709.json')
pageviews_mobile_app_201507_201709 = _load_json('C:/AN/HCDS/pageviews_mobile-app_201507-201709.json')
pageviews_desktop_201507_201709 = _load_json('C:/AN/HCDS/pageviews_desktop_201507-201709.json')
pagecounts_mobile_site_200801_201607 = _load_json('C:/AN/HCDS/pagecounts_mobile-site_200801-201607.json')
pagecounts_desktop_site_200801_201607 = _load_json('C:/AN/HCDS/pagecounts_desktop-site_200801-201607.json')

In [86]:
import numpy as np
import pandas as pd

# Show large counts with thousands separators and no decimals in notebook output.
pd.options.display.float_format = '{:0,.0f}'.format


def _add_year_month(frame):
    """Add 'Year' and 'Month' string columns parsed from frame['timestamp'].

    expand=False is passed explicitly so each extract() returns a Series
    regardless of the pandas version's default, which also avoids the
    FutureWarning about the changing default.
    """
    frame['Year'] = frame['timestamp'].str.extract(r'(^[0-9]{4})', expand=False)
    frame['Month'] = frame['timestamp'].str.extract(r'^[0-9]{4}([0-9]{2})', expand=False)
    return frame


def _year_month_values(frame, source_column, target_column):
    """Return a new Year/Month/<target_column> frame taken from *frame*.

    The values come from frame[source_column]; missing values become 0.
    """
    small = frame[['Year', 'Month', source_column]].copy()
    small.columns = ['Year', 'Month', target_column]
    small[target_column] = small[target_column].fillna(0)
    return small


# Mobile pageviews are the sum of mobile-app and mobile-web views per month.
pageviews_mobile_app_df = pd.DataFrame(data=pageviews_mobile_app_201507_201709['items'])
pageviews_mobile_web_df = pd.DataFrame(data=pageviews_mobile_web_201507_201709['items'])
pageviews_mobile_df = pageviews_mobile_app_df.merge(pageviews_mobile_web_df, on='timestamp', how='outer')
# The outer merge leaves NaN where a month exists in only one source. Fill
# before adding so that month keeps the count that is present instead of
# becoming NaN (and then 0).
pageviews_mobile_df['pageview_mobile_views'] = (
    pageviews_mobile_df['views_x'].fillna(0) + pageviews_mobile_df['views_y'].fillna(0)
)
pageviews_mobile_df = _add_year_month(pageviews_mobile_df)
pageviews_mobile_df_small = _year_month_values(
    pageviews_mobile_df, 'pageview_mobile_views', 'pageview_mobile_views')

pageviews_desktop_df = _add_year_month(pd.DataFrame(data=pageviews_desktop_201507_201709['items']))
pageviews_desktop_df_small = _year_month_values(
    pageviews_desktop_df, 'views', 'pageview_desktop_views')

# The legacy API reports its values under 'count' rather than 'views'.
pagecount_mobile_df = _add_year_month(pd.DataFrame(data=pagecounts_mobile_site_200801_201607['items']))
pagecount_mobile_df_small = _year_month_values(
    pagecount_mobile_df, 'count', 'pagecount_mobile_views')

pagecount_desktop_df = _add_year_month(pd.DataFrame(data=pagecounts_desktop_site_200801_201607['items']))
pagecount_desktop_df_small = _year_month_values(
    pagecount_desktop_df, 'count', 'pagecount_desktop_views')

# Combine mobile and desktop per API. Months missing on one side count as 0.
merged_pagecounts_df = pagecount_mobile_df_small.merge(pagecount_desktop_df_small, on=['Year', 'Month'], how='outer')
merged_pagecounts_df['pagecount_mobile_views'] = merged_pagecounts_df['pagecount_mobile_views'].fillna(0)
merged_pagecounts_df['pagecount_desktop_views'] = merged_pagecounts_df['pagecount_desktop_views'].fillna(0)
merged_pagecounts_df['pagecount_all_views'] = (
    merged_pagecounts_df['pagecount_mobile_views'] + merged_pagecounts_df['pagecount_desktop_views']
)

merged_pageviews_df = pageviews_mobile_df_small.merge(pageviews_desktop_df_small, on=['Year', 'Month'], how='outer')
merged_pageviews_df['pageview_mobile_views'] = merged_pageviews_df['pageview_mobile_views'].fillna(0)
merged_pageviews_df['pageview_desktop_views'] = merged_pageviews_df['pageview_desktop_views'].fillna(0)
merged_pageviews_df['pageview_all_views'] = (
    merged_pageviews_df['pageview_mobile_views'] + merged_pageviews_df['pageview_desktop_views']
)

# One row per (Year, Month) across both APIs, in chronological order.
merged_df = merged_pagecounts_df.merge(merged_pageviews_df, on=['Year', 'Month'], how='outer')
merged_df = merged_df.fillna(0)
merged_df.sort_values(['Year', 'Month'], ascending=True, inplace=True)
merged_df.to_csv('C:/AN/HCDS/en-wikipedia_traffic_200801-201709.csv', float_format='%.6f')
merged_df


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  if __name__ == '__main__':
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  # Remove the CWD from sys.path while we load stuff.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  from ipykernel import kernelapp as app
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:16: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  app.launch_new_instance()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:23: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:29: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:30: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
Out[86]:
Year Month pagecount_mobile_views pagecount_desktop_views pagecount_all_views pageview_mobile_views pageview_desktop_views pageview_all_views
22 2008 01 0 4,930,902,570 4,930,902,570 0 0 0
23 2008 02 0 4,818,393,763 4,818,393,763 0 0 0
24 2008 03 0 4,955,405,809 4,955,405,809 0 0 0
25 2008 04 0 5,159,162,183 5,159,162,183 0 0 0
26 2008 05 0 5,584,691,092 5,584,691,092 0 0 0
27 2008 06 0 5,712,104,279 5,712,104,279 0 0 0
28 2008 07 0 5,306,302,874 5,306,302,874 0 0 0
29 2008 08 0 5,140,155,519 5,140,155,519 0 0 0
30 2008 09 0 5,479,533,823 5,479,533,823 0 0 0
31 2008 10 0 5,679,440,782 5,679,440,782 0 0 0
32 2008 11 0 5,415,832,071 5,415,832,071 0 0 0
33 2008 12 0 5,211,708,451 5,211,708,451 0 0 0
34 2009 01 0 5,802,681,551 5,802,681,551 0 0 0
35 2009 02 0 5,547,320,860 5,547,320,860 0 0 0
36 2009 03 0 6,295,159,057 6,295,159,057 0 0 0
37 2009 04 0 5,988,817,321 5,988,817,321 0 0 0
38 2009 05 0 6,267,516,733 6,267,516,733 0 0 0
39 2009 06 0 5,818,924,182 5,818,924,182 0 0 0
40 2009 07 0 5,801,646,978 5,801,646,978 0 0 0
41 2009 08 0 5,790,850,384 5,790,850,384 0 0 0
42 2009 09 0 4,057,515,768 4,057,515,768 0 0 0
43 2009 10 0 6,016,107,147 6,016,107,147 0 0 0
44 2009 11 0 5,768,486,910 5,768,486,910 0 0 0
45 2009 12 0 5,426,505,977 5,426,505,977 0 0 0
46 2010 01 0 5,703,465,285 5,703,465,285 0 0 0
47 2010 02 0 5,762,451,418 5,762,451,418 0 0 0
48 2010 03 0 6,661,347,946 6,661,347,946 0 0 0
49 2010 04 0 6,618,552,152 6,618,552,152 0 0 0
50 2010 05 0 6,410,578,775 6,410,578,775 0 0 0
51 2010 06 0 4,898,035,014 4,898,035,014 0 0 0
... ... ... ... ... ... ... ... ...
6 2015 04 3,222,089,917 6,198,945,657 9,421,035,574 0 0 0
7 2015 05 3,334,069,483 6,323,801,814 9,657,871,297 0 0 0
8 2015 06 3,038,162,463 5,165,413,640 8,203,576,103 0 0 0
9 2015 07 3,254,472,695 5,229,226,022 8,483,698,717 3,288,755,294 4,376,666,686 7,665,421,980
10 2015 08 3,268,487,582 5,035,534,449 8,304,022,031 3,302,333,038 4,332,482,183 7,634,815,221
11 2015 09 3,172,429,827 5,409,631,355 8,582,061,182 3,170,203,333 4,485,491,704 7,655,695,037
12 2015 10 3,246,082,505 5,535,704,471 8,781,786,976 3,268,499,132 4,477,532,755 7,746,031,887
13 2015 11 3,218,234,512 5,296,956,116 8,515,190,628 3,236,601,070 4,287,720,220 7,524,321,290
14 2015 12 3,387,411,863 5,264,446,173 8,651,858,036 3,376,275,307 4,100,012,037 7,476,287,344
15 2016 01 3,739,628,742 5,569,632,502 9,309,261,244 3,717,836,846 4,436,179,457 8,154,016,303
16 2016 02 3,333,231,392 5,347,709,361 8,680,940,753 3,334,862,272 4,250,997,185 7,585,859,457
17 2016 03 3,419,853,636 5,407,676,056 8,827,529,692 3,386,684,191 4,286,590,426 7,673,274,617
18 2016 04 3,301,385,124 5,572,235,399 8,873,620,523 3,258,764,002 4,149,383,857 7,408,147,859
19 2016 05 3,418,435,805 5,330,532,334 8,748,968,139 3,395,033,236 4,191,778,094 7,586,811,330
20 2016 06 3,372,618,063 4,975,092,447 8,347,710,510 3,354,790,945 3,888,839,711 7,243,630,656
21 2016 07 3,500,661,121 5,363,966,439 8,864,627,560 3,496,573,762 4,337,865,827 7,834,439,589
103 2016 08 0 0 0 3,515,819,303 4,695,046,216 8,210,865,519
104 2016 09 0 0 0 3,393,285,781 4,135,006,498 7,528,292,279
105 2016 10 0 0 0 3,509,283,891 4,361,737,690 7,871,021,581
106 2016 11 0 0 0 3,591,044,925 4,392,068,236 7,983,113,161
107 2016 12 0 0 0 3,776,543,855 4,209,608,578 7,986,152,433
108 2017 01 0 0 0 4,231,961,542 4,521,980,398 8,753,941,940
109 2017 02 0 0 0 3,711,761,399 4,026,702,163 7,738,463,562
110 2017 03 0 0 0 3,903,493,989 4,319,971,902 8,223,465,891
111 2017 04 0 0 0 3,639,623,119 3,951,456,992 7,591,080,111
112 2017 05 0 0 0 3,686,687,720 4,187,870,579 7,874,558,299
113 2017 06 0 0 0 3,519,383,193 3,604,550,997 7,123,934,190
114 2017 07 0 0 0 3,725,059,253 3,565,444,544 7,290,503,797
115 2017 08 0 0 0 3,621,406,302 3,575,572,313 7,196,978,615
116 2017 09 0 0 0 3,531,604,369 3,547,447,892 7,079,052,261

117 rows × 8 columns


In [87]:
#Data Analysis
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

# Work on an explicit copy: pd.DataFrame(data=merged_df) does not copy its
# input by default, so the plotting-only columns and NaN substitution below
# could otherwise alias the exported table.
merged_df_analysis = merged_df.copy()
merged_df_analysis['Date'] = pd.to_datetime(
    merged_df_analysis['Year'] + merged_df_analysis['Month'], format='%Y%m')
merged_df_analysis['pagecount_desktop_views'] = pd.to_numeric(merged_df_analysis['pagecount_desktop_views'])

merged_df_analysis['TotalPageCount'] = (
    merged_df_analysis['pagecount_desktop_views'] + merged_df_analysis['pagecount_mobile_views']
)
merged_df_analysis['TotalPageView'] = (
    merged_df_analysis['pageview_desktop_views'] + merged_df_analysis['pageview_mobile_views']
)

# A 0 in the merged table stands for "no data for this month"; turn it into
# NaN so the series is not drawn at zero for those months.
merged_df_analysis = merged_df_analysis.replace(0, np.nan)

plt.figure(1, figsize=(15, 6), dpi=80)
plt.title('Page Views/Counts On English Wikipedia (x10,000,000,000)')
plt.ylabel('Counts/Views')
plt.xlabel('Date')

# (column, colour, line style, legend label). Dashed lines are legacy
# pagecounts and solid lines are pageviews; colour identifies the access type.
series_styles = [
    ('TotalPageCount', 'black', '--', 'Total Page Count'),
    ('TotalPageView', 'black', '-', 'Total Page View'),
    ('pagecount_desktop_views', 'g', '--', 'Desktop Page Count'),
    ('pageview_desktop_views', 'g', '-', 'Desktop Page View'),
    ('pagecount_mobile_views', 'b', '--', 'Mobile Page Count'),
    ('pageview_mobile_views', 'b', '-', 'Mobile Page View'),
]
for column, colour, line_style, label in series_styles:
    plt.plot(merged_df_analysis['Date'], merged_df_analysis[column],
             color=colour, ls=line_style, label=label)

plt.legend()

plt.show()