References: Note that the Legacy Pagecounts API has a slightly different schema than the Pageviews API shown here. This sample API request would get you all pageviews by web crawlers on the mobile website for English Wikipedia during the month of September 2017.
This step creates the initial variables that will be required for each REST API call.
Please replace the path 'C:/AN/HCDS/*' in the code below with a path on your local machine.
In [78]:
import requests
import json
# URL template for the Wikimedia Pageviews aggregate endpoint; the {placeholders}
# are filled from a params dict with str.format(**params) before each request.
endpoint = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

# Headers sent with every request so the caller is identifiable to the API operators.
headers = {
    'User-Agent': 'https://github.com/amitabhnag',
    'From': 'amnag@uw.edu',
}
In [79]:
# Step 1 - Data acquisition: monthly 'user' pageviews for the 'mobile-web'
# access method, saved verbatim as JSON.
params = {
    'project': 'en.wikipedia.org',
    'access': 'mobile-web',
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2017100100',
}
# A timeout keeps the cell from hanging forever if the service does not answer.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=60)
# Fail here on an HTTP error instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)
with open('C:/AN/HCDS/pageviews_mobile-web_201507-201709.json', 'w') as f:
    json.dump(response, f)
In [80]:
# Monthly 'user' pageviews for the 'mobile-app' access method, saved verbatim as JSON.
params = {
    'project': 'en.wikipedia.org',
    'access': 'mobile-app',
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2017100100',
}
# A timeout keeps the cell from hanging forever if the service does not answer.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=60)
# Fail here on an HTTP error instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)
with open('C:/AN/HCDS/pageviews_mobile-app_201507-201709.json', 'w') as f:
    json.dump(response, f)
In [81]:
# Monthly 'user' pageviews for the 'desktop' access method, saved verbatim as JSON.
params = {
    'project': 'en.wikipedia.org',
    'access': 'desktop',
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2017100100',
}
# A timeout keeps the cell from hanging forever if the service does not answer.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=60)
# Fail here on an HTTP error instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)
with open('C:/AN/HCDS/pageviews_desktop_201507-201709.json', 'w') as f:
    json.dump(response, f)
In [82]:
# URL template for the Legacy Pagecounts aggregate endpoint. It replaces the
# previous value of `endpoint`; note the hyphenated {access-site} placeholder,
# which is filled from a params dict key of the same name via str.format(**params).
endpoint = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'

# Headers sent with every request so the caller is identifiable to the API operators.
headers = {
    'User-Agent': 'https://github.com/amitabhnag',
    'From': 'amnag@uw.edu',
}
In [83]:
# Monthly legacy pagecounts for the 'mobile-site' access site, saved verbatim as JSON.
params = {
    'project': 'en.wikipedia.org',
    'access-site': 'mobile-site',
    'granularity': 'monthly',
    'start': '2008010100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2016080100',
}
# A timeout keeps the cell from hanging forever if the service does not answer.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=60)
# Fail here on an HTTP error instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)
with open('C:/AN/HCDS/pagecounts_mobile-site_200801-201607.json', 'w') as f:
    json.dump(response, f)
In [84]:
# Monthly legacy pagecounts for the 'desktop-site' access site, saved verbatim as JSON.
params = {
    'project': 'en.wikipedia.org',
    'access-site': 'desktop-site',
    'granularity': 'monthly',
    'start': '2008010100',
    # use the first day of the following month to ensure a full month of data is collected
    'end': '2016080100',
}
# A timeout keeps the cell from hanging forever if the service does not answer.
api_call = requests.get(endpoint.format(**params), headers=headers, timeout=60)
# Fail here on an HTTP error instead of saving an error payload as if it were data.
api_call.raise_for_status()
response = api_call.json()
print(response)
with open('C:/AN/HCDS/pagecounts_desktop-site_200801-201607.json', 'w') as f:
    json.dump(response, f)
In [85]:
# Step 2 - Data processing: reload the five saved API responses from disk so
# the processing below works from the JSON files rather than live requests.

# Pageviews API responses (mobile web, mobile app, desktop).
with open('C:/AN/HCDS/pageviews_mobile-web_201507-201709.json', 'r') as src:
    pageviews_mobile_web_201507_201709 = json.load(src)

with open('C:/AN/HCDS/pageviews_mobile-app_201507-201709.json', 'r') as src:
    pageviews_mobile_app_201507_201709 = json.load(src)

with open('C:/AN/HCDS/pageviews_desktop_201507-201709.json', 'r') as src:
    pageviews_desktop_201507_201709 = json.load(src)

# Legacy Pagecounts API responses (mobile site, desktop site).
with open('C:/AN/HCDS/pagecounts_mobile-site_200801-201607.json', 'r') as src:
    pagecounts_mobile_site_200801_201607 = json.load(src)

with open('C:/AN/HCDS/pagecounts_desktop-site_200801-201607.json', 'r') as src:
    pagecounts_desktop_site_200801_201607 = json.load(src)
In [86]:
import numpy as np
import pandas as pd
# Show floats as whole numbers with thousands separators when frames are displayed.
pd.options.display.float_format = '{:0,.0f}'.format


def _add_year_month(frame):
    """Add 'Year' and 'Month' string columns parsed from the 'timestamp' column.

    The frame is modified in place and also returned so calls can be chained.
    """
    # expand=False makes str.extract return a Series for the single capture
    # group instead of relying on the version-dependent default.
    frame['Year'] = frame['timestamp'].str.extract('(^[0-9]{4})', expand=False)
    frame['Month'] = frame['timestamp'].str.extract('^[0-9]{4}([0-9]{2})', expand=False)
    return frame


def _year_month_views(frame, views_column):
    """Return a new frame with only Year, Month and views_column (missing views -> 0)."""
    small = frame[['Year', 'Month', views_column]].copy()
    small[views_column] = small[views_column].fillna(0)
    return small


# --- Pageviews API: mobile = mobile app + mobile web -------------------------
pageviews_mobile_app_df = pd.DataFrame(data=pageviews_mobile_app_201507_201709['items'])
pageviews_mobile_web_df = pd.DataFrame(data=pageviews_mobile_web_201507_201709['items'])
pageviews_mobile_df = pageviews_mobile_app_df.merge(pageviews_mobile_web_df, on='timestamp', how='outer')
# The outer merge leaves NaN where a month exists in only one source. A plain
# `+` would turn that month's total into NaN (later zeroed), dropping the views
# that were present; fill_value=0 keeps them.
pageviews_mobile_df['pageview_mobile_views'] = pageviews_mobile_df['views_x'].add(
    pageviews_mobile_df['views_y'], fill_value=0)
_add_year_month(pageviews_mobile_df)
pageviews_mobile_df_small = _year_month_views(pageviews_mobile_df, 'pageview_mobile_views')

# --- Pageviews API: desktop ---------------------------------------------------
pageviews_desktop_df = _add_year_month(pd.DataFrame(data=pageviews_desktop_201507_201709['items']))
pageviews_desktop_df['pageview_desktop_views'] = pageviews_desktop_df['views']
pageviews_desktop_df_small = _year_month_views(pageviews_desktop_df, 'pageview_desktop_views')

# --- Legacy Pagecounts API: mobile site --------------------------------------
pagecount_mobile_df = _add_year_month(pd.DataFrame(data=pagecounts_mobile_site_200801_201607['items']))
pagecount_mobile_df['pagecount_mobile_views'] = pagecount_mobile_df['count']
pagecount_mobile_df_small = _year_month_views(pagecount_mobile_df, 'pagecount_mobile_views')

# --- Legacy Pagecounts API: desktop site -------------------------------------
pagecount_desktop_df = _add_year_month(pd.DataFrame(data=pagecounts_desktop_site_200801_201607['items']))
pagecount_desktop_df['pagecount_desktop_views'] = pagecount_desktop_df['count']
pagecount_desktop_df_small = _year_month_views(pagecount_desktop_df, 'pagecount_desktop_views')

# --- Combine mobile + desktop per API; months missing on one side count as 0 --
merged_pagecounts_df = pagecount_mobile_df_small.merge(pagecount_desktop_df_small, on=['Year', 'Month'], how='outer')
merged_pagecounts_df['pagecount_mobile_views'] = merged_pagecounts_df['pagecount_mobile_views'].fillna(0)
merged_pagecounts_df['pagecount_desktop_views'] = merged_pagecounts_df['pagecount_desktop_views'].fillna(0)
merged_pagecounts_df['pagecount_all_views'] = (
    merged_pagecounts_df['pagecount_mobile_views'] + merged_pagecounts_df['pagecount_desktop_views'])

merged_pageviews_df = pageviews_mobile_df_small.merge(pageviews_desktop_df_small, on=['Year', 'Month'], how='outer')
merged_pageviews_df['pageview_mobile_views'] = merged_pageviews_df['pageview_mobile_views'].fillna(0)
merged_pageviews_df['pageview_desktop_views'] = merged_pageviews_df['pageview_desktop_views'].fillna(0)
merged_pageviews_df['pageview_all_views'] = (
    merged_pageviews_df['pageview_mobile_views'] + merged_pageviews_df['pageview_desktop_views'])

# --- Final table: one row per (Year, Month) across both APIs ------------------
merged_df = merged_pagecounts_df.merge(merged_pageviews_df, on=['Year', 'Month'], how='outer')
merged_df = merged_df.fillna(0)
merged_df = merged_df.sort_values(['Year', 'Month'], ascending=True)
# NOTE(review): index=False is not passed, so the frame's index is written as the
# first CSV column - confirm that is wanted before changing the file layout.
merged_df.to_csv('C:/AN/HCDS/en-wikipedia_traffic_200801-201709.csv', float_format='%.6f')
merged_df
Out[86]:
In [87]:
#Data Analysis
#merged_analysis_df = pd.DataFrame()
#merged_analysis_df['Year'] = merged_df['Year']
#merged_analysis_df['Month'] = merged_df['Month']
#merged_analysis_df['mobile traffic'] = merged_df['pagecount_mobile_views']+ merged_df['pageview_mobile_views']
#merged_analysis_df['Desktop traffic'] = merged_df['pagecount_mobile_views']+ merged_df['pageview_mobile_views']
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
# Step 3 - Analysis: plot monthly desktop, mobile and total traffic from both APIs.

# Work on an explicit copy: pd.DataFrame(data=merged_df) does not copy by
# default, so the helper columns added below could otherwise leak into merged_df.
merged_df_analysis = merged_df.copy()
merged_df_analysis['Date'] = pd.to_datetime(
    merged_df_analysis['Year'] + merged_df_analysis['Month'], format='%Y%m')
merged_df_analysis['pagecount_desktop_views'] = pd.to_numeric(merged_df_analysis['pagecount_desktop_views'])
merged_df_analysis['TotalPageCount'] = (
    merged_df_analysis['pagecount_desktop_views'] + merged_df_analysis['pagecount_mobile_views'])
merged_df_analysis['TotalPageView'] = (
    merged_df_analysis['pageview_desktop_views'] + merged_df_analysis['pageview_mobile_views'])
# Zeros become NaN so that months without data leave a gap in the line instead
# of being drawn as a drop to zero.
merged_df_analysis = merged_df_analysis.replace(0, np.nan)

plt.figure(1, figsize=(15, 6), dpi=80)
plt.title('Page Views/Counts On English Wikipedia (x10,000,000,000)')
plt.ylabel('Counts/Views')
plt.xlabel('Date')

# (column, colour, line style, legend label): dashed lines are legacy
# pagecounts, solid lines are pageviews; colour selects total/desktop/mobile.
series_to_plot = [
    ('TotalPageCount', 'black', '--', 'Total Page Count'),
    ('TotalPageView', 'black', '-', 'Total Page View'),
    ('pagecount_desktop_views', 'g', '--', 'Desktop Page Count'),
    ('pageview_desktop_views', 'g', '-', 'Desktop Page View'),
    ('pagecount_mobile_views', 'b', '--', 'Mobile Page Count'),
    ('pageview_mobile_views', 'b', '-', 'Mobile Page View'),
]
for column, colour, line_style, legend_label in series_to_plot:
    plt.plot(merged_df_analysis['Date'], merged_df_analysis[column],
             color=colour, linestyle=line_style, label=legend_label)

plt.legend()
plt.show()