This notebook details the steps needed to construct and analyze a dataset of monthly traffic on English Wikipedia from January 2008 to September 2017. It is divided into three sections:
Data were collected from two API endpoints:
When calling the Pageviews API, automated traffic was excluded by specifying /user/ in the endpoint. For example: https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/desktop/user/monthly/2015070100/2017100100
In [ ]:
## Load packages
library(httr)
library(jsonlite)
library(data.table)
library(plyr)
library(lubridate)
library(tidyr)
library(ggplot2)
## Set wd
## Every JSON, CSV and PNG file in this notebook is written/read using a bare
## file name, i.e. relative to the directory set here. The path below is
## machine-specific and must be edited before the notebook is run elsewhere.
wd <- "/Users/MB/Desktop/DATA_512/Week 3/Assignment/" # specify working directory here
setwd(wd)
In [ ]:
## Legacy Pagecounts API: download the monthly counts for the desktop and
## mobile sites and keep each response as JSON text.
## stop_for_status() aborts on an HTTP error (4xx/5xx) so that an error payload
## is never serialized and later saved/parsed as if it were traffic data.
## NOTE(review): the body is parsed by content() and re-serialized by toJSON()
## rather than saved verbatim; later cells treat the resulting columns as
## lists, so that round trip is intentionally left unchanged here.

## English, Desktop Site, Monthly, 1 Jan 2008 to 31 July 2016
pagecounts_desktop_url <- "https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/en.wikipedia.org/desktop-site/monthly/2008010100/2016080100"
pagecounts_desktop <- GET(pagecounts_desktop_url)
stop_for_status(pagecounts_desktop)
pagecounts_desktop.json <- toJSON(content(pagecounts_desktop))

## English, Mobile Site, Monthly, 1 Jan 2008 to 31 July 2016
pagecounts_mobile_url <- "https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/en.wikipedia.org/mobile-site/monthly/2008010100/2016080100"
pagecounts_mobile <- GET(pagecounts_mobile_url)
stop_for_status(pagecounts_mobile)
pagecounts_mobile.json <- toJSON(content(pagecounts_mobile))
In [ ]:
## Pageviews API: download the monthly user (non-automated) traffic for
## desktop, mobile web and mobile app, and keep each response as JSON text.
## stop_for_status() aborts on an HTTP error (4xx/5xx) so that an error payload
## is never serialized and later saved/parsed as if it were traffic data.
## NOTE(review): the body is parsed by content() and re-serialized by toJSON()
## rather than saved verbatim; later cells treat the resulting columns as
## lists, so that round trip is intentionally left unchanged here.

## English, Desktop Site, User traffic, Monthly, 1 Jul 2015 to 30 September 2017
pageviews_desktop_url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/desktop/user/monthly/2015070100/2017100100"
pageviews_desktop <- GET(pageviews_desktop_url)
stop_for_status(pageviews_desktop)
pageviews_desktop.json <- toJSON(content(pageviews_desktop))

## English, Mobile Web, User Traffic, Monthly, 1 Jul 2015 to 30 September 2017
pageviews_mobileweb_url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/mobile-web/user/monthly/2015070100/2017100100"
pageviews_mobileweb <- GET(pageviews_mobileweb_url)
stop_for_status(pageviews_mobileweb)
pageviews_mobileweb.json <- toJSON(content(pageviews_mobileweb))

## English, Mobile App, User Traffic, Monthly, 1 Jul 2015 to 30 September 2017
pageviews_mobileapp_url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/mobile-app/user/monthly/2015070100/2017100100"
pageviews_mobileapp <- GET(pageviews_mobileapp_url)
stop_for_status(pageviews_mobileapp)
pageviews_mobileapp.json <- toJSON(content(pageviews_mobileapp))
In [ ]:
## Save each serialized API response to the working directory. The list is
## keyed by output file name (API, access type, first/last month covered).
json_outputs <- list(
  "pagecounts_desktop-site_200801-201607.json" = pagecounts_desktop.json,
  "pagecounts_mobile-site_200801-201607.json" = pagecounts_mobile.json,
  "pageviews_desktop_201507-201709.json" = pageviews_desktop.json,
  "pageviews_mobile-web_201507-201709.json" = pageviews_mobileweb.json,
  "pageviews_mobile-app_201507-201709.json" = pageviews_mobileapp.json
)
for (json_path in names(json_outputs)) {
  write(json_outputs[[json_path]], json_path)
}
There were five JSON files created in (1):
These were read into R and converted into data tables.
In [ ]:
## Parse the five saved JSON files back into R objects. The *.json variables
## are overwritten: from here on they hold parsed content, not JSON text.
pagecounts_desktop.json <- jsonlite::fromJSON(txt = "pagecounts_desktop-site_200801-201607.json")
pagecounts_mobile.json <- jsonlite::fromJSON(txt = "pagecounts_mobile-site_200801-201607.json")
pageviews_desktop.json <- jsonlite::fromJSON(txt = "pageviews_desktop_201507-201709.json")
pageviews_mobileweb.json <- jsonlite::fromJSON(txt = "pageviews_mobile-web_201507-201709.json")
pageviews_mobileapp.json <- jsonlite::fromJSON(txt = "pageviews_mobile-app_201507-201709.json")
In [ ]:
## Turn the "items" element of each parsed response into a data.table.
## These assignments replace the earlier HTTP response objects of the same name.
pagecounts_desktop <- data.table(pagecounts_desktop.json[["items"]])
pagecounts_mobile <- data.table(pagecounts_mobile.json[["items"]])
pageviews_desktop <- data.table(pageviews_desktop.json[["items"]])
pageviews_mobileweb <- data.table(pageviews_mobileweb.json[["items"]])
pageviews_mobileapp <- data.table(pageviews_mobileapp.json[["items"]])
The data tables created in b) contained some redundant columns (project, access, access-site, agent, granularity), which were dropped.
In [ ]:
## Remove the descriptive columns that are not needed for the analysis.
## `(drop_columns) := NULL` deletes the named columns from each data.table in
## place (by reference), so no reassignment is required.
## NOTE(review): the same name vector is applied to all five tables; presumably
## each table holds only a subset of these names (e.g. "access-site" vs.
## "access"/"agent") -- verify, since data.table may warn about names that are
## not present in a given table.
drop_columns <- c("project", "access", "access-site", "agent", "granularity")
pagecounts_desktop[, (drop_columns) := NULL]
pagecounts_mobile [, (drop_columns) := NULL]
pageviews_desktop[, (drop_columns) := NULL]
pageviews_mobileweb[, (drop_columns) := NULL]
pageviews_mobileapp[, (drop_columns) := NULL]
In pagecounts_desktop and pagecounts_mobile, the two remaining columns were:
In pageviews_desktop, pageviews_mobileweb and pageviews_mobileapp, the two remaining columns were:
Prior to merging the five data.tables, the views and counts columns were given unique names and a date field was added using the ymd command in the lubridate package.
In [ ]:
## Give the second column of each table (the count/views value) a name that is
## unique across tables, so the columns stay distinguishable after merging.
colnames(pagecounts_desktop)[2L] <- "pagecount_desktop_views"
colnames(pagecounts_mobile)[2L] <- "pagecount_mobile_views"
colnames(pageviews_desktop)[2L] <- "pageview_desktop_views"
colnames(pageviews_mobileweb)[2L] <- "pageview_mobileweb_views"
colnames(pageviews_mobileapp)[2L] <- "pageview_mobileapp_views"
## Add a date column: the first eight characters of each timestamp (YYYYMMDD)
## are parsed with lubridate's ymd().
timestamp_to_date <- function(timestamps) {
  ymd(substr(timestamps, start = 1, stop = 8))
}
pagecounts_desktop$date <- timestamp_to_date(pagecounts_desktop$timestamp)
pagecounts_mobile$date <- timestamp_to_date(pagecounts_mobile$timestamp)
pageviews_desktop$date <- timestamp_to_date(pageviews_desktop$timestamp)
pageviews_mobileweb$date <- timestamp_to_date(pageviews_mobileweb$timestamp)
pageviews_mobileapp$date <- timestamp_to_date(pageviews_mobileapp$timestamp)
To create a single unified dataset, the five data.tables in d) were merged (full join) on the date column.
In [ ]:
## Full-join the five tables on the shared date column (plyr::join_all), then
## convert the joined result to a data.table in place.
merge_set <- list(
  pagecounts_desktop,
  pagecounts_mobile,
  pageviews_desktop,
  pageviews_mobileweb,
  pageviews_mobileapp
)
pageviews_all <- join_all(dfs = merge_set, by = "date", type = "full")
setDT(pageviews_all)
Since the three pageviews tables only had data going back to July 2015, the merged data.table contained missing values (represented as NULL), which were replaced with zeros.
In [ ]:
## Replace a missing (NULL) cell value with zero.
##
## x: a single cell value taken from a list column; may be NULL.
## Returns 0 when x is NULL, otherwise x unchanged.
##
## A plain if/else is used instead of ifelse(): the test is a single
## TRUE/FALSE, and ifelse() would cut a multi-element x down to its first
## element and strip its attributes.
zero_col.fun <- function(x) {
  if (is.null(x)) {
    0
  } else {
    x
  }
}
## Apply the NULL -> 0 replacement to every cell of the five traffic columns.
## lapply() returns a list, so each column remains a list column afterwards.
pageviews_all$pagecount_desktop_views <- lapply(X = pageviews_all[["pagecount_desktop_views"]], FUN = zero_col.fun)
pageviews_all$pagecount_mobile_views <- lapply(X = pageviews_all[["pagecount_mobile_views"]], FUN = zero_col.fun)
pageviews_all$pageview_desktop_views <- lapply(X = pageviews_all[["pageview_desktop_views"]], FUN = zero_col.fun)
pageviews_all$pageview_mobileweb_views <- lapply(X = pageviews_all[["pageview_mobileweb_views"]], FUN = zero_col.fun)
pageviews_all$pageview_mobileapp_views <- lapply(X = pageviews_all[["pageview_mobileapp_views"]], FUN = zero_col.fun)
To prepare the final dataset, year and month columns were added to the data.table from f). Since its traffic columns were stored as lists rather than vectors, they were converted to vectors using the unlist function.
In [ ]:
## Derive calendar year and month from the date column.
## lubridate is attached after data.table, so the unqualified year()/month()
## calls resolve to lubridate's versions; the namespace is spelled out here.
pageviews_all$year <- lubridate::year(pageviews_all$date)
pageviews_all$month <- lubridate::month(pageviews_all$date)

## Build a plain data.frame in which every traffic column is flattened from a
## list column into an ordinary vector.
flatten_column <- function(column_name) {
  unlist(pageviews_all[[column_name]])
}
pageviews_2 <- data.frame(
  year = pageviews_all[["year"]],
  month = pageviews_all[["month"]],
  pagecount_desktop_views = flatten_column("pagecount_desktop_views"),
  pagecount_mobile_views = flatten_column("pagecount_mobile_views"),
  pageview_desktop_views = flatten_column("pageview_desktop_views"),
  pageview_mobileweb_views = flatten_column("pageview_mobileweb_views"),
  pageview_mobileapp_views = flatten_column("pageview_mobileapp_views")
)
In [39]:
## Preview the first six rows of the flattened table
head(pageviews_2, n = 6L)
Finally, a new data.frame was created with additional columns:
In [ ]:
## Assemble the final table: the per-platform columns plus totals.
##   pagecount_all_views   = desktop + mobile (legacy pagecounts)
##   pageview_mobile_views = mobile web + mobile app
##   pageview_all_views    = desktop + mobile web + mobile app
## with() evaluates the column names directly inside pageviews_2.
pageviews_final <- with(pageviews_2, data.frame(
  year = year,
  month = month,
  pagecount_all_views = pagecount_desktop_views + pagecount_mobile_views,
  pagecount_desktop_views = pagecount_desktop_views,
  pagecount_mobile_views = pagecount_mobile_views,
  pageview_all_views = pageview_desktop_views + pageview_mobileweb_views + pageview_mobileapp_views,
  pageview_desktop_views = pageview_desktop_views,
  pageview_mobile_views = pageview_mobileweb_views + pageview_mobileapp_views
))
In [40]:
## Preview the first six rows of the final table
head(pageviews_final, n = 6L)
In [ ]:
## Export the final table as CSV, without a row-name column
write.csv(
  x = pageviews_final,
  file = "en-wikipedia_traffic_200801_201709.csv",
  row.names = FALSE
)
In this step, a time series graph was created which showed the trends of:
In [41]:
## Load the CSV written in the previous step; read.csv() already defaults to a
## header row and a comma separator.
traffic <- read.csv("en-wikipedia_traffic_200801_201709.csv")
Since time series graphs require a date field, a new column, date, was created from the year and month columns.
In [43]:
## Build a "<year>-<month>-01" string for every row and parse it into a
## date-time, so each month is represented by its first day.
first_of_month <- paste(traffic$year, traffic$month, "01", sep = "-")
traffic$date <- parse_date_time(first_of_month, orders = "%y-%m-%d")
head(traffic, n = 6L)
Since ggplot works best with tidy data, the data.frame in b) was reshaped into long format so that the platform (pagecount_desktop, pagecount_mobile, etc.) and the number of views each became a single column. Furthermore, zeros were replaced with NAs to avoid formatting issues in the time series chart.
In [44]:
## Reshape to long format: one row per (date, metric) pair, with the metric
## name in `metric` and its value in `views`; year, month and date are kept
## as identifier columns.
traffic_tidy <- gather(traffic, key = metric, value = views, -date, -month, -year)

## Mark months without data (stored as 0) as missing. Only the `views` column
## is touched: comparing the whole data.frame to 0 would also test the year,
## month, metric and date columns, which can never be "no data" markers.
## which() drops any NA comparison results, so existing NAs are left alone.
zero_rows <- which(traffic_tidy$views == 0)
traffic_tidy$views[zero_rows] <- NA
Finally, a time series plot was created using ggplot.
In [46]:
## Time series chart: one semi-transparent line per metric, views on a
## comma-formatted y axis fixed to 0 - 12 billion, dates on the x axis.
traffic_plot <- ggplot(
  data = traffic_tidy,
  mapping = aes(x = date, y = views, color = metric)
) +
  geom_line(alpha = 0.6) +
  scale_y_continuous(
    name = "# of views",
    labels = scales::comma,
    expand = c(0, 0),
    limits = c(0, 12000000000)
  ) +
  scale_x_datetime(name = "year", expand = c(0, 0)) +
  labs(title = "Monthly page views on English Wikipedia - Jan 2008 to Sep 2017")
traffic_plot
In [47]:
## Render the chart into a PNG file: open the device, draw, close the device
png("wikipedia_traffic.png")
print(traffic_plot)
dev.off()