In [ ]:
library(forecast)
library(fpp)
library(dplyr)
library(lubridate)
library(xts)
In [ ]:
df <- read.csv("../../../cocUptoJuly2016.csv", stringsAsFactors=F)
In [ ]:
df$Num.Complaints <- 1
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
df$Month <- month(df$Complaint.Date)
df$Year <- year(df$Complaint.Date)
In [ ]:
min.year <- min(df$Year)
max.year <- max(df$Year)
In [ ]:
# for top 5 wards and top 5 complaint types
M <- 5 # number of top wards to consider
N <- 5 # number of top complaint types to consider
In [ ]:
ward.table <- table(df$Ward)
ward.table <- as.data.frame(ward.table[order(-ward.table)])
colnames(ward.table) <- c("Count")
ward.table[0:M,]
top.wards <- names(ward.table[0:M,])
top.wards
In [ ]:
cpl.table <- table(df$Complaint.Type)
cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
colnames(cpl.table) <- c("Count")
cpl.table[0:M,]
top.cpl <- names(cpl.table[0:M,])
top.cpl
In [ ]:
seq(min.year, max.year)
In [ ]:
DATA_FOLDER <- "../data/wardLevel/"
for(cpl in top.cpl) {
subDf <- df[(df$Complaint.Type == cpl) & (df$Ward %in% top.wards),]
wards <- unique(subDf$Ward)
ideal <- expand.grid(Year=seq(min.year, max.year), Month=seq(1, 12), Ward=wards, stringsAsFactors = F)
monthly <- (subDf %>% group_by(Year, Month, Ward) %>% summarise(Num.Complaints = sum(Num.Complaints)))
monthly <- left_join(ideal, monthly, by=c("Year", "Month", "Ward"))
monthly$Num.Complaints[is.na(monthly$Num.Complaints)] <- 0
monthly <- monthly[order(monthly$Ward, monthly$Year, monthly$Month), ]
path <- file.path(DATA_FOLDER, paste0(gsub("/", "", cpl), ".csv"))
print(paste0("Saving file ", path))
write.csv(monthly, file=path, row.names=F)
}
In [ ]:
# write function to load ward level data into a easily usable structure
load.ward.data <- function(dataFolder) {
files <- list.files(dataFolder, include.dirs=F)
data <- list()
for(file in files) {
path <- paste0(dataFolder, "/", file)
if(file.info(path)$isdir) {
next
}
print(paste0("Reading ", path))
df <- read.csv(path, stringsAsFactors=F)
minYear <- min(df$Year)
complaintType <- substr(file,1,(nchar(file))-4)
wardData <- list()
for(ward in unique(df$Ward)) {
subDf <- df[(df$Ward == ward), ]
tsObject <- ts(subDf$Num.Complaints, start=c(minYear, 1), frequency = 12)
wardData[[ward]] <- tsObject
}
data[[complaintType]] <- wardData
}
data
}
ward.data <- load.ward.data(DATA_FOLDER)
In [ ]:
tsdisplay(ward.data[["Dog menace "]][["N188"]])
In [ ]:
# load percentages
load.ward.percentages <- function(dataFolder) {
data <- list()
for(file in list.files(dataFolder)) {
print(paste0("Loading: ", file))
complaintType <- substr(file,1,(nchar(file))-4)
df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)
data[[complaintType]] <- df
}
data
}
ward.perc <- load.ward.percentages("../data/wardLevel/percentages")
In [ ]:
# let's pick a complaint type to do forecasting
complaint.type <- "Dog menace "
loadData <- function(dataFolder) {
files <- list.files(dataFolder)
data <- list()
for(file in files) {
df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)
minYear <- min(df$Year)
complaintType <- substr(file,1,(nchar(file))-4)
tsObject <- ts(df$Complaints, start=c(minYear, 1), frequency = 12)
data[[complaintType]] <- tsObject
}
data
}
city.data <- loadData("../data/topNComplaints/")
series <- city.data[[complaint.type]]
tsdisplay(series)
In [ ]:
train.start <- c(2012, 3)
train.end <- c(2015, 6)
test.start <- c(2015, 7)
test.end <- c(2016, 7)
train.series <- window(series, start=train.start, end=train.end)
test.series <- window(series, start=test.start, end=test.end)
plot(train.series, xlim=c(2012, max(df$Year) + 0.5))
lines(test.series, lty=2)
In [ ]:
tsdisplay(train.series)
In [ ]:
# build a model
forecast.model <- Arima(train.series, order=c(6, 1, 4))
prediction <- forecast(forecast.model, h=length(test.series))
plot(prediction)
lines(test.series, lty=2)
In [ ]:
# source: http://stackoverflow.com/questions/30510148/select-a-value-from-time-series-by-date-in-r
extract.val <- function(tsobj, year, month) {
window(tsobj, start = year + month/12, end = year + month/12)[[1]]
}
In [ ]:
prediction$mean
In [ ]:
extract.val(prediction$mean, 2015, 6)
In [ ]:
cpl.ward.perc <- ward.perc[[complaint.type]]
# ward level forecasts for 2015, 6
(cpl.ward.perc$Percentage * extract.val(prediction$mean, 2015, 6))
In [ ]:
# forecast for a ward
cpl.ward.perc[cpl.ward.perc$Ward == "N188", ]$Percentage * prediction$mean
In [ ]:
# lot's of parameters!
compute.forecast <- function(prediction, ward.data,
ward.perc, complaint.type, ward,
train.start, train.end,
test.start, test.end,
model.description) {
series <- ward.data[[complaint.type]][[ward]]
train.series <- window(series, start=train.start, end=train.end)
test.series <- window(series, start=test.start, end=test.end)
cpl.ward.perc <- ward.perc[[complaint.type]]
forecast.mean <- cpl.ward.perc[cpl.ward.perc$Ward == ward, ]$Percentage * prediction$mean
plot(train.series, main=paste0("Forecasts for ", complaint.type, " for ward ", ward),
ylim=c(0, max(series)), xlim=c(2012, max(df$Year) + 0.5))
lines(test.series, lty=2, col="red")
lines(forecast.mean, col="blue")
legend("topleft",
lty = c(1, 2, 1),
col=c("black", "red", "blue"),
legend=c("Train", "Test", "Prediction"))
acc.result <- accuracy(forecast.mean, test.series)
data.frame(Complaint.Type=complaint.type,
Ward=ward,
TestMAPE=acc.result[1, 5],
Description=model.description)
}
compute.forecast(prediction, ward.data, ward.perc, complaint.type, "N188", train.start, train.end,
test.start, test.end,
"Naive")
In [ ]:
# function to compute forcasts for all wards for a complaint type
complaint.forecast <- function(city.data, ward.data, ward.perc, cpl.type) {
series <- city.data[[cpl.type]]
train.start <- c(2012, 3)
train.end <- c(2015, 6)
test.start <- c(2015, 7)
test.end <- c(2016, 7)
train.series <- window(series, start=train.start, end=train.end)
test.series <- window(series, start=test.start, end=test.end)
forecast.model <- snaive(train.series, h = length(test.series))
model.description <- "Seasonal Naive Method"
prediction <- forecast(forecast.model, h = length(test.series))
plot(prediction)
lines(test.series, lty=2)
cpl.data <- ward.data[[cpl.type]]
results <- data.frame(Complaint.Type=character(),
Ward=character(),
TestMAPE=double(),
Description=character())
for(ward in names(cpl.data)) {
results <- rbind(results, compute.forecast(prediction, ward.data, ward.perc, cpl.type, ward, train.start, train.end,
test.start, test.end, model.description))
}
results
}
results <- data.frame(Complaint.Type=character(),
Ward=character(),
TestMAPE=double(),
Description=character())
In [ ]:
complaint.types <- names(ward.data)
In [ ]:
complaint.types
In [ ]:
results <- rbind(results, complaint.forecast(city.data, ward.data, ward.perc, complaint.types[1]))
In [ ]:
results <- rbind(results, complaint.forecast(city.data, ward.data, ward.perc, complaint.types[2]))
In [ ]:
results <- rbind(results, complaint.forecast(city.data, ward.data, ward.perc, complaint.types[3]))
In [ ]:
results <- rbind(results, complaint.forecast(city.data, ward.data, ward.perc, complaint.types[4]))
In [ ]:
results <- rbind(results, complaint.forecast(city.data, ward.data, ward.perc, complaint.types[5]))
In [ ]:
results %>% arrange(TestMAPE)
In [ ]:
write.csv(results, "seasonalNaive.csv")