In [ ]:
library(hash)
library(xts)
library(lubridate)
library(forecast)
library(fpp)
In [ ]:
DATA_FOLDER <- "/home/samarth/workspaces/datakind-workspace/analytics/time-series/data/topNComplaints"
RESULTS_FOLDER <- "/home/samarth/workspaces/datakind-workspace/analytics/time-series/results/topNComplaints/arima"
In [ ]:
loadData <- function(dataFolder) {
files <- list.files(dataFolder)
data <- list()
for(file in files) {
df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)
minYear <- min(df$Year)
complaintType <- substr(file,1,(nchar(file))-4)
tsObject <- ts(df$Complaints, start=c(minYear, 1), frequency = 12)
data[[complaintType]] <- tsObject
}
data
}
data <- loadData(DATA_FOLDER)
complaintTypes <- names(data)
data[[complaintTypes[1]]]
In [ ]:
# time windows for the data
dataStart <- c(2012, 1)
dataEnd <- c(2016, 6)
trainStart <- c(2012, 1)
trainEnd <- c(2015, 6)
testStart <- c(2015, 7)
testEnd <- c(2016, 6)
In [ ]:
buildArimaModel <- function(data, complaintType, order=c(12, 1, 1)) {
print(paste0("Building ARIMA model for ", complaintType))
raw <- data[[complaintType]]
# since pre-2012, there were very few data points, subset it. also, we only consider 2015 data
monthly <- window(raw, start=dataStart, end=dataEnd)
# split into 'test' and 'train' set
trainData <- window(monthly, start=trainStart, end=trainEnd)
testData <- window(monthly, start=testStart, end=testEnd)
# plot acf and pacf
tsdisplay(trainData, main="Plot of data")
fit <- Arima(trainData, order=order, method="ML")
plot(forecast(fit, h=12), main="Forecasts")
lines(testData, lty=2)
}
buildArimaModel(data, complaintTypes[4], order=c(12, 2, 4))
In [ ]:
## since there's no easy way to guess the (p, d, q) values for ARIMA,
# construct a grid of possible values, and find the best set of params that give best RMSE
grid <- expand.grid(p=seq(1,12), q=seq(1, 12), d=seq(1, 3))
head(grid)
In [ ]:
complaintTypes
In [ ]:
getMAPE <- function(trainData, validationData, order, seasonalOrder=c(0,0,0), lambda=NA) {
# get a MAPE of the forecase, after a model with the given params have been fitted
fit <- Arima(trainData, order=order, seasonal=seasonalOrder, method="ML")
acc <- accuracy(forecast(fit, h=9), validationData)
return(list("testMAPE"=acc[2,5], "trainMAPE"=acc[1,5], "testRMSE"=acc[2, 2]))
}
findBestParams <- function(grid, data, complaintType) {
start.time <- Sys.time()
print(paste0("Finding best (p, d, q) values for ", complaintType))
raw <- data[[complaintType]]
# since pre-2012, there were very few data points, subset it. also, we only consider 2015 data
# for grid, search, we need to split the data into 3 parts (simple cross-validation)
# a train set - used to fit the model
# a validation set - the MAPE values are computed on this to find the best value
# a test set - to get a accurate MAPE value
monthly <- window(raw, start=dataStart, end=dataEnd)
trainStart <- c(2012, 1)
trainEnd <- c(2015, 12)
# 3 months of data
validationStart <- c(2016, 1)
validationEnd <- c(2016, 3)
# 3 months of data
testStart <- c(2016, 4)
testEnd <- c(2016, 6)
# split into 'test' and 'train' set
trainData <- window(monthly, start=trainStart, end=trainEnd)
validationData <- window(monthly, start=validationStart, end=validationEnd)
testData <- window(monthly, start=testStart, end=testEnd)
# plot acf and pacf
tsdisplay(trainData, main="Plot of data")
bestParam <- NULL
bestMAPE <- list("testMAPE"=100.0, "trainMAPE"=100.0, "testRMSE"=9999.0)
for(i in 1:nrow(grid)) {
o <- grid[i,]
currentOrder <- c(o$p, o$d, o$q)
mapes <- tryCatch(
getMAPE(trainData, validationData, order=currentOrder),
error=function(e) e
)
if(inherits(mapes, "error")) next
if(mapes$testRMSE < bestMAPE$testRMSE) {
bestMAPE <- mapes
bestParam <- currentOrder
}
}
print("Best params")
print(bestParam)
print("with MAPE:")
print(bestMAPE)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(paste0("Time:", time.taken))
return(bestParam)
}
In [ ]:
paramList <- list()
for(complaintType in complaintTypes) {
bestParam <- findBestParams(grid, data, complaintType)
paramList[[complaintType]] <- bestParam
}
In [ ]:
# for some complaints, the method about doesn't really work well, for those I manually overrode them
paramListClone <- list()
for (complaintType in names(paramList)) {
paramListClone[[complaintType]] <- paramList[[complaintType]]
}
# Request to relay road
#paramList[[9]] <- c(9, 1, 4)
#paramList[[10]] <- c(12, 1, 4)
#paramList[[8]] <- c(12, 2, 1)
#paramList[[4]] <- c(12, 2, 4)
In [ ]:
# print best params for each complaint type, and save the results
for (complaintType in names(paramList)) {
print(complaintType)
print(paramList[[complaintType]])
o <- paramList[[complaintType]]
print(paste0("Building ARIMA model for ", complaintType))
raw <- data[[complaintType]]
# since pre-2012, there were very few data points, subset it. also, we only consider 2015 data
monthly <- window(raw, start=dataStart, end=dataEnd)
# split into 'test' and 'train' set
trainData <- window(monthly, start=trainStart, end=trainEnd)
testData <- window(monthly, start=testStart, end=testEnd)
fit <- Arima(trainData, order=o, method="ML")
png(filename = file.path(file.path(RESULTS_FOLDER, paste0(complaintType, ".png"))), res=500, width = 12, height = 8, units = 'in')
pred <- forecast(fit, h=12)
mapes <- accuracy(pred, testData)
testMape <- mapes[2, 5]
plot(pred, main=paste0("Forecasts for ", complaintType), xlab="Time", ylab="Complaints")
lines(testData, lty=2)
legend("topleft",col=c(1,1,4), lty=c(1, 2, 1),
legend=c("Actual Data", "Unseen data", "Prediction"))
legend("topright", legend=c(paste0("MAPE: ", testMape), paste0("Params:", paste(o, collapse=" "))))
dev.off()
}
In this section, we'll try to do some pre-processing of the data to see if it works for certain types of data. The first transformation is called the Box-Cox transform specified in here
In [ ]:
grid <- expand.grid(p=seq(1,12), q=seq(1, 6), d=seq(1, 3), P=seq(1, 6), Q=seq(1, 3), D=seq(1))
tail(grid)
In [ ]:
findBestParamsSeasonal <- function(grid, data, complaintType) {
start.time <- Sys.time()
print(paste0("Finding best [(p, d, q), (P,Q,D)] values for ", complaintType))
raw <- data[[complaintType]]
# since pre-2012, there were very few data points, subset it. also, we only consider 2015 data
# for grid, search, we need to split the data into 3 parts (simple cross-validation)
# a train set - used to fit the model
# a validation set - the MAPE values are computed on this to find the best value
# a test set - to get a accurate MAPE value
monthly <- window(raw, start=dataStart, end=dataEnd)
trainStart <- c(2012, 1)
trainEnd <- c(2015, 12)
# 3 months of data
validationStart <- c(2016, 1)
validationEnd <- c(2016, 3)
# 3 months of data
testStart <- c(2016, 4)
testEnd <- c(2016, 6)
# split into 'test' and 'train' set
trainData <- window(monthly, start=trainStart, end=trainEnd)
validationData <- window(monthly, start=validationStart, end=validationEnd)
testData <- window(monthly, start=testStart, end=testEnd)
# plot acf and pacf
tsdisplay(trainData, main="Plot of data")
bestParam <- NULL
bestMAPE <- list("testMAPE"=100.0, "trainMAPE"=100.0, "testRMSE"=9999.0)
for(i in 1:nrow(grid)) {
o <- grid[i,]
currentOrder <- c(o$p, o$d, o$q)
seasonalOrder <- c(o$P, o$D, o$q)
mapes <- tryCatch(
getMAPE(trainData, validationData, order=currentOrder, seasonalOrder = seasonalOrder),
error=function(e) e
)
if(inherits(mapes, "error")) next
if(mapes$testRMSE < bestMAPE$testRMSE) {
bestMAPE <- mapes
bestParam <- list(order=currentOrder, seasonal=seasonalOrder)
}
}
print("Best params")
print(bestParam)
print("with MAPE:")
print(bestMAPE)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(paste0("Time:", time.taken))
return(bestParam)
}
In [ ]:
bestParam <- findBestParamsSeasonal(grid, data, complaintTypes[1])