In [ ]:
library(hash)
library(xts)
library(lubridate)
library(forecast)
library(fpp)

In [ ]:
DATA_FOLDER <- "/home/samarth/workspaces/datakind-workspace/analytics/time-series/data/topNComplaints"
RESULTS_FOLDER <- "/home/samarth/workspaces/datakind-workspace/analytics/time-series/results/topNComplaints/arima"

In [ ]:
loadData <- function(dataFolder) {
    files <- list.files(dataFolder)
    data <- list()
    for(file in files) {    
        df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)    
        minYear <- min(df$Year)
        complaintType <- substr(file,1,(nchar(file))-4)    
        tsObject <- ts(df$Complaints, start=c(minYear, 1), frequency = 12)
        data[[complaintType]] <- tsObject
    }
    data
}
data <- loadData(DATA_FOLDER)
complaintTypes <- names(data)
data[[complaintTypes[1]]]

In [ ]:
# time windows for the data
dataStart <- c(2012, 1)
dataEnd <- c(2016, 6)
trainStart <- c(2012, 1)
trainEnd <- c(2015, 6)
testStart <- c(2015, 7)
testEnd <- c(2016, 6)

In [ ]:
buildArimaModel <- function(data, complaintType, order=c(12, 1, 1)) {
    print(paste0("Building ARIMA model for ", complaintType))
    raw <- data[[complaintType]]        
    # since pre-2012, there were very few data points, subset it. also, we only consider 2015 data    
    monthly <- window(raw, start=dataStart, end=dataEnd)        
    # split into 'test' and 'train' set
    trainData <- window(monthly, start=trainStart, end=trainEnd)
    testData <- window(monthly, start=testStart, end=testEnd)
    # plot acf and pacf    
    tsdisplay(trainData, main="Plot of data")            
    fit <- Arima(trainData, order=order, method="ML")    
    plot(forecast(fit, h=12), main="Forecasts")
    lines(testData, lty=2)
}
buildArimaModel(data, complaintTypes[4], order=c(12, 2, 4))

In [ ]:
## since there's no easy way to guess the (p, d, q) values for ARIMA, 
# construct a grid of possible values, and find the best set of params that give best RMSE
grid <- expand.grid(p=seq(1,12), q=seq(1, 12), d=seq(1, 3))
head(grid)

In [ ]:
complaintTypes

In [ ]:
getMAPE <- function(trainData, validationData, order, seasonalOrder=c(0,0,0), lambda=NA) {
    # get a MAPE of the forecase, after a model with the given params have been fitted
    fit <- Arima(trainData, order=order, seasonal=seasonalOrder, method="ML")        
    acc <- accuracy(forecast(fit, h=9), validationData)
    return(list("testMAPE"=acc[2,5], "trainMAPE"=acc[1,5], "testRMSE"=acc[2, 2]))
}
findBestParams <- function(grid, data, complaintType) {    
    start.time <- Sys.time()
    print(paste0("Finding best (p, d, q) values for ", complaintType))
    raw <- data[[complaintType]]        
    # since pre-2012, there were very few data points, subset it. also, we only consider 2015 data 
    # for grid, search, we need to split the data into 3 parts (simple cross-validation)
    # a train set - used to fit the model
    # a validation set - the MAPE values are computed on this to find the best value
    # a test set - to get a accurate MAPE value    
    monthly <- window(raw, start=dataStart, end=dataEnd)    
    trainStart <- c(2012, 1)
    trainEnd <- c(2015, 12)
    # 3 months of data
    validationStart <- c(2016, 1)
    validationEnd <- c(2016, 3)
    # 3 months of data
    testStart <- c(2016, 4)
    testEnd <- c(2016, 6)
    
    # split into 'test' and 'train' set
    trainData <- window(monthly, start=trainStart, end=trainEnd)
    validationData <- window(monthly, start=validationStart, end=validationEnd)
    testData <- window(monthly, start=testStart, end=testEnd)
    # plot acf and pacf
    tsdisplay(trainData, main="Plot of data")
    bestParam <- NULL
    bestMAPE <- list("testMAPE"=100.0, "trainMAPE"=100.0, "testRMSE"=9999.0)
    for(i in 1:nrow(grid)) {
        o <- grid[i,]
        currentOrder <- c(o$p, o$d, o$q)
        mapes <- tryCatch(
          getMAPE(trainData, validationData, order=currentOrder),
          error=function(e) e
        )
        if(inherits(mapes, "error")) next        
        if(mapes$testRMSE < bestMAPE$testRMSE) {
            bestMAPE <- mapes
            bestParam <- currentOrder
        }        
    }
    print("Best params")
    print(bestParam)
    print("with MAPE:")
    print(bestMAPE)
    end.time <- Sys.time()
    time.taken <- end.time - start.time
    print(paste0("Time:", time.taken))
    return(bestParam)
}

In [ ]:
paramList <- list()
for(complaintType in complaintTypes) {
    bestParam <- findBestParams(grid, data, complaintType)
    paramList[[complaintType]] <- bestParam
}

In [ ]:
# for some complaints, the method about doesn't really work well, for those I manually overrode them
paramListClone <- list()
for (complaintType in names(paramList)) {
    paramListClone[[complaintType]] <- paramList[[complaintType]]
}

# Request to relay road
#paramList[[9]] <- c(9, 1, 4)
#paramList[[10]] <- c(12, 1, 4)
#paramList[[8]] <- c(12, 2, 1)
#paramList[[4]] <- c(12, 2, 4)

In [ ]:
# print best params for each complaint type, and save the results
for (complaintType in names(paramList)) {
    print(complaintType)
    print(paramList[[complaintType]])
    o <- paramList[[complaintType]]    
    print(paste0("Building ARIMA model for ", complaintType))
    raw <- data[[complaintType]]        
    # since pre-2012, there were very few data points, subset it. also, we only consider 2015 data    
    monthly <- window(raw, start=dataStart, end=dataEnd)        
    # split into 'test' and 'train' set
    trainData <- window(monthly, start=trainStart, end=trainEnd)
    testData <- window(monthly, start=testStart, end=testEnd)        
    fit <- Arima(trainData, order=o, method="ML")
    png(filename = file.path(file.path(RESULTS_FOLDER, paste0(complaintType, ".png"))), res=500,  width = 12, height = 8, units = 'in')    
    pred <- forecast(fit, h=12)
    mapes <- accuracy(pred, testData)
    testMape <- mapes[2, 5]    
    plot(pred, main=paste0("Forecasts for ", complaintType), xlab="Time", ylab="Complaints")    
    lines(testData, lty=2)
    legend("topleft",col=c(1,1,4), lty=c(1, 2, 1),
      legend=c("Actual Data", "Unseen data", "Prediction"))    
    legend("topright", legend=c(paste0("MAPE: ", testMape), paste0("Params:", paste(o, collapse=" "))))
    dev.off()    
}

ARIMA with pre-processing of data

In this section, we'll try to do some pre-processing of the data to see if it works for certain types of data. The first transformation is called the Box-Cox transform specified in here

Seasonal ARIMA

Seasonal arima or SARIMA models have two parts to it - a non-seasonal part and a seasonal part. In the R, library, you can set the params using P,D,Q in the Arima method


In [ ]:
grid <- expand.grid(p=seq(1,12), q=seq(1, 6), d=seq(1, 3), P=seq(1, 6), Q=seq(1, 3), D=seq(1))
tail(grid)

In [ ]:
findBestParamsSeasonal <- function(grid, data, complaintType) {    
    start.time <- Sys.time()
    print(paste0("Finding best [(p, d, q), (P,Q,D)] values for ", complaintType))
    raw <- data[[complaintType]]        
    # since pre-2012, there were very few data points, subset it. also, we only consider 2015 data 
    # for grid, search, we need to split the data into 3 parts (simple cross-validation)
    # a train set - used to fit the model
    # a validation set - the MAPE values are computed on this to find the best value
    # a test set - to get a accurate MAPE value    
    monthly <- window(raw, start=dataStart, end=dataEnd)    
    trainStart <- c(2012, 1)
    trainEnd <- c(2015, 12)
    # 3 months of data
    validationStart <- c(2016, 1)
    validationEnd <- c(2016, 3)
    # 3 months of data
    testStart <- c(2016, 4)
    testEnd <- c(2016, 6)
    
    # split into 'test' and 'train' set
    trainData <- window(monthly, start=trainStart, end=trainEnd)
    validationData <- window(monthly, start=validationStart, end=validationEnd)
    testData <- window(monthly, start=testStart, end=testEnd)
    # plot acf and pacf
    tsdisplay(trainData, main="Plot of data")
    bestParam <- NULL
    bestMAPE <- list("testMAPE"=100.0, "trainMAPE"=100.0, "testRMSE"=9999.0)
    for(i in 1:nrow(grid)) {
        o <- grid[i,]
        currentOrder <- c(o$p, o$d, o$q)
        seasonalOrder <- c(o$P, o$D, o$q)
        mapes <- tryCatch(
          getMAPE(trainData, validationData, order=currentOrder, seasonalOrder = seasonalOrder),
          error=function(e) e
        )
        if(inherits(mapes, "error")) next        
        if(mapes$testRMSE < bestMAPE$testRMSE) {
            bestMAPE <- mapes
            bestParam <- list(order=currentOrder, seasonal=seasonalOrder)
        }        
    }
    print("Best params")
    print(bestParam)
    print("with MAPE:")
    print(bestMAPE)
    end.time <- Sys.time()
    time.taken <- end.time - start.time
    print(paste0("Time:", time.taken))
    return(bestParam)
}

In [ ]:
bestParam <- findBestParamsSeasonal(grid, data, complaintTypes[1])