In [ ]:
library(hash)
library(xts)
library(lubridate)
library(forecast)
library(fpp)

In [ ]:
# Constants used throughout the code
# INPUT_FILE: CSV of raw complaint records (read with read.csv below)
INPUT_FILE <- "../../../cocUptoDec2016.csv"
# DATA_FOLDER: folder the per-complaint-type monthly CSV files are written to and read back from
DATA_FOLDER <- "../data/topNComplaints"

Base Vignette

Purpose:

  • To provide a quick-start code snippet that loads the data into a usable format for the forecasting modules
  • Establish a baseline forecast

In [ ]:
# load the data
df <- read.csv(INPUT_FILE, stringsAsFactors = FALSE)
# complaint dates are parsed as month/day/year; values that do not match become NA
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
# every row counts as one complaint, so monthly totals can be obtained by summing
df$NumComplaints <- 1
# date range covered by the data; na.rm = TRUE so that a single unparseable
# date does not turn the whole range into NA
minDate <- min(df$Complaint.Date, na.rm = TRUE)
maxDate <- max(df$Complaint.Date, na.rm = TRUE)

head(df)

In [ ]:
# pick top complaint types, and model only that data
# count the rows per complaint type (columns: Var1 = type, Freq = count)
topComplaintTypes <- data.frame(table(df$Complaint.Type))
# most frequent first
topComplaintTypes <- topComplaintTypes[order(-topComplaintTypes$Freq), ]
# head() rather than [1:10, ]: with fewer than ten distinct types the index
# form would pad the result with NA rows
topComplaintTypes <- head(topComplaintTypes, 10)
# keep just the type names, as plain strings
topComplaintTypes <- as.character(topComplaintTypes$Var1)
print(topComplaintTypes)

In [ ]:
# keep only the rows whose complaint type is one of the top complaint types
data <- df[df$Complaint.Type %in% topComplaintTypes, ]

In [ ]:
# sanity check: list the complaint types that remain after filtering
print(unique(data$Complaint.Type))

Create data files

For ease of modeling, construct data in the following format: "Month", "Year", "Complaints" , with missing values filled in.


In [ ]:
# create the 'ideal' data set: one row per (Month, Year) for every calendar
# month between the first and the last year present in the data
minYear <- year(minDate)
maxYear <- year(maxDate)
# expand.grid varies its first factor fastest, so the rows come out as
# Jan..Dec of minYear, then Jan..Dec of the next year, and so on.
# Year stays an integer column (building rows with c(month, year) would
# silently coerce it to character).
ideal <- expand.grid(
    Month = month.abb,
    Year = as.integer(seq(from = minYear, to = maxYear)),
    stringsAsFactors = FALSE,
    KEEP.OUT.ATTRS = FALSE
)

In [ ]:
# preview the first rows of the month/year grid
head(ideal)

In [ ]:
# Build a gap-free table of monthly complaint counts for one complaint type.
#
# Args:
#   data:          data frame with the columns Complaint.Type, Complaint.Date
#                  and NumComplaints.
#   complaintType: the complaint type to aggregate.
#
# Returns:
#   A data frame with the columns Month, Year and Complaints, in chronological
#   order. Months that are in the global `ideal` grid but have no complaints
#   get a count of 0.
#
# Note: reads the global data frame `ideal` (columns Month, Year).
constructMonthlyData <- function(data, complaintType) {
    d <- data[data$Complaint.Type == complaintType, ]
    # create xts object for rolling up the data to monthly totals
    series <- xts(d$NumComplaints, d$Complaint.Date)
    series <- apply.monthly(series, FUN = sum)
    # create a df for easy access; as.numeric() flattens the one-column matrix
    # returned by coredata() so the column is reliably named "Complaints"
    monthlyData <- data.frame(Date = index(series),
                              Complaints = as.numeric(coredata(series)))
    # create columns for join
    monthlyData$Month <- month.abb[month(monthlyData$Date)]
    monthlyData$Year <- year(monthlyData$Date)
    joined <- merge(x = ideal, y = monthlyData, by = c("Month", "Year"),
                    sort = FALSE, all = TRUE)
    # don't need date
    joined$Date <- NULL
    # sort chronologically; matching against month.abb avoids parsing month
    # names with "%b", which depends on the session locale
    joined <- joined[order(as.integer(joined$Year),
                           match(joined$Month, month.abb)), ]
    # months without complaints come out of the merge as NA. Assign through the
    # column: the row-subset form fails when there are no NA rows at all.
    joined$Complaints[is.na(joined$Complaints)] <- 0
    joined
}
# create the files: one CSV of monthly counts per top complaint type
# make sure the output folder exists, otherwise write.csv cannot open the file
dir.create(DATA_FOLDER, recursive = TRUE, showWarnings = FALSE)
for (complaintType in topComplaintTypes) {
    joined <- constructMonthlyData(data, complaintType)
    # one complaint type has a '/' in it, which messes up the paths
    fileName <- paste0(gsub("/", "", complaintType, fixed = TRUE), ".csv")
    path <- file.path(DATA_FOLDER, fileName)
    print(paste0("Saving file ", path))
    write.csv(joined, file = path, row.names = FALSE)
}

Baseline method

The purpose of this exercise is to establish a baseline to help us compare the 'naive' method with ML models.

The modelling methods used are described in detail here.


In [ ]:
# trying it with one complaint type
# (the first entry of topComplaintTypes)
complaintType <- topComplaintTypes[1]
monthly <- constructMonthlyData(data, complaintType)
# show the monthly complaint counts
monthly$Complaints

In [ ]:
# convert it to a ts object
# monthly frequency, starting in January of the first year in the data
monthly <- ts(monthly$Complaints, start=c(minYear, 1), frequency = 12)

In [ ]:
# display the monthly time series
print(monthly)

In [ ]:
# seasonal plot: one line per year, so the years can be compared month by month
seasonplot(monthly, ylab = "Number of complaints", xlab = "Year",
  main = paste0("Seasonal plot for ", complaintType),
  year.labels = TRUE, year.labels.left = TRUE, col = 1:20, pch = 19)

In [ ]:
# Fit four baseline forecasts (mean, naive, seasonal naive, drift) on a fixed
# training window, plot them against the actual data and print their accuracy.
#
# Args:
#   monthly:       monthly ts object of complaint counts.
#   complaintType: label used in the plot title and in the printed headings.
#
# Returns:
#   Invisibly, the accuracy matrix of the drift method (the last one printed).
naiveMethodsPlot <- function(monthly, complaintType) {
    h <- 12
    trainStart <- c(2013, 1)
    # NOTE(review): the training window includes January 2015 - confirm that
    # this, rather than December 2014, is the intended cut-off
    trainEnd <- c(2015, 1)
    # the plotted actuals start at the last training month
    testStart <- trainEnd
    testEnd <- c(2015, 12)
    monthly2 <- window(monthly, start = trainStart, end = trainEnd)
    monthlyAfter <- window(monthly, start = testStart, end = testEnd)
    monthlyfit1 <- meanf(monthly2, h = h)
    monthlyfit2 <- naive(monthly2, h = h)
    monthlyfit3 <- snaive(monthly2, h = h)
    monthlyfit4 <- rwf(monthly2, h = h, drift = TRUE)

    plot(monthlyfit1, plot.conf=FALSE,
      main = paste0("Forecasts for ", complaintType))
    lines(monthlyAfter, lty = 2)
    lines(monthlyfit2$mean, col = 2)
    lines(monthlyfit3$mean, col = 3)
    lines(monthlyfit4$mean, col = 6)
    legend("topleft", col = c(1, 4, 2, 3, 6), lty = c(2, 1, 1, 1, 1),
      legend = c("Actual Data", "Pred: Mean method",
                 "Pred: Naive method", "Pred: Seasonal naive method",
                 "Pred: Drift Method"))

    # Score the forecasts on the months they actually predict: the hold-out
    # runs from the first forecast month to the last one, cut short if the
    # series ends earlier. (Scoring against January-June of the last year in
    # the data would barely overlap the forecast horizon.)
    forecastSpan <- tsp(monthlyfit1$mean)
    holdout <- window(monthly, start = forecastSpan[1],
                      end = min(forecastSpan[2], tsp(monthly)[2]))

    fits <- list("Mean Method" = monthlyfit1,
                 "Naive Method" = monthlyfit2,
                 "Seasonal Method" = monthlyfit3,
                 "Drift Method" = monthlyfit4)
    lastAccuracy <- NULL
    for (methodName in names(fits)) {
        print(paste0(complaintType, ": ", methodName))
        lastAccuracy <- accuracy(fits[[methodName]], holdout)
        print(lastAccuracy)
    }
    invisible(lastAccuracy)
}
naiveMethodsPlot(monthly, topComplaintTypes[1])

In [ ]:
# do this for other complaint types as well
# [-1] drops the first type, which was handled above; unlike 2:length(x) it
# yields an empty vector, not c(2, 1), when there is only one type
for (complaintType in topComplaintTypes[-1]) {
    monthly <- constructMonthlyData(data, complaintType)
    monthly <- ts(monthly$Complaints, start = c(minYear, 1), frequency = 12)
    naiveMethodsPlot(monthly, complaintType)
}

Boilerplate Code

The code below is boilerplate that loads the saved data files into a usable format (one time series per complaint type).


In [ ]:
# Load every monthly complaint CSV in a folder as a time series.
#
# Args:
#   dataFolder: folder containing CSV files with the columns Year and
#               Complaints (one row per month).
#
# Returns:
#   A named list of monthly ts objects, one per CSV file, named after the file
#   without its extension. Empty list if the folder holds no CSV files.
loadData <- function(dataFolder) {
    # only pick up CSV files; any other file in the folder would otherwise be
    # parsed as data and break the ts() call
    files <- list.files(dataFolder, pattern = "\\.csv$", ignore.case = TRUE)
    data <- list()
    for (file in files) {
        df <- read.csv(file.path(dataFolder, file), stringsAsFactors = FALSE)
        minYear <- min(df$Year)
        complaintType <- tools::file_path_sans_ext(file)
        # assumes the rows are in chronological order and start in January of
        # the earliest year - TODO confirm for files not produced by this notebook
        tsObject <- ts(df$Complaints, start = c(minYear, 1), frequency = 12)
        data[[complaintType]] <- tsObject
    }
    data
}
# load every saved series from DATA_FOLDER and display them
print(loadData(DATA_FOLDER))