In [ ]:
library(hash)
library(xts)
library(lubridate)
library(forecast)
library(fpp)
In [ ]:
# Constants used throughout the code
# Path of the raw complaints CSV that is read into `df`.
INPUT_FILE <- "../../../cocUptoDec2016.csv"
# Folder where the per-complaint-type monthly CSV files are written and
# later read back from.
DATA_FOLDER <- "../data/topNComplaints"
In [ ]:
# Load the raw complaints data.
df <- read.csv(INPUT_FILE, stringsAsFactors = FALSE)
# Complaint dates are parsed as month/day/four-digit-year text.
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
# as.Date() turns anything it cannot parse into NA, which would make
# minDate/maxDate NA and break the year range built from them further down.
# Stop here with a clear message instead.
if (anyNA(df$Complaint.Date)) {
  stop(
    sum(is.na(df$Complaint.Date)),
    " row(s) in ", INPUT_FILE, " have a Complaint.Date that is missing or ",
    "not in %m/%d/%Y format",
    call. = FALSE
  )
}
# Each row counts as one complaint; summed later when rolling up by month.
df$NumComplaints <- 1
# Date range covered by the data.
minDate <- min(df$Complaint.Date)
maxDate <- max(df$Complaint.Date)
head(df)
In [ ]:
# Pick the most frequent complaint types, and model only that data.
# table() counts rows per complaint type; data.frame() exposes the counts as
# the columns Var1 (the type) and Freq (the count).
complaintCounts <- data.frame(table(df$Complaint.Type))
complaintCounts <- complaintCounts[order(-complaintCounts$Freq), ]
# head() returns at most 10 rows, so data with fewer than 10 distinct types
# does not get padded with NA entries the way indexing with `1:10` would.
topComplaintTypes <- as.character(head(complaintCounts, 10)$Var1)
print(topComplaintTypes)
In [ ]:
# Keep only the rows whose complaint type is one of the top types.
isTopType <- df$Complaint.Type %in% topComplaintTypes
data <- df[isTopType, ]
In [ ]:
# Sanity check: list the distinct complaint types left after filtering.
keptTypes <- unique(data$Complaint.Type)
print(keptTypes)
In [ ]:
# Create the 'ideal' data set: one row for every calendar month of every
# year between the first and the last complaint date. Merging monthly totals
# against this grid exposes the months that had no complaints at all.
minYear <- year(minDate)
maxYear <- year(maxDate)
yearRange <- seq(from = minYear, to = maxYear)
# Built in one vectorised step instead of appending a row per iteration.
# Rows are ordered Jan..Dec within each year, years ascending.
# Year is stored as an integer, as the column was originally declared;
# appending c(month, year) rows used to coerce it to character.
ideal <- data.frame(
  Month = rep(month.abb, times = length(yearRange)),
  Year = rep(as.integer(yearRange), each = length(month.abb)),
  stringsAsFactors = FALSE
)
In [ ]:
# Preview the first rows of the `ideal` month/year grid.
head(ideal)
In [ ]:
# Roll the complaints of one complaint type up to monthly totals.
#
# Args:
#   data: data frame with the columns Complaint.Type, Complaint.Date and
#     NumComplaints.
#   complaintType: the Complaint.Type value to keep.
#
# Returns:
#   A data frame with the columns Month (abbreviated month name), Year and
#   Complaints, sorted chronologically. Months present in the global `ideal`
#   grid but absent from the data get a Complaints value of 0.
#
# Note: reads the global `ideal` data frame (columns Month and Year).
constructMonthlyData <- function(data, complaintType) {
  d <- data[data$Complaint.Type == complaintType, ]
  # create xts object for rolling up the data
  series <- xts(d$NumComplaints, d$Complaint.Date)
  series <- apply.monthly(series, FUN = sum)
  # create a df for easy access
  monthlyData <- data.frame(Date = index(series), Complaints = coredata(series))
  # create columns for join
  monthlyData$Month <- month.abb[month(monthlyData$Date)]
  monthlyData$Year <- year(monthlyData$Date)
  # full outer join, so months without any complaint show up as NA rows
  joined <- merge(
    x = ideal, y = monthlyData, by = c("Month", "Year"),
    sort = FALSE, all = TRUE
  )
  # don't need date
  joined$Date <- NULL
  # merge() does not return the rows in calendar order, so sort by year-month
  monthKey <- as.yearmon(paste0(joined$Year, "-", joined$Month), "%Y-%b")
  joined <- joined[order(monthKey), ]
  # Replace missing totals with 0. Indexing the column (rather than assigning
  # into a row subset) also works when no month is missing; the row-subset
  # form fails with "replacement has 1 row, data has 0" in that case.
  joined$Complaints[is.na(joined$Complaints)] <- 0
  joined
}
# Create the files: one CSV of monthly totals per top complaint type.
# Make sure the output folder exists first; write.csv() does not create it.
dir.create(DATA_FOLDER, recursive = TRUE, showWarnings = FALSE)
for (complaintType in topComplaintTypes) {
  joined <- constructMonthlyData(data, complaintType)
  # one complaint type has a '/' in it, which messes up the paths
  fileName <- paste0(gsub("/", "", complaintType, fixed = TRUE), ".csv")
  path <- file.path(DATA_FOLDER, fileName)
  print(paste0("Saving file ", path))
  write.csv(joined, file = path, row.names = FALSE)
}
In [ ]:
# Try the roll-up on a single complaint type: the first of the top types.
complaintType <- topComplaintTypes[1]
monthly <- constructMonthlyData(data = data, complaintType = complaintType)
# Show the monthly totals.
monthly[["Complaints"]]
In [ ]:
# Turn the monthly totals into a ts object with 12 observations per year,
# starting at period 1 of the first year in the data.
monthly <- ts(data = monthly[["Complaints"]], frequency = 12, start = c(minYear, 1))
In [ ]:
# Print the monthly time series.
print(monthly)
In [ ]:
# Seasonal plot of the monthly series, one line per year, with the year
# labelled at both ends of each line.
seasonplot(monthly, ylab = "Number of complaints", xlab = "Year",
           main = paste0("Seasonal plot for ", complaintType),
           year.labels = TRUE, year.labels.left = TRUE, col = 1:20, pch = 19)
In [ ]:
# Fit four baseline forecasting methods (mean, naive, seasonal naive, drift)
# to a training window of a monthly series, plot their forecasts against the
# held-out actuals, and print the accuracy measures of each method.
#
# Args:
#   monthly: monthly ts object of complaint counts.
#   complaintType: label used in the plot title and the printed headings.
#   trainStart, trainEnd: c(year, month) bounds of the training window.
#   testEnd: c(year, month) end of the held-out window; the window starts at
#     trainEnd.
#   h: number of months to forecast.
#
# Returns:
#   Invisibly, the accuracy table of the drift method (the last one printed).
naiveMethodsPlot <- function(monthly, complaintType,
                             trainStart = c(2013, 1), trainEnd = c(2015, 1),
                             testEnd = c(2015, 12), h = 12) {
  # The held-out window starts at the last training point, so the dashed
  # "actual" line in the plot begins where the training history ends.
  testStart <- trainEnd
  training <- window(monthly, start = trainStart, end = trainEnd)
  actual <- window(monthly, start = testStart, end = testEnd)

  meanFit <- meanf(training, h = h)
  naiveFit <- naive(training, h = h)
  seasonalFit <- snaive(training, h = h)
  driftFit <- rwf(training, h = h, drift = TRUE)

  # NOTE(review): newer forecast releases appear to call this argument `PI`
  # rather than `plot.conf` -- confirm against the installed version.
  plot(meanFit, plot.conf = FALSE,
       main = paste0("Forecasts for ", complaintType))
  lines(actual, lty = 2)
  lines(naiveFit$mean, col = 2)
  lines(seasonalFit$mean, col = 3)
  lines(driftFit$mean, col = 6)
  legend("topleft", col = c(1, 4, 2, 3, 6), lty = c(2, 1, 1, 1, 1),
         legend = c("Actual Data", "Pred: Mean method",
                    "Pred: Naive method", "Pred: Seasonal naive method",
                    "Pred: Drift Method"))

  # Score every method against the same held-out window that is plotted,
  # rather than against the first half of the last year in the data.
  fits <- list(
    "Mean Method" = meanFit,
    "Naive Method" = naiveFit,
    "Seasonal Method" = seasonalFit,
    "Drift Method" = driftFit
  )
  lastAccuracy <- NULL
  for (method in names(fits)) {
    print(paste0(complaintType, ": ", method))
    lastAccuracy <- print(accuracy(fits[[method]], actual))
  }
  invisible(lastAccuracy)
}
# Run the baseline comparison for the first of the top complaint types.
naiveMethodsPlot(monthly = monthly, complaintType = topComplaintTypes[1])
In [ ]:
# Do this for the other complaint types as well.
# `[-1]` drops the first type, which was already handled. Unlike
# `2:length(x)`, it yields an empty vector (and no iterations) when there is
# only one type, instead of counting down through c(2, 1).
for (complaintType in topComplaintTypes[-1]) {
  monthly <- constructMonthlyData(data, complaintType)
  monthly <- ts(monthly$Complaints, start = c(minYear, 1), frequency = 12)
  naiveMethodsPlot(monthly, complaintType)
}
In [ ]:
# Load every monthly CSV in a folder as a monthly time series.
#
# Args:
#   dataFolder: folder containing CSV files with at least the columns Year
#     and Complaints, one row per month.
#
# Returns:
#   A list of ts objects (frequency 12), named after the files without their
#   extension. Each series starts at period 1 of the smallest Year in its
#   file. Files that do not end in .csv are ignored.
loadData <- function(dataFolder) {
  # Only pick up CSV files, so stray files in the folder are not parsed as data.
  files <- list.files(dataFolder, pattern = "\\.csv$", ignore.case = TRUE)
  series <- lapply(files, function(file) {
    monthlyData <- read.csv(file.path(dataFolder, file),
                            stringsAsFactors = FALSE)
    ts(monthlyData$Complaints,
       start = c(min(monthlyData$Year), 1), frequency = 12)
  })
  # The complaint type is the file name without its extension.
  names(series) <- tools::file_path_sans_ext(files)
  series
}
# Read the saved per-type CSV files back in and show the resulting series.
loadedSeries <- loadData(dataFolder = DATA_FOLDER)
print(loadedSeries)