In [ ]:
library(forecast)
library(xts)
library(lubridate)
library(dplyr)
In [ ]:
RAW_PATH = "../../cocUptoJuly2016.csv"
In [ ]:
nas <- c(NA, 'NA', '', ' ', 'NULL')
df <- read.csv(RAW_PATH, stringsAsFactors = F,
na.strings = nas)
In [ ]:
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
df$Resolution.Date <- as.Date(df$Resolution.Date, format = "%m/%d/%Y")
df$NumComplaints <- 1
# discard pre-2012 data, and post 2016 data
df <- df[df$Complaint.Date >= as.Date("01/01/2012", format = "%m/%d/%Y"), ]
df <- df[df$Complaint.Date < as.Date("01/01/2016", format = "%m/%d/%Y"), ]
In [ ]:
head(df)
In [ ]:
table(df$Complaint.Type)
In [ ]:
# plot distribution of complaints
wardComplaints <- table(df$Ward)
wardComplaints <- wardComplaints[order(-wardComplaints)]
plot(wardComplaints)
In [ ]:
(wardComplaints / nrow(df)) * 100
In [ ]:
plot.ward.details <- function(df, ward) {
df <- df[df$Ward == ward, ]
series <- xts(df$NumComplaints, df$Complaint.Date)
series <- apply.monthly(series, FUN = sum)
plot(series, main="Overall Complaint Distribution")
# plot top 6 complaint types
complaintCounts <- table(df$Complaint.Type)
complaintCounts <- complaintCounts[order(-complaintCounts)]
opar <- par(mfrow=c(3,2))
for(i in 1:6) {
complaintType <- names(complaintCounts[i])
sub <- df[df$Complaint.Type == complaintType, ]
series <- xts(sub$NumComplaints, sub$Complaint.Date)
series <- apply.monthly(series, FUN = sum)
plot(series, main=paste0("Complaint Distribution for ", complaintType))
}
par(opar)
df$Month <- month(df$Complaint.Date)
df$Year <- year(df$Complaint.Date)
monthly <- table(Month=df$Month)
plot(monthly)
monthly <- as.data.frame(monthly[order(-monthly)])
#monthly$Month <- month.abb[monthly$Month]
print(monthly)
yearly <- as.data.frame(table(Month=df$Month, Year=df$Year))
yearly$Month <- month.abb[yearly$Month]
}
plot.ward.details(df, "N188")
In [ ]:
viz.heatmap <- function(df, num.complaint.types = 5, num.wards = 5, marg=c(15, 5)) {
cpl.table <- table(df$Complaint.Type)
cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
cpl.top <- names(cpl.table[0:num.complaint.types, ])
ward.table <- table(df$Ward)
ward.table <- as.data.frame(ward.table[order(-ward.table)])
ward.top <- names(ward.table[0:num.wards, ])
subDf <- df[(df$Complaint.Type %in% top5) & (df$Ward %in% ward.top), ]
heatmap(as.matrix(table(subDf$Ward, subDf$Complaint.Type)), margins=marg, Rowv=NA, Colv=NA)
}
viz.heatmap(df, 5, 5)
In [ ]:
# compute % ages
cpl.type <- "Change of address in Electoral Roll"
all.wards <- unique(df$Ward, as.factor=False)
all.wards <- data.frame(Ward=all.wards, stringsAsFactors = F)
nrow(all.wards)
In [ ]:
subDf <- df[(df$Complaint.Type == cpl.type), ]
head(subDf, 1)
In [ ]:
ward.table <- table(subDf$Ward)
ward.table
total.cpl <- sum(ward.table)
total.cpl
perc.table <- (ward.table / total.cpl) * 100
perc.table <- as.data.frame(perc.table, stringsAsFactors = F)
colnames(perc.table) <- c("Ward", "Percentage")
perc.table <- left_join(all.wards, perc.table, by = c("Ward"))
perc.table$Percentage[is.na(perc.table$Percentage)] <- 0.0
perc.table
sum(perc.table$Percentage)
In [ ]:
# put it in a nice function
compute.perc.simple <- function(df, cpl.type) {
subDf <- df[(df$Complaint.Type == cpl.type), ]
all.wards <- unique(df$Ward, as.factor=False)
all.wards <- data.frame(Ward=all.wards, stringsAsFactors = F)
subDf <- df[(df$Complaint.Type == cpl.type), ]
ward.table <- table(subDf$Ward)
total.cpl <- sum(ward.table)
perc.table <- (ward.table / total.cpl)
perc.table <- as.data.frame(perc.table, stringsAsFactors = F)
colnames(perc.table) <- c("Ward", "Percentage")
perc.table <- left_join(all.wards, perc.table, by = c("Ward"))
perc.table$Percentage[is.na(perc.table$Percentage)] <- 0.0
return(perc.table)
}
# we care about the top 10 complaints
num.complaint.types <- 10
cpl.table <- table(df$Complaint.Type)
cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
cpl.top <- names(cpl.table[0:num.complaint.types, ])
DATA_FOLDER <- "../time-series/data/wardLevel/percentages"
for(complaintType in cpl.top) {
perc.table <- compute.perc.simple(df, complaintType)
# one complaint type has a '/' in it, which messes up the paths
path <- file.path(DATA_FOLDER, paste0(gsub("/", "-", complaintType), ".csv"))
print(paste0("Saving file ", path))
write.csv(perc.table, file=path, row.names=F)
}
In [ ]:
# helper function to load this data up
load.ward.percentages <- function(dataFolder) {
data <- list()
for(file in list.files(dataFolder)) {
print(paste0("Loading: ", file))
complaintType <- substr(file,1,(nchar(file))-4)
df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)
data[[complaintType]] <- df
}
data
}
load.ward.percentages(DATA_FOLDER)
In [ ]: