EDA for Ward x Complaint Type Time Series Analysis

Objectives:

  • Find out which Ward x Complaint Type is suitable for modeling using TSM
  • Find obvious patterns in the data. In particular, answer what complaint types occur predominatly during which time periods.
  • Construct data files to be used in TSM

In [ ]:
library(forecast)
library(xts)
library(lubridate)
library(dplyr)

In [ ]:
RAW_PATH = "../../cocUptoJuly2016.csv"

In [ ]:
nas <- c(NA, 'NA', '', ' ', 'NULL')
df <- read.csv(RAW_PATH, stringsAsFactors = F,
                    na.strings = nas)

In [ ]:
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
df$Resolution.Date <- as.Date(df$Resolution.Date, format = "%m/%d/%Y")
df$NumComplaints <- 1

# discard pre-2012 data, and post 2016 data
df <- df[df$Complaint.Date >= as.Date("01/01/2012", format = "%m/%d/%Y"), ] 
df <- df[df$Complaint.Date < as.Date("01/01/2016", format = "%m/%d/%Y"), ]

In [ ]:
head(df)

In [ ]:
table(df$Complaint.Type)

In [ ]:
# plot distribution of complaints
wardComplaints <- table(df$Ward)
wardComplaints <- wardComplaints[order(-wardComplaints)]
plot(wardComplaints)

In [ ]:
(wardComplaints / nrow(df)) * 100

In [ ]:
plot.ward.details <- function(df, ward) {
    df <- df[df$Ward == ward, ]
    series <- xts(df$NumComplaints, df$Complaint.Date)
    series <- apply.monthly(series, FUN = sum)    
    plot(series, main="Overall Complaint Distribution")
    
    # plot top 6 complaint types    
    complaintCounts <- table(df$Complaint.Type)
    complaintCounts <- complaintCounts[order(-complaintCounts)]    
    opar <- par(mfrow=c(3,2))
    for(i in 1:6) {
        complaintType <- names(complaintCounts[i])        
        sub <- df[df$Complaint.Type == complaintType, ]        
        series <- xts(sub$NumComplaints, sub$Complaint.Date)
        series <- apply.monthly(series, FUN = sum)    
        plot(series, main=paste0("Complaint Distribution for ", complaintType))
    }
    par(opar)
    
    df$Month <- month(df$Complaint.Date)
    df$Year <- year(df$Complaint.Date)
    
    monthly <- table(Month=df$Month)    
    plot(monthly)
    monthly <- as.data.frame(monthly[order(-monthly)])
    #monthly$Month <- month.abb[monthly$Month]
    print(monthly)    
    yearly <- as.data.frame(table(Month=df$Month, Year=df$Year))
    yearly$Month <- month.abb[yearly$Month]    
}
plot.ward.details(df, "N188")

In [ ]:
viz.heatmap <- function(df, num.complaint.types = 5, num.wards = 5, marg=c(15, 5)) {
    cpl.table <- table(df$Complaint.Type)
    cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
    cpl.top <- names(cpl.table[0:num.complaint.types, ])
    
    ward.table <- table(df$Ward)
    ward.table <- as.data.frame(ward.table[order(-ward.table)])
    ward.top <- names(ward.table[0:num.wards, ])
    
    subDf <- df[(df$Complaint.Type %in% top5) & (df$Ward %in% ward.top), ]
    
    heatmap(as.matrix(table(subDf$Ward, subDf$Complaint.Type)), margins=marg, Rowv=NA, Colv=NA)
}
viz.heatmap(df, 5, 5)

In [ ]:
# compute % ages
cpl.type <- "Change of address in Electoral Roll"
all.wards <- unique(df$Ward, as.factor=False)
all.wards <- data.frame(Ward=all.wards, stringsAsFactors = F)
nrow(all.wards)

In [ ]:
subDf <- df[(df$Complaint.Type == cpl.type), ]
head(subDf, 1)

In [ ]:
ward.table <- table(subDf$Ward)
ward.table

total.cpl <- sum(ward.table)
total.cpl

perc.table <- (ward.table / total.cpl) * 100

perc.table <- as.data.frame(perc.table, stringsAsFactors = F)
colnames(perc.table) <- c("Ward", "Percentage")
perc.table <- left_join(all.wards, perc.table, by = c("Ward"))
perc.table$Percentage[is.na(perc.table$Percentage)] <- 0.0
perc.table

sum(perc.table$Percentage)

In [ ]:
# put it in a nice function
compute.perc.simple <- function(df, cpl.type) {
    subDf <- df[(df$Complaint.Type == cpl.type), ]
    all.wards <- unique(df$Ward, as.factor=False)
    all.wards <- data.frame(Ward=all.wards, stringsAsFactors = F)
    subDf <- df[(df$Complaint.Type == cpl.type), ]
    
    ward.table <- table(subDf$Ward)
    total.cpl <- sum(ward.table)
    perc.table <- (ward.table / total.cpl)
    perc.table <- as.data.frame(perc.table, stringsAsFactors = F)
    colnames(perc.table) <- c("Ward", "Percentage")
    perc.table <- left_join(all.wards, perc.table, by = c("Ward"))
    perc.table$Percentage[is.na(perc.table$Percentage)] <- 0.0
    return(perc.table)
}

# we care about the top 10 complaints

num.complaint.types <- 10
cpl.table <- table(df$Complaint.Type)
cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
cpl.top <- names(cpl.table[0:num.complaint.types, ])
DATA_FOLDER <- "../time-series/data/wardLevel/percentages"
for(complaintType in cpl.top) {    
    perc.table <- compute.perc.simple(df, complaintType)
    # one complaint type has a '/' in it, which messes up the paths    
    path <- file.path(DATA_FOLDER, paste0(gsub("/", "-", complaintType), ".csv"))
    print(paste0("Saving file ", path))
    write.csv(perc.table, file=path, row.names=F)
}

In [ ]:
# helper function to load this data up
load.ward.percentages <- function(dataFolder) {
    data <- list()
    for(file in list.files(dataFolder)) {
        print(paste0("Loading: ", file))
        complaintType <- substr(file,1,(nchar(file))-4)
        df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)
        data[[complaintType]] <- df
    }
    data
}
load.ward.percentages(DATA_FOLDER)

In [ ]: