EDA for Ward x Complaint Type Time Series Analysis

Objectives:

Find out which Ward x Complaint Type is suitable for modeling using TSM
Find obvious patterns in the data. In particular, answer what complaint types occur predominatly during which time periods.
Construct data files to be used in TSM



In [ ]:

    
library(forecast)
library(xts)
library(lubridate)
library(dplyr)



In [ ]:

    
RAW_PATH = "../../cocUptoJuly2016.csv"



In [ ]:

    
nas <- c(NA, 'NA', '', ' ', 'NULL')
df <- read.csv(RAW_PATH, stringsAsFactors = F,
                    na.strings = nas)



In [ ]:

    
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
df$Resolution.Date <- as.Date(df$Resolution.Date, format = "%m/%d/%Y")
df$NumComplaints <- 1

# discard pre-2012 data, and post 2016 data
df <- df[df$Complaint.Date >= as.Date("01/01/2012", format = "%m/%d/%Y"), ] 
df <- df[df$Complaint.Date < as.Date("01/01/2016", format = "%m/%d/%Y"), ]



In [ ]:

    
head(df)



In [ ]:

    
table(df$Complaint.Type)



In [ ]:

    
# plot distribution of complaints
wardComplaints <- table(df$Ward)
wardComplaints <- wardComplaints[order(-wardComplaints)]
plot(wardComplaints)



In [ ]:

    
(wardComplaints / nrow(df)) * 100



In [ ]:

    
plot.ward.details <- function(df, ward) {
    df <- df[df$Ward == ward, ]
    series <- xts(df$NumComplaints, df$Complaint.Date)
    series <- apply.monthly(series, FUN = sum)    
    plot(series, main="Overall Complaint Distribution")
    
    # plot top 6 complaint types    
    complaintCounts <- table(df$Complaint.Type)
    complaintCounts <- complaintCounts[order(-complaintCounts)]    
    opar <- par(mfrow=c(3,2))
    for(i in 1:6) {
        complaintType <- names(complaintCounts[i])        
        sub <- df[df$Complaint.Type == complaintType, ]        
        series <- xts(sub$NumComplaints, sub$Complaint.Date)
        series <- apply.monthly(series, FUN = sum)    
        plot(series, main=paste0("Complaint Distribution for ", complaintType))
    }
    par(opar)
    
    df$Month <- month(df$Complaint.Date)
    df$Year <- year(df$Complaint.Date)
    
    monthly <- table(Month=df$Month)    
    plot(monthly)
    monthly <- as.data.frame(monthly[order(-monthly)])
    #monthly$Month <- month.abb[monthly$Month]
    print(monthly)    
    yearly <- as.data.frame(table(Month=df$Month, Year=df$Year))
    yearly$Month <- month.abb[yearly$Month]    
}
plot.ward.details(df, "N188")



In [ ]:

    
viz.heatmap <- function(df, num.complaint.types = 5, num.wards = 5, marg=c(15, 5)) {
    cpl.table <- table(df$Complaint.Type)
    cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
    cpl.top <- names(cpl.table[0:num.complaint.types, ])
    
    ward.table <- table(df$Ward)
    ward.table <- as.data.frame(ward.table[order(-ward.table)])
    ward.top <- names(ward.table[0:num.wards, ])
    
    subDf <- df[(df$Complaint.Type %in% top5) & (df$Ward %in% ward.top), ]
    
    heatmap(as.matrix(table(subDf$Ward, subDf$Complaint.Type)), margins=marg, Rowv=NA, Colv=NA)
}
viz.heatmap(df, 5, 5)



In [ ]:

    
# compute % ages
cpl.type <- "Change of address in Electoral Roll"
all.wards <- unique(df$Ward, as.factor=False)
all.wards <- data.frame(Ward=all.wards, stringsAsFactors = F)
nrow(all.wards)



In [ ]:

    
subDf <- df[(df$Complaint.Type == cpl.type), ]
head(subDf, 1)



In [ ]:

    
ward.table <- table(subDf$Ward)
ward.table

total.cpl <- sum(ward.table)
total.cpl

perc.table <- (ward.table / total.cpl) * 100

perc.table <- as.data.frame(perc.table, stringsAsFactors = F)
colnames(perc.table) <- c("Ward", "Percentage")
perc.table <- left_join(all.wards, perc.table, by = c("Ward"))
perc.table$Percentage[is.na(perc.table$Percentage)] <- 0.0
perc.table

sum(perc.table$Percentage)



In [ ]:

    
# put it in a nice function
compute.perc.simple <- function(df, cpl.type) {
    subDf <- df[(df$Complaint.Type == cpl.type), ]
    all.wards <- unique(df$Ward, as.factor=False)
    all.wards <- data.frame(Ward=all.wards, stringsAsFactors = F)
    subDf <- df[(df$Complaint.Type == cpl.type), ]
    
    ward.table <- table(subDf$Ward)
    total.cpl <- sum(ward.table)
    perc.table <- (ward.table / total.cpl)
    perc.table <- as.data.frame(perc.table, stringsAsFactors = F)
    colnames(perc.table) <- c("Ward", "Percentage")
    perc.table <- left_join(all.wards, perc.table, by = c("Ward"))
    perc.table$Percentage[is.na(perc.table$Percentage)] <- 0.0
    return(perc.table)
}

# we care about the top 10 complaints

num.complaint.types <- 10
cpl.table <- table(df$Complaint.Type)
cpl.table <- as.data.frame(cpl.table[order(-cpl.table)])
cpl.top <- names(cpl.table[0:num.complaint.types, ])
DATA_FOLDER <- "../time-series/data/wardLevel/percentages"
for(complaintType in cpl.top) {    
    perc.table <- compute.perc.simple(df, complaintType)
    # one complaint type has a '/' in it, which messes up the paths    
    path <- file.path(DATA_FOLDER, paste0(gsub("/", "-", complaintType), ".csv"))
    print(paste0("Saving file ", path))
    write.csv(perc.table, file=path, row.names=F)
}



In [ ]:

    
# helper function to load this data up
load.ward.percentages <- function(dataFolder) {
    data <- list()
    for(file in list.files(dataFolder)) {
        print(paste0("Loading: ", file))
        complaintType <- substr(file,1,(nchar(file))-4)
        df <- read.csv(paste0(dataFolder, "/", file), stringsAsFactors=F)
        data[[complaintType]] <- df
    }
    data
}
load.ward.percentages(DATA_FOLDER)



In [ ]: