In [ ]:
# Add required packages here
require('plyr')
require('dplyr')
require('tidyr')
require('lubridate')
require('stringr')
require("ggplot2")
# Path of data used for analysis
RAW_PATH = "C:\\Users\\sahil.maheshwari\\Documents\\R Scripts\\eGov_DC\\data\\datadive\\coc.csv"
ROLLED_DATA_PATH = "C:\\Users\\sahil.maheshwari\\Documents\\R Scripts\\eGov_DC\\data\\datadive\\ddive_rolled_seq.csv"
In [ ]:
# Put TODOs here
The purpose of this exercise is to:
The notebook is organized into different sections, based on the levels already present in the data.
To simplify the EDA process, let's start with some basic questions we can ask
Since we will be building alerts and forecast models, it is essential to perform a more focussed EDA
In [ ]:
Read data
In [ ]:
nas <- c(NA, 'NA', '', ' ', 'NULL')
rolled_data <- read.csv(ROLLED_DATA_PATH, stringsAsFactors = F,
na.strings = nas)
In [ ]:
glimpse(rolled_data)
In [ ]:
rolled_data$Complaint.Date <- ymd(rolled_data$Complaint.Date)
rolled_data$Complaint.Type <- as.factor(rolled_data$Complaint.Type)
rolled_data$Ward <- as.factor(rolled_data$Ward)
In [ ]:
glimpse(rolled_data)
How is the data spread?
In [ ]:
summary(rolled_data$Complaint.Date)
In [ ]:
rolled_data$Year <- year(rolled_data$Complaint.Date)
rolled_data$Month <- month(rolled_data$Complaint.Date, label=T)
In [ ]:
rolled_data %>%
group_by(Year) %>%
summarise(comp_count=sum(complaint_count, na.rm=T)) %>%
mutate(count_percent=(comp_count/sum(comp_count))*100)
In [ ]:
rolled_data %>%
group_by(Month) %>%
summarise(comp_count=sum(complaint_count, na.rm=T)) %>%
mutate(count_percent=(comp_count/sum(comp_count))*100)
In [ ]:
summary(rolled_data$complaint_count)
In [ ]:
comp_count_m1 <- rolled_data %>% dplyr::select(complaint_count) %>% dplyr::filter(complaint_count>1)
In [ ]:
bins <- c(0,2,4,6,8,10,15,20,25,30,40,50,60)
hist(comp_count_m1$complaint_count, breaks=bins, xlim=c(0,20))
In [ ]:
bins <- c(0, seq(5,55,5))
hist(comp_count_m1$complaint_count, breaks=bins, xlim=c(10,55), ylim=c(0,100), freq=T)
What is the complaint share across the wards? Is it uniform? Do some wards have a dispproportionate share of complaints?
In [ ]:
length(unique(rolled_data$Ward))
In [ ]:
count_by_ward<- rolled_data %>%
group_by(Ward) %>%
dplyr::filter(!is.na(Ward)) %>%
summarise(comp_count=sum(complaint_count, na.rm=T)) %>%
mutate(count_percent=(comp_count/sum(comp_count))*100) %>%
arrange(count_percent)
count_by_ward
In [ ]:
ggplot(count_by_ward, aes(reorder(Ward,count_percent), count_percent)) +
geom_bar(stat="identity")
In [ ]:
summary(count_by_ward$count_percent)
In [ ]:
ward_count_high <- dplyr::filter(count_by_ward, count_percent>0.70050)
sum(ward_count_high$count_percent)
So, the complaint count across Wards is not evenly distributed. Top 25% Wards by count of complaints have ~47% of complaints
What days are the complaints filed? How is the pattern across the wards? How is the pattern across complaint types? What type of distribution do these patterns follow?
In [ ]:
rolled_data$WDay <- wday(rolled_data$Complaint.Date, label=T)
In [ ]:
rolled_data %>%
group_by(WDay) %>%
summarise(comp_count=sum(complaint_count, na.rm=T)) %>%
mutate(count_percent=(comp_count/sum(comp_count))*100)
Almost uniformally distributed
In [ ]: