For more information on our methodology, see
In [ ]:
# load in libraries
library(dplyr)
library(ggplot2)
# Helper functions
formatbucks <- function(df,monetary) {
for (m in monetary) {
df[,m] <- as.numeric(gsub( "[$,]", "",df[,m]))
}
return(df)
}
asnum <- function(df,cols) {
for (c in cols) {
df[,c] <- as.numeric(as.character(df[,c]))
}
return(df)
}
nullify <- function(df, cols){
for (c in cols) {
df[,c] <- NULL
}
return(df)
}
In [3]:
# We load in our equivalences file: it has 243 towns
# It's a crosswalk file listing a town's NAME, FAC_ID, and PLACEID
# Note: Leyden Township was added manually, from P5 table of the Census 2010
map <- read.csv('facid_name_place.csv')
map <- unique(map)
In [4]:
# LMO-II
# 2016 lmo rows = 1538
lmo <- read.csv('../lmo/LMO2016_COMPILED.csv')
lmo$FAC_NAME <- lmo$FAC_ID
lmo$FAC_ID <- lmo$GEOCODE # rename this
lmo$GEOCODE <- NULL
lmo$FAC_ID<-as.character(lmo$FAC_ID)
lmo$countycode <- sapply(lmo[,"FAC_ID"], function(x) substr(x, 3, 5))
In [5]:
# 2014 lmo rows = 1541
lmo_14 <- read.csv('../lmo/LMO2014_COMPILED.csv')
lmo_14$FAC_NAME <- lmo_14$FAC_ID
lmo_14$FAC_ID <- lmo_14$GEOCODE
lmo_14$GEOCODE <- NULL
lmo_14$FAC_ID<-as.character(lmo_14$FAC_ID)
lmo_14$countycode <- sapply(lmo_14[,"FAC_ID"], function(x) substr(x, 3, 5))
In [6]:
# merge, and keep all, lmos
# rows = 1592
lmos <- merge(lmo, lmo_14, by="FAC_ID",all=T)
In [30]:
# turn these columns into numbers
monetary <- c("COST_APPARENT_LOSSES","COST_REAL_LOSS")
lmos <- formatbucks(lmos,monetary)
# turn factors into numeric
# NAs warning are due to words like "NA" and "BLANK".
# The only fields we should be missing are for LMO-2 that IDNR could not locate (5)
factors <- c("WATER_SUPPLIED_ADJUSTED_MGY","WATER_SUPPLIED_ADJUSTED_MGD","REAL_LOSSES_MGY","REAL_LOSSES_MGD","COST_REAL_LOSS")
lmos <- asnum(lmos, factors)
In [8]:
# Get rid of redundant columns
lmos$Name_F <-lmos$Name_F.x
lmos$Name_F.x<- NULL
lmos$Name_F.y<- NULL
In [9]:
nrow(lmos[lmos$SOURCE=="Lake Michigan",])
Out[9]:
In [10]:
# Condense pipe, and capped costs
lmos <- lmos %>% mutate(
ALL_60 = CAST_IRON_PIPES_PUMPS_.60 + ALL_OTHER_PIPES_PUMPS_.60,
ALL_4060 = CAST_IRON_PIPES_PUMPS_40_TO_60 + ALL_OTHER_PIPES_PUMPS_40_TO_60,
ALL_2040 = CAST_IRON_PIPES_PUMPS_20_TO_40 + ALL_OTHER_PIPES_PUMPS_20_TO_40,
ALL_20 = CAST_IRON_PIPES_PUMPS_..20 + ALL_OTHER_PIPES_PUMPS_..20,
TOTAL_PIPE_LENGTH = ALL_60+ALL_4060+ALL_2040+ALL_20,
ALL_60_P = (ALL_60/TOTAL_PIPE_LENGTH)*100,
ALL_4060_P = (ALL_4060/TOTAL_PIPE_LENGTH)*100,
ALL_2040_P = (ALL_2040/TOTAL_PIPE_LENGTH)*100,
ALL_20_P = (ALL_20/TOTAL_PIPE_LENGTH)*100)
In [11]:
# Cap losses and costs, only with real loss figures, of which somtimes only MGD is available
lmos <- lmos %>% mutate(
PREALLOSSES_MGY = (REAL_LOSSES_MGY/WATER_SUPPLIED_ADJUSTED_MGY)*100,
PREALLOSSES_MGD = (REAL_LOSSES_MGD/WATER_SUPPLIED_ADJUSTED_MGD)*100,
CAP_LOSSES_MGY = (WATER_SUPPLIED_ADJUSTED_MGY)*0.12, # get 12% of the water supplied
CAP_LOSSES_MGD = ((WATER_SUPPLIED_ADJUSTED_MGD*0.12)),
WATERCONSERVED_MGY = REAL_LOSSES_MGY - CAP_LOSSES_MGY, # what would be conserved if capped
WATERCONSERVED_MGD = (REAL_LOSSES_MGD - CAP_LOSSES_MGD),
CAP_COST_MGY = ( COST_REAL_LOSS / REAL_LOSSES_MGY) * CAP_LOSSES_MGY,
CAP_COST_MGD = (( (COST_REAL_LOSS/365) / REAL_LOSSES_MGD) * CAP_LOSSES_MGD),
COSTCONSERVED_MGY = COST_REAL_LOSS - CAP_COST_MGY,
COSTCONSERVED_MGD = ((COST_REAL_LOSS/365) - CAP_COST_MGD)
)
In [12]:
# load in the water bills data
rates <- read.csv('water_bills_lm.csv')
In [13]:
# merge rates and lmos, keeping all records
rates_lmo <- merge(rates, lmos, by="FAC_ID", all.x=T)
In [14]:
rates_lmo$FAC_NAME.y <- NULL
rates_lmo$FAC_NAME <- rates$FAC_NAME.x
rates_lmo <- merge(rates_lmo, map, by="FAC_ID",all.x=T)
In [15]:
##############################
# Now we get the census info #
##############################
total00 <- read.csv('../census/dec00/SF3_P007.csv')
total10 <- read.csv('../census/dec10/SF1_P5.csv')
total00$PLACEID <- total00$GEO.id2
total10$PLACEID <- total10$GEO.id2
# totals
total00$CENSUS00_total <- total00$VD01
total10$CENSUS10_total <- total10$D001
# clean the total figure
total00$CENSUS00_total<-as.numeric(as.character(total00$CENSUS00_total))
total10$CENSUS10_total <- gsub("\\s*\\([^\\)]+\\)","",as.character(total10$D001))
total10$CENSUS10_total <-as.numeric(as.character(total10$CENSUS10_total))
In [16]:
## RACE BY CENSUS 2010
total10$white <- total10$D003
total10$black <- total10$D004
total10$other_nh <- total10$D005+total10$D006+total10$D007+total10$D008+total10$D009
total10$hispanic <- total10$D010
total10<-total10 %>% mutate(
percent_white = (white / CENSUS10_total)*100,
percent_black = (black / CENSUS10_total)*100,
percent_hispanic = (hispanic / CENSUS10_total)*100,
percent_other_nh = (other_nh / CENSUS10_total)*100
)
total10 <- total10 %>% mutate(
RACE_10CENSUS =
ifelse(percent_black > 50, "black"
,ifelse(percent_white > 50, "white"
,ifelse(percent_hispanic > 50, "hispanic","mixed"
)
)
)
)
In [17]:
total00 <- total00 %>% select(CENSUS00_total, PLACEID)
census10 <- total10 %>% select(CENSUS10_total,PLACEID,RACE_10CENSUS,percent_white,percent_black,percent_hispanic,percent_other_nh)
census_totals <- merge(total00, census10, by='PLACEID', all=T)
rates_cl <- merge(rates_lmo, census_totals, by="PLACEID",all.x=T)
In [18]:
# we can also add median household income from the ACS 15 estimates
# The comparable table for B19013 is in the 2000 census (P053), which is too old to use
mhi <- read.csv('../census/acs/MHIB19013.csv')
lmrmhi <- merge(rates_cl, mhi, by="PLACEID", x.all=T)
lmrmhi$MHI_2015 <- as.numeric(as.character(lmrmhi$MHI_2015))
lmrmhi$MHI_2015_me <- as.numeric(as.character(lmrmhi$MHI_2015_me))
In [19]:
# Calculate the RI, a measure of affordability.
# Keep the upper and lower quartiles of the MHI number
# percent margin of error
lmrmhi$percent_me<-(lmrmhi$MHI_2015_me/lmrmhi$MHI_2015)*100
lmrmhi$affordability<-(lmrmhi$gal5K_17*12 / lmrmhi$MHI_2015 )*100
lmrmhi$aff_upper<-(lmrmhi$gal5K_17*12 / (lmrmhi$MHI_2015+lmrmhi$MHI_2015_me) )*100
lmrmhi$aff_lower<-(lmrmhi$gal5K_17*12 / (lmrmhi$MHI_2015-lmrmhi$MHI_2015_me) )*100
In [20]:
nrow(lmrmhi)
Out[20]:
In [21]:
########################
## Write out to files ##
########################
# we clean up some leftovers from merging
tonull <- c(
"TYPE","FAC_NAME.x","countycode.x",
"countycode.y","Name_F","NAME","PLACEID","GEO.id")
lmrmhi <- nullify(lmrmhi, tonull)
In [22]:
# get rid of DUPLICATEs but not NA values in rows
# also remove a duplicate for "northlake" that misspells the name
# also remove willow springs duplicates
data <- subset(lmrmhi, SERVICE_POPULATION!="DUPLICATE" | is.na(SERVICE_POPULATION))
data <- subset(data, BOUGHT_FROM!="DUPLICATE" | is.na(SERVICE_POPULATION))
data <- data %>% filter(!(FAC_NAME == 'north lake'))
data <- unique(data)
In [23]:
# use MGD calculations when MGY is not available, for completeness
# The LMO-2 forms say MGD to MGY transformation should use 365 days
data <- transform(data, PREALLOSSES = ifelse(!is.na(PREALLOSSES_MGY), PREALLOSSES_MGY, PREALLOSSES_MGD))
data <- transform(data, REAL_LOSSES = ifelse(!is.na(REAL_LOSSES_MGY), REAL_LOSSES_MGY, (REAL_LOSSES_MGD*365)))
In [28]:
# congrats, you have your file!
unique(data[data$SOURCE=="Lake Michigan",]) %>% write.csv('lm-rates-lmo-1023.csv',row.names=F) # nrow=163
In [31]:
####################
## Bulletproofing ##
####################
# PART 1: SAME LAKE, UNEQUAL RATES
###################################
In [32]:
# African-American residents’ median water bill is 20 percent higher
# for the same amount of water than residents pay in
# predominantly white communities
# # get raw amt rate went up
data$rate_diff <- data$gal5K_17 - data$gal5K_13
# # get percent rate went up
data$per_rate_diff <- (data$rate_diff / data$gal5K_13)*100
black <- data[data$RACE_10CENSUS == "black",]
white <- data[data$RACE_10CENSUS == "white",]
((median(black$gal5K_17,na.rm=T) - median(white$gal5K_17,na.rm=T))/median(white$gal5K_17,na.rm=T))*100
Out[32]:
In [33]:
# Ford Heights... People there pay nearly six times more for
# water than residents of Highland Park
# and four times more than Chicago residents
data[which(data$FAC_NAME == "chicago"), ]$gal5K_17
data[which(data$FAC_NAME == "ford heights"), ]$gal5K_17
data[which(data$FAC_NAME == "highland park"), ]$gal5K_17
Out[33]:
Out[33]:
Out[33]:
In [34]:
# Overall, towns with median household incomes in the bottom 10 percent
# of the region pay 31 percent more a month for water than
# towns with a median household income in the top 10 percent.
q <- quantile(data$MHI_2015, prob = seq(0, 1, length = 11), type = 5)
lowest <- mean(data[data$MHI_2015 <= q[2],]$gal5K_17,na.rm=TRUE)
top <- mean(data[data$MHI_2015 >= q[10],]$gal5K_17,na.rm=TRUE)
((lowest - top )/ top)*100
Out[34]:
In [35]:
# Of the 10 towns with the highest water rates, five, or 50 percent,
# are majority-black towns, while only 14 percent of communities
# surveyed by the Tribune have majority-black populations.
data %>% arrange(desc(gal5K_17)) %>% select(FAC_NAME, RACE_10CENSUS) %>% head(10)
(nrow(black)) / (nrow(data))
Out[35]:
Out[35]:
In [36]:
# Residents in three towns that receive Lake Michigan water
# are under that strain. All those communities are
# predominantly African-American.
data %>% arrange(desc(affordability)) %>%
select(FAC_NAME, RACE_10CENSUS, affordability) %>% head(10)
Out[36]:
In [37]:
# More than 25 billion gallons of Lake Michigan water are lost each year
# after flowing through the crumbling system, at a cost of more than $44 million
sum(data$REAL_LOSSES, na.rm=T)*1000000
Out[37]:
In [38]:
# Maywood, Hometown, East Hazel Crest, Burnham and Posen lost at least one-third
# of their water last year — the most in the region, according to documents
# they filed with IDNR.
data %>% arrange(desc(PREALLOSSES)) %>%
select(FAC_NAME, PREALLOSSES) %>% head(8)
Out[38]:
In [39]:
# Another 16 communities lost 20 to 32 percent of their
# water, the documents show.
data %>% arrange(desc(PREALLOSSES)) %>%
select(FAC_NAME, PREALLOSSES) %>% head(25)
Out[39]:
In [41]:
# As Chicago raised rates, nearly 1 in 4 suburbs
# in the past four years have quietly passed along
# higher increases. In a handful of towns,
# those increases were twice as high as Chicago’s.
now <- data[which(data$FAC_NAME == "chicago"), ]$gal5K_17
then <-data[which(data$FAC_NAME == "chicago"), ]$gal5K_13
inc <- ((now-then)/then)*100 # 34.62873
over <- data[data$per_rate_diff>inc,] %>% filter(!(per_rate_diff == 'NA'))
nrow(over) / (nrow(data)-1) # -1 because we don't have harvey's data
# 0.2407407
twice_over <- data[data$per_rate_diff>(inc*2),] %>% filter(!(per_rate_diff == 'NA'))
nrow(twice_over) # 5 towns
Out[41]:
Out[41]:
In [42]:
# The result was that between 2013 and 2017, the median cost of
# Lake Michigan water for a typical household rose from
# $34 to $45 per month, or 32 percent, the Tribune’s findings show.
((median(data$gal5K_17, na.rm=T) - median(data$gal5K_13, na.rm=T))/median(data$gal5K_13, na.rm=T))*100
Out[42]:
In [43]:
# In the 10 towns with the highest water rates,
# the typical family’s water bill grew
# to $75 from $51,
# or by 45 percent.
# Five of those towns are predominantly African-American.
top10 <- data %>% arrange(desc(gal5K_17)) %>% head(10)
((median(top10$gal5K_17, na.rm=T) - median(top10$gal5K_13, na.rm=T))/median(top10$gal5K_13, na.rm=T))*100
Out[43]:
In [45]:
# many of those communities hit with the largest rate hikes
# already were paying high water prices.
# But those towns’ populations are dwindling.
# Many of their businesses and industries have departed.
data$pop_change <- ((data$CENSUS10_total - data$CENSUS00_total) / data$CENSUS00_total)*100
data %>% arrange(desc(per_rate_diff)) %>%
select(FAC_NAME,per_rate_diff,pop_change) %>% head(10)
Out[45]:
In [46]:
# Part 2: BILLIONS LOST, MILLIONS WASTED
################################
In [47]:
# Of the 946 million gallons that Maywood bought from Melrose Park
# in 2016, 367 million gallons, or 38.7 percent,
# never made it to taps, costing residents and businesses
# in this cash-strapped village nearly $1.7 million
data[which(data$FAC_NAME == "maywood"), ] %>%
select(PREALLOSSES,REAL_LOSSES_MGY,WATER_SUPPLIED_ADJUSTED_MGY,COST_REAL_LOSS)
Out[47]:
In [48]:
# northeast Illinois would have saved nearly
# $9.1 million if towns using Lake Michigan water
# had been held to the state’s water loss standard of 12 percent.
d<-data %>% filter(COSTCONSERVED_MGY>0)
sum(d$COSTCONSERVED_MGY)
Out[48]:
In [49]:
# Last year, eight towns surveyed by the Tribune —
# Hometown, East Hazel Crest, Posen, Burnham, Riverdale, Flossmoor,
# Lyons and Maywood — lost more than 30 percent of their water.
data %>% arrange(desc(PREALLOSSES)) %>%
select(FAC_NAME, COST_REAL_LOSS, PREALLOSSES) %>% head(8)
Out[49]:
In [50]:
# Over 1 in 4 towns in the region exceeded the 12
# percent standard set by the Illinois Department of Natural Resources.
exceed <- data %>% filter(PREALLOSSES>12)
nrow(exceed) / (nrow(data)-5) # 5 did not submit loss reports to IDNR
# 0.2721519
Out[50]:
In [51]:
# Towns with majority-black populations lost an average
# of 18 percent of their water, compared to the region’s
# overall rate of 10 percent.
# These towns pay some of the highest rates for water in the area.
mean(black$PREALLOSSES,na.rm=T)
mean(data$PREALLOSSES,na.rm=T)
Out[51]:
Out[51]:
In [52]:
# The losses exacted an insidious cost in town after town. In addition
# to the $1.66 million that Maywood residents paid for all of its
# wasted water in 2016, Hometown residents paid $163,000; Flossmoor,
# $846,000; East Hazel Crest, $198,000; Posen, $351,000; and Burnham, $257,000.
data %>% arrange(desc(PREALLOSSES)) %>%
select(FAC_NAME, COST_REAL_LOSS, PREALLOSSES) %>% head(7)
Out[52]:
In [53]:
# Over a quarter of pipes in the Lake Michigan water
# use system are more than 60 years old, according to
# reports submitted to IDNR. Another quarter consists of pipes
# under 20 years old.
(sum(data$ALL_60,na.rm=T)/sum(data$TOTAL_PIPE_LENGTH,na.rm=T))*100
Out[53]:
In [54]:
(sum(data$ALL_20,na.rm=T)/sum(data$TOTAL_PIPE_LENGTH,na.rm=T))*100
Out[54]:
In [ ]: