For more information on our methodology, see



In [ ]:

    
# load in libraries
library(dplyr)
library(ggplot2)

# Helper functions

formatbucks <- function(df,monetary) {
  for (m in monetary) {
    df[,m] <- as.numeric(gsub( "[$,]", "",df[,m]))
  }
  return(df)
}

asnum <- function(df,cols) {
  for (c in cols) {
    df[,c] <- as.numeric(as.character(df[,c]))
  }
  return(df)
}

nullify <- function(df, cols){
  for (c in cols) {
    df[,c] <- NULL
  }
  return(df)
}



In [3]:

    
# We load in our equivalences file: it has 243 towns
# It's a crosswalk file listing a town's NAME, FAC_ID, and PLACEID
# Note: Leyden Township was added manually, from P5 table of the Census 2010
map <- read.csv('facid_name_place.csv')
map <- unique(map)



In [4]:

    
# LMO-II 
# 2016 lmo rows = 1538
lmo <- read.csv('../lmo/LMO2016_COMPILED.csv')
lmo$FAC_NAME <- lmo$FAC_ID
lmo$FAC_ID <- lmo$GEOCODE # rename this
lmo$GEOCODE <- NULL

lmo$FAC_ID<-as.character(lmo$FAC_ID)
lmo$countycode <- sapply(lmo[,"FAC_ID"], function(x) substr(x, 3, 5))



In [5]:

    
# 2014 lmo rows = 1541
lmo_14 <- read.csv('../lmo/LMO2014_COMPILED.csv')
lmo_14$FAC_NAME <- lmo_14$FAC_ID
lmo_14$FAC_ID <- lmo_14$GEOCODE
lmo_14$GEOCODE <- NULL

lmo_14$FAC_ID<-as.character(lmo_14$FAC_ID)
lmo_14$countycode <- sapply(lmo_14[,"FAC_ID"], function(x) substr(x, 3, 5))



In [6]:

    
# merge, and keep all, lmos
# rows = 1592
lmos <- merge(lmo, lmo_14, by="FAC_ID",all=T)



In [30]:

    
# turn these columns into numbers
monetary <- c("COST_APPARENT_LOSSES","COST_REAL_LOSS")
lmos <- formatbucks(lmos,monetary)

# turn factors into numeric
# NAs warning are due to words like "NA" and "BLANK". 
# The only fields we should be missing are for LMO-2 that IDNR could not locate (5)
factors <- c("WATER_SUPPLIED_ADJUSTED_MGY","WATER_SUPPLIED_ADJUSTED_MGD","REAL_LOSSES_MGY","REAL_LOSSES_MGD","COST_REAL_LOSS")
lmos <- asnum(lmos, factors)



In [8]:

    
# Get rid of redundant columns
lmos$Name_F <-lmos$Name_F.x
lmos$Name_F.x<- NULL
lmos$Name_F.y<- NULL



In [9]:

    
nrow(lmos[lmos$SOURCE=="Lake Michigan",])









    Out[9]:




227



In [10]:

    
# Condense pipe, and capped costs
lmos <- lmos %>% mutate(
  ALL_60 = CAST_IRON_PIPES_PUMPS_.60 + ALL_OTHER_PIPES_PUMPS_.60,
  ALL_4060 = CAST_IRON_PIPES_PUMPS_40_TO_60 + ALL_OTHER_PIPES_PUMPS_40_TO_60,
  ALL_2040 = CAST_IRON_PIPES_PUMPS_20_TO_40 + ALL_OTHER_PIPES_PUMPS_20_TO_40,
  ALL_20 = CAST_IRON_PIPES_PUMPS_..20 + ALL_OTHER_PIPES_PUMPS_..20,
  TOTAL_PIPE_LENGTH = ALL_60+ALL_4060+ALL_2040+ALL_20,
  ALL_60_P = (ALL_60/TOTAL_PIPE_LENGTH)*100,
  ALL_4060_P = (ALL_4060/TOTAL_PIPE_LENGTH)*100,
  ALL_2040_P = (ALL_2040/TOTAL_PIPE_LENGTH)*100,
  ALL_20_P = (ALL_20/TOTAL_PIPE_LENGTH)*100)



In [11]:

    
# Cap losses and costs, only with real loss figures, of which somtimes only MGD is available
lmos <- lmos %>% mutate(
  PREALLOSSES_MGY = (REAL_LOSSES_MGY/WATER_SUPPLIED_ADJUSTED_MGY)*100,
  PREALLOSSES_MGD = (REAL_LOSSES_MGD/WATER_SUPPLIED_ADJUSTED_MGD)*100,

  CAP_LOSSES_MGY = (WATER_SUPPLIED_ADJUSTED_MGY)*0.12, # get 12% of the water supplied
  CAP_LOSSES_MGD = ((WATER_SUPPLIED_ADJUSTED_MGD*0.12)),

  WATERCONSERVED_MGY = REAL_LOSSES_MGY - CAP_LOSSES_MGY, # what would be conserved if capped
  WATERCONSERVED_MGD = (REAL_LOSSES_MGD - CAP_LOSSES_MGD),

  CAP_COST_MGY = ( COST_REAL_LOSS  / REAL_LOSSES_MGY) * CAP_LOSSES_MGY,
  CAP_COST_MGD = (( (COST_REAL_LOSS/365) / REAL_LOSSES_MGD) * CAP_LOSSES_MGD),

  COSTCONSERVED_MGY = COST_REAL_LOSS - CAP_COST_MGY,
  COSTCONSERVED_MGD = ((COST_REAL_LOSS/365) - CAP_COST_MGD)
)



In [12]:

    
# load in the water bills data
rates <- read.csv('water_bills_lm.csv')



In [13]:

    
# merge rates and lmos, keeping all records
rates_lmo <- merge(rates, lmos, by="FAC_ID", all.x=T)



In [14]:

    
rates_lmo$FAC_NAME.y <- NULL
rates_lmo$FAC_NAME <- rates$FAC_NAME.x

rates_lmo <- merge(rates_lmo, map, by="FAC_ID",all.x=T)



In [15]:

    
##############################
# Now we get the census info #
##############################

total00 <- read.csv('../census/dec00/SF3_P007.csv')
total10 <- read.csv('../census/dec10/SF1_P5.csv')

total00$PLACEID <- total00$GEO.id2
total10$PLACEID <- total10$GEO.id2

# totals

total00$CENSUS00_total <- total00$VD01
total10$CENSUS10_total <- total10$D001

# clean the total figure
total00$CENSUS00_total<-as.numeric(as.character(total00$CENSUS00_total))
total10$CENSUS10_total <- gsub("\\s*\\([^\\)]+\\)","",as.character(total10$D001))
total10$CENSUS10_total <-as.numeric(as.character(total10$CENSUS10_total))



In [16]:

    
## RACE BY CENSUS 2010

total10$white <- total10$D003
total10$black <- total10$D004
total10$other_nh <- total10$D005+total10$D006+total10$D007+total10$D008+total10$D009
total10$hispanic <- total10$D010

total10<-total10 %>% mutate(
  percent_white = (white / CENSUS10_total)*100,
  percent_black = (black / CENSUS10_total)*100,
  percent_hispanic = (hispanic / CENSUS10_total)*100,
  percent_other_nh = (other_nh / CENSUS10_total)*100
)

total10 <- total10 %>% mutate(
  RACE_10CENSUS = 
    ifelse(percent_black > 50, "black"
      ,ifelse(percent_white > 50, "white"
        ,ifelse(percent_hispanic > 50, "hispanic","mixed"
        )
      )
    )
  )



In [17]:

    
total00 <- total00 %>% select(CENSUS00_total, PLACEID)
census10 <- total10 %>% select(CENSUS10_total,PLACEID,RACE_10CENSUS,percent_white,percent_black,percent_hispanic,percent_other_nh)

census_totals <- merge(total00, census10, by='PLACEID', all=T)

rates_cl <- merge(rates_lmo, census_totals, by="PLACEID",all.x=T)



In [18]:

    
# we can also add median household income from the ACS 15 estimates 
# The comparable table for B19013 is in the 2000 census (P053), which is too old to use
mhi <- read.csv('../census/acs/MHIB19013.csv')

lmrmhi <- merge(rates_cl, mhi, by="PLACEID", x.all=T)

lmrmhi$MHI_2015 <- as.numeric(as.character(lmrmhi$MHI_2015))
lmrmhi$MHI_2015_me <- as.numeric(as.character(lmrmhi$MHI_2015_me))



In [19]:

    
# Calculate the RI, a measure of affordability.
# Keep the upper and lower quartiles of the MHI number

# percent margin of error
lmrmhi$percent_me<-(lmrmhi$MHI_2015_me/lmrmhi$MHI_2015)*100

lmrmhi$affordability<-(lmrmhi$gal5K_17*12 / lmrmhi$MHI_2015 )*100
lmrmhi$aff_upper<-(lmrmhi$gal5K_17*12 / (lmrmhi$MHI_2015+lmrmhi$MHI_2015_me) )*100
lmrmhi$aff_lower<-(lmrmhi$gal5K_17*12 / (lmrmhi$MHI_2015-lmrmhi$MHI_2015_me) )*100



In [20]:

    
nrow(lmrmhi)









    Out[20]:




181



In [21]:

    
########################
## Write out to files ##
########################

# we clean up some leftovers from merging

tonull <- c(
  "TYPE","FAC_NAME.x","countycode.x",
  "countycode.y","Name_F","NAME","PLACEID","GEO.id")

lmrmhi <- nullify(lmrmhi, tonull)



In [22]:

    
# get rid of DUPLICATEs but not NA values in rows
# also remove a duplicate for "northlake" that misspells the name
# also remove willow springs duplicates

data <- subset(lmrmhi, SERVICE_POPULATION!="DUPLICATE" | is.na(SERVICE_POPULATION))
data <- subset(data, BOUGHT_FROM!="DUPLICATE" | is.na(SERVICE_POPULATION))
data <- data %>% filter(!(FAC_NAME == 'north lake'))
data <- unique(data)



In [23]:

    
# use MGD calculations when MGY is not available, for completeness
# The LMO-2 forms say MGD to MGY transformation should use 365 days
data <- transform(data, PREALLOSSES = ifelse(!is.na(PREALLOSSES_MGY), PREALLOSSES_MGY, PREALLOSSES_MGD))
data <- transform(data, REAL_LOSSES = ifelse(!is.na(REAL_LOSSES_MGY), REAL_LOSSES_MGY, (REAL_LOSSES_MGD*365)))



In [28]:

    
# congrats, you have your file!
unique(data[data$SOURCE=="Lake Michigan",]) %>% write.csv('lm-rates-lmo-1023.csv',row.names=F) # nrow=163



In [31]:

    
####################
## Bulletproofing ##
####################

# PART 1: SAME LAKE, UNEQUAL RATES
###################################



In [32]:

    
# African-American residents’ median water bill is 20 percent higher 
# for the same amount of water than residents pay in 
# predominantly white communities

# # get raw amt rate went up
data$rate_diff <- data$gal5K_17 - data$gal5K_13

# # get percent rate went up
data$per_rate_diff <- (data$rate_diff / data$gal5K_13)*100

black <- data[data$RACE_10CENSUS == "black",]
white <- data[data$RACE_10CENSUS == "white",]

((median(black$gal5K_17,na.rm=T) - median(white$gal5K_17,na.rm=T))/median(white$gal5K_17,na.rm=T))*100









    Out[32]:




20.4387732258787



In [33]:

    
# Ford Heights... People there pay nearly six times more for
# water than residents of Highland Park
# and four times more than Chicago residents
data[which(data$FAC_NAME == "chicago"), ]$gal5K_17
data[which(data$FAC_NAME == "ford heights"), ]$gal5K_17
data[which(data$FAC_NAME == "highland park"), ]$gal5K_17









    Out[33]:




19.4






    Out[33]:




85






    Out[33]:




14.43



In [34]:

    
# Overall, towns with median household incomes in the bottom 10 percent 
# of the region pay 31 percent more a month for water than 
# towns with a median household income in the top 10 percent.

q <- quantile(data$MHI_2015, prob = seq(0, 1, length = 11), type = 5)

lowest <- mean(data[data$MHI_2015 <= q[2],]$gal5K_17,na.rm=TRUE) 
top <- mean(data[data$MHI_2015 >= q[10],]$gal5K_17,na.rm=TRUE) 

((lowest - top )/ top)*100









    Out[34]:




31.4852298998202



In [35]:

    
# Of the 10 towns with the highest water rates, five, or 50 percent,
# are majority-black towns, while only 14 percent of communities 
# surveyed by the Tribune have majority-black populations.

data %>% arrange(desc(gal5K_17)) %>% select(FAC_NAME, RACE_10CENSUS) %>% head(10)

(nrow(black)) / (nrow(data))









    Out[35]:





FAC_NAME RACE_10CENSUS

	1 indian head park white           
	2 ford heights black       
	3 bannockburn white      
	4 clarendon hills white          
	5 prospect heights white           
	6 maywood black  
	7 franklin park white        
	8 glenwood black   
	9 phoenix black  
	10 dixmoor black  









    Out[35]:




0.134969325153374



In [36]:

    
# Residents in three towns that receive Lake Michigan water 
# are under that strain. All those communities are 
# predominantly African-American.

data %>% arrange(desc(affordability)) %>% 
         select(FAC_NAME, RACE_10CENSUS, affordability) %>% head(10)









    Out[36]:





FAC_NAME RACE_10CENSUS affordability

	1 ford heights    black           4.84560570071259
	2 robbins         black           3.23452189379182
	3 phoenix         black           2.59767972490921
	4 dixmoor        black          2.1199653188303
	5 maywood         black           2.05995555345406
	6 markham         black           1.87494554642349
	7 riverdale       black           1.68329153896164
	8 south chicago heights mixed                1.63910053893328     
	9 posen           hispanic        1.63855421686747
	10 prospect heights white           1.49663096904398



In [37]:

    
# More than 25 billion gallons of Lake Michigan water are lost each year 
# after flowing through the crumbling system, at a cost of more than $44 million

sum(data$REAL_LOSSES, na.rm=T)*1000000









    Out[37]:




25780611700



In [38]:

    
# Maywood, Hometown, East Hazel Crest, Burnham and Posen lost at least one-third 
# of their water last year — the most in the region, according to documents 
# they filed with IDNR. 

data %>% arrange(desc(PREALLOSSES)) %>% 
          select(FAC_NAME, PREALLOSSES) %>% head(8)









    Out[38]:





FAC_NAME PREALLOSSES

	1 maywood         38.7253005774924
	2 hometown        37.3359591488175
	3 east hazel crest 33.9123853458995
	4 posen           33.1250290850347
	5 burnham         33.0749226953377
	6 riverdale       32.0194086087647
	7 flossmoor       31.3991922613254
	8 lyons           31.3112748568483



In [39]:

    
# Another 16 communities lost 20 to 32 percent of their 
# water, the documents show.

data %>% arrange(desc(PREALLOSSES)) %>% 
         select(FAC_NAME, PREALLOSSES) %>% head(25)









    Out[39]:





FAC_NAME PREALLOSSES

	1 maywood         38.7253005774924
	2 hometown        37.3359591488175
	3 east hazel crest 33.9123853458995
	4 posen           33.1250290850347
	5 burnham         33.0749226953377
	6 riverdale       32.0194086087647
	7 flossmoor       31.3991922613254
	8 lyons           31.3112748568483
	9 berwyn          26.9225443348135
	10 summit          26.4800223583437
	11 bellwood       26.158982831295
	12 justice         26.0548475705859
	13 winthrop harbor 24.1949742588216
	14 brookfield      23.8134855158559
	15 markham         23.4598287476497
	16 kenilworth      23.4363830004523
	17 dolton          23.4106083411056
	18 melrose park    22.1803922062804
	19 la grange      21.019009953394
	20 elmwood park    20.1234512429372
	21 matteson        20.0310726877701
	22 vernon hills    19.4094915348141
	23 blue island     19.1572772427649
	24 worth           19.1553276393789
	25 lincolnshire    19.0828265079473



In [41]:

    
# As Chicago raised rates, nearly 1 in 4 suburbs 
# in the past four years have quietly passed along 
# higher increases. In a handful of towns, 
# those increases were twice as high as Chicago’s.
 
now <- data[which(data$FAC_NAME == "chicago"), ]$gal5K_17
then <-data[which(data$FAC_NAME == "chicago"), ]$gal5K_13

inc <- ((now-then)/then)*100 # 34.62873

over <- data[data$per_rate_diff>inc,] %>% filter(!(per_rate_diff == 'NA'))
nrow(over) / (nrow(data)-1) # -1 because we don't have harvey's data
# 0.2407407
twice_over <- data[data$per_rate_diff>(inc*2),]  %>% filter(!(per_rate_diff == 'NA'))

nrow(twice_over) # 5 towns









    Out[41]:




0.240740740740741






    Out[41]:




5



In [42]:

    
# The result was that between 2013 and 2017, the median cost of
# Lake Michigan water for a typical household rose from 
# $34 to $45 per month, or 32 percent, the Tribune’s findings show.

((median(data$gal5K_17, na.rm=T) - median(data$gal5K_13, na.rm=T))/median(data$gal5K_13, na.rm=T))*100









    Out[42]:




32.1513353115727



In [43]:

    
# In the 10 towns with the highest water rates, 
# the typical family’s water bill grew 
# to $75 from $51,
# or by 45 percent.
# Five of those towns are predominantly African-American.

top10 <- data %>% arrange(desc(gal5K_17)) %>% head(10)
((median(top10$gal5K_17, na.rm=T) - median(top10$gal5K_13, na.rm=T))/median(top10$gal5K_13, na.rm=T))*100









    Out[43]:




45.8846529814272



In [45]:

    
# many of those communities hit with the largest rate hikes 
# already were paying high water prices. 
# But those towns’ populations are dwindling. 
# Many of their businesses and industries have departed.

data$pop_change <- ((data$CENSUS10_total - data$CENSUS00_total) / data$CENSUS00_total)*100

data %>% arrange(desc(per_rate_diff)) %>% 
        select(FAC_NAME,per_rate_diff,pop_change) %>% head(10)









    Out[45]:





FAC_NAME per_rate_diff pop_change

	1 calumet city     104.98687664042  -5.00102585145671
	2 phoenix          100.0594000594   -8.43822843822844
	3 north riverside   99.5033112582781  -0.905985444824001
	4 winthrop harbor 96.6717095310136 1.38345864661654
	5 indian head park  74.1293532338308  -0.392259414225941
	6 crestwood        60.6995884773662 -2.39771815669846
	7 oak park         59.6850393700788 -1.22991394410174
	8 itasca          58.7153652392947 4.62078142010403
	9 calumet park     57.8947368421053 -8.56576029875131
	10 norridge          56.25             -0.803267528931246



In [46]:

    
# Part 2: BILLIONS LOST, MILLIONS WASTED
################################



In [47]:

    
# Of the 946 million gallons that Maywood bought from Melrose Park 
# in 2016, 367 million gallons, or 38.7 percent, 
# never made it to taps, costing residents and businesses 
# in this cash-strapped village nearly $1.7 million

data[which(data$FAC_NAME == "maywood"), ] %>% 
    select(PREALLOSSES,REAL_LOSSES_MGY,WATER_SUPPLIED_ADJUSTED_MGY,COST_REAL_LOSS)









    Out[47]:





PREALLOSSES REAL_LOSSES_MGY WATER_SUPPLIED_ADJUSTED_MGY COST_REAL_LOSS

	88      38.7253     367.4090     948.7570 1660688.0000



In [48]:

    
# northeast Illinois would have saved nearly 
# $9.1 million if towns using Lake Michigan water 
# had been held to the state’s water loss standard of 12 percent.

d<-data %>% filter(COSTCONSERVED_MGY>0)
sum(d$COSTCONSERVED_MGY)









    Out[48]:




9097690.01602041



In [49]:

    
# Last year, eight towns surveyed by the Tribune — 
# Hometown, East Hazel Crest, Posen, Burnham, Riverdale, Flossmoor, 
# Lyons and Maywood — lost more than 30 percent of their water. 

data %>% arrange(desc(PREALLOSSES)) %>% 
      select(FAC_NAME, COST_REAL_LOSS, PREALLOSSES) %>% head(8)









    Out[49]:





FAC_NAME COST_REAL_LOSS PREALLOSSES

	1 maywood         1660688         38.7253005774924
	2 hometown        163244          37.3359591488175
	3 east hazel crest 197602          33.9123853458995
	4 posen           350584          33.1250290850347
	5 burnham         257298          33.0749226953377
	6 riverdale       585059          32.0194086087647
	7 flossmoor       846481          31.3991922613254
	8 lyons           589545          31.3112748568483



In [50]:

    
# Over 1 in 4 towns in the region exceeded the 12 
# percent standard set by the Illinois Department of Natural Resources.

exceed <- data %>% filter(PREALLOSSES>12)

nrow(exceed) / (nrow(data)-5) # 5 did not submit loss reports to IDNR
# 0.2721519









    Out[50]:




0.272151898734177



In [51]:

    
# Towns with majority-black populations lost an average 
# of 18 percent of their water, compared to the region’s 
# overall rate of 10 percent. 
# These towns pay some of the highest rates for water in the area. 

mean(black$PREALLOSSES,na.rm=T)
mean(data$PREALLOSSES,na.rm=T)









    Out[51]:




17.5798635965452






    Out[51]:




9.84426363462414



In [52]:

    
# The losses exacted an insidious cost in town after town. In addition 
# to the $1.66 million that Maywood residents paid for all of its 
# wasted water in 2016, Hometown residents paid $163,000; Flossmoor,
#  $846,000; East Hazel Crest, $198,000; Posen, $351,000; and Burnham, $257,000. 

data %>% arrange(desc(PREALLOSSES)) %>% 
      select(FAC_NAME, COST_REAL_LOSS, PREALLOSSES) %>% head(7)









    Out[52]:





FAC_NAME COST_REAL_LOSS PREALLOSSES

	1 maywood         1660688         38.7253005774924
	2 hometown        163244          37.3359591488175
	3 east hazel crest 197602          33.9123853458995
	4 posen           350584          33.1250290850347
	5 burnham         257298          33.0749226953377
	6 riverdale       585059          32.0194086087647
	7 flossmoor       846481          31.3991922613254



In [53]:

    
# Over a quarter of pipes in the Lake Michigan water 
# use system are more than 60 years old, according to 
# reports submitted to IDNR. Another quarter consists of pipes 
# under 20 years old. 

(sum(data$ALL_60,na.rm=T)/sum(data$TOTAL_PIPE_LENGTH,na.rm=T))*100









    Out[53]:




29.1627544675424



In [54]:

    
(sum(data$ALL_20,na.rm=T)/sum(data$TOTAL_PIPE_LENGTH,na.rm=T))*100









    Out[54]:




25.7124857125859



In [ ]:

	FAC_NAME	RACE_10CENSUS
1	indian head park	white
2	ford heights	black
3	bannockburn	white
4	clarendon hills	white
5	prospect heights	white
6	maywood	black
7	franklin park	white
8	glenwood	black
9	phoenix	black
10	dixmoor	black

	FAC_NAME	RACE_10CENSUS	affordability
1	ford heights	black	4.84560570071259
2	robbins	black	3.23452189379182
3	phoenix	black	2.59767972490921
4	dixmoor	black	2.1199653188303
5	maywood	black	2.05995555345406
6	markham	black	1.87494554642349
7	riverdale	black	1.68329153896164
8	south chicago heights	mixed	1.63910053893328
9	posen	hispanic	1.63855421686747
10	prospect heights	white	1.49663096904398

	FAC_NAME	PREALLOSSES
1	maywood	38.7253005774924
2	hometown	37.3359591488175
3	east hazel crest	33.9123853458995
4	posen	33.1250290850347
5	burnham	33.0749226953377
6	riverdale	32.0194086087647
7	flossmoor	31.3991922613254
8	lyons	31.3112748568483

	FAC_NAME	per_rate_diff	pop_change
1	calumet city	104.98687664042	-5.00102585145671
2	phoenix	100.0594000594	-8.43822843822844
3	north riverside	99.5033112582781	-0.905985444824001
4	winthrop harbor	96.6717095310136	1.38345864661654
5	indian head park	74.1293532338308	-0.392259414225941
6	crestwood	60.6995884773662	-2.39771815669846
7	oak park	59.6850393700788	-1.22991394410174
8	itasca	58.7153652392947	4.62078142010403
9	calumet park	57.8947368421053	-8.56576029875131
10	norridge	56.25	-0.803267528931246

	FAC_NAME	COST_REAL_LOSS	PREALLOSSES
1	maywood	1660688	38.7253005774924
2	hometown	163244	37.3359591488175
3	east hazel crest	197602	33.9123853458995
4	posen	350584	33.1250290850347
5	burnham	257298	33.0749226953377
6	riverdale	585059	32.0194086087647
7	flossmoor	846481	31.3991922613254
8	lyons	589545	31.3112748568483