For more information on our methodology, see


In [ ]:
# load in libraries
library(dplyr)
library(ggplot2)

# Helper functions

formatbucks <- function(df,monetary) {
  for (m in monetary) {
    df[,m] <- as.numeric(gsub( "[$,]", "",df[,m]))
  }
  return(df)
}

asnum <- function(df,cols) {
  for (c in cols) {
    df[,c] <- as.numeric(as.character(df[,c]))
  }
  return(df)
}

nullify <- function(df, cols){
  for (c in cols) {
    df[,c] <- NULL
  }
  return(df)
}

In [3]:
# We load in our equivalences file: it has 243 towns
# It's a crosswalk file listing a town's NAME, FAC_ID, and PLACEID
# Note: Leyden Township was added manually, from P5 table of the Census 2010
map <- read.csv('facid_name_place.csv')
map <- unique(map)

In [4]:
# LMO-II 
# 2016 lmo rows = 1538
lmo <- read.csv('../lmo/LMO2016_COMPILED.csv')
lmo$FAC_NAME <- lmo$FAC_ID
lmo$FAC_ID <- lmo$GEOCODE # rename this
lmo$GEOCODE <- NULL

lmo$FAC_ID<-as.character(lmo$FAC_ID)
lmo$countycode <- sapply(lmo[,"FAC_ID"], function(x) substr(x, 3, 5))

In [5]:
# 2014 lmo rows = 1541
lmo_14 <- read.csv('../lmo/LMO2014_COMPILED.csv')
lmo_14$FAC_NAME <- lmo_14$FAC_ID
lmo_14$FAC_ID <- lmo_14$GEOCODE
lmo_14$GEOCODE <- NULL

lmo_14$FAC_ID<-as.character(lmo_14$FAC_ID)
lmo_14$countycode <- sapply(lmo_14[,"FAC_ID"], function(x) substr(x, 3, 5))

In [6]:
# merge, and keep all, lmos
# rows = 1592
lmos <- merge(lmo, lmo_14, by="FAC_ID",all=T)

In [30]:
# turn these columns into numbers
monetary <- c("COST_APPARENT_LOSSES","COST_REAL_LOSS")
lmos <- formatbucks(lmos,monetary)

# turn factors into numeric
# NAs warning are due to words like "NA" and "BLANK". 
# The only fields we should be missing are for LMO-2 that IDNR could not locate (5)
factors <- c("WATER_SUPPLIED_ADJUSTED_MGY","WATER_SUPPLIED_ADJUSTED_MGD","REAL_LOSSES_MGY","REAL_LOSSES_MGD","COST_REAL_LOSS")
lmos <- asnum(lmos, factors)

In [8]:
# Get rid of redundant columns
lmos$Name_F <-lmos$Name_F.x
lmos$Name_F.x<- NULL
lmos$Name_F.y<- NULL

In [9]:
nrow(lmos[lmos$SOURCE=="Lake Michigan",])


Out[9]:
227

In [10]:
# Condense pipe, and capped costs
lmos <- lmos %>% mutate(
  ALL_60 = CAST_IRON_PIPES_PUMPS_.60 + ALL_OTHER_PIPES_PUMPS_.60,
  ALL_4060 = CAST_IRON_PIPES_PUMPS_40_TO_60 + ALL_OTHER_PIPES_PUMPS_40_TO_60,
  ALL_2040 = CAST_IRON_PIPES_PUMPS_20_TO_40 + ALL_OTHER_PIPES_PUMPS_20_TO_40,
  ALL_20 = CAST_IRON_PIPES_PUMPS_..20 + ALL_OTHER_PIPES_PUMPS_..20,
  TOTAL_PIPE_LENGTH = ALL_60+ALL_4060+ALL_2040+ALL_20,
  ALL_60_P = (ALL_60/TOTAL_PIPE_LENGTH)*100,
  ALL_4060_P = (ALL_4060/TOTAL_PIPE_LENGTH)*100,
  ALL_2040_P = (ALL_2040/TOTAL_PIPE_LENGTH)*100,
  ALL_20_P = (ALL_20/TOTAL_PIPE_LENGTH)*100)

In [11]:
# Cap losses and costs, only with real loss figures, of which somtimes only MGD is available
lmos <- lmos %>% mutate(
  PREALLOSSES_MGY = (REAL_LOSSES_MGY/WATER_SUPPLIED_ADJUSTED_MGY)*100,
  PREALLOSSES_MGD = (REAL_LOSSES_MGD/WATER_SUPPLIED_ADJUSTED_MGD)*100,

  CAP_LOSSES_MGY = (WATER_SUPPLIED_ADJUSTED_MGY)*0.12, # get 12% of the water supplied
  CAP_LOSSES_MGD = ((WATER_SUPPLIED_ADJUSTED_MGD*0.12)),

  WATERCONSERVED_MGY = REAL_LOSSES_MGY - CAP_LOSSES_MGY, # what would be conserved if capped
  WATERCONSERVED_MGD = (REAL_LOSSES_MGD - CAP_LOSSES_MGD),

  CAP_COST_MGY = ( COST_REAL_LOSS  / REAL_LOSSES_MGY) * CAP_LOSSES_MGY,
  CAP_COST_MGD = (( (COST_REAL_LOSS/365) / REAL_LOSSES_MGD) * CAP_LOSSES_MGD),

  COSTCONSERVED_MGY = COST_REAL_LOSS - CAP_COST_MGY,
  COSTCONSERVED_MGD = ((COST_REAL_LOSS/365) - CAP_COST_MGD)
)

In [12]:
# load in the water bills data
rates <- read.csv('water_bills_lm.csv')

In [13]:
# merge rates and lmos, keeping all records
rates_lmo <- merge(rates, lmos, by="FAC_ID", all.x=T)

In [14]:
rates_lmo$FAC_NAME.y <- NULL
rates_lmo$FAC_NAME <- rates$FAC_NAME.x

rates_lmo <- merge(rates_lmo, map, by="FAC_ID",all.x=T)

In [15]:
##############################
# Now we get the census info #
##############################

total00 <- read.csv('../census/dec00/SF3_P007.csv')
total10 <- read.csv('../census/dec10/SF1_P5.csv')

total00$PLACEID <- total00$GEO.id2
total10$PLACEID <- total10$GEO.id2

# totals

total00$CENSUS00_total <- total00$VD01
total10$CENSUS10_total <- total10$D001

# clean the total figure
total00$CENSUS00_total<-as.numeric(as.character(total00$CENSUS00_total))
total10$CENSUS10_total <- gsub("\\s*\\([^\\)]+\\)","",as.character(total10$D001))
total10$CENSUS10_total <-as.numeric(as.character(total10$CENSUS10_total))

In [16]:
## RACE BY CENSUS 2010

total10$white <- total10$D003
total10$black <- total10$D004
total10$other_nh <- total10$D005+total10$D006+total10$D007+total10$D008+total10$D009
total10$hispanic <- total10$D010

total10<-total10 %>% mutate(
  percent_white = (white / CENSUS10_total)*100,
  percent_black = (black / CENSUS10_total)*100,
  percent_hispanic = (hispanic / CENSUS10_total)*100,
  percent_other_nh = (other_nh / CENSUS10_total)*100
)

total10 <- total10 %>% mutate(
  RACE_10CENSUS = 
    ifelse(percent_black > 50, "black"
      ,ifelse(percent_white > 50, "white"
        ,ifelse(percent_hispanic > 50, "hispanic","mixed"
        )
      )
    )
  )

In [17]:
total00 <- total00 %>% select(CENSUS00_total, PLACEID)
census10 <- total10 %>% select(CENSUS10_total,PLACEID,RACE_10CENSUS,percent_white,percent_black,percent_hispanic,percent_other_nh)

census_totals <- merge(total00, census10, by='PLACEID', all=T)

rates_cl <- merge(rates_lmo, census_totals, by="PLACEID",all.x=T)

In [18]:
# we can also add median household income from the ACS 15 estimates 
# The comparable table for B19013 is in the 2000 census (P053), which is too old to use
mhi <- read.csv('../census/acs/MHIB19013.csv')

lmrmhi <- merge(rates_cl, mhi, by="PLACEID", x.all=T)

lmrmhi$MHI_2015 <- as.numeric(as.character(lmrmhi$MHI_2015))
lmrmhi$MHI_2015_me <- as.numeric(as.character(lmrmhi$MHI_2015_me))

In [19]:
# Calculate the RI, a measure of affordability.
# Keep the upper and lower quartiles of the MHI number

# percent margin of error
lmrmhi$percent_me<-(lmrmhi$MHI_2015_me/lmrmhi$MHI_2015)*100

lmrmhi$affordability<-(lmrmhi$gal5K_17*12 / lmrmhi$MHI_2015 )*100
lmrmhi$aff_upper<-(lmrmhi$gal5K_17*12 / (lmrmhi$MHI_2015+lmrmhi$MHI_2015_me) )*100
lmrmhi$aff_lower<-(lmrmhi$gal5K_17*12 / (lmrmhi$MHI_2015-lmrmhi$MHI_2015_me) )*100

In [20]:
nrow(lmrmhi)


Out[20]:
181

In [21]:
########################
## Write out to files ##
########################

# we clean up some leftovers from merging

tonull <- c(
  "TYPE","FAC_NAME.x","countycode.x",
  "countycode.y","Name_F","NAME","PLACEID","GEO.id")

lmrmhi <- nullify(lmrmhi, tonull)

In [22]:
# get rid of DUPLICATEs but not NA values in rows
# also remove a duplicate for "northlake" that misspells the name
# also remove willow springs duplicates

data <- subset(lmrmhi, SERVICE_POPULATION!="DUPLICATE" | is.na(SERVICE_POPULATION))
data <- subset(data, BOUGHT_FROM!="DUPLICATE" | is.na(SERVICE_POPULATION))
data <- data %>% filter(!(FAC_NAME == 'north lake'))
data <- unique(data)

In [23]:
# use MGD calculations when MGY is not available, for completeness
# The LMO-2 forms say MGD to MGY transformation should use 365 days
data <- transform(data, PREALLOSSES = ifelse(!is.na(PREALLOSSES_MGY), PREALLOSSES_MGY, PREALLOSSES_MGD))
data <- transform(data, REAL_LOSSES = ifelse(!is.na(REAL_LOSSES_MGY), REAL_LOSSES_MGY, (REAL_LOSSES_MGD*365)))

In [28]:
# congrats, you have your file!
unique(data[data$SOURCE=="Lake Michigan",]) %>% write.csv('lm-rates-lmo-1023.csv',row.names=F) # nrow=163

In [31]:
####################
## Bulletproofing ##
####################

# PART 1: SAME LAKE, UNEQUAL RATES
###################################

In [32]:
# African-American residents’ median water bill is 20 percent higher 
# for the same amount of water than residents pay in 
# predominantly white communities

# # get raw amt rate went up
data$rate_diff <- data$gal5K_17 - data$gal5K_13

# # get percent rate went up
data$per_rate_diff <- (data$rate_diff / data$gal5K_13)*100

black <- data[data$RACE_10CENSUS == "black",]
white <- data[data$RACE_10CENSUS == "white",]

((median(black$gal5K_17,na.rm=T) - median(white$gal5K_17,na.rm=T))/median(white$gal5K_17,na.rm=T))*100


Out[32]:
20.4387732258787

In [33]:
# Ford Heights... People there pay nearly six times more for
# water than residents of Highland Park
# and four times more than Chicago residents
data[which(data$FAC_NAME == "chicago"), ]$gal5K_17
data[which(data$FAC_NAME == "ford heights"), ]$gal5K_17
data[which(data$FAC_NAME == "highland park"), ]$gal5K_17


Out[33]:
19.4
Out[33]:
85
Out[33]:
14.43

In [34]:
# Overall, towns with median household incomes in the bottom 10 percent 
# of the region pay 31 percent more a month for water than 
# towns with a median household income in the top 10 percent.

q <- quantile(data$MHI_2015, prob = seq(0, 1, length = 11), type = 5)

lowest <- mean(data[data$MHI_2015 <= q[2],]$gal5K_17,na.rm=TRUE) 
top <- mean(data[data$MHI_2015 >= q[10],]$gal5K_17,na.rm=TRUE) 

((lowest - top )/ top)*100


Out[34]:
31.4852298998202

In [35]:
# Of the 10 towns with the highest water rates, five, or 50 percent,
# are majority-black towns, while only 14 percent of communities 
# surveyed by the Tribune have majority-black populations.

data %>% arrange(desc(gal5K_17)) %>% select(FAC_NAME, RACE_10CENSUS) %>% head(10)

(nrow(black)) / (nrow(data))


Out[35]:
FAC_NAMERACE_10CENSUS
1indian head parkwhite
2ford heightsblack
3bannockburnwhite
4clarendon hillswhite
5prospect heightswhite
6maywoodblack
7franklin parkwhite
8glenwoodblack
9phoenixblack
10dixmoorblack
Out[35]:
0.134969325153374

In [36]:
# Residents in three towns that receive Lake Michigan water 
# are under that strain. All those communities are 
# predominantly African-American.

data %>% arrange(desc(affordability)) %>% 
         select(FAC_NAME, RACE_10CENSUS, affordability) %>% head(10)


Out[36]:
FAC_NAMERACE_10CENSUSaffordability
1ford heights black 4.84560570071259
2robbins black 3.23452189379182
3phoenix black 2.59767972490921
4dixmoor black 2.1199653188303
5maywood black 2.05995555345406
6markham black 1.87494554642349
7riverdale black 1.68329153896164
8south chicago heightsmixed 1.63910053893328
9posen hispanic 1.63855421686747
10prospect heightswhite 1.49663096904398

In [37]:
# More than 25 billion gallons of Lake Michigan water are lost each year 
# after flowing through the crumbling system, at a cost of more than $44 million

sum(data$REAL_LOSSES, na.rm=T)*1000000


Out[37]:
25780611700

In [38]:
# Maywood, Hometown, East Hazel Crest, Burnham and Posen lost at least one-third 
# of their water last year — the most in the region, according to documents 
# they filed with IDNR. 

data %>% arrange(desc(PREALLOSSES)) %>% 
          select(FAC_NAME, PREALLOSSES) %>% head(8)


Out[38]:
FAC_NAMEPREALLOSSES
1maywood 38.7253005774924
2hometown 37.3359591488175
3east hazel crest33.9123853458995
4posen 33.1250290850347
5burnham 33.0749226953377
6riverdale 32.0194086087647
7flossmoor 31.3991922613254
8lyons 31.3112748568483

In [39]:
# Another 16 communities lost 20 to 32 percent of their 
# water, the documents show.

data %>% arrange(desc(PREALLOSSES)) %>% 
         select(FAC_NAME, PREALLOSSES) %>% head(25)


Out[39]:
FAC_NAMEPREALLOSSES
1maywood 38.7253005774924
2hometown 37.3359591488175
3east hazel crest33.9123853458995
4posen 33.1250290850347
5burnham 33.0749226953377
6riverdale 32.0194086087647
7flossmoor 31.3991922613254
8lyons 31.3112748568483
9berwyn 26.9225443348135
10summit 26.4800223583437
11bellwood 26.158982831295
12justice 26.0548475705859
13winthrop harbor 24.1949742588216
14brookfield 23.8134855158559
15markham 23.4598287476497
16kenilworth 23.4363830004523
17dolton 23.4106083411056
18melrose park 22.1803922062804
19la grange 21.019009953394
20elmwood park 20.1234512429372
21matteson 20.0310726877701
22vernon hills 19.4094915348141
23blue island 19.1572772427649
24worth 19.1553276393789
25lincolnshire 19.0828265079473

In [41]:
# As Chicago raised rates, nearly 1 in 4 suburbs 
# in the past four years have quietly passed along 
# higher increases. In a handful of towns, 
# those increases were twice as high as Chicago’s.
 
now <- data[which(data$FAC_NAME == "chicago"), ]$gal5K_17
then <-data[which(data$FAC_NAME == "chicago"), ]$gal5K_13

inc <- ((now-then)/then)*100 # 34.62873

over <- data[data$per_rate_diff>inc,] %>% filter(!(per_rate_diff == 'NA'))
nrow(over) / (nrow(data)-1) # -1 because we don't have harvey's data
# 0.2407407
twice_over <- data[data$per_rate_diff>(inc*2),]  %>% filter(!(per_rate_diff == 'NA'))

nrow(twice_over) # 5 towns


Out[41]:
0.240740740740741
Out[41]:
5

In [42]:
# The result was that between 2013 and 2017, the median cost of
# Lake Michigan water for a typical household rose from 
# $34 to $45 per month, or 32 percent, the Tribune’s findings show.

((median(data$gal5K_17, na.rm=T) - median(data$gal5K_13, na.rm=T))/median(data$gal5K_13, na.rm=T))*100


Out[42]:
32.1513353115727

In [43]:
# In the 10 towns with the highest water rates, 
# the typical family’s water bill grew 
# to $75 from $51,
# or by 45 percent.
# Five of those towns are predominantly African-American.

top10 <- data %>% arrange(desc(gal5K_17)) %>% head(10)
((median(top10$gal5K_17, na.rm=T) - median(top10$gal5K_13, na.rm=T))/median(top10$gal5K_13, na.rm=T))*100


Out[43]:
45.8846529814272

In [45]:
# many of those communities hit with the largest rate hikes 
# already were paying high water prices. 
# But those towns’ populations are dwindling. 
# Many of their businesses and industries have departed.

data$pop_change <- ((data$CENSUS10_total - data$CENSUS00_total) / data$CENSUS00_total)*100

data %>% arrange(desc(per_rate_diff)) %>% 
        select(FAC_NAME,per_rate_diff,pop_change) %>% head(10)


Out[45]:
FAC_NAMEper_rate_diffpop_change
1calumet city 104.98687664042 -5.00102585145671
2phoenix 100.0594000594 -8.43822843822844
3north riverside 99.5033112582781 -0.905985444824001
4winthrop harbor 96.67170953101361.38345864661654
5indian head park 74.1293532338308 -0.392259414225941
6crestwood 60.6995884773662 -2.39771815669846
7oak park 59.6850393700788 -1.22991394410174
8itasca 58.71536523929474.62078142010403
9calumet park 57.8947368421053 -8.56576029875131
10norridge 56.25 -0.803267528931246

In [46]:
# Part 2: BILLIONS LOST, MILLIONS WASTED
################################

In [47]:
# Of the 946 million gallons that Maywood bought from Melrose Park 
# in 2016, 367 million gallons, or 38.7 percent, 
# never made it to taps, costing residents and businesses 
# in this cash-strapped village nearly $1.7 million

data[which(data$FAC_NAME == "maywood"), ] %>% 
    select(PREALLOSSES,REAL_LOSSES_MGY,WATER_SUPPLIED_ADJUSTED_MGY,COST_REAL_LOSS)


Out[47]:
PREALLOSSESREAL_LOSSES_MGYWATER_SUPPLIED_ADJUSTED_MGYCOST_REAL_LOSS
88 38.7253 367.4090 948.75701660688.0000

In [48]:
# northeast Illinois would have saved nearly 
# $9.1 million if towns using Lake Michigan water 
# had been held to the state’s water loss standard of 12 percent.

d<-data %>% filter(COSTCONSERVED_MGY>0)
sum(d$COSTCONSERVED_MGY)


Out[48]:
9097690.01602041

In [49]:
# Last year, eight towns surveyed by the Tribune — 
# Hometown, East Hazel Crest, Posen, Burnham, Riverdale, Flossmoor, 
# Lyons and Maywood — lost more than 30 percent of their water. 

data %>% arrange(desc(PREALLOSSES)) %>% 
      select(FAC_NAME, COST_REAL_LOSS, PREALLOSSES) %>% head(8)


Out[49]:
FAC_NAMECOST_REAL_LOSSPREALLOSSES
1maywood 1660688 38.7253005774924
2hometown 163244 37.3359591488175
3east hazel crest197602 33.9123853458995
4posen 350584 33.1250290850347
5burnham 257298 33.0749226953377
6riverdale 585059 32.0194086087647
7flossmoor 846481 31.3991922613254
8lyons 589545 31.3112748568483

In [50]:
# Over 1 in 4 towns in the region exceeded the 12 
# percent standard set by the Illinois Department of Natural Resources.

exceed <- data %>% filter(PREALLOSSES>12)

nrow(exceed) / (nrow(data)-5) # 5 did not submit loss reports to IDNR
# 0.2721519


Out[50]:
0.272151898734177

In [51]:
# Towns with majority-black populations lost an average 
# of 18 percent of their water, compared to the region’s 
# overall rate of 10 percent. 
# These towns pay some of the highest rates for water in the area. 

mean(black$PREALLOSSES,na.rm=T)
mean(data$PREALLOSSES,na.rm=T)


Out[51]:
17.5798635965452
Out[51]:
9.84426363462414

In [52]:
# The losses exacted an insidious cost in town after town. In addition 
# to the $1.66 million that Maywood residents paid for all of its 
# wasted water in 2016, Hometown residents paid $163,000; Flossmoor,
#  $846,000; East Hazel Crest, $198,000; Posen, $351,000; and Burnham, $257,000. 

data %>% arrange(desc(PREALLOSSES)) %>% 
      select(FAC_NAME, COST_REAL_LOSS, PREALLOSSES) %>% head(7)


Out[52]:
FAC_NAMECOST_REAL_LOSSPREALLOSSES
1maywood 1660688 38.7253005774924
2hometown 163244 37.3359591488175
3east hazel crest197602 33.9123853458995
4posen 350584 33.1250290850347
5burnham 257298 33.0749226953377
6riverdale 585059 32.0194086087647
7flossmoor 846481 31.3991922613254

In [53]:
# Over a quarter of pipes in the Lake Michigan water 
# use system are more than 60 years old, according to 
# reports submitted to IDNR. Another quarter consists of pipes 
# under 20 years old. 

(sum(data$ALL_60,na.rm=T)/sum(data$TOTAL_PIPE_LENGTH,na.rm=T))*100


Out[53]:
29.1627544675424

In [54]:
(sum(data$ALL_20,na.rm=T)/sum(data$TOTAL_PIPE_LENGTH,na.rm=T))*100


Out[54]:
25.7124857125859

In [ ]: