You should get the community area boundaries from https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Community-Areas-current-/cauq-8yn6 (Export->Shapefile) and adjust the path below to the shapefile. Also, use pull_data to pull the 311 reports for Potholes from https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Pot-Holes-Reported/7as2-ds3y into a csv and adjust the path to the file below. Since finding community areas takes a while, the processed potholes file will be saved back to potholes_processed_fname. If that file is found, the preprocessing steps will be skipped and that file loaded.
For later, also grab the csv from here: https://data.cityofchicago.org/Health-Human-Services/Census-Data-Selected-socioeconomic-indicators-in-C/kn9c-c2s2 and save it somewhere, and add the path below.
In [4]:
area_shape_fname = "/home/joerg/data/chicago/shapes/CommAreas.shp"
potholes_fname = "/home/joerg/data/chicago/potholes.csv"
potholes_processed_fname = "/home/joerg/data/chicago/potholes_processed.csv"
socio_fname = "/home/joerg/data/chicago/socioeconomic.csv"
In [19]:
%matplotlib inline
import os
import sys
from dateutil.parser import parse as dtparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gp
from shapely import geometry
from IPython.display import clear_output
#This one is optional, but makes the plots look nicer:
import seaborn as sns
sns.set_context('poster')
In [6]:
geodf = gp.read_file(area_shape_fname)
if os.path.exists(potholes_processed_fname):
dat = pd.read_csv(potholes_processed_fname)
else: #Preprocess the data
dat = pd.read_csv(potholes_fname)
dat = dat[dat.status == 'Completed']
print(dat.shape)
area = []
for i, (_, row) in enumerate(dat.iterrows()):
if i and i%100 == 0:
clear_output()
print("{} rows of {} processed".format(i, dat.shape[0]))
sys.stdout.flush()
point = geometry.Point(row["x_coordinate"], row["y_coordinate"])
found = False
for _, georow in geodf.iterrows():
if georow.geometry.contains(point):
area.append(georow["COMMUNITY"])
found = True
continue
if not found:
area.append("UNKNOWN")
dat["community_area"] = area
print("{} potholes not assigned to community areas".format(dat[dat.community_area=="UNKNOWN"].shape[0]))
dat = dat[dat.community_area!="UNKNOWN"]
dat.to_csv(potholes_processed_fname, index=False)
In [7]:
dat['days_to_repair'] = [(dtparse(codt)-dtparse(credt)).days for codt, credt in zip(dat.completion_date, dat.creation_date)]
In [8]:
time_per_area = dat.groupby('community_area').aggregate({'days_to_repair': 'mean'}).reset_index()
time_per_area['community_area'] = [x.lower() for x in time_per_area.community_area]
socio = pd.read_csv(socio_fname)
socio['community_area'] = [x.lower() for x in socio['COMMUNITY AREA NAME']]
X = pd.merge(time_per_area, socio, on='community_area')
In [20]:
plt.figure()
plt.scatter(X["PER CAPITA INCOME "], X.days_to_repair)
plt.xlabel('Per capita income')
plt.ylabel('Avg. #days to patch a pothole')
plt.title('Days to patch a pothole over Chicago community area per capita income')
labeled = pd.concat((X[X.days_to_repair>25], X[X["PER CAPITA INCOME "] > 50000],
X[X.days_to_repair==X.days_to_repair.min()], X[X["PER CAPITA INCOME "] == X["PER CAPITA INCOME "].min()]))
for _, row in labeled.iterrows():
label = row["community_area"]
label = label[0].upper () + label[1:]
x = row["PER CAPITA INCOME "]
y = row["days_to_repair"]
plt.annotate(label, xy=(x,y), ha = 'left', va = 'bottom', bbox=dict(boxstyle='round,pad=0.2', fc='SlateGrey', alpha=0.3))
plt.grid('on')
plt.show()
In [35]:
plt.figure()
geodf['community_area'] = [x.lower() for x in geodf.COMMUNITY]
geodf['days_to_repair'] = [X[X.community_area == ar]["days_to_repair"].iloc[0]
if X[X.community_area == ar]["days_to_repair"].shape[0] else None for ar in geodf.community_area]
geodf['dtr_cat'] = ['10 or less' if dtr < 10 else '10-15' if 10<=dtr<15 else '15-20' if 15<=dtr<20
else '20-25' if 20<dtr<25 else '25-30' if 25<dtr<30 else '>30' for dtr in geodf.days_to_repair]
geodf.plot('dtr_cat', categorical=True, legend=True, colormap='OrRd')
plt.title('Avg. #days to repair a pothole in Chicago')
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.show()
In [37]:
plt.figure()
geodf['income'] = [X[X.community_area == ar]["PER CAPITA INCOME "].iloc[0]
if X[X.community_area == ar]["PER CAPITA INCOME "].shape[0] else None for ar in geodf.community_area]
geodf['inc_cat'] = ['10k or less' if inc < 10000 else '10k-20k' if 10000<=inc<20000 else '20k-30k' if 20000<=inc<30000
else '30k-40k' if 30000<inc<40000 else '40k-50k' if 40000<inc<50000 else '>50k' for inc in geodf.income]
geodf.plot('inc_cat', categorical=True, legend=True, colormap='cool')
plt.title('Per capita income Chicago')
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.show()