This notebook repeats the pothole analysis for Chicago 311 graffiti removal requests. It is a copy of the pothole notebook; refer to that notebook for documentation.
In [9]:
# Locations of the input files and of the preprocessing cache.
# Every file lives below one base directory, so only `data_dir` has to be
# edited when the data is moved.
data_dir = "/home/joerg/data/chicago"
area_shape_fname = "{}/shapes/CommAreas.shp".format(data_dir)          # community area polygons
graffiti_fname = "{}/graffiti.csv".format(data_dir)                    # raw 311 graffiti requests
graffiti_processed_fname = "{}/graffiti_processed.csv".format(data_dir)  # cache written by the preprocessing cell
socio_fname = "{}/socioeconomic.csv".format(data_dir)                  # socioeconomic indicators per area
In [10]:
%matplotlib inline
import os
import sys
from dateutil.parser import parse as dtparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gp
from shapely import geometry
from IPython.display import clear_output
#This one is optional, but makes the plots look nicer:
import seaborn as sns
sns.set_context('poster')
In [11]:
# Load the community area polygons and the graffiti requests.
# Assigning every request to a community area is slow (one point-in-polygon
# test per request and polygon), so the result is cached in
# `graffiti_processed_fname` and reused on later runs.
geodf = gp.read_file(area_shape_fname)
if os.path.exists(graffiti_processed_fname):
    dat = pd.read_csv(graffiti_processed_fname)
else: #Preprocess the data
    dat = pd.read_csv(graffiti_fname)
    # .copy() so that adding a column below modifies an independent frame
    # rather than a slice of the raw data.
    dat = dat[dat.status == 'Completed'].copy()
    print(dat.shape)

    # The polygons do not change while looping over the requests, so pull
    # them out of the GeoDataFrame once instead of once per request.
    community_shapes = list(zip(geodf.geometry, geodf["COMMUNITY"]))

    area = []
    for i, (_, row) in enumerate(dat.iterrows()):
        if i and i%100 == 0:
            clear_output()
            print("{} rows of {} processed".format(i, dat.shape[0]))
            sys.stdout.flush()
        point = geometry.Point(row["x_coordinate"], row["y_coordinate"])
        for shape, community in community_shapes:
            if shape.contains(point):
                area.append(community)
                # Stop at the first match: exactly one entry per request is
                # required, otherwise `area` no longer lines up with `dat`.
                break
        else:
            # No polygon contains the point.
            area.append("UNKNOWN")
    dat["community_area"] = area
    print("{} graffiti requests not assigned to community areas".format(dat[dat.community_area=="UNKNOWN"].shape[0]))
    dat = dat[dat.community_area!="UNKNOWN"]
    dat.to_csv(graffiti_processed_fname, index=False)
In [12]:
# Whole days elapsed between the creation and the completion of each request.
elapsed_days = []
for completed, created in zip(dat.completion_date, dat.creation_date):
    elapsed = dtparse(completed) - dtparse(created)
    elapsed_days.append(elapsed.days)
dat['days_to_repair'] = elapsed_days
In [13]:
# Mean number of days to completion per community area.
time_per_area = (
    dat.groupby('community_area')
    .agg({'days_to_repair': 'mean'})
    .reset_index()
)

# Both tables are joined on the lower-cased community area name.
time_per_area['community_area'] = [name.lower() for name in time_per_area['community_area']]
socio = pd.read_csv(socio_fname)
socio['community_area'] = [name.lower() for name in socio['COMMUNITY AREA NAME']]

# Inner join: only areas present in both tables end up in X.
X = time_per_area.merge(socio, how='inner', on='community_area')
In [ ]:
In [19]:
# Map of the average number of days to remove graffiti per community area.
def _dtr_category(dtr):
    """Return the legend label for an average number of days `dtr`.

    Bins are half-open, [lower, upper), so every value belongs to exactly
    one bin. Missing values (None/NaN) get their own '(no data)' label
    instead of falling through to the last bin.
    """
    if pd.isnull(dtr):
        return '(no data)'
    if dtr < 4:
        return '4 or less'
    if dtr < 6:
        return '4-6'
    if dtr < 8:
        return '6-8'
    if dtr < 10:
        return '8-10'
    return '>10'


plt.figure()
geodf['community_area'] = [name.lower() for name in geodf.COMMUNITY]

# Build the area -> value lookup once instead of filtering X twice per area.
# drop_duplicates keeps the first row per area, like the previous `.iloc[0]`.
# Areas that are missing from X come out of .map() as NaN.
dtr_by_area = X.drop_duplicates('community_area').set_index('community_area')['days_to_repair']
geodf['days_to_repair'] = geodf['community_area'].map(dtr_by_area)

# NOTE(review): geopandas appears to order string categories alphabetically
# when assigning colours; the labels above sort in ascending bin order with
# '(no data)' first - confirm against the installed geopandas version.
geodf['dtr_cat'] = [_dtr_category(dtr) for dtr in geodf.days_to_repair]
geodf.plot('dtr_cat', categorical=True, legend=True, colormap='OrRd')
plt.title('Avg. #days to remove graffiti in Chicago')
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.show()
In [16]:
# Map of the per capita income per community area.
def _income_category(inc):
    """Return the legend label for a per capita income `inc`.

    Bins are half-open, [lower, upper), so every value belongs to exactly
    one bin. Missing values (None/NaN) get their own '(no data)' label
    instead of falling through to the last bin.
    """
    if pd.isnull(inc):
        return '(no data)'
    if inc < 10000:
        return '10k or less'
    if inc < 20000:
        return '10k-20k'
    if inc < 30000:
        return '20k-30k'
    if inc < 40000:
        return '30k-40k'
    if inc < 50000:
        return '40k-50k'
    return '>50k'


plt.figure()

# The lookup below needs the lower-cased area name on geodf; create it here
# if no other cell has done so yet, so this cell can run on its own.
if 'community_area' not in geodf:
    geodf['community_area'] = [name.lower() for name in geodf.COMMUNITY]

# Build the area -> value lookup once instead of filtering X twice per area.
# drop_duplicates keeps the first row per area, like the previous `.iloc[0]`.
# Areas that are missing from X come out of .map() as NaN.
income_by_area = X.drop_duplicates('community_area').set_index('community_area')["PER CAPITA INCOME "]
geodf['income'] = geodf['community_area'].map(income_by_area)

# NOTE(review): geopandas appears to order string categories alphabetically
# when assigning colours; the labels above sort in ascending bin order with
# '(no data)' first - confirm against the installed geopandas version.
geodf['inc_cat'] = [_income_category(inc) for inc in geodf.income]
geodf.plot('inc_cat', categorical=True, legend=True, colormap='cool')
plt.title('Per capita income Chicago')
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.show()