Graffiti removal times

This notebook adapts the pothole notebook to 311 graffiti removal requests: the analysis is the same, only the input data differs. Refer to the pothole notebook for detailed documentation.

Configuration


In [9]:
# Input/output locations. All files live under one root directory so the
# notebook can be pointed at another machine's data without editing every
# path: set the CHICAGO_DATA_DIR environment variable to override the root.
# The default reproduces the original hard-coded location.
import os  # imported here because this cell runs before the imports cell

data_dir = os.environ.get("CHICAGO_DATA_DIR", "/home/joerg/data/chicago")

# Shapefile with the community-area polygons (read with geopandas).
area_shape_fname = os.path.join(data_dir, "shapes", "CommAreas.shp")
# Raw graffiti removal requests.
graffiti_fname = os.path.join(data_dir, "graffiti.csv")
# Cache of the requests after community areas have been assigned; the
# preprocessing cell writes it on the first run and reads it afterwards.
graffiti_processed_fname = os.path.join(data_dir, "graffiti_processed.csv")
# Socioeconomic indicators, merged on the community area name.
socio_fname = os.path.join(data_dir, "socioeconomic.csv")

Imports


In [10]:
%matplotlib inline
import os
import sys

from dateutil.parser import parse as dtparse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gp
from shapely import geometry
from IPython.display import clear_output

#This one is optional, but makes the plots look nicer:
import seaborn as sns
sns.set_context('poster')

In [11]:
# Load the community-area polygons; `geodf` is also used by the map cells.
geodf = gp.read_file(area_shape_fname)
if os.path.exists(graffiti_processed_fname):
    # A previous run already assigned community areas; reuse its output.
    dat = pd.read_csv(graffiti_processed_fname)
else:  # Preprocess the data
    dat = pd.read_csv(graffiti_fname)
    # Keep completed requests only. The .copy() makes the filtered frame
    # independent so the new column below is not written to a slice.
    dat = dat[dat.status == 'Completed'].copy()
    print(dat.shape)

    # Build the (name, polygon) pairs once instead of re-iterating the
    # GeoDataFrame for every request.
    community_shapes = list(zip(geodf["COMMUNITY"], geodf.geometry))

    area = []
    n_rows = dat.shape[0]
    for i, (_, row) in enumerate(dat.iterrows()):
        if i and i % 100 == 0:
            # Progress indicator, refreshed in place every 100 rows.
            clear_output()
            print("{} rows of {} processed".format(i, n_rows))
            sys.stdout.flush()
        # NOTE(review): assumes x_coordinate/y_coordinate use the same
        # coordinate system as the shapefile - confirm.
        point = geometry.Point(row["x_coordinate"], row["y_coordinate"])
        # Append exactly one entry per request: the first polygon that
        # contains the point, or "UNKNOWN" when none does. Stopping at the
        # first hit keeps `area` the same length as `dat` even if polygons
        # overlap, and avoids scanning the remaining polygons.
        for name, shape in community_shapes:
            if shape.contains(point):
                area.append(name)
                break
        else:
            area.append("UNKNOWN")
    dat["community_area"] = area

    n_unknown = (dat.community_area == "UNKNOWN").sum()
    print("{} graffiti requests not assigned to community areas".format(n_unknown))

    # Drop unassigned requests and cache the result for later runs.
    dat = dat[dat.community_area != "UNKNOWN"]
    dat.to_csv(graffiti_processed_fname, index=False)

In [12]:
# Whole days between a request's creation and its completion, one value per
# row of `dat` (both date columns are parsed with dateutil).
elapsed_days = []
for completed, created in zip(dat.completion_date, dat.creation_date):
    elapsed = dtparse(completed) - dtparse(created)
    elapsed_days.append(elapsed.days)
dat['days_to_repair'] = elapsed_days

In [13]:
# Mean removal time per community area, with the area name as a regular column.
mean_days = dat.groupby('community_area').agg({'days_to_repair': 'mean'})
time_per_area = mean_days.reset_index()

# Both tables are joined on a lower-cased area name so that differences in
# capitalisation between the two sources do not prevent a match.
time_per_area['community_area'] = [name.lower() for name in time_per_area['community_area']]

socio = pd.read_csv(socio_fname)
socio['community_area'] = [name.lower() for name in socio['COMMUNITY AREA NAME']]

# Inner join: only areas present in both tables end up in X.
X = pd.merge(time_per_area, socio, on='community_area')

Plot results


In [ ]:

# Scatter plot of average graffiti removal time against per-capita income,
# one point per community area in X.
income_col = "PER CAPITA INCOME "  # the column name carries a trailing space

plt.figure()
plt.scatter(X[income_col], X.days_to_repair)
plt.xlabel('Per capita income')
plt.ylabel('Avg. #days to remove graffiti')
plt.title('Days to remove graffiti over Chicago community area per capita income')

# Only the extremes are labelled: slowest and fastest areas, the richest
# areas, and the area with the lowest income.
labeled = pd.concat((
    X[X.days_to_repair > 8],
    X[X[income_col] > 50000],
    X[X.days_to_repair < 4],
    X[X[income_col] == X[income_col].min()],
))
# An area matching several criteria appears several times after the concat;
# keep one copy so its annotation is not drawn on top of itself.
labeled = labeled[~labeled.index.duplicated()]

for _, row in labeled.iterrows():
    label = row["community_area"]
    label = label[0].upper() + label[1:]  # names were lower-cased for the merge
    x = row[income_col]
    y = row["days_to_repair"]
    plt.annotate(label, xy=(x, y), ha='left', va='bottom',
                 bbox=dict(boxstyle='round,pad=0.2', fc='SlateGrey', alpha=0.3))

plt.grid(True)
plt.show()

Let's also plot that as a map


In [19]:
def _dtr_category(dtr):
    """Return the legend bucket for an average removal time in days.

    Buckets are half-open on the right ([4, 6), [6, 8), [8, 10)) so every
    value falls into exactly one of them; missing values get their own
    'no data' bucket instead of being counted as '>10'.
    """
    if pd.isnull(dtr):
        return 'no data'
    if dtr < 4:
        return '4 or less'
    if dtr < 6:
        return '4-6'
    if dtr < 8:
        return '6-8'
    if dtr < 10:
        return '8-10'
    return '>10'


plt.figure()
# Lower-cased area name, matching the key used in X.
geodf['community_area'] = [x.lower() for x in geodf.COMMUNITY]

# Look up each area's mean removal time in X. Areas missing from X become
# NaN; if X held the same area twice, the first row is used.
dtr_by_area = X.drop_duplicates('community_area').set_index('community_area')['days_to_repair']
geodf['days_to_repair'] = geodf['community_area'].map(dtr_by_area)
geodf['dtr_cat'] = [_dtr_category(dtr) for dtr in geodf.days_to_repair]

# NOTE(review): newer geopandas releases name this keyword `cmap`; kept as
# `colormap` to match the version this notebook was written for - confirm.
geodf.plot('dtr_cat', categorical=True, legend=True, colormap='OrRd')
plt.title('Avg. #days to remove graffiti in Chicago')
# Map coordinates carry no information for the reader; hide both axes.
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.show()



In [16]:
def _income_category(inc):
    """Return the legend bucket for a per-capita income value.

    Buckets are half-open on the right (e.g. [30000, 40000)) so boundary
    values such as exactly 30000 or 40000 land in their own range rather
    than in '>50k'; missing values get a separate 'no data' bucket.
    """
    if pd.isnull(inc):
        return 'no data'
    if inc < 10000:
        return '10k or less'
    if inc < 20000:
        return '10k-20k'
    if inc < 30000:
        return '20k-30k'
    if inc < 40000:
        return '30k-40k'
    if inc < 50000:
        return '40k-50k'
    return '>50k'


plt.figure()
# Look up each area's per-capita income in X (the column name carries a
# trailing space). Areas missing from X become NaN; if X held the same area
# twice, the first row is used.
income_by_area = X.drop_duplicates('community_area').set_index('community_area')["PER CAPITA INCOME "]
geodf['income'] = geodf['community_area'].map(income_by_area)
geodf['inc_cat'] = [_income_category(inc) for inc in geodf.income]

# NOTE(review): newer geopandas releases name this keyword `cmap`; kept as
# `colormap` to match the version this notebook was written for - confirm.
geodf.plot('inc_cat', categorical=True, legend=True, colormap='cool')
plt.title('Per capita income Chicago')
# Map coordinates carry no information for the reader; hide both axes.
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.show()