In [2]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import quote_plus, urlencode
In [3]:
# Helper Functions
def check_nulls(df, col):
'''
returns df with NaN in specified column
'''
return df.ix[df.ix[:,col].isnull()]
def get_notnulls(df, col):
'''
returns rows of df where specified column is not null
'''
return df.ix[df.ix[:,col].notnull()]
def clean_data(df, cleaning_tuples):
'''
replace a string in a column (pat) with a clean string (repl):
e.g. cleaning_tuples = [(col, pat, repl)]
'''
for col, pat, repl in cleaning_tuples:
df.ix[:,col] = df.ix[:,col].str.replace(pat, repl)
def clean_grouped_data(grouped_df):
'''
returns df with counts that result from groupby
'''
counts = pd.DataFrame(grouped_df.count().ix[:,"service_request_number"])
counts = counts.unstack()
counts.columns = counts.columns.droplevel()
counts.columns.name = None
counts.index.name = None
counts.fillna(0, inplace=True)
return counts
In [4]:
BASE_URL = "https://data.cityofchicago.org/resource/"
def make_data_frame(verbose=True):
'''
Idiosyncratic helper that collects 311 service request data and cleans up the pandas DataFrame
'''
date = ["creation_date","completion_date","type_of_service_request"]
date2 = ["date_service_request_was_received","service_request_type"]
endpoints = {"787j-mys9.json?":{"SELECT": date, "WHERE": date[0]},
"cdmx-wzbz.json?":{"SELECT": date, "WHERE": date[0]},
"kcdz-f29q.json?":{"SELECT": date, "WHERE": date[0]},
"yama-9had.json?":{"SELECT": date2, "WHERE": date2[0]}}
base_query = {"SELECT": ["service_request_number",
"community_area",
"latitude",
"longitude"],
"WHERE": " between '2017-03-21T00:00:00' and '2017-03-31T00:00:00'"}
city_data = get_city_data(endpoints, base_query,verbose=verbose)
combine_cols(city_data, date[0], date2[0])
combine_cols(city_data, date[2], date2[1])
return city_data
def get_city_data(endpoints, base_query,verbose=False):
'''
Collects 311 response data based on SoQL query
returns data (pd.DataFrame)
'''
data = pd.DataFrame()
for endpoint, vals in endpoints.items():
query = dict(base_query)
query["SELECT"] = query["SELECT"] + vals["SELECT"]
query["WHERE"] = vals["WHERE"] + query["WHERE"]
new_data = get_data(endpoint, query, verbose=verbose)
data = pd.concat([data, new_data], ignore_index=True)
return data
def get_data(endpoint, query, BASE_URL=BASE_URL, verbose=False):
'''
Inputs: endpoint (string) (e.g.: "yama-9had.json?")
query (dictionary) (SoQL statements divided into constituent parts)
'''
url = BASE_URL + endpoint
count_query = url + "$query=SELECT%20count(*)"
r = requests.get(count_query)
query["LIMIT"] = r.json()[0]["count"]
query = encode_query(query)
query_url = url + "$query=" + query
if verbose:
print("query url: ", query_url)
return pd.read_json(query_url)
def encode_query(query_dict):
'''
Takes a dictionary with SoQL statement parts and combines them into a web-safe string
'''
encoded = ""
for key, val in query_dict.items():
if isinstance(val, list):
try:
val = ", ".join(val)
except:
return "Lists must contain strings only"
try:
encoded += key + " " + val + " "
except:
return "Dictionary values must be strings or list of strings"
return quote_plus(encoded)
def combine_cols(df, col, extra):
'''
Inputs:
df (pd.DataFrame)
col,extra (string) column names
Combines columns with similar information into a single column and drops extra.
'''
df.ix[:,col] = df.ix[:,col].where(df.ix[:,col].notnull(), df.ix[:,extra])
df.drop(extra, axis=1, inplace=True)
def get_com_areas(df):
'''
downloads community area information from the data portal and merges it into the data frame
'''
try:
community_areas=pd.read_json("https://data.cityofchicago.org/resource/igwz-8jzy.json")
except:
return "download failed"
com_areas = community_areas.ix[:,["area_numbe","community"]]
df = pd.merge(df, com_areas, left_on="community_area", right_on="area_numbe")
return df
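For illustration, here is a minimal sketch of how encode_query assembles the string appended after "$query=" in get_data. The fields and date filter mirror those used above, the endpoint is one of the four listed in make_data_frame, and the LIMIT value is a stand-in; no request is issued, the URL is only printed for inspection.

example_query = {"SELECT": ["service_request_number", "creation_date", "completion_date"],
                 "WHERE": "creation_date between '2017-03-21T00:00:00' and '2017-03-31T00:00:00'",
                 "LIMIT": "1000"}
# encode_query joins list values with ", ", concatenates "KEY value" pairs, and percent-encodes the result
print(BASE_URL + "kcdz-f29q.json?" + "$query=" + encode_query(example_query))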
In [5]:
# To re-download data from city portal uncomment below.
# city_data = make_data_frame()
In [6]:
FIPS_URL = "https://www.broadbandmap.gov/broadbandmap/census/block?format=json&"
def fips_from_broadbandmapAPI(url):
'''
Pulls FIPS code from census json
e.g. https://www.broadbandmap.gov/broadbandmap/census/block?format=json&latitude=&longitude=
'''
if pd.notnull(url):
try:
r = requests.get(url).json()
except:
return None
if r['status']=='OK':
return r.get("Results").get("block")[0].get("FIPS")
else:
return r['status']
def add_latlng(lat,lng):
if pd.notnull(lat) and pd.notnull(lng):
return "latitude={}&longitude={}".format(lat, lng)
def add_lnglat_url(df,base_url = FIPS_URL):
'''
Add url for API to dataframe with latitude and longitude
'''
URLS = pd.DataFrame(base_url + pd.Series(map(add_latlng, df.ix[:,"latitude"], df.ix[:,"longitude"])))
df = df.assign(URLS=URLS)
return df
def batch_FIPS_from_API(df):
FIPS = pd.DataFrame(df.ix[:,"URLS"]).applymap(fips_from_broadbandmapAPI)
FIPS = FIPS.rename(columns={"URLS": "FIPS"})
return FIPS
In [ ]:
#Remove triple quotes to pull FIPS data.
'''
df = batch_FIPS_from_API(city_data.ix[city_data.loc[:,'type_of_service_request']=='Vacant/Abandoned Building'])
df = df.rename(columns={"URLS": "FIPS"})
sanitation_df = batch_FIPS_from_API(city_data.ix[city_data.loc[:,'type_of_service_request']=="Sanitation Code Violation"])
sanitation_df = sanitation_df.rename(columns={"URLS": "FIPS"})
city_data = pd.merge(city_data, df, left_index=True, right_index=True, how='left')
city_data = pd.merge(city_data, sanitation_df, left_index=True, right_index=True, how='left')
city_data.ix[:,"FIPS_x"] = city_data.ix[:,"FIPS_x"].combine_first(city_data.ix[:,"FIPS_y"])
city_data = city_data.rename(columns={"FIPS_x":"FIPS"})
city_data.drop("FIPS_y",axis=1,inplace=True)
city_data.to_csv("Safety_csv_1.csv")
del df, sanitation_df
missing_fips = check_nulls(city_data.ix[(city_data.loc[:,'type_of_service_request'] == "Sanitation Code Violation") | (city_data.loc[:,'type_of_service_request'] == 'Vacant/Abandoned Building')], "FIPS")
df = batch_FIPS_from_API(missing_fips)
city_data.ix[:,"FIPS"] = city_data.ix[:,"FIPS"].combine_first(df.ix[:,"FIPS"])
del df
'''
At this point my kernel crashed and I had to reload the data from CSV. This served as a natural break point, because the API heavy lifting had just finished.
In [24]:
#http://stackoverflow.com/questions/21269399/datetime-dtypes-in-pandas-read-csv
col_dtypes = {"":int,"community_area":float,"completion_date":object,
"creation_date":object,"latitude":float,
"longitude":float,"service_request_number":object,
"type_of_service_request":str,"URLS":str,"FIPS":object}
parse_dates = ["completion_date","creation_date"]
city_data = pd.read_csv("Safety_csv_1.csv",dtype=col_dtypes, parse_dates=parse_dates)
city_data.drop('Unnamed: 0',axis=1,inplace=True)
city_data.drop('URLS',axis=1,inplace=True)
city_data.ix[:,"time_to_completion"] = city_data.ix[:,"completion_date"] - city_data.ix[:,"creation_date"]
city_data.ix[:,"time_to_completion"] = city_data.ix[:,"time_to_completion"]/ np.timedelta64(1, 'D')
city_data = get_com_areas(city_data)
########
In [9]:
CENSUS_URL = "http://api.census.gov/data/2015/pdb/blockgroup?get={}&for=block+group:*&in=state:17+county:031"
census_fields = ["pct_NH_Blk_alone_ACS_09_13",
"pct_NH_White_alone_ACS_09_13",
"pct_Hispanic_ACS_09_13",
"pct_College_ACS_09_13",
"pct_Renter_Occp_HU_ACS_09_13",
"pct_Males_ACS_09_13",
"pct_Prs_Blw_Pov_Lev_ACS_09_13",
"pct_Diff_HU_1yr_Ago_ACS_09_13",
"Tot_Population_ACS_09_13",
"GIDBG"]
fields = len(census_fields) - 1
uscensus = pd.read_json(CENSUS_URL.format(",".join(census_fields)))
uscensus.columns = uscensus.ix[0]
uscensus = uscensus.rename(columns={"GIDBG": "FIPS"})
uscensus = uscensus.ix[1:]
uscensus.ix[:,0:fields] = uscensus.ix[:,0:fields].apply(pd.to_numeric, axis=1)
cnsdf = get_notnulls(city_data, "FIPS")
cnsdf=cnsdf.groupby(["FIPS", "type_of_service_request"])
counts = clean_grouped_data(cnsdf)
counts.ix[:,"FIPS"] = counts.index
counts.ix[:,"FIPS"] = pd.DataFrame(counts.ix[:,"FIPS"]).applymap(lambda x: x[:-3])
census_by_311 = pd.merge(counts, uscensus, left_on="FIPS", right_on="FIPS")
for col in census_fields[:-1]:
census_by_311[col] = pd.to_numeric(census_by_311[col])
census_by_311[census_by_311.columns[0] + " per 10000"]=10000*census_by_311.ix[:,0].fillna(0)/pd.to_numeric(census_by_311.Tot_Population_ACS_09_13)
census_by_311[census_by_311.columns[1] + " per 10000"]=10000*census_by_311.ix[:,1].fillna(0)/pd.to_numeric(census_by_311.Tot_Population_ACS_09_13)
In [26]:
from bokeh.plotting import figure
from bokeh.io import show, output_notebook,save
from bokeh.charts import Histogram
from bokeh.layouts import gridplot
output_notebook()
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.patches import PathPatch
%matplotlib inline
In [11]:
df = city_data
In [12]:
gdf = df.groupby("type_of_service_request")
totals = pd.DataFrame(gdf.count().ix[:,"service_request_number"])
totals.index.name=None
totals.columns
TYPES = totals.index
totals
Out[12]:
In [13]:
s = df.ix[:,"time_to_completion"].describe(percentiles=np.linspace(0,1,11))
s[1:]
Out[13]:
In [14]:
g2df = df.groupby(["community","type_of_service_request"])
counts = clean_grouped_data(g2df)
community_area_descriptives=round(counts.describe(),1)
community_area_descriptives
Out[14]:
In [15]:
means = counts.mean()
counts.ix[(counts.ix[:,0]>means[0]) & (counts.ix[:,1]>means[1]) & (counts.ix[:,2]>means[2]) & (counts.ix[:,3]>means[3])]
Out[15]:
In [27]:
p1 = Histogram(counts.ix[:,0], title = counts.columns[0])
p2 = Histogram(counts.ix[:,1], title = counts.columns[1])
p3 = Histogram(counts.ix[:,2], title = counts.columns[2])
p4 = Histogram(counts.ix[:,3], title = counts.columns[3])
row1 = [p1,p2]
row2 = [p3,p4]
layout = gridplot([[p1,p2],[p3,p4]], plot_width=250, plot_height = 250)
show(layout)
save(layout)
Out[27]:
In [30]:
def graph_over_time(col = 'type_of_service_request',request_type = "Sanitation Code Violation"):
a = df[df.ix[:,col]==request_type]
g = a.groupby("completion_date").count().ix[:,"service_request_number"]
p = figure(plot_width=400, plot_height=400,x_axis_type="datetime",title=request_type)
p.line(g.index, g)
save(p, filename= quote_plus(request_type) + ".html")
return p
for t in TYPES:
show(graph_over_time(request_type = t))
Of 77 community areas, only Ashburn and Chicago Lawn had above-average call-ins for all four 311 categories.
In Chicago, graffiti is far more common than the other disturbances, with twice as many calls as potholes, six times as many as sanitation code violations, and 24 times as many as vacant buildings. As a Chicago driver, I find this result surprising. On the other hand, people may have given up on potholes and may not call 311 as often as they encounter them.
If the completion data is to be trusted, the city resolved over 60 percent of the 200,000 complaints within one day, and over 80 percent within a week. After that the wait times increase sharply: requests in the top decile wait at least three weeks.
Each call type has its own seasonality, but calls seem to decrease in December, especially the vacant building calls. Potholes have the most distinct shape, with a bimodal distribution that peaks in July and February.
At the neighborhood level the variances and means are close, and the histograms show the probability mass concentrated at low counts for all four categories. This suggests a Poisson process, which matches what theory says we should see when we are counting events that arrive over time.
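That Poisson intuition can be checked crudely: a Poisson distribution has equal mean and variance, so the variance-to-mean ratio for each request type should sit near 1. A minimal sketch, assuming the community-level counts DataFrame built in In [14] above:

# Index of dispersion per request type; values near 1 are consistent with Poisson counts,
# values well above 1 indicate overdispersion across community areas.
dispersion = counts.var() / counts.mean()
print(round(dispersion, 2))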
In [18]:
census_by_311.ix[:,-2:].describe()
Out[18]:
In [19]:
vacants = census_by_311.ix[census_by_311.ix[:,"Vacant/Abandoned Building per 10000"] > 5]
vd = vacants.describe(percentiles=[]).ix[1:,2:10]
vd
Out[19]:
In [20]:
sanitation_violation = census_by_311.ix[census_by_311.ix[:,'Sanitation Code Violation per 10000'] > 19]
sv = sanitation_violation.describe(percentiles=[]).ix[1:,2:10]
sv
Out[20]:
In [21]:
sv - vd
Out[21]:
Vacant property and sanitation complaints are common 311 complaints. Over the past year, the City received roughly 5,000 vacant property complaints and 20,000 sanitation complaints. With block-level Census data, we can characterize the communities that generate these respective complaints. The following analysis describes the average census block originating these 311 calls. To draw sharper contrasts, I describe census blocks in the top quarter of the distribution after adjusting for population.
The average block made 4.4 calls per 10,000 residents to 311 about vacant property; however, the median census block had 0 calls, which suggests these calls are concentrated in certain areas. On average, the top quarter, those with 5 or more calls per 10,000 residents over the last year, have large African American populations and relatively high levels of poverty compared to the sanitation violation blocks. These blocks average 54 percent renter occupancy, and 15 percent of households moved in the past year, both very close to the sanitation violation blocks. The sanitation violation blocks have larger white and Latino populations and higher levels of college attainment: 25 percent versus 15 percent.
In [22]:
prob = counts.ix[["WEST ENGLEWOOD", "AUBURN GRESHAM"],:]
s = prob.sum()
pr = s/s.sum()
pr
Out[22]:
In [23]:
prob = counts.ix[["SOUTH LAWNDALE","UPTOWN"],:]
SLAWNDALE = prob.ix[0]/prob.T.sum().T[0]
UPTOWN = prob.ix[1]/prob.T.sum().T[1]
prob.T.sum().T/prob.T.sum().T.sum()
Out[23]:
Graffiti Removal 0.150587
Pothole in Street 0.422058
Sanitation Code Violation 0.245452
Vacant/Abandoned Building 0.181902
Pr(SOUTH LAWNDALE) = 0.714, Pr(Graffiti Removal | SOUTH LAWNDALE) = 0.787102
Pr(UPTOWN) = 0.28579, Pr(Graffiti Removal | UPTOWN) = 0.661727
Pr(Graffiti Removal) = 0.75127
Pr(SOUTH LAWNDALE | Graffiti): $P(A \mid B) = \frac{0.787102 \times 0.714}{0.75127} = 0.748$
Pr(UPTOWN | Graffiti): $P(A \mid B) = \frac{0.661727 \times 0.28579}{0.75127} = 0.25$
So, given a graffiti call, South Lawndale is about 50 percentage points (roughly three times) more likely than Uptown to be the source of the call.
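The same posterior can be computed directly from the community-level counts table instead of by hand. A minimal sketch, assuming the counts DataFrame from In [14] and its "Graffiti Removal" column; .loc is used here in place of the .ix indexing above:

prob = counts.loc[["SOUTH LAWNDALE", "UPTOWN"], :]
totals = prob.sum(axis=1)                       # total calls per community
prior = totals / totals.sum()                   # Pr(community)
likelihood = prob["Graffiti Removal"] / totals  # Pr(Graffiti | community)
evidence = (prior * likelihood).sum()           # Pr(Graffiti)
posterior = prior * likelihood / evidence       # Pr(community | Graffiti), ~0.75 vs ~0.25
print(posterior)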
By the same reasoning, the probability that a call for Graffiti Removal originated in Englewood is $P(A \mid B) = \frac{0.6 \times \frac{1}{6}}{0.24} = \frac{5}{12}$.
And so the probability it originated in Uptown is $\frac{7}{12}$, which is $\frac{1}{6}$ more likely.
In [ ]: