Machine Learning Project 0


In [5]:
import numpy as np
import pandas as pd
import requests
from urllib.parse import quote_plus, urlencode

In [6]:
# Helper Functions
def check_nulls(df, col):
    '''
    Return the rows of df whose value in the specified column is null (NaN/None).

    Inputs:
        df (pd.DataFrame)
        col: column label
    Returns: pd.DataFrame subset of df (same columns)
    '''
    # .ix was deprecated in pandas 0.20 and removed in 1.0 -> use .loc
    return df.loc[df.loc[:, col].isnull()]

def get_notnulls(df, col):
    '''
    Return the rows of df whose value in the specified column is NOT null.
    (Docstring fixed: it previously claimed the opposite.)

    Inputs:
        df (pd.DataFrame)
        col: column label
    Returns: pd.DataFrame subset of df (same columns)
    '''
    # .ix was removed in pandas 1.0 -> use .loc
    return df.loc[df.loc[:, col].notnull()]



def clean_data(df, cleaning_tuples):
    '''
    Replace a string in a column (pat) with a clean string (repl), in place:
    e.g. cleaning_tuples = [(col, pat, repl)]

    Inputs:
        df (pd.DataFrame): modified in place
        cleaning_tuples (list of (col, pat, repl) tuples)
    Returns: None
    '''
    for col, pat, repl in cleaning_tuples:
        # regex=True preserves the behavior of older pandas, where
        # Series.str.replace always treated `pat` as a regular expression
        # (the default flipped to regex=False in pandas 2.0).
        df.loc[:, col] = df.loc[:, col].str.replace(pat, repl, regex=True)
        
def clean_grouped_data(grouped_df):
    '''
    Turn a two-key groupby into a wide DataFrame of counts.

    Inputs:
        grouped_df: DataFrameGroupBy grouped on two keys; the second key
            becomes the columns of the result.
    Returns: pd.DataFrame of counts, missing combinations filled with 0.
    '''
    counts = pd.DataFrame(grouped_df.count().loc[:, "service_request_number"])
    counts = counts.unstack()
    # unstack leaves ("service_request_number", <category>) column tuples;
    # keep only the category level and clear the axis names for clean display.
    counts.columns = counts.columns.droplevel()
    counts.columns.name = None
    counts.index.name = None
    # Group combinations that never occurred are NaN after unstack -> 0.
    return counts.fillna(0)

In [7]:
BASE_URL = "https://data.cityofchicago.org/resource/"

def make_data_frame(verbose=True):
    '''
    Idiosyncratic command that collects 311 response data and cleans up pandas dataframe 
    '''
    # Three endpoints share one set of column names; the fourth
    # (vacant buildings, "yama-9had") uses different names for the same data.
    date = ["creation_date", "completion_date", "type_of_service_request"]
    date2 = ["date_service_request_was_received", "service_request_type"]

    endpoints = {
        "787j-mys9.json?": {"SELECT": date, "WHERE": date[0]},
        "cdmx-wzbz.json?": {"SELECT": date, "WHERE": date[0]},
        "kcdz-f29q.json?": {"SELECT": date, "WHERE": date[0]},
        "yama-9had.json?": {"SELECT": date2, "WHERE": date2[0]},
    }

    # Columns and date window shared by every endpoint's query.
    base_query = {
        "SELECT": ["service_request_number",
                   "community_area",
                   "latitude",
                   "longitude"],
        "WHERE": " between '2017-03-21T00:00:00' and '2017-03-31T00:00:00'",
    }

    city_data = get_city_data(endpoints, base_query, verbose=verbose)

    # Fold the fourth endpoint's differently-named columns into the shared ones.
    combine_cols(city_data, date[0], date2[0])
    combine_cols(city_data, date[2], date2[1])

    return city_data
    
def get_city_data(endpoints, base_query, verbose=False):
    '''
    Collects 311 response data based on SoQL query parts.

    Inputs:
        endpoints (dict): endpoint string -> {"SELECT": [...], "WHERE": str}
            extras merged into the base query per endpoint
        base_query (dict): SELECT columns / WHERE clause shared by all endpoints
        verbose (bool): passed through to get_data
    Returns: data (pd.DataFrame) with all endpoints' rows concatenated
    '''
    frames = []
    for endpoint, vals in endpoints.items():
        # Copy so each endpoint extends a fresh version of the base query.
        query = dict(base_query)
        query["SELECT"] = query["SELECT"] + vals["SELECT"]
        query["WHERE"] = vals["WHERE"] + query["WHERE"]
        frames.append(get_data(endpoint, query, verbose=verbose))
    # Concatenate once at the end instead of re-concatenating the accumulated
    # frame on every iteration (avoids quadratic copying).
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def get_data(endpoint, query, BASE_URL=BASE_URL, verbose=False):
    '''
    Fetch all rows from a Socrata endpoint for a SoQL query.

    Inputs: endpoint (string) (e.g.: "yama-9had.json?")
            query (dictionary) (SoQL statements divided into constituent parts)
            BASE_URL (string): data-portal base url
            verbose (bool): print the final query url
    Returns: pd.DataFrame parsed from the JSON response
    '''
    url = BASE_URL + endpoint
    # Ask the endpoint how many rows exist so LIMIT can cover all of them.
    count_query = url + "$query=SELECT%20count(*)"
    r = requests.get(count_query)
    # Copy so the caller's dict is not mutated, and coerce the count to str:
    # encode_query only concatenates strings, so a numeric count would make it
    # return its error message instead of a query.
    query = dict(query)
    query["LIMIT"] = str(r.json()[0]["count"])
    encoded = encode_query(query)
    query_url = url + "$query=" + encoded

    if verbose:
        print("query url: ", query_url)

    return pd.read_json(query_url)
        
def encode_query(query_parts):
    '''
    Take a dictionary of SoQL statement parts and combine them into a
    web-safe (percent-encoded) query string.

    Inputs:
        query_parts (dict): clause keyword -> string or list of strings,
            e.g. {"SELECT": ["a", "b"], "WHERE": "x"}; clauses are emitted in
            the dict's insertion order.
    Returns: percent-encoded query string, or an error-message string when a
        value has the wrong type (kept as string returns, not exceptions, to
        preserve the original interface).
    '''
    encoded = ""
    # Parameter renamed from `dict`, which shadowed the builtin.
    for keyword, value in query_parts.items():
        if isinstance(value, list):
            try:
                value = ", ".join(value)
            except TypeError:  # narrowed from a bare except
                return "Lists must contain strings only"
        try:
            encoded += keyword + " " + value + " "
        except TypeError:  # narrowed from a bare except
            return "Dictionary values must be strings or list of strings"
    return quote_plus(encoded)

def combine_cols(df, col, extra):
    '''
    Inputs:
        df (pd.DataFrame): modified in place
        col, extra (string): column names
    
    Combines columns with similar information into a single column: values in
    `col` are kept where present and filled from `extra` where missing, then
    `extra` is dropped.
    '''
    # .ix was removed in pandas 1.0 -> use .loc
    df.loc[:, col] = df.loc[:, col].where(df.loc[:, col].notnull(), df.loc[:, extra])
    df.drop(extra, axis=1, inplace=True)
    
def get_com_areas(df):
    '''
    Downloads community-area information from the Chicago data portal and
    merges it onto df by community-area number.

    Inputs:
        df (pd.DataFrame) with a "community_area" column
    Returns: merged pd.DataFrame, or the string "download failed" on a
        request error (callers should check for that sentinel).
    '''
    try:
        community_areas = pd.read_json("https://data.cityofchicago.org/resource/igwz-8jzy.json")
    except Exception:  # narrowed from a bare except (keeps Ctrl-C working)
        return "download failed"

    # "area_numbe" is the portal's (truncated) column name for the area number.
    com_areas = community_areas.loc[:, ["area_numbe", "community"]]
    df = pd.merge(df, com_areas, left_on="community_area", right_on="area_numbe")
    return df

In [5]:
# To re-download data from city portal uncomment below.    
# city_data = make_data_frame()

In [8]:
FIPS_URL = "https://www.broadbandmap.gov/broadbandmap/census/block?format=json&"

def fips_from_broadbandmapAPI(url):
    '''
    Pulls the census-block FIPS code from the broadbandmap API json.
    e.g. https://www.broadbandmap.gov/broadbandmap/census/block?format=json&latitude=&longitude=

    Inputs:
        url: full API url, or a null value (null rows are skipped)
    Returns: FIPS code string on success; the API's status string when the
        lookup did not succeed; None when url is null or the request fails.
    '''
    if pd.notnull(url):
        try:
            r = requests.get(url).json()
        except Exception:  # narrowed from a bare except; still best-effort
            return None

        if r['status'] == 'OK':
            # NOTE(review): assumes "Results"/"block" are present whenever
            # status is OK — a malformed OK response would raise here.
            return r.get("Results").get("block")[0].get("FIPS")
        else:
            return r['status']

def add_latlng(lat, lng):
    '''
    Build the "latitude=...&longitude=..." query fragment for the FIPS API.

    Returns None (explicitly) when either coordinate is missing, so callers
    can skip rows without location data.
    '''
    if pd.notnull(lat) and pd.notnull(lng):
        return "latitude={}&longitude={}".format(lat, lng)
    return None
    
    

def add_lnglat_url(df, base_url=FIPS_URL):
    '''
    Add a "URLS" column (FIPS-API lookup url) to a dataframe that has
    "latitude" and "longitude" columns.

    Returns a new dataframe (the input is not modified).
    '''
    # .ix was removed in pandas 1.0 -> use .loc
    # NOTE(review): rows with missing coordinates yield None from add_latlng;
    # presumably every row here has coordinates — confirm against callers.
    URLS = pd.DataFrame(base_url + pd.Series(map(add_latlng, df.loc[:, "latitude"], df.loc[:, "longitude"])))
    df = df.assign(URLS=URLS)
    return df

def batch_FIPS_from_API(df):
    '''
    Look up the FIPS code for every row of df via its "URLS" column
    (one API request per row — slow for large frames).

    Returns a one-column dataframe named "FIPS" with the same index as df.
    '''
    # .ix was removed in pandas 1.0 -> use .loc
    FIPS = pd.DataFrame(df.loc[:, "URLS"]).applymap(fips_from_broadbandmapAPI)
    FIPS = FIPS.rename(columns={"URLS": "FIPS"})
    return FIPS

Get FIPS Data


In [ ]:
#Remove triple quotes to pull FIPS data.
'''
df = batch_FIPS_from_API(city_data.ix[city_data.loc[:,'type_of_service_request']=='Vacant/Abandoned Building'])
df = df.rename(columns={"URLS": "FIPS"})

sanitation_df = batch_FIPS_from_API(city_data.ix[city_data.loc[:,'type_of_service_request']=="Sanitation Code Violation"])
sanitation_df = sanitation_df.rename(columns={"URLS": "FIPS"})

city_data = pd.merge(city_data, df, left_index=True, right_index=True, how='left')
city_data = pd.merge(city_data, sanitation_df, left_index=True, right_index=True, how='left')
city_data.ix[:,"FIPS_x"] = city_data.ix[:,"FIPS_x"].combine_first(city_data.ix[:,"FIPS_y"])
city_data = city_data.rename(columns={"FIPS_x":"FIPS"})
city_data.drop("FIPS_y",axis=1,inplace=True)
city_data.to_csv("Safety_csv_1.csv")
del df, sanitation_df

missing_fips = check_nulls(city_data.ix[(city_data.loc[:,'type_of_service_request'] == "Sanitation Code Violation") | (city_data.loc[:,'type_of_service_request'] == 'Vacant/Abandoned Building')], "FIPS")
df = batch_FIPS_from_API(missing_fips)
city_data.ix[:,"FIPS"] = city_data.ix[:,"FIPS"].combine_first(df.ix[:,"FIPS"])
del df
'''

At this point my kernel crashed and I had to reload the data from CSV. This served as a natural break point, because the API heavy-lifting just finished.


In [9]:
#http://stackoverflow.com/questions/21269399/datetime-dtypes-in-pandas-read-csv
# Reload the data saved after the FIPS lookups (kernel crashed; see note above).
col_dtypes = {"":int,"community_area":float,"completion_date":object,
              "creation_date":object,"latitude":float,
              "longitude":float,"service_request_number":object,
              "type_of_service_request":str,"URLS":str,"FIPS":object}
parse_dates = ["completion_date","creation_date"]
city_data = pd.read_csv("Safety_csv_1.csv",dtype=col_dtypes, parse_dates=parse_dates)
# The CSV round-trip adds an unnamed index column; URLS is no longer needed.
city_data.drop('Unnamed: 0',axis=1,inplace=True)
city_data.drop('URLS',axis=1,inplace=True)

# Response time in fractional days. .ix was removed in pandas 1.0 -> .loc
city_data.loc[:,"time_to_completion"] = city_data.loc[:,"completion_date"] - city_data.loc[:,"creation_date"]
city_data.loc[:,"time_to_completion"] = city_data.loc[:,"time_to_completion"] / np.timedelta64(1, 'D')

city_data = get_com_areas(city_data)

########

In [11]:
city_data


Out[11]:
community_area completion_date creation_date latitude longitude service_request_number type_of_service_request FIPS time_to_completion area_numbe community
0 2.0 NaT 2016-05-17 41.992343 -87.694061 16-03406588 Pothole in Street NaN NaN 2 WEST RIDGE
1 2.0 2016-08-09 2016-06-07 41.990514 -87.706035 16-03947868 Pothole in Street NaN 63.0 2 WEST RIDGE
2 2.0 2016-07-14 2016-04-06 41.991169 -87.693191 16-02316658 Pothole in Street NaN 99.0 2 WEST RIDGE
3 2.0 2016-06-01 2016-05-16 41.983239 -87.697522 16-03377569 Pothole in Street NaN 16.0 2 WEST RIDGE
4 2.0 2016-11-29 2016-09-29 41.994094 -87.683295 16-06855563 Pothole in Street NaN 61.0 2 WEST RIDGE
5 2.0 2016-06-23 2016-04-20 41.994022 -87.687457 16-02693526 Pothole in Street NaN 64.0 2 WEST RIDGE
6 2.0 2016-07-06 2016-04-13 42.007561 -87.694014 16-02481910 Pothole in Street NaN 84.0 2 WEST RIDGE
7 2.0 2016-07-20 2016-07-20 41.985078 -87.689642 16-05110932 Pothole in Street NaN 0.0 2 WEST RIDGE
8 2.0 2016-07-22 2016-04-20 41.996508 -87.689990 16-02691691 Pothole in Street NaN 93.0 2 WEST RIDGE
9 2.0 NaT 2016-05-12 42.000482 -87.690103 16-03275794 Pothole in Street NaN NaN 2 WEST RIDGE
10 2.0 2017-01-24 2017-01-18 41.994377 -87.692367 17-00319598 Pothole in Street NaN 6.0 2 WEST RIDGE
11 2.0 2016-07-06 2016-04-27 42.008751 -87.690261 16-02881031 Pothole in Street NaN 70.0 2 WEST RIDGE
12 2.0 2016-06-16 2016-04-26 41.988843 -87.704589 16-02842587 Pothole in Street NaN 51.0 2 WEST RIDGE
13 2.0 2016-12-12 2016-10-05 41.998067 -87.677005 16-07018460 Pothole in Street NaN 68.0 2 WEST RIDGE
14 2.0 2016-07-08 2016-04-20 41.992894 -87.693246 16-02693359 Pothole in Street NaN 79.0 2 WEST RIDGE
15 2.0 2017-01-25 2017-01-24 42.001332 -87.698208 17-00436085 Pothole in Street NaN 1.0 2 WEST RIDGE
16 2.0 2016-09-19 2016-07-31 42.002735 -87.691380 16-05391137 Pothole in Street NaN 50.0 2 WEST RIDGE
17 2.0 2016-09-19 2016-06-17 41.995187 -87.694830 16-04240497 Pothole in Street NaN 94.0 2 WEST RIDGE
18 2.0 2017-01-25 2017-01-22 42.019298 -87.695846 17-00388777 Pothole in Street NaN 3.0 2 WEST RIDGE
19 2.0 2016-11-15 2016-10-28 42.017825 -87.690257 16-07481765 Pothole in Street NaN 18.0 2 WEST RIDGE
20 2.0 2016-07-08 2016-04-08 41.992349 -87.693513 16-02369698 Pothole in Street NaN 91.0 2 WEST RIDGE
21 2.0 2016-04-26 2016-04-20 42.015695 -87.702156 16-02685768 Pothole in Street NaN 6.0 2 WEST RIDGE
22 2.0 2016-08-02 2016-06-13 42.017065 -87.706854 16-04111927 Pothole in Street NaN 50.0 2 WEST RIDGE
23 2.0 2016-09-01 2016-08-24 42.006037 -87.690215 16-06001399 Pothole in Street NaN 8.0 2 WEST RIDGE
24 2.0 2016-08-30 2016-08-28 41.997462 -87.698924 16-06083680 Pothole in Street NaN 2.0 2 WEST RIDGE
25 2.0 2016-06-07 2016-05-03 42.007460 -87.699289 16-03043429 Pothole in Street NaN 35.0 2 WEST RIDGE
26 2.0 2016-07-08 2016-03-31 41.987324 -87.694581 16-02171175 Pothole in Street NaN 99.0 2 WEST RIDGE
27 2.0 NaT 2016-10-18 41.992839 -87.680972 16-07278846 Pothole in Street NaN NaN 2 WEST RIDGE
28 2.0 2016-12-13 2016-09-26 41.991829 -87.679713 16-06790492 Pothole in Street NaN 78.0 2 WEST RIDGE
29 2.0 2016-05-11 2016-04-09 42.000369 -87.690100 16-02387721 Pothole in Street NaN 32.0 2 WEST RIDGE
... ... ... ... ... ... ... ... ... ... ... ...
200171 36.0 2017-03-21 2017-03-20 41.831052 -87.610739 17-01524248 Graffiti Removal NaN 1.0 36 OAKLAND
200172 36.0 2017-03-24 2017-03-23 41.831097 -87.608370 17-01600892 Graffiti Removal NaN 1.0 36 OAKLAND
200173 36.0 2017-03-27 2017-03-23 NaN NaN 17-01598510 Graffiti Removal NaN 4.0 36 OAKLAND
200174 36.0 2017-03-30 2017-03-30 NaN NaN 17-01795364 Graffiti Removal NaN 0.0 36 OAKLAND
200175 36.0 2017-02-08 2017-01-31 41.821597 -87.604846 17-00566342 Sanitation Code Violation 170318364003001 8.0 36 OAKLAND
200176 36.0 2017-02-23 2017-02-15 41.825900 -87.604461 17-00841429 Sanitation Code Violation 170318365002011 8.0 36 OAKLAND
200177 36.0 2017-03-14 2017-03-09 41.821576 -87.604909 17-01319292 Sanitation Code Violation 170318364003001 5.0 36 OAKLAND
200178 36.0 2016-04-18 2016-04-14 41.826317 -87.606964 16-02518482 Sanitation Code Violation 170318365002008 4.0 36 OAKLAND
200179 36.0 2016-05-31 2016-05-27 41.826427 -87.606509 16-03677037 Sanitation Code Violation 170318365002009 4.0 36 OAKLAND
200180 36.0 2016-06-02 2016-06-01 41.821442 -87.605311 16-03784684 Sanitation Code Violation 170318364003001 1.0 36 OAKLAND
200181 36.0 2016-06-21 2016-06-17 41.825630 -87.606060 16-04238460 Sanitation Code Violation 170318365002009 4.0 36 OAKLAND
200182 36.0 2016-06-21 2016-06-18 41.826065 -87.606305 16-04264816 Sanitation Code Violation 170318365002009 3.0 36 OAKLAND
200183 36.0 2016-07-01 2016-06-25 41.821148 -87.601498 16-04450154 Sanitation Code Violation 170318364001009 6.0 36 OAKLAND
200184 36.0 2016-07-06 2016-07-06 41.818385 -87.604703 16-04739960 Sanitation Code Violation 170318364003006 0.0 36 OAKLAND
200185 36.0 2016-07-06 2016-07-06 41.818489 -87.604715 16-04739982 Sanitation Code Violation 170318364003006 0.0 36 OAKLAND
200186 36.0 2016-07-06 2016-07-05 41.828093 -87.605927 16-04714478 Sanitation Code Violation 170318365002011 1.0 36 OAKLAND
200187 36.0 2016-07-26 2016-07-21 41.822593 -87.600953 16-05133653 Sanitation Code Violation 170318364001009 5.0 36 OAKLAND
200188 36.0 2016-08-17 2016-08-12 41.817842 -87.601882 16-05707848 Sanitation Code Violation 170318364003025 5.0 36 OAKLAND
200189 36.0 2016-08-17 2016-08-10 41.825743 -87.610229 16-05656310 Sanitation Code Violation 170313602001007 7.0 36 OAKLAND
200190 36.0 2016-09-02 2016-08-31 41.825900 -87.604461 16-06168017 Sanitation Code Violation 170318365002011 2.0 36 OAKLAND
200191 36.0 2016-08-25 2016-08-18 41.825743 -87.610229 16-05871338 Sanitation Code Violation 170313602001007 7.0 36 OAKLAND
200192 36.0 2016-09-29 2016-09-26 41.818625 -87.606424 16-06798121 Sanitation Code Violation 170318364003016 3.0 36 OAKLAND
200193 36.0 2016-10-03 2016-09-14 41.829347 -87.608327 16-06516429 Sanitation Code Violation 170318365001006 19.0 36 OAKLAND
200194 36.0 2016-10-26 2016-10-23 41.826427 -87.606509 16-07367625 Sanitation Code Violation 170318365002009 3.0 36 OAKLAND
200195 36.0 2017-03-29 2017-03-21 41.819483 -87.602370 17-01548782 Sanitation Code Violation 170318364003022 8.0 36 OAKLAND
200196 36.0 2017-03-30 2017-03-28 41.817578 -87.604615 17-01731747 Sanitation Code Violation 170318364003029 2.0 36 OAKLAND
200197 36.0 2016-10-05 NaT 41.821299 -87.605221 16-07010180 Vacant/Abandoned Building 170318364003005 NaN 36 OAKLAND
200198 36.0 2016-10-27 NaT 41.821097 -87.603284 16-07457765 Vacant/Abandoned Building 170318364002003 NaN 36 OAKLAND
200199 36.0 2016-11-01 NaT 41.821299 -87.605221 16-07564007 Vacant/Abandoned Building 170318364003005 NaN 36 OAKLAND
200200 36.0 2017-03-07 NaT 41.817578 -87.604615 17-01264540 Vacant/Abandoned Building 170318364003029 NaN 36 OAKLAND

200201 rows × 11 columns

Get Census Data


In [12]:
CENSUS_URL = "http://api.census.gov/data/2015/pdb/blockgroup?get={}&for=block+group:*&in=state:17+county:031"

census_fields = ["pct_NH_Blk_alone_ACS_09_13",
                 "pct_NH_White_alone_ACS_09_13",
                 "pct_Hispanic_ACS_09_13",
                 "pct_College_ACS_09_13",
                 "pct_Renter_Occp_HU_ACS_09_13",
                 "pct_Males_ACS_09_13",
                 "pct_Prs_Blw_Pov_Lev_ACS_09_13",
                 "pct_Diff_HU_1yr_Ago_ACS_09_13",
                 "Tot_Population_ACS_09_13",
                 "GIDBG"]

# All fields except the last (GIDBG, the block-group id) are numeric.
fields = len(census_fields) - 1

# The census API returns a JSON array whose first row is the header, so
# promote row 0 to column names and drop it. (.ix removed in pandas 1.0:
# positional access -> .iloc, label access -> .loc.)
uscensus = pd.read_json(CENSUS_URL.format(",".join(census_fields)))
uscensus.columns = uscensus.iloc[0]
uscensus = uscensus.rename(columns={"GIDBG": "FIPS"})
uscensus = uscensus.iloc[1:]
uscensus.iloc[:, 0:fields] = uscensus.iloc[:, 0:fields].apply(pd.to_numeric, axis=1)

# Count 311 calls per (census block, request type), then aggregate blocks up
# to block groups by trimming the last 3 digits of the block FIPS code.
cnsdf = get_notnulls(city_data, "FIPS")
cnsdf = cnsdf.groupby(["FIPS", "type_of_service_request"])
counts = clean_grouped_data(cnsdf)
counts.loc[:, "FIPS"] = counts.index
counts.loc[:, "FIPS"] = pd.DataFrame(counts.loc[:, "FIPS"]).applymap(lambda x: x[:-3])
census_by_311 = pd.merge(counts, uscensus, left_on="FIPS", right_on="FIPS")


for col in census_fields[:-1]:
    census_by_311[col] = pd.to_numeric(census_by_311[col])


# Rates per 10,000 residents for the first two count columns.
# NOTE(review): block groups with zero population produce inf here (visible
# as `mean inf` in the describe() output below) — consider masking them.
census_by_311[census_by_311.columns[0] + " per 10000"] = 10000 * census_by_311.iloc[:, 0].fillna(0) / pd.to_numeric(census_by_311.Tot_Population_ACS_09_13)
census_by_311[census_by_311.columns[1] + " per 10000"] = 10000 * census_by_311.iloc[:, 1].fillna(0) / pd.to_numeric(census_by_311.Tot_Population_ACS_09_13)

Generate summary statistics:

number of requests of each type (and subtype within each of the types above) over time, by neighborhood response time by the city


In [26]:
from bokeh.plotting import figure
from bokeh.io import show, output_notebook,save
from bokeh.charts import Histogram
from bokeh.layouts import gridplot
output_notebook()

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.patches import PathPatch
% matplotlib inline


Loading BokehJS ...

In [11]:
df = city_data  # short alias used by the analysis cells below

In [12]:
# Total number of requests per 311 request type.
gdf = df.groupby("type_of_service_request")
# .ix was removed in pandas 1.0 -> use .loc
totals = pd.DataFrame(gdf.count().loc[:, "service_request_number"])
totals.index.name = None
# (A stray no-op `totals.columns` expression was removed here.)
TYPES = totals.index  # reused later to loop over request types
totals


Out[12]:
service_request_number
Graffiti Removal 117711
Pothole in Street 56185
Sanitation Code Violation 21540
Vacant/Abandoned Building 4765

In [13]:
# Distribution of response times in days, with deciles.
# .ix removed in pandas 1.0 -> .loc; positional Series slice -> .iloc
s = df.loc[:, "time_to_completion"].describe(percentiles=np.linspace(0, 1, 11))
s.iloc[1:]  # drop the `count` row for readability


Out[13]:
mean       7.855461
std       21.615558
min        0.000000
0%         0.000000
10%        0.000000
20%        0.000000
30.0%      0.000000
40%        0.000000
50%        1.000000
60.0%      1.000000
70%        3.000000
80%        6.000000
90%       21.000000
100%     321.000000
max      321.000000
Name: time_to_completion, dtype: float64

In [14]:
# Counts per (community area, request type), then per-type summary statistics.
g2df = df.groupby(["community","type_of_service_request"])
counts = clean_grouped_data(g2df)
community_area_descriptives=round(counts.describe(),1)
community_area_descriptives


Out[14]:
Graffiti Removal Pothole in Street Sanitation Code Violation Vacant/Abandoned Building
count 77.0 77.0 77.0 77.0
mean 1528.7 729.7 279.7 61.9
std 1954.9 432.2 209.1 92.5
min 8.0 66.0 8.0 0.0
25% 160.0 392.0 115.0 10.0
50% 591.0 672.0 233.0 23.0
75% 2306.0 1050.0 419.0 67.0
max 9539.0 2116.0 858.0 475.0

In [15]:
means = counts.mean()
# Community areas above the city-wide mean in all four request types.
# .ix removed in pandas 1.0: positional access -> .iloc, boolean rows -> .loc
counts.loc[(counts.iloc[:, 0] > means.iloc[0]) & (counts.iloc[:, 1] > means.iloc[1]) &
           (counts.iloc[:, 2] > means.iloc[2]) & (counts.iloc[:, 3] > means.iloc[3])]


Out[15]:
Graffiti Removal Pothole in Street Sanitation Code Violation Vacant/Abandoned Building
ASHBURN 1948.0 1395.0 353.0 67.0
CHICAGO LAWN 2380.0 794.0 451.0 184.0

In [27]:
# NOTE(review): `Histogram` comes from bokeh.charts, which was removed from
# bokeh (later the separate, now-abandoned `bkcharts` package); this cell only
# runs on old bokeh versions and would need a bokeh.plotting rewrite.
# .ix removed in pandas 1.0 -> .iloc for positional column access.
p1 = Histogram(counts.iloc[:, 0], title=counts.columns[0])
p2 = Histogram(counts.iloc[:, 1], title=counts.columns[1])
p3 = Histogram(counts.iloc[:, 2], title=counts.columns[2])
p4 = Histogram(counts.iloc[:, 3], title=counts.columns[3])

# 2x2 grid of the four request-type distributions.
layout = gridplot([[p1, p2], [p3, p4]], plot_width=250, plot_height=250)

show(layout)
save(layout)


Out[27]:
'/Users/arianisfeld/anaconda2/envs/py36/lib/python3.6/runpy.html'

In [30]:
def graph_over_time(col='type_of_service_request', request_type="Sanitation Code Violation"):
    '''
    Line chart of daily completed requests of a given type (uses the global df).

    Inputs:
        col (string): column to filter on
        request_type (string): value of `col` to keep
    Returns: bokeh figure (also saved to "<request_type>.html")
    '''
    # .ix was removed in pandas 1.0 -> use .loc
    subset = df[df.loc[:, col] == request_type]
    daily = subset.groupby("completion_date").count().loc[:, "service_request_number"]
    p = figure(plot_width=400, plot_height=400, x_axis_type="datetime", title=request_type)
    p.line(daily.index, daily)
    save(p, filename=quote_plus(request_type) + ".html")
    return p


for t in TYPES:
    show(graph_over_time(request_type=t))


/Users/arianisfeld/anaconda2/envs/py36/lib/python3.6/site-packages/bokeh/io.py:419: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warnings.warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/Users/arianisfeld/anaconda2/envs/py36/lib/python3.6/site-packages/bokeh/io.py:429: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warnings.warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")

5 interesting findings from 311 data:

Of 77 community areas, only Ashburn and Chicago Lawn had above average call-ins for all four 311 categories.

In Chicago, graffiti is far more common than the other disturbances, with twice the amount of calls for potholes, 6 times the amount of sanitation code violations, and 24 times the number of calls for vacant buildings. As a Chicago driver, I find this result surprising. On the other hand, people may have given up on potholes and do not make 311 calls as often as they encounter them.

If the completion data is to be trusted, the city resolved over 60 percent of the 200,000 complaints within one day; And over 80 percent in a week. After that the numbers increase significantly so that the people in the top decile are waiting at least 3 weeks.

Each call type has its own seasonality, but calls seem to decrease in December: especially the vacant building calls. Potholes have the most distinct shape, with a bimodal distribution that peaks in July and February.

The variances and means are very close in the neighborhood-level table; also, the histograms' shapes show the probability mass is skewed to the left for all four graphs. This suggests to me a Poisson process, which matches what theory says happens when we are waiting for events.


In [18]:
# Summary of the two "per 10000" rate columns added above (the last two).
# .ix removed in pandas 1.0 -> .iloc for positional column access.
census_by_311.iloc[:, -2:].describe()


Out[18]:
Sanitation Code Violation per 10000 Vacant/Abandoned Building per 10000
count 1.170400e+04 11697.000000
mean inf 4.407622
std NaN 15.408849
min 0.000000e+00 0.000000
25% 6.172840e+00 0.000000
50% 1.029071e+01 0.000000
75% 1.902950e+01 4.977601
max inf 1111.111111

In [19]:
# Census profile of block groups with more than 5 vacant-building calls per
# 10000 residents (roughly the top quarter of the distribution).
# .ix removed in pandas 1.0: label access -> .loc, positional slices -> .iloc
vacants = census_by_311.loc[census_by_311.loc[:, "Vacant/Abandoned Building per 10000"] > 5]
vd = vacants.describe(percentiles=[]).iloc[1:, 2:10]
vd


Out[19]:
pct_NH_Blk_alone_ACS_09_13 pct_NH_White_alone_ACS_09_13 pct_Hispanic_ACS_09_13 pct_College_ACS_09_13 pct_Renter_Occp_HU_ACS_09_13 pct_Males_ACS_09_13 pct_Prs_Blw_Pov_Lev_ACS_09_13 pct_Diff_HU_1yr_Ago_ACS_09_13
mean 75.510387 8.360839 13.970421 15.719904 53.52797 46.181369 32.346967 15.130931
std 35.783775 18.911239 25.290452 14.605759 22.61187 6.851958 17.609540 10.789673
min 0.000000 0.000000 0.000000 0.000000 0.00000 25.340000 0.000000 0.000000
50% 93.980000 0.830000 1.460000 11.370000 57.08000 46.470000 31.860000 13.790000
max 100.000000 100.000000 100.000000 96.880000 100.00000 77.580000 100.000000 81.340000

In [20]:
# Census profile of block groups above the 75th percentile (19 calls) of
# sanitation-code-violation calls per 10000 residents.
# .ix removed in pandas 1.0: label access -> .loc, positional slices -> .iloc
sanitation_violation = census_by_311.loc[census_by_311.loc[:, 'Sanitation Code Violation per 10000'] > 19]
sv = sanitation_violation.describe(percentiles=[]).iloc[1:, 2:10]
sv


Out[20]:
pct_NH_Blk_alone_ACS_09_13 pct_NH_White_alone_ACS_09_13 pct_Hispanic_ACS_09_13 pct_College_ACS_09_13 pct_Renter_Occp_HU_ACS_09_13 pct_Males_ACS_09_13 pct_Prs_Blw_Pov_Lev_ACS_09_13 pct_Diff_HU_1yr_Ago_ACS_09_13
mean 53.425456 21.969299 19.942636 25.899928 53.647595 47.148232 26.127249 15.837395
std 43.765658 29.294776 27.541073 23.607052 23.111254 7.715226 17.394081 11.559911
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 73.250000 4.860000 5.330000 17.080000 55.770000 47.390000 23.050000 14.010000
max 100.000000 100.000000 100.000000 96.880000 100.000000 77.580000 100.000000 66.180000

In [21]:
# Difference between the sanitation-violation and vacant-building profiles.
sv - vd


Out[21]:
pct_NH_Blk_alone_ACS_09_13 pct_NH_White_alone_ACS_09_13 pct_Hispanic_ACS_09_13 pct_College_ACS_09_13 pct_Renter_Occp_HU_ACS_09_13 pct_Males_ACS_09_13 pct_Prs_Blw_Pov_Lev_ACS_09_13 pct_Diff_HU_1yr_Ago_ACS_09_13
mean -22.084930 13.608460 5.972215 10.180024 0.119625 0.966863 -6.219718 0.706464
std 7.981883 10.383537 2.250621 9.001293 0.499385 0.863268 -0.215459 0.770238
min 0.000000 0.000000 0.000000 0.000000 0.000000 -25.340000 0.000000 0.000000
50% -20.730000 4.030000 3.870000 5.710000 -1.310000 0.920000 -8.810000 0.220000
max 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -15.160000

Analysis of Blocks by Census Data

Vacant property and sanitation complaints are common 311 complaints. Over the past year, the City received roughly 5000 vacant property complaints and 20000 sanitation complaints. With block level Census data, we can characterize the communities that draw these respective claims. The following analysis describes the average census block originating these 311 calls. To draw sharper edges, I describe census blocks in the top quarter of the distribution taking into account population.

The average block made 4.4 calls per 10000 residents to 311 about vacant property; however, the median census block had 0 calls. This suggests that these calls are isolated to certain areas. On average, the top quarter — those with 5 or more calls per 10000 residents over the last year — have large African American populations and relatively high levels of poverty compared to sanitation violations. On average these blocks have 54 percent renter occupancy and 15 percent of households moved, which are very close to the sanitation violation blocks. The sanitation violation blocks have larger white and Latino populations, and also higher levels of college attainment: compare 25 percent to 15 percent.


In [22]:
# The address sits on the West Englewood / Auburn Gresham border, so pool the
# two areas and compute P(request type) from their combined counts.
# .ix removed in pandas 1.0 -> .loc for label-based row selection.
prob = counts.loc[["WEST ENGLEWOOD", "AUBURN GRESHAM"], :]
s = prob.sum()
pr = s / s.sum()
pr


Out[22]:
Graffiti Removal             0.150587
Pothole in Street            0.422058
Sanitation Code Violation    0.245452
Vacant/Abandoned Building    0.181902
dtype: float64

In [23]:
# P(type | area) and P(area) inputs for the Bayes calculation below.
# .ix removed in pandas 1.0 -> .loc / .iloc.
prob = counts.loc[["SOUTH LAWNDALE", "UPTOWN"], :]

area_totals = prob.sum(axis=1)  # same as the original prob.T.sum().T, but clearer

# BUG FIX: both conditionals previously used row 1 (UPTOWN's row); SOUTH
# LAWNDALE must use row 0.
SLAWNDALE = prob.iloc[0] / area_totals.iloc[0]
UPTOWN = prob.iloc[1] / area_totals.iloc[1]
# Base rates P(area):
area_totals / area_totals.sum()


Out[23]:
SOUTH LAWNDALE    0.71421
UPTOWN            0.28579
dtype: float64

Probability

  1. This address is on the border of West Englewood and Auburn Gresham. For simplicity, I assigned probabilities based on the sum of those two areas. The most likely 311 call is the Pothole in Street call.

Graffiti Removal 0.150587 Pothole in Street 0.422058 Sanitation Code Violation 0.245452 Vacant/Abandoned Building 0.181902

  1. Using the table above, we see there were 7095 calls about Graffiti Removal. 5309 came from South Lawndale, so we'd say the base probabilities suggest South Lawndale is more likely. Using Bayes' Theorem:

SOUTH LAWNDALE 0.714 Graffiti Removal 0.787102

UPTOWN 0.28579 Graffiti Removal 0.661727

TOTAL Graffiti Removal 0.75127

Pr(SOUTH LAWNDALE | Graffiti) $P(A|B) = \frac{0.787102 * 0.714}{.75127} = 0.748$

Pr(UPTOWN | Graffiti) $P(A|B) = \frac{ 0.661727 * 0.28579}{.75127} = 0.25$

South Lawndale is about 50% more likely to be making the call.

  1. This problem is solved with Bayes Theorem. $$ P(A|B) =\frac{P(B|A)\cdot P(A)}{P(B)} $$ Where: B is a graffiti call. A is the event takes place in Englewood. Then $P(A) = .6$, $P(B) = .24$, and $P(B|A) = .16666$.

So the probability that a call for Graffiti Removal originated in Engelwood is $P(A|B) = \frac{.6 * \frac{1}{6}}{.24} = \frac{5}{12}$.

And so the probability it originated in Uptown is $\frac{7}{12}$, which is $\frac{1}{6}$ more likely.


In [ ]: