In [1]:
%matplotlib inline
In [2]:
import sys
import csv
import json
import numpy as np
from collections import Counter, defaultdict
import leafletwidget as lw
import matplotlib as mpl
import matplotlib.cm
import matplotlib.colors
import matplotlib.pyplot as plt
In [3]:
# One-time leafletwidget setup — presumably injects the Leaflet JS/CSS into
# the notebook so Map widgets can render below (see leafletwidget docs).
lw.initialize_notebook()
This is a big dataset!
In [4]:
# Number of records (+ header)
#!wc -l hmda_lar-2012.csv
We would like to present this data in an informative manner. Since each record is coded by state and county, we will aggregate on those fields whenever they are present in a record.
In [5]:
state_actions = defaultdict(Counter)
county_actions = defaultdict(Counter)
bad_records = []
with open('hmda_lar-2012.csv') as csv_file:
dialect = csv.Sniffer().sniff(csv_file.read(4096))
csv_file.seek(0)
reader = csv.reader(csv_file, dialect)
header_list = reader.next()
action_idx = header_list.index('action_taken_name')
county_code_idx = header_list.index('county_code')
state_code_idx = header_list.index('state_code')
state_name_idx = header_list.index('state_name')
def parse_row_list(row_list):
action = row_list[action_idx]
county = int(row_list[county_code_idx])
state = int(row_list[state_code_idx])
state_name = row_list[state_name_idx]
county_fips = state*1000 + county
return action, county_fips, state, state_name
for i, row_list in enumerate(reader):
try:
action, county_fips, state, state_name = parse_row_list(row_list)
county_actions[county_fips][action] += 1
state_actions[state_name][action] += 1
except:
bad_records.append(row_list)
if (i+1) % 100000 == 0:
sys.stdout.write('.')
sys.stdout.flush()
#uncomment the line below to only run 100K records
break
print ''
print 'Processed %d records' % i
print 'Found %d records with missing county data' % len(bad_records)
We can now take a look at some of our accumulated data for a given state.
In [6]:
state_actions['California']
Out[6]:
In [7]:
def query(county_id):
    """Return the number of denied applications tallied for a county FIPS code.

    Unknown counties return 0, since county_actions is a defaultdict(Counter).
    """
    denied_action = 'Application denied by financial institution'
    return county_actions[county_id][denied_action]
In [8]:
# US county boundaries (2010 census, 5m resolution), downloaded from:
# http://eric.clst.org/wupl/Stuff/gz_2010_us_050_00_5m.json
with open('gz_2010_us_050_00_5m.json', 'rb') as geojson_file:
    county_json = json.load(geojson_file, encoding='latin-1')
# Generator over each county feature's GEO_ID string (single-pass).
geo_id_stream = (feature['properties']['GEO_ID'] for feature in county_json['features'])
In [9]:
def to_county_id(geo_id):
    """Convert a census GEO_ID string into an integer county FIPS code.

    The last five characters of a GEO_ID (e.g. '0500000US06075') are the
    state+county FIPS digits; leading zeros drop out of the int.
    """
    fips_digits = geo_id[-5:]
    return int(fips_digits)
In [10]:
# Resolve every GeoJSON feature to its FIPS code, then look up denial counts
# in the same order so values stay aligned with county_json['features'].
counties = [to_county_id(raw_geo_id) for raw_geo_id in geo_id_stream]
denials = [query(fips) for fips in counties]
In [11]:
# Rescale the raw denial counts onto [0, 1] so they can index a colormap.
normalizer = mpl.colors.Normalize()
normalized_denials = normalizer(denials)
In [12]:
colormap = mpl.cm.Blues
# Run the normalized values through the colormap and keep only the RGB part
# of each RGBA tuple as a hex string (Leaflet styles take CSS-style colors).
denial_colors = [mpl.colors.rgb2hex(rgba[:3]) for rgba in colormap(normalized_denials)]
for feature, color in zip(county_json['features'], denial_colors):
    feature_style = {'color': color, 'weight': 1, 'fillColor': color, 'fillOpacity': 0.5}
    feature['properties']['style'] = feature_style
In [15]:
# Map centered over the continental US. default_tiles=None presumably
# suppresses the base tile layer so only the choropleth polygons show —
# the commented variant keeps the default tiles.
#m = lw.Map(zoom=4, center=[37.996162679728116, -97.294921875])
m = lw.Map(zoom=4, center=[37.996162679728116, -97.294921875], default_tiles=None)
In [16]:
# Bare last expression: render the map widget inline.
m
In [17]:
# Current viewport bounds of the map widget (shown in Out[17] below).
m.bounds
Out[17]:
In [18]:
# Wrap the styled county GeoJSON (styles were attached per-feature above)
# in a leafletwidget layer.
g = lw.GeoJSON(data=county_json)
In [19]:
# Overlay the choropleth layer on the map rendered above.
m.add_layer(g)