Initialization


In [1]:
%matplotlib inline

In [2]:
import sys
import csv
import json
import numpy as np
from collections import Counter, defaultdict
import leafletwidget as lw
import matplotlib as mpl
import matplotlib.cm
import matplotlib.colors
import matplotlib.pyplot as plt

In [3]:
# Set up leafletwidget's notebook integration (presumably injects the required
# JS/CSS into the front end — must run before any maps are created; confirm
# against the leafletwidget docs).
lw.initialize_notebook()


This is a big dataset!


In [4]:
# Number of records (+ header)
#!wc -l hmda_lar-2012.csv

We would like to present this data in an informative manner. We're going to take advantage of the fact that the data is coded by state and county to aggregate based on this information when it's available in the data.

Loading and aggregating the data


In [5]:
state_actions = defaultdict(Counter)
county_actions = defaultdict(Counter)
bad_records = []
with open('hmda_lar-2012.csv') as csv_file:
    dialect = csv.Sniffer().sniff(csv_file.read(4096))
    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect)
    header_list = reader.next()
    action_idx = header_list.index('action_taken_name')
    county_code_idx = header_list.index('county_code')
    state_code_idx = header_list.index('state_code')
    state_name_idx = header_list.index('state_name')
    
    def parse_row_list(row_list):
        action = row_list[action_idx]
        county = int(row_list[county_code_idx])
        state  = int(row_list[state_code_idx])    
        state_name  = row_list[state_name_idx]
        county_fips = state*1000 + county
        return action, county_fips, state, state_name
        
    for i, row_list in enumerate(reader):
        try:
            action, county_fips, state, state_name = parse_row_list(row_list)
            county_actions[county_fips][action] += 1
            state_actions[state_name][action] += 1
        except:
            bad_records.append(row_list)
        if (i+1) % 100000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
            #uncomment the line below to only run 100K records
            break
print ''
print 'Processed %d records' % i
print 'Found %d records with missing county data' % len(bad_records)


.
Processed 99999 records
Found 368 records with missing county data

We can now take a look at some of our accumulated data for a given state.


In [6]:
# Peek at the per-action counts accumulated for one state.
state_actions['California']


Out[6]:
Counter({'Loan originated': 8317, 'Application withdrawn by applicant': 1527, 'Application denied by financial institution': 1245, 'Loan purchased by the institution': 943, 'Application approved but not accepted': 608, 'File closed for incompleteness': 577})

Querying the Data Set


In [7]:
def query(county_id, action='Application denied by financial institution'):
    """Return how many records for `county_id` (5-digit FIPS) had `action`.

    Reads the module-level `county_actions` table built earlier. `action`
    defaults to denials, matching the original hard-coded behavior, but any
    action_taken_name string may be queried.

    Note: looking up an unseen county inserts an empty Counter into the
    defaultdict as a side effect; unseen actions simply return 0.
    """
    return county_actions[county_id][action]

Computing the Query


In [8]:
# County boundary polygons (2010 Census, 1:5M resolution) in GeoJSON form.
# Source: http://eric.clst.org/wupl/Stuff/gz_2010_us_050_00_5m.json
with open('gz_2010_us_050_00_5m.json', 'rb') as f:
    # NOTE(review): the `encoding` kwarg is Python-2-era; it was deprecated and
    # later removed from json.load in Python 3 — revisit if porting.
    county_json = json.load(f, encoding='latin-1')

# Lazily yields each county feature's GEO_ID string (presumably of the form
# '0500000US<fips>' — the last 5 characters are parsed as the FIPS code below;
# TODO confirm against the GeoJSON source).
geo_id_stream = (feature['properties']['GEO_ID'] for feature in county_json['features'])

In [9]:
def to_county_id(geo_id):
    """Return the integer county FIPS code encoded in the last five characters of `geo_id`."""
    fips_digits = geo_id[-5:]
    return int(fips_digits)

In [10]:
# Resolve every GeoJSON feature to its FIPS code, then look up the denial
# count for each one; both lists stay aligned with county_json['features'].
counties = []
for geo_id in geo_id_stream:
    counties.append(to_county_id(geo_id))
denials = [query(county_fips) for county_fips in counties]

Visualizing the Query


In [11]:
# Rescale raw denial counts to [0, 1] for colormapping; with no explicit
# vmin/vmax, Normalize autoscales to the min/max of `denials`.
normalized_denials = mpl.colors.Normalize()(denials)

In [12]:
# Map each normalized denial value onto the sequential Blues colormap and
# convert the resulting RGBA values to hex color strings.
colormap = mpl.cm.Blues
denial_colors = [mpl.colors.rgb2hex(rgba[0:3]) for rgba in colormap(normalized_denials)]

# Attach a per-feature style dict (outline and fill share the same color);
# the GeoJSON layer created below reads this 'style' property.
for feature, color in zip(county_json['features'], denial_colors):
    style = {'color': color, 'weight': 1, 'fillColor': color, 'fillOpacity': 0.5}
    feature['properties']['style'] = style

In [15]:
# Center the map on the continental US. default_tiles=None presumably
# suppresses the base tile layer so only the choropleth polygons render —
# confirm against the leafletwidget docs.
#m = lw.Map(zoom=4, center=[37.996162679728116, -97.294921875])
m = lw.Map(zoom=4, center=[37.996162679728116, -97.294921875], default_tiles=None)

In [16]:
# Display the map widget inline.
m

In [17]:
# Current visible extent; the rendered output below shows two (lat, lon)
# corner pairs — southwest then northeast.
m.bounds


Out[17]:
[(22.917922936146045, -123.662109375), (50.51342652633956, -70.927734375)]

In [18]:
# Wrap the styled county GeoJSON in a Leaflet layer.
g = lw.GeoJSON(data=county_json)

In [19]:
# Add the choropleth layer to the map displayed above.
m.add_layer(g)