In [ ]:
import pandas as pd
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker
from geoalchemy2 import Geometry
from mapzen.api import MapzenAPI
import glob
import sys
import os
import time
import pandas as pd
sys.path.insert(0, os.path.realpath('..'))
import django
django.setup()
from django.db import connections
pd.set_option('display.max_rows', 3000)
from IPython.display import display
from firecares.firestation.models import FireDepartment
import editdistance
from IPython.lib.pretty import pprint

engine = create_engine('postgresql://firecares:password@localhost:5432/geocoding_afg')
conn = engine.connect()

entering the danger zone!


In [ ]:
conn.execute('drop table if exists to_geocode;')
tot = 0
for f in glob.glob('*.xls'):
    df = pd.read_excel(f)
    n = len(df)
    tot += n
    print 'Loading: {} records'.format(n)
    df.to_sql('to_geocode', con=engine, if_exists='append')

assert conn.execute('select count(1) from to_geocode;').fetchone()[0] == tot

In [ ]:
conn.execute('alter table to_geocode add column geom geometry(MultiPolygon, 4326);')
conn.execute('alter table to_geocode add column status integer;')
conn.execute('alter table to_geocode add column coverage geometry(MultiPolygon, 4326);')
conn.execute('ALTER TABLE to_geocode ADD COLUMN id SERIAL PRIMARY KEY;')

out of the danger zone


In [ ]:
meta = MetaData()
from sqlalchemy.ext.automap import automap_base

Base = automap_base()
Base.prepare(engine, reflect=True)
maker = sessionmaker(bind=engine)

ToGeocode = Base.classes['to_geocode']

from contextlib import contextmanager

@contextmanager
def session_scope():
    """Provide a transactional scope around a series of operations."""
    session = maker()
    try:
        yield session
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

In [ ]:
states = pd.read_csv('states.csv')

def get_state_abbrev(name):
    try:
        return states[states['State'] == name]['Abbreviation'].values[0]
    except:
        return None

In [ ]:
fd_mappings = {}

def massage_fdid(fdid, state):
    import re
    if fdid and state:
        if state in fd_mappings and fdid in fd_mappings[state]:
            return fd_mappings[state][fdid]
        stripped = re.sub('-|\s+', '', fdid)
        if len(stripped) < 5 and stripped:
            return fdid.zfill(5)
        elif len(stripped) > 5 and stripped.lower().startswith(state.lower()):
            return re.sub('^{}'.format(state), '', stripped)
        else:
            return stripped
    else:
        return ''

def fuzzy_find_department(fdid, state):
    massaged_fdid = massage_fdid(fdid, state)
    if massaged_fdid:
        fds = FireDepartment.objects.filter(fdid__iexact=massaged_fdid, state__iexact=state)
        if fds.exists():
            if len(fds) > 1:
                print 'Multiple matches for {} - {}'.format(fdid, state)
                return None, None
            return fds.first(), None
        else:
            return None, None
    else:
        return None, None

In [ ]:
with session_scope() as session:
    fds = session.query(ToGeocode).filter_by(status=None).all()
    i = 0

    for fd in fds:
        i += 1
        if i % 100 == 0:
            print 'Only {} left'.format(float(session.query(ToGeocode).count()) - session.query(ToGeocode).filter(ToGeocode.status != None).count())
        if fd.FDIN and fd.STATE:
            hit, _ = fuzzy_find_department(fd.FDIN, get_state_abbrev(fd.STATE))
            if hit:
                fd.geom = hit.geom.ewkt if hit.geom else None
                fd.coverage = hit.owned_tracts_geom.ewkt if hit.owned_tracts_geom else None
                fd.status = 1
                session.add(fd)
                session.commit()
            else:
                print 'Miss on: {} {}'.format(fd.FDIN, get_state_abbrev(fd.STATE))
                fd.status = 0
                session.add(fd)
                session.commit()

In [ ]:
# Double-check to make SURE that the grant IDs have the correct year since we're extracting the year into its own col

import glob
import re

for fn in glob.glob('*.xls'):
    year = re.match('(\d+).*', fn).groups()[0]
    df = pd.read_excel(fn)
    records = map(lambda x: x[1], df[['GRANT_ID']].to_records())
    for r in records:
        assert re.match('\w+-(\d+)-.*', r).groups()[0] == year

In [ ]:
conn.execute('ALTER TABLE to_geocode ADD COLUMN year int;')

In [ ]:
conn.execute('update to_geocode set year = cast(substring("GRANT_ID" from 5 for 4) as int)')

In [ ]:
conn.execute("select count(1), year from to_geocode group by year").fetchall()

In [ ]:
conn.close()

In [ ]: