In [ ]:
import pandas as pd
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker
from geoalchemy2 import Geometry
from mapzen.api import MapzenAPI
import glob
import sys
import os
import time
import pandas as pd
sys.path.insert(0, os.path.realpath('..'))
import django
django.setup()
from django.db import connections
pd.set_option('display.max_rows', 3000)
from IPython.display import display
from firecares.firestation.models import FireDepartment
import editdistance
from IPython.lib.pretty import pprint
engine = create_engine('postgresql://firecares:password@localhost:5432/geocoding_afg')
conn = engine.connect()
In [ ]:
conn.execute('drop table if exists to_geocode;')
tot = 0
for f in glob.glob('*.xls'):
df = pd.read_excel(f)
n = len(df)
tot += n
print 'Loading: {} records'.format(n)
df.to_sql('to_geocode', con=engine, if_exists='append')
assert conn.execute('select count(1) from to_geocode;').fetchone()[0] == tot
In [ ]:
conn.execute('alter table to_geocode add column geom geometry(MultiPolygon, 4326);')
conn.execute('alter table to_geocode add column status integer;')
conn.execute('alter table to_geocode add column coverage geometry(MultiPolygon, 4326);')
conn.execute('ALTER TABLE to_geocode ADD COLUMN id SERIAL PRIMARY KEY;')
In [ ]:
meta = MetaData()
from sqlalchemy.ext.automap import automap_base
Base = automap_base()
Base.prepare(engine, reflect=True)
maker = sessionmaker(bind=engine)
ToGeocode = Base.classes['to_geocode']
from contextlib import contextmanager
@contextmanager
def session_scope():
"""Provide a transactional scope around a series of operations."""
session = maker()
try:
yield session
session.commit()
except:
session.rollback()
raise
finally:
session.close()
In [ ]:
states = pd.read_csv('states.csv')
def get_state_abbrev(name):
try:
return states[states['State'] == name]['Abbreviation'].values[0]
except:
return None
In [ ]:
fd_mappings = {}
def massage_fdid(fdid, state):
import re
if fdid and state:
if state in fd_mappings and fdid in fd_mappings[state]:
return fd_mappings[state][fdid]
stripped = re.sub('-|\s+', '', fdid)
if len(stripped) < 5 and stripped:
return fdid.zfill(5)
elif len(stripped) > 5 and stripped.lower().startswith(state.lower()):
return re.sub('^{}'.format(state), '', stripped)
else:
return stripped
else:
return ''
def fuzzy_find_department(fdid, state):
massaged_fdid = massage_fdid(fdid, state)
if massaged_fdid:
fds = FireDepartment.objects.filter(fdid__iexact=massaged_fdid, state__iexact=state)
if fds.exists():
if len(fds) > 1:
print 'Multiple matches for {} - {}'.format(fdid, state)
return None, None
return fds.first(), None
else:
return None, None
else:
return None, None
In [ ]:
with session_scope() as session:
fds = session.query(ToGeocode).filter_by(status=None).all()
i = 0
for fd in fds:
i += 1
if i % 100 == 0:
print 'Only {} left'.format(float(session.query(ToGeocode).count()) - session.query(ToGeocode).filter(ToGeocode.status != None).count())
if fd.FDIN and fd.STATE:
hit, _ = fuzzy_find_department(fd.FDIN, get_state_abbrev(fd.STATE))
if hit:
fd.geom = hit.geom.ewkt if hit.geom else None
fd.coverage = hit.owned_tracts_geom.ewkt if hit.owned_tracts_geom else None
fd.status = 1
session.add(fd)
session.commit()
else:
print 'Miss on: {} {}'.format(fd.FDIN, get_state_abbrev(fd.STATE))
fd.status = 0
session.add(fd)
session.commit()
In [ ]:
# Double-check to make SURE that the grant IDs have the correct year since we're extracting the year into its own col
import glob
import re
for fn in glob.glob('*.xls'):
year = re.match('(\d+).*', fn).groups()[0]
df = pd.read_excel(fn)
records = map(lambda x: x[1], df[['GRANT_ID']].to_records())
for r in records:
assert re.match('\w+-(\d+)-.*', r).groups()[0] == year
In [ ]:
conn.execute('ALTER TABLE to_geocode ADD COLUMN year int;')
In [ ]:
conn.execute('update to_geocode set year = cast(substring("GRANT_ID" from 5 for 4) as int)')
In [ ]:
conn.execute("select count(1), year from to_geocode group by year").fetchall()
In [ ]:
conn.close()
In [ ]: