In [ ]:
import sys
import os
import time
import pandas as pd
sys.path.insert(0, os.path.realpath('..'))
import django
django.setup()
from django.db import connections
pd.set_option('display.max_rows', 3000)
from IPython.display import display
from firecares.firestation.models import FireDepartment
import editdistance
from IPython.lib.pretty import pprint
from firecares.firestation.templatetags import firecares_tags
In [ ]:
df = pd.read_csv('./2015_AFG_FDID.csv')
In [ ]:
df
In [ ]:
fd_names = FireDepartment.objects.values_list('name', 'state')
fd_mappings = {
'NC': {
'16045': '01645'
},
'PA': {
'54064': '54564'
}
}
def massage_fdid(fdid, state):
import re
if state in fd_mappings and fdid in fd_mappings[state]:
return fd_mappings[state][fdid]
stripped = re.sub('-|\s+', '', fdid)
if len(stripped) < 5 and stripped:
return fdid.zfill(5)
elif len(stripped) > 5 and stripped.lower().startswith(state.lower()):
return re.sub('^{}'.format(state), '', stripped)
else:
return stripped
def fuzzy_find_department(name, fdid, state):
massaged_fdid = massage_fdid(fdid, state)
if massaged_fdid:
fds = FireDepartment.objects.filter(fdid__iexact=massaged_fdid, state__iexact=state)
if fds.exists():
if len(fds) > 1:
raise Exception('Multiple matches for {} - {} - {}'.format(name, fdid, state))
return fds.first(), None
else:
fds = FireDepartment.objects.filter(state__iexact=state, name__iexact=name)
if fds.exists():
return fds.first(), None
potentials = filter(lambda x: x[1] == state, fd_names)
matches = sorted((editdistance.eval(name, x[0]), x[0], name, fdid, state) for x in potentials)
return None, matches[:5]
else:
return None, None
In [ ]:
hits = []
misses = []
records = df.to_dict(orient='index')
for index, data in records.items():
state = data.get('STATE')
fdid = data.get('FDID')
name = data.get('Name')
dept, options = fuzzy_find_department(name, fdid, state)
if dept:
print 'Hit on {}, row {}'.format(dept, index)
hits.append((index, dept))
else:
if options:
print 'Options for {}, {} - {}:'.format(name, state, fdid)
pprint(options)
misses.append((index, name, fdid, state, options))
else:
misses.append((index, name, fdid, state, None))
print 'Complete miss on {}, missing fdid : {}'.format(name, fdid)
In [ ]:
print 'Percent empty: {}'.format(float(len(misses)) / (len(hits) + len(misses)))
display(hits)
In [ ]:
pprint(misses)
In [ ]:
def to_risk(n):
if n == 1:
return 'Low'
elif n == 2 or n == 3:
return 'Medium'
elif n == 4:
return 'High'
else:
return 'Unknown'
def explode_department(fd):
if fd.population_class is not None:
pop_range = FireDepartment.POPULATION_CLASSES[fd.population_class][1]
else:
pop_range = 'Unknown'
return [fd.region,
pop_range,
fd.metrics.dist_model_score.all,
to_risk(fd.metrics.community_fire_risk.all),
to_risk(fd.metrics.community_fire_spread_risk.all),
to_risk(fd.metrics.community_death_and_injury_risk.all)]
In [ ]:
%%time
exploded_hits = map(lambda x: (list(df.iloc[x[0]][['Name', 'STATE', 'FDID']]) + explode_department(x[1])), hits)
In [ ]:
df2 = pd.DataFrame(data=exploded_hits, columns=df.columns)
display(df2)
In [ ]:
def get_options(opt):
if opt and opt[4]:
return ','.join(map(lambda x: x[1], opt[4]))
df_misses = map(lambda x: (x[1], x[3], x[2], 'Not found', 'Not found', 'Not found', 'Not found', 'Not found', 'Not found', get_options(x)), misses)
df3 = pd.DataFrame(data=df_misses, columns=['Name', 'STATE', 'FDID', 'NFPA Region', 'Population Range', 'PERF_SCORE', 'FIRE_RISK', 'FIRE_SPREAD_RISK', 'DEAT_INJURY_RISK', 'FD_OPTIONS_IN_STATE'])
df3
In [ ]:
df4 = df2.append(df3)[['Name', 'STATE', 'FDID', 'NFPA Region', 'Population Range', 'PERF_SCORE', 'FIRE_RISK', 'FIRE_SPREAD_RISK', 'DEAT_INJURY_RISK', 'FD_OPTIONS_IN_STATE']]
df4
In [ ]:
df4.to_csv('/tmp/afg_populated.csv')