In [ ]:
import sys
import os
import time
import pandas as pd
sys.path.insert(0, os.path.realpath('..'))
import django
django.setup()
from django.db import connections
pd.set_option('display.max_rows', 3000)
from IPython.display import display
from firecares.firestation.models import FireDepartment
import editdistance
from IPython.lib.pretty import pprint
from firecares.firestation.templatetags import firecares_tags

In [ ]:
df = pd.read_csv('./2015_AFG_FDID.csv')

In [ ]:
df

In [ ]:
fd_names = FireDepartment.objects.values_list('name', 'state')

fd_mappings = {
    'NC': {
        '16045': '01645'
    },
    'PA': {
        '54064': '54564'
    }
}

def massage_fdid(fdid, state):
    import re
    if state in fd_mappings and fdid in fd_mappings[state]:
        return fd_mappings[state][fdid]
    stripped = re.sub('-|\s+', '', fdid)
    if len(stripped) < 5 and stripped:
        return fdid.zfill(5)
    elif len(stripped) > 5 and stripped.lower().startswith(state.lower()):
        return re.sub('^{}'.format(state), '', stripped)
    else:
        return stripped
    
def fuzzy_find_department(name, fdid, state):
    massaged_fdid = massage_fdid(fdid, state)
    if massaged_fdid:
        fds = FireDepartment.objects.filter(fdid__iexact=massaged_fdid, state__iexact=state)
        if fds.exists():
            if len(fds) > 1:
                raise Exception('Multiple matches for {} - {} - {}'.format(name, fdid, state))
            return fds.first(), None
        else:
            fds = FireDepartment.objects.filter(state__iexact=state, name__iexact=name)
            if fds.exists():
                return fds.first(), None
            potentials = filter(lambda x: x[1] == state, fd_names)
            matches = sorted((editdistance.eval(name, x[0]), x[0], name, fdid, state) for x in potentials)
            return None, matches[:5]
    else:
        return None, None

In [ ]:
hits = []
misses = []
records = df.to_dict(orient='index')
for index, data in records.items():
    state = data.get('STATE')
    fdid = data.get('FDID')
    name = data.get('Name')
    dept, options = fuzzy_find_department(name, fdid, state)
    if dept:
        print 'Hit on {}, row {}'.format(dept, index)
        hits.append((index, dept))
    else:
        if options:
            print 'Options for {}, {} - {}:'.format(name, state, fdid)
            pprint(options)
            misses.append((index, name, fdid, state, options))
        else:
            misses.append((index, name, fdid, state, None))
            print 'Complete miss on {}, missing fdid : {}'.format(name, fdid)

In [ ]:
print 'Percent empty: {}'.format(float(len(misses)) / (len(hits) + len(misses)))
display(hits)

In [ ]:
pprint(misses)

Invalid FDID (either in FC or in the CSV):

Emerald Isle Fire Department, NC - 16045 / 01645 (FC)


In [ ]:
def to_risk(n):
    if n == 1:
        return 'Low'
    elif n == 2 or n == 3:
        return 'Medium'
    elif n == 4:
        return 'High'
    else:
        return 'Unknown'

def explode_department(fd):
    if fd.population_class is not None:
        pop_range = FireDepartment.POPULATION_CLASSES[fd.population_class][1]
    else:
        pop_range = 'Unknown'
        
    return [fd.region,
            pop_range,
            fd.metrics.dist_model_score.all,
            to_risk(fd.metrics.community_fire_risk.all),
            to_risk(fd.metrics.community_fire_spread_risk.all),
            to_risk(fd.metrics.community_death_and_injury_risk.all)]

In [ ]:
%%time
exploded_hits = map(lambda x: (list(df.iloc[x[0]][['Name', 'STATE', 'FDID']]) + explode_department(x[1])), hits)

In [ ]:
df2 = pd.DataFrame(data=exploded_hits, columns=df.columns)
display(df2)

In [ ]:
def get_options(opt):
    if opt and opt[4]:
        return ','.join(map(lambda x: x[1], opt[4]))

df_misses = map(lambda x: (x[1], x[3], x[2], 'Not found', 'Not found', 'Not found', 'Not found', 'Not found', 'Not found', get_options(x)), misses)

df3 = pd.DataFrame(data=df_misses, columns=['Name', 'STATE', 'FDID', 'NFPA Region', 'Population Range', 'PERF_SCORE', 'FIRE_RISK', 'FIRE_SPREAD_RISK', 'DEAT_INJURY_RISK', 'FD_OPTIONS_IN_STATE'])
df3

In [ ]:
df4 = df2.append(df3)[['Name', 'STATE', 'FDID', 'NFPA Region', 'Population Range', 'PERF_SCORE', 'FIRE_RISK', 'FIRE_SPREAD_RISK', 'DEAT_INJURY_RISK', 'FD_OPTIONS_IN_STATE']]
df4

In [ ]:
df4.to_csv('/tmp/afg_populated.csv')