The purpose of this collection of scripts is to load the NFIRS yearly data into a dump of the existing Firecares Database. The 18 individual incident and ancillary tables (fireincident, hazchem, hazmat, etc.) are loaded into temporary tables and then appended to the master "fireincident", etc. tables. The bulk of this script deals with matching addresses in geocoding results to the new incidents in "incidentaddress_2014" and appending them to the "incidentaddress" table. The incident_address_2014_aa/ab/ac/ad/ae tables contain the geocoding information from shapefiles and will be used to augment the incidentaddress_2014 table's records with geometries.
Notes on the 2015 load: incident_address_2015_a*.shp data has been loaded into the database as the "incident_address_2015_aa/ab/ac/ad/ae" tables. NUL (\000) characters have been stripped from incidentaddress.txt and fdheader.txt. RISK_FACT1 codes in arsonjuvsub.txt have been replaced with their single-character equivalent (e.g. "U", "1", etc.). Each yearly text file (arson.txt, arsonagencyreferal.txt, arsonjuvsub.txt, basicaid.txt, basicincident.txt, civiliancasualty.txt, codelookup.txt, ems.txt, fdheader.txt, ffcasualty.txt, ffequipfail.txt, fireincident.txt, hazchem.txt, hazmat.txt, hazmatequipinvolved.txt, hazmobprop.txt, incidentaddress.txt, wildlands.txt) maps to its corresponding staging table (e.g. fireincident.txt => fireincident_2015). An "id" serial primary key and a "source" column have been added to the "incidentaddress_2015" table. The rest continues from (as of this writing) line 101 of append_nfirs_yearly_data.sh.
In [2]:
#import pandas as pd
import psycopg2

# Connection used by every cell below.  Autocommit is off, so each cell
# commits (or rolls back) explicitly after its statements run.
conn = psycopg2.connect("dbname=geocoding_2015")
In [306]:
# Need to change the zip5 datatype to match incidentaddress_2014
with conn.cursor() as cursor:
cursor.execute("alter table geocoding_2015.geocoded_addresses alter column zip5 type character varying(5);")
cursor.execute("update geocoding_2015.geocoded_addresses set zip5 = lpad(zip5, 5, '0')")
print cursor.rowcount
conn.commit()
In [308]:
# Align loc_type with the varchar(1) column type of incidentaddress_2014.
loc_type_alter = "alter table geocoding_2015.geocoded_addresses alter column loc_type type character varying(1);"
with conn.cursor() as cursor:
    cursor.execute(loc_type_alter)
conn.commit()
In [311]:
# Ugh, forgot to update the apt_no
with conn.cursor() as cursor:
cursor.execute("update geocoding_2015.geocoded_addresses set apt_no = '' where apt_no is null")
print cursor.rowcount
conn.commit()
In [322]:
# Quick look at per-state incident counts in the geocoded staging table.
from pretty import pprint

per_state_sql = "select state_id, count(state_id) from geocoding_2015.geocoded_addresses group by state_id order by state_id;"
with conn.cursor() as cursor:
    cursor.execute(per_state_sql)
    pprint(cursor.fetchall())
conn.commit()
In [ ]:
In [3]:
# Create some indexes so this query doesn't take years to complete
# NOTE(review): the first two composite indexes differ only in city vs
# city_1 -- both spellings appear in the data (the match query below joins
# on city_1), so both indexes are built to cover the address-match join.
with conn.cursor() as cursor:
    cursor.execute("""
create index on geocoding_2015.geocoded_addresses (num_mile, upper(apt_no), upper(city), zip5, upper(street_pre),
loc_type, upper(streetname), upper(streettype), upper(streetsuf), upper(state_id));""")
    # Two more single-column indexes for the state_id and source filters.
    cursor.execute("""
create index on geocoding_2015.geocoded_addresses (num_mile, upper(apt_no), upper(city_1), zip5, upper(street_pre), loc_type, upper(streetname), upper(streettype), upper(streetsuf), upper(state_id));
create index on geocoding_2015.geocoded_addresses (state_id);
create index on geocoding_2015.geocoded_addresses (source);""")
conn.commit()
In [4]:
# Execute the matching of the geocoded addresses to incident address for 2014, updating the geometry associated w/ each incident
# Joins address_to_geo_2014 to incidentaddress_2014 on the full address
# tuple (case-insensitive on the text parts), then copies the matched
# geometry and geocode source onto the incident row.  Rows with an empty
# num_mile or a zero geocode score are excluded, so blank/unscored
# addresses never receive a geometry.
with conn.cursor() as cursor:
    cursor.execute("""
update incidentaddress_2014 as ia set geom = res.wkb_geometry, source = res.source
from (
select ia.id, aa.wkb_geometry, aa.source from address_to_geo_2014 aa inner join incidentaddress_2014 ia on (
aa.num_mile = ia.num_mile and
upper(aa.apt_no) = upper(ia.apt_no) and
upper(aa.city_1) = upper(ia.city) and
aa.zip5 = ia.zip5 and
upper(aa.street_pre) = upper(ia.street_pre) and
aa.loc_type = ia.loc_type and
upper(aa.streetname) = upper(ia.streetname) and
upper(aa.streettype) = upper(ia.streettype) and
upper(aa.streetsuf) = upper(ia.streetsuf) and
upper(aa.state_id) = upper(ia.state_id)
) where aa.num_mile != '' and score != 0) as res
where ia.id = res.id
""")
    # Number of incident rows that received a geometry.
    print cursor.rowcount
conn.commit()
In [18]:
# Now lets see how many incidents there are in each state for 2014 that are geocoded vs not geocoded
from decimal import *
from pretty import pprint
with conn.cursor() as cursor:
cursor.execute("""
select (100.0 * sum(case when geom is null then 0 else 1 end) / count(1)) as percent_with_geom,
sum(case when geom is null then 0 else 1 end) as with_geom, state_id,
count(state_id) as total_incidents
from incidentaddress_2014 group by state_id order by percent_with_geom desc;
""")
print '% | Has geom | State | Total rows'
for row in cursor.fetchall():
print '{} | {} | {} | {}'.format(row[0], row[1], row[2], row[3])
cursor.execute("""
select sum(case when geom is null then 0 else 1 end) as matches,
(100.0 * sum(case when geom is null then 0 else 1 end) / count(1)) as percent,
count(1) as total
from incidentaddress_2014 order by percent desc;""")
row = cursor.fetchone()
print '# geocode matches: {}, Percent geocoded: {}, Total {}'.format(row[0], row[1], row[2])
conn.commit()
In [31]:
# Just for sanity sake, make sure that the column types are the same between the 2014 and base data
addl_tables = ['arson', 'arsonagencyreferal', 'arsonjuvsub', 'basicaid', 'basicincident', 'civiliancasualty', 'codelookup', 'ems', 'fdheader', 'ffcasualty', 'ffequipfail', 'fireincident', 'hazchem', 'hazmat', 'hazmatequipinvolved', 'hazmobprop', 'incidentaddress', 'wildlands']
for table in addl_tables:
t = pd.read_sql_query("select column_name, data_type from information_schema.columns where table_name='%s'" % table, conn)
t_2014 = pd.read_sql_query("select column_name, data_type from information_schema.columns where table_name='%s_2014'" % table, conn)
t_cols = set([(x.items()[1][1], x.items()[0][1]) for x in arson.to_dict('records')])
t2014_cols = set([(x.items()[1][1], x.items()[0][1]) for x in arson2014.to_dict('records')])
print 'Table: {}, col diffs: {}'.format(table, t2014_cols - t_cols)
In [32]:
# get some pre counts
with conn.cursor() as cursor:
for t in addl_tables:
cursor.execute("select '{table}', count(1) from {table};".format(table=t))
print cursor.fetchone()
conn.commit()
In [43]:
# Update the fdheader table first
# Appends 2014 fire-department headers that are not already present:
# DISTINCT ON (state, fdid) collapses duplicate 2014 rows, and the
# NOT IN filter skips (state, fdid) pairs that already exist in fdheader.
with conn.cursor() as cursor:
    cursor.execute("select count(1) from fdheader_2014;")
    print 'fdheader_2014 count: %s' % cursor.fetchone()[0]
    cursor.execute("select count(1) from fdheader;")
    pre = cursor.fetchone()[0]
    cursor.execute("""INSERT INTO fdheader(
state, fdid, fd_name, fd_str_no, fd_str_pre, fd_street, fd_str_typ,
fd_str_suf, fd_city, fd_zip, fd_phone, fd_fax, fd_email, fd_fip_cty,
no_station, no_pd_ff, no_vol_ff, no_vol_pdc)
(SELECT distinct on (state, fdid) state, fdid, fd_name, fd_str_no, fd_str_pre, fd_street, fd_str_typ,
fd_str_suf, fd_city, fd_zip, fd_phone, fd_fax, fd_email, fd_fip_cty,
no_station, no_pd_ff, no_vol_ff, no_vol_pdc
FROM fdheader_2014 where (state, fdid) not in (select state, fdid from fdheader));""")
    inserted = cursor.rowcount
    cursor.execute("select count(1) from fdheader;")
    post = cursor.fetchone()[0]
    # Sanity print: Post - pre should equal the reported insert count.
    print 'FDHeader Pre: {}, Post: {}, Insert count: {}, Post - pre: {}'.format(pre, post, inserted, post - pre)
conn.commit()
In [55]:
# Update the "codeloop" (appears to be a static table) w/ the 2014 data (truncate and load)
with conn.cursor() as cursor:
cursor.execute("delete from codelookup;")
print 'Deleted: %s' % cursor.rowcount
cursor.execute("insert into codelookup select * from codelookup_2014;")
print 'Inserted: %s' % cursor.rowcount
conn.commit()
In [ ]:
# Insert the 2014 data into the foundation data
def load_2014(table, commit=True):
with conn.cursor() as cursor:
cursor.execute("select count(1) from %s;" % table)
pre = cursor.fetchone()[0]
cursor.execute("insert into %s select * from %s_2014;" % (table, table))
inserted = cursor.rowcount
cursor.execute("select count(1) from %s;" % table)
post = cursor.fetchone()[0]
print 'Table: {}, Pre: {}, Post: {}, Insert count: {}, Post - pre: {}'.format(table, pre, post, inserted, post - pre)
if commit:
conn.commit()
print 'COMMITTED'
else:
conn.rollback()
print 'ROLLBACK'
In [8]:
# Load EVERYTHING except for basicincident, fireincident, incidentaddress
# (those three are special-cased in later cells)
for table in ['arson', 'arsonagencyreferal', 'arsonjuvsub', 'basicaid', 'civiliancasualty', 'ems', 'ffcasualty',
              'ffequipfail', 'hazchem', 'hazmat', 'hazmatequipinvolved', 'hazmobprop', 'wildlands']:
    load_2014(table)
In [17]:
# There must have been an error with the MO 06101 entry in "fdheader_2014", since there are 101 references to it in "basicincident_2014"
with conn.cursor() as cursor:
cursor.execute("""select count(state), state, fdid from basicincident_2014 bi
where (state, fdid) not in (select state, fdid from fdheader)
and (state, fdid) not in (select state, fdid from fdheader_2014)
group by state, fdid""")
print cursor.fetchall()
cursor.execute("select count(1) from basicincident where state = 'MO' and fdid = '6101'")
print cursor.fetchall()
cursor.execute("select count(1) from basicincident_2014 where state = 'MO' and fdid = '6101'")
print cursor.fetchall()
In [19]:
# One-off fix: rewrite (MO, 06101) => (MO, 6101) in basicincident_2014,
# since that's the only orphaned (state, fdid) pair found above.
# (The original comment stated the mapping direction backwards.)
with conn.cursor() as cursor:
    cursor.execute("update basicincident_2014 set fdid = '6101' where state = 'MO' and fdid = '06101'")
    print cursor.rowcount
conn.commit()
In [21]:
with conn.cursor() as cursor:
cursor.execute("""select 'fireincident_2014', count(1) from fireincident_2014
union select 'basicincident_2014', count(1) from basicincident_2014
union select 'incidentaddress_2014', count(1) from incidentaddress_2014""")
print cursor.fetchall()
In [22]:
# Append basicincident_2014 (post fdid fix-up) into basicincident and commit.
load_2014('basicincident', commit=True)
In [25]:
# Looks like fireincident_2014 has the same issue w/ that MO 6101 fire department
with conn.cursor() as cursor:
cursor.execute("""select count(state), state, fdid from fireincident_2014 bi
where (state, fdid) not in (select state, fdid from fdheader)
and (state, fdid) not in (select state, fdid from fdheader_2014)
group by state, fdid""")
print cursor.fetchall()
cursor.execute("select count(1) from fireincident where state = 'MO' and fdid = '6101'")
print cursor.fetchall()
cursor.execute("select count(1) from fireincident_2014 where state = 'MO' and fdid = '6101'")
print cursor.fetchall()
In [27]:
# One-off fix: rewrite (MO, 06101) => (MO, 6101) in fireincident_2014,
# since that's the only orphaned (state, fdid) pair found above.
# (The original comment stated the mapping direction backwards.)
with conn.cursor() as cursor:
    cursor.execute("update fireincident_2014 set fdid = '6101' where state = 'MO' and fdid = '06101'")
    print cursor.rowcount
conn.commit()
In [29]:
# Append fireincident_2014 (post fdid fix-up) into fireincident and commit.
load_2014('fireincident', commit=True)
In [31]:
# Loading all of the 2014 incident addresses into the master "incidentaddress" table
# Copies the explicit column list (including geom and the census block-group
# ids) straight across; no de-duplication is applied here.
# NOTE(review): the "source" column populated by the geocode-match update
# above is NOT in this column list -- confirm whether incidentaddress has
# (or needs) a source column.
with conn.cursor() as cursor:
    cursor.execute("select count(1) from incidentaddress_2014;")
    print 'incidentaddress_2014 count: %s' % cursor.fetchone()[0]
    cursor.execute("select count(1) from incidentaddress;")
    pre = cursor.fetchone()[0]
    cursor.execute("""INSERT INTO incidentaddress(
state, fdid, inc_date, inc_no, exp_no, loc_type, num_mile, street_pre,
streetname, streettype, streetsuf, apt_no, city, state_id, zip5,
zip4, x_street, addid, addid_try, geom, bkgpidfp00, bkgpidfp10)
(SELECT state, fdid, inc_date, inc_no, exp_no, loc_type, num_mile, street_pre,
streetname, streettype, streetsuf, apt_no, city, state_id, zip5,
zip4, x_street, addid, addid_try, geom, bkgpidfp00, bkgpidfp10
FROM incidentaddress_2014);""")
    inserted = cursor.rowcount
    cursor.execute("select count(1) from incidentaddress;")
    post = cursor.fetchone()[0]
    # Sanity print: Post - pre should equal the reported insert count.
    print 'Incident address Pre: {}, Post: {}, Insert count: {}, Post - pre: {}'.format(pre, post, inserted, post - pre)
conn.commit()
In [69]:
# Start moving things over to a separate schema
# geocoding_2014 becomes the archive schema for the 2014 working tables.
with conn.cursor() as cursor:
    cursor.execute("create schema if not exists geocoding_2014;")
conn.commit()
In [35]:
# Park the 2014 address-to-geometry lookup table in the archive schema.
with conn.cursor() as cursor:
    cursor.execute("alter table address_to_geo_2014 set schema geocoding_2014;")
conn.commit()
In [36]:
# The shapefile-derived chunk tables (aa..ae) also move to the archive schema.
with conn.cursor() as cursor:
    for suffix in ('aa', 'ab', 'ac', 'ad', 'ae'):
        chunk_table = 'incident_address_2014_' + suffix
        cursor.execute("alter table %s set schema geocoding_2014;" % chunk_table)
conn.commit()
In [37]:
# We should be able to drop all of the 2014 tables now
# BUG FIX: the loop previously iterated over the undefined name
# "addl_tables_2014" (NameError); it now uses the tables_2014 list
# defined here.
tables_2014 = ['arson_2014', 'arsonagencyreferal_2014', 'arsonjuvsub_2014', 'basicaid_2014', 'basicincident_2014',
               'civiliancasualty_2014', 'codelookup_2014', 'ems_2014', 'fdheader_2014', 'ffcasualty_2014',
               'ffequipfail_2014', 'fireincident_2014', 'hazchem_2014', 'hazmat_2014', 'hazmatequipinvolved_2014',
               'hazmobprop_2014', 'incidentaddress_2014', 'wildlands_2014']
with conn.cursor() as cursor:
    for table in tables_2014:
        cursor.execute("drop table %s;" % table)
conn.commit()
[x] - arson_2014 -> arson [x] - arsonagencyreferal_2014 -> arsonagencyreferal [x] - arsonjuvsub_2014 -> arsonjuvsub [x] - basicaid_2014 -> basicaid [x] - basicincident_2014 -> basicincident [x] - civiliancasualty_2014 -> civiliancasualty [x] - codelookup_2014 -> codelookup [x] - ems_2014 -> ems [x] - fdheader_2014 -> fdheader [x] - ffcasualty_2014 -> ffcasualty [x] - ffequipfail_2014 -> ffequipfail [x] - fireincident_2014 -> fireincident [x] - hazchem_2014 -> hazchem [x] - hazmat_2014 -> hazmat [x] - hazmatequipinvolved_2014 -> hazmatequipinvolved [x] - hazmobprop_2014 -> hazmobprop [x] - incidentaddress_2014 -> incidentaddress [x] - wildlands_2014 -> wildlands
In [ ]: