In [1]:
# import necessary packages
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import geojson
In [2]:
# Read in csvs
city_df = pd.read_csv('cities_v3.csv', encoding='ISO-8859-1', engine='python')
In [3]:
state_centroids_df = pd.read_csv('state_centroids.csv')
In [4]:
# alternate centroids - includes a 'None' option with state FIPS 99
state_centroids_v2_df = pd.read_csv('state_centroids_v2.csv')
In [5]:
county_centroids_df = pd.read_csv('county_centroids.csv', encoding='Windows-1252')
In [6]:
# alternate centroids - includes 'None' options with county FIPS 999
county_centroids_v2_df = pd.read_csv('county_centroids_v2.csv', encoding='Windows-1252')
In [7]:
missing_df = pd.read_csv('Missing_04182020.csv')
In [8]:
unclaimed_df = pd.read_csv('Unclaimed_04182020.csv')
In [9]:
unidentified_df = pd.read_csv('Unidentified_04182020.csv')
In [10]:
# check dataframe
state_centroids_v2_df.head()
Out[10]:
In [11]:
# check dataframe
missing_df.head()
Out[11]:
In [12]:
## Check if any rows with null value for DLC [NOTE: Leave as NaN]
missing_attr_df = missing_df.loc[missing_df['DLC'].isnull()]
# len(missing_attr_df)
missing_attr_df
Out[12]:
In [13]:
## Check if any rows with null value for Missing Age
missing_attr_1_df = missing_df.loc[missing_df['Missing Age'].isnull()]
# len(missing_attr_1_df)
missing_attr_1_df
Out[13]:
In [14]:
# Re-assign missing age values to be 'Unknown'
missing_df['Missing Age'] = missing_df['Missing Age'].fillna('Unknown')
missing_df.head()
Out[14]:
In [15]:
## Double check re-assigned values
missing_age_test_df = missing_df.loc[missing_df['Missing Age']=='Unknown']
missing_age_test_df
# len(missing_age_test_df)
Out[15]:
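Note: this fill-then-verify cycle repeats for many columns below. A small helper could express it once; a sketch (the function name and report format are illustrative, not part of the original notebook):
In [ ]:
# fill NaNs in one column and report how many rows changed (sketch)
def fill_and_check(df, col, value):
    n_missing = df[col].isnull().sum()
    df[col] = df[col].fillna(value)
    print(f"{col}: filled {n_missing} null(s) with {value!r}")
    return df
# e.g. missing_df = fill_and_check(missing_df, 'Missing Age', 'Unknown')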
In [16]:
## Check if any rows with null value for Race / Ethnicity
missing_attr_2_df = missing_df.loc[missing_df['Race / Ethnicity'].isnull()]
# len(missing_attr_2_df)
missing_attr_2_df
Out[16]:
In [17]:
# Re-assign missing race / ethnicity values to be 'Uncertain'
missing_df['Race / Ethnicity'] = missing_df['Race / Ethnicity'].fillna('Uncertain')
missing_df.head()
Out[17]:
In [18]:
# Re-assign 'unsure' race / ethnicity values to be 'Uncertain'
missing_df['Race / Ethnicity'] = missing_df['Race / Ethnicity'].str.replace('Unsure','Uncertain')
missing_df.head()
Out[18]:
In [19]:
# Re-assign 'other' race / ethnicity values to be 'Uncertain'
missing_df['Race / Ethnicity'] = missing_df['Race / Ethnicity'].str.replace('Other','Uncertain')
missing_df.head()
Out[19]:
In [20]:
## Double check re-assigned values
missing_race_test_df = missing_df.loc[missing_df['Race / Ethnicity']=='Uncertain']
missing_race_test_df
# len(missing_race_test_df)
Out[20]:
In [21]:
# check dataframe
missing_df.head()
Out[21]:
In [22]:
# normalize last-name casing: lowercase everything first
missing_df['Last Name'] = missing_df['Last Name'].str.lower()
missing_df.head()
Out[22]:
In [23]:
# then capitalize (note: .str.capitalize() uppercases only the first character
# of the whole string; .str.title() would capitalize each word instead)
missing_df['Last Name'] = missing_df['Last Name'].str.capitalize()
missing_df.head()
Out[23]:
In [24]:
# same casing normalization for first names
missing_df['First Name'] = missing_df['First Name'].str.lower()
missing_df.head()
Out[24]:
In [25]:
missing_df['First Name'] = missing_df['First Name'].str.capitalize()
missing_df.head()
Out[25]:
In [26]:
# check dataframe
unclaimed_df.head()
Out[26]:
In [27]:
## Check if any rows with null value for DBF [NOTE: Leave as NaN]
unclaimed_attr_df = unclaimed_df.loc[unclaimed_df['DBF'].isnull()]
# len(unclaimed_attr_df)
unclaimed_attr_df
Out[27]:
In [28]:
## Check if any rows with null value for last name
unclaimed_attr_1_df = unclaimed_df.loc[unclaimed_df['Last Name'].isnull()]
# len(unclaimed_attr_1_df)
unclaimed_attr_1_df
Out[28]:
In [29]:
# Re-assign missing last name values to be 'Unknown'
unclaimed_df['Last Name'] = unclaimed_df['Last Name'].fillna('Unknown')
unclaimed_df.head()
Out[29]:
In [30]:
## Double check re-assigned values
unclaimed_lname_test_df = unclaimed_df.loc[unclaimed_df['Last Name']=='Unknown']
unclaimed_lname_test_df
# len(unclaimed_lname_test_df)
Out[30]:
In [31]:
## Check if any rows with null value for first name
unclaimed_attr_2_df = unclaimed_df.loc[unclaimed_df['First Name'].isnull()]
# len(unclaimed_attr_2_df)
unclaimed_attr_2_df
Out[31]:
In [32]:
# Re-assign missing first name values to be 'Unknown'
unclaimed_df['First Name'] = unclaimed_df['First Name'].fillna('Unknown')
unclaimed_df.head()
Out[32]:
In [33]:
## Double check re-assigned values
unclaimed_fname_test_df = unclaimed_df.loc[unclaimed_df['First Name']=='Unknown']
unclaimed_fname_test_df
# len(unclaimed_fname_test_df)
Out[33]:
In [34]:
## Check if any rows with null value for sex
unclaimed_attr_3_df = unclaimed_df.loc[unclaimed_df['Sex'].isnull()]
# len(unclaimed_attr_3_df)
unclaimed_attr_3_df
Out[34]:
In [35]:
# Re-assign missing sex values to be 'Unknown'
unclaimed_df['Sex'] = unclaimed_df['Sex'].fillna('Unknown')
unclaimed_df.head()
Out[35]:
In [36]:
## Double check re-assigned values
unclaimed_sex_test_df = unclaimed_df.loc[unclaimed_df['Sex']=='Unknown']
unclaimed_sex_test_df
# len(unclaimed_sex_test_df)
Out[36]:
In [37]:
## Check if any rows with null value for race / ethnicity
unclaimed_attr_4_df = unclaimed_df.loc[unclaimed_df['Race / Ethnicity'].isnull()]
# len(unclaimed_attr_4_df)
unclaimed_attr_4_df
Out[37]:
In [38]:
# Re-assign missing race / ethnicity values to be 'Uncertain'
unclaimed_df['Race / Ethnicity'] = unclaimed_df['Race / Ethnicity'].fillna('Uncertain')
unclaimed_df.head()
Out[38]:
In [39]:
# Re-assign 'unsure' race / ethnicity values to be 'Uncertain'
unclaimed_df['Race / Ethnicity'] = unclaimed_df['Race / Ethnicity'].str.replace('Unsure','Uncertain')
unclaimed_df.head()
Out[39]:
In [40]:
# Re-assign 'other' race / ethnicity values to be 'Uncertain'
unclaimed_df['Race / Ethnicity'] = unclaimed_df['Race / Ethnicity'].str.replace('Other','Uncertain')
unclaimed_df.head()
Out[40]:
In [41]:
## Double check re-assigned values
unclaimed_race_test_df = unclaimed_df.loc[unclaimed_df['Race / Ethnicity']=='Uncertain']
unclaimed_race_test_df
# len(unclaimed_race_test_df)
Out[41]:
In [42]:
# check dataframe
unclaimed_df.head()
Out[42]:
In [43]:
# normalize name casing (see the capitalize-vs-title note above)
unclaimed_df['Last Name'] = unclaimed_df['Last Name'].str.lower()
unclaimed_df.head()
Out[43]:
In [44]:
unclaimed_df['Last Name'] = unclaimed_df['Last Name'].str.capitalize()
unclaimed_df.head()
Out[44]:
In [45]:
unclaimed_df['First Name'] = unclaimed_df['First Name'].str.lower()
unclaimed_df.head()
Out[45]:
In [46]:
unclaimed_df['First Name'] = unclaimed_df['First Name'].str.capitalize()
unclaimed_df.head()
Out[46]:
In [47]:
# check dataframe
unidentified_df.head()
Out[47]:
In [48]:
## Check if any rows with null value for DBF [NOTE: Leave as NaN]
unidentified_attr_df = unidentified_df.loc[unidentified_df['DBF'].isnull()]
# len(unidentified_attr_df)
unidentified_attr_df
Out[48]:
In [49]:
## Check if any rows with null value for Age From
unidentified_attr_1_df = unidentified_df.loc[unidentified_df['Age From'].isnull()]
# len(unidentified_attr_1_df)
unidentified_attr_1_df
Out[49]:
In [50]:
# Re-assign missing age from values to be 0
unidentified_df['Age From'] = unidentified_df['Age From'].fillna(0)
unidentified_df.head()
Out[50]:
In [51]:
## Double check re-assigned values
unidentified_agefrom_test_df = unidentified_df.loc[unidentified_df['Age From']==0]
unidentified_agefrom_test_df
# len(unidentified_agefrom_test_df)
Out[51]:
In [52]:
## Check if any rows with null value for Age To
unidentified_attr_2_df = unidentified_df.loc[unidentified_df['Age To'].isnull()]
# len(unidentified_attr_2_df)
unidentified_attr_2_df
Out[52]:
In [53]:
# Re-assign missing age to values to be 120
unidentified_df['Age To'] = unidentified_df['Age To'].fillna(120)
unidentified_df.head()
Out[53]:
In [54]:
## Double check re-assigned values
unidentified_ageto_test_df = unidentified_df.loc[unidentified_df['Age To']==120]
unidentified_ageto_test_df
# len(unidentified_ageto_test_df)
Out[54]:
In [55]:
## Check if any rows with null value for sex
unidentified_attr_3_df = unidentified_df.loc[unidentified_df['Sex'].isnull()]
# len(unidentified_attr_3_df)
unidentified_attr_3_df
Out[55]:
In [56]:
# Re-assign missing sex values to be 'Unknown'
unidentified_df['Sex'] = unidentified_df['Sex'].fillna('Unknown')
unidentified_df.head()
Out[56]:
In [57]:
## Double check re-assigned values
unidentified_sex_test_df = unidentified_df.loc[unidentified_df['Sex']=='Unknown']
unidentified_sex_test_df
# len(unidentified_sex_test_df)
Out[57]:
In [58]:
## Check if any rows with null value for race / ethnicity
unidentified_attr_4_df = unidentified_df.loc[unidentified_df['Race / Ethnicity'].isnull()]
# len(unidentified_attr_4_df)
unidentified_attr_4_df
Out[58]:
In [59]:
# Re-assign missing race / ethnicity values to be 'Uncertain'
unidentified_df['Race / Ethnicity'] = unidentified_df['Race / Ethnicity'].fillna('Uncertain')
unidentified_df.head()
Out[59]:
In [60]:
# Re-assign 'unsure' race / ethnicity values to be 'Uncertain'
unidentified_df['Race / Ethnicity'] = unidentified_df['Race / Ethnicity'].str.replace('Unsure','Uncertain')
unidentified_df.head()
Out[60]:
In [61]:
# Re-assign 'other' race / ethnicity values to be 'Uncertain'
unidentified_df['Race / Ethnicity'] = unidentified_df['Race / Ethnicity'].str.replace('Other','Uncertain')
unidentified_df.head()
Out[61]:
In [62]:
## Double check re-assigned values
unidentified_race_test_df = unidentified_df.loc[unidentified_df['Race / Ethnicity']=='Uncertain']
unidentified_race_test_df
# len(unidentified_race_test_df)
Out[62]:
In [63]:
# make dictionary of states and state FIPS code
state_dict = dict(zip(state_centroids_df.NAME, state_centroids_df.STATEFP))
# state_dict
# state_dict['Alaska']
len(state_dict)
Out[63]:
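Note: dict(zip(keys, values)) builds a plain lookup table, and Series.map returns NaN for any key not in the table - the null checks below rely on exactly that. A minimal sketch with toy data (not the project csvs):
In [ ]:
toy_centroids = pd.DataFrame({'NAME': ['Alabama', 'Alaska'], 'STATEFP': [1, 2]})
toy_lookup = dict(zip(toy_centroids.NAME, toy_centroids.STATEFP))
toy_states = pd.Series(['Alaska', 'Atlantis'])
toy_states.map(toy_lookup)   # 2.0 and NaN - unmatched names surface as NaN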
In [64]:
missing_df.head()
Out[64]:
In [65]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
missing_df['State_FIPS'] = missing_df['State'].map(state_dict)
missing_df.head()
Out[65]:
In [66]:
# Check unique values in new dataframe field
missing_df['State_FIPS'].unique()
Out[66]:
In [67]:
## If any nulls, check to see what is producing null values [NOTE: fixed by making changes to csv fields, as noted where csvs are imported]
mis_test_df = missing_df.loc[missing_df['State_FIPS'].isnull()]
mis_test_df.head()
Out[67]:
In [68]:
# check missing df
missing_df.head()
# len(missing_df)
Out[68]:
In [69]:
unclaimed_df.head()
# len(unclaimed_df)
Out[69]:
In [70]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
unclaimed_df['State_FIPS'] = unclaimed_df['State'].map(state_dict)
unclaimed_df.head()
Out[70]:
In [71]:
# Check unique values in new dataframe field
unclaimed_df['State_FIPS'].unique()
Out[71]:
In [72]:
## If any nulls, check to see what is producing null values [NOTE: cannot be addressed, as these cases do not have a state or county assigned]
unc_test_df = unclaimed_df.loc[unclaimed_df['State_FIPS'].isnull()]
# len(unc_test_df)
unc_test_df
Out[72]:
In [73]:
# change NaN FIPS codes (for cases with no city, county, or state) to 99 (the None option in state_centroids_v2.csv)
unclaimed_df['State_FIPS'] = unclaimed_df['State_FIPS'].fillna(99)
unclaimed_df['State_FIPS'] = unclaimed_df['State_FIPS'].astype(int)
unclaimed_df.head()
Out[73]:
In [74]:
## Double check re-assigned values
unc_test_df2 = unclaimed_df.loc[unclaimed_df['State_FIPS']==99]
unc_test_df2
# len(unc_test_df2)
Out[74]:
In [75]:
# check full dataframe
len(unclaimed_df)
Out[75]:
In [76]:
unidentified_df.head()
Out[76]:
In [77]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
unidentified_df['State_FIPS'] = unidentified_df['State'].map(state_dict)
unidentified_df.head()
Out[77]:
In [78]:
# Check unique values in new dataframe field
unidentified_df['State_FIPS'].unique()
Out[78]:
In [79]:
## If any nulls, check to see what is producing null values [NOTE: fixed by making changes to csv fields, as noted where csvs are imported]
uni_test_df = unidentified_df.loc[unidentified_df['State_FIPS'].isnull()]
uni_test_df.head()
Out[79]:
In [80]:
# make dictionary of state FIPS codes and state names
state_FIPS_dict = dict(zip(state_centroids_v2_df.STATEFP, state_centroids_v2_df.NAME))
# state_FIPS_dict
# state_FIPS_dict['Alaska']
len(state_FIPS_dict)
Out[80]:
In [81]:
# check county centroids v2 df
county_centroids_v2_df.head()
Out[81]:
In [82]:
# Add new column to dataframe, using the state FIPS field as a key in state_FIPS_dict, to pull the correct state name for each row
county_centroids_v2_df['STATE_NAME'] = county_centroids_v2_df['STATEFP'].map(state_FIPS_dict)
# check dataframe
county_centroids_v2_df.head()
# Check unique values in new dataframe field
# county_centroids_v2_df['STATE_NAME'].unique()
Out[82]:
In [83]:
# sort county centroids by state FIPS
county_centroids_v2_df = county_centroids_v2_df.sort_values(by=['STATEFP'])
county_centroids_v2_df.head()
Out[83]:
In [84]:
# Add column with compound field key
county_centroids_v2_df['County_Key'] = county_centroids_v2_df['STATEFP'].astype(str) + "_" + county_centroids_v2_df['NAME']
county_centroids_v2_df.head()
Out[84]:
In [85]:
# check unique values and length
county_key_v2_list = county_centroids_v2_df['County_Key'].unique()
len(county_key_v2_list)
Out[85]:
In [86]:
# make dictionary of counties and county FIPS code (GEOID field)
county_v2_dict = dict(zip(county_centroids_v2_df.County_Key, county_centroids_v2_df.GEOID))
len(county_v2_dict)
Out[86]:
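Note: county names repeat across states (many states have a Washington County, for example), so the lookup is keyed on 'STATEFP_NAME' rather than the bare county name. A minimal sketch of the compound-key pattern, with illustrative toy values:
In [ ]:
toy = pd.DataFrame({'STATEFP': [1, 13], 'NAME': ['Washington', 'Washington'], 'GEOID': [1129, 13303]})
toy['County_Key'] = toy['STATEFP'].astype(str) + '_' + toy['NAME']
toy_dict = dict(zip(toy.County_Key, toy.GEOID))
toy_dict['1_Washington']   # disambiguated by the state FIPS prefix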
In [87]:
# make dictionary of county FIPS codes (GEOID field) and county names
countynames_v2_dict = dict(zip(county_centroids_v2_df.GEOID, county_centroids_v2_df.NAME))
len(countynames_v2_dict)
# countynames_v2_dict
Out[87]:
In [88]:
type(county_centroids_v2_df['GEOID'][0])
Out[88]:
In [89]:
# sort county centroids by state FIPS
county_centroids_df = county_centroids_df.sort_values(by=['STATEFP'])
county_centroids_df.head()
Out[89]:
In [90]:
# Add column with compound field key
county_centroids_df['County_Key'] = county_centroids_df['STATEFP'].astype(str) + "_" + county_centroids_df['NAME']
county_centroids_df.head()
Out[90]:
In [91]:
# check unique values and length
county_key_list = county_centroids_df['County_Key'].unique()
len(county_key_list)
Out[91]:
In [92]:
# make dictionary of counties and county FIPS code (GEOID field)
county_dict = dict(zip(county_centroids_df.County_Key, county_centroids_df.GEOID))
len(county_dict)
Out[92]:
In [93]:
# check missing df
missing_df.head()
Out[93]:
In [94]:
# Add column with compound field key
missing_df['State_County'] = missing_df['State_FIPS'].astype(str) + "_" + missing_df['County']
missing_df.head()
Out[94]:
In [95]:
# Add new column to dataframe, using the State_County name field as a key in the county_dict, to pull the correct County FIPS code for each row
missing_df['County_FIPS'] = missing_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
missing_df['County_FIPS'].unique()
Out[95]:
In [96]:
# check null values [NOTE: all ok - no city or county assigned]
missing_county_null_df = missing_df.loc[missing_df['County_FIPS'].isnull()]
# missing_county_null_df.shape
missing_county_null_df
Out[96]:
In [97]:
# As needed, export nulls to address [NOTE: all that could be addressed have been]
# missing_county_null_df.to_csv('missing_county_nulls.csv', encoding='Windows-1252')
In [98]:
# check missing_df
missing_df.head()
Out[98]:
In [99]:
## Check if any rows with null value for 'State'
mis_state_test_df = missing_df.loc[missing_df['State'].isnull()]
# len(mis_state_test_df)
mis_state_test_df
Out[99]:
In [100]:
## Check if any rows with null value for 'County' (cases with no city or county assigned)
mis_county_test_df = missing_df.loc[missing_df['County'].isnull()]
# len(mis_county_test_df)
mis_county_test_df
Out[100]:
In [101]:
# change NaN County name (for cases with no city or county) to 'None'
missing_df['County'] = missing_df['County'].fillna('None')
missing_df.head()
Out[101]:
In [102]:
## Double check re-assigned values
mis_county_test_2_df = missing_df.loc[missing_df['County']=='None']
mis_county_test_2_df
# len(mis_county_test_2_df)
Out[102]:
In [103]:
# Re-Add column with compound field key
missing_df['State_County'] = missing_df['State_FIPS'].astype(str) + "_" + missing_df['County']
missing_df.head()
Out[103]:
In [104]:
## Double check re-assigned values
mis_county_test_3_df = missing_df.loc[missing_df['County']=='None']
mis_county_test_3_df
# len(mis_county_test_3_df)
Out[104]:
In [105]:
# Re-add County FIPS column to dataframe, using the State_County name field as a key in the county_v2_dict, to pull the correct County FIPS code for each row
missing_df['County_FIPS'] = missing_df['State_County'].map(county_v2_dict)
# Check unique values in new dataframe field
missing_df['County_FIPS'].unique()
Out[105]:
In [106]:
# check null values to make sure none are left [Note - all gone!]
missing_county_null_v2_df = missing_df.loc[missing_df['County_FIPS'].isnull()]
# missing_county_null_v2_df.shape
missing_county_null_v2_df
Out[106]:
In [107]:
## Double check re-assigned values
mis_county_test_4_df = missing_df.loc[missing_df['County']=='None']
mis_county_test_4_df
# len(mis_county_test_4_df)
Out[107]:
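Note: the recipe above - fill the missing County with 'None', rebuild the compound key, then remap against county_v2_dict (which carries the extra '<state FIPS>_None' sentinel rows) - is reused verbatim for the unclaimed and unidentified dataframes below. Condensed into one cell it reads (a summary of the steps already shown, not new logic):
In [ ]:
missing_df['County'] = missing_df['County'].fillna('None')
missing_df['State_County'] = missing_df['State_FIPS'].astype(str) + '_' + missing_df['County']
missing_df['County_FIPS'] = missing_df['State_County'].map(county_v2_dict)   # sentinel keys catch the 'None' rows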
In [108]:
# check unclaimed df
unclaimed_df.head()
Out[108]:
In [109]:
# Add column with compound field key
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str) + "_" + unclaimed_df['County']
unclaimed_df.head()
Out[109]:
In [110]:
# Add new column to dataframe, using the State_County field as a key in county_dict, to pull the correct county FIPS code for each row
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
unclaimed_df['County_FIPS'].unique()
Out[110]:
In [111]:
# check null values
unclaimed_county_null_df = unclaimed_df.loc[unclaimed_df['County_FIPS'].isnull()]
# unclaimed_county_null_df.shape
unclaimed_county_null_df
Out[111]:
In [112]:
# As needed, export nulls to address [NOTE: all have been addressed. 1241 cases have no county]
# unclaimed_county_null_df.to_csv('unclaimed_county_nulls.csv', encoding='Windows-1252')
In [113]:
# check unclaimed_df
unclaimed_df.head()
Out[113]:
In [114]:
## Check if any rows with null value for 'State'
unc_state_test_df = unclaimed_df.loc[unclaimed_df['State'].isnull()]
# len(unc_state_test_df)
unc_state_test_df
Out[114]:
In [115]:
# change NaN State name (for cases with no city, county, or state) to 'None'
unclaimed_df['State'] = unclaimed_df['State'].fillna('None')
unclaimed_df.head()
Out[115]:
In [116]:
## Double check re-assigned values
unc_state_test_2_df = unclaimed_df.loc[unclaimed_df['State']=='None']
unc_state_test_2_df
# len(unc_state_test_2_df)
Out[116]:
In [117]:
## Check if any rows with null value for 'County' (cases with no city or county assigned)
unc_county_test_df = unclaimed_df.loc[unclaimed_df['County'].isnull()]
# len(unc_county_test_df)
unc_county_test_df
Out[117]:
In [118]:
# change NaN County name (for cases with no city or county) to 'None'
unclaimed_df['County'] = unclaimed_df['County'].fillna('None')
unclaimed_df.head()
Out[118]:
In [119]:
## Double check re-assigned values
unc_county_test_2_df = unclaimed_df.loc[unclaimed_df['County']=='None']
unc_county_test_2_df
# len(unc_county_test_2_df)
Out[119]:
In [120]:
# Re-Add column with compound field key
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str) + "_" + unclaimed_df['County']
unclaimed_df.head()
Out[120]:
In [121]:
## Double check re-assigned values
unc_county_test_3_df = unclaimed_df.loc[unclaimed_df['County']=='None']
unc_county_test_3_df
# len(unc_county_test_3_df)
Out[121]:
In [122]:
# Re-add County FIPS column to dataframe, using the State_County name field as a key in the county_v2_dict, to pull the correct County FIPS code for each row
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_v2_dict)
# Check unique values in new dataframe field
unclaimed_df['County_FIPS'].unique()
Out[122]:
In [123]:
# check null values to make sure none are left [Note - all gone!]
unclaimed_county_null_v2_df = unclaimed_df.loc[unclaimed_df['County_FIPS'].isnull()]
# unclaimed_county_null_v2_df.shape
unclaimed_county_null_v2_df
Out[123]:
In [124]:
## Double check re-assigned values
unc_county_test_4_df = unclaimed_df.loc[unclaimed_df['County']=='None']
unc_county_test_4_df
# len(unc_county_test_4_df)
Out[124]:
In [125]:
# check unidentified df
unidentified_df.head()
Out[125]:
In [126]:
# Add column with compound field key
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str) + "_" + unidentified_df['County']
unidentified_df.head()
Out[126]:
In [127]:
# Add new column to dataframe, using the State_County field as a key in county_dict, to pull the correct county FIPS code for each row
unidentified_df['County_FIPS'] = unidentified_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
unidentified_df['County_FIPS'].unique()
Out[127]:
In [128]:
# check null values
unidentified_county_null_df = unidentified_df.loc[unidentified_df['County_FIPS'].isnull()]
# unidentified_county_null_df.shape
# len(unidentified_county_null_df)
unidentified_county_null_df
Out[128]:
In [129]:
# As needed, export nulls to address [NOTE - all have been addressed - 28 remain with no city or county]
# unidentified_county_null_df.to_csv('unidentified_county_nulls.csv', encoding='Windows-1252')
In [130]:
# check unidentified_df
unidentified_df.head()
Out[130]:
In [131]:
## Check if any rows with null value for 'State'
uni_state_test_df = unidentified_df.loc[unidentified_df['State'].isnull()]
# len(uni_state_test_df)
uni_state_test_df
Out[131]:
In [132]:
## Check if any rows with null value for 'County' (cases with no city or county assigned)
uni_county_test_df = unidentified_df.loc[unidentified_df['County'].isnull()]
# len(uni_county_test_df)
uni_county_test_df
Out[132]:
In [133]:
# change NaN County name (for cases with no city or county) to 'None'
unidentified_df['County'] = unidentified_df['County'].fillna('None')
unidentified_df.head()
Out[133]:
In [134]:
## Double check re-assigned values
uni_county_test_2_df = unidentified_df.loc[unidentified_df['County']=='None']
uni_county_test_2_df
# len(uni_county_test_2_df)
Out[134]:
In [135]:
# Re-Add column with compound field key
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str) + "_" + unidentified_df['County']
unidentified_df.head()
Out[135]:
In [136]:
## Double check re-assigned values
uni_county_test_3_df = unidentified_df.loc[unidentified_df['County']=='None']
uni_county_test_3_df
# len(uni_county_test_3_df)
Out[136]:
In [137]:
# Re-add County FIPS column to dataframe, using the State_County name field as a key in the county_v2_dict, to pull the correct County FIPS code for each row
unidentified_df['County_FIPS'] = unidentified_df['State_County'].map(county_v2_dict)
# Check unique values in new dataframe field
unidentified_df['County_FIPS'].unique()
Out[137]:
In [138]:
# check null values to make sure none are left [Note - all gone!]
unidentified_county_null_v2_df = unidentified_df.loc[unidentified_df['County_FIPS'].isnull()]
# unidentified_county_null_v2_df.shape
unidentified_county_null_v2_df
Out[138]:
In [139]:
## Double check re-assigned values
uni_county_test_4_df = unidentified_df.loc[unidentified_df['County']=='None']
uni_county_test_4_df
# len(uni_county_test_4_df)
Out[139]:
In [140]:
city_df.head()
Out[140]:
In [141]:
# pr_df = city_df.loc[city_df['FEATURE_NA']=='Mayagüez']
# pr_df
In [142]:
# Drop columns that aren't needed
city_df = city_df.drop(columns=['FEATURE_CL','PRIMARY_LA', 'PRIM_LONG_', 'SOURCE_LAT', 'SOURCE_LON', 'SOURCE_L_1', 'SOURCE_L_2', 'ELEV_IN_M', 'ELEV_IN_FT', 'MAP_NAME', 'DATE_CREAT', 'DATE_EDITE'])
In [143]:
city_df = city_df.sort_values(by=['STATE_ALPH'])
city_df.head()
Out[143]:
In [144]:
# Rename columns
city_df.rename(columns={'FEATURE_NA':'CITY_NAME', 'STATE_ALPH':'STUSPS', 'STATE_NUME':'STATEFP', 'COUNTY_NAM':'GNIS_COUNTY', 'PRIM_LAT_D': 'Lat_dd', 'PRIM_LON_1': 'Lon_dd'}, inplace=True)
In [145]:
city_df.head()
Out[145]:
In [146]:
# Make state FIPS int
city_df['STATEFP'] = city_df['STATEFP'].astype(int)
city_df.head()
Out[146]:
In [147]:
# Make county number int
city_df['COUNTY_NUM'] = city_df['COUNTY_NUM'].astype(int)
city_df.head()
Out[147]:
In [148]:
# reformat county num with leading zeros
city_df['COUNTY_NUM']=city_df['COUNTY_NUM'].apply('{:0>3}'.format)
city_df.head()
Out[148]:
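Note: pandas can also do the zero-padding through the string accessor; str.zfill(3) is equivalent to the '{:0>3}'.format apply above:
In [ ]:
# city_df['COUNTY_NUM'] = city_df['COUNTY_NUM'].astype(str).str.zfill(3)   # equivalent alternative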
In [149]:
# create county FIPS field (state FIPS + zero-padded 3-digit county number; no leading state zero, matching the integer GEOIDs)
city_df['COUNTY_FIPS'] = city_df['STATEFP'].astype(str) + city_df['COUNTY_NUM'].astype(str)
city_df.head(5000)
Out[149]:
In [150]:
# Make county FIPS int
city_df['COUNTY_FIPS'] = city_df['COUNTY_FIPS'].astype(int)
type(city_df['COUNTY_FIPS'][0])
Out[150]:
In [151]:
city_df.head()
Out[151]:
In [152]:
# Add new column to dataframe, using the county FIPS field as a key in countynames_v2_dict, to pull the correct county name for each row
city_df['COUNTY_NAME'] = city_df['COUNTY_FIPS'].map(countynames_v2_dict)
city_df.head()
Out[152]:
In [153]:
# Add new column to dataframe, using the state FIPS field as a key in state_FIPS_dict, to pull the correct state name for each row
city_df['STATE_NAME'] = city_df['STATEFP'].map(state_FIPS_dict)
city_df.head()
Out[153]:
In [154]:
# strip stray question marks ('?' is a regex metacharacter, so match literally)
city_df['CITY_NAME'] = city_df['CITY_NAME'].str.replace("?","", regex=False)
In [155]:
city_df['CITY_STRING'] = city_df['CITY_NAME']
city_df.head()
Out[155]:
In [156]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("."," ")
In [157]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("St ","saint")
In [158]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("Township","")
In [159]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("Near ","")
In [160]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace(",","")
In [161]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("'","")
In [162]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("-","")
In [163]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("/","")
In [164]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace("?","")
In [165]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.replace(" ","")
In [166]:
city_df['CITY_STRING'] = city_df['CITY_STRING'].str.lower()
city_df.head()
Out[166]:
In [167]:
# make joint county_city column
city_df['COUNTYFP_CITY'] = city_df['COUNTY_FIPS'].astype(str) + '_' + city_df['CITY_STRING']
city_df.head()
Out[167]:
In [168]:
# check length of dataframe
len(city_df)
Out[168]:
In [169]:
# if necessary export csv to check if duplicates [NOTE: yes - there were duplicates of the same city in the same county in the same state]
# city_df.to_csv('city_dups.csv', encoding='ISO-8859-1')
In [170]:
# get rid of duplicates
city_df = city_df.drop_duplicates(subset='COUNTYFP_CITY')
len(city_df)
Out[170]:
In [171]:
# make dictionary of COUNTYFP_CITY and city name string
city_dict = dict(zip(city_df.COUNTYFP_CITY, city_df.CITY_NAME))
# city_dict
# city_dict['4013_adobe']
len(city_dict)
Out[171]:
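Note: the same normalization recipe is about to be applied cell by cell to the missing, unclaimed, and unidentified city fields. A consolidated helper covering the shared rules might look like the sketch below (simplified: rule order is approximate, and dataframe-specific rules such as 'Vicinity Of' are omitted):
In [ ]:
import re

def normalize_city(name):
    # sketch of the shared city-string cleanup used in the cells below
    if pd.isnull(name):
        return np.nan
    s = name.title().replace('.', ' ').replace('?', '')
    for pat, repl in [('Ft', 'fort'), ('St ', 'saint'), ('Ste ', 'saint'), ('Mt', 'mount'),
                      ('Twp', ''), ('Township', ''), ('Near ', ''), ('County', '')]:
        s = s.replace(pat, repl)
    compass = {'S': 'south', 'E': 'east', 'N': 'north', 'W': 'west'}
    s = re.sub(r'^([SENW]) ', lambda m: compass[m.group(1)], s)
    s = re.sub(r"[,'\-/ ]", '', s).lower()
    if re.search(r'\d', s) or s in ('unknown', 'undetermined'):
        return np.nan
    return s or np.nan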
In [172]:
missing_df = missing_df.sort_values(by=['Case Number'])
missing_df.head()
Out[172]:
In [173]:
# check dataframe
missing_df.head()
Out[173]:
In [174]:
# check length of dataframe
len(missing_df)
Out[174]:
In [175]:
missing_df['City'] = missing_df['City'].str.replace("?","", regex=False)
In [176]:
# title-case the city names
missing_df['City_string'] = missing_df['City'].str.title()
missing_df.head()
Out[176]:
In [177]:
missing_df['City_string'] = missing_df['City_string'].str.replace("."," ")
In [178]:
missing_df['City_string'] = missing_df['City_string'].str.replace("Ft","fort")
In [179]:
missing_df['City_string'] = missing_df['City_string'].str.replace("St ","saint")
In [180]:
missing_df['City_string'] = missing_df['City_string'].str.replace("Ste ","saint")
In [181]:
missing_df['City_string'] = missing_df['City_string'].str.replace("Mt","mount")
In [182]:
missing_df['City_string'] = missing_df['City_string'].str.replace("^S ","south")
In [183]:
missing_df['City_string'] = missing_df['City_string'].str.replace("^E ","east")
In [184]:
missing_df['City_string'] = missing_df['City_string'].str.replace("^N ","north")
In [185]:
missing_df['City_string'] = missing_df['City_string'].str.replace("^W ","west")
In [186]:
missing_df['City_string'] = missing_df['City_string'].str.replace("Twp","")
In [187]:
missing_df['City_string'] = missing_df['City_string'].str.replace("Township","")
In [188]:
missing_df['City_string'] = missing_df['City_string'].str.replace("Near ","")
In [189]:
missing_df['City_string'] = missing_df['City_string'].str.replace("North Of ","")
In [190]:
missing_df['City_string'] = missing_df['City_string'].str.replace(" Dc","")
In [191]:
missing_df['City_string'] = missing_df['City_string'].str.replace(" District Of Columbia","")
In [192]:
missing_df['City_string'] = missing_df['City_string'].str.replace(",","")
In [193]:
missing_df['City_string'] = missing_df['City_string'].str.replace("'","")
In [194]:
missing_df['City_string'] = missing_df['City_string'].str.replace("-","")
In [195]:
missing_df['City_string'] = missing_df['City_string'].str.replace("/","")
In [196]:
missing_df['City_string'] = missing_df['City_string'].str.replace("?","")
In [197]:
missing_df['City_string'] = missing_df['City_string'].str.replace(" ","")
In [198]:
missing_df['City_string'] = missing_df['City_string'].str.replace("County","")
In [199]:
missing_df['City_string'] = missing_df['City_string'].str.lower()
missing_df.head()
Out[199]:
In [200]:
missing_df['City_string'] = missing_df['City_string'].str.replace('unknown','', regex=False)
In [201]:
missing_df['City_string'] = missing_df['City_string'].str.replace('undetermined','', regex=False)
In [202]:
missing_df['City_string'] = missing_df['City_string'].str.replace(r'.*\d.*', '', regex=True)
In [203]:
missing_df['City_string'] = missing_df['City_string'].replace(r'^\s*$', np.nan, regex=True)
missing_df.head()
Out[203]:
In [204]:
# check null values
missing_citystring_null_df = missing_df.loc[missing_df['City_string'].isnull()]
# missing_citystring_null_df.shape
missing_citystring_null_df
Out[204]:
In [205]:
# Add column with compound field key
missing_df['County_city'] = missing_df['County_FIPS'].astype(str) + "_" + missing_df['City_string']
missing_df.head()
Out[205]:
In [206]:
# Add new column to dataframe, using the County_city name field as a key in the city_dict, to pull the GNIS city name for each row
missing_df['GNIS_city'] = missing_df['County_city'].map(city_dict)
missing_df.head()
Out[206]:
In [207]:
# Check unique values in new dataframe field
missing_df['GNIS_city'].unique()
Out[207]:
In [208]:
# check null values
missing_city_null_df = missing_df.loc[missing_df['GNIS_city'].isnull()]
missing_city_null_df.shape
# missing_city_null_df
Out[208]:
In [209]:
# As needed, export nulls to address
# missing_city_null_df.to_csv('missing_city_nulls.csv', encoding='utf-8')
In [210]:
unclaimed_df = unclaimed_df.sort_values(by=['DBF'])
unclaimed_df.head()
Out[210]:
In [211]:
# check dataframe
unclaimed_df.head()
Out[211]:
In [212]:
# check length of dataframe
len(unclaimed_df)
Out[212]:
In [213]:
unclaimed_df['City'] = unclaimed_df['City'].str.replace("?","", regex=False)
In [214]:
# title-case the city names
unclaimed_df['City_string'] = unclaimed_df['City'].str.title()
unclaimed_df.head()
Out[214]:
In [215]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("."," ")
In [216]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("Ft","fort")
In [217]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("St ","saint")
In [218]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("Ste ","saint")
In [219]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("Mt","mount")
In [220]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("^S ","south")
In [221]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("^E ","east")
In [222]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("^N ","north")
In [223]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("^W ","west")
In [224]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("Twp","")
In [225]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("Township","")
In [226]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("Near ","")
In [227]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("North Of ","")
In [228]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace(" Dc","")
In [229]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace(" District Of Columbia","")
In [230]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace(",","")
In [231]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("'","")
In [232]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("-","")
In [233]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("/","")
In [234]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("?","")
In [235]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace(" ","")
In [236]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace("County","")
In [237]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.lower()
unclaimed_df.head()
Out[237]:
In [238]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace('unknown','', regex=False)
In [239]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace('undetermined','', regex=False)
In [240]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].str.replace(r'.*\d.*', '', regex=True)
In [241]:
unclaimed_df['City_string'] = unclaimed_df['City_string'].replace(r'^\s*$', np.nan, regex=True)
unclaimed_df.head()
Out[241]:
In [242]:
# check null values
unclaimed_citystring_null_df = unclaimed_df.loc[unclaimed_df['City_string'].isnull()]
# unclaimed_citystring_null_df.shape
unclaimed_citystring_null_df
Out[242]:
In [243]:
# Add column with compound field key
unclaimed_df['County_city'] = unclaimed_df['County_FIPS'].astype(str) + "_" + unclaimed_df['City_string']
unclaimed_df.head()
Out[243]:
In [244]:
# Add new column to dataframe, using the County_city name field as a key in the city_dict, to pull the GNIS city name for each row
unclaimed_df['GNIS_city'] = unclaimed_df['County_city'].map(city_dict)
unclaimed_df.head()
Out[244]:
In [245]:
# Check unique values in new dataframe field
unclaimed_df['GNIS_city'].unique()
Out[245]:
In [246]:
# check null values
unclaimed_city_null_df = unclaimed_df.loc[unclaimed_df['GNIS_city'].isnull()]
unclaimed_city_null_df.shape
# unclaimed_city_null_df
Out[246]:
In [247]:
# As needed, export nulls to address
# unclaimed_city_null_df.to_csv('unclaimed_city_nulls.csv', encoding='utf-8')
In [248]:
# check dataframe
unidentified_df.head()
Out[248]:
In [249]:
# check length of dataframe
len(unidentified_df)
Out[249]:
In [250]:
unidentified_df['City'] = unidentified_df['City'].str.replace("?","", regex=False)
In [251]:
# title-case the city names
unidentified_df['City_string'] = unidentified_df['City'].str.title()
unidentified_df.head()
Out[251]:
In [252]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("."," ")
In [253]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Ft","fort")
In [254]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("St ","saint")
In [255]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Ste ","saint")
In [256]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Mt","mount")
In [257]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("^S ","south")
In [258]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("^E ","east")
In [259]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("^N ","north")
In [260]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("^W ","west")
In [261]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Twp","")
In [262]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Township","")
In [263]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Near ","")
In [264]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Vicinity Of ","")
In [265]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("North Of ","")
In [266]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Northeast Of ","")
In [267]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("East Of ","")
In [268]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Southeast Of ","")
In [269]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("South Of ","")
In [270]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Southwest Of ","")
In [271]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("West Of ","")
In [272]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("Northwest Of ","")
In [273]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace(" Vicinity","")
In [274]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace(" Dc","")
In [275]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace(" District Of Columbia","")
In [276]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace(",","")
In [277]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("'","")
In [278]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("-","")
In [279]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("/","")
In [280]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("?","")
In [281]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace(" ","")
In [282]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace("County","")
In [283]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.lower()
unidentified_df.head()
Out[283]:
In [284]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace('unknown','', regex=False)
In [285]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace('undetermined','', regex=False)
In [286]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace('unincorporated','', regex=False)
In [287]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace('rural','', regex=False)
In [288]:
unidentified_df['City_string'] = unidentified_df['City_string'].str.replace(r'.*\d.*', '', regex=True)
In [289]:
unidentified_df['City_string'] = unidentified_df['City_string'].replace(r'^\s*$', np.nan, regex=True)
unidentified_df.head()
Out[289]:
In [290]:
# check null values
unidentified_citystring_null_df = unidentified_df.loc[unidentified_df['City_string'].isnull()]
# unidentified_citystring_null_df.shape
unidentified_citystring_null_df
Out[290]:
In [291]:
# Add column with compound field key
unidentified_df['County_city'] = unidentified_df['County_FIPS'].astype(str) + "_" + unidentified_df['City_string']
unidentified_df.head()
Out[291]:
In [292]:
# Add new column to dataframe, using the County_city name field as a key in the city_dict, to pull the GNIS city name for each row
unidentified_df['GNIS_city'] = unidentified_df['County_city'].map(city_dict)
unidentified_df.head()
Out[292]:
In [293]:
# Check unique values in new dataframe field
unidentified_df['GNIS_city'].unique()
Out[293]:
In [294]:
# check null values
unidentified_city_null_df = unidentified_df.loc[unidentified_df['GNIS_city'].isnull()]
unidentified_city_null_df.shape
# unidentified_city_null_df
Out[294]:
In [295]:
# As needed, export nulls to address
# unidentified_city_null_df.to_csv('unidentified_city_nulls.csv', encoding='utf-8')
In [ ]:
# Load state shapefile
state_shapefile_gdf = gpd.read_file('Shapefiles/state_polygons.shp')
In [ ]:
state_shapefile_gdf.plot()
In [ ]:
# write to geoJSON
state_shapefile_gdf.to_file("JSON/state_polygons.json", driver="GeoJSON", encoding='utf-8')
In [ ]:
# load county shapefile
county_shapefile_gdf = gpd.read_file('Shapefiles/county_polygons.shp')
In [ ]:
county_shapefile_gdf.plot()
In [ ]:
# write to geoJSON
county_shapefile_gdf.to_file("JSON/county_polygons.json", driver="GeoJSON", encoding='utf-8')
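Note: GeoJSON (RFC 7946) expects WGS84 longitude/latitude. If the shapefiles ship in a different projection, reproject before writing; a defensive sketch, assuming the shapefile declares its CRS (and a geopandas version where .crs is a pyproj CRS):
In [ ]:
if county_shapefile_gdf.crs is not None and county_shapefile_gdf.crs.to_epsg() != 4326:
    county_shapefile_gdf = county_shapefile_gdf.to_crs(epsg=4326)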
In [ ]:
missing_count = missing_df.groupby('State').count()
# len(missing_count)
missing_count.head()
In [ ]:
# Remove all columns except case number count
missing_count = missing_count.drop(columns=['DLC','Last Name', 'First Name', 'Missing Age', 'City', 'County', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS', 'City_string', 'County_city', 'GNIS_city'])
In [ ]:
# add column for state (since state is now index)
missing_count['State'] = missing_count.index
In [ ]:
# rename case number count column
missing_count = missing_count.rename(columns = {'Case Number': 'Missing_CaseCount'}, inplace = False)
In [ ]:
# check dataframe
missing_count.head()
In [ ]:
# make dictionary of states and missing person counts
missing_statecount_dict = dict(zip(missing_count.State, missing_count.Missing_CaseCount))
len(missing_statecount_dict)
# type(missing_statecount_dict['Alabama'])
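Note: the groupby().count() / drop-columns / rename sequence in this and the following count cells can be collapsed: groupby(...).size() gives the per-group row count directly, and .to_dict() replaces the dict(zip(...)) step. An equivalent shortcut (assuming 'Case Number' is never null, which counting on that column already presumes):
In [ ]:
# missing_statecount_dict = missing_df.groupby('State').size().to_dict()   # equivalent shortcut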
In [ ]:
# Get count of values in missing database w/ no state assigned
mis_null_series = missing_df.loc[missing_df['State']=='None'].count()
mis_null_ct = mis_null_series['Case Number']
mis_null_ct
In [ ]:
unclaimed_count = unclaimed_df.groupby('State').count()
# len(unclaimed_count)
unclaimed_count.head()
In [ ]:
# Remove all columns except case number count
unclaimed_count = unclaimed_count.drop(columns=['DBF','Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS', 'City_string', 'County_city', 'GNIS_city'])
In [ ]:
# add column for state (since state is now index)
unclaimed_count['State'] = unclaimed_count.index
In [ ]:
# rename case number count column
unclaimed_count = unclaimed_count.rename(columns = {'Case Number': 'Unclaimed_CaseCount'}, inplace = False)
In [ ]:
# check dataframe
unclaimed_count.head()
In [ ]:
# make dictionary of states and unclaimed case counts
unclaimed_statecount_dict = dict(zip(unclaimed_count.State, unclaimed_count.Unclaimed_CaseCount))
len(unclaimed_statecount_dict)
# unclaimed_statecount_dict
In [ ]:
# Get count of values in unclaimed database w/ no state assigned
unc_null_series = unclaimed_df.loc[unclaimed_df['State']=='None'].count()
unc_null_ct = unc_null_series['Case Number']
unc_null_ct
In [ ]:
unidentified_count = unidentified_df.groupby('State').count()
# len(unidentified_count)
unidentified_count.head()
In [ ]:
# Remove all columns except case number count
unidentified_count = unidentified_count.drop(columns=['DBF','Age From', 'Age To', 'City', 'County', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS', 'City_string', 'County_city', 'GNIS_city'])
In [ ]:
# add column for state (since state is now index)
unidentified_count['State'] = unidentified_count.index
In [ ]:
# rename case number count column
unidentified_count = unidentified_count.rename(columns = {'Case Number': 'Unidentified_CaseCount'}, inplace = False)
In [ ]:
# check dataframe
unidentified_count.head()
In [ ]:
# make dictionary of states and unidentified case counts
unidentified_statecount_dict = dict(zip(unidentified_count.State, unidentified_count.Unidentified_CaseCount))
len(unidentified_statecount_dict)
# unidentified_statecount_dict
In [ ]:
# Get count of values in unidentified database w/ no state assigned
uni_null_series = unidentified_df.loc[unidentified_df['State']=='None'].count()
uni_null_ct = uni_null_series['Case Number']
uni_null_ct
In [ ]:
# make a new summary dataframe based on the state centroids (copy so the original is untouched)
summary_df = state_centroids_v2_df.copy()
# sort by state FIPS
summary_df = summary_df.sort_values(by=['STATEFP'])
summary_df.head()
In [ ]:
# Remove all columns except state info and case number counts
summary_df = summary_df.drop(columns=['OBJECTID','STATENS','AFFGEOID','GEOID','LSAD','ALAND','AWATER','ORIG_FID'])
summary_df.head()
In [ ]:
len(summary_df)
In [ ]:
# Add new column to dataframe, using the state name field as a key in the missing_dict, to pull the correct missing case count for each state
summary_df['Missing_Count'] = summary_df['NAME'].map(missing_statecount_dict)
summary_df.head()
In [ ]:
# Check null values [NOTE: seems fine to have nulls]
missing_null_df = summary_df.loc[summary_df['Missing_Count'].isnull()]
missing_null_df
In [ ]:
# Add new column to dataframe, using the state name field as a key in the unclaimed_dict, to pull the correct unclaimed case count for each state
summary_df['Unclaimed_Count'] = summary_df['NAME'].map(unclaimed_statecount_dict)
summary_df.head()
In [ ]:
# Check null values [NOTE: seems fine to have nulls]
unclaimed_null_df = summary_df.loc[summary_df['Unclaimed_Count'].isnull()]
unclaimed_null_df
In [ ]:
# No longer needed, since summary dataframe now made AFTER data cleaning and adding State with FIPS 99
# # # change NaN Unclaimed count for fake FIPS 99 to count of values in missing database w/ no state assigned (21)
# index_Series = summary_df.loc[summary_df['STATEFP']==99]
# index_None = index_Series.index[0]
# summary_df.loc[index_None, 'Unclaimed_Count'] = unc_null_ct
# # check value
# summary_df['Unclaimed_Count'][index_None]
In [ ]:
# Add new column to dataframe, using the state name field as a key in the unidentified_dict, to pull the correct unidentified case count for each state
summary_df['Unidentified_Count'] = summary_df['NAME'].map(unidentified_statecount_dict)
summary_df.head()
In [ ]:
# Check null values [NOTE: seems fine to have nulls]
unidentified_null_df = summary_df.loc[summary_df['Unidentified_Count'].isnull()]
unidentified_null_df
In [ ]:
summary_df.head()
In [ ]:
# Sum the three count columns by name. This replaces iloc[:, -4:-1], which
# explains the old "change -4 to -3, run, then change back" mystery: before
# Total_Count existed the counts were the LAST three columns, so -4:-1
# selected the wrong ones; only after Total_Count was appended did it line up.
summary_df['Total_Count'] = summary_df[['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']].sum(axis=1)
summary_df.head(56)
In [ ]:
# check final summary_df
summary_df.head()
In [ ]:
# Convert dataframe to geodataframe
summary_gdf = gpd.GeoDataFrame(summary_df, geometry=gpd.points_from_xy(x=summary_df.Lon_dd, y=summary_df.Lat_dd))
summary_gdf.head()
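Note: points_from_xy builds the geometry but does not attach a coordinate reference system. Since Lon_dd/Lat_dd are decimal degrees, tagging the frame as WGS84 before the GeoJSON export would be prudent (an assumption that the centroids are indeed WGS84):
In [ ]:
summary_gdf = gpd.GeoDataFrame(summary_df, geometry=gpd.points_from_xy(x=summary_df.Lon_dd, y=summary_df.Lat_dd), crs='EPSG:4326')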
In [ ]:
# write to geoJSON
summary_gdf.to_file("JSON/summary_counts.json", driver="GeoJSON", encoding='utf-8')
In [ ]:
# load state json
summary_state_poly_gdf = gpd.read_file('JSON/state_polygons.json')
In [ ]:
summary_state_poly_gdf.head()
In [ ]:
# Remove all columns except county info and case number counts
summary_state_poly_gdf = summary_state_poly_gdf.drop(columns=['STATENS','AFFGEOID','GEOID', 'LSAD','ALAND','AWATER', 'Shape_Leng', 'Shape_Area'])
summary_state_poly_gdf.head()
In [ ]:
len(summary_state_poly_gdf)
In [ ]:
# sort by state FIPS
summary_state_poly_gdf = summary_state_poly_gdf.sort_values(by=['STATEFP'])
summary_state_poly_gdf.head()
In [ ]:
# Add new column to dataframe, using the state name field as a key in the missing_dict, to pull the correct missing case count for each state
summary_state_poly_gdf['Missing_Count'] = summary_state_poly_gdf['NAME'].map(missing_statecount_dict)
summary_state_poly_gdf.head()
In [ ]:
# Add new column to dataframe, using the state name field as a key in the unclaimed_dict, to pull the correct unclaimed case count for each state
summary_state_poly_gdf['Unclaimed_Count'] = summary_state_poly_gdf['NAME'].map(unclaimed_statecount_dict)
summary_state_poly_gdf.head()
In [ ]:
# Add new column to dataframe, using the state name field as a key in the unidentified_dict, to pull the correct unidentified case count for each state
summary_state_poly_gdf['Unidentified_Count'] = summary_state_poly_gdf['NAME'].map(unidentified_statecount_dict)
summary_state_poly_gdf.head()
In [ ]:
summary_state_poly_gdf.head()
In [ ]:
# Sum the three count columns by name (see the positional-slice note above)
summary_state_poly_gdf['Total_Count'] = summary_state_poly_gdf[['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']].sum(axis=1)
summary_state_poly_gdf.head(56)
In [ ]:
# check final summary_df
summary_state_poly_gdf.head()
In [ ]:
# write to geoJSON
summary_state_poly_gdf.to_file("JSON/state_poly_counts.json", driver="GeoJSON", encoding='utf-8')
In [ ]:
missing_df.head()
In [ ]:
missing_county_count = missing_df.groupby('County_FIPS').count()
# len(missing_county_count)
missing_county_count.head()
In [ ]:
# Remove all columns except case number count
missing_county_count = missing_county_count.drop(columns=['DLC','Last Name', 'First Name', 'Missing Age', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'City_string', 'County_city', 'GNIS_city'])
In [ ]:
# add column for county FIPS (since county FIPS is now index)
missing_county_count['County_FIPS'] = missing_county_count.index
In [ ]:
# rename case number count column
missing_county_count = missing_county_count.rename(columns = {'Case Number': 'Missing_CaseCount'}, inplace = False)
In [ ]:
# check dataframe
missing_county_count.head()
In [ ]:
# make dictionary of county_FIPS and missing person counts
missing_countycount_dict = dict(zip(missing_county_count.County_FIPS, missing_county_count.Missing_CaseCount))
len(missing_countycount_dict)
# missing_countycount_dict[1001]
In [ ]:
unclaimed_df.head()
In [ ]:
unclaimed_county_count = unclaimed_df.groupby('County_FIPS').count()
# len(unclaimed_county_count)
unclaimed_county_count.head()
In [ ]:
# Remove all columns except case number count
unclaimed_county_count = unclaimed_county_count.drop(columns=['DBF','Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'State', 'Date Modified', 'State_FIPS', 'State_County', 'City_string', 'County_city', 'GNIS_city'])
In [ ]:
# add column for county FIPS (since county FIPS is now index)
unclaimed_county_count['County_FIPS'] = unclaimed_county_count.index
In [ ]:
# rename case number count column
unclaimed_county_count = unclaimed_county_count.rename(columns = {'Case Number': 'Unclaimed_CaseCount'}, inplace = False)
In [ ]:
# check dataframe
unclaimed_county_count.head()
In [ ]:
# make dictionary of county_FIPS and unclaimed case counts
unclaimed_countycount_dict = dict(zip(unclaimed_county_count.County_FIPS, unclaimed_county_count.Unclaimed_CaseCount))
len(unclaimed_countycount_dict)
# unclaimed_countycount_dict
In [ ]:
unidentified_df.head()
In [ ]:
unidentified_county_count = unidentified_df.groupby('County_FIPS').count()
# len(unidentified_county_count)
unidentified_county_count.head()
In [ ]:
# Remove all columns except case number count
unidentified_county_count = unidentified_county_count.drop(columns=['DBF','Age From', 'Age To', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'City_string', 'County_city', 'GNIS_city'])
In [ ]:
# add column for county FIPS (since county FIPS is now index)
unidentified_county_count['County_FIPS'] = unidentified_county_count.index
In [ ]:
# rename case number count column
unidentified_county_count = unidentified_county_count.rename(columns = {'Case Number': 'Unidentified_CaseCount'}, inplace = False)
In [ ]:
# check dataframe
unidentified_county_count.head()
In [ ]:
# make dictionary of county_FIPS and unidentified case counts
unidentified_countycount_dict = dict(zip(unidentified_county_count.County_FIPS, unidentified_county_count.Unidentified_CaseCount))
len(unidentified_countycount_dict)
# unidentified_countycount_dict
In [ ]:
county_centroids_v2_df.head()
In [ ]:
len(county_centroids_v2_df)
In [ ]:
# make a new summary dataframe based on the county centroids
summary_county_df = county_centroids_v2_df.copy()
# sort by county FIPS
summary_county_df = summary_county_df.sort_values(by=['GEOID'])
summary_county_df.head()
In [ ]:
# Remove all columns except county info and case number counts
summary_county_df = summary_county_df.drop(columns=['OBJECTID','COUNTYFP','COUNTYNS','AFFGEOID','LSAD','ALAND','AWATER','ORIG_FID'])
summary_county_df.head()
In [ ]:
len(summary_county_df)
In [ ]:
# Add new column to dataframe, using the county FIPS field as a key in the missing_county_dict, to pull the correct missing case count for each county
summary_county_df['Missing_Count'] = summary_county_df['GEOID'].map(missing_countycount_dict)
summary_county_df.head()
In [ ]:
# Check null values [NOTE: fine to have nulls]
missing_county_null_df = summary_county_df.loc[summary_county_df['Missing_Count'].isnull()]
missing_county_null_df
In [ ]:
# Add new column to dataframe, using the county FIPS field as a key in the unclaimed_county_dict, to pull the correct unclaimed case count for each county
summary_county_df['Unclaimed_Count'] = summary_county_df['GEOID'].map(unclaimed_countycount_dict)
summary_county_df.head()
In [ ]:
# Check null values [NOTE: fine to have nulls]
unclaimed_county_null_df = summary_county_df.loc[summary_county_df['Unclaimed_Count'].isnull()]
unclaimed_county_null_df
In [ ]:
# Add new column to dataframe, using the county_FIPS field as a key in the unidentified_county_dict, to pull the correct unidentified case count for each county
summary_county_df['Unidentified_Count'] = summary_county_df['GEOID'].map(unidentified_countycount_dict)
summary_county_df.head()
In [ ]:
# Check null values [NOTE: fine to have nulls]
unidentified_county_null_df = summary_county_df.loc[summary_county_df['Unidentified_Count'].isnull()]
unidentified_county_null_df
In [ ]:
summary_county_df.head()
In [ ]:
# Sum the three count columns by name (see the positional-slice note above)
summary_county_df['Total_Count'] = summary_county_df[['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']].sum(axis=1)
summary_county_df.head()
In [ ]:
# check final summary_county_df
summary_county_df.head()
In [ ]:
len(summary_county_df)
In [ ]:
# Convert dataframe to geodataframe
summary_county_gdf = gpd.GeoDataFrame(summary_county_df, geometry=gpd.points_from_xy(x=summary_county_df.Lon_dd, y=summary_county_df.Lat_dd))
summary_county_gdf.head()
In [ ]:
len(summary_county_gdf)
In [ ]:
# write to geoJSON
summary_county_gdf.to_file("JSON/county_counts.json", driver="GeoJSON", encoding='utf-8')
In [ ]:
# load county json
summary_county_poly_gdf = gpd.read_file('JSON/county_polygons.json')
In [ ]:
summary_county_poly_gdf.head()
In [ ]:
# Remove all columns except county info and case number counts
summary_county_poly_gdf = summary_county_poly_gdf.drop(columns=['COUNTYFP','COUNTYNS','AFFGEOID','LSAD','ALAND','AWATER','Shape_Leng', 'Shape_Area'])
summary_county_poly_gdf.head()
In [ ]:
len(summary_county_poly_gdf)
In [ ]:
# sort by county FIPS
summary_county_poly_gdf = summary_county_poly_gdf.sort_values(by=['GEOID'])
summary_county_poly_gdf.head()
In [ ]:
type(summary_county_poly_gdf['GEOID'][0])
In [ ]:
summary_county_poly_gdf['GEOID'] = summary_county_poly_gdf['GEOID'].astype(int)
In [ ]:
type(summary_county_poly_gdf['GEOID'][0])
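Note: the int cast above matters because the county count dictionaries are keyed by integer FIPS (leading zeros already stripped), while GEOID in the polygon file loads as a string - Series.map matches on exact key equality, so mismatched dtypes would silently yield all NaN. A quick sanity check before mapping (a sketch):
In [ ]:
sample_key = next(iter(missing_countycount_dict))
type(sample_key), summary_county_poly_gdf['GEOID'].dtype   # both should be integer types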
In [ ]:
# Add new column to dataframe, using the county FIPS field as a key in the missing_county_dict, to pull the correct missing case count for each county
summary_county_poly_gdf['Missing_Count'] = summary_county_poly_gdf['GEOID'].map(missing_countycount_dict)
summary_county_poly_gdf.head()
In [ ]:
# Add new column to dataframe, using the county FIPS field as a key in the unclaimed_county_dict, to pull the correct unclaimed case count for each county
summary_county_poly_gdf['Unclaimed_Count'] = summary_county_poly_gdf['GEOID'].map(unclaimed_countycount_dict)
summary_county_poly_gdf.head()
In [ ]:
# Add new column to dataframe, using the county_FIPS field as a key in the unidentified_county_dict, to pull the correct unidentified case count for each county
summary_county_poly_gdf['Unidentified_Count'] = summary_county_poly_gdf['GEOID'].map(unidentified_countycount_dict)
summary_county_poly_gdf.head()
In [ ]:
summary_county_poly_gdf.head()
In [ ]:
# Sum the three count columns by name (see the positional-slice note above)
summary_county_poly_gdf['Total_Count'] = summary_county_poly_gdf[['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']].sum(axis=1)
summary_county_poly_gdf.head()
In [ ]:
# check final summary_county_poly_gdf
summary_county_poly_gdf.head()
In [ ]:
len(summary_county_poly_gdf)
In [ ]:
# write to geoJSON
summary_county_poly_gdf.to_file("JSON/county_poly_counts.json", driver="GeoJSON", encoding='utf-8')
In [296]:
missing_df.head()
Out[296]:
In [297]:
missing_city_count = missing_df.groupby('County_city').count()
# len(missing_city_count)
missing_city_count.head()
Out[297]:
In [298]:
# Remove all columns except case number count
missing_city_count = missing_city_count.drop(columns=['DLC','Last Name', 'First Name', 'Missing Age', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS', 'City_string', 'GNIS_city'])
In [299]:
# add column for county_city (since county_city is now the index)
missing_city_count['County_city'] = missing_city_count.index
In [300]:
# rename case number count column
missing_city_count = missing_city_count.rename(columns = {'Case Number': 'Missing_CaseCount'}, inplace = False)
In [301]:
# check dataframe
missing_city_count.head()
Out[301]:
In [302]:
# make dictionary of county_city and missing person counts
missing_city_dict = dict(zip(missing_city_count.County_city, missing_city_count.Missing_CaseCount))
len(missing_city_dict)
# missing_city_dict
Out[302]:
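For what it's worth, the whole count → drop → re-index → rename → zip pipeline above collapses to one groupby(...).size() call, assuming Case Number is never null (count() skips nulls, size() does not):
# rows per County_city, straight to {county_city: case_count}
missing_city_dict_alt = missing_df.groupby('County_city').size().to_dict()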
In [303]:
unclaimed_df.head()
Out[303]:
In [304]:
unclaimed_city_count = unclaimed_df.groupby('County_city').count()
# len(unclaimed_city_count)
unclaimed_city_count.head()
Out[304]:
In [305]:
# Remove all columns except case number count
unclaimed_city_count = unclaimed_city_count.drop(columns=['DBF','Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'State', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS', 'City_string', 'GNIS_city'])
In [306]:
# add column for county_city (since county_city is now index)
unclaimed_city_count['County_city'] = unclaimed_city_count.index
In [307]:
# rename case number count column
unclaimed_city_count = unclaimed_city_count.rename(columns = {'Case Number': 'Unclaimed_CaseCount'}, inplace = False)
In [308]:
# check dataframe
unclaimed_city_count.head()
Out[308]:
In [309]:
# make dictionary of county_city and unclaimed case counts
unclaimed_city_dict = dict(zip(unclaimed_city_count.County_city, unclaimed_city_count.Unclaimed_CaseCount))
len(unclaimed_city_dict)
# unclaimed_city_dict
Out[309]:
In [310]:
unidentified_df.head()
Out[310]:
In [311]:
unidentified_city_count = unidentified_df.groupby('County_city').count()
# len(unidentified_city_count)
unidentified_city_count.head()
Out[311]:
In [312]:
# Remove all columns except case number count
unidentified_city_count = unidentified_city_count.drop(columns=['DBF','Age From', 'Age To', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS', 'City_string', 'GNIS_city'])
In [313]:
# add column for county_city (since county_city is now index)
unidentified_city_count['County_city'] = unidentified_city_count.index
In [314]:
# rename case number count column
unidentified_city_count = unidentified_city_count.rename(columns = {'Case Number': 'Unidentified_CaseCount'}, inplace = False)
In [315]:
# check dataframe
unidentified_city_count.head()
Out[315]:
In [316]:
# make dictionary of county_city and unidentified case counts
unidentified_city_dict = dict(zip(unidentified_city_count.County_city, unidentified_city_count.Unidentified_CaseCount))
len(unidentified_city_dict)
# unidentified_city_dict
Out[316]:
In [317]:
city_df.head()
Out[317]:
In [318]:
len(city_df)
Out[318]:
In [319]:
# make a new summary dataframe based on the city centroids
# (.copy() so the count columns added below don't also land on city_df)
summary_city_df = city_df.copy()
# sort by countyfp_city
summary_city_df = summary_city_df.sort_values(by=['COUNTYFP_CITY'])
summary_city_df.head()
Out[319]:
In [320]:
len(summary_city_df)
Out[320]:
In [321]:
# Add new column to dataframe, using the countyfp_city field as a key in the missing_city_dict, to pull the correct missing case count for each city
summary_city_df['Missing_Count'] = summary_city_df['COUNTYFP_CITY'].map(missing_city_dict)
summary_city_df.head()
Out[321]:
In [322]:
# Check null values [NOTE: fine to have nulls]
missing_city_null_df = summary_city_df.loc[summary_city_df['Missing_Count'].isnull()]
missing_city_null_df
Out[322]:
In [323]:
# Add new column to dataframe, using the countyfp_city field as a key in the unclaimed_city_dict, to pull the correct unclaimed case count for each city
summary_city_df['Unclaimed_Count'] = summary_city_df['COUNTYFP_CITY'].map(unclaimed_city_dict)
summary_city_df.head()
Out[323]:
In [324]:
# Check null values [NOTE: fine to have nulls]
unclaimed_city_null_df = summary_city_df.loc[summary_city_df['Unclaimed_Count'].isnull()]
unclaimed_city_null_df
Out[324]:
In [325]:
# Add new column to dataframe, using the countyfp_city field as a key in the unidentified_city_dict, to pull the correct unidentified case count for each city
summary_city_df['Unidentified_Count'] = summary_city_df['COUNTYFP_CITY'].map(unidentified_city_dict)
summary_city_df.head()
Out[325]:
In [326]:
# Check null values [NOTE: fine to have nulls]
unidentified_city_null_df = summary_city_df.loc[summary_city_df['Unidentified_Count'].isnull()]
unidentified_city_null_df
Out[326]:
In [327]:
summary_city_df.head()
Out[327]:
In [330]:
# Sum the three count columns by name (see note in the county section; avoids the fragile iloc[:, -4:-1] slice)
summary_city_df['Total_Count'] = summary_city_df[['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']].sum(axis=1)
summary_city_df.head(120780)
Out[330]:
In [331]:
len(summary_city_df)
Out[331]:
In [332]:
summary_city_zeros_df = summary_city_df.loc[summary_city_df['Total_Count']==0]
summary_city_zeros_df.shape
Out[332]:
In [333]:
# sanity check: total city rows minus zero-count rows = cities with at least one case
199473-193049
Out[333]:
In [334]:
final_summary_city_df = summary_city_df[summary_city_df['Total_Count'] != 0]
final_summary_city_df.shape
Out[334]:
In [335]:
# check dataframe
final_summary_city_df.head(20)
Out[335]:
In [336]:
# reset index of dataframe
final_summary_city_df = final_summary_city_df.reset_index(drop=True)
final_summary_city_df.head()
Out[336]:
In [337]:
city_honolulu_df = final_summary_city_df.loc[final_summary_city_df['COUNTY_FIPS']==15003]
city_honolulu_df
Out[337]:
In [338]:
type(final_summary_city_df['CITY_NAME'][0])
Out[338]:
In [339]:
# check final summary_city_df
final_summary_city_df.head(20)
Out[339]:
In [340]:
len(final_summary_city_df)
Out[340]:
In [341]:
# Convert dataframe to geodataframe
final_summary_city_gdf = gpd.GeoDataFrame(final_summary_city_df, geometry=gpd.points_from_xy(x=final_summary_city_df.Lon_dd, y=final_summary_city_df.Lat_dd))
final_summary_city_gdf.head()
Out[341]:
In [342]:
len(final_summary_city_gdf)
Out[342]:
In [343]:
# write to geoJSON
final_summary_city_gdf.to_file("JSON/city_counts.json", driver="GeoJSON", encoding='utf-8')
In [ ]:
Goal format: { "type": "Feature", "name": "Wisconsin", "properties": { "missing": [ ], "unclaimed": [ ], "unidentified": [ ], "filtered": [ ] } "geometry": { "type": "Point", "coordinates": [ -117.79750667, 36.03755926 ] }
In [ ]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)
In [ ]:
# check df
unclaimed_df.head()
In [ ]:
# check data types
type(unclaimed_df['State_FIPS'][0])
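The state loop below matches records on State_FIPS == STATEFP, which only fires if both columns hold the same dtype; a quick guard (an optional check, not in the original flow):
# both sides of the FIPS comparison must share a dtype for matches to fire
assert missing_df['State_FIPS'].dtype == state_centroids_v2_df['STATEFP'].dtype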
In [ ]:
# check centroids
state_centroids_v2_df.head()
len(state_centroids_v2_df)
In [ ]:
# sort each database by state FIPS
# sort state centroids by state FIPS
state_centroids_v2_df = state_centroids_v2_df.sort_values(by=['STATEFP'])
state_centroids_v2_df.head()
In [ ]:
# sort each database by state FIPS
# sort missing by state FIPS
missing_df = missing_df.sort_values(by=['State_FIPS'])
missing_df.head()
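Note that sort_values keeps the original integer index labels, and the loops below read rows with label-based lookups like missing_df['State_FIPS'][j], so these sorts only change display order, not the loops' behavior. To make row order actually follow the sort, the index would need resetting (a sketch, not applied here):
missing_df = missing_df.sort_values(by=['State_FIPS']).reset_index(drop=True)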
In [ ]:
# sort unclaimed by state FIPS
unclaimed_df = unclaimed_df.sort_values(by=['State_FIPS'])
unclaimed_df.head()
In [ ]:
# sort unidentified by state FIPS
unidentified_df = unidentified_df.sort_values(by=['State_FIPS'])
unidentified_df.head()
In [ ]:
## Create state array
state_array = []
## for each state in state_centroids...
i = 0
while i < len(state_centroids_v2_df):
state_dict = {}
state_dict["type"] = "Feature"
state_dict["name"] = state_centroids_v2_df["NAME"][i]
state_dict["name_abbr"] = state_centroids_v2_df["STUSPS"][i]
state_dict["state_FIPS"] = str(state_centroids_v2_df["STATEFP"][i])
state_dict["properties"] = {}
missing_array = []
j = 0
while j < len(missing_df):
# check if state_fips matches
if missing_df['State_FIPS'][j] == state_centroids_v2_df["STATEFP"][i]:
missing_dict = {}
for item in missing_header:
missing_dict[item] = str(missing_df[item][j])
# append dictionary to missing array
missing_array.append(missing_dict)
# increment j
j += 1
state_dict["properties"]['missing'] = missing_array
unclaimed_array = []
k = 0
while k < len(unclaimed_df):
# check if state_fips matches
if unclaimed_df['State_FIPS'][k] == state_centroids_v2_df["STATEFP"][i]:
unclaimed_dict = {}
for item in unclaimed_header:
unclaimed_dict[item] = str(unclaimed_df[item][k])
# append dictionary to unclaimed array
unclaimed_array.append(unclaimed_dict)
# increment k
k += 1
state_dict["properties"]['unclaimed'] = unclaimed_array
unidentified_array = []
l = 0
while l < len(unidentified_df):
# check if state_fips matches
if unidentified_df['State_FIPS'][l] == state_centroids_v2_df["STATEFP"][i]:
unidentified_dict = {}
for item in unidentified_header:
unidentified_dict[item] = str(unidentified_df[item][l])
# append dictionary to unidentified array
unidentified_array.append(unidentified_dict)
# increment l
l += 1
state_dict["properties"]['unidentified'] = unidentified_array
state_dict["properties"]['filtered'] = []
## set geometry
state_dict["geometry"] = {}
state_dict["geometry"]["type"] = "Point"
state_dict["geometry"]["coordinates"] = [state_centroids_v2_df['Lon_dd'][i], state_centroids_v2_df['Lat_dd'][i]]
## append state dictionary to array
state_array.append(state_dict)
# progress statement
print("added state", str(i+1), "of", len(state_centroids_v2_df))
# increment iterator
i += 1
# state_array
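The nested while loops re-scan every case table once per state, which is O(states × records). An equivalent build can pre-bucket the records with a single groupby pass per table (a sketch using the same dataframes; the output content is unchanged):
# one pass: bucket case records by state FIPS
missing_by_state = {
    fips: grp.astype(str).to_dict(orient='records')
    for fips, grp in missing_df.groupby('State_FIPS')
}
# inside the state loop, the per-state list becomes a single dict lookup:
# state_dict["properties"]['missing'] = missing_by_state.get(state_fips, [])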
In [ ]:
# check item in array
len(state_array)
# state_array[0]['properties']['unclaimed']
In [ ]:
# Create FeatureCollection
state_feature_collection = {}
state_feature_collection["type"] = "FeatureCollection"
state_feature_collection["features"] = state_array
# state_feature_collection
In [ ]:
# Convert FeatureCollection to JSON format
state_geojson = geojson.dumps(state_feature_collection)
# check type to make sure conversion was successful
print(type(state_geojson))
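A slightly stronger check than type() is to parse the string back and count the features (optional):
parsed = geojson.loads(state_geojson)   # geojson objects behave like dicts
print(len(parsed['features']))          # expect one feature per state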
In [ ]:
# Save the JSON-formatted FeatureCollection as a new .json file
with open('JSON/state_geojson.json', 'w', encoding='utf-8') as json_file:
json_file.write(state_geojson)
Goal format: { "type": "Feature", "name": "Dane", "state": "Wisconsin", "state_FIPS": , "county_FIPS": , "properties": { "missing": [ ], "unclaimed": [ ], "unidentified": [ ], "filtered": [ ] } "geometry": { "type": "Point", "coordinates": [ -117.79750667, 36.03755926 ] }
In [ ]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)
In [ ]:
# check df
unclaimed_df.head()
In [ ]:
# check data types
type(unclaimed_df['State_FIPS'][0])
In [ ]:
# check centroids
county_centroids_v2_df.head()
len(county_centroids_v2_df)
In [ ]:
# sort each database by county FIPS
# sort county centroids by county FIPS
county_centroids_v2_df = county_centroids_v2_df.sort_values(by=['GEOID'])
county_centroids_v2_df.head()
In [ ]:
# sort each database by county FIPS
# sort missing by county FIPS
missing_df = missing_df.sort_values(by=['County_FIPS'])
missing_df.head()
In [ ]:
# sort unclaimed by county FIPS
unclaimed_df = unclaimed_df.sort_values(by=['County_FIPS'])
unclaimed_df.head()
In [ ]:
# sort unidentified by county FIPS
unidentified_df = unidentified_df.sort_values(by=['County_FIPS'])
unidentified_df.head()
In [ ]:
## Create county array
county_array = []
## for each county in county_centroids_v2...
i = 0
while i < len(county_centroids_v2_df):
county_dict = {}
county_dict["type"] = "Feature"
county_dict["name"] = str(county_centroids_v2_df["NAME"][i])
county_dict["county_FIPS"] = str(county_centroids_v2_df["GEOID"][i])
county_dict["state_name"] = str(county_centroids_v2_df["STATE_NAME"][i])
county_dict["state_FIPS"] = str(county_centroids_v2_df["STATEFP"][i])
county_dict["properties"] = {}
missing_array = []
j = 0
while j < len(missing_df):
# check if county_fips matches
if missing_df['County_FIPS'][j] == county_centroids_v2_df["GEOID"][i]:
missing_dict = {}
for item in missing_header:
missing_dict[item] = str(missing_df[item][j])
# append dictionary to missing array
missing_array.append(missing_dict)
# increment j
j += 1
county_dict["properties"]['missing'] = missing_array
unclaimed_array = []
k = 0
while k < len(unclaimed_df):
# check if county_fips matches
if unclaimed_df['County_FIPS'][k] == county_centroids_v2_df["GEOID"][i]:
unclaimed_dict = {}
for item in unclaimed_header:
unclaimed_dict[item] = str(unclaimed_df[item][k])
# append dictionary to unclaimed array
unclaimed_array.append(unclaimed_dict)
# increment k
k += 1
county_dict["properties"]['unclaimed'] = unclaimed_array
unidentified_array = []
l = 0
while l < len(unidentified_df):
# check if county_fips matches
if unidentified_df['County_FIPS'][l] == county_centroids_v2_df["GEOID"][i]:
unidentified_dict = {}
for item in unidentified_header:
unidentified_dict[item] = str(unidentified_df[item][l])
# append dictionary to unidentified array
unidentified_array.append(unidentified_dict)
# increment l
l += 1
county_dict["properties"]['unidentified'] = unidentified_array
county_dict["properties"]['filtered'] = []
## set geometry
county_dict["geometry"] = {}
county_dict["geometry"]["type"] = "Point"
county_dict["geometry"]["coordinates"] = [county_centroids_v2_df['Lon_dd'][i], county_centroids_v2_df['Lat_dd'][i]]
## append county dictionary to array
county_array.append(county_dict)
# progress statement
print("added county", str(i+1), "of", len(county_centroids_v2_df))
# increment iterator
i += 1
# county_array
In [ ]:
# check item in array
len(county_array)
# county_array[55]['properties']['unclaimed']
In [ ]:
# Create FeatureCollection
county_feature_collection = {}
county_feature_collection["type"] = "FeatureCollection"
county_feature_collection["features"] = county_array
# county_feature_collection
In [ ]:
# Convert FeatureCollection to JSON format
county_geojson = geojson.dumps(county_feature_collection)
# check type to make sure conversion was successful
print(type(county_geojson))
In [ ]:
# Save the JSON-formatted FeatureCollection as a new .json file
with open('JSON/county_geojson.json', 'w', encoding='utf-8') as json_file:
json_file.write(county_geojson)
Goal format:
{ "type": "Feature",
  "name": "Dane",
  "state": "Wisconsin",
  "state_FIPS": ,
  "county_FIPS": ,
  "properties": { "missing": [ ], "unclaimed": [ ], "unidentified": [ ], "filtered": [ ] },
  "geometry": { "type": "Point", "coordinates": [ -117.79750667, 36.03755926 ] } }
In [344]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)
In [345]:
# check cities
# final_summary_city_df.head()
len(final_summary_city_df)
Out[345]:
In [346]:
# sort each dataframe by its county FIPS + city code
# sort the city summary by COUNTYFP_CITY
final_summary_city_df = final_summary_city_df.sort_values(by=['COUNTYFP_CITY'])
final_summary_city_df.head()
Out[346]:
In [347]:
# sort each case table by its County_city code
# sort missing by County_city
missing_df = missing_df.sort_values(by=['County_city'])
missing_df.head()
Out[347]:
In [348]:
# sort unclaimed by County_city
unclaimed_df = unclaimed_df.sort_values(by=['County_city'])
unclaimed_df.head()
Out[348]:
In [349]:
# sort unidentified by County_city
unidentified_df = unidentified_df.sort_values(by=['County_city'])
unidentified_df.head()
Out[349]:
In [350]:
final_summary_city_df.head()
Out[350]:
In [351]:
len(final_summary_city_df)
Out[351]:
In [352]:
final_summary_city_df["CITY_NAME"][0]
Out[352]:
In [353]:
## Create city array
city_array = []
## for each city in final_summary_city_df...
i = 0
while i < len(final_summary_city_df):
city_dict = {}
city_dict["type"] = "Feature"
city_dict["name"] = str(final_summary_city_df["CITY_NAME"][i])
city_dict["city_code"] = str(final_summary_city_df["COUNTYFP_CITY"][i])
city_dict["county_name"] = str(final_summary_city_df["COUNTY_NAME"][i])
city_dict["county_FIPS"] = str(final_summary_city_df["COUNTY_FIPS"][i])
city_dict["state_name"] = str(final_summary_city_df["STATE_NAME"][i])
city_dict["state_abbr"] = str(final_summary_city_df["STUSPS"][i])
city_dict["state_FIPS"] = str(final_summary_city_df["STATEFP"][i])
city_dict["properties"] = {}
missing_array = []
j = 0
while j < len(missing_df):
# check if county_city code matches
if missing_df['County_city'][j] == final_summary_city_df["COUNTYFP_CITY"][i]:
missing_dict = {}
for item in missing_header:
missing_dict[item] = str(missing_df[item][j])
# append dictionary to missing array
missing_array.append(missing_dict)
# increment j
j += 1
city_dict["properties"]['missing'] = missing_array
unclaimed_array = []
k = 0
while k < len(unclaimed_df):
# check if county_city code matches
if unclaimed_df['County_city'][k] == final_summary_city_df["COUNTYFP_CITY"][i]:
unclaimed_dict = {}
for item in unclaimed_header:
unclaimed_dict[item] = str(unclaimed_df[item][k])
# append dictionary to unclaimed array
unclaimed_array.append(unclaimed_dict)
# increment k
k += 1
city_dict["properties"]['unclaimed'] = unclaimed_array
unidentified_array = []
l = 0
while l < len(unidentified_df):
# check if county_city code matches
if unidentified_df['County_city'][l] == final_summary_city_df["COUNTYFP_CITY"][i]:
unidentified_dict = {}
for item in unidentified_header:
unidentified_dict[item] = str(unidentified_df[item][l])
# append dictionary to unidentified array
unidentified_array.append(unidentified_dict)
# increment l
l += 1
city_dict["properties"]['unidentified'] = unidentified_array
city_dict["properties"]['filtered'] = []
## set geometry
city_dict["geometry"] = {}
city_dict["geometry"]["type"] = "Point"
city_dict["geometry"]["coordinates"] = [final_summary_city_df['Lon_dd'][i], final_summary_city_df['Lat_dd'][i]]
## append city dictionary to array
city_array.append(city_dict)
# progress statement
print("added city", str(i+1), "of", len(final_summary_city_df))
# increment iterator
i += 1
# city_array[0]
In [354]:
# check item in array
len(city_array)
# city_array[55]['properties']['unclaimed']
Out[354]:
In [355]:
# Create FeatureCollection
city_feature_collection = {}
city_feature_collection["type"] = "FeatureCollection"
city_feature_collection["features"] = city_array
# city_feature_collection
In [356]:
# Convert FeatureCollection to JSON format
city_geojson = geojson.dumps(city_feature_collection)
# check type to make sure conversion was successful
print(type(city_geojson))
In [357]:
# Save the JSON-formatted FeatureCollection as a new .json file
with open('JSON/city_geojson.json', 'w', encoding='utf-8') as json_file:
json_file.write(city_geojson)
In [ ]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)
In [ ]:
# load state json
state_gpd = gpd.read_file('JSON/state_polygons.json')
In [ ]:
state_gpd.head()
In [ ]:
type(state_gpd['STATEFP'][0])
In [ ]:
state_gpd['STATEFP'] = state_gpd['STATEFP'].astype(int)
In [ ]:
type(state_gpd['STATEFP'][0])
In [ ]:
# convert to dictionary
state_polygons = state_gpd.to_dict(orient='records')
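The records produced by to_dict keep their geometry values as shapely objects; geojson.dumps can serialize those further below because shapely geometries expose the __geo_interface__ protocol (the standard Python geospatial interchange hook):
# e.g. a shapely geometry renders itself as a GeoJSON-style mapping
from shapely.geometry import Point
print(Point(-89.4, 43.07).__geo_interface__)
# {'type': 'Point', 'coordinates': (-89.4, 43.07)}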
In [ ]:
len(state_polygons)
In [ ]:
type(missing_df['State_FIPS'][0])
In [ ]:
type(state_polygons[0]["STATEFP"])
In [ ]:
state_polygons[0]
In [ ]:
## Create state array
state_poly_array = []
## for each state in state_polygons...
i = 0
while i < len(state_polygons):
state_poly_dict = {}
state_poly_dict["type"] = "Feature"
state_poly_dict["name"] = state_polygons[i]["NAME"]
state_poly_dict["name_abbr"] = state_polygons[i]["STUSPS"]
state_poly_dict["state_FIPS"] = str(state_polygons[i]["STATEFP"])
state_poly_dict["properties"] = {}
missing_array = []
j = 0
while j < len(missing_df):
# check if state_fips matches
if missing_df['State_FIPS'][j] == state_polygons[i]["STATEFP"]:
missing_dict = {}
for item in missing_header:
missing_dict[item] = str(missing_df[item][j])
# append dictionary to missing array
missing_array.append(missing_dict)
# increment j
j += 1
state_poly_dict["properties"]['missing'] = missing_array
unclaimed_array = []
k = 0
while k < len(unclaimed_df):
# check if state_fips matches
if unclaimed_df['State_FIPS'][k] == state_polygons[i]["STATEFP"]:
unclaimed_dict = {}
for item in unclaimed_header:
unclaimed_dict[item] = str(unclaimed_df[item][k])
# append dictionary to unclaimed array
unclaimed_array.append(unclaimed_dict)
# increment k
k += 1
state_poly_dict["properties"]['unclaimed'] = unclaimed_array
unidentified_array = []
l = 0
while l < len(unidentified_df):
# check if state_fips matches
if unidentified_df['State_FIPS'][l] == state_polygons[i]["STATEFP"]:
unidentified_dict = {}
for item in unidentified_header:
unidentified_dict[item] = str(unidentified_df[item][l])
# append dictionary to unidentified array
unidentified_array.append(unidentified_dict)
# increment l
l += 1
state_poly_dict["properties"]['unidentified'] = unidentified_array
state_poly_dict["properties"]['filtered'] = []
## set geometry
state_poly_dict["geometry"] = state_polygons[i]["geometry"]
## append state dictionary to array
state_poly_array.append(state_poly_dict)
# progress statement
print("added state", str(i+1), "of", len(state_polygons))
# increment iterator
i += 1
# state_poly_array
In [ ]:
# check item in array
# len(state_poly_array)
# state_poly_array[9]['properties']['unclaimed'][0]
state_poly_array[9]['geometry']
In [ ]:
# Create FeatureCollection
state_poly_feature_collection = {}
state_poly_feature_collection["type"] = "FeatureCollection"
state_poly_feature_collection["features"] = state_poly_array
# state_poly_feature_collection
In [ ]:
# Convert FeatureCollection to JSON format
state_poly_geojson = geojson.dumps(state_poly_feature_collection)
# check type to make sure conversion was successful
print(type(state_poly_geojson))
In [ ]:
# Save the JSON-formatted FeatureCollection as a new .json file
with open('JSON/state_poly_geojson.json', 'w', encoding='utf-8') as json_file:
json_file.write(state_poly_geojson)
In [ ]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)
In [ ]:
# load county json
county_gpd = gpd.read_file('JSON/county_polygons.json')
In [ ]:
county_gpd.head()
In [ ]:
county_gpd['STATEFP'] = county_gpd['STATEFP'].astype(int)
In [ ]:
county_gpd['GEOID'] = county_gpd['GEOID'].astype(int)
In [ ]:
# sort county polygons by county FIPS (GEOID)
county_gpd = county_gpd.sort_values(by=['GEOID'])
county_gpd.head()
In [ ]:
# convert to dictionary
county_polygons = county_gpd.to_dict(orient='records')
In [ ]:
len(county_polygons)
In [ ]:
county_polygons[0]
In [ ]:
missing_df.head()
In [ ]:
# sort dataframes by county fips
missing_df = missing_df.sort_values(by=['County_FIPS'])
unclaimed_df = unclaimed_df.sort_values(by=['County_FIPS'])
unidentified_df = unidentified_df.sort_values(by=['County_FIPS'])
In [ ]:
unclaimed_df.head()
In [ ]:
type(missing_df['County_FIPS'][0])
In [ ]:
type(county_polygons[0]["GEOID"])
In [ ]:
county_polygons[0]
In [ ]:
## Create county array
county_poly_array = []
## for each county in county_polygons...
i = 0
while i < len(county_polygons):
county_poly_dict = {}
county_poly_dict["type"] = "Feature"
county_poly_dict["name"] = county_polygons[i]["NAME"]
county_poly_dict["county_FIPS"] = str(county_polygons[i]["GEOID"])
county_poly_dict["state_name"] = county_polygons[i]["STATE_NAME"]
county_poly_dict["state_abbr"] = county_polygons[i]["STUSPS"]
county_poly_dict["state_FIPS"] = str(county_polygons[i]["STATEFP"])
county_poly_dict["properties"] = {}
missing_array = []
j = 0
while j < len(missing_df):
# check if county_fips matches
if missing_df['County_FIPS'][j] == county_polygons[i]["GEOID"]:
missing_dict = {}
for item in missing_header:
missing_dict[item] = str(missing_df[item][j])
# append dictionary to missing array
missing_array.append(missing_dict)
# increment j
j += 1
county_poly_dict["properties"]['missing'] = missing_array
unclaimed_array = []
k = 0
while k < len(unclaimed_df):
# check if county_fips matches
if unclaimed_df['County_FIPS'][k] == county_polygons[i]["GEOID"]:
unclaimed_dict = {}
for item in unclaimed_header:
unclaimed_dict[item] = str(unclaimed_df[item][k])
# append dictionary to unclaimed array
unclaimed_array.append(unclaimed_dict)
# increment k
k += 1
county_poly_dict["properties"]['unclaimed'] = unclaimed_array
unidentified_array = []
l = 0
while l < len(unidentified_df):
# check if county_fips matches
if unidentified_df['County_FIPS'][l] == county_polygons[i]["GEOID"]:
unidentified_dict = {}
for item in unidentified_header:
unidentified_dict[item] = str(unidentified_df[item][l])
# append dictionary to unidentified array
unidentified_array.append(unidentified_dict)
# increment l
l += 1
county_poly_dict["properties"]['unidentified'] = unidentified_array
county_poly_dict["properties"]['filtered'] = []
## set geometry
county_poly_dict["geometry"] = county_polygons[i]["geometry"]
## append county dictionary to array
county_poly_array.append(county_poly_dict)
# progress statement
print("added county", str(i+1), "of", len(county_polygons))
# increment iterator
i += 1
# county_poly_array
In [ ]:
# check item in array
len(county_poly_array)
# county_poly_array[3]
In [ ]:
# Create FeatureCollection
county_poly_feature_collection = {}
county_poly_feature_collection["type"] = "FeatureCollection"
county_poly_feature_collection["features"] = county_poly_array
# county_poly_feature_collection
In [ ]:
# Convert FeatureCollection to JSON format
county_poly_geojson = geojson.dumps(county_poly_feature_collection)
# check type to make sure conversion was successful
print(type(county_poly_geojson))
In [ ]:
# Save the JSON-formatted FeatureCollection as a new .json file
with open('JSON/county_poly_geojson.json', 'w', encoding='utf-8') as json_file:
json_file.write(county_poly_geojson)
In [ ]: