This notebook is to prepare the population, counties, state data for the web application.
We will also trim out states that are not in the US mainland, given the smaller population and contribution coming from there. We can add them back later if need be.
We want to have an exact match of state/county in the population and the counties dataset. Otherwise, that messes up the bokeh plots (making them meaningless).
We need the GPS coordinates for plotting maps. The Bokeh sample data has these already.
So let's just pull in what we need into a pkl file for later use.
In [1]:
import pandas as pd
In [2]:
# Load the cleaned donations produced by notebook 0 and the state lookup table.
donations = pd.read_pickle('out/0/donations.pkl')
states = pd.read_csv('in/state_table.csv')
# Align the lookup table's column names with the rest of the pipeline.
states = states.rename(columns={'abbreviation': 'state', 'name': 'state_name'})
In [3]:
from bokeh.sampledata import us_states, us_counties
from bokeh.plotting import figure, show, output_file
import pandas as pd
import numpy as np
# Flatten the Bokeh sample geometries into DataFrames keyed by name/state.
bokeh_counties = us_counties.data.copy()
bokeh_states = us_states.data.copy()

# .values()/.items() iterate in the same order as the key loop the
# original used, so row order (and drop_duplicates 'first') is preserved.
county_rows = [
    [shape['name'], shape['state'].upper(), shape['lats'], shape['lons']]
    for shape in bokeh_counties.values()
]
state_rows = [
    [code, shape['lats'], shape['lons']]
    for code, shape in bokeh_states.items()
]

counties_gps = pd.DataFrame(county_rows, columns=['county', 'state', 'lats', 'lons'])
states_gps = pd.DataFrame(state_rows, columns=['state', 'lats', 'lons'])
# Some counties appear more than once in the sample data; keep the first polygon.
counties_gps = counties_gps.drop_duplicates(['county', 'state'])
In [4]:
def normalizecounty(county):
    '''
    Return a normalized county name.

    The data sources disagree on prefixes/suffixes such as "St.",
    "Saint", "County", "Parish", "city" and "Borough", and on spacing
    and punctuation.  Strip those markers, drop periods and spaces,
    and lowercase so names from different sources can be joined.
    '''
    # Replacements run in order on the progressively stripped string,
    # exactly matching the original chained .replace() calls.
    markers = ['St.', 'Ste.', 'Saint', 'County', 'county', 'Parish',
               'parish', 'City', 'city', 'Borough', 'borough', '.', ' ']
    for marker in markers:
        county = county.replace(marker, '')
    return county.lower()

pd.Series(['St. Lucie', 'Jefferson Parish', 'Anchorage Borough', 'King County', 'Raonoke city']).apply(normalizecounty)
Out[4]:
In [5]:
# We have donations whose county names don't match any Bokeh county name.
unmatched = set(donations.county.unique())\
    .difference(counties_gps.county.unique())
missing = donations[donations.county.isin(unmatched)].county
# After normalizing both sides, the mismatches should disappear.
# Bug fix: the original filtered the RAW donations.county against a set
# of NORMALIZED names, which made the "after" count vacuous.  Compare
# normalized to normalized.
donations_norm = donations.county.apply(normalizecounty)
unmatched_norm = set(donations_norm.unique())\
    .difference(counties_gps.county.apply(normalizecounty).unique())
missingafternormalization = donations[donations_norm.isin(unmatched_norm)].county
len(missing), len(missingafternormalization)
Out[5]:
In [6]:
# Persist the normalized county names so later merges can join on them.
for frame in (counties_gps, donations):
    frame['county_norm'] = frame.county.apply(normalizecounty)
In [7]:
# Quick sanity check of the state geometry frame.
states_gps.head()
Out[7]:
In [8]:
# Get the census region name since we need it later for vizualizations
statecols = ['census_region_name', 'state_name']
newcols= np.append(donations.columns.values, statecols)
donations = donations.merge(states, on='state', how='left')[newcols]
newcols= np.append(states_gps.columns.values, statecols)
states_gps = states_gps.merge(states, on='state', how='left')[newcols]
newcols= np.append(counties_gps.columns.values, statecols)
counties_gps = counties_gps.merge(states, on='state', how='left')[newcols]
In [9]:
# Indian-origin population per county from two ACS 5-year windows.
# NOTE: the character classes contain a literal '|' (harmless here) and
# only match ASCII letters, which is why "Dona Ana" needs the manual
# fix-up below ("Doña Ana County" loses everything up to the 'ñ').
POP_PATTERN = '([a-z|A-Z|\'|\s]+), ([a-z|A-Z|\s]+)'
pop2014 = pd.read_csv('in/IndianPopulation_2010_2014.csv', names=['county_state', 'population'], skiprows=1)
pop2010 = pd.read_csv('in/IndianPopulation_2006_2010.csv', names=['county_state', 'population'], skiprows=1)
# Run the regex extraction once per frame instead of twice (DRY + faster).
parts2010 = pop2010.county_state.str.extract(POP_PATTERN)
parts2014 = pop2014.county_state.str.extract(POP_PATTERN)
pop2010['county'] = parts2010[0]
pop2014['county'] = parts2014[0]
pop2010['state'] = parts2010[1]
pop2014['state'] = parts2014[1]
# Label each window by its midpoint year; restore the mangled Dona Ana name.
pop2010 = pop2010.rename(columns={'population': 2008}).drop('county_state', axis=1).replace('a Ana County', 'Dona Ana')
pop2014 = pop2014.rename(columns={'population': 2012}).drop('county_state', axis=1).replace('a Ana County', 'Dona Ana')
pop2010['county_norm'] = pop2010.county.apply(normalizecounty)
pop2014['county_norm'] = pop2014.county.apply(normalizecounty)
population = pop2010.merge(pop2014, how='outer', on=['county_norm', 'state'])[['county_norm', 'state', 2008, 2012]]\
    .rename(columns={'state': 'state_name'})
In [10]:
# Swap the full state name for the two-letter code and de-duplicate.
keep_cols = np.append(population.columns.values, 'state')
merged = population.merge(states, how='left', on='state_name')[keep_cols]
merged = merged.drop('state_name', axis=1).fillna(0)
population = merged.drop_duplicates(['county_norm', 'state'])
In [11]:
# Quick sanity check of the merged population frame.
population.head()
Out[11]:
In [12]:
# It appears that the data obtained from the immigration has counties and cities for the same names.
# The amounts match and are the ones for the counties. So we can remove either of the two rows. It does not matter.
duplicates = population.groupby(['county_norm', 'state']).size()
duplicates[duplicates.values > 1]
dups = duplicates[duplicates.values > 1].to_frame().reset_index().drop(0, axis=1)
dup_tups = zip(dups.to_dict().values()[0].values(), dups.to_dict().values()[1].values())
duplicate_rows = population[population.apply(lambda x: (x.state, x.county_norm) in dup_tups, axis=1)]
print duplicate_rows.index.values
removed = duplicate_rows.index.values[::2]
print 'Indices to be removed: ', removed
print population.drop(removed, axis=0).shape, population.shape
population = population.drop(removed, axis=0)
In [13]:
# Keep only mainland states; territories and AK/HI can be added back later.
non_mainland = ['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS']
population = population[~population.state.isin(non_mainland)]
donations = donations[~donations.state.isin(non_mainland)]
In [14]:
# Keep only mainland county geometries.
# NOTE(review): this statement was duplicated verbatim in the original;
# the second copy was a no-op and has been removed (perhaps it was
# meant to filter states_gps instead — confirm).
counties_gps = counties_gps[~(counties_gps.state_name.isin(['Hawaii', 'Alaska']))]
In [15]:
# The Bokeh sample data includes territories absent from the state lookup
# table; after the left merge those rows have a null state_name — drop them.
counties_gps = counties_gps[counties_gps.state_name.notnull()]
In [16]:
# Manual fix-ups: DC has no row in the state table (its state code came
# back as the fillna 0), and the population data still uses Miami-Dade's
# old "Dade" name.
is_dc = population.state == 0
population.loc[is_dc, 'state'] = 'DC'
is_old_dade = (population.state == 'FL') & (population.county_norm == 'dade')
population.loc[is_old_dade, 'county_norm'] = 'miami-dade'
In [17]:
# Linearly extrapolate the census-window midpoints out to 2016 and back
# to 2000, clamping at zero (population cannot go negative).
# clip(lower=0) is the vectorized equivalent of apply(lambda x: max(0, x));
# identical here since fillna(0) above guarantees no NaNs.
population[2016] = (2*population[2012] - population[2008]).clip(lower=0)
population[2004] = (2*population[2008] - population[2012]).clip(lower=0)
population[2000] = (2*population[2004] - population[2008]).clip(lower=0)
In [18]:
# Ensure every year 2000-2016 exists as a column before interpolating.
missing_years = [yr for yr in range(2000, 2017) if yr not in population.columns]
for yr in missing_years:
    population[yr] = np.nan
In [19]:
# Re-index on (state, county_norm) and sort the year columns so we can
# interpolate along each row.
# Bug fix: the original had a stray trailing backslash after
# set_index(...), which glued the next assignment onto the same logical
# line and produced a SyntaxError.
population = population.set_index(['state', 'county_norm'])
population = population[np.sort(population.columns.values)]
In [20]:
# Interpolate the missing years along each county's row (transpose so the
# years form the index), back-filling toward earlier years, then floor to int.
year_major = population.T.interpolate(method='linear', limit_direction='backward')
population = year_major.T.astype('int')
In [21]:
# Spot-check: every county named 'king' should now have a full year range.
population[population.index.get_level_values('county_norm').isin(['king'])]
Out[21]:
In [22]:
# 2000 only existed to seed the backward interpolation; drop it and
# flatten the (state, county_norm) index back into columns.
population = population.drop(2000, axis=1).reset_index()
In [23]:
# Give both frames the same row order so the ids assigned next line up.
sort_keys = ['state', 'county_norm']
counties_gps = counties_gps.sort_values(by=sort_keys)
population = population.sort_values(by=sort_keys)
In [24]:
# Positional county ids; these only correspond across frames because both
# were just sorted on the same keys.
counties_gps['county_id'] = np.arange(len(counties_gps))
population['county_id'] = np.arange(len(population))
In [25]:
# Attach each donation's county_id via (county_norm, state).
# NOTE(review): merge defaults to how='inner', so donations whose county
# has no GPS entry are silently dropped here — confirm that is intended.
donations = donations.merge(counties_gps[['county_id','county_norm','state']],
on=['county_norm','state'])
In [26]:
# The two frames must have the same length for the county ids to correspond.
counties_gps.shape, population.shape
Out[26]:
In [27]:
# Spot-check one well-known county in the population frame.
population[(population.state=='WA') & (population.county_norm=='king')]
Out[27]:
In [28]:
# ... and the same county in the geometry frame; ids should match above.
counties_gps[(counties_gps.state=='WA') & (counties_gps.county_norm=='king')]
Out[28]:
In [29]:
# Sanity check: total 2014 donation amount for King County, WA.
donations[(donations.county_norm=='king') & (donations.state=='WA') & (donations.activity_year==2014)].amount.sum()
Out[29]:
In [30]:
# Build comparable id/key slices of both frames for row-by-row comparison.
x1 = counties_gps.reset_index()[['county_id', 'county_norm','state']]
x2 = population.reset_index()[['county_id', 'county_norm','state']]
print x1.shape, x2.shape
In [31]:
# Expect False for every column if the two frames line up exactly.
(x1 != x2).any()
Out[31]:
In [32]:
# Any (state, county) appearing a different number of times in the two
# frames shows up here; expect an empty result.
diff = pd.concat([x1.groupby(['state','county_norm']).size(), x2.groupby(['state','county_norm']).size()], axis=1).reset_index()
diff[diff[0] != diff[1]]
Out[32]:
In [33]:
# Finally save the data to a pickle file.
# These four pickles are the inputs the web application loads.
!mkdir -p out/11
states_gps.to_pickle('out/11/states_gps.pkl')
counties_gps.to_pickle('out/11/counties_gps.pkl')
population.to_pickle('out/11/indian_population.pkl')
donations.to_pickle('out/11/donations.pkl')
In [34]:
# Final look at the saved population frame.
population.head()
Out[34]:
In [ ]: