GPS Coordinates for all counties in the USA

This notebook prepares the population, county, and state data for the web application.
We also trim out states that are not on the US mainland, since their populations and donation contributions are comparatively small. We can add them back later if need be.

We need an exact match of state/county names between the population and counties datasets; any mismatch corrupts the Bokeh plots and makes them meaningless.

We also need GPS coordinates for plotting maps. The Bokeh sample data already includes these, so we pull what we need into a pkl file for later use.


In [1]:
import pandas as pd

In [2]:
donations = pd.read_pickle('out/0/donations.pkl')
states = pd.read_csv('in/state_table.csv')
states.rename(columns={'abbreviation': 'state', 'name': 'state_name'}, inplace=True)

In [3]:
from bokeh.sampledata import us_states, us_counties
import numpy as np

bokeh_counties = us_counties.data.copy()
bokeh_states = us_states.data.copy()


# Flatten the bokeh sample dicts into plain row lists
counties_list = list()
for county_id in bokeh_counties:
    counties_list.append(
        [bokeh_counties[county_id]['name'],
         bokeh_counties[county_id]['state'].upper(),
         bokeh_counties[county_id]['lats'],
         bokeh_counties[county_id]['lons']])

states_list = list()
for code in bokeh_states:
    states_list.append(
        [code,
         bokeh_states[code]['lats'],
         bokeh_states[code]['lons']])

counties_gps = pd.DataFrame(counties_list, columns=['county', 'state', 'lats', 'lons'])
states_gps = pd.DataFrame(states_list, columns=['state', 'lats', 'lons'])

counties_gps = counties_gps.drop_duplicates(['county', 'state'])

In [4]:
def normalizecounty(county):
    '''
    Return a normalized county name.
    Data sources differ in prefixes/suffixes (St., County, Parish, ...),
    punctuation, and spacing, so strip all of those and lowercase.
    '''
    return county\
        .replace('St.', '')\
        .replace('Ste.', '')\
        .replace('Saint', '')\
        .replace('County', '')\
        .replace('county', '')\
        .replace('Parish', '')\
        .replace('parish', '')\
        .replace('City', '')\
        .replace('city', '')\
        .replace('Borough', '')\
        .replace('borough', '')\
        .replace('.', '')\
        .replace(' ', '')\
        .lower()

pd.Series(['St. Lucie', 'Jefferson Parish', 'Anchorage Borough', 'King County', 'Roanoke city']).apply(normalizecounty)


Out[4]:
0        lucie
1    jefferson
2    anchorage
3         king
4      roanoke
dtype: object
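
A few extra hypothetical inputs, as a sanity check on the trickier patterns: 'Ste.' prefixes, 'City' suffixes, and hyphenated names all pass through as intended.


In [ ]:
# Not part of the original run; expected output: genevieve, carson, miami-dade
pd.Series(['Ste. Genevieve', 'Carson City', 'Miami-Dade']).apply(normalizecounty)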

In [5]:
# Some donations carry county names that don't match the GPS source as-is.
missing = donations[
    donations.county.isin(
        set(donations.county.unique())
            .difference(counties_gps.county.unique()))].county

# After normalization the two sources should agree on every county name.
missingafternormalization = donations[
    donations.county.apply(normalizecounty).isin(
        set(donations.county.apply(normalizecounty).unique())
            .difference(counties_gps.county.apply(normalizecounty).unique()))].county

len(missing), len(missingafternormalization)


Out[5]:
(2854, 0)

In [6]:
counties_gps['county_norm'] = counties_gps.county.apply(normalizecounty)
donations['county_norm'] = donations.county.apply(normalizecounty)

Peek at the state GPS data, then read in the Indian population for each county


In [7]:
states_gps.head()


Out[7]:
state lats lons
0 WA [46.29443, 46.26451, 46.31405, 46.34919, 46.38... [-124.03622, -124.16101, -124.15117, -124.1378...
1 DE [39.63895, 39.68084, 39.72204, 39.72221, 39.72... [-75.7878, -75.78909, -75.78861, -75.78861, -7...
2 DC [38.97872, 38.98378, 38.96493, 38.95822, 38.95... [-77.06276, -77.02561, -77.00141, -76.99288, -...
3 WI [42.49273, 42.49433, 42.49562, 42.49561, 42.49... [-87.8156, -87.93137, -88.10268, -88.20645, -8...
4 WV [40.18683, 40.39711, 40.54795, 40.61628, 40.63... [-80.67905, -80.62345, -80.64068, -80.57018, -...

In [8]:
# Get the census region name since we need it later for visualizations
statecols = ['census_region_name', 'state_name']

newcols = np.append(donations.columns.values, statecols)
donations = donations.merge(states, on='state', how='left')[newcols]

newcols = np.append(states_gps.columns.values, statecols)
states_gps = states_gps.merge(states, on='state', how='left')[newcols]

newcols = np.append(counties_gps.columns.values, statecols)
counties_gps = counties_gps.merge(states, on='state', how='left')[newcols]

In [9]:
pop2014 = pd.read_csv('in/IndianPopulation_2010_2014.csv', names=['county_state', 'population'], skiprows=1)
pop2010 = pd.read_csv('in/IndianPopulation_2006_2010.csv', names=['county_state', 'population'], skiprows=1)

# Split "County, State" into separate columns. Pipes are literal inside a
# character class, so use plain ranges; extract once per frame, not four times.
pattern = r"([a-zA-Z'\s]+), ([a-zA-Z\s]+)"
extracted2010 = pop2010.county_state.str.extract(pattern)
extracted2014 = pop2014.county_state.str.extract(pattern)
pop2010['county'], pop2010['state'] = extracted2010[0], extracted2010[1]
pop2014['county'], pop2014['state'] = extracted2014[0], extracted2014[1]

# Label each 5-year window by its midpoint year (2008 and 2012).
# 'Doña Ana County' loses its 'ñ' in the extraction; patch it back up.
pop2010 = pop2010.rename(columns={'population': 2008}).drop('county_state', axis=1).replace('a Ana County', 'Dona Ana')
pop2014 = pop2014.rename(columns={'population': 2012}).drop('county_state', axis=1).replace('a Ana County', 'Dona Ana')
pop2010['county_norm'] = pop2010.county.apply(normalizecounty)
pop2014['county_norm'] = pop2014.county.apply(normalizecounty)
population = pop2010.merge(pop2014, how='outer', on=['county_norm', 'state'])[['county_norm', 'state', 2008, 2012]]\
    .rename(columns={'state': 'state_name'})
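
For illustration, how the pattern splits one typical value (a made-up sample, assuming the usual 'County, State' layout):


In [ ]:
pd.Series(['King County, Washington']).str.extract(pattern)
# group 0 -> 'King County', group 1 -> 'Washington'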

In [10]:
population = population\
    .merge(states, how='left', on='state_name')[np.append(population.columns.values, 'state')]\
    .drop('state_name', axis=1)\
    .fillna(0)\
    .drop_duplicates(['county_norm','state'])

In [11]:
population.head()


Out[11]:
county_norm 2008 2012 state
0 autauga 37 0 AL
1 baldwin 87 97 AL
2 barbour 10 42 AL
3 bibb 12 0 AL
4 blount 77 0 AL

In [12]:
# It appears that the data obtained from the immigration has counties and cities for the same names.
# The amounts match and are the ones for the counties. So we can remove either of the two rows. It does not matter.
duplicates = population.groupby(['county_norm', 'state']).size()
duplicates[duplicates.values > 1]

dups = duplicates[duplicates.values > 1].to_frame().reset_index().drop(0, axis=1)
dup_tups = zip(dups.to_dict().values()[0].values(), dups.to_dict().values()[1].values())
duplicate_rows = population[population.apply(lambda x: (x.state, x.county_norm) in dup_tups, axis=1)]
print duplicate_rows.index.values
removed = duplicate_rows.index.values[::2]
print 'Indices to be removed: ', removed
print population.drop(removed, axis=0).shape, population.shape
population = population.drop(removed, axis=0)


[]
Indices to be removed:  []
(3137, 4) (3137, 4)

In [13]:
# Drop states and territories outside the US mainland
nonmainland = ['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS']
population = population.drop(population[population.state.isin(nonmainland)].index, axis=0)
donations = donations.drop(donations[donations.state.isin(nonmainland)].index, axis=0)
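
A quick added assertion (not in the original run) that the non-mainland rows are really gone:


In [ ]:
assert not population.state.isin(nonmainland).any()
assert not donations.state.isin(nonmainland).any()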

In [14]:
counties_gps = counties_gps[~(counties_gps.state_name.isin(['Hawaii', 'Alaska']))]

In [15]:
# Drop counties whose state did not match the states table
counties_gps = counties_gps[~(counties_gps.state_name.isnull())]

In [16]:
# Fix-ups: DC found no state_name match (fillna left a 0), and the
# population data uses 'Dade' for what bokeh calls 'Miami-Dade'.
population.loc[(population.state==0), 'state'] = 'DC'
population.loc[(population.state=='FL') & (population.county_norm=='dade'), 'county_norm'] = 'miami-dade'

In [17]:
# Linearly extrapolate from the two known years, clipping at zero;
# this assumes the 2008->2012 trend continues in both directions.
population[2016] = (2*population[2012] - population[2008]).apply(lambda x: max(0, x))
population[2004] = (2*population[2008] - population[2012]).apply(lambda x: max(0, x))
population[2000] = (2*population[2004] - population[2008]).apply(lambda x: max(0, x))
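
As a spot check on the arithmetic, using the King County, WA figures that appear in Out[21] below (2008=27515, 2012=39594):


In [ ]:
assert 2*39594 - 27515 == 51673   # extrapolated 2016
assert 2*27515 - 39594 == 15436   # extrapolated 2004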

In [18]:
# Add the remaining years as NaN placeholders so they can be interpolated
for yr in range(2000, 2017):
    if yr not in population.columns:
        population[yr] = np.nan

In [19]:
# Re-index and sort the column names so we can interpolate
population = population.set_index(['state', 'county_norm'])

population = population[np.sort(population.columns.values)]

In [20]:
# Transpose so years run down the index, fill the NaN years by linear
# interpolation, then transpose back and cast to int
population = population.T.interpolate(method='linear', limit_direction='backward').T.astype('int')
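
For intuition, a minimal sketch of the transpose-and-interpolate step on toy numbers (not from the real data):


In [ ]:
toy = pd.DataFrame([[10.0, np.nan, 30.0]], columns=[2008, 2010, 2012])
toy.T.interpolate(method='linear', limit_direction='backward').T
# the NaN for 2010 becomes 20.0, midway between its neighbouring years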

In [21]:
population[population.index.get_level_values('county_norm').isin(['king'])]


Out[21]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
state county_norm
TX king 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
WA king 3357 6376 9396 12416 15436 18455 21475 24495 27515 30534 33554 36574 39594 42613 45633 48653 51673

In [22]:
# 2000 was only needed as an interpolation anchor; drop it and flatten the index
population = population.drop(2000, axis=1)
population = population.reset_index()

In [23]:
counties_gps = counties_gps.sort_values(by=['state', 'county_norm'])
population = population.sort_values(by=['state', 'county_norm'])

In [24]:
# Assign matching positional ids; valid only because both frames hold exactly
# the same (state, county_norm) pairs in the same order (verified below)
counties_gps['county_id'] = range(len(counties_gps))
population['county_id'] = range(len(population))

In [25]:
donations = donations.merge(counties_gps[['county_id','county_norm','state']],
                            on=['county_norm','state'])
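
The merge above assumes (county_norm, state) is unique in counties_gps; a duplicate key would silently duplicate donation rows. An added check:


In [ ]:
assert not counties_gps.duplicated(['county_norm', 'state']).any()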

In [26]:
counties_gps.shape, population.shape


Out[26]:
((3102, 8), (3102, 19))

Validation step


In [27]:
population[(population.state=='WA') & (population.county_norm=='king')]


Out[27]:
state county_norm 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 county_id
2929 WA king 6376 9396 12416 15436 18455 21475 24495 27515 30534 33554 36574 39594 42613 45633 48653 51673 2929

In [28]:
counties_gps[(counties_gps.state=='WA') & (counties_gps.county_norm=='king')]


Out[28]:
county state lats lons county_norm census_region_name state_name county_id
863 King WA [47.25792, 47.25792, 47.30373, 47.40299, 47.40... [-122.33363, -122.33491, -122.39325, -122.5367... king West Washington 2929

In [29]:
donations[(donations.county_norm=='king') & (donations.state=='WA') & (donations.activity_year==2014)].amount.sum()


Out[29]:
455897L

In [30]:
x1 = counties_gps.reset_index()[['county_id', 'county_norm','state']]
x2 = population.reset_index()[['county_id', 'county_norm','state']]
print x1.shape, x2.shape


(3102, 3) (3102, 3)

In [31]:
(x1 != x2).any()


Out[31]:
county_id      False
county_norm    False
state          False
dtype: bool

In [32]:
diff = pd.concat([x1.groupby(['state','county_norm']).size(), x2.groupby(['state','county_norm']).size()], axis=1).reset_index()
diff[diff[0] != diff[1]]


Out[32]:
state county_norm 0 1

In [33]:
# Finally, save the data to pickle files
!mkdir -p out/11

states_gps.to_pickle('out/11/states_gps.pkl')
counties_gps.to_pickle('out/11/counties_gps.pkl')
population.to_pickle('out/11/indian_population.pkl')
donations.to_pickle('out/11/donations.pkl')
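
An optional round-trip check (not in the original run): re-read one pickle and confirm it comes back with the expected shape:


In [ ]:
pd.read_pickle('out/11/counties_gps.pkl').shape   # expect (3102, 8), per Out[26]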

In [34]:
population.head()


Out[34]:
state county_norm 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 county_id
0 AL autauga 101 92 83 74 64 55 46 37 27 18 9 0 0 0 0 0 0
1 AL baldwin 69 72 74 77 79 82 84 87 89 92 94 97 99 102 104 107 1
2 AL barbour 0 0 0 0 2 5 7 10 18 26 34 42 50 58 66 74 2
3 AL bibb 33 30 27 24 21 18 15 12 9 6 3 0 0 0 0 0 3
4 AL blount 211 192 173 154 134 115 96 77 57 38 19 0 0 0 0 0 4
