This notebook is to prepare the population, counties, state data for the web application.
We will also trim out states that are not in the US mainland, given the smaller population and contribution coming from there. We can add them back later if need be.
We want to have an exact match of state/county in the population and the counties dataset. Otherwise, that messes up the bokeh plots (making them meaningless).
We need the GPS coordinates for plotting maps. The Bokeh sample data has these already.
So let's just pull in what we need into a pkl file for later use.
In [1]:
import pandas as pd
In [2]:
# Load the cleaned donations produced by notebook 0 and the state lookup table.
donations = pd.read_pickle('out/0/donations.pkl')
states = pd.read_csv('in/state_table.csv')
# Align the lookup table's column names with the rest of the pipeline.
states = states.rename(columns={'abbreviation': 'state', 'name': 'state_name'})
In [3]:
from bokeh.sampledata import us_states, us_counties
from bokeh.plotting import figure, show, output_file
import pandas as pd
import numpy as np
# Flatten the Bokeh sample geometries into DataFrames keyed by name/state.
bokeh_counties = us_counties.data.copy()
bokeh_states = us_states.data.copy()

# .values()/.items() iterate in the same order as the key loop the
# original used, so row order (and drop_duplicates 'first') is preserved.
county_rows = [
    [shape['name'], shape['state'].upper(), shape['lats'], shape['lons']]
    for shape in bokeh_counties.values()
]
state_rows = [
    [code, shape['lats'], shape['lons']]
    for code, shape in bokeh_states.items()
]

counties_gps = pd.DataFrame(county_rows, columns=['county', 'state', 'lats', 'lons'])
states_gps = pd.DataFrame(state_rows, columns=['state', 'lats', 'lons'])
# Some counties appear more than once in the sample data; keep the first polygon.
counties_gps = counties_gps.drop_duplicates(['county', 'state'])
In [4]:
def normalizecounty(county):
    '''
    Return a normalized county name.

    The data sources disagree on prefixes/suffixes such as "St.",
    "Saint", "County", "Parish", "city" and "Borough", and on spacing
    and punctuation.  Strip those markers, drop periods and spaces,
    and lowercase so names from different sources can be joined.
    '''
    # Replacements run in order on the progressively stripped string,
    # exactly matching the original chained .replace() calls.
    markers = ['St.', 'Ste.', 'Saint', 'County', 'county', 'Parish',
               'parish', 'City', 'city', 'Borough', 'borough', '.', ' ']
    for marker in markers:
        county = county.replace(marker, '')
    return county.lower()

pd.Series(['St. Lucie', 'Jefferson Parish', 'Anchorage Borough', 'King County', 'Raonoke city']).apply(normalizecounty)
Out[4]:
In [5]:
# We have donations whose county names don't match any Bokeh county name.
unmatched = set(donations.county.unique())\
    .difference(counties_gps.county.unique())
missing = donations[donations.county.isin(unmatched)].county
# After normalizing both sides, the mismatches should disappear.
# Bug fix: the original filtered the RAW donations.county against a set
# of NORMALIZED names, which made the "after" count vacuous.  Compare
# normalized to normalized.
donations_norm = donations.county.apply(normalizecounty)
unmatched_norm = set(donations_norm.unique())\
    .difference(counties_gps.county.apply(normalizecounty).unique())
missingafternormalization = donations[donations_norm.isin(unmatched_norm)].county
len(missing), len(missingafternormalization)
Out[5]:
In [6]:
# Persist the normalized county names so later merges can join on them.
for frame in (counties_gps, donations):
    frame['county_norm'] = frame.county.apply(normalizecounty)
In [7]:
# Quick sanity check of the state geometry frame.
states_gps.head()
Out[7]:
In [8]:
# Get the census region name since we need it later for vizualizations
statecols = ['census_region_name', 'state_name']
newcols= np.append(donations.columns.values, statecols)
donations = donations.merge(states, on='state', how='left')[newcols]
newcols= np.append(states_gps.columns.values, statecols)
states_gps = states_gps.merge(states, on='state', how='left')[newcols]
newcols= np.append(counties_gps.columns.values, statecols)
counties_gps = counties_gps.merge(states, on='state', how='left')[newcols]
In [9]:
# Indian-origin population per county from two ACS 5-year windows.
# NOTE: the character classes contain a literal '|' (harmless here) and
# only match ASCII letters, which is why "Dona Ana" needs the manual
# fix-up below ("Doña Ana County" loses everything up to the 'ñ').
POP_PATTERN = '([a-z|A-Z|\'|\s]+), ([a-z|A-Z|\s]+)'
pop2014 = pd.read_csv('in/IndianPopulation_2010_2014.csv', names=['county_state', 'population'], skiprows=1)
pop2010 = pd.read_csv('in/IndianPopulation_2006_2010.csv', names=['county_state', 'population'], skiprows=1)
# Run the regex extraction once per frame instead of twice (DRY + faster).
parts2010 = pop2010.county_state.str.extract(POP_PATTERN)
parts2014 = pop2014.county_state.str.extract(POP_PATTERN)
pop2010['county'] = parts2010[0]
pop2014['county'] = parts2014[0]
pop2010['state'] = parts2010[1]
pop2014['state'] = parts2014[1]
# Label each window by its midpoint year; restore the mangled Dona Ana name.
pop2010 = pop2010.rename(columns={'population': 2008}).drop('county_state', axis=1).replace('a Ana County', 'Dona Ana')
pop2014 = pop2014.rename(columns={'population': 2012}).drop('county_state', axis=1).replace('a Ana County', 'Dona Ana')
pop2010['county_norm'] = pop2010.county.apply(normalizecounty)
pop2014['county_norm'] = pop2014.county.apply(normalizecounty)
population = pop2010.merge(pop2014, how='outer', on=['county_norm', 'state'])[['county_norm', 'state', 2008, 2012]]\
    .rename(columns={'state': 'state_name'})
In [10]:
# Swap the full state name for the two-letter code and de-duplicate.
keep_cols = np.append(population.columns.values, 'state')
merged = population.merge(states, how='left', on='state_name')[keep_cols]
merged = merged.drop('state_name', axis=1).fillna(0)
population = merged.drop_duplicates(['county_norm', 'state'])
In [11]:
# Quick sanity check of the merged population frame.
population.head()
Out[11]:
In [12]:
# It appears that the data obtained from the immigration has counties and cities for the same names.
# The amounts match and are the ones for the counties. So we can remove either of the two rows. It does not matter.
duplicates = population.groupby(['county_norm', 'state']).size()
duplicates[duplicates.values > 1]
dups = duplicates[duplicates.values > 1].to_frame().reset_index().drop(0, axis=1)
dup_tups = zip(dups.to_dict().values()[0].values(), dups.to_dict().values()[1].values())
duplicate_rows = population[population.apply(lambda x: (x.state, x.county_norm) in dup_tups, axis=1)]
print duplicate_rows.index.values
removed = duplicate_rows.index.values[::2]
print 'Indices to be removed: ', removed
print population.drop(removed, axis=0).shape, population.shape
population = population.drop(removed, axis=0)
In [13]:
# Keep only mainland states; territories and AK/HI can be added back later.
non_mainland = ['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS']
population = population[~population.state.isin(non_mainland)]
donations = donations[~donations.state.isin(non_mainland)]
In [14]:
# Keep only mainland county geometries.
# NOTE(review): this statement was duplicated verbatim in the original;
# the second copy was a no-op and has been removed (perhaps it was
# meant to filter states_gps instead — confirm).
counties_gps = counties_gps[~(counties_gps.state_name.isin(['Hawaii', 'Alaska']))]
In [15]:
# The Bokeh sample data includes territories absent from the state lookup
# table; after the left merge those rows have a null state_name — drop them.
counties_gps = counties_gps[counties_gps.state_name.notnull()]
In [16]:
# Manual fix-ups: DC has no row in the state table (its state code came
# back as the fillna 0), and the population data still uses Miami-Dade's
# old "Dade" name.
is_dc = population.state == 0
population.loc[is_dc, 'state'] = 'DC'
is_old_dade = (population.state == 'FL') & (population.county_norm == 'dade')
population.loc[is_old_dade, 'county_norm'] = 'miami-dade'
In [17]:
# Linearly extrapolate the census-window midpoints out to 2016 and back
# to 2000, clamping at zero (population cannot go negative).
# clip(lower=0) is the vectorized equivalent of apply(lambda x: max(0, x));
# identical here since fillna(0) above guarantees no NaNs.
population[2016] = (2*population[2012] - population[2008]).clip(lower=0)
population[2004] = (2*population[2008] - population[2012]).clip(lower=0)
population[2000] = (2*population[2004] - population[2008]).clip(lower=0)
In [18]:
# Ensure every year 2000-2016 exists as a column before interpolating.
missing_years = [yr for yr in range(2000, 2017) if yr not in population.columns]
for yr in missing_years:
    population[yr] = np.nan
In [19]:
# Re-index on (state, county_norm) and sort the year columns so we can
# interpolate along each row.
# Bug fix: the original had a stray trailing backslash after
# set_index(...), which glued the next assignment onto the same logical
# line and produced a SyntaxError.
population = population.set_index(['state', 'county_norm'])
population = population[np.sort(population.columns.values)]
In [20]:
# Interpolate the missing years along each county's row (transpose so the
# years form the index), back-filling toward earlier years, then floor to int.
year_major = population.T.interpolate(method='linear', limit_direction='backward')
population = year_major.T.astype('int')
In [21]:
# Spot-check: every county named 'king' should now have a full year range.
population[population.index.get_level_values('county_norm').isin(['king'])]
Out[21]:
In [22]:
# 2000 only existed to seed the backward interpolation; drop it and
# flatten the (state, county_norm) index back into columns.
population = population.drop(2000, axis=1).reset_index()
In [23]:
# Give both frames the same row order so the ids assigned next line up.
sort_keys = ['state', 'county_norm']
counties_gps = counties_gps.sort_values(by=sort_keys)
population = population.sort_values(by=sort_keys)
In [24]:
# Positional county ids; these only correspond across frames because both
# were just sorted on the same keys.
counties_gps['county_id'] = np.arange(len(counties_gps))
population['county_id'] = np.arange(len(population))
In [25]:
# Attach each donation's county_id via (county_norm, state).
# NOTE(review): merge defaults to how='inner', so donations whose county
# has no GPS entry are silently dropped here — confirm that is intended.
donations = donations.merge(counties_gps[['county_id','county_norm','state']],
on=['county_norm','state'])
In [26]:
# The two frames must have the same length for the county ids to correspond.
counties_gps.shape, population.shape
Out[26]:
In [27]:
# Spot-check one well-known county in the population frame.
population[(population.state=='WA') & (population.county_norm=='king')]
Out[27]:
In [28]:
# ... and the same county in the geometry frame; ids should match above.
counties_gps[(counties_gps.state=='WA') & (counties_gps.county_norm=='king')]
Out[28]:
In [29]:
# Sanity check: total 2014 donation amount for King County, WA.
donations[(donations.county_norm=='king') & (donations.state=='WA') & (donations.activity_year==2014)].amount.sum()
Out[29]:
In [30]:
# Build comparable id/key slices of both frames for row-by-row comparison.
x1 = counties_gps.reset_index()[['county_id', 'county_norm','state']]
x2 = population.reset_index()[['county_id', 'county_norm','state']]
print x1.shape, x2.shape
In [31]:
# Expect False for every column if the two frames line up exactly.
(x1 != x2).any()
Out[31]:
In [32]:
# Any (state, county) appearing a different number of times in the two
# frames shows up here; expect an empty result.
diff = pd.concat([x1.groupby(['state','county_norm']).size(), x2.groupby(['state','county_norm']).size()], axis=1).reset_index()
diff[diff[0] != diff[1]]
Out[32]:
In [33]:
# Finally save the data to a pickle file.
# These four pickles are the inputs the web application loads.
!mkdir -p out/11
states_gps.to_pickle('out/11/states_gps.pkl')
counties_gps.to_pickle('out/11/counties_gps.pkl')
population.to_pickle('out/11/indian_population.pkl')
donations.to_pickle('out/11/donations.pkl')
In [34]:
# Final look at the saved population frame.
population.head()
Out[34]:
In [ ]: