In [1]:
from __future__ import division, print_function
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import re
import six
In [3]:
from IPython.display import display
Electoral data is by county, census data is by PUMA.
Define regions that are connected components of PUMAs and counties. We'll do it for 2000 and 2010 geographies, as well as merged ones (for multi-year ACS files that use both geographies).
The county-to-PUMA crosswalk files loaded below were produced by the MABLE Geocorr tool, choosing County as the source geography and either PUMA (2012) or PUMA (2000, which is used in ACS data through vintage 2011) as the target geography.
In [4]:
# `data(name)` substitutes `name` into the template, giving a path under ../pummeler/data/.
data = '../pummeler/data/{}'.format
In [5]:
# Load the 2000- and 2010-vintage county-to-PUMA crosswalks with identical options.
# skiprows=[1] drops the line immediately after the header row, and `county` is
# read as a string rather than being parsed as a number.
county_to_puma00, county_to_puma10 = (
    pd.read_csv(data(crosswalk_file), compression='gzip', skiprows=[1], dtype={'county': 'str'})
    for crosswalk_file in ('county-to-puma00.csv.gz', 'county-to-puma10.csv.gz')
)
In [6]:
# puma00 of 77777 is combo of 01801, 01802, and 01905 in LA
# make sure they're in the same CC...
# Copy the Louisiana rows for the three constituent PUMAs and relabel the copies
# as 77777, so every county touching one of them is also linked to 77777.
sub = county_to_puma00[(county_to_puma00.stab == 'LA')
                       & ((county_to_puma00.puma2k == 1801)
                          | (county_to_puma00.puma2k == 1802)
                          | (county_to_puma00.puma2k == 1905))].copy()
sub.puma2k = 77777
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# and is available in every pandas version.
county_to_puma00 = pd.concat([county_to_puma00, sub], ignore_index=True)
In [7]:
from itertools import count
def get_CCs(pairs):
    """Group the edges of a bipartite graph into connected components.

    Parameters
    ----------
    pairs : iterable of (A, B)
        Edges linking a hashable "A-side" node to a hashable "B-side" node.
        The two sides are tracked in separate tables, so equal values on
        different sides are treated as different nodes.

    Returns
    -------
    list of (set, set)
        One ``(A_nodes, B_nodes)`` tuple per connected component, in order of
        increasing internal component label. An empty input gives ``[]``.
    """
    A_cc = {}  # A-side node -> component label
    B_cc = {}  # B-side node -> component label
    # Keep the counter object and advance it with next(); the bound
    # ``count().next`` method only exists on Python 2.
    labels = count()
    for A, B in pairs:
        A_id = A_cc.get(A)
        B_id = B_cc.get(B)
        if A_id is None:
            if B_id is None:
                # Neither endpoint seen before: start a new component.
                A_cc[A] = B_cc[B] = next(labels)
            else:
                A_cc[A] = B_id
        elif B_id is None:
            B_cc[B] = A_id
        elif A_id != B_id:
            # The edge joins two existing components: relabel everything in
            # B's component with A's label. Only values of existing keys are
            # reassigned, so the dicts never change size while being iterated.
            for k, v in A_cc.items():
                if v == B_id:
                    A_cc[k] = A_id
            for k, v in B_cc.items():
                if v == B_id:
                    B_cc[k] = A_id
    # The next unused label equals the number of labels handed out so far.
    n_labels = next(labels)
    ccs = [(set(), set()) for _ in range(n_labels)]
    for k, v in A_cc.items():
        ccs[v][0].add(k)
    for k, v in B_cc.items():
        ccs[v][1].add(k)
    # Labels that were merged away leave empty slots; drop them.
    return [(As, Bs) for As, Bs in ccs if As or Bs]
In [8]:
# Connected components for the 2000 geography: each crosswalk row links a
# county to a (state, PUMA) pair.
ccs00_orig = get_CCs(
    (rec['county'], (rec['state'], rec['puma2k']))
    for _idx, rec in county_to_puma00.iterrows()
)
In [9]:
# Connected components for the 2010 geography: each crosswalk row links a
# county to a (state, PUMA) pair.
ccs10_orig = get_CCs(
    (rec['county'], (rec['state'], rec['puma12']))
    for _idx, rec in county_to_puma10.iterrows()
)
In [10]:
# Number of connected components found in the 2000 and 2010 crosswalks.
len(ccs00_orig), len(ccs10_orig)
Out[10]:
In [11]:
# Alaska's electoral districts are different from their counties.
# Too much work to do it, so just pretend Alaska was one CC all along.
def kill_alaska(ccs, state_code=2):
    """Collapse every component that touches one state into a single component.

    Parameters
    ----------
    ccs : iterable of (set, set)
        ``(counties, state_pumas)`` components, where ``state_pumas`` holds
        ``(state, puma)`` tuples.
    state_code : int, optional
        State code whose components are merged. The default, 2, is the code
        this notebook treats as Alaska.

    Returns
    -------
    list of (set, set)
        The merged component first (only if at least one component matched),
        followed by the untouched components in their original order. The
        input sets are not modified.
    """
    merged_counties = set()
    merged_pumas = set()
    untouched = []
    # Single pass, so `ccs` may be any iterable, including a generator.
    for component in ccs:
        counties, state_pumas = component
        if any(state == state_code for state, _puma in state_pumas):
            merged_counties |= counties
            merged_pumas |= state_pumas
        else:
            untouched.append(component)
    if not (merged_counties or merged_pumas):
        # Nothing matched: do not emit an empty component, since code that
        # expects every component to belong to exactly one state would fail.
        return untouched
    return [(merged_counties, merged_pumas)] + untouched
# Apply the Alaska merge to the components of both PUMA vintages.
ccs00, ccs10 = (
    kill_alaska(components) for components in (ccs00_orig, ccs10_orig)
)
In [12]:
# Component counts again, now that the Alaska components have been merged into one.
len(ccs00), len(ccs10)
Out[12]:
In [13]:
# Lookup from numeric state code to state abbreviation (e.g. 'LA'), built from
# the distinct (state, stab) pairs in the 2000 crosswalk.
state_rows = county_to_puma00[['state', 'stab']].drop_duplicates()
st_to_stab = {
    code: abbrev
    for code, abbrev in zip(state_rows['state'], state_rows['stab'])
}
In [14]:
from collections import defaultdict
from itertools import count
def cc_names(ccs, fmt='{}_{}'):
    """Give each component a label built from its state and a per-state number.

    Each component must contain PUMAs from exactly one state, otherwise the
    single-element unpacking raises ValueError. ``fmt`` is formatted with the
    state's abbreviation (looked up in the module-level ``st_to_stab``) and a
    counter that starts at 1 for every state. Labels are returned in the same
    order as ``ccs``.
    """
    seen_per_state = defaultdict(int)
    labels = []
    for _counties, state_pumas in ccs:
        (state,) = set(s for s, _puma in state_pumas)
        seen_per_state[state] += 1
        labels.append(fmt.format(st_to_stab[state], seen_per_state[state]))
    return labels
In [15]:
# Region labels carry a vintage tag ("00" or "10") between the state
# abbreviation and the zero-padded per-state counter.
cc_names00 = cc_names(ccs00, fmt='{}_00_{:02}')
cc_names10 = cc_names(ccs10, fmt='{}_10_{:02}')
In [16]:
def region_mappings(ccs, cc_names):
    """Build county->region and PUMA->region lookup tables.

    Parameters
    ----------
    ccs : sequence of (set, set)
        ``(counties, pumas)`` components, where ``pumas`` holds
        ``(state, puma)`` tuples.
    cc_names : sequence of str
        Region label for each component, aligned with ``ccs``. (Inside this
        function the parameter shadows the notebook's ``cc_names`` function.)

    Returns
    -------
    (DataFrame, DataFrame)
        ``county_region_df`` is indexed by ``county`` and ``puma_region_df`` by
        ``(state, puma)``; each has a single ``region`` column and a sorted
        index.

    Raises
    ------
    AssertionError
        If ``ccs`` and ``cc_names`` differ in length.
    ValueError
        If a component does not belong to exactly one state.
    """
    assert len(ccs) == len(cc_names)
    county_region = []
    puma_region = []
    for name, (counties, pumas) in zip(cc_names, ccs):
        states = {st for st, _puma in pumas}
        if len(states) != 1:
            raise ValueError(
                'component {!r} must cover exactly one state, got {}'.format(
                    name, sorted(states)))
        county_region.extend((county, name) for county in counties)
        puma_region.extend((st, puma, name) for st, puma in pumas)
    # DataFrame.sortlevel() was removed in pandas 1.0; sort_index() gives the
    # same ordering and also works on older releases.
    county_region_df = pd.DataFrame.from_records(
        county_region, columns=['county', 'region'], index=['county']).sort_index()
    puma_region_df = pd.DataFrame.from_records(
        puma_region, columns=['state', 'puma', 'region'], index=['state', 'puma']).sort_index()
    return county_region_df, puma_region_df
In [17]:
# Lookup tables (county -> region, PUMA -> region) for each PUMA vintage.
county_region_00, puma_region_00 = region_mappings(ccs=ccs00, cc_names=cc_names00)
county_region_10, puma_region_10 = region_mappings(ccs=ccs10, cc_names=cc_names10)
In [18]:
# The first write uses mode='w' so regions.h5 starts from scratch; the
# remaining writes use the default append mode and add their tables to it.
# `key` is passed by keyword because newer pandas deprecates passing it positionally.
county_region_00.to_hdf('regions.h5', key='county_region_00', format='table', complib='blosc', complevel=9, mode='w')
puma_region_00.to_hdf('regions.h5', key='puma_region_00', format='table', complib='blosc', complevel=9)
county_region_10.to_hdf('regions.h5', key='county_region_10', format='table', complib='blosc', complevel=9)
puma_region_10.to_hdf('regions.h5', key='puma_region_10', format='table', complib='blosc', complevel=9)
In [19]:
# Store the state-code -> abbreviation lookup next to the region tables.
# dict.iteritems() exists only on Python 2; list(d.items()) yields the same
# pairs on both Python 2 and 3. `key` is passed by keyword because newer
# pandas deprecates passing it positionally.
pd.DataFrame.from_records(list(st_to_stab.items()), columns=['state', 'stab'], index='state') \
    .to_hdf('regions.h5', key='state_to_stab', format='table', complib='blosc', complevel=9)