In [1]:

    
from __future__ import division, print_function
%matplotlib inline



In [2]:

    
import numpy as np
import pandas as pd
import re
import six



In [3]:

    
from IPython.display import display

Figure out county/PUMA regions

Electoral data is by county, census data is by PUMA.

Define regions that are connected components of PUMAs and counties. We'll do it for 2000 and 2010 geographies, as well as merged ones (for multi-year ACS files that use both geographies).

The county-to-PUMA crosswalk files used below are produced by the MABLE Geocorr tool, choosing County as the source geography and either PUMA (2012) or PUMA (2000, used in ACS data through vintage 2011) as the target.



In [4]:

    
# Path helper: data(name) substitutes `name` into the '../pummeler/data/{}'
# template (it is that string's bound .format method).
data = '../pummeler/data/{}'.format



In [5]:

    
# Load the county -> PUMA crosswalk for each PUMA vintage. Both files are read
# with the same options: gzip-compressed, file row 1 skipped, and the `county`
# column parsed as strings.
county_to_puma00, county_to_puma10 = (
    pd.read_csv(data(fname), compression='gzip', skiprows=[1], dtype={'county': 'str'})
    for fname in ('county-to-puma00.csv.gz', 'county-to-puma10.csv.gz'))



In [6]:

    
# puma00 of 77777 is combo of 01801, 01802, and 01905 in LA
# make sure they're in the same CC...
# Copy the rows of the three component PUMAs and relabel the copies as 77777,
# so 77777 is linked to every county its components are linked to.
sub = county_to_puma00[(county_to_puma00.stab == 'LA')
                 & ((county_to_puma00.puma2k == 1801)
                  | (county_to_puma00.puma2k == 1802)
                  | (county_to_puma00.puma2k == 1905))].copy()
sub['puma2k'] = 77777
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job
# and is also available in older pandas releases.
county_to_puma00 = pd.concat([county_to_puma00, sub], ignore_index=True)



In [7]:

    
from itertools import count

def get_CCs(pairs):
    """Group two families of labels into connected components.

    Every ``(A, B)`` pair is treated as an edge joining label ``A`` to label
    ``B``. A-labels and B-labels are tracked in separate dictionaries, so an
    A-label and a B-label that happen to compare equal are not confused.

    Parameters
    ----------
    pairs : iterable of (A, B)
        Edges between hashable labels. Consumed once, so a generator is fine.

    Returns
    -------
    list of (set, set)
        One ``(A_labels, B_labels)`` tuple per component, ordered by the
        component ids that survive merging.
    """
    A_cc = {}
    B_cc = {}

    # next(ids) works on Python 2 and 3; the bound `.next` method is 2-only.
    ids = count()

    for A, B in pairs:
        A_id = A_cc.get(A)
        B_id = B_cc.get(B)

        if A_id is None:
            if B_id is None:
                # Neither label seen before: start a new component.
                A_cc[A] = B_cc[B] = next(ids)
            else:
                A_cc[A] = B_id
        elif B_id is None:
            B_cc[B] = A_id
        elif A_id != B_id:
            # The edge joins two existing components: relabel everything in
            # B's component with A's id. Snapshot the items so the dict is
            # not iterated while being written to.
            for mapping in (A_cc, B_cc):
                for k, v in list(mapping.items()):
                    if v == B_id:
                        mapping[k] = A_id

    # The next unused id equals the number of ids handed out so far.
    ccs = [(set(), set()) for _ in range(next(ids))]
    for k, v in A_cc.items():
        ccs[v][0].add(k)
    for k, v in B_cc.items():
        ccs[v][1].add(k)
    # Ids absorbed by a merge are left empty; drop them.
    return [(As, Bs) for As, Bs in ccs if As or Bs]



In [8]:

    
# Connected components linking counties to (state, puma2k) pairs.
ccs00_orig = get_CCs(
    (rec['county'], (rec['state'], rec['puma2k']))
    for _idx, rec in county_to_puma00.iterrows())



In [9]:

    
# Connected components linking counties to (state, puma12) pairs.
ccs10_orig = get_CCs(
    (rec['county'], (rec['state'], rec['puma12']))
    for _idx, rec in county_to_puma10.iterrows())



In [10]:

    
# Number of connected components found for each crosswalk.
tuple(map(len, (ccs00_orig, ccs10_orig)))









    Out[10]:





(842, 982)



In [11]:

    
# Alaska's electoral districts are different from their counties.
# Too much work to do it, so just pretend Alaska was one CC all along.
def kill_alaska(ccs):
    """Merge every component that contains a state-2 PUMA into one component.

    State code 2 is the one this notebook treats as Alaska.

    Parameters
    ----------
    ccs : iterable of (counties, state_pumas)
        ``counties`` is a set of county ids and ``state_pumas`` a set of
        ``(state, puma)`` pairs.

    Returns
    -------
    list of (set, set)
        The merged state-2 component first, followed by all other components
        in their original order. If no component contains a state-2 PUMA the
        components are returned unchanged; no empty placeholder component is
        added, since downstream code requires exactly one state per component.
    """
    merged_counties = set()
    merged_state_pumas = set()
    others = []
    for cc in ccs:
        counties, state_pumas = cc
        if any(state == 2 for state, _puma in state_pumas):
            # Union into fresh sets so the input components are not mutated.
            merged_counties |= counties
            merged_state_pumas |= state_pumas
        else:
            others.append(cc)

    # A matching component always contributes at least one (state, puma)
    # pair, so an empty set here means nothing matched.
    if not merged_state_pumas:
        return others
    return [(merged_counties, merged_state_pumas)] + others

# Apply the Alaska merge to both sets of components.
ccs00, ccs10 = map(kill_alaska, (ccs00_orig, ccs10_orig))



In [12]:

    
# Component counts after the Alaska merge.
tuple(map(len, (ccs00, ccs10)))









    Out[12]:





(840, 979)



In [13]:

    
# Lookup from the `state` code to the `stab` abbreviation, built from the
# distinct (state, stab) rows of the first crosswalk.
st_to_stab = dict(
    county_to_puma00[['state', 'stab']]
    .drop_duplicates()
    .itertuples(index=False))



In [14]:

    
from collections import defaultdict
from itertools import count

def cc_names(ccs, fmt='{}_{}'):
    """Return one name per component, numbered separately within each state.

    ``fmt`` is formatted with two values: the state's abbreviation, looked up
    in the module-level ``st_to_stab``, and a per-state counter that starts
    at 1 and advances in the order the components appear in ``ccs``.

    Each component's ``(state, puma)`` pairs must all share one state;
    otherwise the one-element unpacking raises ValueError.
    """
    numbering = defaultdict(lambda: count(1))

    def label(state_pumas):
        (state,) = set(s for s, _puma in state_pumas)
        seq = next(numbering[state])
        return fmt.format(st_to_stab[state], seq)

    return [label(state_pumas) for _counties, state_pumas in ccs]



In [15]:

    
# Region names embed the vintage ("00" / "10") and a two-digit, zero-padded
# per-state counter.
cc_names00 = cc_names(ccs00, fmt='{}_00_{:02}')
cc_names10 = cc_names(ccs10, fmt='{}_10_{:02}')



In [16]:

    
def region_mappings(ccs, cc_names):
    """Build county -> region and PUMA -> region lookup tables.

    Parameters
    ----------
    ccs : sequence of (counties, pumas)
        ``counties`` is an iterable of county ids; ``pumas`` is an iterable
        of ``(state, puma)`` pairs.
    cc_names : sequence of str
        Region name for each entry of ``ccs``, in the same order.

    Returns
    -------
    (county_region_df, puma_region_df) : tuple of DataFrame
        ``county_region_df`` is indexed by ``county`` and ``puma_region_df``
        by ``(state, puma)``; each has a single ``region`` column and is
        sorted by its index.

    Raises
    ------
    AssertionError
        If ``ccs`` and ``cc_names`` differ in length.
    ValueError
        If the PUMAs of a component do not belong to exactly one state.
    """
    assert len(ccs) == len(cc_names)
    county_region = []
    puma_region = []
    for name, (counties, pumas) in zip(cc_names, ccs):
        # Keep the single-state invariant the original enforced by unpacking
        # a one-element set, but report it with a readable message.
        states = {st for st, _puma in pumas}
        if len(states) != 1:
            raise ValueError(
                'region {!r} must cover exactly one state, got {}'.format(
                    name, sorted(states)))

        county_region.extend((c, name) for c in counties)
        puma_region.extend((st, puma, name) for st, puma in pumas)

    # sort_index() replaces DataFrame.sortlevel(), which newer pandas removed.
    county_region_df = pd.DataFrame.from_records(
        county_region, columns=['county', 'region'], index=['county']).sort_index()
    puma_region_df = pd.DataFrame.from_records(
        puma_region, columns=['state', 'puma', 'region'], index=['state', 'puma']).sort_index()

    return county_region_df, puma_region_df



In [17]:

    
# County and PUMA region tables for each set of components.
county_region_00, puma_region_00 = region_mappings(ccs=ccs00, cc_names=cc_names00)
county_region_10, puma_region_10 = region_mappings(ccs=ccs10, cc_names=cc_names10)



In [18]:

    
# Write the four region tables into regions.h5. Only the first call passes
# mode='w'; the remaining calls use the default mode and add their keys to
# the same file. `key` is passed by keyword because recent pandas deprecates
# positional arguments to to_hdf other than the path.
county_region_00.to_hdf('regions.h5', key='county_region_00', format='table', complib='blosc', complevel=9, mode='w')
puma_region_00.to_hdf('regions.h5', key='puma_region_00', format='table', complib='blosc', complevel=9)
county_region_10.to_hdf('regions.h5', key='county_region_10', format='table', complib='blosc', complevel=9)
puma_region_10.to_hdf('regions.h5', key='puma_region_10', format='table', complib='blosc', complevel=9)



In [19]:

    
# Store the state code -> abbreviation lookup in regions.h5 as well.
# list(...items()) works on Python 2 and 3 (iteritems() is 2-only) and gives
# from_records a real sequence of (state, stab) tuples.
pd.DataFrame.from_records(list(st_to_stab.items()), columns=['state', 'stab'], index='state') \
  .to_hdf('regions.h5', key='state_to_stab', format='table', complib='blosc', complevel=9)