In [1]:
from __future__ import print_function, division
from six import iteritems, next
from six.moves import xrange

In [2]:
import sys
# Put the parent directory on the import path so that the `pummeler` package
# imported in the following cells can be found when this notebook is run from
# its own directory (the package sits at ../pummeler, as the output path used
# at the end of the notebook shows).
sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pummeler.data import geocode_data

In [5]:
# Load the two county -> region lookup tables. Each is indexed by county and
# exposes a `region` column (see the preview in the next cell and the
# `d.region` access further down).
# NOTE(review): the '_00' / '_10' suffixes presumably denote two different
# vintages of the region definitions (2000 vs 2010?) -- confirm against
# pummeler.data.geocode_data.
county_region_00 = geocode_data('county_region_00')
county_region_10 = geocode_data('county_region_10')

In [6]:
# Preview the first rows to confirm the layout: county index, one `region` column.
county_region_00.head()


Out[6]:
region
county
01001 AL_00_16
01003 AL_00_01
01005 AL_00_05
01007 AL_00_02
01009 AL_00_03

In [7]:
# Set of county codes (the table's index) in the '_00' table, used for the
# coverage comparison in the next cell.
counties = set(county_region_00.index)

In [8]:
# Sanity check: both tables must cover exactly the same counties, otherwise
# merging them by county below would leave some counties with a region from
# only one table. Expected to display True.
counties == set(county_region_10.index)


Out[8]:
True

In [9]:
from itertools import count
from functools import partial

def get_CCs(pairs):
    """Find the connected components of a bipartite graph given as edges.

    Parameters
    ----------
    pairs : iterable of (A, B) tuples
        Each tuple is an edge between a node ``A`` on one side and a node
        ``B`` on the other side. The two sides are tracked separately, so
        an ``A`` value and a ``B`` value that compare equal are still
        distinct nodes.

    Returns
    -------
    list of (set, set)
        One ``(A nodes, B nodes)`` tuple per component, ordered by the
        integer label each component ended up with (labels are handed out
        in order of first appearance; when two components are joined, the
        label of the current edge's ``A`` node is the one that survives).
    """
    label_of_a = {}
    label_of_b = {}

    # Each call hands out the next unused integer label: 0, 1, 2, ...
    new_label = partial(next, count())

    for a_node, b_node in pairs:
        a_label = label_of_a.get(a_node)
        b_label = label_of_b.get(b_node)

        if a_label is None and b_label is None:
            # Neither endpoint seen before: start a brand-new component.
            label_of_a[a_node] = label_of_b[b_node] = new_label()
        elif a_label is None:
            # Only the B endpoint is known: the A node joins its component.
            label_of_a[a_node] = b_label
        elif b_label is None:
            # Only the A endpoint is known: the B node joins its component.
            label_of_b[b_node] = a_label
        elif a_label != b_label:
            # The edge bridges two existing components: fold B's component
            # into A's by relabelling every member on both sides. Only the
            # values of existing keys change, so iterating while assigning
            # is safe.
            for labels in (label_of_a, label_of_b):
                for node, label in iteritems(labels):
                    if label == b_label:
                        labels[node] = a_label

    # One more draw from the counter yields the number of labels issued.
    n_labels = new_label()
    components = [(set(), set()) for _ in xrange(n_labels)]
    for side, labels in enumerate((label_of_a, label_of_b)):
        for node, label in iteritems(labels):
            components[label][side].add(node)

    # Labels that were merged away have no members left; leave them out.
    return [(a_nodes, b_nodes) for a_nodes, b_nodes in components
            if a_nodes or b_nodes]

In [10]:
from collections import defaultdict
from itertools import count

def cc_names(ccs, fmt='{}_{}'):
    """Build one name per connected component, numbered within each state.

    Parameters
    ----------
    ccs : iterable of (counties, state_regions) pairs
        Only ``state_regions`` is inspected. The first two characters of
        every region name are taken as that region's state prefix.
    fmt : str
        ``str.format`` template, called as ``fmt.format(state, i)`` where
        ``i`` counts up from 1 separately for each state, in the order the
        components appear in ``ccs``.

    Returns
    -------
    list of str
        One name per component, in the same order as ``ccs``.

    Raises
    ------
    ValueError
        If a component has no regions, or its regions carry more than one
        state prefix (a component must belong to exactly one state).
    """
    state_counters = defaultdict(lambda: count(1))
    names = []
    for _counties, state_regions in ccs:
        states = {r[:2] for r in state_regions}
        if len(states) != 1:
            # Same exception type a bare `st, = states` unpack would raise,
            # but with a message that identifies the offending component.
            raise ValueError(
                'each component must belong to exactly one state; got '
                'states {!r} for regions {!r}'.format(
                    sorted(states), sorted(state_regions)))
        st, = states
        names.append(fmt.format(st, next(state_counters[st])))
    return names

In [11]:
def region_mappings(ccs, cc_names):
    """Flatten named components into two lookup tables.

    Parameters
    ----------
    ccs : sequence of (counties, subregions) pairs
        The members of each component.
    cc_names : sequence of str
        The name of each component, aligned with ``ccs`` (must have the
        same length).

    Returns
    -------
    (county_region_df, sub_super_df) : tuple of pandas.DataFrame
        Both have a single ``merged_region`` column holding the component
        name and are sorted by index; the first is indexed by ``county``,
        the second by ``region``.
    """
    assert len(ccs) == len(cc_names)

    county_rows = []
    region_rows = []
    for merged_name, (counties, subregions) in zip(cc_names, ccs):
        county_rows.extend((county, merged_name) for county in counties)
        region_rows.extend((region, merged_name) for region in subregions)

    def build_lookup(rows, key):
        # (member, merged name) records -> frame indexed by the member column.
        frame = pd.DataFrame.from_records(
            rows, columns=[key, 'merged_region'], index=[key])
        return frame.sort_index()

    county_region_df = build_lookup(county_rows, 'county')
    sub_super_df = build_lookup(region_rows, 'region')
    return county_region_df, sub_super_df

In [12]:
# Feed the (county, region) edges of BOTH tables into a single component
# search. Because every county appears in both tables, regions from the two
# tables that share a county end up linked through it, so each resulting
# component is a set of counties together with all regions (from either
# table) that overlap on those counties.
merged_ccs = get_CCs(
    (c, r) for d in [county_region_00, county_region_10]
           for c, r in iteritems(d.region))

In [13]:
# Name each component '<state>_merged_<NN>', where NN is a zero-padded,
# two-digit counter that restarts at 01 for every state.
merged_cc_names = cc_names(merged_ccs, '{}_merged_{:02}')

In [14]:
# Turn the named components into two lookup tables:
#   county_superregion: county          -> merged region name
#   region_superregion: original region -> merged region name
county_superregion, region_superregion = region_mappings(merged_ccs, merged_cc_names)

In [15]:
# Spot-check the county -> merged region table.
county_superregion.head()


Out[15]:
merged_region
county
01001 AL_merged_02
01003 AL_merged_01
01005 AL_merged_04
01007 AL_merged_02
01009 AL_merged_02

In [16]:
# Spot-check the original region -> merged region table; regions from both
# source tables (e.g. AK_00_01 and AK_10_01) map into the same merged name.
region_superregion.head()


Out[16]:
merged_region
region
AK_00_01 AK_merged_01
AK_10_01 AK_merged_01
AL_00_01 AL_merged_01
AL_00_02 AL_merged_02
AL_00_03 AL_merged_02

In [17]:
# Persist both lookup tables into the package's HDF5 data file.
fn = '../pummeler/data/regions.h5'
# `key` is passed by keyword: pandas 2.1 deprecated positional arguments to
# to_hdf other than the path, and pandas 3 makes them keyword-only. The
# keyword form works on older pandas releases as well.
# The file mode is left at to_hdf's default (append), so any other keys
# already stored in the file are kept; only these two keys are (re)written.
county_superregion.to_hdf(fn, key='county_superregion', format='table', complib='blosc', complevel=9)
region_superregion.to_hdf(fn, key='region_superregion', format='table', complib='blosc', complevel=9)