notebook.community

Edit and run



In [1]:

    
from __future__ import print_function, division
from six import iteritems, next
from six.moves import xrange



In [2]:

    
import sys
# Put the parent directory on the import path so the `pummeler` package
# imported below can be found (presumably this notebook sits in a
# subdirectory of the repository - TODO confirm).
sys.path.append('..')



In [3]:

    
import pandas as pd



In [4]:

    
from pummeler.data import geocode_data



In [5]:

    
# Load the two county -> region lookup tables shipped with pummeler.
# NOTE(review): the '00' / '10' suffixes presumably denote two vintages of the
# region definitions - confirm against pummeler.data.
county_region_00 = geocode_data('county_region_00')
county_region_10 = geocode_data('county_region_10')



In [6]:

    
# Preview the first rows of the '00' table to see what it looks like.
county_region_00.head()



In [7]:

    
# Distinct index values (county ids) present in the '00' table.
counties = set(county_region_00.index)



In [8]:

    
# Sanity check: the '10' table should be indexed by exactly the same counties.
counties == set(county_region_10.index)









    Out[8]:





True



In [9]:

    
from itertools import count
from functools import partial

def get_CCs(pairs):
    """Group the edges in ``pairs`` into connected components of a bipartite graph.

    Every ``(A, B)`` pair is an edge between a left-hand node ``A`` and a
    right-hand node ``B``.  The two sides live in separate namespaces, so an
    ``A`` and a ``B`` that compare equal are still distinct nodes.

    Parameters
    ----------
    pairs : iterable of (A, B)
        Edges of the graph.  Iterated exactly once, so a generator is fine.
        Nodes must be hashable.

    Returns
    -------
    list of (set, set)
        One ``(As, Bs)`` tuple per connected component, holding the left-hand
        and right-hand nodes of that component.  Components appear in the
        order their ids were first allocated; when a pair joins two existing
        components, the merged component keeps the id (and therefore the
        position) of the one that contained ``A``.
    """
    A_cc = {}  # left-hand node  -> id of its current component
    B_cc = {}  # right-hand node -> id of its current component
    # members[i] holds the (As, Bs) sets of component i.  A component that has
    # been absorbed into another one is left behind as a pair of empty sets so
    # that ids stay stable; those placeholders are dropped on return.
    members = []

    for A, B in pairs:
        A_id = A_cc.get(A)
        B_id = B_cc.get(B)

        if A_id is None:
            if B_id is None:
                # Neither endpoint seen before: open a new component.
                A_cc[A] = B_cc[B] = len(members)
                members.append(({A}, {B}))
            else:
                # New A attaches to B's component.
                A_cc[A] = B_id
                members[B_id][0].add(A)
        elif B_id is None:
            # New B attaches to A's component.
            B_cc[B] = A_id
            members[A_id][1].add(B)
        elif A_id != B_id:
            # The edge bridges two components: fold B's component into A's.
            # Only the absorbed component's nodes need relabelling.
            absorbed_As, absorbed_Bs = members[B_id]
            for node in absorbed_As:
                A_cc[node] = A_id
            for node in absorbed_Bs:
                B_cc[node] = A_id
            kept_As, kept_Bs = members[A_id]
            kept_As.update(absorbed_As)
            kept_Bs.update(absorbed_Bs)
            members[B_id] = (set(), set())

    return [(As, Bs) for As, Bs in members if As or Bs]



In [10]:

    
from collections import defaultdict
from itertools import count

def cc_names(ccs, fmt='{}_{}'):
    """Build one label per component from its state prefix and a running index.

    The first two characters of every region id in a component are taken as
    its state prefix; all regions of a component must share that prefix.
    Components are numbered from 1 within each prefix, in the order they
    appear in ``ccs``.

    Parameters
    ----------
    ccs : iterable of (counties, state_regions)
        Components; only the region ids are inspected.
    fmt : str
        Format string receiving ``(prefix, index)`` positionally.

    Returns
    -------
    list of str
        Labels, aligned with ``ccs``.

    Raises
    ------
    ValueError
        If a component has no regions or its regions span several prefixes.
    """
    seen_per_state = {}
    labels = []
    for _, regions in ccs:
        prefixes = {region[:2] for region in regions}
        # Single-element unpacking doubles as the "exactly one state" check.
        (state,) = prefixes
        index = seen_per_state.get(state, 0) + 1
        seen_per_state[state] = index
        labels.append(fmt.format(state, index))
    return labels



In [11]:

    
def region_mappings(ccs, cc_names):
    """Turn named components into two lookup tables.

    Parameters
    ----------
    ccs : sequence of (counties, subregions)
        Components, each a pair of iterables.
    cc_names : sequence of str
        One name per component, aligned with ``ccs``.

    Returns
    -------
    (DataFrame, DataFrame)
        The first frame is indexed by ``county`` and the second by ``region``;
        each has a single ``merged_region`` column holding the name of the
        component the key belongs to.  Both are sorted by index.
    """
    assert len(ccs) == len(cc_names)

    def _lookup_frame(rows, key):
        # One row per (key, component name), indexed and sorted by `key`.
        return pd.DataFrame.from_records(
            rows, columns=[key, 'merged_region'], index=[key]).sort_index()

    county_rows = [
        (county, label)
        for label, (county_set, _) in zip(cc_names, ccs)
        for county in county_set
    ]
    region_rows = [
        (region, label)
        for label, (_, region_set) in zip(cc_names, ccs)
        for region in region_set
    ]

    return _lookup_frame(county_rows, 'county'), _lookup_frame(region_rows, 'region')



In [12]:

    
# Feed every (county, region) pair from both tables into get_CCs, so counties
# and regions linked through either table fall into the same component.
merged_ccs = get_CCs(
    (c, r) for d in [county_region_00, county_region_10]
           for c, r in iteritems(d.region))



In [13]:

    
# Name each component '<prefix>_merged_NN', with a zero-padded per-prefix index.
merged_cc_names = cc_names(merged_ccs, '{}_merged_{:02}')



In [14]:

    
# Build the county -> merged-region and region -> merged-region lookup frames.
county_superregion, region_superregion = region_mappings(merged_ccs, merged_cc_names)



In [15]:

    
# Preview the county -> merged-region table.
county_superregion.head()









    Out[15]:







  
    
      
      merged_region
    
    
      county
      
    
  
  
    
      01001
      AL_merged_02
    
    
      01003
      AL_merged_01
    
    
      01005
      AL_merged_04
    
    
      01007
      AL_merged_02
    
    
      01009
      AL_merged_02



In [16]:

    
# Preview the region -> merged-region table.
region_superregion.head()









    Out[16]:







  
    
      
      merged_region
    
    
      region
      
    
  
  
    
      AK_00_01
      AK_merged_01
    
    
      AK_10_01
      AK_merged_01
    
    
      AL_00_01
      AL_merged_01
    
    
      AL_00_02
      AL_merged_02
    
    
      AL_00_03
      AL_merged_02



In [17]:

    
# Persist both lookup tables into a single HDF5 file, one key per table.
# `key` is passed by keyword: recent pandas releases deprecate positional
# arguments after the path in `to_hdf`, while the keyword form is accepted
# by every version.
fn = '../pummeler/data/regions.h5'
county_superregion.to_hdf(fn, key='county_superregion', format='table', complib='blosc', complevel=9)
region_superregion.to_hdf(fn, key='region_superregion', format='table', complib='blosc', complevel=9)

	region
county
01001	AL_00_16
01003	AL_00_01
01005	AL_00_05
01007	AL_00_02
01009	AL_00_03

	merged_region
county
01001	AL_merged_02
01003	AL_merged_01
01005	AL_merged_04
01007	AL_merged_02
01009	AL_merged_02

	merged_region
region
AK_00_01	AK_merged_01
AK_10_01	AK_merged_01
AL_00_01	AL_merged_01
AL_00_02	AL_merged_02
AL_00_03	AL_merged_02