In [1]:
%load_ext autoreload
%autoreload 2
import ambry
# Handle on the Ambry library; the bundle and partition below are looked up through it.
l = ambry.get_library()

# Partition whose rows carry 'sumlevel' and 'description' fields
# (read into the `sumlevels` lookup in the next cell).
sumlevels_p = l.partition('census.gov-acs_geofile-schemas-2009e-sumlevels')

# Geoschemas bundle; its partitions are scanned by the later cells.
b = l.bundle('d04w001')

In [2]:
# Lookup table: value of each row's 'sumlevel' field -> that row's 'description'.
sumlevels = {
    record['sumlevel']: record['description']
    for record in sumlevels_p.stream(as_dict=True)
}

In [65]:
from collections import defaultdict, Counter
from geoid import base62_encode

collector = {}
geoids = {}
descriptions = {}

for p in b.partitions:
    #print "=====", p.identity.name
    l = {}
    for i, c in  enumerate(p.table.columns):
        if i > 5 and c.name not in ('name','geoid', 'memi'):
            l[c.name] = [Counter(), 0]
            descriptions[c.name] = c.description
           
    for i, row in enumerate(p.stream(as_dict=True)):
        if i >= 500:
            break
            
        geoid = row['geoid']
        
        for k in l:
            v = row[k]
            
            
            
            if not str(v).strip():
                continue
            
            
            try:
                # The index is not guarantted to be found in the right position; it could be at the start of the
                # geoid, so we keep track of the most common place it is found
                idx = geoid.index(str(v))
                size = len(str(v))
                
                # Kepp tract of the right end position, not the start, since the end pos is independent of the length
                l[k][0][idx+size] += 1
                l[k][1] = max(l[k][1], size)
                
            except ValueError:
                pass
      
    ordered = []
    for k, v in l.items():
        
        most = v[0].most_common(1)
        
        if most:
            size = v[1]
            start = most[0][0] - size

            ordered.append((k, start, size))
            
    ordered = sorted(ordered, key = lambda r: r[1])

    #for e in ordered:
    #    print " ", e, len(base62_encode(10**e[2]))

    geoids[int(p.grain)] = ordered
    
    for e in ordered:
        collector[e[0]]=(e[2],len(base62_encode(10**e[2])) )
        
# Print out the lengths array
out = []
for k, v in collector.items():
    out.append('\'{}\': {},  # {}'.format(k, v[0], descriptions[k]))
    
print '\n'.join(sorted(out))
    
for  sl in sorted(geoids):
    ordered = geoids[sl]
    
    print str(sl)+':', str([ str(e[0]) for e in ordered ])+',', "#", sumlevels[sl]


'aianhh': 4,  # American Indian Area/Alaska Native Area/ Hawaiian Home Land (Census)
'aihhtli': 1,  # American Indian Trust Land/ Hawaiian Home Land Indicator
'aitsce': 3,  # American Indian Tribal Subdivision (Census)
'anrc': 5,  # Alaska Native Regional Corporation (FIPS)
'blkgrp': 1,  # Block Group
'cbsa': 5,  # Metropolitan and Micropolitan Statistical Area
'cdcurr': 2,  # Current Congressional District ***
'cnecta': 3,  # New England City and Town Combined Statistical Area
'concit': 5,  # Consolidated City
'county': 3,  # County of current residence
'cousub': 5,  # County Subdivision (FIPS)
'csa': 3,  # Combined Statistical Area
'division': 1,  # Census Division
'metdiv': 5,  # Metropolitan Statistical Area- Metropolitan Division
'necta': 5,  # New England City and Town Area
'nectadiv': 5,  # New England City and Town Area Division
'place': 5,  # Place (FIPS Code)
'puma5': 4,  # Public Use Microdata Area 5% File
'region': 1,  # Census Region
'sdelm': 5,  # State-School District (Elementary)
'sdsec': 5,  # State-School District (Secondary)
'sduni': 5,  # State-School District (Unified)
'sldl': 3,  # State Legislative District Lower
'sldu': 2,  # State Legislative District Upper
'state': 2,  # State (FIPS Code)
'submcd': 5,  # Subminor Civil Division (FIPS)
'tract': 4,  # Census Tract
'ua': 5,  # Urban Area
'ur': 1,  # Urban/Rural
'us': 1,  # US
10: ['us', 'ur'], # United States
20: ['ur', 'region'], # Region
30: ['ur', 'division'], # Division
40: ['ur', 'state'], # State
50: ['state', 'county'], # County
60: ['state', 'county', 'cousub'], # County Subdivision
67: ['state', 'county', 'cousub', 'submcd'], # State (Puerto Rico Only)-County-County Subdivision-Subbarrio
70: ['state', 'county', 'cousub', 'place'], # County Subdivision-Place/Remainder
80: ['state', 'county', 'cousub', 'place', 'tract'], # County Subdivision-Place/Remainder-Census Tract
140: ['state', 'county', 'tract'], # Census Tract
150: ['state', 'county', 'tract', 'blkgrp'], # Census Tract-Block Group
155: ['state', 'place', 'county'], # Place-County
160: ['state', 'place'], # Place
170: ['state', 'concit'], # Consolidated City
172: ['state', 'concit', 'place'], # Consolidated City-Place Within Consolidated City
230: ['state', 'anrc'], # State-Alaska Native Regional Corporation
250: ['aianhh'], # American Indian Area/Alaska Native Area/Hawaiian Home Land
251: ['aianhh', 'aitsce'], # American Indian Area/Alaska NativeArea/HawaiianHomeLand-Tribal Subdivision/Remainder
252: ['aianhh', 'aihhtli'], # American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)4
254: ['aianhh', 'aihhtli'], # American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land
260: ['state', 'aianhh'], # American Indian Area/Alaska Native Area/Hawaiian Home Land-State
269: ['state', 'aianhh', 'place'], # American Indian Area/Alaska Native Area/Hawaiian Home Land-Place-Remainder
270: ['aianhh', 'state', 'county'], # American Indian Area/Alaska Native Area/Hawaiian Home Land-State-County
280: ['state', 'aianhh'], # State-American Indian Area/Alaska Native Area/Hawaiian Home Land
283: ['state', 'aianhh', 'aihhtli'], # State-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)
286: ['state', 'aianhh', 'aihhtli'], # State-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land
290: ['aianhh', 'aitsce', 'state'], # American Indian Area/Alaska Native Area/Hawaiian Home Land-Tribal Subdivision/Remainder-State
310: ['cbsa'], # CBSA
311: ['cbsa', 'state'], # CBSA-State-County
312: ['cbsa', 'state', 'place'], # CBSA-State-Principal City
313: ['cbsa', 'state', 'county'], # CBSA-State-County
314: ['cbsa', 'metdiv'], # Metropolitan Statistical Area/Metropolitan Division
315: ['cbsa', 'metdiv', 'state'], # Metropolitan Statistical Area/Metropolitan Division-State
316: ['cbsa', 'metdiv', 'state', 'county'], # Metropolitan Statistical Area/Metropolitan Division-State-County
320: ['state', 'cbsa'], # State- CBSA
321: ['state', 'cbsa', 'place'], # State- CBSA -Principal City
322: ['state', 'cbsa', 'county'], # State- CBSA -County
323: ['state', 'cbsa', 'metdiv'], # State- Metropolitan Statistical Area/Metropolitan Division
324: ['state', 'cbsa', 'metdiv', 'county'], # State- Metropolitan Statistical Area/Metropolitan Division-County
330: ['csa'], # Combined Statistical Area
331: ['csa', 'state'], # Combined Statistical Area-State
332: ['csa', 'cbsa'], # Combined Statistical Area-CBSA
333: ['csa', 'cbsa', 'state'], # Combined Statistical Area-CBSA-State
335: ['cnecta'], # Combined New England City and Town Area
336: ['cnecta', 'state'], # Combined New England City and Town Area -State
337: ['cnecta', 'necta'], # Combined New England City and Town Area -New England City and Town Area
338: ['cnecta', 'necta', 'state'], # Combined New England City and Town Area -New England City and Town Area-State
340: ['state', 'csa'], # State-Combined Statistical Area
341: ['state', 'csa', 'cbsa'], # State-Combined Statistical Area-CBSA
345: ['state', 'cnecta'], # State-Combined New England City and Town Area
346: ['state', 'cnecta', 'necta'], # State-Combined New England City and Town Area-New England City and Town Area
350: ['necta'], # New England City and Town Area
351: ['necta', 'state'], # New England City and Town Area-State
352: ['necta', 'state', 'place'], # New England City and Town Area-State-Principal City
353: ['necta', 'state', 'county'], # New England City and Town Area-State-County
354: ['necta', 'state', 'county', 'cousub'], # New England City and Town Area-State-County-County Subdivision
355: ['necta', 'nectadiv'], # New England City and Town Area (NECTA)-NECTA Division
356: ['necta', 'nectadiv', 'state'], # New England City and Town Area (NECTA)-NECTA Division-State
357: ['necta', 'nectadiv', 'state', 'county'], # New England City and Town Area (NECTA)-NECTA Division-State-County
358: ['necta', 'nectadiv', 'state', 'county', 'cousub'], # New England City and Town Area (NECTA)-NECTA Division-State-County-County Subdivision
360: ['state', 'necta'], # State-New England City and Town Area
361: ['state', 'necta', 'place'], # State-New England City and Town Area-Principal City
362: ['state', 'necta', 'county'], # State-New England City and Town Area-County
363: ['state', 'necta', 'county', 'cousub'], # State-New England City and Town Area-County-County Subdivision
364: ['state', 'necta', 'nectadiv'], # State-New England City and Town Area (NECTA)-NECTA Division
365: ['state', 'necta', 'nectadiv', 'county'], # State-New England City and Town Area (NECTA)-NECTA Division-County
366: ['state', 'necta', 'nectadiv', 'county', 'cousub'], # State-New England City and Town Area (NECTA)-NECTA Division-County-County Subdivision
400: ['ua'], # Urban Area
500: ['state', 'cdcurr'], # Congressional District
510: ['state', 'cdcurr', 'county'], # 
550: ['state', 'cdcurr', 'aianhh'], # Congressional District-American IndianArea/Alaska NativeArea/Hawaiian Home Land
610: ['state', 'sldu'], # State Senate District
612: ['state', 'sldu', 'county'], # State Senate District-County
620: ['state', 'sldl'], # State House District
622: ['state', 'sldl', 'county'], # State House District-County
795: ['state', 'puma5'], # State-Public Use MicroSample Area 5%
950: ['state', 'sdelm'], # State-Elementary School District
960: ['state', 'sdsec'], # State-High School District
970: ['state', 'sduni'], # State-Unified School District

In [20]:
from geoid import names, segments
names_map = {v:k for k, v in names.items()}

seen = set()

for k, v in segments.items():
    if k in names_map:
        pass
    else:
        
        name =  '_'.join( e for e in v)
        name = name[0].lower() + name[1:]
        
        if name in seen:
            name += str(k)
        
        seen.add(name)
        
        print "'{}': {},".format(name, k)


'state_aianhh': 260,
'necta_nectadiv_state_county_cousub': 358,
'state_sldl': 620,
'state_aianhh_place': 269,
'aianhh_state_county': 270,
'state_cbsa_metdiv': 323,
'state_sldu': 610,
'state_aianhh280': 280,
'state_place_county': 155,
'aianhh_aitsce_state': 290,
'state_aianhh_aihhtli': 283,
'state_cdcurr_aianhh': 550,
'state_concit': 170,
'state_concit_place': 172,
'state_aianhh_aihhtli286': 286,
'cbsa': 310,
'cbsa_state': 311,
'cbsa_state_place': 312,
'cbsa_state_county': 313,
'cbsa_metdiv': 314,
'cbsa_metdiv_state': 315,
'state_cbsa': 320,
'state_cbsa_place': 321,
'state_cbsa_county': 322,
'state_county_cousub_submcd': 67,
'state_cbsa_metdiv_county': 324,
'state_county_cousub_place': 70,
'necta_state_county': 353,
'state_puma5': 795,
'csa': 330,
'csa_state': 331,
'csa_cbsa': 332,
'csa_cbsa_state': 333,
'cnecta': 335,
'state_county_cousub_place_tract': 80,
'cnecta_necta': 337,
'cnecta_necta_state': 338,
'state_csa': 340,
'state_csa_cbsa': 341,
'state_cnecta': 345,
'state_cnecta_necta': 346,
'necta': 350,
'necta_state': 351,
'necta_state_place': 352,
'cnecta_state': 336,
'necta_state_county_cousub': 354,
'necta_nectadiv': 355,
'necta_nectadiv_state': 356,
'state_anrc': 230,
'necta_nectadiv_state_county': 357,
'state_necta': 360,
'cbsa_metdiv_state_county': 316,
'state_necta_county': 362,
'state_necta_county_cousub': 363,
'state_necta_nectadiv': 364,
'state_necta_nectadiv_county': 365,
'state_necta_nectadiv_county_cousub': 366,
'state_sldu_county': 612,
'state_cdcurr': 500,
'state_cdcurr_county': 510,
'state_necta_place': 361,
'aianhh': 250,
'aianhh_aitsce': 251,
'aianhh_aihhtli': 252,
'state_sldl_county': 622,
'aianhh_aihhtli254': 254,

In [10]:
%load_ext autoreload
%autoreload 2
from geoid.acs import AcsGeoid

for p in b.partitions:
           
    for i, row in enumerate(p.stream(as_dict=True)):
        if i >= 500:
            break
            
        geoid = row['geoid']
        
        try:
            AcsGeoid.parse(geoid)
            
        except Exception as e:
            print geoid, e
            raise


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [ ]: