notebook.community

Edit and run



In [1]:

    
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')

mp.jupyter.init() # Initialize indexes and downloder context



In [2]:

    
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg









    Out[2]:




Small Area Estimation For San Diego
sandiegodata.org-pums_sae-1 Last Update: 2018-12-01T05:59:19
Processing ACS PUMS and tracts for small area estimation
Contacts

Wrangler Eric Busboom, Civic Knowledge

Resources

 column_dimensions. Links Age, Sex, Poverty status to column ids.
 rasp_replicates. Replicates of race, age, sex and poverty estimates.

References

race_hisp_pums. B03002_002 HISPANIC OR LATINO ORIGIN BY RACE, PUMS
race_hisp_tracts. B03002_002 HISPANIC OR LATINO ORIGIN BY RACE, tracts
ca_tract_boundaries. California Tract boundaries
ca_pums_boundaries. California PUMS boundaries
rasp_diabetes. Diabetes probabilities, by age, race, sex and poverty status
sex_age_white_tracts. B01001A: Sex by Age (White Alone)
sex_age_white_pums. B01001A: Sex by Age (White Alone)
race_iterations. Race iteration table suffixes
sandiego_puma_tract. Pumas and associated tracts, for San Diego County
pov_age. Table B17020: Poverty Status by Age
pov_sex_age_template. Template for Table B17001: Poverty Status by Sex by Age
pov. Table B17002: Ratio of Income to Poverty Level
age_pov. Table B17024: Age by Ratio of Income to Poverty Level



In [3]:

    
race_iterations = pkg.reference('race_iterations').dataframe()

# List of PUMS in San Giego county, so we can restrict census data to just SD County
sandiego_puma_tract_pkg =  pkg.reference('sandiego_puma_tract').dataframe()
sandiego_puma_tract = list(sandiego_puma_tract_pkg.puma_geoid.unique()) + \
                      list(sandiego_puma_tract_pkg.tract_geoid.unique())



In [4]:

    
def make_col_map(t):
    from collections import namedtuple

    sex = None
    age = None
    pov = 'all'

    def parse_age(v):

        if v and 'year' in v:
            parts = v.split()
            if parts[0] == '75':
                return  pd.Interval(left=75, right=120, closed='both')
            elif len(parts) == 4:
                return  pd.Interval(left=int(parts[0]), right=int(parts[2]), closed='both') 

        return None

    col_map = {}

    Dimensions = namedtuple('Dimensions', ['sex', 'age_range', 'pov'])

    for e in t.table.columns:

        sd = e.short_description
        if '_m90' in e.unique_id:
            continue
        if sd:
            if 'Male' in sd:
                sex = 'male'
            elif 'Female' in sd:
                sex = 'female'
            elif 'Total' in sd:
                sex = 'total'
            elif 'below poverty level' in sd:
                pov = 'pov'
            elif 'above poverty level' in sd:
                pov = 'npov'
            
        col_map[e.unique_id] = Dimensions( sex, parse_age(sd), pov)

    return col_map



In [5]:

    
def get_raceeth(pkg, re_code, re_name):
    """Get a single race/ethnicity iteration, and process it so 
    it can be merged with others. """
    
    pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
    url = pov_sex_age_url.interpolate(context={'iteration':re_code.strip()})
    
    psa = url.dataframe()
    psa = psa[psa.index.isin(sandiego_puma_tract)]
    col_map= make_col_map(psa)
    
    
    t = psa.drop(columns=['STUSAB', 'COUNTY','NAME'])
    t = t.stack().to_frame().reset_index()
    t.columns = ['geoid','col_name','value']
    t['m_type'] = t.col_name.apply(lambda v: 'margin' if '_m90' in v else 'est')
    t['col_name'] = t.col_name.apply(lambda v: v.replace('_m90',''))
    t = t.set_index(['geoid','col_name','m_type']).unstack()
    t['sex'] = [col_map[str(i[1]).replace('_m90','')].sex for i,r in t.iterrows() ]
    t['raceeth'] = re_name
    t['age_range'] = [col_map[str(i[1]).replace('_m90','')].age_range for i,r in t.iterrows() ]
    t['pov'] = [col_map[str(i[1]).replace('_m90','')].pov for i,r in t.iterrows() ]
    return t



In [6]:

    
sd_pumas









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-74bd63856dd9> in <module>
----> 1 sd_pumas

NameError: name 'sd_pumas' is not defined



In [ ]:

    
frames= []

for e in race_iterations.itertuples():
    print(e)
    frames.append(get_raceeth(pkg, e.code, e.raceeth))

df = pd.concat(frames)



In [ ]:

    
df



In [ ]:

    
t = df[(df.sex != 'both') & (df.pov != 'all')& (df.raceeth != 'white') & (~df.age_range.isnull())].copy()
# Collapse the multi-level columns index
t.columns = [ '_'.join(e) if e[1] else e[0] for e in zip(*zip(*t.columns)) ]
t.head()



In [ ]:

    
t.value_est.sum()



In [ ]:

    
t['value_std'] = t.value_margin/1.645
puma_ras = t
puma_ras.head()



In [ ]:

    
# Get the sum of the population of all PUMS
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()

# Get the controlled population for SD county. THe margin for this estimate is zero
# because it's tied to administrative data. 
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001 

# Apparently, the PUMA population is also controlles. 
assert sd_puma_pop == sd_controlled_pop

# However, the sums from the poverty status by sex by age table is not
# controlled ( and would not expect it to be ) so it's a different number. 
#
# Interestingly, the controlled population number is not with in the 90% margins of 
# the sum of the table esimates. 
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()



In [ ]:

    
from numpy.random import normal

def yield_estimate_replicates(df):
    for n, (i,r) in enumerate(df.reset_index().iterrows()):

        s = int(normal(loc=r.value_est, scale=r.value_std))
        s = s if s > 0 else 0

        col_no = int(r.col_name.split('_')[1])
        
        yield r.geoid, r.col_name, s
        
def make_estimate_replicates(df, controll_sum = None):
    
    t =  pd.DataFrame.from_records(list(yield_estimate_replicates(df)), columns = 'geoid col_no est'.split())
    
    if controll_sum:
        total = t.est.sum()
        ratio = controll_sum/total
        t['controlled_est'] = (t.est*ratio).round(0).astype(int)
    
    return t



In [ ]:

    
# Create some estimate replicates, so se can calc the std dev
estimates = []

from tqdm import tnrange, tqdm_notebook

for i in tnrange(10, desc='Create Estimates'):
    est = make_estimate_replicates(puma_ras)
    estimates.append(est.est.sum())
    
sim_pop_est = pd.Series(estimates).mean()
sim_pop_est, pd.Series(estimates).std() * 1.645



In [ ]:

    
est = make_estimate_replicates(puma_ras, sd_controlled_pop)
est.head()



In [ ]:

    
len(est)



In [ ]:

    
rasp = pkg.reference('rasp_diabetes').dataframe()
rasp



In [ ]: