In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from tqdm import tqdm_notebook

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init() # Initialize indexes and downloder context

In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg


Out[2]:

Small Area Estimation For San Diego

sandiegodata.org-pums_sae-1 Last Update: 2018-12-01T05:59:19

Processing ACS PUMS and tracts for small area estimation

Contacts

Resources

References


In [3]:
race_iterations = pkg.reference('race_iterations').dataframe()

# List of PUMS in San Giego county, so we can restrict census data to just SD County
sandiego_puma_tract =  pkg.reference('sandiego_puma_tract').dataframe()
sd_pumas = list(sandiego_puma_tract.puma_geoid.unique())
sd_tracts = list(sandiego_puma_tract.tract_geoid.unique())
sd_puma_tracts = sd_pumas + sd_tracts

In [4]:
# Get the sum of the population of all PUMS
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()

# Get the controlled population for SD county. THe margin for this estimate is zero
# because it's tied to administrative data. 
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001 

# Apparently, the PUMA population is also controlles. 
assert sd_puma_pop == sd_controlled_pop

# However, the sums from the poverty status by sex by age table is not
# controlled ( and would not expect it to be ) so it's a different number. 
#
# Interestingly, the controlled population number is not with in the 90% margins of 
# the sum of the table esimates. 
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()


Out[4]:
(3172544, -80812, 9788.614048985688)

In [5]:
def make_col_map(t):
    from collections import namedtuple

    sex = None
    age = None
    pov = 'all'

    def parse_age(v):

        if v and 'year' in v:
            parts = v.split()
            if parts[0] == '75':
                return  pd.Interval(left=75, right=120, closed='both')
            elif len(parts) == 4:
                return  pd.Interval(left=int(parts[0]), right=int(parts[2]), closed='both') 

        return None

    col_map = {}

    Dimensions = namedtuple('Dimensions', ['sex', 'age_range', 'pov'])

    for e in t.table.columns:

        sd = e.short_description
        if '_m90' in e.unique_id:
            continue
        if sd:
            if 'Male' in sd:
                sex = 'male'
            elif 'Female' in sd:
                sex = 'female'
            elif 'Total' in sd:
                sex = 'total'
            elif 'below poverty level' in sd:
                pov = 'pov'
            elif 'above poverty level' in sd:
                pov = 'npov'
            
        col_map[e.unique_id] = Dimensions( sex, parse_age(sd), pov)

    return col_map

In [6]:
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})
psa = url.dataframe()

In [7]:
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})

psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]

In [8]:
t = psa[psa.index.isin(list(sandiego_puma_tract.puma_geoid))].drop(columns=['STUSAB', 'COUNTY','NAME'])
t = t.stack().to_frame().reset_index()
len(psa)


Out[8]:
628

In [9]:
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})

psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]

assert len(psa) > 0

col_map= make_col_map(psa)
ages = list(sorted(set(e.age_range  for e in col_map.values() if e.age_range is not None)))
ages


Out[9]:
[Interval(6, 11, closed='both'),
 Interval(12, 14, closed='both'),
 Interval(16, 17, closed='both'),
 Interval(18, 24, closed='both'),
 Interval(25, 34, closed='both'),
 Interval(35, 44, closed='both'),
 Interval(45, 54, closed='both'),
 Interval(55, 64, closed='both'),
 Interval(65, 74, closed='both'),
 Interval(75, 120, closed='both')]

In [262]:
def get_raceeth(pkg, re_code, re_name):
    """Get a single race/ethnicity iteration, and process it so 
    it can be merged with others. """
    
    pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
    url = pov_sex_age_url.interpolate(context={'iteration':re_code.strip()})
    
    psa = url.dataframe()
    psa = psa[psa.index.isin(sd_puma_tracts)]
    
    assert len(psa) > 0
    
    col_map= make_col_map(psa)
    
    t = psa.drop(columns=['STUSAB', 'COUNTY','NAME'])
    t = t.stack().to_frame().reset_index()
    t.columns = ['geoid','col_name','value']
    
    t['m_type'] = t.col_name.apply(lambda v: 'margin' if '_m90' in v else 'est')
    t['col_name'] = t.col_name.apply(lambda v: v.replace('_m90',''))
    t = t.set_index(['geoid','col_name','m_type']).unstack()
    t['sex'] = [col_map[str(i[1]).replace('_m90','')].sex for i,r in t.iterrows() ]
    t['raceeth'] = re_name
    t['age_range'] = [col_map[str(i[1]).replace('_m90','')].age_range for i,r in t.iterrows() ]
    t['pov'] = [col_map[str(i[1]).replace('_m90','')].pov for i,r in t.iterrows() ]

    t.columns = [ '_'.join(e) if e[1] else e[0] for e in zip(*zip(*t.columns)) ]

    t['value_std'] = t.value_margin/1.645
    
    return t[['value_est',  'value_margin', 'value_std', 'sex', 'raceeth', 'age_range', 'pov']]

def concat_race_eth(pkg):
    
    
    frames= []

    tn = tqdm_notebook(list(race_iterations.itertuples()))
    
    for e in tn:
        tn.set_description(e.raceeth)
        frames.append(get_raceeth(pkg, e.code, e.raceeth))
 
    return pd.concat(frames)

df = concat_race_eth(pkg)

# Select just the non-overlaping entries
df = df[(df.sex != 'both') & (df.pov != 'all')& (df.raceeth != 'white') & (~df.age_range.isnull())].copy()

assert df.value_est.sum() > 3_100_000 # Ought to be close to CA population 

df['raceeth'] = df['raceeth'].astype('category')
df['raceeth'].cat.rename_categories({
    'hnwhite':'white',
    'hisp':'latino',
}, inplace=True)
df['raceeth'].replace({
    'aian':'other', 
    'nhopi':'other', 
    'many':'other'}, inplace=True)

control_ratio = sd_controlled_pop/df.value_est.sum()
df['value_est_controlled'] = (df.value_est * control_ratio).round().astype(int)

df['age_min'] = df.age_range.apply(lambda v: v.left)
df['age_max'] = df.age_range.apply(lambda v: v.right)

df.head()


Out[262]:
value_est value_margin value_std sex raceeth age_range pov
geoid col_name
14000US06073000100 B17001B_006 0 12 7.294833 male black [6, 11] pov
B17001B_007 0 12 7.294833 male black [12, 14] pov
B17001B_009 0 12 7.294833 male black [16, 17] pov
B17001B_010 0 12 7.294833 male black [18, 24] pov
B17001B_011 0 12 7.294833 male black [25, 34] pov

In [274]:
df18 = df[df.age_min >= 18]

In [270]:
len(df), df.value_est.sum(), df.value_est_controlled.sum(), sd_controlled_pop


Out[270]:
(200960, 3127844, 3251386, 3253356)

In [275]:
len(df), df18.value_est.sum(), df18.value_est_controlled.sum()


Out[275]:
(200960, 2643923, 2748648)

In [13]:
from numpy.random import normal

def yield_estimate_replicates(df):
    for n, (i,r) in enumerate(df.reset_index().iterrows()):

        s = int(normal(loc=r.value_est, scale=r.value_std))
        s = s if s > 0 else 0

        col_no = int(r.col_name.split('_')[1])
        
        yield r.geoid, r.col_name, s
        
def make_estimate_replicates(df, controll_sum = None):
    
    t =  pd.DataFrame.from_records(list(yield_estimate_replicates(df)), columns = 'geoid col_no est'.split())
    
    if controll_sum:
        total = t.est.sum()
        ratio = controll_sum/total
        t['controlled_est'] = (t.est*ratio).round(0).astype(int)
    
    return t

# Create some estimate replicates, so se can calc the std dev
estimates = []

from tqdm import tnrange, tqdm_notebook

N = 4 # Should be larger in production

for i in tnrange(N, desc='Create Estimates'):
    est = make_estimate_replicates(df18, sd_controlled_pop)
    est['replicate'] = i
    estimates.append(est)
    
replicates = pd.concat(estimates)



In [248]:
replicates[replicates.replicate == 0].head()


Out[248]:
geoid col_no est controlled_est replicate
0 14000US06073000100 B17001B_006 0 0 0
1 14000US06073000100 B17001B_007 0 0 0
2 14000US06073000100 B17001B_009 0 0 0
3 14000US06073000100 B17001B_010 9 8 0
4 14000US06073000100 B17001B_011 0 0 0

In [265]:
# Total population estimate
sd_controlled_pop, replicates[replicates.replicate == 0].controlled_est.sum(), replicates[replicates.replicate == 0].est.sum()


Out[265]:
(3253356, 3262145, 3614458)

In [ ]:
#def yield_col_map():
#    for k, v in col_map.items():
#        if v.sex and v.age_range:
#            yield k, v.sex, v.age_range.left, v.age_range.right, v.pov
    

#col_map_df = pd.DataFrame.from_records(list(yield_col_map()), columns = 'col_id sex age_min age_max pov'.split())

In [260]:
rasp = pkg.reference('rasp_diabetes').dataframe()
rasp['sex'] = rasp['sex'].astype('category')
rasp.sex.cat.rename_categories({0:'female', 1:'male'}, inplace=True)
rasp['raceeth'] = rasp['raceeth'].astype('category')
rasp['pov'] = rasp['pov'].astype('category')
rasp.pov.cat.rename_categories({0:'npov', 1:'pov'}, inplace=True)
rasp = rasp[rasp.age > 18].copy()
print(len(rasp))
rasp.head()


140
Out[260]:
age age_min age_max pov sex raceeth prob
60 21.0 18 24 pov male white 0.042474
61 21.0 18 24 pov male asian 0.029804
62 21.0 18 24 pov male black 0.066237
63 21.0 18 24 pov male latino 0.072074
64 21.0 18 24 pov male other 0.054269

In [278]:
len(df18)


Out[278]:
140672

In [282]:



Out[282]:
age age_min age_max pov sex raceeth prob
60 21.0 18 24 pov male white 0.042474
61 21.0 18 24 pov male asian 0.029804
62 21.0 18 24 pov male black 0.066237
63 21.0 18 24 pov male latino 0.072074
64 21.0 18 24 pov male other 0.054269

In [297]:
t = df18.reset_index().merge(rasp.reset_index(), on=['raceeth','age_min','pov','sex'], how='left')
t['value_est'] = t['value_est_controlled']
t = t[['geoid', 'value_est', 'value_margin', 'sex', 'raceeth', 'age_min', 'pov', 'index', 'age',  'prob']].copy()

print(len(df), len(rasp), len(t))
tract_probs = t.reset_index().copy()
tract_probs.head()


200960 140 140672
Out[297]:
level_0 geoid value_est value_margin sex raceeth age_min pov index age prob
0 0 14000US06073000100 0 12 male black 18 pov 62 21.0 0.066237
1 1 14000US06073000100 0 12 male black 25 pov 82 29.5 0.094178
2 2 14000US06073000100 0 12 male black 35 pov 102 39.5 0.140174
3 3 14000US06073000100 0 12 male black 45 pov 122 49.5 0.203585
4 4 14000US06073000100 0 12 male black 55 pov 142 59.5 0.286135

In [298]:
t = tract_probs[['value_est','value_margin', 'prob']].copy()

n = np.random.normal(t.value_est,  t.value_margin/1.645).round(0).astype(int)
n = np.where(n>0,n,0)

# There are a whole lot of estimates of 0, and the ramdom sampling process above gives half9 of them 
# a non-zero value. 
n = np.where(t.value_est==0,0,n)

t['n'] = n
    
t['samples'] =t.n.apply(lambda e : np.random.random_sample(e) )

t['count'] = t.apply(lambda r: (r.samples<r.prob).sum(), axis=1)
 
# That random sample process above works out, for large N, to be nearly exactly the same as:
# However, the sampling process would allow for creating variances. 
t['est_prob'] = t['value_est'] * t['prob']
    
print(len(t)   )

t.head()


140672
Out[298]:
value_est value_margin prob n samples count est_prob
0 0 12 0.066237 0 [] 0 0.0
1 0 12 0.094178 0 [] 0 0.0
2 0 12 0.140174 0 [] 0 0.0
3 0 12 0.203585 0 [] 0 0.0
4 0 12 0.286135 0 [] 0 0.0

In [299]:
t.value_est.sum(), len(t)


Out[299]:
(2748648, 140672)

In [300]:
t['est_prob'] = t['value_est'] * t['prob']
t['est_prob'].sum()/t['value_est'].sum()


Out[300]:
0.11146276151814759

In [301]:
t['count'].sum() /t['value_est'].sum()


Out[301]:
0.11385706718357534

In [302]:
(t['value_est']*t['prob']).sum()/t['value_est'].sum()


Out[302]:
0.11146276151814759

In [303]:
t.value_est.sum()


Out[303]:
2748648

In [291]:
tract_probs[tract_probs.raceeth == 'white'].head()


Out[291]:
level_0 geoid value_est value_margin sex raceeth age_min pov index age prob
105504 105504 14000US06073000100 0 12 male white 18 pov 60 21.0 0.042474
105505 105505 14000US06073000100 0 12 male white 25 pov 80 29.5 0.061047
105506 105506 14000US06073000100 0 12 male white 35 pov 100 39.5 0.092514
105507 105507 14000US06073000100 11 18 male white 45 pov 120 49.5 0.137821
105508 105508 14000US06073000100 0 12 male white 55 pov 140 59.5 0.200416

In [ ]: