In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from tqdm import tqdm_notebook

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init() # Initialize indexes and downloader context

In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg


Out[2]:

Small Area Estimation For San Diego

sandiegodata.org-pums_sae-1 Last Update: 2018-12-01T05:59:19

Processing ACS PUMS and tracts for small area estimation


In [3]:
race_iterations = pkg.reference('race_iterations').dataframe()

# List of PUMAs and tracts in San Diego County, so we can restrict census data to just SD County
sandiego_puma_tract =  pkg.reference('sandiego_puma_tract').dataframe()
sd_pumas = list(sandiego_puma_tract.puma_geoid.unique())
sd_tracts = list(sandiego_puma_tract.tract_geoid.unique())
sd_puma_tracts = sd_pumas + sd_tracts

In [4]:
# Get the sum of the population of all PUMAs in San Diego County
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()

# Get the controlled population for SD county. The margin for this estimate is zero
# because it's tied to administrative data.
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001

# Apparently, the PUMA population is also controlled.
assert sd_puma_pop == sd_controlled_pop

# However, the sum from the poverty status by sex by age table is not
# controlled (and we would not expect it to be), so it's a different number.
#
# Interestingly, the controlled population number is not within the 90% margin of
# the sum of the table estimates.
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()


Out[4]:
(3172544, -80812, 9788.614048985688)
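As a point of reference, the ACS convention for the 90% margin of error of a sum of estimates is the root sum of squares of the individual cell margins, which is presumably what sum_m90() computes. A minimal sketch of that calculation, assuming the per-PUMA margin column follows the usual B17001_001_m90 naming convention (an assumption, not verified against the census dataframe API):

import numpy as np

def moe_of_sum(margins):
    # Root-sum-of-squares approximation for the 90% MOE of a sum of ACS estimates.
    return float(np.sqrt((np.asarray(margins, dtype=float) ** 2).sum()))

# Hypothetical check against the table loaded above:
# moe_of_sum(t['B17001_001_m90'])  # should roughly match t.B17001_001.sum_m90()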

In [270]:
len(df), df.value_est.sum(), df.value_est_controlled.sum(), sd_controlled_pop


Out[270]:
(200960, 3127844, 3251386, 3253356)

In [275]:
len(df), df18.value_est.sum(), df18.value_est_controlled.sum()


Out[275]:
(200960, 2643923, 2748648)

In [13]:
from numpy.random import normal

def yield_estimate_replicates(df):
    for n, (i,r) in enumerate(df.reset_index().iterrows()):

        s = int(normal(loc=r.value_est, scale=r.value_std))
        s = s if s > 0 else 0

        # Note: the 'col_no' column downstream actually holds the full column name, e.g. 'B17001B_006'
        yield r.geoid, r.col_name, s
        
def make_estimate_replicates(df, control_sum=None):
    
    t = pd.DataFrame.from_records(list(yield_estimate_replicates(df)), columns='geoid col_no est'.split())
    
    if control_sum:
        total = t.est.sum()
        ratio = control_sum / total
        t['controlled_est'] = (t.est * ratio).round(0).astype(int)
    
    return t

# Create some estimate replicates, so we can calculate the std dev
estimates = []

from tqdm import tnrange, tqdm_notebook

N = 4 # Should be larger in production

for i in tnrange(N, desc='Create Estimates'):
    est = make_estimate_replicates(df18, sd_controlled_pop)
    est['replicate'] = i
    estimates.append(est)
    
replicates = pd.concat(estimates)
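With the replicates stacked, a per-cell standard deviation follows from grouping on geoid and column; a minimal sketch (est_sd is just an illustrative name):

# Spread of the simulated estimates across replicates; with N=4 this is very rough,
# so N should be raised before using these values for real margins.
est_sd = replicates.groupby(['geoid', 'col_no']).est.std().rename('est_sd')
est_sd.head()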



In [248]:
replicates[replicates.replicate == 0].head()


Out[248]:
geoid col_no est controlled_est replicate
0 14000US06073000100 B17001B_006 0 0 0
1 14000US06073000100 B17001B_007 0 0 0
2 14000US06073000100 B17001B_009 0 0 0
3 14000US06073000100 B17001B_010 9 8 0
4 14000US06073000100 B17001B_011 0 0 0

In [265]:
# Total population estimate
sd_controlled_pop, replicates[replicates.replicate == 0].controlled_est.sum(), replicates[replicates.replicate == 0].est.sum()


Out[265]:
(3253356, 3262145, 3614458)
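The controlled replicate sum (3,262,145) misses sd_controlled_pop (3,253,356) because each cell is rounded independently after scaling. If an exact match to the control were needed, one option is largest-remainder rounding; a sketch under that assumption (not part of the original workflow; round_to_control is a hypothetical helper):

import numpy as np

def round_to_control(values, control_sum):
    # Scale to the control total, floor, then hand the leftover units to the
    # cells with the largest fractional parts so the integer sum matches exactly.
    v = np.asarray(values, dtype=float)
    scaled = v * (control_sum / v.sum())
    floored = np.floor(scaled).astype(int)
    leftover = int(control_sum - floored.sum())
    winners = np.argsort(scaled - floored)[::-1][:leftover]
    floored[winners] += 1
    return floored

# e.g. est['controlled_est'] = round_to_control(est.est, sd_controlled_pop)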

In [ ]:
#def yield_col_map():
#    for k, v in col_map.items():
#        if v.sex and v.age_range:
#            yield k, v.sex, v.age_range.left, v.age_range.right, v.pov
    

#col_map_df = pd.DataFrame.from_records(list(yield_col_map()), columns = 'col_id sex age_min age_max pov'.split())

In [260]:
# Reference table of diabetes probabilities by demographic cell (race/ethnicity, age, sex, poverty)
rasp = pkg.reference('rasp_diabetes').dataframe()
rasp['sex'] = rasp['sex'].astype('category')
rasp.sex.cat.rename_categories({0:'female', 1:'male'}, inplace=True)
rasp['raceeth'] = rasp['raceeth'].astype('category')
rasp['pov'] = rasp['pov'].astype('category')
rasp.pov.cat.rename_categories({0:'npov', 1:'pov'}, inplace=True)
rasp = rasp[rasp.age > 18].copy()
print(len(rasp))
rasp.head()


140
Out[260]:
age age_min age_max pov sex raceeth prob
60 21.0 18 24 pov male white 0.042474
61 21.0 18 24 pov male asian 0.029804
62 21.0 18 24 pov male black 0.066237
63 21.0 18 24 pov male latino 0.072074
64 21.0 18 24 pov male other 0.054269

In [278]:
len(df18)


Out[278]:
140672


In [297]:
# Join the tract-level estimates to the diabetes probabilities on the demographic cell keys
t = df18.reset_index().merge(rasp.reset_index(), on=['raceeth','age_min','pov','sex'], how='left')
t['value_est'] = t['value_est_controlled'] # use the controlled estimates going forward
t = t[['geoid', 'value_est', 'value_margin', 'sex', 'raceeth', 'age_min', 'pov', 'index', 'age', 'prob']].copy()

print(len(df), len(rasp), len(t))
tract_probs = t.reset_index().copy()
tract_probs.head()


200960 140 140672
Out[297]:
level_0 geoid value_est value_margin sex raceeth age_min pov index age prob
0 0 14000US06073000100 0 12 male black 18 pov 62 21.0 0.066237
1 1 14000US06073000100 0 12 male black 25 pov 82 29.5 0.094178
2 2 14000US06073000100 0 12 male black 35 pov 102 39.5 0.140174
3 3 14000US06073000100 0 12 male black 45 pov 122 49.5 0.203585
4 4 14000US06073000100 0 12 male black 55 pov 142 59.5 0.286135
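Because this is a left merge, any demographic cell in df18 with no counterpart in rasp would carry a NaN probability; a quick sanity-check sketch (not part of the original run):

# Rows that failed to pick up a diabetes probability; ideally 0.
tract_probs.prob.isna().sum()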

In [298]:
t = tract_probs[['value_est','value_margin', 'prob']].copy()

# Draw a population count for each cell; dividing the 90% margin by 1.645 converts it to a standard deviation.
n = np.random.normal(t.value_est, t.value_margin/1.645).round(0).astype(int)
n = np.where(n>0,n,0)

# There are a whole lot of estimates of 0, and the random sampling process above gives
# about half of them a non-zero value, so force those back to zero.
n = np.where(t.value_est == 0, 0, n)

t['n'] = n
    
t['samples'] = t.n.apply(lambda e: np.random.random_sample(e))

t['count'] = t.apply(lambda r: (r.samples < r.prob).sum(), axis=1)
 
# For large N, the random sampling process above works out to be nearly exactly the same as
# multiplying the estimate by the probability directly; however, the sampling process also
# allows for estimating variances.
t['est_prob'] = t['value_est'] * t['prob']
    
print(len(t))

t.head()


140672
Out[298]:
value_est value_margin prob n samples count est_prob
0 0 12 0.066237 0 [] 0 0.0
1 0 12 0.094178 0 [] 0 0.0
2 0 12 0.140174 0 [] 0 0.0
3 0 12 0.203585 0 [] 0 0.0
4 0 12 0.286135 0 [] 0 0.0
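For each cell, counting how many of n uniform draws fall below prob is just a Binomial(n, prob) draw, so the per-row apply can be replaced with a single vectorized call; an equivalent-formulation sketch (count_binom is an illustrative column name):

# Vectorized version of the count simulation above; fillna(0) guards any cells
# whose probability did not match in the merge.
t['count_binom'] = np.random.binomial(t.n, t.prob.fillna(0))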

In [299]:
t.value_est.sum(), len(t)


Out[299]:
(2748648, 140672)

In [300]:
t['est_prob'] = t['value_est'] * t['prob']
t['est_prob'].sum()/t['value_est'].sum()


Out[300]:
0.11146276151814759

In [301]:
t['count'].sum() /t['value_est'].sum()


Out[301]:
0.11385706718357534

In [302]:
(t['value_est']*t['prob']).sum()/t['value_est'].sum()


Out[302]:
0.11146276151814759
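The ratios above are county-wide. The same expected-value calculation can be rolled up per tract, which is the small-area estimate this notebook is working toward; a sketch using tract_probs (tract_prev is just an illustrative name):

# Expected number of adults with diabetes per tract, over the tract's adult population estimate.
g = tract_probs.assign(expected=tract_probs.value_est * tract_probs.prob).groupby('geoid')
tract_prev = g.expected.sum() / g.value_est.sum()
tract_prev.head()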

In [303]:
t.value_est.sum()


Out[303]:
2748648

In [291]:
tract_probs[tract_probs.raceeth == 'white'].head()


Out[291]:
level_0 geoid value_est value_margin sex raceeth age_min pov index age prob
105504 105504 14000US06073000100 0 12 male white 18 pov 60 21.0 0.042474
105505 105505 14000US06073000100 0 12 male white 25 pov 80 29.5 0.061047
105506 105506 14000US06073000100 0 12 male white 35 pov 100 39.5 0.092514
105507 105507 14000US06073000100 11 18 male white 45 pov 120 49.5 0.137821
105508 105508 14000US06073000100 0 12 male white 55 pov 140 59.5 0.200416

In [ ]: