notebook.community

Edit and run



In [1]:

    
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from tqdm import tqdm_notebook

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init() # Initialize indexes and downloder context



In [2]:

    
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg









    Out[2]:




Small Area Estimation For San Diego
sandiegodata.org-pums_sae-1 Last Update: 2018-12-01T05:59:19
Processing ACS PUMS and tracts for small area estimation
Contacts

Wrangler Eric Busboom, Civic Knowledge

Resources

 column_dimensions. Links Age, Sex, Poverty status to column ids.
 rasp_replicates. Replicates of race, age, sex and poverty estimates.

References

race_hisp_pums. B03002_002 HISPANIC OR LATINO ORIGIN BY RACE, PUMS
race_hisp_tracts. B03002_002 HISPANIC OR LATINO ORIGIN BY RACE, tracts
ca_tract_boundaries. California Tract boundaries
ca_pums_boundaries. California PUMS boundaries
rasp_diabetes. Diabetes probabilities, by age, race, sex and poverty status
sex_age_white_tracts. B01001A: Sex by Age (White Alone)
sex_age_white_pums. B01001A: Sex by Age (White Alone)
race_iterations. Race iteration table suffixes
sandiego_puma_tract. Pumas and associated tracts, for San Diego County
pov_age. Table B17020: Poverty Status by Age
pov_sex_age_template. Template for Table B17001: Poverty Status by Sex by Age
pov. Table B17002: Ratio of Income to Poverty Level
age_pov. Table B17024: Age by Ratio of Income to Poverty Level



In [3]:

    
race_iterations = pkg.reference('race_iterations').dataframe()

# List of PUMS in San Giego county, so we can restrict census data to just SD County
sandiego_puma_tract =  pkg.reference('sandiego_puma_tract').dataframe()
sd_pumas = list(sandiego_puma_tract.puma_geoid.unique())
sd_tracts = list(sandiego_puma_tract.tract_geoid.unique())
sd_puma_tracts = sd_pumas + sd_tracts



In [4]:

    
# Get the sum of the population of all PUMS
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()

# Get the controlled population for SD county. THe margin for this estimate is zero
# because it's tied to administrative data. 
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001 

# Apparently, the PUMA population is also controlles. 
assert sd_puma_pop == sd_controlled_pop

# However, the sums from the poverty status by sex by age table is not
# controlled ( and would not expect it to be ) so it's a different number. 
#
# Interestingly, the controlled population number is not with in the 90% margins of 
# the sum of the table esimates. 
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()









    Out[4]:





(3172544, -80812, 9788.614048985688)



In [5]:

    
def make_col_map(t):
    from collections import namedtuple

    sex = None
    age = None
    pov = 'all'

    def parse_age(v):

        if v and 'year' in v:
            parts = v.split()
            if parts[0] == '75':
                return  pd.Interval(left=75, right=120, closed='both')
            elif len(parts) == 4:
                return  pd.Interval(left=int(parts[0]), right=int(parts[2]), closed='both') 

        return None

    col_map = {}

    Dimensions = namedtuple('Dimensions', ['sex', 'age_range', 'pov'])

    for e in t.table.columns:

        sd = e.short_description
        if '_m90' in e.unique_id:
            continue
        if sd:
            if 'Male' in sd:
                sex = 'male'
            elif 'Female' in sd:
                sex = 'female'
            elif 'Total' in sd:
                sex = 'total'
            elif 'below poverty level' in sd:
                pov = 'pov'
            elif 'above poverty level' in sd:
                pov = 'npov'
            
        col_map[e.unique_id] = Dimensions( sex, parse_age(sd), pov)

    return col_map



In [6]:

    
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})
psa = url.dataframe()



In [7]:

    
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})

psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]



In [8]:

    
t = psa[psa.index.isin(list(sandiego_puma_tract.puma_geoid))].drop(columns=['STUSAB', 'COUNTY','NAME'])
t = t.stack().to_frame().reset_index()
len(psa)









    Out[8]:





628



In [9]:

    
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})

psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]

assert len(psa) > 0

col_map= make_col_map(psa)
ages = list(sorted(set(e.age_range  for e in col_map.values() if e.age_range is not None)))
ages









    Out[9]:





[Interval(6, 11, closed='both'),
 Interval(12, 14, closed='both'),
 Interval(16, 17, closed='both'),
 Interval(18, 24, closed='both'),
 Interval(25, 34, closed='both'),
 Interval(35, 44, closed='both'),
 Interval(45, 54, closed='both'),
 Interval(55, 64, closed='both'),
 Interval(65, 74, closed='both'),
 Interval(75, 120, closed='both')]



In [262]:

    
def get_raceeth(pkg, re_code, re_name):
    """Get a single race/ethnicity iteration, and process it so 
    it can be merged with others. """
    
    pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
    url = pov_sex_age_url.interpolate(context={'iteration':re_code.strip()})
    
    psa = url.dataframe()
    psa = psa[psa.index.isin(sd_puma_tracts)]
    
    assert len(psa) > 0
    
    col_map= make_col_map(psa)
    
    t = psa.drop(columns=['STUSAB', 'COUNTY','NAME'])
    t = t.stack().to_frame().reset_index()
    t.columns = ['geoid','col_name','value']
    
    t['m_type'] = t.col_name.apply(lambda v: 'margin' if '_m90' in v else 'est')
    t['col_name'] = t.col_name.apply(lambda v: v.replace('_m90',''))
    t = t.set_index(['geoid','col_name','m_type']).unstack()
    t['sex'] = [col_map[str(i[1]).replace('_m90','')].sex for i,r in t.iterrows() ]
    t['raceeth'] = re_name
    t['age_range'] = [col_map[str(i[1]).replace('_m90','')].age_range for i,r in t.iterrows() ]
    t['pov'] = [col_map[str(i[1]).replace('_m90','')].pov for i,r in t.iterrows() ]

    t.columns = [ '_'.join(e) if e[1] else e[0] for e in zip(*zip(*t.columns)) ]

    t['value_std'] = t.value_margin/1.645
    
    return t[['value_est',  'value_margin', 'value_std', 'sex', 'raceeth', 'age_range', 'pov']]

def concat_race_eth(pkg):
    
    
    frames= []

    tn = tqdm_notebook(list(race_iterations.itertuples()))
    
    for e in tn:
        tn.set_description(e.raceeth)
        frames.append(get_raceeth(pkg, e.code, e.raceeth))
 
    return pd.concat(frames)

df = concat_race_eth(pkg)

# Select just the non-overlaping entries
df = df[(df.sex != 'both') & (df.pov != 'all')& (df.raceeth != 'white') & (~df.age_range.isnull())].copy()

assert df.value_est.sum() > 3_100_000 # Ought to be close to CA population 

df['raceeth'] = df['raceeth'].astype('category')
df['raceeth'].cat.rename_categories({
    'hnwhite':'white',
    'hisp':'latino',
}, inplace=True)
df['raceeth'].replace({
    'aian':'other', 
    'nhopi':'other', 
    'many':'other'}, inplace=True)

control_ratio = sd_controlled_pop/df.value_est.sum()
df['value_est_controlled'] = (df.value_est * control_ratio).round().astype(int)

df['age_min'] = df.age_range.apply(lambda v: v.left)
df['age_max'] = df.age_range.apply(lambda v: v.right)

df.head()









    





 
 










    Out[262]:







  
    
      
      
      value_est
      value_margin
      value_std
      sex
      raceeth
      age_range
      pov
    
    
      geoid
      col_name
      
      
      
      
      
      
      
    
  
  
    
      14000US06073000100
      B17001B_006
      0
      12
      7.294833
      male
      black
      [6, 11]
      pov
    
    
      B17001B_007
      0
      12
      7.294833
      male
      black
      [12, 14]
      pov
    
    
      B17001B_009
      0
      12
      7.294833
      male
      black
      [16, 17]
      pov
    
    
      B17001B_010
      0
      12
      7.294833
      male
      black
      [18, 24]
      pov
    
    
      B17001B_011
      0
      12
      7.294833
      male
      black
      [25, 34]
      pov



In [274]:

    
df18 = df[df.age_min >= 18]



In [270]:

    
len(df), df.value_est.sum(), df.value_est_controlled.sum(), sd_controlled_pop









    Out[270]:





(200960, 3127844, 3251386, 3253356)



In [275]:

    
len(df), df18.value_est.sum(), df18.value_est_controlled.sum()









    Out[275]:





(200960, 2643923, 2748648)



In [13]:

    
from numpy.random import normal

def yield_estimate_replicates(df):
    for n, (i,r) in enumerate(df.reset_index().iterrows()):

        s = int(normal(loc=r.value_est, scale=r.value_std))
        s = s if s > 0 else 0

        col_no = int(r.col_name.split('_')[1])
        
        yield r.geoid, r.col_name, s
        
def make_estimate_replicates(df, controll_sum = None):
    
    t =  pd.DataFrame.from_records(list(yield_estimate_replicates(df)), columns = 'geoid col_no est'.split())
    
    if controll_sum:
        total = t.est.sum()
        ratio = controll_sum/total
        t['controlled_est'] = (t.est*ratio).round(0).astype(int)
    
    return t

# Create some estimate replicates, so se can calc the std dev
estimates = []

from tqdm import tnrange, tqdm_notebook

N = 4 # Should be larger in production

for i in tnrange(N, desc='Create Estimates'):
    est = make_estimate_replicates(df18, sd_controlled_pop)
    est['replicate'] = i
    estimates.append(est)
    
replicates = pd.concat(estimates)



In [248]:

    
replicates[replicates.replicate == 0].head()









    Out[248]:







  
    
      
      geoid
      col_no
      est
      controlled_est
      replicate
    
  
  
    
      0
      14000US06073000100
      B17001B_006
      0
      0
      0
    
    
      1
      14000US06073000100
      B17001B_007
      0
      0
      0
    
    
      2
      14000US06073000100
      B17001B_009
      0
      0
      0
    
    
      3
      14000US06073000100
      B17001B_010
      9
      8
      0
    
    
      4
      14000US06073000100
      B17001B_011
      0
      0
      0



In [265]:

    
# Total population estimate
sd_controlled_pop, replicates[replicates.replicate == 0].controlled_est.sum(), replicates[replicates.replicate == 0].est.sum()









    Out[265]:





(3253356, 3262145, 3614458)



In [ ]:

    
#def yield_col_map():
#    for k, v in col_map.items():
#        if v.sex and v.age_range:
#            yield k, v.sex, v.age_range.left, v.age_range.right, v.pov
    

#col_map_df = pd.DataFrame.from_records(list(yield_col_map()), columns = 'col_id sex age_min age_max pov'.split())



In [260]:

    
rasp = pkg.reference('rasp_diabetes').dataframe()
rasp['sex'] = rasp['sex'].astype('category')
rasp.sex.cat.rename_categories({0:'female', 1:'male'}, inplace=True)
rasp['raceeth'] = rasp['raceeth'].astype('category')
rasp['pov'] = rasp['pov'].astype('category')
rasp.pov.cat.rename_categories({0:'npov', 1:'pov'}, inplace=True)
rasp = rasp[rasp.age > 18].copy()
print(len(rasp))
rasp.head()



In [278]:

    
len(df18)









    Out[278]:





140672



In [282]:



In [297]:

    
t = df18.reset_index().merge(rasp.reset_index(), on=['raceeth','age_min','pov','sex'], how='left')
t['value_est'] = t['value_est_controlled']
t = t[['geoid', 'value_est', 'value_margin', 'sex', 'raceeth', 'age_min', 'pov', 'index', 'age',  'prob']].copy()

print(len(df), len(rasp), len(t))
tract_probs = t.reset_index().copy()
tract_probs.head()









    



200960 140 140672






    Out[297]:







  
    
      
      level_0
      geoid
      value_est
      value_margin
      sex
      raceeth
      age_min
      pov
      index
      age
      prob
    
  
  
    
      0
      0
      14000US06073000100
      0
      12
      male
      black
      18
      pov
      62
      21.0
      0.066237
    
    
      1
      1
      14000US06073000100
      0
      12
      male
      black
      25
      pov
      82
      29.5
      0.094178
    
    
      2
      2
      14000US06073000100
      0
      12
      male
      black
      35
      pov
      102
      39.5
      0.140174
    
    
      3
      3
      14000US06073000100
      0
      12
      male
      black
      45
      pov
      122
      49.5
      0.203585
    
    
      4
      4
      14000US06073000100
      0
      12
      male
      black
      55
      pov
      142
      59.5
      0.286135



In [298]:

    
t = tract_probs[['value_est','value_margin', 'prob']].copy()

n = np.random.normal(t.value_est,  t.value_margin/1.645).round(0).astype(int)
n = np.where(n>0,n,0)

# There are a whole lot of estimates of 0, and the ramdom sampling process above gives half9 of them 
# a non-zero value. 
n = np.where(t.value_est==0,0,n)

t['n'] = n
    
t['samples'] =t.n.apply(lambda e : np.random.random_sample(e) )

t['count'] = t.apply(lambda r: (r.samples<r.prob).sum(), axis=1)
 
# That random sample process above works out, for large N, to be nearly exactly the same as:
# However, the sampling process would allow for creating variances. 
t['est_prob'] = t['value_est'] * t['prob']
    
print(len(t)   )

t.head()









    



140672






    Out[298]:







  
    
      
      value_est
      value_margin
      prob
      n
      samples
      count
      est_prob
    
  
  
    
      0
      0
      12
      0.066237
      0
      []
      0
      0.0
    
    
      1
      0
      12
      0.094178
      0
      []
      0
      0.0
    
    
      2
      0
      12
      0.140174
      0
      []
      0
      0.0
    
    
      3
      0
      12
      0.203585
      0
      []
      0
      0.0
    
    
      4
      0
      12
      0.286135
      0
      []
      0
      0.0



In [299]:

    
t.value_est.sum(), len(t)









    Out[299]:





(2748648, 140672)



In [300]:

    
t['est_prob'] = t['value_est'] * t['prob']
t['est_prob'].sum()/t['value_est'].sum()









    Out[300]:





0.11146276151814759



In [301]:

    
t['count'].sum() /t['value_est'].sum()









    Out[301]:





0.11385706718357534



In [302]:

    
(t['value_est']*t['prob']).sum()/t['value_est'].sum()









    Out[302]:





0.11146276151814759



In [303]:

    
t.value_est.sum()









    Out[303]:





2748648



In [291]:

    
tract_probs[tract_probs.raceeth == 'white'].head()









    Out[291]:







  
    
      
      level_0
      geoid
      value_est
      value_margin
      sex
      raceeth
      age_min
      pov
      index
      age
      prob
    
  
  
    
      105504
      105504
      14000US06073000100
      0
      12
      male
      white
      18
      pov
      60
      21.0
      0.042474
    
    
      105505
      105505
      14000US06073000100
      0
      12
      male
      white
      25
      pov
      80
      29.5
      0.061047
    
    
      105506
      105506
      14000US06073000100
      0
      12
      male
      white
      35
      pov
      100
      39.5
      0.092514
    
    
      105507
      105507
      14000US06073000100
      11
      18
      male
      white
      45
      pov
      120
      49.5
      0.137821
    
    
      105508
      105508
      14000US06073000100
      0
      12
      male
      white
      55
      pov
      140
      59.5
      0.200416



In [ ]:

		value_est	value_margin	value_std	sex	raceeth	age_range	pov
geoid	col_name
14000US06073000100	B17001B_006	0	12	7.294833	male	black	[6, 11]	pov
	B17001B_007	0	12	7.294833	male	black	[12, 14]	pov
	B17001B_009	0	12	7.294833	male	black	[16, 17]	pov
	B17001B_010	0	12	7.294833	male	black	[18, 24]	pov
	B17001B_011	0	12	7.294833	male	black	[25, 34]	pov

	age	age_min	age_max	pov	sex	raceeth	prob
60	21.0	18	24	pov	male	white	0.042474
61	21.0	18	24	pov	male	asian	0.029804
62	21.0	18	24	pov	male	black	0.066237
63	21.0	18	24	pov	male	latino	0.072074
64	21.0	18	24	pov	male	other	0.054269

	value_margin	prob	samples
0	12	0.066237	[]
1	12	0.094178	[]
2	12	0.140174	[]
3	12	0.203585	[]
4	12	0.286135	[]

	level_0	geoid	value_est	value_margin	sex	raceeth	age_min	pov	index	age	prob
105504	105504	14000US06073000100	0	12	male	white	18	pov	60	21.0	0.042474
105505	105505	14000US06073000100	0	12	male	white	25	pov	80	29.5	0.061047
105506	105506	14000US06073000100	0	12	male	white	35	pov	100	39.5	0.092514
105507	105507	14000US06073000100	11	18	male	white	45	pov	120	49.5	0.137821
105508	105508	14000US06073000100	0	12	male	white	55	pov	140	59.5	0.200416