In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from tqdm import tqdm_notebook
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init() # Initialize indexes and downloder context
In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg
Out[2]:
In [3]:
race_iterations = pkg.reference('race_iterations').dataframe()
# List of PUMS in San Giego county, so we can restrict census data to just SD County
sandiego_puma_tract = pkg.reference('sandiego_puma_tract').dataframe()
sd_pumas = list(sandiego_puma_tract.puma_geoid.unique())
sd_tracts = list(sandiego_puma_tract.tract_geoid.unique())
sd_puma_tracts = sd_pumas + sd_tracts
In [4]:
# Get the sum of the population of all PUMS
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()
# Get the controlled population for SD county. THe margin for this estimate is zero
# because it's tied to administrative data.
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001
# Apparently, the PUMA population is also controlles.
assert sd_puma_pop == sd_controlled_pop
# However, the sums from the poverty status by sex by age table is not
# controlled ( and would not expect it to be ) so it's a different number.
#
# Interestingly, the controlled population number is not with in the 90% margins of
# the sum of the table esimates.
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()
Out[4]:
In [5]:
def make_col_map(t):
from collections import namedtuple
sex = None
age = None
pov = 'all'
def parse_age(v):
if v and 'year' in v:
parts = v.split()
if parts[0] == '75':
return pd.Interval(left=75, right=120, closed='both')
elif len(parts) == 4:
return pd.Interval(left=int(parts[0]), right=int(parts[2]), closed='both')
return None
col_map = {}
Dimensions = namedtuple('Dimensions', ['sex', 'age_range', 'pov'])
for e in t.table.columns:
sd = e.short_description
if '_m90' in e.unique_id:
continue
if sd:
if 'Male' in sd:
sex = 'male'
elif 'Female' in sd:
sex = 'female'
elif 'Total' in sd:
sex = 'total'
elif 'below poverty level' in sd:
pov = 'pov'
elif 'above poverty level' in sd:
pov = 'npov'
col_map[e.unique_id] = Dimensions( sex, parse_age(sd), pov)
return col_map
In [6]:
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})
psa = url.dataframe()
In [7]:
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})
psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]
In [8]:
t = psa[psa.index.isin(list(sandiego_puma_tract.puma_geoid))].drop(columns=['STUSAB', 'COUNTY','NAME'])
t = t.stack().to_frame().reset_index()
len(psa)
Out[8]:
In [9]:
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':'A'})
psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]
assert len(psa) > 0
col_map= make_col_map(psa)
ages = list(sorted(set(e.age_range for e in col_map.values() if e.age_range is not None)))
ages
Out[9]:
In [262]:
def get_raceeth(pkg, re_code, re_name):
"""Get a single race/ethnicity iteration, and process it so
it can be merged with others. """
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':re_code.strip()})
psa = url.dataframe()
psa = psa[psa.index.isin(sd_puma_tracts)]
assert len(psa) > 0
col_map= make_col_map(psa)
t = psa.drop(columns=['STUSAB', 'COUNTY','NAME'])
t = t.stack().to_frame().reset_index()
t.columns = ['geoid','col_name','value']
t['m_type'] = t.col_name.apply(lambda v: 'margin' if '_m90' in v else 'est')
t['col_name'] = t.col_name.apply(lambda v: v.replace('_m90',''))
t = t.set_index(['geoid','col_name','m_type']).unstack()
t['sex'] = [col_map[str(i[1]).replace('_m90','')].sex for i,r in t.iterrows() ]
t['raceeth'] = re_name
t['age_range'] = [col_map[str(i[1]).replace('_m90','')].age_range for i,r in t.iterrows() ]
t['pov'] = [col_map[str(i[1]).replace('_m90','')].pov for i,r in t.iterrows() ]
t.columns = [ '_'.join(e) if e[1] else e[0] for e in zip(*zip(*t.columns)) ]
t['value_std'] = t.value_margin/1.645
return t[['value_est', 'value_margin', 'value_std', 'sex', 'raceeth', 'age_range', 'pov']]
def concat_race_eth(pkg):
frames= []
tn = tqdm_notebook(list(race_iterations.itertuples()))
for e in tn:
tn.set_description(e.raceeth)
frames.append(get_raceeth(pkg, e.code, e.raceeth))
return pd.concat(frames)
df = concat_race_eth(pkg)
# Select just the non-overlaping entries
df = df[(df.sex != 'both') & (df.pov != 'all')& (df.raceeth != 'white') & (~df.age_range.isnull())].copy()
assert df.value_est.sum() > 3_100_000 # Ought to be close to CA population
df['raceeth'] = df['raceeth'].astype('category')
df['raceeth'].cat.rename_categories({
'hnwhite':'white',
'hisp':'latino',
}, inplace=True)
df['raceeth'].replace({
'aian':'other',
'nhopi':'other',
'many':'other'}, inplace=True)
control_ratio = sd_controlled_pop/df.value_est.sum()
df['value_est_controlled'] = (df.value_est * control_ratio).round().astype(int)
df['age_min'] = df.age_range.apply(lambda v: v.left)
df['age_max'] = df.age_range.apply(lambda v: v.right)
df.head()
Out[262]:
In [274]:
df18 = df[df.age_min >= 18]
In [270]:
len(df), df.value_est.sum(), df.value_est_controlled.sum(), sd_controlled_pop
Out[270]:
In [275]:
len(df), df18.value_est.sum(), df18.value_est_controlled.sum()
Out[275]:
In [13]:
from numpy.random import normal
def yield_estimate_replicates(df):
for n, (i,r) in enumerate(df.reset_index().iterrows()):
s = int(normal(loc=r.value_est, scale=r.value_std))
s = s if s > 0 else 0
col_no = int(r.col_name.split('_')[1])
yield r.geoid, r.col_name, s
def make_estimate_replicates(df, controll_sum = None):
t = pd.DataFrame.from_records(list(yield_estimate_replicates(df)), columns = 'geoid col_no est'.split())
if controll_sum:
total = t.est.sum()
ratio = controll_sum/total
t['controlled_est'] = (t.est*ratio).round(0).astype(int)
return t
# Create some estimate replicates, so se can calc the std dev
estimates = []
from tqdm import tnrange, tqdm_notebook
N = 4 # Should be larger in production
for i in tnrange(N, desc='Create Estimates'):
est = make_estimate_replicates(df18, sd_controlled_pop)
est['replicate'] = i
estimates.append(est)
replicates = pd.concat(estimates)
In [248]:
replicates[replicates.replicate == 0].head()
Out[248]:
In [265]:
# Total population estimate
sd_controlled_pop, replicates[replicates.replicate == 0].controlled_est.sum(), replicates[replicates.replicate == 0].est.sum()
Out[265]:
In [ ]:
#def yield_col_map():
# for k, v in col_map.items():
# if v.sex and v.age_range:
# yield k, v.sex, v.age_range.left, v.age_range.right, v.pov
#col_map_df = pd.DataFrame.from_records(list(yield_col_map()), columns = 'col_id sex age_min age_max pov'.split())
In [260]:
rasp = pkg.reference('rasp_diabetes').dataframe()
rasp['sex'] = rasp['sex'].astype('category')
rasp.sex.cat.rename_categories({0:'female', 1:'male'}, inplace=True)
rasp['raceeth'] = rasp['raceeth'].astype('category')
rasp['pov'] = rasp['pov'].astype('category')
rasp.pov.cat.rename_categories({0:'npov', 1:'pov'}, inplace=True)
rasp = rasp[rasp.age > 18].copy()
print(len(rasp))
rasp.head()
Out[260]:
In [278]:
len(df18)
Out[278]:
In [282]:
Out[282]:
In [297]:
t = df18.reset_index().merge(rasp.reset_index(), on=['raceeth','age_min','pov','sex'], how='left')
t['value_est'] = t['value_est_controlled']
t = t[['geoid', 'value_est', 'value_margin', 'sex', 'raceeth', 'age_min', 'pov', 'index', 'age', 'prob']].copy()
print(len(df), len(rasp), len(t))
tract_probs = t.reset_index().copy()
tract_probs.head()
Out[297]:
In [298]:
t = tract_probs[['value_est','value_margin', 'prob']].copy()
n = np.random.normal(t.value_est, t.value_margin/1.645).round(0).astype(int)
n = np.where(n>0,n,0)
# There are a whole lot of estimates of 0, and the ramdom sampling process above gives half9 of them
# a non-zero value.
n = np.where(t.value_est==0,0,n)
t['n'] = n
t['samples'] =t.n.apply(lambda e : np.random.random_sample(e) )
t['count'] = t.apply(lambda r: (r.samples<r.prob).sum(), axis=1)
# That random sample process above works out, for large N, to be nearly exactly the same as:
# However, the sampling process would allow for creating variances.
t['est_prob'] = t['value_est'] * t['prob']
print(len(t) )
t.head()
Out[298]:
In [299]:
t.value_est.sum(), len(t)
Out[299]:
In [300]:
t['est_prob'] = t['value_est'] * t['prob']
t['est_prob'].sum()/t['value_est'].sum()
Out[300]:
In [301]:
t['count'].sum() /t['value_est'].sum()
Out[301]:
In [302]:
(t['value_est']*t['prob']).sum()/t['value_est'].sum()
Out[302]:
In [303]:
t.value_est.sum()
Out[303]:
In [291]:
tract_probs[tract_probs.raceeth == 'white'].head()
Out[291]:
In [ ]: