In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init() # Initialize indexes and downloder context
In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg
Out[2]:
In [3]:
race_iterations = pkg.reference('race_iterations').dataframe()
# List of PUMS in San Giego county, so we can restrict census data to just SD County
sandiego_puma_tract_pkg = pkg.reference('sandiego_puma_tract').dataframe()
sandiego_puma_tract = list(sandiego_puma_tract_pkg.puma_geoid.unique()) + \
list(sandiego_puma_tract_pkg.tract_geoid.unique())
In [4]:
def make_col_map(t):
from collections import namedtuple
sex = None
age = None
pov = 'all'
def parse_age(v):
if v and 'year' in v:
parts = v.split()
if parts[0] == '75':
return pd.Interval(left=75, right=120, closed='both')
elif len(parts) == 4:
return pd.Interval(left=int(parts[0]), right=int(parts[2]), closed='both')
return None
col_map = {}
Dimensions = namedtuple('Dimensions', ['sex', 'age_range', 'pov'])
for e in t.table.columns:
sd = e.short_description
if '_m90' in e.unique_id:
continue
if sd:
if 'Male' in sd:
sex = 'male'
elif 'Female' in sd:
sex = 'female'
elif 'Total' in sd:
sex = 'total'
elif 'below poverty level' in sd:
pov = 'pov'
elif 'above poverty level' in sd:
pov = 'npov'
col_map[e.unique_id] = Dimensions( sex, parse_age(sd), pov)
return col_map
In [5]:
def get_raceeth(pkg, re_code, re_name):
"""Get a single race/ethnicity iteration, and process it so
it can be merged with others. """
pov_sex_age_url = pkg.reference('pov_sex_age_template').resolved_url
url = pov_sex_age_url.interpolate(context={'iteration':re_code.strip()})
psa = url.dataframe()
psa = psa[psa.index.isin(sandiego_puma_tract)]
col_map= make_col_map(psa)
t = psa.drop(columns=['STUSAB', 'COUNTY','NAME'])
t = t.stack().to_frame().reset_index()
t.columns = ['geoid','col_name','value']
t['m_type'] = t.col_name.apply(lambda v: 'margin' if '_m90' in v else 'est')
t['col_name'] = t.col_name.apply(lambda v: v.replace('_m90',''))
t = t.set_index(['geoid','col_name','m_type']).unstack()
t['sex'] = [col_map[str(i[1]).replace('_m90','')].sex for i,r in t.iterrows() ]
t['raceeth'] = re_name
t['age_range'] = [col_map[str(i[1]).replace('_m90','')].age_range for i,r in t.iterrows() ]
t['pov'] = [col_map[str(i[1]).replace('_m90','')].pov for i,r in t.iterrows() ]
return t
In [6]:
sd_pumas
In [ ]:
frames= []
for e in race_iterations.itertuples():
print(e)
frames.append(get_raceeth(pkg, e.code, e.raceeth))
df = pd.concat(frames)
In [ ]:
df
In [ ]:
t = df[(df.sex != 'both') & (df.pov != 'all')& (df.raceeth != 'white') & (~df.age_range.isnull())].copy()
# Collapse the multi-level columns index
t.columns = [ '_'.join(e) if e[1] else e[0] for e in zip(*zip(*t.columns)) ]
t.head()
In [ ]:
t.value_est.sum()
In [ ]:
t['value_std'] = t.value_margin/1.645
puma_ras = t
puma_ras.head()
In [ ]:
# Get the sum of the population of all PUMS
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()
# Get the controlled population for SD county. THe margin for this estimate is zero
# because it's tied to administrative data.
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001
# Apparently, the PUMA population is also controlles.
assert sd_puma_pop == sd_controlled_pop
# However, the sums from the poverty status by sex by age table is not
# controlled ( and would not expect it to be ) so it's a different number.
#
# Interestingly, the controlled population number is not with in the 90% margins of
# the sum of the table esimates.
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()
In [ ]:
from numpy.random import normal
def yield_estimate_replicates(df):
for n, (i,r) in enumerate(df.reset_index().iterrows()):
s = int(normal(loc=r.value_est, scale=r.value_std))
s = s if s > 0 else 0
col_no = int(r.col_name.split('_')[1])
yield r.geoid, r.col_name, s
def make_estimate_replicates(df, controll_sum = None):
t = pd.DataFrame.from_records(list(yield_estimate_replicates(df)), columns = 'geoid col_no est'.split())
if controll_sum:
total = t.est.sum()
ratio = controll_sum/total
t['controlled_est'] = (t.est*ratio).round(0).astype(int)
return t
In [ ]:
# Create some estimate replicates, so se can calc the std dev
estimates = []
from tqdm import tnrange, tqdm_notebook
for i in tnrange(10, desc='Create Estimates'):
est = make_estimate_replicates(puma_ras)
estimates.append(est.est.sum())
sim_pop_est = pd.Series(estimates).mean()
sim_pop_est, pd.Series(estimates).std() * 1.645
In [ ]:
est = make_estimate_replicates(puma_ras, sd_controlled_pop)
est.head()
In [ ]:
len(est)
In [ ]:
rasp = pkg.reference('rasp_diabetes').dataframe()
rasp
In [ ]: