In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
# NOTE(review): `tqdm_notebook` is deprecated; `from tqdm.auto import tqdm` is the
# recommended replacement (a later cell re-imports tqdm anyway).
from tqdm import tqdm_notebook
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init() # Initialize indexes and downloader context
In [2]:
# Open the enclosing metapack source package; `pkg` provides the `reference()`
# lookups used by the rest of the notebook. (The commented line is the variant
# used when running against a built, rather than source, package.)
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg
Out[2]:
In [3]:
# Census race-iteration metadata, used later to expand tables by race/ethnicity.
race_iterations = pkg.reference('race_iterations').dataframe()
# List of PUMAs in San Diego county, so we can restrict census data to just SD County
sandiego_puma_tract = pkg.reference('sandiego_puma_tract').dataframe()
# Distinct PUMA and tract geoids for San Diego county, plus their union for
# filtering frames that may be indexed by either geography.
sd_pumas = list(sandiego_puma_tract.puma_geoid.unique())
sd_tracts = list(sandiego_puma_tract.tract_geoid.unique())
sd_puma_tracts = sd_pumas + sd_tracts
In [4]:
# Get the sum of the population of all PUMS
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B01003').dataframe()
t = t[t.index.isin(sd_pumas)]
sd_puma_pop = t.B01003_001.sum()
# Get the controlled population for SD county. THe margin for this estimate is zero
# because it's tied to administrative data.
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/50/B01003').dataframe()
sd_controlled_pop = t.loc['05000US06073'].B01003_001
# Apparently, the PUMA population is also controlles.
assert sd_puma_pop == sd_controlled_pop
# However, the sums from the poverty status by sex by age table is not
# controlled ( and would not expect it to be ) so it's a different number.
#
# Interestingly, the controlled population number is not with in the 90% margins of
# the sum of the table esimates.
from rowgenerators import parse_app_url
t = parse_app_url('census://2016/5/CA/795/B17001').dataframe()
t = t[t.index.isin(sd_pumas)]
t.B17001_001.sum(), t.B17001_001.sum() - sd_controlled_pop, t.B17001_001.sum_m90()
Out[4]:
In [270]:
# Sanity check of estimate totals against the controlled population.
# NOTE(review): `df` is defined in a cell that is not in this view (the execution
# count In[270] confirms heavy out-of-order editing) — this will fail on a fresh run.
len(df), df.value_est.sum(), df.value_est_controlled.sum(), sd_controlled_pop
Out[270]:
In [275]:
# Same totals for the adult (18+) subset. NOTE(review): the first element is
# len(df), not len(df18) — possibly intentional for comparison, but verify.
# Both frames come from cells outside this view.
len(df), df18.value_est.sum(), df18.value_est_controlled.sum()
Out[275]:
In [13]:
from numpy.random import normal

def yield_estimate_replicates(df):
    """Yield one random draw per row of ``df`` as (geoid, col_name, sample).

    Each sample is drawn from a normal distribution centered on the row's
    estimate (``value_est``) with the row's standard deviation (``value_std``),
    truncated below at zero since population counts cannot be negative.

    Assumes ``df`` has columns: geoid, col_name, value_est, value_std.
    """
    for _, r in df.reset_index().iterrows():
        s = int(normal(loc=r.value_est, scale=r.value_std))
        yield r.geoid, r.col_name, max(s, 0)

def make_estimate_replicates(df, controll_sum=None):
    """Build a single replicate DataFrame of randomly sampled estimates.

    If ``controll_sum`` is given, also add a ``controlled_est`` column with the
    samples rescaled so their total equals the control population (raking to a
    single known total).
    """
    # Column is named 'col_no' for backward compatibility with downstream cells,
    # but it actually holds the full column *name* from yield_estimate_replicates.
    t = pd.DataFrame.from_records(list(yield_estimate_replicates(df)),
                                  columns='geoid col_no est'.split())

    # Explicit None test: a legitimate control total of 0 is falsy and would
    # otherwise be silently skipped.
    if controll_sum is not None:
        ratio = controll_sum / t.est.sum()
        t['controlled_est'] = (t.est * ratio).round(0).astype(int)

    return t
# Create some estimate replicates, so we can calc the std dev.
# tnrange/tqdm_notebook are deprecated; tqdm.auto picks the right frontend.
from tqdm.auto import trange

estimates = []

N = 4  # Should be larger in production

for i in trange(N, desc='Create Estimates'):
    # df18 and sd_controlled_pop come from earlier cells.
    est = make_estimate_replicates(df18, sd_controlled_pop)
    est['replicate'] = i  # tag each replicate so they can be stacked and grouped
    estimates.append(est)

replicates = pd.concat(estimates)
In [248]:
# Peek at the first replicate's rows.
replicates[replicates.replicate == 0].head()
Out[248]:
In [265]:
# Total population estimate: controlled target vs. the controlled and raw
# sampled totals for replicate 0 (controlled should match the target exactly,
# up to rounding).
sd_controlled_pop, replicates[replicates.replicate == 0].controlled_est.sum(), replicates[replicates.replicate == 0].est.sum()
Out[265]:
In [ ]:
# NOTE(review): dead code kept for reference — builds a column-id -> (sex, age
# range, poverty) map from `col_map`, which is not defined in any visible cell.
# Delete once the notebook is finalized.
#def yield_col_map():
#    for k, v in col_map.items():
#        if v.sex and v.age_range:
#            yield k, v.sex, v.age_range.left, v.age_range.right, v.pov
#col_map_df = pd.DataFrame.from_records(list(yield_col_map()), columns = 'col_id sex age_min age_max pov'.split())
In [260]:
# Load the race/age/sex/poverty (rasp) diabetes probability reference and
# recode the integer category codes to readable labels.
rasp = pkg.reference('rasp_diabetes').dataframe()

# rename_categories(..., inplace=True) is deprecated (removed in pandas 2.0);
# assign the result back instead.
rasp['sex'] = rasp['sex'].astype('category').cat.rename_categories({0: 'female', 1: 'male'})
rasp['raceeth'] = rasp['raceeth'].astype('category')
rasp['pov'] = rasp['pov'].astype('category').cat.rename_categories({0: 'npov', 1: 'pov'})

# Restrict to adults. Note: strictly greater than 18, so 18-year-olds are excluded.
rasp = rasp[rasp.age > 18].copy()

print(len(rasp))
rasp.head()
Out[260]:
In [278]:
# Row count of the adult (18+) estimates frame; df18 is defined in a cell
# outside this view.
len(df18)
Out[278]:
In [282]:
Out[282]:
In [297]:
# Join the per-geography population estimates (df18) to the rasp diabetes
# probabilities on the shared demographic keys, then keep only the columns
# needed downstream. The left join keeps every estimate row even when no
# probability matches (prob will be NaN there).
t = df18.reset_index().merge(rasp.reset_index(), on=['raceeth','age_min','pov','sex'], how='left')
# Use the controlled (raked) estimate as the working estimate from here on.
t['value_est'] = t['value_est_controlled']
t = t[['geoid', 'value_est', 'value_margin', 'sex', 'raceeth', 'age_min', 'pov', 'index', 'age', 'prob']].copy()
# NOTE(review): prints len(df), not len(df18) — df comes from an out-of-view
# cell; confirm this is the intended comparison.
print(len(df), len(rasp), len(t))
tract_probs = t.reset_index().copy()
tract_probs.head()
Out[297]:
In [298]:
# Monte-Carlo estimate of diabetes counts: perturb each population estimate
# within its margin, then binomially sample against the diabetes probability.
t = tract_probs[['value_est','value_margin', 'prob']].copy()
# 1.645 is the z-score for the Census 90% margin of error, so margin/1.645
# recovers the standard deviation of the estimate.
n = np.random.normal(t.value_est, t.value_margin/1.645).round(0).astype(int)
n = np.where(n>0,n,0)
# There are a whole lot of estimates of 0, and the random sampling process above gives half of them
# a non-zero value.
n = np.where(t.value_est==0,0,n)
t['n'] = n
# For each row, draw n uniform samples and count how many fall below the
# diabetes probability — i.e. a binomial draw of successes out of n.
t['samples'] =t.n.apply(lambda e : np.random.random_sample(e) )
t['count'] = t.apply(lambda r: (r.samples<r.prob).sum(), axis=1)
# That random sample process above works out, for large N, to be nearly exactly the same as:
# However, the sampling process would allow for creating variances.
t['est_prob'] = t['value_est'] * t['prob']
print(len(t) )
t.head()
Out[298]:
In [299]:
# Total estimated population and row count of the sampled frame.
t.value_est.sum(), len(t)
Out[299]:
In [300]:
# Overall expected diabetes rate: population-weighted mean of the probabilities.
# (Recomputes est_prob, already set in the sampling cell above.)
t['est_prob'] = t['value_est'] * t['prob']
t['est_prob'].sum()/t['value_est'].sum()
Out[300]:
In [301]:
# Sampled diabetes rate — should be close to the expected rate above for large N.
t['count'].sum() /t['value_est'].sum()
Out[301]:
In [302]:
# Same expected rate computed directly, without the intermediate est_prob column.
(t['value_est']*t['prob']).sum()/t['value_est'].sum()
Out[302]:
In [303]:
# Total estimated population in the sampled frame.
t.value_est.sum()
Out[303]:
In [291]:
# Spot-check the merged probabilities for one race/ethnicity group.
tract_probs[tract_probs.raceeth == 'white'].head()
Out[291]:
In [ ]: