In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
# Use seaborn's 'notebook' plotting context for any figures drawn below.
sns.set_context('notebook')
# Run metapack's Jupyter setup hook before any package is opened.
mp.jupyter.init()
In [2]:
# Open the metapack package this notebook reads its references from.
# Alternative entry point, kept commented out so it can be swapped in:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare expression so the package object is rendered as the cell output.
pkg
Out[2]:
In [3]:
# Materialise the two package references as DataFrames.
dia = pkg.reference('rasp_diabetes').dataframe()
tracts = pkg.reference('rasp_tracts').dataframe()

# Fold the 'aian', 'many' and 'nhopi' race/ethnicity codes into one 'other'
# bucket on the tract table.
tracts['raceeth'] = tracts['raceeth'].replace(
    dict.fromkeys(['aian', 'many', 'nhopi'], 'other')
)
In [4]:
# Total population covered by the diabetes table.
# NOTE(review): the original comments call this the adult population of
# California but then compare it with San Diego County -- confirm the geography.
dia_adult_pop = dia['group_pop'].sum()

# The Census ACS tract table gives a different adult total: sum the estimates
# of the non-overlapping rows whose minimum age is 18 or more.
tracts_adult_pop = tracts.loc[
    tracts['age_min'].ge(18) & tracts['overlapping'].eq(0),
    'value_est',
].sum()

dia_adult_pop, tracts_adult_pop
Out[4]:
In [5]:
# Row counts per race/ethnicity code in the diabetes table.
dia['raceeth'].value_counts()
Out[5]:
In [6]:
# Row counts per race/ethnicity code in the tract table (after the 'other'
# recode), for comparison with the codes used by the diabetes table.
tracts['raceeth'].value_counts()
Out[6]:
In [7]:
# Preview the first rows of the tract table.
tracts.head()
Out[7]:
In [8]:
def _flag_label(when_one, otherwise):
    """Return a mapper yielding `when_one` for values equal to 1, else `otherwise`."""
    return lambda v: when_one if v == 1 else otherwise


# Reshape the diabetes table so its columns share names and value labels with
# the join keys used against the tract table (raceeth, age_range, sex, pov).
dia_merge = pd.DataFrame({
    'raceeth': dia['raceeth'],
    # Relabel the top age bracket from '75-085' to '75-120'.
    'age_range': dia['age_group'].replace({'75-085': '75-120'}),
    # Code 1 becomes 'male'; every other value becomes 'female'.
    'sex': dia['sex'].apply(_flag_label('male', 'female')),
    # Code 1 becomes 'below'; every other value becomes 'above'.
    'pov': dia['pov'].apply(_flag_label('below', 'above')),
    'rate': dia['imputed_rate'],
})
dia_merge.head()
Out[8]:
In [9]:
# Keep every tract row except those coded 'white', as an independent copy.
# NOTE(review): presumably 'white' is excluded because it has no counterpart
# in the diabetes table's race codes -- confirm.
tracts_merge = tracts.loc[tracts['raceeth'].ne('white')].copy()
tracts_merge.head()
Out[9]:
In [10]:
# Sum of the population estimates over the non-overlapping tract rows.
tracts.loc[tracts['overlapping'].eq(0), 'value_est'].sum()
Out[10]:
In [11]:
# Attach a diabetes rate to each tract row by matching on the four demographic
# keys, then keep only non-overlapping rows that actually received a rate.
m = (
    tracts_merge
    .merge(dia_merge, how='left', on=['raceeth', 'age_range', 'sex', 'pov'])
    .loc[lambda joined: joined['rate'].notnull() & joined['overlapping'].eq(0)]
)
m.head()
Out[11]:
In [12]:
# Population covered by the rows that matched a diabetes rate.
m['value_est'].sum()
Out[12]:
In [13]:
# For 2017 data the adult total should be about
# 3337685 (whole population) * 78.2% (share aged 18 or older) == 2610069.
expected_adults = 2610069

# Matched population, followed by its absolute and relative gap to that figure.
s = m.loc[m['overlapping'].eq(0), 'value_est'].sum()
gap = s - expected_adults
s, gap, gap / expected_adults
Out[13]:
In [14]:
# The percentage computed here, 8.6%, does not match the AskCHIS value for SD
# of 9.1%; after adjusting for the population difference between the two
# datasets it almost does (9.3%).
t = m.loc[m['overlapping'].eq(0)].copy()

# Expected number of cases: each row's rate weighted by its population.
d = t['rate'].mul(t['value_est']).sum()
overall_rate = d / s

# NOTE(review): 2643923 and 2430000 are presumably the two datasets' population
# totals used for the adjustment -- confirm where they come from.
d, overall_rate, overall_rate * (2643923 / 2430000)
Out[14]:
In [26]:
# Estimated number of people with diabetes in each row: population * rate.
# `m` was produced by boolean-filtering another frame, so a column is added
# with `.assign` (which returns a new frame) rather than by item assignment,
# which would raise SettingWithCopyWarning. Re-running the cell is harmless.
m = m.assign(diabetes_est=m['value_est'] * m['rate'])

# Roll the rows up to one line per tract. Only numeric columns are summed:
# the frame also carries string columns (raceeth, age_range, sex, pov), and
# pandas >= 2.0 no longer drops those silently when summing a groupby.
t = m.groupby('geoid').sum(numeric_only=True)

# Tract-level rate as a whole-number percentage. Tracts whose summed
# population is zero produce NaN rather than an error.
t['tract_diabetes_rate'] = (t['diabetes_est'] / t['value_est'] * 100).round(0)
In [28]:
# Write the per-tract estimate and percentage rate, indexed by geoid, to a CSV
# file in the current working directory.
export_columns = ['diabetes_est', 'tract_diabetes_rate']
t[export_columns].to_csv('tract_diabetes.csv')
In [73]:
# Build a tract x race/ethnicity table of diabetes rates, with an extra 'total'
# column holding each tract's all-groups rate.
#
# The two measure columns are selected *before* aggregating, so the string
# columns in `m` are never summed (pandas >= 2.0 no longer drops them
# automatically) and no work is spent on columns that are thrown away.
measure_cols = ['value_est', 'diabetes_est']

# Population and estimated cases per (tract, race/ethnicity) ...
t1 = m.groupby(['geoid', 'raceeth'])[measure_cols].sum().reset_index()

# ... and per tract overall, labelled 'total' and put in t1's column order so
# the two frames stack cleanly.
t2 = m.groupby('geoid')[measure_cols].sum()
t2['raceeth'] = 'total'
t2 = t2.reset_index()[t1.columns]

t = pd.concat([t1, t2], ignore_index=True, sort=False).sort_values(['geoid', 'raceeth'])

# Rate as a fraction (not multiplied by 100, unlike `tract_diabetes_rate`).
# Groups with zero population yield NaN.
t['rate'] = t['diabetes_est'] / t['value_est']

# Pivot to one row per tract and one column per race/ethnicity (plus 'total').
tract_diabetes = t.set_index(['geoid', 'raceeth'])['rate'].unstack()
tract_diabetes.head()
Out[73]:
In [ ]: