In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

# Render matplotlib figures inline, directly beneath the cell that produces them.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for all subsequent figures.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook.
# NOTE(review): what it configures is not visible from this notebook — confirm against the metapack docs.
mp.jupyter.init()

In [2]:
# Open the data package this notebook works from; its references
# ('rasp_diabetes', 'rasp_tracts') are read in the next cell.
# Alternative opener, left disabled:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare expression so the package summary is rendered as the cell output.
pkg


Out[2]:

Small Area Estimates for Diabetes in San Diego

sandiegodata.org-diabetes_sae-1 Last Update: 2018-12-05T23:54:30

__

Contacts

References

  • rasp_diabetes. Diabetes probabilities, by age, race, sex and poverty status
  • rasp_tracts. Race, age, sex and poverty status of population by tract in San Diego County

In [3]:
# Load both package references as DataFrames: `dia` holds diabetes rates per
# demographic group, `tracts` holds tract-level population estimates.
dia, tracts = (
    pkg.reference(ref_name).dataframe()
    for ref_name in ('rasp_diabetes', 'rasp_tracts')
)

# Collapse the three small race categories into 'other' so the tract labels
# can be matched against the coarser categories used by the diabetes table.
tracts['raceeth'] = tracts['raceeth'].replace(
    dict.fromkeys(('aian', 'many', 'nhopi'), 'other')
)

In [4]:
# Total of the diabetes table's group populations. According to that dataset
# this is the adult population of California.
dia_adult_pop = dia['group_pop'].sum()

# The Census ACS tract table has its own idea of the adult population of
# San Diego County: sum the estimates of the non-overlapping rows whose age
# band starts at 18 or later.
tracts_adult_pop = tracts.loc[
    (tracts['overlapping'] == 0) & (tracts['age_min'] >= 18),
    'value_est',
].sum()

dia_adult_pop, tracts_adult_pop


Out[4]:
(29309514, 2643923)

In [5]:
# Row count per race/ethnicity label in the diabetes table.
dia['raceeth'].value_counts()


Out[5]:
black      28
other      28
asian      28
hisp       28
nhwhite    28
Name: raceeth, dtype: int64

In [6]:
# Row count per race/ethnicity label in the tract table (after the recode to 'other').
tracts['raceeth'].value_counts()


Out[6]:
other      148208
black       37052
hisp        37052
all         37052
asian       37052
white       37052
nhwhite     37052
Name: raceeth, dtype: int64

In [7]:
# Preview the first five tract rows to see the column layout.
tracts.head(n=5)


Out[7]:
geoid col_name value_est value_margin sex raceeth age_range pov age_min age_max overlapping
0 14000US06073000100 B17001A_001 2455 214 both white 00-120 all 0 120 1
1 14000US06073000100 B17001A_002 102 70 both white 00-120 below 0 120 1
2 14000US06073000100 B17001A_003 12 16 male white 00-120 below 0 120 1
3 14000US06073000100 B17001A_004 0 12 male white 00-004 below 0 4 1
4 14000US06073000100 B17001A_005 0 12 male white 05-005 below 5 5 1

In [8]:
# Re-express the diabetes table in the vocabulary of the tract table so the two
# can be joined on (raceeth, age_range, sex, pov):
#   * the top age band '75-085' is relabelled '75-120'
#   * sex: 1 -> 'male', any other value -> 'female'
#   * pov: 1 -> 'below', any other value -> 'above'
dia_merge = pd.DataFrame({
    'raceeth': dia['raceeth'],
    'age_range': dia['age_group'].replace({'75-085': '75-120'}),
    'sex': dia['sex'].eq(1).map({True: 'male', False: 'female'}),
    'pov': dia['pov'].eq(1).map({True: 'below', False: 'above'}),
    'rate': dia['imputed_rate'],
})

dia_merge.head()


Out[8]:
raceeth age_range sex pov rate
0 asian 18-024 female above 0.007575
1 asian 18-024 female below 0.007575
2 asian 18-024 male above 0.007575
3 asian 18-024 male below 0.007575
4 asian 25-034 female above 0.000067

In [9]:
# Drop the plain 'white' rows: the diabetes table has no such category (it uses
# 'nhwhite'), so they could never receive a rate. Copy so the result is
# independent of `tracts`.
tracts_merge = tracts.loc[tracts['raceeth'].ne('white')].copy()
tracts_merge.head()


Out[9]:
geoid col_name value_est value_margin sex raceeth age_range pov age_min age_max overlapping
37052 14000US06073000100 B17001B_001 0 12 both black 00-120 all 0 120 1
37053 14000US06073000100 B17001B_002 0 12 both black 00-120 below 0 120 1
37054 14000US06073000100 B17001B_003 0 12 male black 00-120 below 0 120 1
37055 14000US06073000100 B17001B_004 0 12 male black 00-004 below 0 4 0
37056 14000US06073000100 B17001B_005 0 12 male black 05-005 below 5 5 0

In [10]:
# Population total over every non-overlapping tract row (all ages, all categories).
tracts.loc[tracts['overlapping'].eq(0), 'value_est'].sum()


Out[10]:
3452837

In [11]:
# Attach the diabetes rate of each (raceeth, age_range, sex, pov) cell to the
# matching tract rows. validate='many_to_one' makes pandas raise if a key
# combination occurs more than once in dia_merge; without it, duplicate keys
# would silently duplicate tract rows and inflate the population totals.
merge_keys = ['raceeth', 'age_range', 'sex', 'pov']
m = tracts_merge.merge(dia_merge, on=merge_keys, how='left', validate='many_to_one')

# Keep only rows that found a rate and are not flagged as overlapping.
# .copy() makes `m` an independent frame, so adding columns to it in later
# cells does not trigger SettingWithCopyWarning.
m = m[m.rate.notnull() & (m.overlapping == 0)].copy()
m.head()


Out[11]:
geoid col_name value_est value_margin sex raceeth age_range pov age_min age_max overlapping rate
9 14000US06073000100 B17001B_010 0 12 male black 18-024 below 18 24 0 0.004529
10 14000US06073000100 B17001B_011 0 12 male black 25-034 below 25 34 0 0.024107
11 14000US06073000100 B17001B_012 0 12 male black 35-044 below 35 44 0 0.079815
12 14000US06073000100 B17001B_013 0 12 male black 45-054 below 45 54 0 0.124868
13 14000US06073000100 B17001B_014 0 12 male black 55-064 below 55 64 0 0.265294

In [12]:
# Population covered by the merged rows.
m['value_est'].sum()


Out[12]:
2643923

In [13]:
# Sanity check against an independent figure. For 2017 data the adult population
# should be about 3,337,685 (whole population) * 78.2% (share aged 18 or older),
# i.e. 2,610,069.
expected_adult_pop = 2610069

s = m.loc[m['overlapping'] == 0, 'value_est'].sum()

# (estimate, absolute difference, relative difference)
s, s - expected_adult_pop, (s - expected_adult_pop) / expected_adult_pop


Out[13]:
(2643923, 33854, 0.012970538326764541)

In [14]:
# The county-wide rate computed here (8.6%) does not match the AskCHIS value for
# San Diego (9.1%), but after adjusting for the difference in population between
# the two datasets it almost does (9.3%).

# NOTE(review): presumably the San Diego adult population on the AskCHIS side of
# the comparison — confirm the source of this figure.
ASKCHIS_ADULT_POP = 2430000

t = m[m.overlapping == 0].copy()

# Expected number of adults with diabetes: population of each row times its rate.
d = (t.rate * t.value_est).sum()

# Scale by the computed population `s` (from the previous cell) rather than a
# pasted copy of its value, so the adjustment stays correct if the inputs change.
d, d / s, (d / s) * (s / ASKCHIS_ADULT_POP)


Out[14]:
(226252.36856408906, 0.08557449236006082, 0.0931079706025058)

In [26]:
# Expected diabetes cases per row = population estimate * group rate.
# assign() returns a new frame, so re-running this cell is idempotent and never
# writes into a slice of another frame.
m = m.assign(diabetes_est=m.value_est * m.rate)

# Aggregate to one row per tract. Only the two additive columns are summed;
# summing every column would also add up rates, age bounds and flags (and, on
# recent pandas versions, concatenate the string columns).
t = m.groupby('geoid')[['value_est', 'diabetes_est']].sum()

# Tract rate as a whole-number percentage. Tracts with zero population yield NaN.
t['tract_diabetes_rate'] = (t.diabetes_est / t.value_est * 100).round(0)

In [28]:
# Write the per-tract estimate and rate to CSV; the index of `t` is written as the first column.
export_columns = ['diabetes_est', 'tract_diabetes_rate']
t.loc[:, export_columns].to_csv('tract_diabetes.csv')

In [73]:
# Build a tract x race/ethnicity table of diabetes rates (as fractions, not
# percentages), with an extra 'total' column per tract.
# Requires the 'diabetes_est' column on `m`.

# Select the additive columns *before* aggregating, so only they are summed.
value_cols = ['value_est', 'diabetes_est']

# Population and expected cases per (tract, race/ethnicity).
t1 = m.groupby(['geoid', 'raceeth'])[value_cols].sum().reset_index()

# The same totals per tract over all groups, labelled as the pseudo-category 'total'.
t2 = m.groupby('geoid')[value_cols].sum().reset_index()
t2['raceeth'] = 'total'
t2 = t2[t1.columns]  # same column order as t1 before stacking

t = pd.concat([t1, t2], ignore_index=True, sort=False).sort_values(['geoid', 'raceeth'])

# Groups with zero population in a tract produce NaN here.
t['rate'] = t.diabetes_est / t.value_est

# One row per tract, one column per race/ethnicity (plus 'total').
tract_diabetes = t.set_index(['geoid', 'raceeth'])['rate'].unstack()
tract_diabetes.head()


Out[73]:
raceeth asian black hisp nhwhite other total
geoid
14000US06073000100 0.090034 NaN 0.240076 0.100720 0.149703 0.117236
14000US06073000201 0.083256 0.101111 0.093961 0.085595 0.094269 0.087723
14000US06073000202 0.084176 0.073590 0.096333 0.054982 0.084927 0.065198
14000US06073000300 0.045533 0.108271 0.086672 0.069047 0.025907 0.068951
14000US06073000400 0.048468 0.028781 0.045808 0.056943 0.049220 0.053447

In [ ]: