RASP Diabetes Rates

Diabetes rates from CHIS surveys for 2015, 2016 and 2017, segmented by race, age, sex and poverty status



In [2]:

    
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from publicdata.chis import *

%matplotlib inline
sns.set_context('notebook')
idx = pd.IndexSlice # Convenience redefinition.



In [3]:

    
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg









    Out[3]:




CHIS California Health Interview Survey, Adults
healthpolicy.ucla.edu-chis-adult-1 Last Update: 2018-12-07T00:50:35
Documentation and Reference Links to CHIS files.
CHIS Data packages

Using these file requires accepting the terms and restrictions provided by the
UCLA Center for Health Policy Research. These terms are available
online, and reproduced here:
Restrictions on the Use of California Health Interview Survey Data Before you
download this file, you must first agree to these Restrictions on the Use of
CHIS Data by clicking the button below.

The California Health Interview Survey (CHIS) is bound by promises made to
respondents, by California law, and by University and government human subject
protection committees to assure that no personal information is released in a
form that identifies an individual without the consent of the person who
supplied the information. The California Information Practices Act (section
1798.24) provides that the data collected by CHIS may be released only for
statistical research and reporting purposes. Any intentional identification or
disclosure of personal information violates this law, and violates the privacy
rights of the people who provided data to CHIS. Unauthorized disclosure of
personal information is subject to civil action and penalties for invasion of
privacy under California Civil Code, Section 1798.53.

Documentation Links

dd_adult_2017 Data dictionary, Adult, 2017
dd_adult_2016 Data dictionary, Adult, 2016
dd_adult_2015 Data dictionary, Adult, 2015
dd_adult_2014 Data dictionary, Adult, 2014
dd_adult_2013 Data dictionary, Adult, 2013
homepage Data download home page.

Contacts

Wrangler Eric Busboom, Civic Knowledge

Resources

 rasp_diabetes. Diabetes probabilities for age, race, sex, and poverty level ratio, for all CHIS respondents in California

References

data_dict_17. Data Dictionary in CSV format for 2017
adult_2017. CHIS, Adult 2017
adult_2016. CHIS, Adult 2016
adult_2015. CHIS, Adult 2015
adult_2014. CHIS, Adult 2014
adult_2013. CHIS, Adult 2013



In [3]:

    
def recode(df):
    """ Recode to a simpler group of races.  For a lot of health outcomes, the major divisions are
    * White + Asian ( whasian )
    * Black + Latino Afrotino
    * Others
    """
    
    from pandas.api.types import CategoricalDtype
    
    df['race_recode'] = df.racedf_p1
    df.replace({'race_recode':{
        'NON-LATINO WHITE':'nhwhite',
        'NON-LATINO ASIAN':'asian',
        'NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE': 'other',
        'NON-LATINO AFR. AMER.': 'black',
        'LATINO': 'hisp',
        'NON-LATINO, TWO+ RACES': 'other',
        'NON-LATINO OTHER, ONE RACE': 'other'
    }}, inplace=True)
    df.race_recode = df.race_recode.astype('category')
    
    df['minority'] = (df['race_recode'] != 'white_asian').astype(int)
    
    
    df['old'] = (df.srage_p1 < '45-49 YEARS').astype(CategoricalDtype(categories=[False, True],ordered=True))
    df.old.cat.rename_categories(['OLD','YOUNG'], inplace=True)

    df['poor'] = (df.povll.isin(('200-299% FPL', '300% FPL AND ABOVE')) )\
        .astype(CategoricalDtype(categories=[True, False],ordered=True))
    df.poor.cat.rename_categories(['NPOV','POV'], inplace=True)

    return df



In [4]:

    
df17 = pkg.reference('adult_2017').dataframe()
df16 = pkg.reference('adult_2016').dataframe()
df15 = pkg.reference('adult_2015').dataframe()
df14 = pkg.reference('adult_2014').dataframe()
df13 = pkg.reference('adult_2013').dataframe()

# Rename some categories. 2016 and 2015 have "ALASKA" where the others have "ALASKAN", which 
# causes the concat() operation to convert categories to strings
cats_17 = df17.racedf_p1.cat.categories
cat_map = dict(zip(df16.racedf_p1.cat.categories, df17.racedf_p1.cat.categories))

for e in [df13,df14,df15,df16,df17]:
    e.racedf_p1.cat.rename_categories(cat_map, inplace=True)
    
for df, year in zip([df13, df14, df15, df16, df17], range(2013, 2018)):
    df['year'] = year
    
    df = recode(df)

Poverty, Age and Race



In [5]:

    
n_years, df1517 = chis_concat([df17,df16,df15], ['diabetes', 'srsex', 'srage_p1', 'racedf_p1', 'race_recode', 'povll', 'minority', 'poor', 'old'])

def age_group_parts(v):
    try:
        y1, y2, _ = v.replace('-',' ').split()
        return y1,y2
    except ValueError:
        # Probably '85+ YEARS'
        return 85, 120



def age_group_to_age(v):
    y1, y2 = age_group_parts(v)
    if y1 == 85:
        return 85
    else:
        return (int(y1)+int(y2))/2

# Convert to census age ranges
census_age_ranges = [
    pd.Interval(18, 24, closed='both'),
    pd.Interval(25, 34, closed='both'),
    pd.Interval(35, 44, closed='both'),
    pd.Interval(45, 54, closed='both'),
    pd.Interval(55, 64, closed='both'),
    pd.Interval(65, 74, closed='both'),
    pd.Interval(75, 85, closed='both') # Actualy range is 75:120, but want a lower mean for prediction
]
    

pov_status_map = {
    "0-99% FPL": 1,
    "100-199% FPL": 0,
    "200-299% FPL":0,
    "300% FPL AND ABOVE": 0
}

dflr = pd.get_dummies(df1517, columns=['race_recode'], prefix='', prefix_sep = '')
dflr['race_recode'] = df1517.race_recode 
dflr['diabetes_bool'] = (dflr.diabetes == 'YES').astype(int)
dflr['group_age_mean'] = dflr.srage_p1.apply(lambda v:age_group_to_age(v)).astype(float)
dflr['group_age_min'] = dflr.srage_p1.apply(lambda v:age_group_parts(v)[0]).astype(int)
dflr['group_age_max'] = dflr.srage_p1.apply(lambda v:age_group_parts(v)[1]).astype(int)

dflr['census_age_group'] = pd.cut(dflr.group_age_mean, pd.IntervalIndex(census_age_ranges))

#dflr['age_group_name'] = dflr.apply(lambda r: '{:02d}-{:03d}'.format(r.group_age_min, r.group_age_max), axis=1)
dflr['age_group'] = dflr.census_age_group.apply(lambda v: '{:02d}-{:03d}'.format(v.left, v.right))

dflr['pov'] = dflr.povll.apply(lambda v: pov_status_map[v])
dflr['is_male'] = (dflr.srsex == 'MALE').astype(int)

dflr.head()









    Out[5]:







  
    
      
      index
      diabetes
      srsex
      srage_p1
      racedf_p1
      povll
      minority
      poor
      old
      rakedw0
      ...
      other
      race_recode
      diabetes_bool
      group_age_mean
      group_age_min
      group_age_max
      census_age_group
      age_group
      pov
      is_male
    
  
  
    
      0
      0
      NO
      FEMALE
      70-74 YEARS
      NON-LATINO WHITE
      200-299% FPL
      1
      NPOV
      OLD
      12.380612
      ...
      0
      nhwhite
      0
      72.0
      70
      74
      [65, 74]
      65-074
      0
      0
    
    
      1
      1
      NO
      MALE
      65-69 YEARS
      NON-LATINO WHITE
      300% FPL AND ABOVE
      1
      NPOV
      OLD
      216.417872
      ...
      0
      nhwhite
      0
      67.0
      65
      69
      [65, 74]
      65-074
      0
      1
    
    
      2
      2
      NO
      MALE
      80-84 YEARS
      NON-LATINO WHITE
      300% FPL AND ABOVE
      1
      NPOV
      OLD
      112.672832
      ...
      0
      nhwhite
      0
      82.0
      80
      84
      [75, 85]
      75-085
      0
      1
    
    
      3
      3
      NO
      MALE
      60-64 YEARS
      NON-LATINO WHITE
      100-199% FPL
      1
      POV
      OLD
      198.559016
      ...
      0
      nhwhite
      0
      62.0
      60
      64
      [55, 64]
      55-064
      0
      1
    
    
      4
      4
      YES
      FEMALE
      60-64 YEARS
      NON-LATINO WHITE
      300% FPL AND ABOVE
      1
      NPOV
      OLD
      256.148161
      ...
      0
      nhwhite
      1
      62.0
      60
      64
      [55, 64]
      55-064
      0
      0
    
  

5 rows × 104 columns



In [6]:

    
raked_col = [c for c in df.columns if c.startswith('rake')]
index_cols = ['race_recode','is_male', 'age_group', 'pov' ]
value_cols = ['diabetes_bool', 'rec_count']
dflr['rec_count'] = 1

dflr['census_age_group'] = pd.cut(dflr.group_age_mean, pd.IntervalIndex(census_age_ranges))

t = dflr[index_cols+value_cols+raked_col].copy()


# Couldn't figure out the idomatic way to do this. 
t['group_pop'] = t.rakedw0
for c in raked_col:
    t[c] *= t.diabetes_bool

# Now the raked columns are the estimates for the number of people who have diabetes, 
# so we just have to sum up everything
    
t2 = t.groupby(index_cols).sum().reset_index()

t2['estimate'] = t2.rakedw0
t2['repl_mean'] = t2[raked_col[1:]].mean(axis=1)
t2['repl_std'] = t2[raked_col[1:]].std(axis=1)

t2['rse'] = t2['repl_std']/t2.estimate

t2['rate'] = t2.estimate / t2.group_pop
#t2['rate_std'] = t2['rate'] * t2['rse'] 


rasp_d = t2.sort_values('rate',ascending=False)[list(c for c in t2.columns if c not in raked_col)]
rasp_d.rename(columns={'race_recode': 'raceeth','repl_std':'std'}, inplace=True)
rasp_d['sex'] = rasp_d.is_male
#rasp_d['pov'] = rasp_d.poor.apply(lambda v: 1 if v == 'POV' else 0)
rasp_d.head()









    Out[6]:







  
    
      
      raceeth
      is_male
      age_group
      pov
      diabetes_bool
      rec_count
      group_pop
      estimate
      repl_mean
      std
      rse
      rate
      sex
    
  
  
    
      23
      asian
      1
      55-064
      1
      27
      77
      49517.994223
      21360.851002
      21529.619929
      970.031024
      0.045412
      0.431376
      1
    
    
      80
      hisp
      1
      65-074
      0
      196
      508
      248559.204237
      102916.455628
      103076.981427
      1631.825043
      0.015856
      0.414052
      1
    
    
      83
      hisp
      1
      75-085
      1
      25
      73
      33383.942287
      13774.911790
      12905.067666
      620.657912
      0.045057
      0.412621
      1
    
    
      81
      hisp
      1
      65-074
      1
      62
      138
      78820.913665
      31217.169504
      32760.904189
      865.259607
      0.027717
      0.396052
      1
    
    
      82
      hisp
      1
      75-085
      0
      101
      295
      129487.642569
      50434.324105
      50772.893148
      1829.917361
      0.036283
      0.389491
      1



In [7]:

    
rasp_d.estimate.sum(), rasp_d.group_pop.sum(), rasp_d.estimate.sum()/rasp_d.group_pop.sum()









    Out[7]:





(2886861.2931825323, 29309516.230000064, 0.09849569916229647)



In [8]:

    
index_cols = ['raceeth','age_group', 'sex','pov' ]
val_cols = ['group_pop','estimate','std','rate']
rasp_diabetes = rasp_d[index_cols+val_cols].reset_index(drop=True).sort_values(index_cols)[index_cols+val_cols]

re_group = rasp_diabetes[rasp_diabetes.rate != 0].groupby(['raceeth', 'age_group']).mean()

# There are a lot of entries, particularly for yound asians, where the group won't have any estimate
# for diabetes because the sample group is too small to have even one case, and all values are integers. 
# So, we impute these values from others in the same raceeth+age_group
def impute_rate(r, re_group):
    """Impute rates from other values in the similar group"""
    if r.rate == 0:
        return re_group.loc[(r.raceeth,r.age_group)].rate
    else:
        return r.rate

rasp_diabetes['imputed_rate'] = rasp_diabetes.apply(impute_rate,re_group=re_group, axis=1)
rasp_diabetes['imputed_estimate'] = (rasp_diabetes.imputed_rate * rasp_diabetes.group_pop).round(0).astype(int)

rasp_diabetes['group_pop'] = rasp_diabetes['group_pop'].round(0).astype(int)
rasp_diabetes['estimate'] = rasp_diabetes['estimate'].round(0).astype(int)

rasp_diabetes.head()









    Out[8]:







  
    
      
      raceeth
      age_group
      sex
      pov
      group_pop
      estimate
      std
      rate
      imputed_rate
      imputed_estimate
    
  
  
    
      139
      asian
      18-024
      0
      0
      350720
      0
      0.000000
      0.000000
      0.007575
      2657
    
    
      132
      asian
      18-024
      0
      1
      81904
      0
      0.000000
      0.000000
      0.007575
      620
    
    
      119
      asian
      18-024
      1
      0
      317159
      2402
      303.854180
      0.007575
      0.007575
      2402
    
    
      137
      asian
      18-024
      1
      1
      71232
      0
      0.000000
      0.000000
      0.007575
      540
    
    
      126
      asian
      25-034
      0
      0
      375851
      25
      4.381716
      0.000067
      0.000067
      25



In [9]:

    
t = rasp_diabetes.reset_index().groupby(['raceeth','age_group']).sum()
t['rate'] = t.estimate/t.group_pop
t[['rate']].unstack(0).plot()









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x122452be0>



In [10]:

    
t = rasp_diabetes.reset_index().groupby(['raceeth','pov']).sum()
t['rate'] = t.estimate/t.group_pop
t[['rate']].unstack(0).plot(kind='bar')









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x12e394470>

Compare to CHIS

Here is the AskCHIS page for:

"Ever Diagnosed with diabetes"
by OMB Race
Pooling 2015, 2016 and 2017

Below is the same values from this dataset, which agree very closely. Note that to get the same values, you must use the estimate column, not the imputed_estimate.



In [11]:

    
t = rasp_diabetes.groupby('raceeth').sum().copy()
t['rate'] = (t.estimate / t.group_pop * 100).round(1)
t['rate']









    Out[11]:





raceeth
asian       8.6
black      12.9
hisp       12.2
nhwhite     8.0
other       8.3
Name: rate, dtype: float64

AskCHIS, By Race, 55-64



In [12]:

    
t = rasp_diabetes[ (rasp_diabetes.age_group=='55-064')  ].groupby('raceeth').sum().copy()
t['rate'] = (t.estimate / t.group_pop * 100).round(1)
assert round(t.loc['asian'].estimate,-3) == 96_000
t[['estimate','group_pop','rate']]

AskCHIS, By Race, 55-64, Male



In [13]:

    
t = rasp_diabetes[ (rasp_diabetes.age_group=='55-064') & (rasp_diabetes.sex == 1) ].groupby('raceeth').sum().copy()
t['rate'] = (t.estimate / t.group_pop * 100).round(1)
assert round(t.loc['asian'].estimate,-3) == 47_000
t[['estimate','rate']]

AskChis, By Race, 55-64, In Poverty



In [14]:

    
t = rasp_diabetes[ (rasp_diabetes.age_group=='55-064') & (rasp_diabetes.pov == 1) ].groupby('raceeth').sum().copy()
t['rate'] = (t.estimate / t.group_pop * 100).round(1)
assert round(t.loc['asian'].estimate,-3) == 30_000
t[['estimate','rate', 'group_pop']]



In [ ]:



In [ ]:



In [ ]:

	estimate	group_pop	rate
raceeth
asian	95941	536009	17.9
black	58940	299635	19.7
hisp	373566	1318747	28.3
nhwhite	244368	2431359	10.1
other	22386	137743	16.3

	estimate	rate
raceeth
asian	46739	19.4
black	30508	21.4
hisp	207525	31.0
nhwhite	129023	11.0
other	8110	11.8

	estimate	rate	group_pop
raceeth
asian	29679	36.7	80976
black	15868	24.8	63997
hisp	120816	36.4	332334
nhwhite	36447	18.6	195748
other	6666	24.9	26772

	index	diabetes	srsex	srage_p1	racedf_p1	povll	minority	poor	old	rakedw0	...	race_recode	diabetes_bool	group_age_mean	group_age_min	group_age_max	census_age_group	age_group	is_male
0	0	NO	FEMALE	70-74 YEARS	NON-LATINO WHITE	200-299% FPL	1	NPOV	OLD	12.380612	...	nhwhite	0	72.0	70	74	[65, 74]	65-074	0
1	1	NO	MALE	65-69 YEARS	NON-LATINO WHITE	300% FPL AND ABOVE	1	NPOV	OLD	216.417872	...	nhwhite	0	67.0	65	69	[65, 74]	65-074	1
2	2	NO	MALE	80-84 YEARS	NON-LATINO WHITE	300% FPL AND ABOVE	1	NPOV	OLD	112.672832	...	nhwhite	0	82.0	80	84	[75, 85]	75-085	1
3	3	NO	MALE	60-64 YEARS	NON-LATINO WHITE	100-199% FPL	1	POV	OLD	198.559016	...	nhwhite	0	62.0	60	64	[55, 64]	55-064	1
4	4	YES	FEMALE	60-64 YEARS	NON-LATINO WHITE	300% FPL AND ABOVE	1	NPOV	OLD	256.148161	...	nhwhite	1	62.0	60	64	[55, 64]	55-064	0

	raceeth	is_male	age_group	pov	diabetes_bool	rec_count	group_pop	estimate	repl_mean	std	rse	rate	sex
23	asian	1	55-064	1	27	77	49517.994223	21360.851002	21529.619929	970.031024	0.045412	0.431376	1
80	hisp	1	65-074	0	196	508	248559.204237	102916.455628	103076.981427	1631.825043	0.015856	0.414052	1
83	hisp	1	75-085	1	25	73	33383.942287	13774.911790	12905.067666	620.657912	0.045057	0.412621	1
81	hisp	1	65-074	1	62	138	78820.913665	31217.169504	32760.904189	865.259607	0.027717	0.396052	1
82	hisp	1	75-085	0	101	295	129487.642569	50434.324105	50772.893148	1829.917361	0.036283	0.389491	1

	raceeth	age_group	sex	pov	group_pop	estimate	std	rate	imputed_rate	imputed_estimate
139	asian	18-024	0	0	350720	0	0.000000	0.000000	0.007575	2657
132	asian	18-024	0	1	81904	0	0.000000	0.000000	0.007575	620
119	asian	18-024	1	0	317159	2402	303.854180	0.007575	0.007575	2402
137	asian	18-024	1	1	71232	0	0.000000	0.000000	0.007575	540
126	asian	25-034	0	0	375851	25	4.381716	0.000067	0.000067	25