notebook.community

Edit and run



In [1]:

    
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from publicdata.chis import *

%matplotlib inline
sns.set_context('notebook')
idx = pd.IndexSlice # Convenience redefinition.



In [2]:

    
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg









    Out[2]:




CHIS California Health Interview Survey, Adults
healthpolicy.ucla.edu-chis-adult-1 Last Update: 2018-12-06T01:35:05
Documentation and Reference Links to CHIS files.
CHIS Data packages

Using these file requires accepting the terms and restrictions provided by the
UCLA Center for Health Policy Research. These terms are available
online, and reproduced here:
Restrictions on the Use of California Health Interview Survey Data Before you
download this file, you must first agree to these Restrictions on the Use of
CHIS Data by clicking the button below.

The California Health Interview Survey (CHIS) is bound by promises made to
respondents, by California law, and by University and government human subject
protection committees to assure that no personal information is released in a
form that identifies an individual without the consent of the person who
supplied the information. The California Information Practices Act (section
1798.24) provides that the data collected by CHIS may be released only for
statistical research and reporting purposes. Any intentional identification or
disclosure of personal information violates this law, and violates the privacy
rights of the people who provided data to CHIS. Unauthorized disclosure of
personal information is subject to civil action and penalties for invasion of
privacy under California Civil Code, Section 1798.53.

Documentation Links

dd_adult_2017 Data dictionary, Adult, 2017
dd_adult_2016 Data dictionary, Adult, 2016
dd_adult_2015 Data dictionary, Adult, 2015
dd_adult_2014 Data dictionary, Adult, 2014
dd_adult_2013 Data dictionary, Adult, 2013
homepage Data download home page.

Contacts

Wrangler Eric Busboom, Civic Knowledge

Resources

 rasp_diabetes. Diabetes probabilities for age, race, sex, and poverty level ratio
 rasp_diabetes_reduced. Like rasp_diabetes, but with two-level poverty and census age ranges

References

adult_2017. CHIS, Adult 2017
adult_2016. CHIS, Adult 2016
adult_2015. CHIS, Adult 2015
adult_2014. CHIS, Adult 2014
adult_2013. CHIS, Adult 2013



In [3]:

    
def recode(df):
    """ Recode to a simpler group of races.  For a lot of health outcomes, the major divisions are
    * White + Asian ( whasian )
    * Black + Latino Afrotino
    * Others
    """
    
    from pandas.api.types import CategoricalDtype
    
    df['race_recode'] = df.racedf_p1
    df.replace({'race_recode':{
        'NON-LATINO WHITE':'nhwhite',
        'NON-LATINO ASIAN':'asian',
        'NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE': 'other',
        'NON-LATINO AFR. AMER.': 'black',
        'LATINO': 'hisp',
        'NON-LATINO, TWO+ RACES': 'other',
        'NON-LATINO OTHER, ONE RACE': 'other'
    }}, inplace=True)
    df.race_recode = df.race_recode.astype('category')
    

    df['minority'] = (df['race_recode'] != 'white_asian').astype(int)
    
    
    df['old'] = (df.srage_p1 < '45-49 YEARS').astype(CategoricalDtype(categories=[False, True],ordered=True))
    df.old.cat.rename_categories(['OLD','YOUNG'], inplace=True)

    df['poor'] = (df.povll.isin(('200-299% FPL', '300% FPL AND ABOVE')) )\
        .astype(CategoricalDtype(categories=[True, False],ordered=True))
    df.poor.cat.rename_categories(['NPOV','POV'], inplace=True)

    return df



In [4]:

    
df17 = pkg.reference('adult_2017').dataframe()
df16 = pkg.reference('adult_2016').dataframe()
df15 = pkg.reference('adult_2015').dataframe()
df14 = pkg.reference('adult_2014').dataframe()
df13 = pkg.reference('adult_2013').dataframe()

# Rename some categories. 2016 and 2015 have "ALASKA" where the others have "ALASKAN", which 
# causes the concat() operation to convert categories to strings
cats_17 = df17.racedf_p1.cat.categories
cat_map = dict(zip(df16.racedf_p1.cat.categories, df17.racedf_p1.cat.categories))

for e in [df13,df14,df15,df16,df17]:
    e.racedf_p1.cat.rename_categories(cat_map, inplace=True)
    
for df, year in zip([df13, df14, df15, df16, df17], range(2013, 2018)):
    df['year'] = year
    
    df = recode(df)



In [5]:

    
# Pool two years of data. These are the only two that have ak22_p1, 
#  "HH TOTAL ANN. INC BEFORE TAXES IN 2014/2015 (PUF 1 YR RECODE)"

df = pd.concat([
        select_columns(df17, ['diabetes', 'ak22_p1']),
        select_columns(df16, ['diabetes', 'ak22_p1'])]).reset_index()

n_years = 2

# Need to divide all of the weights by 2
weight_cols = [c for c in df.columns if 'raked' in c]

df.loc[:,weight_cols] /= n_years
len(df)









    Out[5]:





42208



In [6]:

    
t = chis_segment_estimate(df, 'diabetes')
t = t.loc[idx[('diabetes_pct','diabetes_count','diabetes_rse'),'YES'], #rows
          idx[:] # columns
         ]

t = t.unstack('measure')
t.columns = t.columns.droplevel()

t['diabetes_count'] /=  n_years
t









    Out[6]:







  
    
      measure
      diabetes_count
      diabetes_pct
      diabetes_rse
    
    
      diabetes
      
      
      
    
  
  
    
      YES
      1.451790e+06
      9.9
      3.8



In [7]:

    
# Diabetes rates vs income. 
t = chis_segment_estimate(df, 'diabetes', 'ak22_p1')
# Now we can select with the two indexers. 
t = t.loc[idx[:,('diabetes_pct','diabetes_count','diabetes_rse'),'YES'], #rows
          idx[:] # columns
         ]

t = t.unstack('measure')

# The columns are multi-level, but there is only one value for the first level, 
# so it is useless. 
t.columns = t.columns.droplevel()

# We already selected for the single value of 'YES', so this level is useless too
t.index = t.index.droplevel('diabetes')

def split_range(v):
    if '-' in v:
        p = v.split('-')
        return int(p[0].replace(',',''))
    else:
        p = v.split()
        # -1 so "10,000" is not equal to "<10,000"
        return int(p[-1].replace(',',''))-5000
    
t = t.rename(mapper=split_range, axis='index')

t[t.diabetes_rse<=30].sort_index().diabetes_pct.plot()









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x126723ba8>



In [8]:

    
from matplotlib import pylab
from scipy.optimize import curve_fit
import plotly.plotly as py

def exponenial_func(x, a, b, c):
    return a*np.exp(-b*x)+c

t = t.sort_index().reset_index()

x = t.ak22_p1
y = t.diabetes_pct

popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))

xx = np.linspace(0, 180000, 500)
yy = exponenial_func(xx, *popt)

plt.plot(x,y,'o', xx, yy)
pylab.title('Diabetes Rates Vs Income\nWith Exponential Fit')
ax = plt.gca()
ax.set_facecolor((0.898, 0.898, 0.898))
fig = plt.gcf()



In [9]:

    
popt









    Out[9]:





array([1.12899227e+01, 2.37719728e-05, 6.07291799e+00])



In [10]:

    
perr = np.sqrt(np.diag(pcov))
perr/popt









    Out[10]:





array([0.16837275, 0.42169253, 0.16457238])

Poverty, Age and Race

Clearly, the most important factor in diabetes rates in age



In [11]:

    
n_years, df = chis_concat([df17,df16,df15], ['diabetes', 'race_recode', 'racedf_p1', 'poor', 'old'])

t = process_segments(df, 'diabetes', 'YES', ['old', 'poor','race_recode'])
t.diabetes_count /= n_years
t = t.reset_index().sort_values('diabetes_pct')
t[t.diabetes_rse < 30]









    Out[11]:







  
    
      measure
      race_recode
      poor
      old
      diabetes_count
      diabetes_pct
      diabetes_rse
    
  
  
    
      13
      nhwhite
      NPOV
      YOUNG
      16350.145525
      1.4
      22.4
    
    
      1
      asian
      NPOV
      YOUNG
      14811.581952
      2.4
      27.9
    
    
      9
      hisp
      NPOV
      YOUNG
      25101.715734
      2.7
      27.6
    
    
      15
      nhwhite
      POV
      YOUNG
      11221.872617
      3.5
      28.3
    
    
      11
      hisp
      POV
      YOUNG
      53941.136343
      4.8
      17.7
    
    
      12
      nhwhite
      NPOV
      OLD
      207666.517921
      9.8
      5.0
    
    
      16
      other
      NPOV
      OLD
      10843.759849
      12.9
      22.6
    
    
      0
      asian
      NPOV
      OLD
      55098.808223
      14.4
      18.0
    
    
      4
      black
      NPOV
      OLD
      34079.879348
      18.1
      16.9
    
    
      8
      hisp
      NPOV
      OLD
      112671.763648
      18.6
      9.1
    
    
      14
      nhwhite
      POV
      OLD
      89191.423772
      18.9
      9.1
    
    
      18
      other
      POV
      OLD
      10251.531483
      22.2
      22.6
    
    
      2
      asian
      POV
      OLD
      46963.213287
      24.4
      16.4
    
    
      6
      black
      POV
      OLD
      28145.626668
      24.7
      16.4
    
    
      10
      hisp
      POV
      OLD
      231210.452400
      27.9
      7.0

Log regression



In [12]:

    
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)

n_years, df = chis_concat([df17,df16,df15], ['diabetes', 'srsex', 'srage_p1', 'racedf_p1', 'race_recode', 'povll', 'minority', 'poor', 'old'])

def age_group_parts(v):
    try:
        y1, y2, _ = v.replace('-',' ').split()
        return y1,y2
    except ValueError:
        # Probably '85+ YEARS'
        return 85, 120



def age_group_to_age(v):
    y1, y2 = age_group_parts(v)
    if y1 == 85:
        return 85
    else:
        return (int(y1)+int(y2))/2


pov_mid_map = {
    "0-99% FPL": 50,
    "100-199% FPL": 150,
    "200-299% FPL":250,
    "300% FPL AND ABOVE": 400
}

pov_status_map = {
    "0-99% FPL": 1,
    "100-199% FPL": 0,
    "200-299% FPL":0,
    "300% FPL AND ABOVE": 0
}


df = pd.get_dummies(df, columns=['race_recode'], prefix='', prefix_sep = '')

df['diabetes_bool'] = (df.diabetes == 'YES').astype(int)
df['group_age_mean'] = df.srage_p1.apply(lambda v:age_group_to_age(v)).astype(float)
df['group_age_min'] = df.srage_p1.apply(lambda v:age_group_parts(v)[0]).astype(int)
df['group_age_max'] = df.srage_p1.apply(lambda v:age_group_parts(v)[1]).astype(int)

df['mid_povll'] = df.povll.apply(lambda v: pov_mid_map[v])
df['pov'] = df.povll.apply(lambda v: pov_status_map[v])
df['poor_bool'] = (df.poor == 'POV').astype(int)
df['is_male'] = (df.srsex == 'MALE').astype(int)

#X = df[['avg_group_age', 'minority', 'mid_povll']]
X = df[['group_age_mean',  'mid_povll', 'is_male', 'asian','nhwhite','black','hisp','other']]
#X = df[['avg_group_age',  'mid_povll']]
y = df['diabetes_bool']

logreg.fit(X, y)

df['lr_prob'] = logreg.predict_proba(X)[:, 1]

logreg.coef_,  df['lr_prob'].sum(), df['diabetes_bool'].sum(), sum(df.diabetes == 'YES')









    Out[12]:





(array([[ 0.04475782, -0.00198462,  0.37597614, -0.84606202, -1.17699891,
         -0.40336332, -0.37034146, -0.61479478]]),
 7750.708415967981,
 7750,
 7750)



In [13]:

    
df.head()









    Out[13]:







  
    
      
      index
      diabetes
      srsex
      srage_p1
      racedf_p1
      povll
      minority
      poor
      old
      rakedw0
      ...
      other
      diabetes_bool
      group_age_mean
      group_age_min
      group_age_max
      mid_povll
      pov
      poor_bool
      is_male
      lr_prob
    
  
  
    
      0
      0
      NO
      FEMALE
      70-74 YEARS
      NON-LATINO WHITE
      200-299% FPL
      1
      NPOV
      OLD
      12.380612
      ...
      0
      0
      72.0
      70
      74
      250
      0
      0
      0
      0.134452
    
    
      1
      1
      NO
      MALE
      65-69 YEARS
      NON-LATINO WHITE
      300% FPL AND ABOVE
      1
      NPOV
      OLD
      216.417872
      ...
      0
      0
      67.0
      65
      69
      400
      0
      0
      1
      0.118401
    
    
      2
      2
      NO
      MALE
      80-84 YEARS
      NON-LATINO WHITE
      300% FPL AND ABOVE
      1
      NPOV
      OLD
      112.672832
      ...
      0
      0
      82.0
      80
      84
      400
      0
      0
      1
      0.208121
    
    
      3
      3
      NO
      MALE
      60-64 YEARS
      NON-LATINO WHITE
      100-199% FPL
      1
      POV
      OLD
      198.559016
      ...
      0
      0
      62.0
      60
      64
      150
      0
      1
      1
      0.149912
    
    
      4
      4
      YES
      FEMALE
      60-64 YEARS
      NON-LATINO WHITE
      300% FPL AND ABOVE
      1
      NPOV
      OLD
      256.148161
      ...
      0
      1
      62.0
      60
      64
      400
      0
      0
      0
      0.068662
    
  

5 rows × 104 columns



In [14]:

    
(df.rakedw0*df.lr_prob).sum()









    Out[14]:





2975892.525311896



In [15]:

    
# Races and the on-hot encoding for them. 
races = (
    ('nhwhite', (1,0,0,0,0)),
    ('asian', (0,1,0,0,0)),
    ('black', (0,0,1,0,0)),
    ('hisp',(0,0,0,1,0)),
    ('other', (0,0,0,0,1)),
)

l = []

for age in range(18,85):
    for pov in range(0,400,10):
        for sex in (1,0):
            for race_name, race_ohe in races:
                l.append( (age, pov, sex,race_name) + race_ohe )
                
rasp = pd.DataFrame(l, columns = 'age pov sex raceeth asian nhwhite black hisp other'.split())

# The first column ( [:,0]) is the probability of *not* having diabetes. 
cols = [ c for c in rasp.columns if c!= 'raceeth' ]
rasp['prob'] = logreg.predict_proba(rasp[cols])[:,1]

# Get rid of the one-hot columns 
rasp = rasp['age pov sex raceeth prob'.split()]

rasp.head()



In [16]:

    
rasp.groupby('raceeth').prob.mean()









    Out[16]:





raceeth
asian      0.100538
black      0.184821
hisp       0.189307
nhwhite    0.131815
other      0.157859
Name: prob, dtype: float64



In [17]:

    
rasp.sort_values('prob', ascending = True).head()



In [18]:

    
rasp.prob.describe()









    Out[18]:





count    26800.000000
mean         0.152868
std          0.114989
min          0.010385
25%          0.059406
50%          0.118095
75%          0.222134
max          0.587522
Name: prob, dtype: float64



In [19]:

    
rasp.prob.sum()









    Out[19]:





4096.863445681018



In [ ]:



In [20]:

    
logreg = LogisticRegression(C=1e9)

X = df[['group_age_mean', 'pov', 'is_male', 'asian','nhwhite','black','hisp','other']]
y = df['diabetes_bool']

logreg.fit(X, y)

l = []

# Convert to census age ranges
census_age_ranges = [
    pd.Interval(18, 24, closed='both'),
    pd.Interval(25, 34, closed='both'),
    pd.Interval(35, 44, closed='both'),
    pd.Interval(45, 54, closed='both'),
    pd.Interval(55, 64, closed='both'),
    pd.Interval(65, 74, closed='both'),
    pd.Interval(75, 120, closed='both')
]

for age_range in census_age_ranges:
    age =( age_range.left + age_range.right)/2
    for pov in (1,0):
        for sex in (1,0):
            for race_name, race_ohe in races:
                l.append( (age, age_range.left, age_range.right, pov, sex, race_name) + race_ohe )
                
t = pd.DataFrame(l, columns = 'age age_min age_max pov sex raceeth asian nhwhite black hisp other'.split())

cols = [ c for c in t.columns if c not in  ['raceeth', 'age_min','age_max' ] ]
t['prob'] = logreg.predict_proba(t[cols])[:,1] # The first column ( [:,0]) is the probability of *not* having diabetes. 

t.head()

# Get rid of the one-hot columns 
t = t['age age_min age_max pov sex raceeth prob'.split()]

rasp_reduced = t

rasp_reduced.head()



In [21]:

    
len(rasp_reduced)









    Out[21]:





140



In [ ]:

measure	race_recode	poor	old	diabetes_count	diabetes_pct	diabetes_rse
13	nhwhite	NPOV	YOUNG	16350.145525	1.4	22.4
1	asian	NPOV	YOUNG	14811.581952	2.4	27.9
9	hisp	NPOV	YOUNG	25101.715734	2.7	27.6
15	nhwhite	POV	YOUNG	11221.872617	3.5	28.3
11	hisp	POV	YOUNG	53941.136343	4.8	17.7
12	nhwhite	NPOV	OLD	207666.517921	9.8	5.0
16	other	NPOV	OLD	10843.759849	12.9	22.6
0	asian	NPOV	OLD	55098.808223	14.4	18.0
4	black	NPOV	OLD	34079.879348	18.1	16.9
8	hisp	NPOV	OLD	112671.763648	18.6	9.1
14	nhwhite	POV	OLD	89191.423772	18.9	9.1
18	other	POV	OLD	10251.531483	22.2	22.6
2	asian	POV	OLD	46963.213287	24.4	16.4
6	black	POV	OLD	28145.626668	24.7	16.4
10	hisp	POV	OLD	231210.452400	27.9	7.0

	index	diabetes	srsex	srage_p1	racedf_p1	povll	minority	poor	old	rakedw0	...	diabetes_bool	group_age_mean	group_age_min	group_age_max	mid_povll	poor_bool	is_male	lr_prob
0	0	NO	FEMALE	70-74 YEARS	NON-LATINO WHITE	200-299% FPL	1	NPOV	OLD	12.380612	...	0	72.0	70	74	250	0	0	0.134452
1	1	NO	MALE	65-69 YEARS	NON-LATINO WHITE	300% FPL AND ABOVE	1	NPOV	OLD	216.417872	...	0	67.0	65	69	400	0	1	0.118401
2	2	NO	MALE	80-84 YEARS	NON-LATINO WHITE	300% FPL AND ABOVE	1	NPOV	OLD	112.672832	...	0	82.0	80	84	400	0	1	0.208121
3	3	NO	MALE	60-64 YEARS	NON-LATINO WHITE	100-199% FPL	1	POV	OLD	198.559016	...	0	62.0	60	64	150	1	1	0.149912
4	4	YES	FEMALE	60-64 YEARS	NON-LATINO WHITE	300% FPL AND ABOVE	1	NPOV	OLD	256.148161	...	1	62.0	60	64	400	0	0	0.068662

	age	sex	raceeth	prob
0	18	1	nhwhite	0.044108
1	18	1	asian	0.032079
2	18	1	black	0.067025
3	18	1	hisp	0.069120
4	18	1	other	0.054954

	age	pov	raceeth	prob
396	18	390	asian	0.010385
386	18	380	asian	0.010591
376	18	370	asian	0.010801
796	19	390	asian	0.010856
366	18	360	asian	0.011015

	age	age_min	age_max	pov	sex	raceeth	prob
0	21.0	18	24	1	1	nhwhite	0.042474
1	21.0	18	24	1	1	asian	0.029804
2	21.0	18	24	1	1	black	0.066237
3	21.0	18	24	1	1	hisp	0.072074
4	21.0	18	24	1	1	other	0.054269