In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from publicdata.chis import *

%matplotlib inline
sns.set_context('notebook')
idx = pd.IndexSlice # Convenience redefinition.

In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg


Out[2]:

CHIS California Health Interview Survey, Adults

healthpolicy.ucla.edu-chis-adult-1 Last Update: 2018-12-04T17:29:19

Documentation and Reference Links to CHIS files.

CHIS Data packages

Using these file requires accepting the terms and restrictions provided by the UCLA Center for Health Policy Research. These terms are available online, and reproduced here:

Restrictions on the Use of California Health Interview Survey Data Before you
download this file, you must first agree to these Restrictions on the Use of
CHIS Data by clicking the button below.

The California Health Interview Survey (CHIS) is bound by promises made to
respondents, by California law, and by University and government human subject
protection committees to assure that no personal information is released in a
form that identifies an individual without the consent of the person who
supplied the information. The California Information Practices Act (section
1798.24) provides that the data collected by CHIS may be released only for
statistical research and reporting purposes. Any intentional identification or
disclosure of personal information violates this law, and violates the privacy
rights of the people who provided data to CHIS. Unauthorized disclosure of
personal information is subject to civil action and penalties for invasion of
privacy under California Civil Code, Section 1798.53.

Documentation Links

Contacts

Resources

  • rasp_diabetes. Diabetes probabilities for age, race, sex, and poverty level ratio
  • rasp_diabetes_reduced. Like rasp_diabetes, but with two-level poverty and census age ranges

References


In [3]:
def recode(df):
    """ Recode to a simpler group of races.  For a lot of health outcomes, the major divisions are
    * White + Asian ( whasian )
    * Black + Latino Afrotino
    * Others
    """
    
    from pandas.api.types import CategoricalDtype
    
    df['race_recode'] = df.racedf_p1
    df.replace({'race_recode':{
        'NON-LATINO WHITE':'white',
        'NON-LATINO ASIAN':'asian',
        'NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE': 'other',
        'NON-LATINO AFR. AMER.': 'black',
        'LATINO': 'latino',
        'NON-LATINO, TWO+ RACES': 'other',
        'NON-LATINO OTHER, ONE RACE': 'other'
    }}, inplace=True)
    df.race_recode = df.race_recode.astype('category')
    
    df['race_recode_3'] = df.racedf_p1
    df.replace({'race_recode_3':{
        'NON-LATINO WHITE':'white_asian',
        'NON-LATINO ASIAN':'white_asian',
        'NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE': 'other',
        'NON-LATINO AFR. AMER.': 'black_latino',
        'LATINO': 'black_latino',
        'NON-LATINO, TWO+ RACES': 'other',
        'NON-LATINO OTHER, ONE RACE': 'other'
    }}, inplace=True)
    df.race_recode_3 = df.race_recode.astype('category')
    
    df['minority'] = (df['race_recode'] != 'white_asian').astype(int)
    
    
    df['old'] = (df.srage_p1 < '45-49 YEARS').astype(CategoricalDtype(categories=[False, True],ordered=True))
    df.old.cat.rename_categories(['OLD','YOUNG'], inplace=True)

    df['poor'] = (df.povll.isin(('200-299% FPL', '300% FPL AND ABOVE')) )\
        .astype(CategoricalDtype(categories=[True, False],ordered=True))
    df.poor.cat.rename_categories(['NPOV','POV'], inplace=True)

    return df

In [4]:
df17 = pkg.reference('adult_2017').dataframe()
df16 = pkg.reference('adult_2016').dataframe()
df15 = pkg.reference('adult_2015').dataframe()
df14 = pkg.reference('adult_2014').dataframe()
df13 = pkg.reference('adult_2013').dataframe()

# Rename some categories. 2016 and 2015 have "ALASKA" where the others have "ALASKAN", which 
# causes the concat() operation to convert categories to strings
cats_17 = df17.racedf_p1.cat.categories
cat_map = dict(zip(df16.racedf_p1.cat.categories, df17.racedf_p1.cat.categories))

for e in [df13,df14,df15,df16,df17]:
    e.racedf_p1.cat.rename_categories(cat_map, inplace=True)
    
for df, year in zip([df13, df14, df15, df16, df17], range(2013, 2018)):
    df['year'] = year
    
    df = recode(df)

In [5]:
# Pool two years of data. These are the only two that have ak22_p1, 
#  "HH TOTAL ANN. INC BEFORE TAXES IN 2014/2015 (PUF 1 YR RECODE)"

df = pd.concat([
        select_columns(df17, ['diabetes', 'ak22_p1']),
        select_columns(df16, ['diabetes', 'ak22_p1'])]).reset_index()

n_years = 2

# Need to divide all of the weights by 2
weight_cols = [c for c in df.columns if 'raked' in c]

df.loc[:,weight_cols] /= n_years
len(df)


Out[5]:
42208

In [6]:
t = chis_segment_estimate(df, 'diabetes')
t = t.loc[idx[('diabetes_pct','diabetes_count','diabetes_rse'),'YES'], #rows
          idx[:] # columns
         ]

t = t.unstack('measure')
t.columns = t.columns.droplevel()

t['diabetes_count'] /=  n_years
t


Out[6]:
measure diabetes_count diabetes_pct diabetes_rse
diabetes
YES 1.451790e+06 9.9 3.8

In [7]:
# Diabetes rates vs income. 
t = chis_segment_estimate(df, 'diabetes', 'ak22_p1')
# Now we can select with the two indexers. 
t = t.loc[idx[:,('diabetes_pct','diabetes_count','diabetes_rse'),'YES'], #rows
          idx[:] # columns
         ]

t = t.unstack('measure')

# The columns are multi-level, but there is only one value for the first level, 
# so it is useless. 
t.columns = t.columns.droplevel()

# We already selected for the single value of 'YES', so this level is useless too
t.index = t.index.droplevel('diabetes')

def split_range(v):
    if '-' in v:
        p = v.split('-')
        return int(p[0].replace(',',''))
    else:
        p = v.split()
        # -1 so "10,000" is not equal to "<10,000"
        return int(p[-1].replace(',',''))-5000
    
t = t.rename(mapper=split_range, axis='index')

t[t.diabetes_rse<=30].sort_index().diabetes_pct.plot()


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x136db0be0>

In [8]:
from matplotlib import pylab
from scipy.optimize import curve_fit
import plotly.plotly as py

def exponenial_func(x, a, b, c):
    return a*np.exp(-b*x)+c

t = t.sort_index().reset_index()

x = t.ak22_p1
y = t.diabetes_pct

popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))

xx = np.linspace(0, 180000, 500)
yy = exponenial_func(xx, *popt)

plt.plot(x,y,'o', xx, yy)
pylab.title('Diabetes Rates Vs Income\nWith Exponential Fit')
ax = plt.gca()
ax.set_facecolor((0.898, 0.898, 0.898))
fig = plt.gcf()



In [9]:
popt


Out[9]:
array([1.12899227e+01, 2.37719728e-05, 6.07291799e+00])

In [10]:
perr = np.sqrt(np.diag(pcov))
perr/popt


Out[10]:
array([0.16837275, 0.42169253, 0.16457238])

Poverty, Age and Race

Clearly, the most important factor in diabetes rates in age


In [11]:
n_years, df = chis_concat([df17,df16,df15], ['diabetes', 'race_recode', 'racedf_p1', 'poor', 'old'])

t = process_segments(df, 'diabetes', 'YES', ['old', 'poor','race_recode'])
t.diabetes_count /= n_years
t = t.reset_index().sort_values('diabetes_pct')
t[t.diabetes_rse < 30]


Out[11]:
measure race_recode poor old diabetes_count diabetes_pct diabetes_rse
17 white NPOV YOUNG 16350.145525 1.4 22.4
1 asian NPOV YOUNG 14811.581952 2.4 27.9
9 latino NPOV YOUNG 25101.715734 2.7 27.6
19 white POV YOUNG 11221.872617 3.5 28.3
11 latino POV YOUNG 53941.136343 4.8 17.7
16 white NPOV OLD 207666.517921 9.8 5.0
12 other NPOV OLD 10843.759849 12.9 22.6
0 asian NPOV OLD 55098.808223 14.4 18.0
4 black NPOV OLD 34079.879348 18.1 16.9
8 latino NPOV OLD 112671.763648 18.6 9.1
18 white POV OLD 89191.423772 18.9 9.1
14 other POV OLD 10251.531483 22.2 22.6
2 asian POV OLD 46963.213287 24.4 16.4
6 black POV OLD 28145.626668 24.7 16.4
10 latino POV OLD 231210.452400 27.9 7.0

Log regression


In [12]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)

n_years, df = chis_concat([df17,df16,df15], ['diabetes', 'srsex', 'srage_p1', 'racedf_p1', 'race_recode', 'povll', 'minority', 'poor', 'old'])

def age_group_to_age(v):
    try:
        y1, y2, _ = v.replace('-',' ').split()
        return (int(y1)+int(y2))/2
    except ValueError:
        # Probably '85+ YEARS'
        return 85

m = {
    "0-99% FPL": 50,
    "100-199% FPL": 150,
    "200-299% FPL":250,
    "300% FPL AND ABOVE": 400
}

df = pd.get_dummies(df, columns=['race_recode'], prefix='', prefix_sep = '')

df['diabetes_bool'] = (df.diabetes == 'YES').astype(int)
df['avg_group_age'] = df.srage_p1.apply(age_group_to_age)
df['mid_povll'] = df.povll.apply(lambda v: m[v])
df['poor_bool'] = (df.poor == 'POV').astype(int)
df['is_male'] = (df.srsex == 'MALE').astype(int)

#X = df[['avg_group_age', 'minority', 'mid_povll']]
X = df[['avg_group_age',  'mid_povll', 'is_male', 'asian','white','black','latino','other']]
#X = df[['avg_group_age',  'mid_povll']]
y = df['diabetes_bool']

logreg.fit(X, y)

df['lr_prob'] = logreg.predict_proba(X)[:, 1]
logreg.coef_,  df['lr_prob'].sum(), df['diabetes_bool'].sum(), sum(df.diabetes == 'YES')


Out[12]:
(array([[ 0.04475782, -0.00198462,  0.37597613, -0.84606202, -1.17699891,
         -0.40336332, -0.37034146, -0.61479478]]),
 7750.708983306723,
 7750,
 7750)

In [13]:
# Races and the on-hot encoding for them. 
races = (
    ('white', (1,0,0,0,0)),
    ('asian', (0,1,0,0,0)),
    ('black', (0,0,1,0,0)),
    ('latino',(0,0,0,1,0)),
    ('other', (0,0,0,0,1)),
)

l = []

for age in range(18,85):
    for pov in range(0,400,10):
        for sex in (1,0):
            for race_name, race_ohe in races:
                l.append( (age, pov, sex,race_name) + race_ohe )
                
rasp = pd.DataFrame(l, columns = 'age pov sex race_name asian white black latino other'.split())

In [14]:
# The first column ( [:,0]) is the probability of *not* having diabetes. 
cols = [ c for c in rasp.columns if c!= 'race_name' ]
rasp['prob'] = logreg.predict_proba(rasp[cols])[:,1]

In [16]:
df


Out[16]:
index diabetes srsex srage_p1 racedf_p1 povll minority poor old rakedw0 ... black latino other white diabetes_bool avg_group_age mid_povll poor_bool is_male lr_prob
0 0 NO FEMALE 70-74 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 12.380612 ... 0 0 0 1 0 72.0 250 0 0 0.134452
1 1 NO MALE 65-69 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 216.417872 ... 0 0 0 1 0 67.0 400 0 1 0.118401
2 2 NO MALE 80-84 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 112.672832 ... 0 0 0 1 0 82.0 400 0 1 0.208121
3 3 NO MALE 60-64 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 198.559016 ... 0 0 0 1 0 62.0 150 1 1 0.149912
4 4 YES FEMALE 60-64 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 256.148161 ... 0 0 0 1 1 62.0 400 0 0 0.068662
5 5 NO FEMALE 70-74 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 274.018202 ... 0 0 0 1 0 72.0 250 0 0 0.134452
6 6 NO FEMALE 35-39 YEARS NON-LATINO AFR. AMER. 200-299% FPL 1 NPOV YOUNG 399.144124 ... 1 0 0 0 0 37.0 250 0 0 0.065679
7 7 YES MALE 75-79 YEARS NON-LATINO WHITE 0-99% FPL 1 POV OLD 22.759789 ... 0 0 0 1 1 77.0 50 1 1 0.296199
8 8 YES FEMALE 65-69 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 123.138738 ... 0 0 0 1 1 67.0 250 0 0 0.110471
9 9 NO FEMALE 50-54 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 61.931248 ... 0 0 0 1 0 52.0 400 0 0 0.045002
10 10 NO MALE 40-44 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV YOUNG 842.692990 ... 0 0 0 1 0 42.0 400 0 1 0.042023
11 11 NO FEMALE 85+ YEARS NON-LATINO WHITE 0-99% FPL 1 POV OLD 114.904433 ... 0 0 0 1 0 85.0 50 1 0 0.292479
12 12 NO MALE 18-25 YEARS LATINO 300% FPL AND ABOVE 1 NPOV YOUNG 327.401825 ... 0 1 0 0 0 21.5 400 0 1 0.037779
13 13 NO FEMALE 75-79 YEARS LATINO 300% FPL AND ABOVE 1 NPOV OLD 427.010981 ... 0 1 0 0 0 77.0 400 0 0 0.244272
14 14 NO MALE 50-54 YEARS LATINO 100-199% FPL 1 POV OLD 249.541054 ... 0 1 0 0 0 52.0 150 1 1 0.201618
15 15 YES FEMALE 60-64 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 78.392987 ... 0 0 0 1 1 62.0 250 0 0 0.090320
16 16 NO FEMALE 75-79 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 68.493104 ... 0 0 0 1 0 77.0 400 0 0 0.126082
17 17 NO FEMALE 65-69 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 27.398983 ... 0 0 0 1 0 67.0 250 0 0 0.110471
18 18 NO FEMALE 65-69 YEARS NON-LATINO ASIAN 100-199% FPL 1 POV OLD 336.858081 ... 0 0 0 0 0 67.0 150 1 0 0.174143
19 19 NO FEMALE 60-64 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 415.304813 ... 0 0 0 1 0 62.0 250 0 0 0.090320
20 20 NO MALE 35-39 YEARS NON-LATINO WHITE 100-199% FPL 1 POV YOUNG 1648.270770 ... 0 0 0 1 0 37.0 150 1 1 0.054463
21 21 NO FEMALE 75-79 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 47.845522 ... 0 0 0 1 0 77.0 250 0 0 0.162688
22 22 NO FEMALE 60-64 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 350.884034 ... 0 0 0 1 0 62.0 150 1 0 0.108006
23 23 NO FEMALE 75-79 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 87.414431 ... 0 0 0 1 0 77.0 400 0 0 0.126082
24 24 NO FEMALE 70-74 YEARS NON-LATINO WHITE 0-99% FPL 1 POV OLD 427.336349 ... 0 0 0 1 0 72.0 50 1 0 0.187669
25 25 NO MALE 60-64 YEARS LATINO 300% FPL AND ABOVE 1 NPOV OLD 834.410693 ... 0 1 0 0 0 62.0 400 0 1 0.193912
26 26 NO FEMALE 75-79 YEARS NON-LATINO AFR. AMER. 200-299% FPL 1 NPOV OLD 384.252386 ... 1 0 0 0 0 77.0 250 0 0 0.296353
27 27 NO MALE 18-25 YEARS NON-LATINO ASIAN 200-299% FPL 1 NPOV YOUNG 1225.700078 ... 0 0 0 0 0 21.5 250 0 1 0.031814
28 28 NO MALE 75-79 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 327.008104 ... 0 0 0 1 0 77.0 400 0 1 0.173635
29 29 NO FEMALE 55-59 YEARS NON-LATINO AFR. AMER. 0-99% FPL 1 POV OLD 552.315264 ... 1 0 0 0 0 57.0 50 1 0 0.203760
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
63212 21004 NO MALE 55-59 YEARS NON-LATINO AFR. AMER. 300% FPL AND ABOVE 1 NPOV OLD 12.291232 ... 1 0 0 0 0 57.0 400 0 1 0.156884
63213 21005 YES FEMALE 60-64 YEARS LATINO 0-99% FPL 1 POV OLD 8.542873 ... 0 1 0 0 1 62.0 50 1 0 0.248590
63214 21006 NO MALE 30-34 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV YOUNG 74.086083 ... 0 0 0 1 0 32.0 400 0 1 0.027274
63215 21007 NO FEMALE 55-59 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 36.138842 ... 0 0 0 1 0 57.0 400 0 0 0.055661
63216 21008 NO MALE 45-49 YEARS LATINO 100-199% FPL 1 POV OLD 171.358428 ... 0 1 0 0 0 47.0 150 1 1 0.167981
63217 21009 NO FEMALE 70-74 YEARS LATINO 300% FPL AND ABOVE 1 NPOV OLD 2.179550 ... 0 1 0 0 0 72.0 400 0 0 0.205350
63218 21010 NO FEMALE 70-74 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 37.992934 ... 0 0 0 1 0 72.0 250 0 0 0.134452
63219 21011 NO MALE 70-74 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 102.303391 ... 0 0 0 1 0 72.0 250 0 1 0.184496
63220 21012 NO FEMALE 75-79 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 59.187063 ... 0 0 0 1 0 77.0 150 1 0 0.191561
63221 21013 YES MALE 60-64 YEARS NON-LATINO WHITE 0-99% FPL 1 POV OLD 283.065165 ... 0 0 0 1 1 62.0 50 1 1 0.176997
63222 21014 NO FEMALE 60-64 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 120.403543 ... 0 0 0 1 0 62.0 150 1 0 0.108006
63223 21015 NO MALE 70-74 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 148.623611 ... 0 0 0 1 0 72.0 400 0 1 0.143826
63224 21016 YES MALE 50-54 YEARS NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE 200-299% FPL 1 NPOV OLD 238.121520 ... 0 0 1 0 1 52.0 250 0 1 0.139539
63225 21017 NO FEMALE 65-69 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 74.921436 ... 0 0 0 1 0 67.0 250 0 0 0.110471
63226 21018 YES MALE 65-69 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 33.853494 ... 0 0 0 1 1 67.0 150 1 1 0.180716
63227 21019 NO FEMALE 55-59 YEARS NON-LATINO WHITE 0-99% FPL 1 POV OLD 185.379671 ... 0 0 0 1 0 57.0 50 1 0 0.105591
63228 21020 NO FEMALE 55-59 YEARS NON-LATINO WHITE 0-99% FPL 1 POV OLD 48.326165 ... 0 0 0 1 0 57.0 50 1 0 0.105591
63229 21021 NO FEMALE 70-74 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 141.706151 ... 0 0 0 1 0 72.0 400 0 0 0.103415
63230 21022 NO FEMALE 55-59 YEARS NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE 0-99% FPL 1 POV OLD 352.280342 ... 0 0 1 0 0 57.0 50 1 0 0.171591
63231 21023 NO MALE 70-74 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 64.030384 ... 0 0 0 1 0 72.0 400 0 1 0.143826
63232 21024 NO FEMALE 65-69 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 60.111408 ... 0 0 0 1 0 67.0 400 0 0 0.084429
63233 21025 NO FEMALE 50-54 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 438.721048 ... 0 0 0 1 0 52.0 150 1 0 0.071834
63234 21026 NO MALE 60-64 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV OLD 229.115698 ... 0 0 0 1 0 62.0 250 0 1 0.126336
63235 21027 NO MALE 55-59 YEARS NON-LATINO WHITE 100-199% FPL 1 POV OLD 731.698061 ... 0 0 0 1 0 57.0 150 1 1 0.123567
63236 21028 NO MALE 60-64 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 197.338667 ... 0 0 0 1 0 62.0 400 0 1 0.096962
63237 21029 NO FEMALE 70-74 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 135.379119 ... 0 0 0 1 0 72.0 400 0 0 0.103415
63238 21030 NO FEMALE 35-39 YEARS NON-LATINO WHITE 200-299% FPL 1 NPOV YOUNG 1328.759884 ... 0 0 0 1 0 37.0 250 0 0 0.031411
63239 21031 NO FEMALE 60-64 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 395.479296 ... 0 0 0 1 0 62.0 400 0 0 0.068662
63240 21032 NO MALE 65-69 YEARS NON-LATINO WHITE 300% FPL AND ABOVE 1 NPOV OLD 144.205192 ... 0 0 0 1 0 67.0 400 0 1 0.118401
63241 21033 YES FEMALE 65-69 YEARS LATINO 100-199% FPL 1 POV OLD 380.029673 ... 0 1 0 0 1 67.0 150 1 0 0.253351

63242 rows × 101 columns

Since the probabilities are 0 to 1, they should sum to the total number of diabetes caes in the dataset, and they almost do.

Soda vs Diabetes


In [17]:
n_years, df = chis_concat([df17,df16,df15], ['ac11', 'diabetes', 'race_recode', 'poor', 'old'])

# Approximately equal count bins, 
# also: pd.qcut(df.ac11,20, duplicates='drop').cat.categories
df['soda_recode'] = pd.cut(df.ac11,[-1,0,1,2,3,4,7,10,20,30,300])
df.soda_recode.cat.set_categories("000,001,002,003,004,007,010,020,030,300".split(','), rename=True, inplace=True)
df.soda_recode.value_counts().sort_index()


Out[17]:
000    34417
001     4302
002     3966
003     2185
004     3511
007     2353
010     3109
020     3365
030     3979
300     2055
Name: soda_recode, dtype: int64

In [18]:
t = chis_segment_estimate(df, 'soda_recode', ['old'])
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
t[['soda_recode_pct']].unstack('old')


Out[18]:
measure soda_recode_pct
old OLD YOUNG
soda_recode
000 57.3 33.7
001 6.2 7.8
002 6.1 7.8
003 3.4 4.8
004 5.8 8.5
007 2.9 5.9
010 4.7 8.5
020 4.5 9.4
030 6.2 8.8
300 2.9 4.7

In [19]:
t = chis_segment_estimate(df, 'soda_recode', ['poor'])
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
t[['soda_recode_pct']].unstack('poor')


Out[19]:
measure soda_recode_pct
poor NPOV POV
soda_recode
000 50.8 36.9
001 7.6 5.8
002 7.1 6.5
003 3.9 4.4
004 6.5 8.2
007 4.0 5.0
010 5.7 8.1
020 6.1 8.4
030 5.7 10.6
300 2.5 6.1

In [20]:
t = chis_segment_estimate(df, 'soda_recode', ['race_recode'])
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
t[['soda_recode_pct']].unstack('race_recode')


Out[20]:
measure soda_recode_pct
race_recode asian black latino other white
soda_recode
000 49.2 38.2 32.8 47.0 56.8
001 8.5 7.1 6.1 8.0 7.1
002 9.1 8.0 6.6 9.6 6.1
003 4.9 3.9 5.1 3.8 3.0
004 8.8 6.8 9.2 7.1 4.8
007 3.5 5.2 5.3 4.9 3.7
010 6.1 7.9 9.3 4.3 4.4
020 5.2 8.4 9.5 4.7 5.3
030 3.4 7.1 11.3 6.8 5.6
300 1.3 7.5 4.7 3.8 3.3

This correlation is probably a bad idea, since while drinking sugary drinks is a risk factor in developing diabetes, diabetics shouldn't drink sodas, so it isn't surprising that there is a higher percentage of diabetics among people who don't drink sodas, precisely the opposite connection you might expect.

To properly perform this correlationwould require a longitudinal study, where the soda consumption in earlier years can be correlated with diabetes in later years.


In [21]:
t = chis_segment_estimate(df, 'diabetes', ['soda_recode'])
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
t = t[['diabetes_pct']].loc[(slice(None),"YES"),:]
t.index = t.index.droplevel('diabetes')
t.sort_index()


Out[21]:
measure diabetes_pct
soda_recode
000 13.2
001 8.0
002 7.2
003 9.1
004 7.6
007 5.5
010 6.4
020 5.6
030 7.1
300 6.6

Food Insecurity


In [22]:
df17.fslev.value_counts()


Out[22]:
INAPPLICABLE - >=200% FPL     14927
FOOD SECURITY                  3643
FOOD INSECURITY W/O HUNGER     1783
FOOD INSECURITY W/ HUNGER       800
Name: fslev, dtype: int64

In [23]:
n_years, df = chis_concat([df17,df16,df15], ['fslev', 'race_recode', 'poor',])
t = chis_segment_estimate(df, 'fslev', ['race_recode', 'poor'])
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
t1  = t
t = t[['fslev_pct']].unstack(['poor'])
t.loc[(slice(None),('FOOD INSECURITY W/O HUNGER','FOOD INSECURITY W/ HUNGER')),:]


Out[23]:
measure fslev_pct
poor NPOV POV
race_recode fslev
asian FOOD INSECURITY W/O HUNGER NaN 23.1
FOOD INSECURITY W/ HUNGER NaN 7.3
black FOOD INSECURITY W/O HUNGER NaN 29.8
FOOD INSECURITY W/ HUNGER NaN 19.9
latino FOOD INSECURITY W/O HUNGER NaN 31.4
FOOD INSECURITY W/ HUNGER NaN 12.9
other FOOD INSECURITY W/O HUNGER NaN 24.7
FOOD INSECURITY W/ HUNGER NaN 30.4
white FOOD INSECURITY W/O HUNGER NaN 23.2
FOOD INSECURITY W/ HUNGER NaN 18.3

In [24]:
t1 = t1.loc[('POV',slice(None),('FOOD INSECURITY W/O HUNGER','FOOD INSECURITY W/ HUNGER')),:]
t1[['fslev_pct']].unstack('race_recode')


Out[24]:
measure fslev_pct
race_recode asian black latino other white
poor fslev
POV FOOD INSECURITY W/O HUNGER 23.1 29.8 31.4 24.7 23.2
FOOD INSECURITY W/ HUNGER 7.3 19.9 12.9 30.4 18.3

In [25]:
len(df)


Out[25]:
63242

In [ ]: