Diabetes rates from CHIS surveys for 2015, 2016 and 2017, segmented by race, age, sex and poverty status


In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
import publicdata.chis as chis

%matplotlib inline
sns.set_context('notebook')
idx = pd.IndexSlice # Convenience redefinition.

In [2]:
# pkg = mp.jupyter.open_package()  # Use this form when running from the built package
pkg = mp.jupyter.open_source_package()
pkg


Out[2]:

CHIS California Health Interview Survey, Adults

healthpolicy.ucla.edu-chis-adult-1 Last Update: 2018-12-07T00:50:35

Documentation and Reference Links to CHIS files.

CHIS Data packages

Using these files requires accepting the terms and restrictions provided by the UCLA Center for Health Policy Research. These terms are available online and are reproduced here:

Restrictions on the Use of California Health Interview Survey Data Before you
download this file, you must first agree to these Restrictions on the Use of
CHIS Data by clicking the button below.

The California Health Interview Survey (CHIS) is bound by promises made to
respondents, by California law, and by University and government human subject
protection committees to assure that no personal information is released in a
form that identifies an individual without the consent of the person who
supplied the information. The California Information Practices Act (section
1798.24) provides that the data collected by CHIS may be released only for
statistical research and reporting purposes. Any intentional identification or
disclosure of personal information violates this law, and violates the privacy
rights of the people who provided data to CHIS. Unauthorized disclosure of
personal information is subject to civil action and penalties for invasion of
privacy under California Civil Code, Section 1798.53.

Documentation Links

Contacts

Resources

  • rasp_diabetes. Diabetes probabilities for age, race, sex, and poverty level ratio, for all CHIS respondents in California

References
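
The single resource in this package, rasp_diabetes, is not used directly in this notebook, but it can be loaded the same way as the references below; a minimal sketch, assuming the resource name matches the listing above:

In [ ]:
# Sketch: load the precomputed diabetes-probability resource (not used below).
rasp = pkg.resource('rasp_diabetes').dataframe()
rasp.head()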


In [3]:
# Load the 2017 data dictionary so we know what the columns are, and
# normalize the variable names and descriptions.
dd = pkg.reference('data_dict_17').dataframe().fillna(0)
dd['variable'] = dd.variable.str.lower()
dd['description'] = dd.description.str.capitalize()

# Demographic variables, excluding recodes.
demo_cols = list(dd[(dd.demographic == 1) & (~dd.description.str.contains('recode', case=False))].variable)
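
To confirm what was selected, the chosen variables can be listed with their data-dictionary descriptions; an optional check:

In [ ]:
# Optional: show the selected demographic variables with their descriptions.
dd[dd.variable.isin(demo_cols)][['variable', 'description']]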

In [5]:
df17 = pkg.reference('adult_2017').dataframe()
df16 = pkg.reference('adult_2016').dataframe()
df15 = pkg.reference('adult_2015').dataframe()
df14 = pkg.reference('adult_2014').dataframe()
df13 = pkg.reference('adult_2013').dataframe()

# Use only the 2016 and 2017 surveys here; the full list is kept for reference.
# all_sets = [df13, df14, df15, df16, df17]
all_sets = [df16, df17]

# Tag each survey with its year and apply the CHIS recodes, keeping the
# recoded frames.
for i, (df, year) in enumerate(zip(all_sets, [2016, 2017])):
    df['year'] = year
    all_sets[i] = chis.recode(df)
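
A quick, optional sanity check that the year tags and recodes took effect:

In [ ]:
# Optional: confirm each frame is labeled with the expected year.
for d in all_sets:
    print(d.year.iloc[0], d.shape)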

In [ ]:
# What are the columns common to all datasets?
from operator import and_
from functools import reduce

common_cols = list(reduce(and_, [e.columns for e in all_sets]))

# Restrict the data dictionary to the common columns.
i = pd.DataFrame(index=sorted(common_cols))
cc_dd = i.join(dd.set_index('variable')).fillna(0)

In [ ]:
# Concatenate the per-year surveys, keeping the common columns plus the
# flags added by chis.recode().
n_years, df = chis_concat(all_sets, common_cols + ['minority', 'poor', 'old', 'race_recode'])
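
chis_concat is not defined in this notebook; it is assumed to come from publicdata.chis. For readers without that helper, the sketch below shows what it is assumed to do: stack the per-year frames on the selected columns and report how many years were combined. The function name and return signature simply mirror the call above.

In [ ]:
# Fallback sketch, only if chis_concat is not importable from publicdata.chis.
# Assumes each frame carries the 'year' column added in the loop above.
def chis_concat(frames, cols):
    keep = [f[[c for c in cols if c in f.columns]] for f in frames]
    combined = pd.concat(keep, ignore_index=True, sort=False)
    return combined.year.nunique(), combined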

In [ ]:
def unhealthy(r):
    """Composite unhealthiness score for one respondent."""
    uh = 0
    uh += 1 if r.ab1 == 'FAIR' else 0     # General health condition
    uh += 2 if r.ab1 == 'POOR' else 0     # General health condition
    uh += 1 if r.ab17 == 'YES' else 0     # Doctor ever told have asthma
    uh += 2 if r.ab22 == 'YES' else 0     # Doctor ever told have diabetes
    uh += 1 if r.ab22 == 'BORDERLINE OR PRE-DIABETES' else 0
    uh += 1 if r.ab29 == 'YES' else 0     # Doctor ever told have high blood pressure
    uh += .25 if r.ab29 == 'BORDERLINE HYPERTENSION' else 0
    uh += .25 if r.ovrwt == 'YES' else 0  # Overweight

    return uh

uhcols = 'ab1 ab17 ab22 ab29 ovrwt'.split()

df['unhealthy'] = df.apply(unhealthy, axis=1)

df.unhealthy.hist()
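
The row-wise apply above is slow on a frame of this size. A vectorized equivalent of the same scoring (a sketch; the unhealthy_v column name is only for comparison against df.unhealthy) is:

In [ ]:
# Vectorized form of the same score; should match df.unhealthy exactly.
df['unhealthy_v'] = (
      (df.ab1 == 'FAIR') * 1
    + (df.ab1 == 'POOR') * 2
    + (df.ab17 == 'YES') * 1
    + (df.ab22 == 'YES') * 2
    + (df.ab22 == 'BORDERLINE OR PRE-DIABETES') * 1
    + (df.ab29 == 'YES') * 1
    + (df.ab29 == 'BORDERLINE HYPERTENSION') * .25
    + (df.ovrwt == 'YES') * .25
)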

In [ ]:
# Note: ccdf is built in the next cell; run that cell first.
ccdf.ad32_p1.value_counts()

In [ ]:
from scipy import stats
from tqdm import tqdm_notebook

ccdf = df[common_cols + ['unhealthy']].copy()

# Correlate each dummied category of each candidate variable with the
# numeric unhealthy score.
y = ccdf['unhealthy']
X = ccdf[common_cols]

vals = []

tn = tqdm_notebook(list(enumerate(X.columns)), desc='correlating')

for i, c in tn:

    if 'raked' in c:  # Skip the raked weight columns
        continue

    if c in uhcols:  # Skip the inputs to the unhealthy score
        continue

    if c.startswith('ab'):  # All related to the 'unhealthy' columns
        continue

    dm = pd.get_dummies(X[c])
    if len(dm.columns) > 7:  # Skip high-cardinality variables
        continue

    for dmc in dm.columns:
        try:
            pbr = stats.pearsonr(dm[dmc], y)
        except ValueError:
            continue

        if abs(pbr[0]) > .2:
            vals.append((c, dmc, pbr[0]))

        tn.set_description("{} {}".format(c, dmc))

In [ ]:
cors = pd.DataFrame.from_records(vals, columns=['variable','value','corr'])
jc = cors.set_index('variable').join(dd[['variable','description']].set_index('variable'))
jc.sort_values('corr',ascending=False)

In [ ]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pctile_cols = [c for c in df.columns if 'pctile' in c]

x = pd.get_dummies(df[common_cols]).values

# Standardizing the features
x = StandardScaler().fit_transform(x)
x = np.nan_to_num(x)

# y = df['hpi2score'].values  # Not used in this cell

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data=principalComponents, columns=['pc1', 'pc2'])
principalDf.plot.scatter(x='pc1', y='pc2')
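
Two components are an arbitrary choice; the explained variance ratio reports how much of the standardized, dummied demographic variation they actually capture:

In [ ]:
# Fraction of total variance captured by each principal component.
pca.explained_variance_ratio_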

In [ ]:
ccdf[common_cols].to_csv('chis1617-demo.csv')

In [ ]:
!pwd