Diabetes rates from CHIS surveys for 2015, 2016 and 2017, segmented by race, age, sex and poverty status
In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import publicdata.chis as chis
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Select seaborn's 'notebook' plotting context for subsequent plots.
sns.set_context('notebook')
idx = pd.IndexSlice # Convenience redefinition.
In [2]:
# Open the metapack package that supplies every dataset referenced below.
# The commented-out call is the alternative opener, kept as a manual toggle.
# NOTE(review): presumably open_package() targets the built package and
# open_source_package() the in-development source package -- confirm against
# the metapack documentation.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg  # bare expression so the cell displays the package
Out[2]:
In [3]:
# Link the common cols to the data dict so we know what the columns are.
dd = pkg.reference('data_dict_17').dataframe().fillna(0)
dd['variable'] = dd.variable.str.lower()
dd['description'] = dd.description.str.capitalize()

# Demographic variables, excluding their recoded variants.
#  * case=False: capitalize() above turns a description that *starts* with
#    "recode" into "Recode ...", which a case-sensitive match would miss.
#  * na=False: fillna(0) leaves non-string cells in `description`; the .str
#    methods yield NaN for those, which would break the boolean mask.
is_demographic = dd.demographic == 1
is_recode = dd.description.str.contains('recode', case=False, na=False)
demo_cols = list(dd[is_demographic & ~is_recode].variable)
In [5]:
# Load every adult survey year available in the package.
df17 = pkg.reference('adult_2017').dataframe()
df16 = pkg.reference('adult_2016').dataframe()
df15 = pkg.reference('adult_2015').dataframe()
df14 = pkg.reference('adult_2014').dataframe()
df13 = pkg.reference('adult_2013').dataframe()

# Pair each frame with its survey year explicitly. Zipping the selected frames
# against range(2013, 2018) labelled the 2016 and 2017 data as 2013 and 2014.
frames_by_year = {2013: df13, 2014: df14, 2015: df15, 2016: df16, 2017: df17}
analysis_years = [2016, 2017]  # the years actually analysed below

all_sets = []
for year in analysis_years:
    frame = frames_by_year[year]
    frame['year'] = year
    recoded = chis.recode(frame)
    # NOTE(review): recode() appears to add its columns in place; keep the
    # returned frame when there is one so either convention ends up in all_sets.
    all_sets.append(frame if recoded is None else recoded)
In [ ]:
# What are the columns common to all datasets?
from operator import and_, or_
from functools import reduce
# Intersect the column indexes with Index.intersection rather than `&`:
# the `&` set operation on Index objects is deprecated and is an element-wise
# logical AND in pandas 2, which does not compute the common columns.
common_cols = list(reduce(pd.Index.intersection, [e.columns for e in all_sets]))

# Data-dictionary rows for the common columns, one row per column name.
# The join must use the frame indexed by 'variable'; joining the raw `dd`
# (integer index) against column names matches nothing.
cc_dd = dd.set_index('variable')
common_index = pd.DataFrame(index=sorted(common_cols))
cc_dd = common_index.join(cc_dd).fillna(0)
In [ ]:
# Combine the per-year frames into one frame restricted to the shared columns
# plus four extra ones.
# NOTE(review): `chis_concat` is called unqualified, but the visible cells only
# import `publicdata.chis as chis` -- confirm where it is defined or imported.
# NOTE(review): the extra columns are presumably produced by chis.recode();
# verify.
extra_cols = ['minority', 'poor', 'old', 'race_recode']
n_years, df = chis_concat(all_sets, common_cols + extra_cols)
In [ ]:
def unhealthy(r):
    """Return an additive 'unhealthiness' score for one survey row.

    Each rule below is (column, answer, points); the points of every rule
    whose column holds exactly that answer are summed. A row matching no
    rule scores 0.

    r: object exposing the attributes ab1, ab17, ab22, ab29 and ovrwt
       (for example a DataFrame row passed by ``df.apply(..., axis=1)``).
    """
    scoring_rules = (
        ('ab1', 'FAIR', 1),    # General health condition
        ('ab1', 'POOR', 2),    # General health condition
        ('ab17', 'YES', 1),    # DOCTOR EVER TOLD HAVE ASTHMA
        ('ab22', 'YES', 2),    # DOCTOR EVER TOLD HAVE DIABETES
        ('ab22', 'BORDERLINE OR PRE-DIABETES', 1),
        ('ab29', 'YES', 1),    # DOCTOR EVER TOLD HAVE HIGH BLOOD PRESSURE
        ('ab29', 'BORDERLINE HYPERTENSION', .25),
        ('ovrwt', 'YES', .25),  # Overweight
    )
    return sum(
        points
        for column, answer, points in scoring_rules
        if getattr(r, column) == answer
    )
# Columns that unhealthy() reads; the correlation step excludes them.
uhcols = ['ab1', 'ab17', 'ab22', 'ab29', 'ovrwt']

# Score every respondent row by row, then plot the score distribution.
df['unhealthy'] = df.apply(unhealthy, axis=1)
df['unhealthy'].hist()
In [ ]:
# Response counts for ad32_p1. Read from `df`: `ccdf` is only created in a
# later cell (as a column subset of `df`), so referencing it here raises
# NameError when the notebook is run top to bottom.
df.ad32_p1.value_counts()
In [ ]:
from scipy import stats
from tqdm import tqdm_notebook
ccdf = df[common_cols + ['unhealthy']].copy()

# Correlate against the numeric score itself. One-hot encoding the target made
# `y` a 2-D frame; pearsonr expects two 1-D inputs of equal length, so each
# call would raise ValueError, be skipped silently, and leave `vals` empty.
y = ccdf['unhealthy']
X = ccdf[common_cols]

vals = []     # (variable, category value, r) where |r| > 0.2
skipped = []  # (variable, category value) pairs that pearsonr rejected

tn = tqdm_notebook(list(X.columns), desc='correlating')
for c in tn:
    # Skip 'raked' columns (NOTE(review): presumably survey weights -- confirm),
    # the inputs of the score, and every other 'ab*' column, since all of
    # those are related to the 'unhealthy' columns.
    if 'raked' in c or c in uhcols or c.startswith('ab'):
        continue

    dm = pd.get_dummies(X[c])
    if len(dm.columns) > 7:
        # Too many categories to correlate one by one.
        continue

    for dmc in dm.columns:
        try:
            # Cast the indicator to float so boolean dummies are accepted.
            pbr = stats.pearsonr(dm[dmc].astype(float), y)
        except ValueError:
            # Best effort: remember what was skipped instead of hiding it.
            skipped.append((c, dmc))
            continue
        if abs(pbr[0]) > .2:
            vals.append((c, dmc, pbr[0]))
        tn.set_description("{} {}".format(c, dmc))
In [ ]:
# Tabulate the retained correlations and attach each variable's description
# from the data dictionary, strongest positive correlation first.
cors = pd.DataFrame.from_records(vals, columns=['variable', 'value', 'corr'])
descriptions = dd.set_index('variable')[['description']]
jc = cors.set_index('variable').join(descriptions)
jc.sort_values('corr', ascending=False)
In [ ]:
In [ ]:
# Write the common columns to a CSV file in the current working directory.
# NOTE(review): the file name says 'demo', yet every common column is written
# and `demo_cols` (computed earlier) is never used -- confirm the intent.
ccdf.loc[:, common_cols].to_csv('chis1617-demo.csv')
In [ ]:
# IPython shell escape: print the notebook's current working directory.
!pwd