In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from publicdata.chis.prepare import *

%matplotlib inline
sns.set_context('notebook')

In [2]:
pkg = mp.jupyter.open_package()
#pkg = mp.jupyter.open_source_package()
pkg


Out[2]:

CHIS California Health Interview Survey, Adults

healthpolicy.ucla.edu-chis-adult-1 Last Update: 2018-11-21T20:44:48

Documentation and Reference Links to CHIS files.

CHIS Data packages

Using these file requires accepting the terms and restrictions provided by the UCLA Center for Health Policy Research. These terms are available online, and reproduced here:

Restrictions on the Use of California Health Interview Survey Data Before you
download this file, you must first agree to these Restrictions on the Use of
CHIS Data by clicking the button below.

The California Health Interview Survey (CHIS) is bound by promises made to
respondents, by California law, and by University and government human subject
protection committees to assure that no personal information is released in a
form that identifies an individual without the consent of the person who
supplied the information. The California Information Practices Act (section
1798.24) provides that the data collected by CHIS may be released only for
statistical research and reporting purposes. Any intentional identification or
disclosure of personal information violates this law, and violates the privacy
rights of the people who provided data to CHIS. Unauthorized disclosure of
personal information is subject to civil action and penalties for invasion of
privacy under California Civil Code, Section 1798.53.

Documentation Links

Contacts

References


In [3]:
columns = pkg.reference('adult_2017').row_generator.columns

def find_var(*vals):

    
    result = []
    
    for c in columns:
        for v in vals:
            if v in c['description'].lower() or v in c['name'].lower():
                result.append((c['name'], c['description']))
            
    return result
    

diabet_vars = find_var('diabet')+find_var('general health')

food_vars = find_var('eat')+find_var('food')+find_var('hungry')+\
    find_var('drink')+find_var('soda')+find_var('bmi')+find_var('obe')+find_var('fresh')+\
    find_var('veg')+find_var('fruit')+find_var('fries')+find_var('beans')

demo_vars = find_var('gender')+find_var('hisp')+find_var('race')+find_var('income')+find_var('pov')+find_var('urban')

subset_vars = list(set(diabet_vars+food_vars+demo_vars))
subset_vars


Out[3]:
[('ae5', '# TIMES ATE COOKED DRIED BEANS IN PAST MONTH'),
 ('dmc9', 'HOW LONG AGO TREATED UNFAIRLY W/ MEDICAL CARE DUE TO RACE/ETH'),
 ('dmc6b_p1', 'MAIN RSN TREATED UNFAIRLY GET MED. CARE (PUF 1 YR RECODE)'),
 ('whobmi', 'BODY MASS INDEX: WHO DEFINITION'),
 ('ur_tract6', 'RURAL AND URBAN - CLARITAS (BY CENSUS TRACT) (6 LVLS)'),
 ('am5', "HOW OFTEN HUNGRY BUT DIDN'T EAT B/C OF MONEY IN PAST 12 MOS"),
 ('ae3', '# TIMES ATE FRNCH FRIES, HME FRIES, HSH BRWNS IN PAST MO'),
 ('ae_soda', '# OF TIMES DRINKING SODA PER WEEK'),
 ('ovrwt', 'OVERWEIGHT OR OBESE'),
 ('srsex', 'SELF-REPORTED GENDER'),
 ('ab25', 'CURRENTLY TAKING DIABETIC PILLS TO LOWER BLOOD SUGAR'),
 ('povgwd_p1', 'FAMILY POVERTY THRESHOLD LEVEL (PUF 1 YR RECODE)'),
 ('ur_omb', 'RURAL AND URBAN - OMB'),
 ('ur_bg6', 'RURAL AND URBAN - CLARITAS (BY BLOCK GROUP)(6 LVLS)'),
 ('al6', 'RECEIVING SSI (SUPPLEMENTAL SECURITY INCOME)'),
 ('srh', 'SELF-REPORTED LATINO/HISPANIC'),
 ('ab22', 'DOCTOR EVER TOLD HAVE DIABETES'),
 ('povll_aca',
  'FAMILY POVERTY THRESHOLD LEVEL: ACA MAGI ELIGIBILITY (4 LVLS)'),
 ('ab51_p1', 'TYPE 1 OR TYPE 2 DIABETES (PUF 1 YR RECODE)'),
 ('ur_rhp', 'RURAL AND URBAN - RHP'),
 ('ac44', 'NEIGHBORHOOD FRUIT/VEG AFFORDABLE'),
 ('rbmi', 'BMI DESCRIPTIVE'),
 ('prediab',
  'DOCTOR EVER TOLD HAVE PRE OR BORDERLINE DIABETES (NON-GESTATIONAL)'),
 ('ac42_p', 'HOW OFTEN FIND FRESH FRUIT/VEG IN NEIGHB (PUF RECODE)'),
 ('ae7', '# OF TIMES ATE VEGETABLES IN PAST MO'),
 ('ak32', 'HH INCOME SUPPORTS SOMEONE LIVING IN U.S. BUT NOT LIVING IN HH'),
 ('am2', "HOW OFTEN COULDN'T AFFORD TO EAT BALANCED MEALS"),
 ('povll', 'POVERTY LEVEL'),
 ('ab112', 'MEDICAL PROVIDERS DEVELOP DIABETES CARE PLAN'),
 ('ab23_p1', 'AGE FIRST TOLD HAVE DIABETES (PUF 1 YR RECODE)'),
 ('ab114_p1', 'CONFIDENCE TO CONTROL AND MANAGE DIABETES (PUF 1 YR RECODE)'),
 ('ab99', 'DOC EVER TOLD HAVE PRE- OR BORDERLINE DIABETES'),
 ('dmc8', 'WOULD HAVE GOTTEN BETTER MEDICAL CARE IF DIFF. RACE/ETHNICITY'),
 ('elder_idx',
  'ELDERLY SINGLES/COUPLE INCOME BELOW CNTY COST OF LIVING THRESHOLDS'),
 ('ur_ihs', 'RURAL AND URBAN - IHS'),
 ('ab1', 'GENERAL HEALTH CONDITION'),
 ('latin2tp', 'LATIN/HISPANIC SUBTYPES - 2 LVLS'),
 ('famsize2_p1',
  "FAMILY SIZE: INCL. ALL SUPT'D BY HH INCOME (PUF 1 YR RECODE)"),
 ('diabetes', 'DOCTOR EVER TOLD HAVE DIABETES (NON-GESTATIONAL)'),
 ('racehp2_p1', 'RACE - UCLA CHPR DEFINITION, UNABRIDGED (PUF 1 YR RECODE)'),
 ('bmi_p', 'BODY MASS INDEX (PUF RECODE)'),
 ('al5', 'RECEIVING FOOD STAMP BENEFITS'),
 ('povll2_p1v2', 'POVERTY LEVEL AS TIMES OF 100% FPL (PUF RECODE V2)'),
 ('ac46', '# OF TIMES DRANK SWEET FRUIT DRINKS PAST MO'),
 ('fpg', 'FEDERAL POVERTY GUIDELINE'),
 ('ur_clrt6', 'RURAL AND URBAN - CLARITAS (BY ZIPCODE) (6 LVLS)'),
 ('aj1', 'INS COVERS TREATMENT FOR MNTL HEALTH PROBLEMS'),
 ('dmc3',
  'HOW OFTEN TREATED UNFAIRLY WHEN GETTING MEDICAL CARE OVER LIFETIME'),
 ('ac11', '# OF TIMES DRANK SODA LAST MONTH'),
 ('sro', 'SELF-REPORTED OTHER RACE'),
 ('ombsrr_p1', 'OMB/CURRENT DOF RACE - ETHNICITY (PUF 1 YR RECODE)'),
 ('ac42', 'HOW OFTEN FIND FRESH FRUIT/VEG IN NEIGHB'),
 ('am1', "HOW OFTEN FOOD DIDN'T LAST,COULDN'T AFFORD MORE, PAST 12 MOS"),
 ('ab81', 'DOCTOR TOLD HAD DIABETES ONLY DURING PREGNANCY'),
 ('racedf_p1', 'FORMER DOF RACE - ETHNICITY (PUF 1 YR RECODE)'),
 ('ur_clrt2', 'RURAL AND URBAN - CLARITAS (BY ZIPCODE) (2 LVLS)'),
 ('aesoda_p1', '# OF TIMES DRINKING SODA PER WEEK (PUF 1 YR RECODE)'),
 ('ae2', '# TIMES ATE FRUIT IN PAST MO'),
 ('racecn_p1', 'RACE - CENSUS 2000 DEFINITION (PUF 1 YR RECODE)'),
 ('fslevcb', 'FOOD SECURITY STATUS (2 LVLS)'),
 ('ak33_p1',
  '# OF PERSONS SUPPORTED BY HH INCOME NOT IN HH (PUF 1 YR RECODE)'),
 ('fslev', 'FOOD SECURITY STATUS LEVEL')]

In [4]:
df = pkg.reference('adult_2017').dataframe()

In [41]:
dfr = df[[c for c in df.columns if 'raked' in c]].copy()
dfv = df[[c for c in df.columns if 'raked' not in c]].copy()

dfv = convert_to_numbers(dfv)

In [5]:
dfs = df[[e[0] for e in subset_vars]]

In [6]:
d = pd.get_dummies(dfs)

In [7]:
corrs = d.corr()

In [8]:
len(corrs)


Out[8]:
2986

In [9]:
t = corrs.stack().to_frame()
t.columns= ['correl']
top_cor = t[t.correl<1.0].sort_values('correl',ascending=False)

In [10]:
t = top_cor.reset_index()
t2 = t[ (~t.level_0.str.contains('INAPPLICABLE')) & (~t.level_1.str.contains('INAPPLICABLE'))]

In [11]:
t2[(t2.correl < .8) & (t2.correl > .7)]


Out[11]:
level_0 level_1 correl
210 ak32_YES ak33_p1_1.0 0.799663
211 ak33_p1_1.0 ak32_YES 0.799663
212 ur_tract6_SUBURBAN ur_bg6_SUBURBAN 0.787830
213 ur_bg6_SUBURBAN ur_tract6_SUBURBAN 0.787830
214 ab25_YES ab112_YES 0.785694
215 ab112_YES ab25_YES 0.785694
216 ae_soda_2.0 aesoda_p1_2-3 TIMES 0.777431
217 aesoda_p1_2-3 TIMES ae_soda_2.0 0.777431
218 fslevcb_FOOD SECURITY am5_NO 0.777065
219 fslev_FOOD SECURITY am5_NO 0.777065
220 am5_NO fslev_FOOD SECURITY 0.777065
221 am5_NO fslevcb_FOOD SECURITY 0.777065
222 ur_tract6_2ND CITY ur_bg6_2ND CITY 0.775767
223 ur_bg6_2ND CITY ur_tract6_2ND CITY 0.775767
224 ab22_YES ab114_p1_VERY CONFIDENT 0.766429
225 ab114_p1_VERY CONFIDENT ab22_YES 0.766429
226 ab114_p1_VERY CONFIDENT diabetes_YES 0.763033
227 diabetes_YES ab114_p1_VERY CONFIDENT 0.763033
228 ac42_p_DOESN'T EAT/SHOP FOR FRUITS AND VEGETABLES ac42_DOESN'T SHOP IN HIS/HER NEIGHBORHOOD 0.761934
229 ac42_DOESN'T SHOP IN HIS/HER NEIGHBORHOOD ac42_p_DOESN'T EAT/SHOP FOR FRUITS AND VEGETABLES 0.761934
230 al6_NO al5_NO 0.759633
231 al5_NO al6_NO 0.759633
238 ac11_0.0 aesoda_p1_0 TIMES 0.757131
239 aesoda_p1_0 TIMES ac11_0.0 0.757131
240 ae_soda_0.0 ac11_0.0 0.757131
241 ac11_0.0 ae_soda_0.0 0.757131
254 am5_NO am1_NEVER TRUE 0.748029
255 am1_NEVER TRUE am5_NO 0.748029
256 racedf_p1_NON-LATINO WHITE racecn_p1_WHITE 0.747395
257 racecn_p1_WHITE racedf_p1_NON-LATINO WHITE 0.747395
... ... ... ...
294 ac11_90.0 aesoda_p1_21+ TIMES 0.723031
295 ae_soda_21.0 aesoda_p1_21+ TIMES 0.723031
296 aesoda_p1_21+ TIMES ac11_90.0 0.723031
297 aesoda_p1_21+ TIMES ae_soda_21.0 0.723031
298 aesoda_p1_4-6 TIMES ae_soda_5.0 0.722613
299 ae_soda_5.0 aesoda_p1_4-6 TIMES 0.722613
300 ur_clrt2_RURAL ur_clrt6_TOWN 0.720050
301 ur_clrt6_TOWN ur_clrt2_RURAL 0.720050
306 povll_aca_400%+ FPL povll_300% FPL AND ABOVE 0.711906
307 povll_300% FPL AND ABOVE povll_aca_400%+ FPL 0.711906
308 povll2_p1v2_5.0 povll_aca_400%+ FPL 0.707901
309 povll_aca_400%+ FPL povll2_p1v2_5.0 0.707901
310 ab51_p1_TYPE 2 ab114_p1_VERY CONFIDENT 0.707832
311 ab114_p1_VERY CONFIDENT ab51_p1_TYPE 2 0.707832
312 bmi_p_32.4 ae3_29.0 0.707090
313 ae3_29.0 bmi_p_32.4 0.707090
314 povll2_p1v2_3.15 ae5_140.0 0.707090
315 ae5_140.0 povll2_p1v2_3.15 0.707090
316 ae7_110.0 bmi_p_40.98 0.707090
317 bmi_p_40.98 ae7_110.0 0.707090
318 bmi_p_37.79 ae2_129.0 0.707090
319 ae2_129.0 bmi_p_37.79 0.707090
320 ae7_42.0 bmi_p_31.97 0.707090
321 bmi_p_31.97 ae7_42.0 0.707090
322 bmi_p_27.26 povll2_p1v2_4.44 0.707090
323 povll2_p1v2_4.44 bmi_p_27.26 0.707090
324 srh_NO ombsrr_p1_WHITE, NON-HISPANIC (NH) 0.701240
325 ombsrr_p1_WHITE, NON-HISPANIC (NH) latin2tp_NON-LATINO 0.701240
326 ombsrr_p1_WHITE, NON-HISPANIC (NH) srh_NO 0.701240
327 latin2tp_NON-LATINO ombsrr_p1_WHITE, NON-HISPANIC (NH) 0.701240

84 rows × 3 columns


In [12]:
find_var('bmi','prepovll', 'ae7')


Out[12]:
[('ae7', '# OF TIMES ATE VEGETABLES IN PAST MO'),
 ('whobmi', 'BODY MASS INDEX: WHO DEFINITION'),
 ('rbmi', 'BMI DESCRIPTIVE'),
 ('bmi_p', 'BODY MASS INDEX (PUF RECODE)')]

In [13]:
x = pd.DataFrame({ 'ae7' : dfs.ae7.astype(float),
                    'bmi' : dfs.bmi_p.astype(float)})
x.corr()


Out[13]:
ae7 bmi
ae7 1.000000 -0.075463
bmi -0.075463 1.000000

In [33]:
dfn[['ae7','bmi_p','ac11','povll2_p1v2']].corr()


Out[33]:
ae7 bmi_p ac11 povll2_p1v2
ae7 1.000000 -0.075463 -0.078796 0.138781
bmi_p -0.075463 1.000000 0.047354 -0.109280
ac11 -0.078796 0.047354 1.000000 -0.144427
povll2_p1v2 0.138781 -0.109280 -0.144427 1.000000

In [49]:
t = dfv.select_dtypes('float64').corr().stack().to_frame().sort_values(0, ascending = False)
t[t[0]< .3]


Out[49]:
0
ac46 ac11 0.185315
ac11 ac46 0.185315
ac46 ae_soda 0.185200
ae_soda ac46 0.185200
povll2_p1v2 heighm_p 0.183933
hghtm_p povll2_p1v2 0.183933
povll2_p1v2 hghtm_p 0.183933
heighm_p povll2_p1v2 0.183933
povll2_p1v2 hghti_p 0.183496
hghti_p povll2_p1v2 0.183496
ac11 ae3 0.176760
ae3 ac11 0.176760
ae_soda ae3 0.175720
ae3 ae_soda 0.175720
povgwd_p1 heighm_p 0.163272
heighm_p povgwd_p1 0.163272
povgwd_p1 hghtm_p 0.163272
hghtm_p povgwd_p1 0.163272
povgwd_p1 hghti_p 0.162888
hghti_p povgwd_p1 0.162888
ae7 povll2_p1v2 0.138781
povll2_p1v2 ae7 0.138781
ins12m 0.128526
ins12m povll2_p1v2 0.128526
povgwd_p1 ae7 0.128301
ae7 povgwd_p1 0.128301
ac46 ae3 0.122398
ae3 ac46 0.122398
povgwd_p1 ins12m 0.120377
ins12m povgwd_p1 0.120377
... ... ...
povgwd_p1 ac46 -0.086259
ac46 povgwd_p1 -0.086259
povgwd_p1 hhsize_p1 -0.086431
hhsize_p1 povgwd_p1 -0.086431
bmi_p heighm_p -0.090016
hghtm_p -0.090016
hghtm_p bmi_p -0.090016
heighm_p bmi_p -0.090016
hghti_p bmi_p -0.091543
bmi_p hghti_p -0.091543
ae5 povgwd_p1 -0.093948
povgwd_p1 ae5 -0.093948
ac46 povll2_p1v2 -0.099174
povll2_p1v2 ac46 -0.099174
hhsize_p1 ins12m -0.108809
ins12m hhsize_p1 -0.108809
povll2_p1v2 bmi_p -0.109280
bmi_p povll2_p1v2 -0.109280
povll2_p1v2 ae5 -0.118321
ae5 povll2_p1v2 -0.118321
povgwd_p1 ae_soda -0.120602
ae_soda povgwd_p1 -0.120602
ac11 povgwd_p1 -0.121153
povgwd_p1 ac11 -0.121153
povll2_p1v2 hhsize_p1 -0.142684
hhsize_p1 povll2_p1v2 -0.142684
povll2_p1v2 ac11 -0.144427
ac11 povll2_p1v2 -0.144427
povll2_p1v2 ae_soda -0.144619
ae_soda povll2_p1v2 -0.144619

336 rows × 1 columns


In [53]:
(dfv.ab1.value_counts()/21153*100).round(2)


Out[53]:
VERY GOOD    32.27
GOOD         30.18
EXCELLENT    17.38
FAIR         15.09
POOR          5.07
Name: ab1, dtype: float64

In [ ]: