In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from publicdata.chis import *
%matplotlib inline
# Use seaborn's 'notebook' plotting context for every figure in this notebook.
sns.set_context('notebook')
# Short alias for pd.IndexSlice, used in the MultiIndex .loc selections below.
idx = pd.IndexSlice # Convenience redefinition.
In [2]:
# Open the Metapack source package for this notebook. The yearly CHIS
# dataframes are loaded from its references further down.
# Alternative kept for reference: open the built package instead of the source package.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare expression so the notebook displays the package.
pkg
Out[2]:
In [3]:
def recode(df):
    """Add simplified race, age and poverty columns to a CHIS dataframe.

    For a lot of health outcomes, the major divisions are

    * White + Asian  ( ``white_asian`` )
    * Black + Latino ( ``black_latino`` )
    * Others         ( ``other`` )

    The dataframe is modified in place and also returned. Columns added:

    * ``race_recode``   -- five groups: white, asian, black, latino, other.
    * ``race_recode_3`` -- three groups: white_asian, black_latino, other.
    * ``minority``      -- 1 unless ``race_recode_3`` is ``white_asian``.
    * ``old``           -- ordered categorical ('OLD' < 'YOUNG'); 'YOUNG' when
      ``srage_p1`` compares below '45-49 YEARS'.
    * ``poor``          -- ordered categorical ('NPOV' < 'POV'); 'NPOV' when
      ``povll`` is '200-299% FPL' or '300% FPL AND ABOVE'.

    :param df: dataframe with ``racedf_p1``, ``srage_p1`` and ``povll``
        columns. ``srage_p1`` must support ``<`` against '45-49 YEARS'
        (for a categorical column that means an ordered one).
    :return: the same dataframe object that was passed in.
    """
    from pandas.api.types import CategoricalDtype

    race_5 = {
        'NON-LATINO WHITE': 'white',
        'NON-LATINO ASIAN': 'asian',
        'NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE': 'other',
        'NON-LATINO AFR. AMER.': 'black',
        'LATINO': 'latino',
        'NON-LATINO, TWO+ RACES': 'other',
        'NON-LATINO OTHER, ONE RACE': 'other',
    }

    race_3 = {
        'NON-LATINO WHITE': 'white_asian',
        'NON-LATINO ASIAN': 'white_asian',
        'NON-LATINO AMERICAN INDIAN/ALASKAN NATIVE': 'other',
        'NON-LATINO AFR. AMER.': 'black_latino',
        'LATINO': 'black_latino',
        'NON-LATINO, TWO+ RACES': 'other',
        'NON-LATINO OTHER, ONE RACE': 'other',
    }

    def _recode_race(mapping):
        # Map plain objects rather than categories: several source labels
        # collapse onto one target ('other'), which a category rename cannot
        # express. Labels missing from the mapping pass through unchanged.
        raw = df['racedf_p1'].astype(object)
        return raw.map(lambda v: mapping.get(v, v)).astype('category')

    df['race_recode'] = _recode_race(race_5)

    # Build the 3-way grouping from its own mapping (it was previously
    # overwritten with a copy of the 5-way race_recode column).
    df['race_recode_3'] = _recode_race(race_3)

    # 'white_asian' only exists in the 3-way grouping, so test that column;
    # testing race_recode marked every row as minority.
    df['minority'] = (df['race_recode_3'] != 'white_asian').astype(int)

    # Assign the renamed categoricals back instead of renaming in place:
    # rename_categories() no longer accepts inplace=True in current pandas.
    young = df['srage_p1'] < '45-49 YEARS'
    df['old'] = (
        young
        .astype(CategoricalDtype(categories=[False, True], ordered=True))
        .cat.rename_categories(['OLD', 'YOUNG'])
    )

    not_poor = df['povll'].isin(('200-299% FPL', '300% FPL AND ABOVE'))
    df['poor'] = (
        not_poor
        .astype(CategoricalDtype(categories=[True, False], ordered=True))
        .cat.rename_categories(['NPOV', 'POV'])
    )

    return df
In [4]:
# Load one adult CHIS dataframe per survey year from the package references.
df17 = pkg.reference('adult_2017').dataframe()
df16 = pkg.reference('adult_2016').dataframe()
df15 = pkg.reference('adult_2015').dataframe()
df14 = pkg.reference('adult_2014').dataframe()
df13 = pkg.reference('adult_2013').dataframe()

# Rename some categories. 2016 and 2015 have "ALASKA" where the others have "ALASKAN", which
# causes the concat() operation to convert categories to strings.
# The map pairs the 2016 labels with the 2017 labels by position, so it relies on
# both years listing their race categories in the same order.
cats_17 = df17.racedf_p1.cat.categories
cat_map = dict(zip(df16.racedf_p1.cat.categories, cats_17))

for e in [df13, df14, df15, df16, df17]:
    # Assign the result back: rename_categories() no longer accepts inplace=True
    # in current pandas. Labels absent from cat_map are left as they are.
    e['racedf_p1'] = e['racedf_p1'].cat.rename_categories(cat_map)

# Tag each frame with its survey year and add the recoded columns.
for df, year in zip([df13, df14, df15, df16, df17], range(2013, 2018)):
    df['year'] = year
    recode(df)  # modifies df in place; the return value is the same object
In [5]:
# Pool two years of data. These are the only two that have ak22_p1,
# "HH TOTAL ANN. INC BEFORE TAXES IN 2014/2015 (PUF 1 YR RECODE)"
pooled_frames = (df17, df16)
df = pd.concat(
    [select_columns(year_df, ['diabetes', 'ak22_p1']) for year_df in pooled_frames]
).reset_index()

n_years = 2

# Each year's weights describe a full year, so the pooled weights are divided
# by the number of years pooled. Weight columns are those with 'raked' in the name.
weight_cols = [col for col in df.columns if 'raked' in col]
df.loc[:, weight_cols] /= n_years

len(df)
Out[5]:
In [6]:
# Estimate the diabetes measures for the pooled two-year sample.
t = chis_segment_estimate(df, 'diabetes')
# Keep only the pct, count and rse measures for the 'YES' answer.
t = t.loc[idx[('diabetes_pct','diabetes_count','diabetes_rse'),'YES'], #rows
idx[:] # columns
]
# Pivot the 'measure' index level into columns, then drop the outer column
# level left over from the unstack.
t = t.unstack('measure')
t.columns = t.columns.droplevel()
# NOTE(review): the 'raked' weight columns were already divided by n_years when
# the two years were pooled. If chis_segment_estimate derives diabetes_count
# from those weights, this divides by n_years a second time -- confirm.
t['diabetes_count'] /= n_years
t
Out[6]:
In [7]:
# Diabetes rates vs income.
# Segment the diabetes estimate by household income bracket (ak22_p1).
t = chis_segment_estimate(df, 'diabetes', 'ak22_p1')
# Select rows with a three-part indexer: every first-level value, the three
# measures of interest, and only the 'YES' answer. All columns are kept.
t = t.loc[idx[:,('diabetes_pct','diabetes_count','diabetes_rse'),'YES'], #rows
idx[:] # columns
]
# Pivot the 'measure' index level into columns.
t = t.unstack('measure')
# The columns are multi-level, but there is only one value for the first level,
# so it is useless.
t.columns = t.columns.droplevel()
# We already selected for the single value of 'YES', so this level is useless too
t.index = t.index.droplevel('diabetes')
def split_range(v, offset=5000):
    """Convert an income-bracket label into one sortable integer.

    * A label containing ``-`` (e.g. ``"10,001-20,000"``) maps to its lower
      bound.
    * Any other label (e.g. ``"<10,000"`` or ``"LESS THAN 10,000"``) maps to
      the number in its last whitespace-separated token minus ``offset``, so
      that it sorts apart from a range beginning at the same number.

    Thousands separators and other non-digit decoration (``$``, ``<``, ``>``)
    are ignored when reading a number.

    :param v: bracket label string.
    :param offset: amount subtracted for labels that are not ``low-high``
        ranges. Defaults to 5000.
    :return: integer position of the bracket.
    :raises ValueError: if the relevant token contains no digits.
    """
    def _to_int(token):
        # Keep only the digits so '10,000', '$10,000' and '<10,000' all parse.
        return int(''.join(ch for ch in token if ch.isdecimal()))

    if '-' in v:
        return _to_int(v.split('-')[0])

    return _to_int(v.split()[-1]) - offset
# Replace each income-bracket label in the index with its integer position.
t = t.rename(index=split_range)

# Plot the diabetes percentage in income order, using only rows whose
# diabetes_rse is at most 30.
reliable_rows = t.loc[t['diabetes_rse'] <= 30]
reliable_rows.sort_index()['diabetes_pct'].plot()
Out[7]:
In [8]:
from matplotlib import pylab
from scipy.optimize import curve_fit
# NOTE(review): `py` is not referenced in any visible cell, and plotly moved
# this module to the separate chart_studio package in plotly 4, so this import
# may fail on a current install -- confirm whether it is still needed.
import plotly.plotly as py
def exponenial_func(x, a, b, c):
    """Exponential model ``a * exp(-b * x) + c``.

    :param x: scalar or array of input values.
    :param a: multiplier applied to the exponential term.
    :param b: rate inside the exponent (applied as ``-b * x``).
    :param c: constant added to the result.
    :return: model value(s), with the same shape as ``x``.
    """
    decay = np.exp(-b * x)
    return a * decay + c
# Fit the exponential model to the diabetes percentage as a function of the
# income-bracket position.
t = t.sort_index().reset_index()
x = t.ak22_p1
y = t.diabetes_pct

# The initial guess uses a very small rate because x runs into the hundreds of
# thousands (see the 0..180000 evaluation grid below).
popt, pcov = curve_fit(exponenial_func, x, y, p0=(1, 1e-6, 1))

# Evaluate the fitted curve on a dense grid for a smooth line.
xx = np.linspace(0, 180000, 500)
yy = exponenial_func(xx, *popt)

# Explicit figure/axes instead of the implicit pyplot state, with labelled
# axes so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(x, y, 'o', xx, yy)
ax.set_title('Diabetes Rates Vs Income\nWith Exponential Fit')
ax.set_xlabel('Household annual income before taxes (bracket position)')
ax.set_ylabel('Diabetes rate (diabetes_pct)')
ax.set_facecolor((0.898, 0.898, 0.898))
In [9]:
# Fitted (a, b, c) parameters of exponenial_func, as returned by curve_fit.
popt
Out[9]:
In [10]:
# Square root of the covariance diagonal: one standard-deviation error per
# fitted parameter.
perr = np.sqrt(np.diag(pcov))
# Express each error as a fraction of its parameter value.
perr/popt
Out[10]:
In [11]:
# Pool three survey years, keeping the columns needed to segment diabetes by
# age group, poverty group and race.
n_years, df = chis_concat([df17,df16,df15], ['diabetes', 'race_recode', 'racedf_p1', 'poor', 'old'])
# Measures for the 'YES' diabetes answer within each old/poor/race_recode segment.
t = process_segments(df, 'diabetes', 'YES', ['old', 'poor','race_recode'])
# NOTE(review): whether chis_concat already rescales the weight columns by
# n_years is not visible here; if it does, this division is applied twice -- confirm.
t.diabetes_count /= n_years
t = t.reset_index().sort_values('diabetes_pct')
# Show only segments whose diabetes_rse is below 30.
t[t.diabetes_rse < 30]
Out[11]:
In [12]:
from sklearn.linear_model import LogisticRegression
# C is scikit-learn's inverse regularization strength, so a very large value
# leaves the fit close to unpenalized.
logreg = LogisticRegression(C=1e9)
# Pool three survey years with every column the model features are built from.
n_years, df = chis_concat([df17,df16,df15], ['diabetes', 'srsex', 'srage_p1', 'racedf_p1', 'race_recode', 'povll', 'minority', 'poor', 'old'])
def age_group_to_age(v):
    """Return a single representative age for an age-group label.

    A label of the form ``"<low>-<high> <word>"`` (e.g. ``"18-25 YEARS"``)
    yields the midpoint of ``low`` and ``high``. Any label that does not split
    into exactly three tokens, or whose bounds are not integers, yields 85
    (this is how ``"85+ YEARS"`` is handled).

    :param v: age-group label string.
    :return: midpoint as a float, or the integer 85 for the fallback.
    """
    tokens = v.replace('-', ' ').split()

    # Anything other than "<low> <high> <word>" is treated as the open-ended
    # top group, probably '85+ YEARS'.
    if len(tokens) != 3:
        return 85

    try:
        low, high = int(tokens[0]), int(tokens[1])
    except ValueError:
        return 85

    return (low + high) / 2
# Representative percent-of-FPL value for each poverty bracket: the midpoint
# of each closed bracket, with 400 standing in for the open-ended top bracket.
m = {
"0-99% FPL": 50,
"100-199% FPL": 150,
"200-299% FPL":250,
"300% FPL AND ABOVE": 400
}
# One indicator column per race_recode value, named by the bare value
# (empty prefix and separator).
df = pd.get_dummies(df, columns=['race_recode'], prefix='', prefix_sep = '')
# Numeric target and features derived from the categorical survey columns.
df['diabetes_bool'] = (df.diabetes == 'YES').astype(int)
df['avg_group_age'] = df.srage_p1.apply(age_group_to_age)
df['mid_povll'] = df.povll.apply(lambda v: m[v])
df['poor_bool'] = (df.poor == 'POV').astype(int)
df['is_male'] = (df.srsex == 'MALE').astype(int)
# Alternative feature sets that were tried are kept commented out around the
# one in use.
# NOTE(review): the chosen set includes an indicator for every race value, and
# exactly one of them is 1 per row. If the model also fits an intercept the
# race coefficients are not separately identifiable -- confirm this is intended.
#X = df[['avg_group_age', 'minority', 'mid_povll']]
X = df[['avg_group_age', 'mid_povll', 'is_male', 'asian','white','black','latino','other']]
#X = df[['avg_group_age', 'mid_povll']]
y = df['diabetes_bool']
logreg.fit(X, y)
# Column 1 of predict_proba is the probability of the positive class
# (diabetes_bool == 1).
df['lr_prob'] = logreg.predict_proba(X)[:, 1]
# Coefficients, then the sum of predicted probabilities next to two counts of
# observed 'YES' rows for comparison.
logreg.coef_, df['lr_prob'].sum(), df['diabetes_bool'].sum(), sum(df.diabetes == 'YES')
Out[12]:
In [13]:
# Races and the one-hot encoding for them.
# The encoding is derived from race_columns, so each race's 1 always lands in
# the column of the same name. The hand-written tuples listed white first
# while the columns list asian first, which swapped those two races.
race_columns = ['asian', 'white', 'black', 'latino', 'other']
race_order = ('white', 'asian', 'black', 'latino', 'other')  # row order of the grid

races = tuple(
    (race_name, tuple(int(col == race_name) for col in race_columns))
    for race_name in race_order
)

# One row for every combination of age, poverty level, sex and race.
l = [
    (age, pov, sex, race_name) + race_ohe
    for age in range(18, 85)
    for pov in range(0, 400, 10)
    for sex in (1, 0)
    for race_name, race_ohe in races
]

rasp = pd.DataFrame(l, columns=['age', 'pov', 'sex', 'race_name'] + race_columns)
In [14]:
# The first column ( [:,0]) is the probability of *not* having diabetes,
# so the second column is taken.
# 'prob' is excluded as well as 'race_name' so that re-running this cell does
# not feed the previous prediction back in as a feature.
cols = [c for c in rasp.columns if c not in ('race_name', 'prob')]

# The model was fitted on columns named avg_group_age, mid_povll and is_male.
# Recent scikit-learn rejects prediction input whose feature names differ from
# the fitted ones, so present the grid under the fitted names.
grid_features = rasp[cols].rename(
    columns={'age': 'avg_group_age', 'pov': 'mid_povll', 'sex': 'is_male'}
)

rasp['prob'] = logreg.predict_proba(grid_features)[:, 1]
In [16]:
# Inspect the pooled dataframe, which now carries the model's lr_prob column.
df
Out[16]:
Since the predicted probabilities range from 0 to 1, they should sum to the total number of diabetes cases in the dataset, and they almost do.
In [17]:
n_years, df = chis_concat([df17,df16,df15], ['ac11', 'diabetes', 'race_recode', 'poor', 'old'])

# Approximately equal count bins,
# also: pd.qcut(df.ac11,20, duplicates='drop').cat.categories
soda_bins = [-1, 0, 1, 2, 3, 4, 7, 10, 20, 30, 300]

# One zero-padded label per bin, named for the bin's upper edge:
# 000, 001, 002, 003, 004, 007, 010, 020, 030, 300.
soda_labels = ['{:03d}'.format(edge) for edge in soda_bins[1:]]

# Pass the labels straight to pd.cut instead of renaming the categories in
# place afterwards: set_categories() no longer accepts inplace=True in current
# pandas. The result is the same ordered categorical.
df['soda_recode'] = pd.cut(df.ac11, soda_bins, labels=soda_labels)

df.soda_recode.value_counts().sort_index()
Out[17]:
In [18]:
# Soda-consumption bins segmented by age group, with the measures pivoted
# into columns.
t = chis_segment_estimate(df, 'soda_recode', ['old']).unstack('measure')
t.columns = t.columns.droplevel(0)

# Percentage measure only, one column per age group.
t.loc[:, ['soda_recode_pct']].unstack('old')
Out[18]:
In [19]:
# Soda-consumption bins segmented by poverty group, with the measures pivoted
# into columns.
t = chis_segment_estimate(df, 'soda_recode', ['poor']).unstack('measure')
t.columns = t.columns.droplevel(0)

# Percentage measure only, one column per poverty group.
t.loc[:, ['soda_recode_pct']].unstack('poor')
Out[19]:
In [20]:
# Soda-consumption bins segmented by recoded race, with the measures pivoted
# into columns.
t = chis_segment_estimate(df, 'soda_recode', ['race_recode']).unstack('measure')
t.columns = t.columns.droplevel(0)

# Percentage measure only, one column per race group.
t.loc[:, ['soda_recode_pct']].unstack('race_recode')
Out[20]:
This correlation is probably a bad idea, since while drinking sugary drinks is a risk factor in developing diabetes, diabetics shouldn't drink sodas, so it isn't surprising that there is a higher percentage of diabetics among people who don't drink sodas, precisely the opposite connection you might expect.
Performing this correlation properly would require a longitudinal study, in which soda consumption in earlier years can be correlated with diabetes in later years.
In [21]:
# Diabetes measures segmented by soda-consumption bin.
t = chis_segment_estimate(df, 'diabetes', ['soda_recode'])
# Pivot the measures into columns and drop the leftover outer column level.
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
# Keep only the percentage, and only rows whose second index level is "YES".
t = t[['diabetes_pct']].loc[(slice(None),"YES"),:]
# Only 'YES' remains in the 'diabetes' level, so drop it.
t.index = t.index.droplevel('diabetes')
t.sort_index()
Out[21]:
In [22]:
# Unweighted row counts for each fslev value in the 2017 frame.
df17.fslev.value_counts()
Out[22]:
In [23]:
# Pool three survey years with the food-security level and the two segment columns.
n_years, df = chis_concat([df17,df16,df15], ['fslev', 'race_recode', 'poor',])
# fslev measures segmented by race and poverty group.
t = chis_segment_estimate(df, 'fslev', ['race_recode', 'poor'])
# Pivot the measures into columns and drop the leftover outer column level.
t = t.unstack('measure')
t.columns = t.columns.droplevel(0)
# Keep the full table under a second name; the next cell filters it further.
t1 = t
# Percentage only, with the poverty groups spread across columns.
t = t[['fslev_pct']].unstack(['poor'])
# Show just the two food-insecure levels.
t.loc[(slice(None),('FOOD INSECURITY W/O HUNGER','FOOD INSECURITY W/ HUNGER')),:]
Out[23]:
In [24]:
# Restrict the full table to the 'POV' group and the two food-insecure levels.
# NOTE(review): this selects 'POV' in the FIRST index level, which presumes the
# index is ordered (poor, race_recode, fslev) -- confirm against the level
# order chis_segment_estimate actually returns.
t1 = t1.loc[('POV',slice(None),('FOOD INSECURITY W/O HUNGER','FOOD INSECURITY W/ HUNGER')),:]
# Percentage only, one column per race group.
t1[['fslev_pct']].unstack('race_recode')
Out[24]:
In [25]:
# Number of rows in the pooled three-year frame used for the food-security tables.
len(df)
Out[25]:
In [ ]: