In [1]:
""" This notebook describes exploratory analyses of 2013 US vital
statistics data. These data are open access and publicly available.
Written by Laura Nolan
Created on 8/1/2015
Updated on 8/24/2015
"""
In [ ]:
import pandas as pd
import numpy as np
import psycopg2
import json
import re
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import itertools
import statsmodels.api as sm
import matplotlib as mpl
# Agg is a non-interactive (file-only) backend; the %matplotlib inline magic
# below overrides it inside the notebook, so this mainly matters if the code
# is ever run headless as a script.
mpl.use('Agg')
import seaborn as sns
import math
%matplotlib inline
In [2]:
# Database credentials live in a local JSON file (kept out of the notebook);
# the file must contain a JSON object of psycopg2.connect() keyword args.
params = json.load(open('psql_psycopg2.password'))
try:
    conn = psycopg2.connect(**params)
    conn.autocommit = True  # each statement commits immediately
    cur = conn.cursor()
except psycopg2.Error as e:
    # Catch only database errors -- a bare `except:` would also swallow
    # typos, KeyboardInterrupt, etc. Re-raise because everything below
    # needs `conn`; silently continuing would just fail later with a
    # confusing NameError/attribute error far from the real cause.
    print('Unable to connect to database: {}'.format(e))
    raise
import sys
# NOTE(review): hardcoded absolute path -- consider a config variable.
sys.path.insert(0, '/home/lnolan/git_dir/babies')
from babysaver import features
from babysaver import models
from babysaver import evaluation
In [3]:
# Candidate predictors, restricted to fields that are (more or less) also
# present on the 707G screening form.
vital_features = [
    # risk factors
    'rf_diab_rec_c', 'rf_gest_rec_c', 'rf_phyp_rec_c', 'rf_ghyp_rec_c',
    'rf_eclam_rec_c', 'rf_ppoutc_rec_c', 'rf_ppterm_rec_c',
    # plurality, infections, payer
    'dplural_c', 'sexdis_rec_f', 'pay_rec_c',
    'ip_gono_rec_c', 'ip_syph_rec_c', 'ip_chlam_rec_c',
    'ip_hepatb_rec_c', 'ip_hepatc_rec_c',
    # education, smoking by trimester, payer recode, pre-pregnancy weight
    'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n', 'cig_2_rec_n',
    'cig_3_rec_n', 'pay_rec_rec_c', 'pwgt_r_rec_n',
]
vital_outcome = "advb_otc"   # adverse birth outcome indicator
vital_table = "vitals_all"   # source table with all vital records
In [6]:
# Pull up to 100k rows of the selected features + outcome from Postgres via
# the project helper; returns a dict (keys used below include 'dataframe'
# and 'features'). create_ref=True presumably also builds a reference
# table -- TODO confirm against babysaver.features.general_data_getter.
df = features.general_data_getter(vital_features, vital_outcome, vital_table, n_rows=100000, conn = conn, create_ref=True)
In [41]:
# Peek at the first rows of the fetched data (rich display, last expression).
df['dataframe'].head()
Out[41]:
In [7]:
# Base rate of the adverse-birth outcome in the sample.
# Parenthesized print is a valid Python 2 statement AND a Python 3 call,
# so this cell survives a kernel upgrade.
print(df['dataframe']['advb_otc'].mean())
In [43]:
# Show the feature list actually used (after any recoding by the getter).
# Parenthesized print: valid in both Python 2 and Python 3.
print(df['features'])
In [44]:
# Display the docstring/signature of the project's model-fitting helper.
help(models.machine_learner)
In [8]:
# Hyperparameter grid for the logistic-regression search.
classifiers = {
    'clf': LogisticRegression,
    'param_dict': {
        'C': [0.001, 0.01, 1, 10],   # inverse regularization strength
        'penalty': ["l1", "l2"],     # lasso vs. ridge penalty
    },
}
# Fit with k-fold cross-validation, scoring precision in the top 30%.
models_dict, pkl_dct = models.machine_learner(df, classifiers, "kfold_cv", k=[0.3])
In [9]:
evaluation_df = evaluation.dict_to_dataframe(models_dict, pkl_dct)
# DataFrame.sort() was deprecated and later removed from pandas; use
# sort_values() with an explicit `by=` column instead. Shows pickle files
# ordered best-precision-first.
pd.DataFrame(evaluation_df.sort_values(by='precision at 0.3 mean', ascending=False)['pickle_file'])
Out[9]:
In [10]:
# Full evaluation table, best precision-at-30% first.
# sort_values replaces the removed DataFrame.sort API.
pd.DataFrame(evaluation_df.sort_values(by='precision at 0.3 mean', ascending=False))
Out[10]:
In [74]:
# Feature names in model column order (aligns with coefficient vectors below).
df['features']
Out[74]:
In [75]:
# Load a pickled fitted model and pair each coefficient with its feature name.
# NOTE(review): sklearn.externals.joblib was deprecated (removed in
# scikit-learn 0.23) -- in newer environments use `import joblib` directly.
from sklearn.externals import joblib
lr = joblib.load('pickles/LogisticRegression1.pkl')
# lr.coef_[0] is the coefficient vector for the single binary outcome.
pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]})
Out[75]:
In [76]:
# Same coefficient table, ordered smallest to largest coefficient.
# sort_values replaces the removed DataFrame.sort API.
pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]}).sort_values(by='coef')
Out[76]:
In [77]:
# Variables corresponding (approximately) to the 707G form fields plus the
# outcome; note these use the *_c_rec naming of the heatmap table below.
vars707 = [
    'rf_diab_c_rec', 'rf_gest_c_rec', 'rf_phyp_c_rec', 'rf_ghyp_c_rec',
    'rf_eclam_c_rec', 'rf_ppoutc_c_rec', 'rf_ppterm_c_rec', 'dplural_c',
    'sexdis_rec_f', 'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n',
    'cig_2_rec_n', 'cig_3_rec_n', 'pay_rec_rec_c', 'advb_otc',
]
# Empty (all-NaN) square frame to be filled in as a pairwise matrix of
# odds ratios.
ORs = pd.DataFrame(data=None, index=vars707, columns=vars707)
ORs.head()
Out[77]:
In [ ]:
# Pull a 100k random-order sample of the heatmap variables from Postgres.
# The SQL string is assembled from adjacent literals: the second fragment
# has no trailing `+`, but Python concatenates adjacent string literals
# implicitly, so the query is still one valid statement.
for_heatmap = pd.read_sql('SELECT rf_diab_c_rec, rf_gest_c_rec, rf_phyp_c_rec, rf_ghyp_c_rec,' +
'rf_eclam_c_rec, rf_ppoutc_c_rec, rf_ppterm_c_rec, dplural_c,'
'sexdis_rec_f, meduc_rec_n, cig_0_rec_n, cig_1_rec_n,' +
'cig_2_rec_n, cig_3_rec_n, pay_rec_rec_c, pwgt_r_rec_n, advb_otc FROM forheatmap ORDER BY RANDOM() LIMIT 100000', conn)
In [79]:
# Inspect each column's dtype -- non-float columns will need recasting
# before the numeric work below.
[(for_heatmap[i].dtype,i) for i in for_heatmap.columns.values]
Out[79]:
In [80]:
# Recast the two non-float columns so every column is numeric float.
for_heatmap[['dplural_c', 'pay_rec_rec_c']] = for_heatmap[['dplural_c', 'pay_rec_rec_c']].astype(float)
In [81]:
# Distribution check (including NaN counts) for each variable of interest.
# The original copy-pasted print block accidentally printed rf_gest_c_rec
# twice; looping over an explicit column list removes the duplication and
# the repetition. Parenthesized print works in both Python 2 and 3.
cols_to_check = [
    'rf_diab_c_rec',
    'rf_gest_c_rec',
    'rf_phyp_c_rec',
    'rf_ghyp_c_rec',
    'rf_eclam_c_rec',
    'rf_ppoutc_c_rec',
    'rf_ppterm_c_rec',
    'dplural_c',      # NEEDS TO BE RECODED TO >1
    'sexdis_rec_f',
    'meduc_rec_n',    # NEEDS TO BE RECODED TO (probably) more than high school, etc.
    'cig_0_rec_n',
]
for col in cols_to_check:
    print(col)
    print(for_heatmap[col].value_counts(dropna=False))
In [82]:
# Checking how many missing values this often-missing variable has.
for_heatmap['rf_diab_c_rec'].value_counts(dropna = False)
Out[82]:
In [83]:
# Drop rows with any missing value (complete-case analysis) into a new,
# clearly named frame; the raw for_heatmap stays intact.
# The original cell also called for_heatmap.head() mid-cell, which renders
# nothing (only a cell's last expression is displayed) -- removed as dead code.
for_heatmap_complete = for_heatmap.dropna()
for_heatmap_complete.head()
Out[83]:
In [84]:
# Sample size remaining after dropping incomplete rows.
len(for_heatmap_complete) # quickly checking sample size
Out[84]:
In [ ]:
def babynum(f):
    """Dummy-code plurality: 1 for a multiple birth (f > 1), else 0."""
    # Conditional expression replaces the four-line if/else; the flattened
    # body indentation from the notebook export is also restored.
    return 1 if f > 1 else 0
# Dummy for multiple births; reuse babynum() instead of re-implementing the
# same >1 threshold inline (the function was otherwise defined but unused).
dplural_c_dum = [babynum(int(x)) for x in for_heatmap_complete['dplural_c']]
In [86]:
# Confirm the element type of dplural_c after the float cast above.
type(for_heatmap_complete['dplural_c'][0]) # checking type...
Out[86]:
In [ ]:
def meduc_rec(f):
    """Dummy-code maternal education: 1 if the education code is below 3,
    else 0 (presumably "less than some schooling level" -- TODO confirm
    the meduc_rec_n coding)."""
    # Conditional expression replaces the four-line if/else; flattened
    # export indentation restored.
    return 1 if f < 3 else 0
# Education dummy for each complete case.
meduc_rec_n_dum = [meduc_rec(x) for x in for_heatmap_complete['meduc_rec_n']]
In [ ]:
def cigs(f):
    """Dummy-code smoking: 1 for any cigarettes (f > 0), else 0."""
    # Conditional expression replaces the four-line if/else; flattened
    # export indentation restored.
    return 1 if f > 0 else 0
# Any-smoking dummies: before pregnancy (0) and in each trimester (1-3).
cig_0_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_0_rec_n']]
cig_1_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_1_rec_n']]
cig_2_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_2_rec_n']]
cig_3_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_3_rec_n']]
In [ ]:
def pay(f):
    """Dummy-code payer: 1 if the payment code equals 1 (Medicaid, per the
    'medicaid_payer' column name it feeds below), else 0."""
    # Conditional expression replaces the four-line if/else; flattened
    # export indentation restored.
    return 1 if f == 1 else 0
# Medicaid-payer dummy per complete case.
pay_rec_rec_c_dum = [pay(x) for x in for_heatmap_complete['pay_rec_rec_c']]
In [90]:
# The recoded variables are plain Python lists (not Series) at this point.
type(cig_3_rec_n_dum) # the recoded vars are in list format
Out[90]:
In [91]:
# Assemble the recoded dummies (plain lists) and the already-binary columns
# (Series from for_heatmap_complete) into one frame with readable names.
# Mixing lists and Series works here because the lists were built by
# iterating the same frame in row order, so positional assignment lines up
# with the Series' shared index.
# NOTE(review): `test` is a vague name for an analysis frame -- kept because
# later cells reference it.
test = pd.DataFrame({'cigs_before_preg': cig_0_rec_n_dum,
'cigs_in_first_tri': cig_1_rec_n_dum,
'cigs_in_second_tri': cig_2_rec_n_dum,
'cigs_in_third_tri': cig_3_rec_n_dum,
'medicaid_payer': pay_rec_rec_c_dum,
'diabetes': for_heatmap_complete['rf_diab_c_rec'],
'gestational_diab': for_heatmap_complete['rf_gest_c_rec'],
'prepreg_hyp': for_heatmap_complete['rf_phyp_c_rec'],
'gest_hyp': for_heatmap_complete['rf_ghyp_c_rec'],
'ecclampsia': for_heatmap_complete['rf_eclam_c_rec'],
'poor_prev_preg_out': for_heatmap_complete['rf_ppoutc_c_rec'],
'previous_preterm': for_heatmap_complete['rf_ppterm_c_rec'],
'multi_fetus': dplural_c_dum,
'sexually_trans_dis': for_heatmap_complete['sexdis_rec_f'],
'advb_otc': for_heatmap_complete['advb_otc']})
test.head()
Out[91]:
In [92]:
# Readable variable names for the heatmap axes.
varsformap = [
    'cigs_before_preg', 'cigs_in_first_tri', 'cigs_in_second_tri',
    'cigs_in_third_tri', 'medicaid_payer', 'diabetes', 'gestational_diab',
    'prepreg_hyp', 'gest_hyp', 'ecclampsia', 'poor_prev_preg_out',
    'previous_preterm', 'multi_fetus', 'sexually_trans_dis', 'advb_otc',
]
# Every unordered pair of variables (15 choose 2 = 105 pairs).
forints = list(itertools.combinations(varsformap, 2))
# All-NaN square shell to hold the pairwise phi correlations.
phis = pd.DataFrame(data=None, index=varsformap, columns=varsformap)
phis.head()
Out[92]:
In [ ]:
import math
def phi(thingy):
    """Phi coefficient for a 2x2 contingency table with margins.

    `thingy` is the 3x3 ndarray produced by pd.crosstab(..., margins=True):
    rows/cols 0 and 1 hold the cell counts, row/col 2 ("All") hold the
    margins. phi = (n11*n00 - n10*n01) / sqrt(c1 * c0 * r0 * r1).
    Raises ZeroDivisionError if any margin is zero (degenerate table).
    """
    # Tuple indexing used throughout -- the original mixed thingy[1][0]
    # chained indexing with thingy[i, j], which was inconsistent.
    num = (thingy[1, 1] * thingy[0, 0]) - (thingy[1, 0] * thingy[0, 1])
    denom = math.sqrt(thingy[2, 1] * thingy[2, 0] * thingy[0, 2] * thingy[1, 2])
    return num / denom
In [94]:
# Fill the lower triangle of `phis` with the phi correlation of each pair.
# (The original loop also built an unused two-column slice `mini` every
# iteration -- removed.)
for comb in forints:
    # 2x2 contingency table with margins -> 3x3 ndarray for phi()
    thingy = np.array(pd.crosstab(test[comb[0]], test[comb[1]], margins=True))
    phis.loc[comb[1], comb[0]] = phi(thingy)
# NOTE(review): this fillna(0) runs AFTER the correlations were computed,
# so it cannot have affected them -- confirm the ordering is intentional.
test = test.fillna(value=0)
print(phis.head())
In [95]:
# another option is to compute the odds ratios from bivariate logistic regression models
# for comb in forints:
# print comb[0], comb[1]
# logit = sm.Logit(test[comb[0]], test[comb[1]])
# result = logit.fit()
# OR = np.exp(result.params)[0] # extracting just odds ratio
# ORs.loc[comb[1], comb[0]] = OR
In [96]:
sns.set(style="white")
# The heatmap cannot render NaN cells -- fill the never-assigned upper
# triangle (and diagonal) with 0.
phis = phis.fillna(value=0)
# Mask the redundant upper triangle so each pair shows exactly once.
# np.bool was a deprecated alias for the builtin bool (removed in
# NumPy 1.24); using `bool` directly is identical on every NumPy version.
mask = np.zeros_like(phis, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# NOTE(review): `cmap` is built but never passed to the heatmap -- add
# cmap=cmap below if the diverging palette is actually wanted.
pic = sns.heatmap(phis, mask=mask, xticklabels=varsformap, yticklabels=varsformap)
In [13]:
# Re-run the pipeline restricted to the Medicaid-only table, excluding risk
# factors that only arise during pregnancy and so are not knowable at intake:
# 'rf_gest_rec_c', 'rf_ghyp_rec_c', 'rf_eclam_rec_c'.
vital_features = ['rf_diab_rec_c', 'rf_phyp_rec_c',
                  'rf_ppoutc_rec_c', 'rf_ppterm_rec_c', 'dplural_c',
                  'sexdis_rec_f', 'ip_gono_rec_c', 'ip_syph_rec_c',
                  'ip_chlam_rec_c', 'ip_hepatb_rec_c', 'ip_hepatc_rec_c',
                  'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n', 'cig_2_rec_n',
                  'cig_3_rec_n', 'pay_rec_rec_c', 'pwgt_r_rec_n']
vital_outcome = "advb_otc"
# NOTE(review): unquoted Postgres identifiers fold to lower case -- confirm
# the table really is reachable under the mixed-case name "justMedicaid".
vital_table = "justMedicaid"
df = features.general_data_getter(vital_features, vital_outcome, vital_table,
                                  n_rows=100000, conn=conn, create_ref=True)
# Same hyperparameter grid as the all-payers run above.
classifiers = {
    'clf': LogisticRegression,
    'param_dict': {
        'C': [0.001, 0.01, 1, 10],
        'penalty': ["l1", "l2"]
    }
}
# Tighter intervention budget this time: precision in the top 10%.
models_dict, pkl_dct = models.machine_learner(df, classifiers, "kfold_cv", k=[0.1])
evaluation_df = evaluation.dict_to_dataframe(models_dict, pkl_dct)
# sort_values replaces the removed DataFrame.sort API.
pd.DataFrame(evaluation_df.sort_values(by='precision at 0.1 mean', ascending=False)['pickle_file'])
Out[13]:
In [14]:
# Full evaluation table for the Medicaid-only run, best model first.
# sort_values replaces the removed DataFrame.sort API.
pd.DataFrame(evaluation_df.sort_values(by='precision at 0.1 mean', ascending=False))
Out[14]:
In [17]:
# NOTE(review): sklearn.externals.joblib was deprecated (removed in
# scikit-learn 0.23) -- in newer environments use `import joblib` directly.
from sklearn.externals import joblib
lr = joblib.load('pickles/LogisticRegression1.pkl')
# Coefficients for the Medicaid-only model, smallest first.
# sort_values replaces the removed DataFrame.sort API.
pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]}).sort_values(by='coef')
Out[17]: