In [1]:
""" This notebook describes exploratory analyses of 2013 US vital
statistics data. These data are open access and publicly available.

Written by Laura Nolan

Created on 8/1/2015
Updated on 8/24/2015

"""

In [ ]:
import pandas as pd
import numpy as np
import psycopg2
import json
import re
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import itertools
import statsmodels.api as sm
import matplotlib as mpl
mpl.use('Agg')  # note: %matplotlib inline below switches the backend again, making this a no-op here
import seaborn as sns
import math

%matplotlib inline

Setting up the database connection


In [2]:
with open('psql_psycopg2.password') as f:
    params = json.load(f)

try:
    conn = psycopg2.connect(**params)
    conn.autocommit = True
    cur = conn.cursor()

except psycopg2.Error as e:
    print('Unable to connect to database: %s' % e)
    
import sys
sys.path.insert(0, '/home/lnolan/git_dir/babies')
from babysaver import features
from babysaver import models
from babysaver import evaluation
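
For reference, psql_psycopg2.password is a JSON object whose keys map directly onto psycopg2.connect() keyword arguments. A hypothetical sketch of its shape (placeholder values, not the real credentials):


In [ ]:
params_example = {'host': 'localhost', 'port': 5432, 'dbname': 'vitals',
                  'user': 'lnolan', 'password': '...'}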

Pulling in a sample of the vital statistics


In [3]:
# listing only features that are more or less present in 707G
vital_features = ['rf_diab_rec_c', 'rf_gest_rec_c', 'rf_phyp_rec_c', 'rf_ghyp_rec_c', 
            'rf_eclam_rec_c', 'rf_ppoutc_rec_c', 'rf_ppterm_rec_c', 'dplural_c', 
            'sexdis_rec_f', 'pay_rec_c', 'ip_gono_rec_c', 'ip_syph_rec_c',
            'ip_chlam_rec_c', 'ip_hepatb_rec_c', 'ip_hepatc_rec_c',
            'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n', 'cig_2_rec_n',
            'cig_3_rec_n', 'pay_rec_rec_c', 'pwgt_r_rec_n']
vital_outcome = "advb_otc"
vital_table = "vitals_all"

Applying pipeline to vital statistics


In [6]:
df = features.general_data_getter(vital_features, vital_outcome, vital_table,
                                  n_rows=100000, conn=conn, create_ref=True)


data_getter: dataset has dimensions (81217, 33)

In [41]:
df['dataframe'].head()


Out[41]:
index sexdis_rec_f UNIQUE_ID_I rf_diab_rec_c_Y rf_gest_rec_c_Y rf_phyp_rec_c_Y rf_ghyp_rec_c_Y rf_eclam_rec_c_Y rf_ppoutc_rec_c_Y rf_ppterm_rec_c_Y ... pay_rec_rec_c_1 pay_rec_rec_c_4 pay_rec_rec_c_3 advb_otc meduc_rec_n cig_0_rec_n cig_1_rec_n cig_2_rec_n cig_3_rec_n pwgt_r_rec_n
0 0 0 1 0 0 0 0 0 0 0 ... 1 0 0 0 -0.120947 -0.275209 -0.234142 -0.213524 -0.203926 -0.129174
1 1 0 2 0 0 0 1 0 0 0 ... 1 0 0 1 -0.695180 -0.275209 -0.234142 -0.213524 -0.203926 -0.869456
2 2 0 3 0 0 0 0 0 0 0 ... 0 0 0 1 -0.695180 -0.275209 -0.234142 -0.213524 -0.203926 0.240967
3 3 0 4 0 0 0 0 0 0 0 ... 1 0 0 0 -0.695180 -0.275209 -0.234142 -0.213524 -0.203926 -0.252554
4 4 0 5 0 0 0 0 0 0 0 ... 0 0 0 0 -0.120947 -0.275209 -0.234142 -0.213524 -0.203926 -0.474639

5 rows × 33 columns
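
Note: judging from this output, the data getter appears to one-hot encode the categorical features (the _Y and level-suffixed columns) and standardize the numeric ones, which is why the meduc/cig/weight columns contain centered, scaled values.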

Proportion of adverse births in the sample


In [7]:
print df['dataframe']['advb_otc'].mean()


0.170087543248

In [43]:
print df['features']


['sexdis_rec_f', 'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n', 'cig_2_rec_n', 'cig_3_rec_n', 'pwgt_r_rec_n', 'rf_diab_rec_c_Y', 'rf_gest_rec_c_Y', 'rf_phyp_rec_c_Y', 'rf_ghyp_rec_c_Y', 'rf_eclam_rec_c_Y', 'rf_ppoutc_rec_c_Y', 'rf_ppterm_rec_c_Y', 'dplural_c_2', 'dplural_c_3', 'dplural_c_4', 'dplural_c_5', 'pay_rec_c_1', 'pay_rec_c_4', 'pay_rec_c_3', 'pay_rec_c_9', 'ip_gono_rec_c_Y', 'ip_syph_rec_c_Y', 'ip_chlam_rec_c_Y', 'ip_hepatb_rec_c_Y', 'ip_hepatc_rec_c_Y', 'pay_rec_rec_c_1', 'pay_rec_rec_c_4', 'pay_rec_rec_c_3']

In [44]:
help(models.machine_learner)


Help on function machine_learner in module babysaver.models:

machine_learner(data_dct, clf_library, cv, make_evals=False, pkl_folder='pickles', verbose=False, eval_folder='evals', n_folds=5, stratified=True, bagging=False, test_size=0.3, increment=6, k=0.1, start_date=None, end_date=None, **kwargs)
    This function is customized for temporal cross-validation and 
    precision-at-k evaluation.
    
    data_dct - dictionary containing data, features, outcome from 
               features.data_getter() 
    clf_library - list of classifier dictionaries in following format
            {
                'clf': Classifier(),
                'param_dict': {
                    'param1': [list of parameter values],
                    'param2': [list of parameter values]
                } 
            }
    cv - specify type of cross-validation as string, current options are:
            'temporal_cv', 'train_test_split', 'kfold_cv'
    pkl_folder - folder to save pickle files 
    n_folds - if using kfold_cv, number of folds to use 
    stratified - True/False, whether to use stratified kfold cv 
    test_size - if using train_test_split, size of test set as decimal
                i.e., 0.1, 0.2, 0.3
    first_train_end - if using temporal_cv, specify the end date of the 
                      initial training set
    
    train_date_var - if using temporal cv, which date variable to use for 
                     train set
        format: 'Year-Month-Day' (include quotes)
    test_date_var - if using temporal cv, which date variable to use for 
                    test set
        format: 'Year-Month-Day' (include quotes)
    
    *** PLEASE NOTE: most of the time these will be the same.  In our case, 
        we must look at birth date for the train set and assessment date 
        for the test set because we can't train on data without seeing the 
        birth outcome first but we can predict on data without the birth 
        outcome. All observations in our dataset have realized birth outcomes 
        at some point in time 
    
    increment - time window of the training set specified in months
                (default: 6, aka 6 months)
    k - precision at what top fraction of data (default: 0.1, aka top 10%)
    
    *** PLEASE NOTE: the current implementation of precision @ k 
        looks at the top k fraction of the TRAINING SET to determine the
        threshold for the TEST set, which is crucial for our purposes ***
    
    start_date - date of first observation to include
        format: 'Year-Month-Day' (include quotes) 
        (default: min(train_date_var))
    end_date - date of last observation to include 
        format: 'Year-Month-Day' (include quotes)
        (default: max(train_date_var))
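
To make that note concrete, here is a minimal sketch of the precision-at-k convention described above (an illustration only, not the babysaver implementation; the function name and signature are hypothetical):


In [ ]:
def precision_at_k_train_threshold(y_test, test_scores, train_scores, k=0.1):
    # the score cutoff comes from the top k fraction of the TRAINING set...
    threshold = np.percentile(train_scores, 100 * (1 - k))
    # ...and is then applied to the TEST set
    flagged = test_scores >= threshold
    if flagged.sum() == 0:
        return np.nan
    # precision = fraction of flagged test observations with an adverse outcome
    return y_test[flagged].mean()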

Specifying classifiers and running the pipeline


In [8]:
classifiers = {
    'clf': LogisticRegression,
    'param_dict': {
        'C': [0.001, 0.01, 1, 10],
        'penalty': ["l1", "l2"]
        }
    }

# run the pipeline with k-fold cross-validation, evaluating precision at the top 30%
models_dict, pkl_dct = models.machine_learner(df, classifiers, "kfold_cv", k=[0.3])


You have not specified a folder for the pickle files. By default, they will be saved in ./pickles/ - do you want to continue? (y/n)y
machine_learner: finished running models
machine_learner: pickle files available in pickles/
machine_learner: total runtime was 0:00:29.226414

Running evaluation


In [9]:
evaluation_df = evaluation.dict_to_dataframe(models_dict, pkl_dct)
pd.DataFrame(evaluation_df.sort('precision at 0.3 mean', ascending=False)['pickle_file'])


Out[9]:
pickle_file
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression6.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression7.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression3.pkl
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression2.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression5.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression1.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression4.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression0.pkl

In [10]:
pd.DataFrame(evaluation_df.sort('precision at 0.3 mean', ascending=False))


Out[10]:
avg_prec_score_mean avg_prec_score_std roc_auc_mean roc_auc_std avg_prec_0.3 mean avg_prec_0.3 std precision at 0.3 mean precision at 0.3 std recall at 0.3 mean recall at 0.3 std test_count at 0.3 mean test_count at 0.3 std test_percent at 0.3 mean test_percent at 0.3 std pickle_file
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.378021 0.007457 0.663657 0.003127 0.269703 0.009081 0.282492 0.003350 0.499710 0.009586 4887.2 73.431601 0.300873 0.004518 pickles/LogisticRegression6.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.378342 0.007251 0.663673 0.003106 0.269986 0.008827 0.282329 0.003214 0.499420 0.009946 4887.2 79.728916 0.300873 0.004905 pickles/LogisticRegression7.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.378449 0.007149 0.663674 0.003110 0.270113 0.008714 0.282317 0.003202 0.499420 0.009946 4887.4 79.077810 0.300885 0.004865 pickles/LogisticRegression3.pkl
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.378358 0.007259 0.663600 0.003288 0.269961 0.008865 0.282294 0.003491 0.499275 0.009590 4886.4 72.244723 0.300824 0.004445 pickles/LogisticRegression2.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.369120 0.007162 0.662346 0.003726 0.260471 0.009078 0.281483 0.003672 0.498624 0.010449 4894.2 89.569526 0.301304 0.005514 pickles/LogisticRegression5.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.362182 0.006804 0.655399 0.004709 0.252071 0.008438 0.276287 0.004459 0.487694 0.011941 4876.8 88.930872 0.300233 0.005474 pickles/LogisticRegression1.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.335543 0.007157 0.639589 0.005597 0.224703 0.008738 0.265253 0.004117 0.467714 0.009276 4871.6 62.970628 0.299913 0.003882 pickles/LogisticRegression4.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.315779 0.008254 0.613340 0.005141 0.184670 0.010284 0.216166 0.003262 0.571377 0.009519 7303.0 85.953476 0.449598 0.005284 pickles/LogisticRegression0.pkl
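
For reference, the base rate of adverse births in this sample is about 0.17 (computed above), so precision near 0.28 in the top 30% represents roughly a 1.7x lift over random selection.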

In [74]:
df['features']


Out[74]:
['sexdis_rec_f',
 'meduc_rec_n',
 'cig_0_rec_n',
 'cig_1_rec_n',
 'cig_2_rec_n',
 'cig_3_rec_n',
 'pwgt_r_rec_n',
 'rf_diab_rec_c_Y',
 'rf_gest_rec_c_Y',
 'rf_phyp_rec_c_Y',
 'rf_ghyp_rec_c_Y',
 'rf_eclam_rec_c_Y',
 'rf_ppoutc_rec_c_Y',
 'rf_ppterm_rec_c_Y',
 'dplural_c_2',
 'dplural_c_3',
 'dplural_c_4',
 'dplural_c_5',
 'pay_rec_c_1',
 'pay_rec_c_4',
 'pay_rec_c_3',
 'pay_rec_c_9',
 'ip_gono_rec_c_Y',
 'ip_syph_rec_c_Y',
 'ip_chlam_rec_c_Y',
 'ip_hepatb_rec_c_Y',
 'ip_hepatc_rec_c_Y',
 'pay_rec_rec_c_1',
 'pay_rec_rec_c_4',
 'pay_rec_rec_c_3']

Looking at coefficient importances. Per the evaluation table above, LogisticRegression1.pkl is the C=0.01, L1-penalized model, so the L1 penalty shrinks many coefficients exactly to zero.


In [75]:
from sklearn.externals import joblib

lr = joblib.load('pickles/LogisticRegression1.pkl')
pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]})


Out[75]:
coef features
0 0.000000 sexdis_rec_f
1 -0.107866 meduc_rec_n
2 0.000000 cig_0_rec_n
3 0.044208 cig_1_rec_n
4 0.031915 cig_2_rec_n
5 0.000000 cig_3_rec_n
6 -0.050275 pwgt_r_rec_n
7 0.000000 rf_diab_rec_c_Y
8 0.106340 rf_gest_rec_c_Y
9 0.456506 rf_phyp_rec_c_Y
10 0.902196 rf_ghyp_rec_c_Y
11 0.000000 rf_eclam_rec_c_Y
12 0.000000 rf_ppoutc_rec_c_Y
13 0.781540 rf_ppterm_rec_c_Y
14 2.450936 dplural_c_2
15 0.000000 dplural_c_3
16 0.000000 dplural_c_4
17 0.000000 dplural_c_5
18 0.158389 pay_rec_c_1
19 0.000000 pay_rec_c_4
20 0.000000 pay_rec_c_3
21 0.000000 pay_rec_c_9
22 0.000000 ip_gono_rec_c_Y
23 0.000000 ip_syph_rec_c_Y
24 0.000000 ip_chlam_rec_c_Y
25 0.000000 ip_hepatb_rec_c_Y
26 0.000000 ip_hepatc_rec_c_Y
27 0.000004 pay_rec_rec_c_1
28 0.000000 pay_rec_rec_c_4
29 0.000000 pay_rec_rec_c_3

In [76]:
pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]}).sort('coef')


Out[76]:
coef features
1 -0.107866 meduc_rec_n
6 -0.050275 pwgt_r_rec_n
0 0.000000 sexdis_rec_f
26 0.000000 ip_hepatc_rec_c_Y
25 0.000000 ip_hepatb_rec_c_Y
24 0.000000 ip_chlam_rec_c_Y
23 0.000000 ip_syph_rec_c_Y
22 0.000000 ip_gono_rec_c_Y
21 0.000000 pay_rec_c_9
20 0.000000 pay_rec_c_3
19 0.000000 pay_rec_c_4
17 0.000000 dplural_c_5
16 0.000000 dplural_c_4
15 0.000000 dplural_c_3
29 0.000000 pay_rec_rec_c_3
12 0.000000 rf_ppoutc_rec_c_Y
11 0.000000 rf_eclam_rec_c_Y
7 0.000000 rf_diab_rec_c_Y
5 0.000000 cig_3_rec_n
2 0.000000 cig_0_rec_n
28 0.000000 pay_rec_rec_c_4
27 0.000004 pay_rec_rec_c_1
4 0.031915 cig_2_rec_n
3 0.044208 cig_1_rec_n
8 0.106340 rf_gest_rec_c_Y
18 0.158389 pay_rec_c_1
9 0.456506 rf_phyp_rec_c_Y
13 0.781540 rf_ppterm_rec_c_Y
10 0.902196 rf_ghyp_rec_c_Y
14 2.450936 dplural_c_2
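
Since coefficients carry signs, sorting by absolute value makes the magnitude ordering easier to read. A small sketch (iloc with argsort avoids version-specific pandas sort methods):


In [ ]:
coef_df = pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]})
# order rows by |coefficient|, largest first
coef_df.iloc[np.argsort(-np.abs(coef_df['coef'].values))]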

SWITCHING GEARS A BIT:

Correlation matrix for vital statistics

Just the variables that roughly correspond to 707


In [77]:
vars707 = ['rf_diab_c_rec', 'rf_gest_c_rec', 'rf_phyp_c_rec', 'rf_ghyp_c_rec', 
            'rf_eclam_c_rec', 'rf_ppoutc_c_rec', 'rf_ppterm_c_rec', 'dplural_c',
            'sexdis_rec_f', 'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n', 
            'cig_2_rec_n', 'cig_3_rec_n', 'pay_rec_rec_c', 'advb_otc']
            
# initializing an empty DataFrame indexed and columned by vars707 - the shell of a correlation matrix
ORs = pd.DataFrame(data=None, index=vars707, columns=vars707)
ORs.head()


Out[77]:
rf_diab_c_rec rf_gest_c_rec rf_phyp_c_rec rf_ghyp_c_rec rf_eclam_c_rec rf_ppoutc_c_rec rf_ppterm_c_rec dplural_c sexdis_rec_f meduc_rec_n cig_0_rec_n cig_1_rec_n cig_2_rec_n cig_3_rec_n pay_rec_rec_c advb_otc
rf_diab_c_rec NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
rf_gest_c_rec NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
rf_phyp_c_rec NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
rf_ghyp_c_rec NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
rf_eclam_c_rec NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Loading from Postgres


In [ ]:
for_heatmap = pd.read_sql("""SELECT rf_diab_c_rec, rf_gest_c_rec, rf_phyp_c_rec, rf_ghyp_c_rec,
                                    rf_eclam_c_rec, rf_ppoutc_c_rec, rf_ppterm_c_rec, dplural_c,
                                    sexdis_rec_f, meduc_rec_n, cig_0_rec_n, cig_1_rec_n,
                                    cig_2_rec_n, cig_3_rec_n, pay_rec_rec_c, pwgt_r_rec_n, advb_otc
                             FROM forheatmap ORDER BY RANDOM() LIMIT 100000""", conn)

In [79]:
# Checking types - some things will need to be recast
[(for_heatmap[i].dtype,i) for i in for_heatmap.columns.values]


Out[79]:
[(dtype('float64'), 'rf_diab_c_rec'),
 (dtype('float64'), 'rf_gest_c_rec'),
 (dtype('float64'), 'rf_phyp_c_rec'),
 (dtype('float64'), 'rf_ghyp_c_rec'),
 (dtype('float64'), 'rf_eclam_c_rec'),
 (dtype('float64'), 'rf_ppoutc_c_rec'),
 (dtype('float64'), 'rf_ppterm_c_rec'),
 (dtype('O'), 'dplural_c'),
 (dtype('float64'), 'sexdis_rec_f'),
 (dtype('float64'), 'meduc_rec_n'),
 (dtype('float64'), 'cig_0_rec_n'),
 (dtype('float64'), 'cig_1_rec_n'),
 (dtype('float64'), 'cig_2_rec_n'),
 (dtype('float64'), 'cig_3_rec_n'),
 (dtype('O'), 'pay_rec_rec_c'),
 (dtype('float64'), 'pwgt_r_rec_n'),
 (dtype('float64'), 'advb_otc')]

In [80]:
# making sure that everything is a float

for_heatmap[['dplural_c', 'pay_rec_rec_c']] = for_heatmap[['dplural_c', 'pay_rec_rec_c']].astype(float)

In [81]:
# looking at all variables to see their distributions
cols_to_check = ['rf_diab_c_rec', 'rf_gest_c_rec', 'rf_phyp_c_rec', 'rf_ghyp_c_rec',
                 'rf_eclam_c_rec', 'rf_ppoutc_c_rec', 'rf_ppterm_c_rec',
                 'dplural_c',    # NEEDS TO BE RECODED TO >1
                 'sexdis_rec_f',
                 'meduc_rec_n',  # NEEDS TO BE RECODED TO (probably) more than high school, etc.
                 'cig_0_rec_n']

for col in cols_to_check:
    print col
    print for_heatmap[col].value_counts(dropna=False)


rf_diab_c_rec
 0     89515
NaN     9778
 1       707
dtype: int64
rf_gest_c_rec
 0     85484
NaN     9778
 1      4738
dtype: int64
rf_phyp_c_rec
 0     88873
NaN     9778
 1      1349
dtype: int64
rf_ghyp_c_rec
 0     85759
NaN     9778
 1      4463
dtype: int64
rf_eclam_c_rec
 0     90024
NaN     9778
 1       198
dtype: int64
rf_ppoutc_c_rec
 0     88213
NaN     9778
 1      2009
dtype: int64
rf_ppterm_c_rec
 0     87839
NaN     9778
 1      2383
dtype: int64
dplural_c
1    96518
2     3347
3      127
4        6
5        2
dtype: int64
sexdis_rec_f
 0     88235
NaN     9544
 1      2221
dtype: int64
meduc_rec_n
 3     22288
 4     19080
 6     16889
 2     10763
NaN    10627
 7      7502
 5      7158
 1      3476
 8      2217
dtype: int64
cig_0_rec_n
 0     76547
NaN    13922
 20     2975
 10     2595
 5       942
 3       481
 2       354
 4       353
 6       264
 40      258
 1       256
 15      242
 7       179
 30      179
 8       153
 12      109
 60       41
 9        35
 25       19
 98       15
 80        9
 11        9
 22        6
 14        6
 13        6
 18        5
 50        5
 24        5
 16        4
 45        4
 21        3
 90        3
 17        3
 19        2
 35        2
 71        1
 23        1
 26        1
 27        1
 29        1
 36        1
 41        1
 42        1
 75        1
dtype: int64

In [82]:
# checking how many missing values there are for a frequently missing variable
for_heatmap['rf_diab_c_rec'].value_counts(dropna = False)


Out[82]:
 0     89515
NaN     9778
 1       707
dtype: int64

In [83]:
# dropping rows with any missing values
for_heatmap_complete = for_heatmap.dropna()
for_heatmap_complete.head()


Out[83]:
rf_diab_c_rec rf_gest_c_rec rf_phyp_c_rec rf_ghyp_c_rec rf_eclam_c_rec rf_ppoutc_c_rec rf_ppterm_c_rec dplural_c sexdis_rec_f meduc_rec_n cig_0_rec_n cig_1_rec_n cig_2_rec_n cig_3_rec_n pay_rec_rec_c pwgt_r_rec_n advb_otc
0 0 0 0 0 0 0 0 1 0 4 0 0 0 0 2 139 0
2 0 0 0 0 0 0 0 1 0 4 0 0 0 0 1 125 0
3 0 0 0 0 0 0 0 1 0 2 0 0 0 0 1 130 1
5 0 0 0 0 0 0 0 1 0 3 5 4 0 0 1 145 0
6 0 0 0 0 0 0 0 1 0 3 0 0 0 0 4 147 0

In [84]:
len(for_heatmap_complete) # quickly checking sample size


Out[84]:
81009

Recoding whether there is more than 1 fetus


In [ ]:
def babynum(f):
    # indicator for a multiple birth (plurality greater than 1)
    if f > 1:
        return 1
    else: 
        return 0

dplural_c_dum = [babynum(x) for x in for_heatmap_complete['dplural_c']]

In [86]:
type(for_heatmap_complete['dplural_c'][0]) # checking type...


Out[86]:
numpy.float64

Recoding mother's education to 0/1


In [ ]:
def meduc_rec(f):
    # 1 = education codes 1-2, i.e., below high-school completion (see note above)
    if f < 3:
        return 1
    else:
        return 0

meduc_rec_n_dum = [meduc_rec(x) for x in for_heatmap_complete['meduc_rec_n']]

Recoding cigs


In [ ]:
def cigs(f):
    # indicator for any cigarette use in the given period
    if f > 0:
        return 1
    else:
        return 0
    
cig_0_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_0_rec_n']]
cig_1_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_1_rec_n']]
cig_2_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_2_rec_n']]
cig_3_rec_n_dum = [cigs(x) for x in for_heatmap_complete['cig_3_rec_n']]

Recoding payment method - 1 is Medicaid


In [ ]:
def pay(f):
    # indicator for Medicaid as the payment method (code 1)
    if f == 1:
        return 1
    else: 
        return 0

pay_rec_rec_c_dum = [pay(x) for x in for_heatmap_complete['pay_rec_rec_c']]
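
As an aside, these 0/1 recodes could also be built in one step each with vectorized pandas comparisons - a sketch with the same semantics as the helper functions above:


In [ ]:
# vectorized equivalents of the recoding helpers (illustration only)
dplural_c_dum     = (for_heatmap_complete['dplural_c'] > 1).astype(int).tolist()
meduc_rec_n_dum   = (for_heatmap_complete['meduc_rec_n'] < 3).astype(int).tolist()
cig_0_rec_n_dum   = (for_heatmap_complete['cig_0_rec_n'] > 0).astype(int).tolist()
pay_rec_rec_c_dum = (for_heatmap_complete['pay_rec_rec_c'] == 1).astype(int).tolist()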

In [90]:
type(cig_3_rec_n_dum) # the recoded vars are in list format


Out[90]:
list

In [91]:
# Getting all vars, including recoded, into one data frame
# (the recoded lists are matched positionally against the index of the Series columns)

test = pd.DataFrame({'cigs_before_preg': cig_0_rec_n_dum,
        'cigs_in_first_tri': cig_1_rec_n_dum,
        'cigs_in_second_tri': cig_2_rec_n_dum,
        'cigs_in_third_tri': cig_3_rec_n_dum,
        'medicaid_payer': pay_rec_rec_c_dum,
        'diabetes': for_heatmap_complete['rf_diab_c_rec'],
        'gestational_diab': for_heatmap_complete['rf_gest_c_rec'],
        'prepreg_hyp': for_heatmap_complete['rf_phyp_c_rec'],
        'gest_hyp': for_heatmap_complete['rf_ghyp_c_rec'],
        'eclampsia': for_heatmap_complete['rf_eclam_c_rec'],
        'poor_prev_preg_out': for_heatmap_complete['rf_ppoutc_c_rec'],
        'previous_preterm': for_heatmap_complete['rf_ppterm_c_rec'],
        'multi_fetus': dplural_c_dum,
        'sexually_trans_dis': for_heatmap_complete['sexdis_rec_f'],
        'advb_otc': for_heatmap_complete['advb_otc']})

test.head()


Out[91]:
advb_otc cigs_before_preg cigs_in_first_tri cigs_in_second_tri cigs_in_third_tri diabetes eclampsia gest_hyp gestational_diab medicaid_payer multi_fetus poor_prev_preg_out prepreg_hyp previous_preterm sexually_trans_dis
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [92]:
# all pairs of columns
varsformap = ['cigs_before_preg', 'cigs_in_first_tri', 'cigs_in_second_tri', 'cigs_in_third_tri',
             'medicaid_payer', 'diabetes', 'gestational_diab', 'prepreg_hyp', 'gest_hyp', 'eclampsia',
             'poor_prev_preg_out', 'previous_preterm', 'multi_fetus', 'sexually_trans_dis', 'advb_otc']
forints = list(itertools.combinations(varsformap, 2))
 
# creating the shell for the correlation matrix
phis = pd.DataFrame(data=None, index=varsformap, columns=varsformap)
phis.head()


Out[92]:
cigs_before_preg cigs_in_first_tri cigs_in_second_tri cigs_in_third_tri medicaid_payer diabetes gestational_diab prepreg_hyp gest_hyp eclampsia poor_prev_preg_out previous_preterm multi_fetus sexually_trans_dis advb_otc
cigs_before_preg NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
cigs_in_first_tri NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
cigs_in_second_tri NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
cigs_in_third_tri NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
medicaid_payer NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Defining a function to compute the phi correlation (a measure of association for two binary variables)
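
For a 2x2 table with cell counts $n_{11}, n_{10}, n_{01}, n_{00}$ and margins $n_{1\cdot}, n_{0\cdot}, n_{\cdot 1}, n_{\cdot 0}$, the phi coefficient is

$$\phi = \frac{n_{11}\,n_{00} - n_{10}\,n_{01}}{\sqrt{n_{1\cdot}\; n_{0\cdot}\; n_{\cdot 1}\; n_{\cdot 0}}}$$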


In [ ]:
def phi(thingy):
    # thingy is the 3x3 array from a 2x2 crosstab built with margins=True:
    # rows/columns 0-1 hold the cell counts, row/column 2 hold the totals
    num = (thingy[1, 1] * thingy[0, 0]) - (thingy[1, 0] * thingy[0, 1])
    denom = math.sqrt(thingy[2, 1] * thingy[2, 0] * thingy[0, 2] * thingy[1, 2])
    return num / denom

In [94]:
# computing the phi correlation for each pair and filling the lower triangle
# of the matrix shell

for comb in forints:
    thingy = np.array(pd.crosstab(test[comb[0]], test[comb[1]], margins=True))
    phis.loc[comb[1], comb[0]] = phi(thingy)

print phis.head()


                   cigs_before_preg cigs_in_first_tri cigs_in_second_tri  \
cigs_before_preg                NaN               NaN                NaN   
cigs_in_first_tri         0.8388118               NaN                NaN   
cigs_in_second_tri         0.774751         0.9066412                NaN   
cigs_in_third_tri         0.7513074         0.8689686          0.9440565   
medicaid_payer            0.1683236          0.173997           0.172581   

                   cigs_in_third_tri medicaid_payer diabetes gestational_diab  \
cigs_before_preg                 NaN            NaN      NaN              NaN   
cigs_in_first_tri                NaN            NaN      NaN              NaN   
cigs_in_second_tri               NaN            NaN      NaN              NaN   
cigs_in_third_tri                NaN            NaN      NaN              NaN   
medicaid_payer             0.1688268            NaN      NaN              NaN   

                   prepreg_hyp gest_hyp eclampsia poor_prev_preg_out  \
cigs_before_preg           NaN      NaN        NaN                NaN   
cigs_in_first_tri          NaN      NaN        NaN                NaN   
cigs_in_second_tri         NaN      NaN        NaN                NaN   
cigs_in_third_tri          NaN      NaN        NaN                NaN   
medicaid_payer             NaN      NaN        NaN                NaN   

                   previous_preterm multi_fetus sexually_trans_dis advb_otc  
cigs_before_preg                NaN         NaN                NaN      NaN  
cigs_in_first_tri               NaN         NaN                NaN      NaN  
cigs_in_second_tri              NaN         NaN                NaN      NaN  
cigs_in_third_tri               NaN         NaN                NaN      NaN  
medicaid_payer                  NaN         NaN                NaN      NaN  
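
As a quick sanity check (a sketch, not part of the original run): for 0/1 variables the phi coefficient equals the Pearson correlation, so np.corrcoef gives an independent cross-check of the function above.


In [ ]:
a = test['cigs_before_preg'].values
b = test['advb_otc'].values
ct = np.array(pd.crosstab(a, b, margins=True))
# phi from the margined crosstab should equal the Pearson correlation of the two 0/1 vectors
assert abs(phi(ct) - np.corrcoef(a, b)[0, 1]) < 1e-10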

In [95]:
# another option is to compute the odds ratios from bivariate logistic regression models
# (note: sm.Logit does not add an intercept automatically, so sm.add_constant
# would be needed for a conventional odds ratio)
# for comb in forints:
    # print comb[0], comb[1]
    # logit = sm.Logit(test[comb[0]], test[comb[1]])
    # result = logit.fit()
    # OR = np.exp(result.params)[0] # extracting just the odds ratio
    # ORs.loc[comb[1], comb[0]] = OR

Plotting a heat map of the phi correlations


In [96]:
sns.set(style="white")

# replace NAs with 0 and cast to float - or else the heat map will not run
phis = phis.fillna(value=0).astype(float)

# Generate a mask for the upper triangle (only the lower triangle was filled above)
mask = np.zeros_like(phis, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Generate a custom diverging colormap and pass it to the heat map
cmap = sns.diverging_palette(220, 10, as_cmap=True)

pic = sns.heatmap(phis, mask=mask, cmap=cmap, xticklabels=varsformap, yticklabels=varsformap)


Rerunning the pipeline using only women whose births were paid for by Medicaid


In [13]:
# listing only features that are more or less present in 707G;
# risk factors arising during the pregnancy itself are excluded, since it is
# not fair to include them: 'rf_gest_rec_c', 'rf_ghyp_rec_c', 'rf_eclam_rec_c'
vital_features = ['rf_diab_rec_c', 'rf_phyp_rec_c',
            'rf_ppoutc_rec_c', 'rf_ppterm_rec_c', 'dplural_c', 
            'sexdis_rec_f', 'ip_gono_rec_c', 'ip_syph_rec_c',
            'ip_chlam_rec_c', 'ip_hepatb_rec_c', 'ip_hepatc_rec_c',
            'meduc_rec_n', 'cig_0_rec_n', 'cig_1_rec_n', 'cig_2_rec_n',
            'cig_3_rec_n', 'pay_rec_rec_c', 'pwgt_r_rec_n']
vital_outcome = "advb_otc"
vital_table = "justMedicaid"

df = features.general_data_getter(vital_features, vital_outcome, vital_table,
                                  n_rows=100000, conn=conn, create_ref=True)

classifiers = {
    'clf': LogisticRegression,
    'param_dict': {
        'C': [0.001, 0.01, 1, 10],
        'penalty': ["l1", "l2"]
        }
    }

# run the pipeline with k-fold cross-validation, evaluating precision at the top 10%
models_dict, pkl_dct = models.machine_learner(df, classifiers, "kfold_cv", k=[0.1])


evaluation_df = evaluation.dict_to_dataframe(models_dict, pkl_dct)
pd.DataFrame(evaluation_df.sort('precision at 0.1 mean', ascending=False)['pickle_file'])


data_getter: dataset has dimensions (90647, 22)
You have not specified a folder for the pickle files. By default, they will be saved in ./pickles/ - do you want to continue? (y/n)y
machine_learner: finished running models
machine_learner: pickle files available in pickles/
machine_learner: total runtime was 0:00:16.470830
Out[13]:
pickle_file
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression2.pkl
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression6.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression7.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression3.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression5.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression1.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression4.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) pickles/LogisticRegression0.pkl

In [14]:
pd.DataFrame(evaluation_df.sort('precision at 0.1 mean', ascending=False))


Out[14]:
avg_prec_score_mean avg_prec_score_std roc_auc_mean roc_auc_std avg_prec_0.1 mean avg_prec_0.1 std precision at 0.1 mean precision at 0.1 std recall at 0.1 mean recall at 0.1 std test_count at 0.1 mean test_count at 0.1 std test_percent at 0.1 mean test_percent at 0.1 std pickle_file
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.348052 0.004442 0.605928 0.004647 0.152348 0.005087 0.434872 0.008406 0.225413 0.003993 1812.6 48.839533 0.099981 0.002693 pickles/LogisticRegression2.pkl
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.347984 0.004480 0.605935 0.004620 0.152329 0.005042 0.434680 0.008300 0.225413 0.003709 1813.4 47.715825 0.100025 0.002631 pickles/LogisticRegression6.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.348175 0.004478 0.605925 0.004604 0.152544 0.005132 0.434585 0.008248 0.225413 0.003816 1813.8 48.473704 0.100047 0.002673 pickles/LogisticRegression7.pkl
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.348228 0.004510 0.605924 0.004606 0.152574 0.005171 0.434458 0.008115 0.225299 0.003914 1813.4 48.982650 0.100025 0.002701 pickles/LogisticRegression3.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.343962 0.003745 0.604942 0.004906 0.148214 0.003327 0.429988 0.010114 0.222266 0.003646 1807.8 50.514354 0.099716 0.002785 pickles/LogisticRegression5.pkl
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.338083 0.003234 0.598966 0.005718 0.144032 0.002288 0.418889 0.009191 0.217118 0.003660 1812.4 38.240031 0.099970 0.002108 pickles/LogisticRegression1.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.325776 0.005685 0.599898 0.005309 0.130053 0.005297 0.409786 0.008017 0.211797 0.003723 1807.0 25.855367 0.099672 0.001426 pickles/LogisticRegression4.pkl
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n intercept_scaling=1, max_iter=100, multi_class='ovr',\n penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n verbose=0) 0.282969 0.017664 0.563666 0.003541 0.074025 0.020642 0.365911 0.007483 0.192632 0.007835 1840.6 73.717705 0.101526 0.004068 pickles/LogisticRegression0.pkl

In [17]:
from sklearn.externals import joblib

lr = joblib.load('pickles/LogisticRegression1.pkl')
pd.DataFrame({'features': df['features'], 'coef': lr.coef_[0]}).sort('coef')


Out[17]:
coef features
6 -0.048390 pwgt_r_rec_n
1 -0.045012 meduc_rec_n
18 0.000000 ip_hepatc_rec_c_Y
16 0.000000 ip_chlam_rec_c_Y
15 0.000000 ip_syph_rec_c_Y
14 0.000000 ip_gono_rec_c_Y
5 0.000000 cig_3_rec_n
13 0.000000 dplural_c_4
12 0.000000 dplural_c_3
17 0.000000 ip_hepatb_rec_c_Y
9 0.015237 rf_ppoutc_rec_c_Y
2 0.018724 cig_0_rec_n
3 0.026343 cig_1_rec_n
0 0.040056 sexdis_rec_f
4 0.052022 cig_2_rec_n
7 0.251015 rf_diab_rec_c_Y
8 0.401064 rf_phyp_rec_c_Y
10 0.782075 rf_ppterm_rec_c_Y
11 2.373237 dplural_c_2