In [1]:
import pandas as pd
import sys
import os, os.path

sys.path.append('/home/will/PatientPicker/')
import LoadingTools

In [2]:
# Load the REDCap export and key every row by (patient, visit) so later
# steps can group and align on the 'Patient ID' index level.
redcap = (
    LoadingTools.load_redcap_data()
    .set_index(['Patient ID', 'VisitNum'])
)

In [3]:
# Column groups taken from the REDCap frame by name prefix: urine-test
# result columns ('Test-*') and self-reported use columns ('Admit-*',
# skipping any whose name contains 'None').
test_cols = [name for name in redcap.columns if name.startswith('Test-')]
admit_cols = [name for name in redcap.columns
              if name.startswith('Admit-') and 'None' not in name]

# Per-visit boolean indicators derived from the questionnaire answers.
# Each entry is (output column label, boolean Series).
current_use = redcap['Current Drug use']
admit_no_drugs = [
    ('Current-Drug-Use-NO', current_use == 'No'),
    ('Current-Drug-Use-NEVER', current_use == 'Never'),
    ('Date-Stopped-Drug-Use',
     redcap['Date Stopped Drug Use'] < redcap['Date Of Visit']),
    # NOTE(review): the label says BEFORE but the answer matched is
    # 'Used after HIV+' -- confirm which one is intended.
    ('Drug-Use-And-HIV-Status-BEFORE',
     redcap['Drug Use And HIV Status'] == 'Used after HIV+'),
]

# One row per patient: True if that test column was ever True at any visit.
ever_test = redcap[test_cols].groupby(level='Patient ID').agg('any')

# Visit-level frame holding the 'Admit-*' columns plus the derived flags.
no_drug_flags = pd.DataFrame(dict(admit_no_drugs))
admit_no_drug_df = pd.concat([redcap[admit_cols], no_drug_flags], axis=1)

In [4]:
# Align the visit-level self-report frame with the patient-level test frame
# on the 'Patient ID' index level, then place the two aligned frames side by
# side. `axis` is passed by keyword: positional use is rejected by recent
# pandas releases, and the keyword form matches the concat in the cell above.
tmp = pd.concat(admit_no_drug_df.align(ever_test, axis=0, level='Patient ID'), axis=1)
# Show a small transposed slice as a sanity check.
tmp.iloc[100:105].T


Out[4]:
Patient ID A0021 A0022 A0023
VisitNum R00 R01 R00 R01 R00
Admit-Cannabinoid False False False False False
Admit-Cocaine True False False True False
Admit-Heroin True True False False False
Admit-Amphetamines False False False False False
Admit-Benzodiazapine False False False False False
Admit-Narcotics False False False False False
Admit-Ecstasy False False False False False
Admit-PCP False False False False False
Admit-Ritalin False False False False False
Admit-Other False False False False False
Current-Drug-Use-NEVER False False False False False
Current-Drug-Use-NO False True True True True
Date-Stopped-Drug-Use False True False True False
Drug-Use-And-HIV-Status-BEFORE False False False False False
Test-Amphetamines False False False False False
Test-Barbiturates False False False False False
Test-Benzodiazepine False False False False False
Test-Cannabinoid True True False False True
Test-Cocaine True True True True False
Test-Opiates False False False False False
Test-Phencyclidine False False False False False

In [5]:
def fix_num(num):
    """Fold a ratio onto a signed scale that is symmetric around 1.

    Ratios of 1 or more are returned unchanged. Ratios below 1 are returned
    as their negative reciprocal (0.5 -> -2.0, 0.25 -> -4.0), so the
    magnitude reads as a fold-change and the sign gives its direction.

    A ratio of exactly 0 maps to ``-inf``. That is the value the division
    yields for NumPy floats; returning it explicitly avoids a
    ZeroDivisionError for plain Python numbers. NaN is passed through
    unchanged because ``nan < 1`` is False.
    """
    if num == 0:
        return float('-inf')
    if num < 1:
        # -1.0 forces true division even for Python 2 integers.
        return -1.0 / num
    return num

# Add a catch-all outcome: True when any individual test column is True.
tmp['Test-Anything'] = tmp[test_cols].any(axis=1)

# Split the columns of `tmp` into test outcomes and everything else
# (the self-report columns).
ntest_cols = [name for name in tmp.columns if name.startswith('Test-')]
nadmit_cols = [name for name in tmp.columns if not name.startswith('Test-')]

# For each self-report column, compare the mean of every test column among
# rows where the self-report is True against rows where it is False.
# Note this is a ratio of two fractions, not an odds ratio in the strict sense.
test_results = tmp[ntest_cols]
res = []
for col in nadmit_cols:
    answered_yes = tmp[col]
    ratio = test_results[answered_yes].mean() / test_results[~answered_yes].mean()

    row = ratio.to_dict()
    row['Col'] = col
    res.append(row)

# One row per self-report column, one column per test; ratios below 1 are
# folded to negative reciprocals by fix_num.
nres = pd.DataFrame(res).set_index('Col').applymap(fix_num)

In [6]:
# Function-call form prints the same text on Python 2 (single argument)
# and is also valid syntax on Python 3.
print(nres)


                                Test-Amphetamines  Test-Anything  \
Col                                                                
Admit-Cannabinoid                             inf       1.477619   
Admit-Cocaine                           -3.936862       1.078426   
Admit-Heroin                             4.789644      -1.161042   
Admit-Amphetamines                       2.525773       1.150213   
Admit-Benzodiazapine                     2.735912       1.012386   
Admit-Narcotics                          3.409396      -1.048573   
Admit-Ecstasy                           28.812500      -1.156492   
Admit-PCP                                    -inf       1.027540   
Admit-Ritalin                            6.225225      -1.037446   
Admit-Other                              3.910920       1.161659   
Current-Drug-Use-NEVER                       -inf      -3.229504   
Current-Drug-Use-NO                     -1.462578      -1.357272   
Date-Stopped-Drug-Use                   -1.126984      -1.350138   
Drug-Use-And-HIV-Status-BEFORE               -inf      -1.179185   

                                Test-Barbiturates  Test-Benzodiazepine  \
Col                                                                      
Admit-Cannabinoid                       -1.325093            -1.429997   
Admit-Cocaine                            3.725472            -1.230269   
Admit-Heroin                             1.479155             1.332603   
Admit-Amphetamines                       2.104811             1.353093   
Admit-Benzodiazapine                    -1.023425             1.348689   
Admit-Narcotics                         -2.698425            -1.129232   
Admit-Ecstasy                                -inf            -1.080260   
Admit-PCP                                    -inf            -1.785285   
Admit-Ritalin                                -inf             1.844511   
Admit-Other                              2.133229             1.466595   
Current-Drug-Use-NEVER                       -inf             1.417486   
Current-Drug-Use-NO                      2.563966            -1.365073   
Date-Stopped-Drug-Use                    2.528873            -1.273982   
Drug-Use-And-HIV-Status-BEFORE               -inf            -1.261803   

                                Test-Cannabinoid  Test-Cocaine  Test-Opiates  \
Col                                                                            
Admit-Cannabinoid                       2.202386      1.268904     -1.511588   
Admit-Cocaine                          -1.202065      1.765574      1.505241   
Admit-Heroin                           -1.200766      1.168104      2.694175   
Admit-Amphetamines                      1.104505      1.331478      2.525773   
Admit-Benzodiazapine                    1.014935     -1.032757      1.538950   
Admit-Narcotics                        -1.274908     -1.052818      2.185510   
Admit-Ecstasy                          -1.827332     -1.207158          -inf   
Admit-PCP                              -1.041935      1.035066      2.987384   
Admit-Ritalin                           1.529003      1.012232     -1.285094   
Admit-Other                             1.176737     -1.111053      2.085824   
Current-Drug-Use-NEVER                 -2.950165     -8.818427     -4.515035   
Current-Drug-Use-NO                    -1.723108     -1.379128      1.281983   
Date-Stopped-Drug-Use                  -1.739426     -1.316675      1.146127   
Drug-Use-And-HIV-Status-BEFORE         -2.658798      1.252688          -inf   

                                Test-Phencyclidine  
Col                                                 
Admit-Cannabinoid                              NaN  
Admit-Cocaine                                  NaN  
Admit-Heroin                                   NaN  
Admit-Amphetamines                             NaN  
Admit-Benzodiazapine                           NaN  
Admit-Narcotics                                NaN  
Admit-Ecstasy                                  NaN  
Admit-PCP                                      NaN  
Admit-Ritalin                                  NaN  
Admit-Other                                    NaN  
Current-Drug-Use-NEVER                         NaN  
Current-Drug-Use-NO                            NaN  
Date-Stopped-Drug-Use                          NaN  
Drug-Use-And-HIV-Status-BEFORE                 NaN  

In [7]:
#nres.to_excel('/home/will/DrugStuff/admit_explanations.xlsx')

In [20]:
from sklearn.cross_validation import cross_val_score, StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


# Features are the self-report columns; each test column is a separate
# binary target.
X = tmp[nadmit_cols]
y_mat = tmp[ntest_cols]

classifiers = [('Naive-Bayes', GaussianNB()),
               ('Logistic-Regression', LogisticRegression()),
               ('Decision-Tree', DecisionTreeClassifier()),
               ('Adaboost', AdaBoostClassifier()),
               ]
tres = []
for col in ntest_cols:
    # A target with fewer than two distinct non-null values cannot be
    # stratified or scored, so skip it.
    if len(y_mat[col].dropna().unique()) < 2:
        continue
    for name, cl in classifiers:
        # NOTE(review): no random_state is set on the splitter, so the
        # scores will vary from run to run.
        vals = cross_val_score(cl, X.values, y=y_mat[col].values,
                               cv=StratifiedShuffleSplit(y_mat[col].values),
                               scoring='roc_auc')
        tres.append({'Col': col,
                     'Predictor': name,
                     # The key is named 'Accuracy' but the value is the mean
                     # ROC AUC. The array's own .mean() is used because
                     # numpy is never imported as `np` in this notebook.
                     'Accuracy': vals.mean()})
# Rows are test columns, columns are classifiers, cells are mean ROC AUC.
# NOTE(review): `rows=`/`cols=` are the legacy pivot_table keyword names;
# newer pandas spells them `index=`/`columns=`.
tdf = pd.pivot_table(pd.DataFrame(tres), rows='Col', cols='Predictor', values='Accuracy')

In [21]:
tdf


Out[21]:
Predictor Adaboost Decision-Tree Logistic-Regression Naive-Bayes
Col
Test-Amphetamines 0.862411 0.818085 0.882624 0.788298
Test-Anything 0.648538 0.640345 0.661579 0.641016
Test-Barbiturates 0.667857 0.594821 0.551071 0.599286
Test-Benzodiazepine 0.515330 0.511905 0.487886 0.454971
Test-Cannabinoid 0.682888 0.684395 0.709146 0.655516
Test-Cocaine 0.603492 0.662399 0.650468 0.582098
Test-Opiates 0.716131 0.495182 0.673212 0.657664

In [ ]: