In [2]:
import sys
sys.path.append('/home/will/PatientPicker/')
import pandas as pd

In [29]:
import LoadingTools

store = pd.HDFStore('/home/will/HIVReportGen/Data/SplitRedcap/2013-01-16/EntireCohort.hdf')
pat_data = store['redcap']

In [53]:
drug_cols = ['Current Tobacco use',
             'Current Alcohol use',
             'Cocaine + metabolite',
             'Amphetamines',
             'Barbiturates',
             'Benzodiazepines',
             'Cannabinoid',
             'Opiates',
             'Phencyclidine',
             "Drugs used (choice='Ritalin')"]
gender_cols = ['Gender']
age_cols = ['Age', 'Calc-Years-Seropositive']
haart_cols = ['Current ART status']
race_cols = [col for col in pat_data.columns if col.startswith('Race')]
eth_cols = ['Ethnicity']

wanted_cols = drug_cols+gender_cols+age_cols+haart_cols+race_cols+eth_cols+['Patient ID', 'Patient visit number']
spat_data = pat_data[wanted_cols].sort(['Patient ID', 'Patient visit number'])

In [54]:
intake_mask = spat_data['Patient visit number'] == 'R00'
intake_data = spat_data[intake_mask].set_index('Patient ID').drop(['Patient visit number'], axis = 1)

pc_pats = ['A0022','A0025','A0029','A0039',
            'A0040','A0047','A0056','A0058',
            'A0068','A0083','A0091','A0106',
            'A0124','A0136','A0142','A0151',
            'A0175','A0181','A0191','A0208',
            'A0209','A0262','A0284','A0313',
            'A0379','A0388','A0427']
pn_pats = ['A0010','A0017','A0032','A0078',
           'A0100','A0159','A0195','A0206',
           'A0217','A0220','A0223','A0238',
           'A0239','A0240','A0242','A0255',
           'A0258','A0280','A0294','A0321',
           'A0339','A0356','A0363','A0376',
           'A0380','A0393','A0397','A0405',
           'A0406','A0415','A0440','A0447',
           'A0456']

pc_data = intake_data.ix[pc_pats]
pn_data = intake_data.ix[pn_pats]

In [62]:
from itertools import product
from copy import deepcopy

def calc_gender(indata):
    tdict = {
             'Male':(indata['Gender']=='Male').sum(),
             'Female':(indata['Gender']=='Female').sum(),
             }
    return tdict

def calc_race(indata):
    wcol = "Race (choice='White')"
    bcol = "Race (choice='Black or African American')"
    ucol = "Race (choice='Unknown')"
    ocols = ["Race (choice='Asian')",
             "Race (choice='American Indian/Alaska Native')",
             "Race (choice='Native Hawaiian or other Pacific Islander')",
             "Race (choice='More than one race')"]
 
    tdict = {
             'White': indata[wcol].sum(),
             'Black/AA': indata[bcol].sum(),
             'Unknown': indata[ucol].sum(),
             'Other': indata[ocols].any(axis=1).sum()
             }
    return tdict

def calc_drug_use(indata):
    
    groups = [('Tobacco', indata['Current Tobacco use']=='Yes'),
              ('Alcohol', indata['Current Alcohol use']=='Yes'),
              ('Cocaine', indata['Cocaine + metabolite']),
              ('Cannabinoids', indata['Cannabinoid']),
              ('Methamphetamines', indata['Amphetamines']),
              ('Benzodiazepines', indata['Benzodiazepines']),
              ('Narcotics', indata['Opiates']),
              ('Ritalin', indata["Drugs used (choice='Ritalin')"])]
    tdict = {}
    for key, col in groups:
        tdict[key] = col.sum()
    return tdict

def calc_eth(indata):
    
    cols = ['Not Hispanic or Latino', 'Hispanic or Latino']
    tdict = {}
    for col in cols:
        tdict[col] = (indata['Ethnicity']==col).sum()
    tdict['Unknown'] = indata['Ethnicity'].isnull().sum()
    return tdict


def calc_haart(indata):
    
    col = 'Current ART status'
    tdict = {
             'cH': (indata[col] == 'on').sum(),
             'dH': ((indata[col] == 'off') | (indata[col] == 'non-adherent')).sum(),
             'nH': (indata[col] == 'naive').sum(),
             }
    return tdict

def calc_age(indata):
    
    return {'':indata['Age'].mean()}

def calc_sero(indata):
    
    return {'':indata['Calc-Years-Seropositive'].mean()}

anal_list = [('Gender', calc_gender),
             ('Race', calc_race),
             ('Ethnicity', calc_eth),
             ('Drug Use', calc_drug_use),
             ('HAART', calc_haart),
             ('Age', calc_age),
             ('Years Seropsitive', calc_sero)]

groups = [('All', intake_data), 
          ('PN', pn_data), 
          ('PC', pc_data)]
out_groups = []
for (gname, group), (varname, func) in product(groups, anal_list):
    
    odict = func(group)
    for key, val in odict.items():
        out_groups.append({
                           'group':gname,
                           'varname':varname,
                           'itemname':key,
                           'itemval':val
                           })

In [63]:
ndf = pd.DataFrame(out_groups)
table_data = pd.pivot_table(ndf, 
                            rows = ['varname', 'itemname'], 
                            cols = 'group',
                            values = 'itemval')
print table_data
table_data.to_excel('/home/will/quickBrain/pat_groups.xlsx')


group                                            All         PC         PN
varname           itemname                                                
Age                                        45.079681  46.814815  43.363636
Drug Use          Alcohol                 157.000000  11.000000   6.000000
                  Benzodiazepines          17.000000   0.000000   0.000000
                  Cannabinoids             70.000000   0.000000   0.000000
                  Cocaine                  96.000000  19.000000   0.000000
                  Methamphetamines          2.000000   0.000000   0.000000
                  Narcotics                 7.000000   0.000000   0.000000
                  Ritalin                  20.000000   0.000000   0.000000
                  Tobacco                 367.000000  23.000000   7.000000
Ethnicity         Hispanic or Latino       33.000000   0.000000   3.000000
                  Not Hispanic or Latino  410.000000  22.000000  26.000000
                  Unknown                  59.000000   5.000000   4.000000
Gender            Female                  167.000000  10.000000  15.000000
                  Male                    329.000000  17.000000  17.000000
HAART             cH                      389.000000  16.000000  29.000000
                  dH                       51.000000   5.000000   2.000000
                  nH                       58.000000   6.000000   2.000000
Race              Black/AA                417.000000  27.000000  27.000000
                  Other                    16.000000   1.000000   2.000000
                  Unknown                   3.000000   0.000000   0.000000
                  White                    56.000000   0.000000   4.000000
Years Seropsitive                          11.920160  10.518519   9.757576

In [56]:
drug_cols = ['Current Tobacco use',
             'Current Alcohol use',
             'Cocaine + metabolite',
             'Amphetamines',
             'Barbiturates',
             'Benzodiazepines',
             'Cannabinoid',
             'Opiates',
             'Phencyclidine']
intake_data['Ethnicity'].unique()


Out[56]:
array(['Not Hispanic or Latino', nan, 'Hispanic or Latino'], dtype=object)

In [28]:
pat_data['Patient ID'].unique()


Out[28]:
array(['R00 - First visit', 'R01 - Second visit', 'R02 - Third visit',
       'R03 - Fourth visit', 'R04 - Fifth visit', 'R05 - Sixth visit',
       'R06 - Seventh visit', 'R07 - Eighth visit', 'R08 - Ninth visit',
       'R09 - Tenth visit', 'R10 - Eleventh visit'], dtype=object)

In [ ]: