Cannabinoid Effect on HIVD score

Here I am looking to find the effect of Cannabinoid Use on HIVD score. HIVD score is a value that ranges from 0-12 with anything below 10 marking 'impaired' and >=10 implying not impaired. This test is done at every patient visit along with blood-tests confirming drug use.


In [1]:
from __future__ import division
import os, os.path
from pandas import *
import numpy as np
import csv
from itertools import groupby
from collections import defaultdict
from copy import deepcopy
from types import TupleType
import matplotlib.pyplot as plt
from itertools import product, imap, islice
from patsy import dmatrices
from patsy.contrasts import Treatment
import statsmodels.api as sm
import shutil
import glob
import shlex
from subprocess import check_call, CalledProcessError
from tempfile import mkdtemp
from concurrent.futures import ProcessPoolExecutor
from types import TupleType
from rpy2.robjects import Formula
import rpy2.robjects as robjects
import pandas.rpy.common as com
from rpy2.rinterface import RRuntimeError
import sys


sys.path.append('/home/will/PySeqUtils/')
os.chdir('/home/will/HIVSystemsBio/NewCytokineAnalysis/')

import Rtools
import PlottingTools

Data Description

I took data from the 1/16/2013 (most recent) cohort data dump. I extracted drug test resuts and HIVD scores along with Date of Visit and Visit Number.


In [2]:
store = HDFStore('/home/will/HIVReportGen/Data/SplitRedcap/2013-01-16/EntireCohort.hdf')
redcap_data = store['redcap']

t = redcap_data['Event Name'].dropna().apply(lambda x: x.split(' - ')[0])
redcap_data['Patient visit number'] = redcap_data['Patient visit number'].combine_first(t)
redcap_data["Drugs used (choice='Other')"] = redcap_data["Drugs used (choice='Other')"].dropna() == 'Checked'

drug_cols = ['Amphetamines',
             'Barbiturates',
            'Benzodiazepines',
            'Cannabinoid',
            'Cocaine + metabolite',
            'Opiates',
            'Phencyclidine'
            ]
wanted_drug_names = ['Benzodiazepines', 'Cannabinoid', 'Cocaine', 'Opiates']
all_drug_names = ['Benzodiazepines', 'Cannabinoid', 'Cocaine', 'Opiates','Amphetamines','Barbiturates','Phencyclidine']
race_cols = ["Race (choice='Asian')",
 "Race (choice='American Indian/Alaska Native')",
 "Race (choice='Black or African American')",
 "Race (choice='Native Hawaiian or other Pacific Islander')",
 "Race (choice='White')",
 "Race (choice='More than one race')",
 "Race (choice='Unknown')"]

admit_cols = ["Drugs used (choice='Marijuana')",
 "Drugs used (choice='Cocaine (crack, nasal, smoke, inject)')",
 "Drugs used (choice='Heroin (nasal, inject)')",
 "Drugs used (choice='Methamphetamine (smoke, nasal, inject)')",
 "Drugs used (choice='Benzodiazapine (i.e. valium, ativan, xanax, klonipin, etc)')",
 "Drugs used (choice='Narcotics')",
 "Drugs used (choice='Ecstasy')",
 "Drugs used (choice='PCP')",
 "Drugs used (choice='Ritalin')",
 "Drugs used (choice='Other')"]


wanted_cols = ['Patient ID', 'Patient visit number', 'Year of Birth','HIV seropositive date','Current ART status',
                'Date of visit', 'Total Modified Hopkins Dementia Score']+drug_cols+race_cols+admit_cols
wanted_redcap = redcap_data[wanted_cols].dropna(how = 'all')
#wanted_redcap["Drugs used (choice='Other')"] = wanted_redcap["Drugs used (choice='Other')"] == 'Checked'
data = wanted_redcap.rename(columns= {
                                'Patient visit number':'VisitNum',
                                'Date of visit':'Date',
                                'Total Modified Hopkins Dementia Score':'nHIVD',
                                'Cocaine + metabolite': 'Cocaine',
                                "Drug Use and HIV Status":'DrugStatus',
                                'Current ART status':'ART'
                            })
data.sort(['Patient ID', 'VisitNum'], inplace=True)
data = data.set_index(['Patient ID', 'VisitNum'])

get_years = lambda x: x/np.timedelta64(1,'D')/365
drug_names = ['Cocaine', 'Cannabinoid', 'Opiates', 'Benzodiazepines']
data['DrugFree'] = (data[drug_names].dropna() == 0).all(axis = 1)
data['bDate'] = data['Year of Birth'].dropna().map(lambda x: datetime(int(x), 1,1))
data['Age'] = data[['Date', 'bDate']].dropna().apply(lambda x: x['Date'] - x['bDate'], axis = 1).map(get_years)
data['TimeSeropositive'] = data[['Date', 'HIV seropositive date']].dropna().apply(lambda x: x['Date'] - x['HIV seropositive date'], axis = 1).map(get_years)

In [3]:
race_dict = {"Race (choice='Asian')":'Asian',
             "Race (choice='American Indian/Alaska Native')":'Indian',
             "Race (choice='Black or African American')":'Black',
             "Race (choice='Native Hawaiian or other Pacific Islander')":'Hawaiian',
             "Race (choice='White')":'White',
             "Race (choice='More than one race')":'Multi-Race',
             "Race (choice='Unknown')":'Unknown'}

def fix_race(indf):
    tmp = indf.mean().idxmax()
    return race_dict[tmp]

race_data = data[race_cols].groupby(level = 'Patient ID').apply(fix_race)
data['Race'] = [race_data[pat] for pat, _ in data.index]

In [4]:
def count_with_skips(inser, nskips):
    skips = 0
    for row in inser.values:
        if row:
            skips = 0
        else:
            skips += 1
        if skips > nskips:
            return False
    return True

name_mappings = {'Benzodiazepines':'SBe', 
                'Cannabinoid':'SCa', 
                'Cocaine':'PC',
                'Opiates':'PO',
                'Amphetamines':np.nan,
                'Barbiturates':np.nan,
                'Phencyclidine':np.nan
                }

def niz_groupings(indf):
    
    if len(indf.index) < 3:
        return np.nan
    indf = indf.dropna()
    ever_used = indf.any(axis = 0)
    if not ever_used.any():
        return 'PN'
    all_used = indf.all(axis = 0)
    if all_used.sum() == 1:
        return name_mappings[all_used.idxmax()]
    elif all_used.sum() > 1:
        return 'MDU'
    
    pure_cols = []
    non_pure_cols = []
    for col in indf.columns:
        if count_with_skips(indf[col], 1):
            pure_cols.append(col)
        else:
            non_pure_cols.append(col)
    if ever_used[non_pure_cols].any():
        return np.nan
            
    if len(pure_cols) == 1:
        return name_mappings[pure_cols[0]]
    else:
        return 'MDU'
    
admit_data = data[admit_cols].any(axis = 1).groupby(level = 'Patient ID').any()
drug_names = ['Benzodiazepines', 'Cannabinoid', 'Cocaine', 'Opiates']
niz_groupings = data[all_drug_names].groupby(level = 'Patient ID').apply(niz_groupings)
niz_groupings[(niz_groupings == 'PN') & (admit_data)] = np.nan
data['NizGroupings'] = [niz_groupings[pat] for pat, _ in data.index]

In [5]:
sample_counts = data['NizGroupings'].value_counts()
patient_counts = data.groupby(level = 'Patient ID')['NizGroupings'].first().value_counts()
print DataFrame({'SampleCounts':sample_counts, 'PatientCounts':patient_counts})


     PatientCounts  SampleCounts
PC              26           111
SCa             21            93
PN              15            67
MDU             13            55
SBe              5            21
PO               2            10

In [6]:
data = data.drop(['Year of Birth', 'bDate', 'HIV seropositive date']+race_cols+admit_cols, axis = 1)

print data


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1419 entries, (A0001, R00) to (A0514, R00)
Data columns (total 15 columns):
ART                 1383  non-null values
Date                1419  non-null values
nHIVD               1223  non-null values
Amphetamines        774  non-null values
Barbiturates        774  non-null values
Benzodiazepines     774  non-null values
Cannabinoid         774  non-null values
Cocaine             774  non-null values
Opiates             774  non-null values
Phencyclidine       774  non-null values
DrugFree            774  non-null values
Age                 1399  non-null values
TimeSeropositive    1410  non-null values
Race                1419  non-null values
NizGroupings        357  non-null values
dtypes: float64(3), object(12)

HIVD Scores


In [7]:
fig, axes = plt.subplots(2,2, sharey = True, sharex = True, figsize = (10,10))
plt.sca(axes.flatten()[0])
data.groupby(level = 'Patient ID')['nHIVD'].first().hist(bins = range(13))
plt.title('Entry HIVD scores');

plt.sca(axes.flatten()[1])
data.groupby(level = 'Patient ID')['nHIVD'].mean().hist(bins = range(13))
plt.title('Mean HIVD scores');

plt.sca(axes.flatten()[2])
data.groupby(level = 'Patient ID')['nHIVD'].max().hist(bins = range(13))
plt.title('Max HIVD scores');

plt.sca(axes.flatten()[3])
data.groupby(level = 'Patient ID')['nHIVD'].min().hist(bins = range(13))
plt.title('Min HIVD scores');


We can see from these that all of the different ways of summarizing HIVD scores gives drastically different answers. If we look at the Max score then almost everyone is 'Healthy'. Looking at the Min score we can see that about half of the patients were impaired at some point. I wonder what would happen if we 'smoothed' them.


In [8]:
def cum_dev(inser):
    #print inser
    #print np.sum(np.abs(np.diff(inser.dropna().values)))
    #raise KeyError
    return np.sum(np.abs(np.diff(inser.dropna().values)))
    
abs_dev = data.groupby(level = 'Patient ID').agg({'nHIVD':cum_dev})
abs_dev = abs_dev.sort('nHIVD')
high_dev_pats = abs_dev.tail(n=4).index

fig, axs = plt.subplots(4,1, sharey=True, figsize = (10,15))
for pat, ax in zip(high_dev_pats, axs.flatten()):
    
    vals = data.ix[pat][['Date', 'nHIVD']].set_index('Date')
    sHIVD = rolling_mean(vals['nHIVD'], 3, min_periods=1, freq = '12M', axis = 0)
    start_date = max(vals.index[0], sHIVD.index[0])
    end_date = min(vals.index[-1], sHIVD.index[-1])
    sHIVD = sHIVD.truncate(before=start_date, after=end_date)
    plt.sca(ax)
    plt.hold(True)
    plt.plot(vals.index, vals['nHIVD'].values, lw = 4, alpha = 0.6, marker = '*')

    plt.plot(sHIVD.index, sHIVD.values, lw = 3, color = 'r', marker = 'o')
    plt.hold(False)
    plt.ylabel('HIVD')
    plt.title(pat)
plt.ylim([0, 12])
plt.savefig('figures/NeuroSpecificFigures/smoothed_HIVD.png', dpi = 200)



In [9]:
def normalize_by_date(indf):
    tdf = indf.reset_index().set_index('Date').drop(['Patient ID', 'VisitNum'], axis = 1)
    tdf[drug_names + ['DrugFree']] = tdf[drug_names + ['DrugFree']].applymap(float)
    tdata = rolling_mean(tdf[['nHIVD','DrugFree']+drug_names], 3, min_periods=1, freq = '12M', axis = 0)#.reset_index()
    odf = tdf.resample('12M', how = 'last')
    odf[['DrugFree']+drug_names] = tdata[['DrugFree']+drug_names]
    odf['sHIVD'] = tdata['nHIVD']
    try:
        odf['iHIVD'] = tdf['nHIVD'].dropna().values[0]
    except IndexError:
        odf['iHIVD'] = np.nan
    odf['LsHIVD'] = odf['sHIVD'].shift(1)
    odf['dHIVD'] = rolling_apply(odf['sHIVD'], 2, np.diff)
    #odf['ART'] = tdf['ART'].resample('12M', how = 'last')
    odf = odf.reset_index()
    odf.index.name = 'Year'
    return odf

date_data = data.groupby(level = 'Patient ID').apply(normalize_by_date).reset_index()
print date_data


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1486 entries, 0 to 1485
Data columns (total 21 columns):
Patient ID          1486  non-null values
Year                1486  non-null values
Date                1486  non-null values
ART                 1189  non-null values
nHIVD               1035  non-null values
Amphetamines        714  non-null values
Barbiturates        714  non-null values
Benzodiazepines     1191  non-null values
Cannabinoid         1191  non-null values
Cocaine             1191  non-null values
Opiates             1191  non-null values
Phencyclidine       714  non-null values
DrugFree            1191  non-null values
Age                 1199  non-null values
TimeSeropositive    1208  non-null values
Race                1213  non-null values
NizGroupings        299  non-null values
sHIVD               1213  non-null values
iHIVD               1438  non-null values
LsHIVD              750  non-null values
dHIVD               735  non-null values
dtypes: datetime64[ns](1), float64(12), int64(1), object(7)

First, I'll look at the subset of the cohort that was in the previous cytokine analysis. For consistency I'll use the Drug-Status groupings from the previous analysis. We'll branch out into a more expansive dataset and a more


In [10]:
norm_cyto_data = read_csv('CytoRawDataNorm.csv', 
                            sep = '\t')
known_pat_data = read_csv('CytoPatData.csv', 
                            sep = '\t')
#print known_pat_data

wanted_pats = set(known_pat_data['Patient ID'])
groupings = dict(zip(known_pat_data['Patient ID'].values, known_pat_data['Grouping'].values))
cyto_dataset = date_data.ix[date_data['Patient ID'].map(lambda x: x in wanted_pats)]
cyto_dataset['DrugStatus'] = cyto_dataset['Patient ID'].map(lambda x: groupings[x])

print cyto_dataset


<class 'pandas.core.frame.DataFrame'>
Int64Index: 462 entries, 6 to 1409
Data columns (total 22 columns):
Patient ID          462  non-null values
Year                462  non-null values
Date                462  non-null values
ART                 378  non-null values
nHIVD               329  non-null values
Amphetamines        268  non-null values
Barbiturates        268  non-null values
Benzodiazepines     411  non-null values
Cannabinoid         411  non-null values
Cocaine             411  non-null values
Opiates             411  non-null values
Phencyclidine       268  non-null values
DrugFree            411  non-null values
Age                 383  non-null values
TimeSeropositive    386  non-null values
Race                387  non-null values
NizGroupings        195  non-null values
sHIVD               381  non-null values
iHIVD               460  non-null values
LsHIVD              272  non-null values
dHIVD               266  non-null values
DrugStatus          325  non-null values
dtypes: datetime64[ns](1), float64(12), int64(1), object(8)

In [11]:
cyto_dataset['DrugStatus'].value_counts()


Out[11]:
PC     108
PN      98
SCa     52
MDU     45
SBe     22
dtype: int64

In [12]:
drug_groups = ['PN', 'SCa', 'PC', 'SBe','MDU']
tdata = []
for group in drug_groups:
    mask = cyto_dataset['DrugStatus'] == group
    neuro_data = cyto_dataset['sHIVD'][mask]
    tdata.append(neuro_data.dropna().values)

plt.boxplot(tdata);
plt.xticks(range(1,len(drug_groups)+1), drug_groups)
plt.ylim([0,12])
plt.ylabel('sHIVD');
plt.savefig('figures/NeuroSpecificFigures/cyto-sHIVD.png')


Here we can see that there may be a difference between the different drug groupings. It would seem that Cannabinoids have less variation then Non-Users. However, this figure has all data points lumped into one boxplot. Would it look different if we grouped by year?


In [13]:
from itertools import product

fig, axs = plt.subplots(2,2, sharex=True, sharey=True, figsize=(10,10))

ax = axs.flatten()[0]
plt.sca(ax)
tdata = []
for group in drug_groups:
    mask = cyto_dataset['DrugStatus'] == group
    neuro_data = cyto_dataset['iHIVD'][mask]
    tdata.append(neuro_data.dropna().values)
plt.boxplot(tdata);
plt.xticks(range(1,len(drug_groups)+1), drug_groups)
plt.ylim([0,12])
plt.ylabel('sHIVD');
plt.title('Intake')


for year, ax in enumerate(axs.flatten()[1:], 1):
    plt.sca(ax)
    year_data = date_data['Year'] == year
    tdata = []
    for group in drug_groups:
        mask = cyto_dataset['DrugStatus'] == group
        neuro_data = cyto_dataset['sHIVD'][mask]
        tdata.append(neuro_data.dropna().values)

    plt.boxplot(tdata);
    plt.xticks(range(1,len(drug_groups)+1), drug_groups)
    plt.ylim([0,12])
    plt.ylabel('sHIVD');
    plt.title('HIVD - year %i' % (year))
plt.savefig('figures/NeuroSpecificFigures/cyto-YearHIVD.png')


Looking at this small dataset I cannot see a difference between Cannabinoid and Non-Users. If anything it looks like Benzos have a protective effect.

This is another way of looking at the same data. Here I've plotted the HIVD score afer X-years vs the Intake Score. You can see that there is a strong linear relationship between the intake HIVD score and the Year-1 and Year-2 score, by year 4 there just isn't enough data. It does look like there is a difference in slopes by Year 2 with the Pure-Cocaine (red) and Pure-Cannabinoid (green) having a smaller slope then the Non-users (grey) and MDU (blue). It seems so odd that Non-Users are on-par with MDU when you would expect the opposite! I wonder if I group people by intake scores and then by drug-use if this effect will go away.


In [104]:
vals = []
for grouping in cyto_dataset['DrugStatus'].unique():
    good_mask = cyto_dataset['DrugStatus'] == grouping
    tdata = cyto_dataset[good_mask]
    for pat, rows in tdata.groupby('Patient ID'):
        cum_diff = rolling_apply(rows['sHIVD'], 2, np.diff)
        tmp = DataFrame({'sHIVD': rows['sHIVD'], 'dHIVD':cum_diff})
        tmp['DrugStatus'] = grouping
        tmp['Age'] = rows['Age']
        tmp['TimeSeropositive'] = rows['TimeSeropositive']
        tmp['ART'] = rows['ART']
        tmp['iHIVD'] = rows['iHIVD']
        tmp['Patient ID'] = pat
        tmp['Year'] = rows['Year']
        tmp['Race'] = rows['Race']
        vals.append(tmp.reset_index())
deltaData = concat(vals, axis = 0, ignore_index=True)
print deltaData.dropna().head(n=10).to_string()


    index     dHIVD      sHIVD DrugStatus        Age  TimeSeropositive    ART  iHIVD Patient ID  Year   Race
4      42 -0.250000   5.250000         PN  56.128767         16.967123     on      4      A0010     4  Black
7      73  0.000000  10.000000         PN  32.654795          8.504110     on     10      A0017     2  Black
8      74 -0.333333   9.666667         PN  33.290411          8.556164     on     10      A0017     3  Black
10     76 -1.250000   8.250000         PN  34.786301         10.052055     on     10      A0017     5  Black
11     77  0.125000   8.375000         PN  36.336986         12.186301     on     10      A0017     6  Black
12     78  0.541667   8.916667         PN  36.838356         12.687671     on     10      A0017     7  Black
25    333 -0.500000  11.500000         PN  63.901370         25.739726     on     12      A0078     4  Black
28    451  1.000000  11.000000         PN  34.769863          2.613699  naive     10      A0100     2  Black
29    452  0.333333  11.333333         PN  35.709589          3.553425  naive     10      A0100     3  Black
31    454 -0.500000  11.500000         PN  38.372603          6.216438  naive     10      A0100     5  Black

In [105]:
data['Race'].unique()


Out[105]:
array(['Black', 'White', 'Indian', 'Asian', 'Multi-Race', 'Unknown'], dtype=object)

In [107]:
import statsmodels.api as sm
from patsy import dmatrices, dmatrix
from copy import deepcopy
result_objs = []

group_data = deltaData.dropna()
y, X = dmatrices('sHIVD ~ Year+iHIVD+TimeSeropositive+Age+C(DrugStatus, Treatment("PN"))+C(ART, Treatment("naive"))+C(Race, Treatment("White"))', group_data, return_type = 'dataframe')
res = sm.GLM(y,X).fit()
print res.summary()
print res.pvalues
print 
print res.params[res.pvalues < 0.05]
group_data['cHIVD'] = res.fittedvalues
result_objs.append(('Cytokine', 'Grouping', deepcopy(res)))


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  sHIVD   No. Observations:                  136
Model:                            GLM   Df Residuals:                      122
Model Family:                Gaussian   Df Model:                           13
Link Function:               identity   Scale:                   1.43051380968
Method:                          IRLS   Log-Likelihood:                -209.93
Date:                Mon, 24 Jun 2013   Deviance:                       174.52
Time:                        15:18:52   Pearson chi2:                     175.
No. Iterations:                     3                                         
==============================================================================================================
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------------
Intercept                                      6.0147      1.050      5.728      0.000         3.957     8.073
C(DrugStatus, Treatment("PN"))[T.MDU]          1.3736      0.395      3.480      0.001         0.600     2.147
C(DrugStatus, Treatment("PN"))[T.PC]           1.3162      0.314      4.190      0.000         0.700     1.932
C(DrugStatus, Treatment("PN"))[T.SBe]          1.5486      0.440      3.520      0.001         0.686     2.411
C(DrugStatus, Treatment("PN"))[T.SCa]          1.5983      0.344      4.646      0.000         0.924     2.273
C(ART, Treatment("naive"))[T.non-adherent]    -2.5382      0.875     -2.901      0.004        -4.253    -0.823
C(ART, Treatment("naive"))[T.off]             -1.7129      0.640     -2.675      0.008        -2.968    -0.458
C(ART, Treatment("naive"))[T.on]              -0.8056      0.492     -1.638      0.104        -1.770     0.159
C(Race, Treatment("White"))[T.Black]          -0.0494      0.547     -0.090      0.928        -1.121     1.022
C(Race, Treatment("White"))[T.Indian]          0.6315      0.870      0.726      0.469        -1.074     2.337
Year                                           0.1050      0.069      1.517      0.132        -0.031     0.241
iHIVD                                          0.6244      0.045     13.859      0.000         0.536     0.713
TimeSeropositive                               0.0169      0.016      1.024      0.308        -0.015     0.049
Age                                           -0.0551      0.013     -4.232      0.000        -0.081    -0.030
==============================================================================================================
Intercept                                     7.471126e-08
C(DrugStatus, Treatment("PN"))[T.MDU]         6.958284e-04
C(DrugStatus, Treatment("PN"))[T.PC]          5.312220e-05
C(DrugStatus, Treatment("PN"))[T.SBe]         6.078943e-04
C(DrugStatus, Treatment("PN"))[T.SCa]         8.627578e-06
C(ART, Treatment("naive"))[T.non-adherent]    4.414224e-03
C(ART, Treatment("naive"))[T.off]             8.489453e-03
C(ART, Treatment("naive"))[T.on]              1.040769e-01
C(Race, Treatment("White"))[T.Black]          9.281428e-01
C(Race, Treatment("White"))[T.Indian]         4.692882e-01
Year                                          1.319681e-01
iHIVD                                         8.160215e-27
TimeSeropositive                              3.076513e-01
Age                                           4.511850e-05
dtype: float64

Intercept                                     6.014702
C(DrugStatus, Treatment("PN"))[T.MDU]         1.373591
C(DrugStatus, Treatment("PN"))[T.PC]          1.316199
C(DrugStatus, Treatment("PN"))[T.SBe]         1.548627
C(DrugStatus, Treatment("PN"))[T.SCa]         1.598255
C(ART, Treatment("naive"))[T.non-adherent]   -2.538213
C(ART, Treatment("naive"))[T.off]            -1.712855
iHIVD                                         0.624432
Age                                          -0.055136
dtype: float64

We can see that there is a HUGE correlation with the initial HIVD score and has a positive effect on the nHIVD.


In [108]:
fig, axs = plt.subplots(2,2, sharex=True, sharey=True, figsize = (10, 10))
for year, ax in enumerate(axs.flatten(), 1):
    if year == 4:
        ymask = True
        title = 'All Years'
    else:
        title = 'Score at year %i' % year
        ymask = group_data['Year'] == year
    tdata = []
    for group in drug_groups:
        gmask = group_data['DrugStatus'] == group
        tdata.append(group_data['cHIVD'][ymask & gmask].dropna().values)
    plt.sca(ax)
    plt.boxplot(tdata)
    plt.xticks(range(1,len(drug_groups)+1), drug_groups, rotation=90)
    plt.title(title)
    plt.ylabel('Adjusted HIVD')
plt.savefig('figures/NeuroSpecificFigures/ctyo-Adj-HIVD-Group.png', dpi = 200)


Well, it certainly looks like Can users have an increased HIVD score. I wonder if this holds true in the 'linear contribution' model.


In [109]:
def tmp_rolling(inser):
    return -rolling_apply(inser, 2, np.diff, min_periods=2)
    

#Since we don't want the Year-0 and Year > 5 patients skewing the data
linear_data = cyto_dataset[(cyto_dataset['Year']>0) & (cyto_dataset['Year']<5)].dropna()
print linear_data.drop(['Date', 'Opiates'], axis  =1).head().to_string()


    Patient ID  Year    ART  nHIVD Amphetamines Barbiturates  Benzodiazepines  Cannabinoid  Cocaine Phencyclidine  DrugFree        Age  TimeSeropositive   Race NizGroupings      sHIVD  iHIVD  LsHIVD     dHIVD DrugStatus
42       A0010     4     on    5.0        False        False                0            0        0         False         1  56.128767         16.967123  Black           PN   5.250000    4.0     5.5 -0.250000         PN
48       A0013     3     on    9.5        False        False                0            1        0         False         0  62.904110         10.734247  Black          SCa   9.750000   10.0    10.0 -0.250000        SCa
73       A0017     2     on   10.0        False        False                0            0        0         False         1  32.654795          8.504110  Black           PN  10.000000   10.0    10.0  0.000000         PN
74       A0017     3     on    9.0        False        False                0            0        0         False         1  33.290411          8.556164  Black           PN   9.666667   10.0    10.0 -0.333333         PN
129      A0029     3  naive   12.0        False        False                0            0        1         False         0  51.816438          3.649315  Black           PC  11.750000   11.5    11.5  0.250000         PC

In [110]:
y, X = dmatrices('sHIVD ~ iHIVD+Cannabinoid+Cocaine+Year+TimeSeropositive+Age+C(ART, Treatment("naive"))+C(Race, Treatment("White"))', 
                    linear_data, return_type = 'dataframe')

res = sm.GLM(y,X).fit()
print res.summary()
print res.pvalues, '\n'
print res.params[res.pvalues < 0.05]
linear_data['cHIVD'] = res.fittedvalues
result_objs.append(('Cytokine', 'LCM', deepcopy(res)))


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  sHIVD   No. Observations:                   50
Model:                            GLM   Df Residuals:                       39
Model Family:                Gaussian   Df Model:                           10
Link Function:               identity   Scale:                   1.33008442065
Method:                          IRLS   Log-Likelihood:                -71.866
Date:                Mon, 24 Jun 2013   Deviance:                       51.873
Time:                        15:19:45   Pearson chi2:                     51.9
No. Iterations:                     3                                         
==============================================================================================================
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------------
Intercept                                      7.7728      1.523      5.105      0.000         4.789    10.757
C(ART, Treatment("naive"))[T.non-adherent]    -4.2352      1.391     -3.044      0.004        -6.962    -1.509
C(ART, Treatment("naive"))[T.off]             -2.6503      1.049     -2.527      0.016        -4.706    -0.594
C(ART, Treatment("naive"))[T.on]              -0.2743      0.582     -0.471      0.640        -1.416     0.867
C(Race, Treatment("White"))[T.Black]          -1.6959      0.877     -1.933      0.060        -3.415     0.024
iHIVD                                          0.6775      0.062     10.938      0.000         0.556     0.799
Cannabinoid                                    0.8389      0.546      1.536      0.133        -0.231     1.909
Cocaine                                        1.0075      0.396      2.543      0.015         0.231     1.784
Year                                           0.1754      0.171      1.026      0.311        -0.160     0.510
TimeSeropositive                               0.0282      0.023      1.220      0.230        -0.017     0.074
Age                                           -0.0789      0.018     -4.405      0.000        -0.114    -0.044
==============================================================================================================
Intercept                                     8.978489e-06
C(ART, Treatment("naive"))[T.non-adherent]    4.162597e-03
C(ART, Treatment("naive"))[T.off]             1.568159e-02
C(ART, Treatment("naive"))[T.on]              6.402167e-01
C(Race, Treatment("White"))[T.Black]          6.049970e-02
iHIVD                                         1.900800e-13
Cannabinoid                                   1.325379e-01
Cocaine                                       1.505662e-02
Year                                          3.113679e-01
TimeSeropositive                              2.297606e-01
Age                                           8.005362e-05
dtype: float64 

Intercept                                     7.772849
C(ART, Treatment("naive"))[T.non-adherent]   -4.235244
C(ART, Treatment("naive"))[T.off]            -2.650296
iHIVD                                         0.677453
Cocaine                                       1.007512
Age                                          -0.078928
dtype: float64

So we see a huge effect of Cannabinoid use! Using Cannabinoids causes a 0.77 increase in HIVD score.


In [111]:
from scipy.stats import linregress
import re
from types import StringType
robj = re.compile('[\d.]+')
def get_mid(obj):
    nums = robj.findall(str(obj))
    if len(nums) != 2:
        return np.nan
    return float(nums[1])

fig, axs = plt.subplots(2,3, sharey=True, figsize = (15, 10))
groups = [('DrugFree', np.arange(0,1.1,0.1), 0.1),
            ('Cannabinoid', np.arange(0,1.1,0.1), 0.1), 
            ('Cocaine', np.arange(0,1.1,0.1), 0.1),
            ('TimeSeropositive', np.arange(0,30,5), 5),
            ('ART', ['on', 'off', 'naive', 'non-adherent'], None),
            ('Age', np.arange(20,70, 10), 10),]
for ax, (group, bins, width) in zip(axs.flatten(), groups):
    
    tdata = linear_data[[group, 'cHIVD']].dropna()
    
   
    if type(bins[0]) != StringType:
        pos = Series(map(get_mid, cut(tdata[group], bins)), 
                index = tdata.index)
        tdata['Pos'] = pos
        tdata = tdata.dropna().sort('Pos')
        m, b, r, p, _ = linregress(tdata[group].values, y = tdata['cHIVD'].values)
        x = np.linspace(bins[0]-width, bins[-1]+width)
        y = m*x + b
    else:
        tdata['Pos'] = tdata[group]
    #tdata.boxplot(column = 'cHIVD', by = 'Pos', ax = ax)
    tmp = []
    for b in bins:
        tmp.append(tdata['cHIVD'][tdata['Pos'] == b].values)
    plt.sca(ax)
    plt.hold(True)
    if type(bins[0]) != StringType:
        plt.plot(x, y, lw=10, alpha=0.2, color = 'r')
        plt.boxplot(tmp, positions = bins, widths=width*0.7)
        plt.xlim([bins[0]-width, bins[-1]+width])
    else:
        plt.boxplot(tmp)
        plt.xticks(range(1, len(bins)+1), bins)
    plt.title(group)
    plt.ylabel('Adj-HIVD')
    plt.hold(False)
       
plt.savefig('figures/NeuroSpecificFigures/cyto-Adj-HIVD-Trends.png', dpi = 200)



In [112]:
vals = []
for grouping in date_data['NizGroupings'].unique():
    good_mask = date_data['NizGroupings'] == grouping
    tdata = date_data[good_mask]
    for pat, rows in tdata.groupby('Patient ID'):
        cum_diff = rolling_apply(rows['sHIVD'], 2, np.diff)
        tmp = DataFrame({'sHIVD': rows['sHIVD'], 'dHIVD':cum_diff})
        tmp['DrugStatus'] = grouping
        tmp['Age'] = rows['Age']
        tmp['TimeSeropositive'] = rows['TimeSeropositive']
        tmp['ART'] = rows['ART']
        tmp['iHIVD'] = rows['iHIVD']
        tmp['Patient ID'] = pat
        tmp['Year'] = rows['Year']
        tmp['Race'] = rows['Race']
        vals.append(tmp.reset_index())
large_deltaData = concat(vals, axis = 0, ignore_index=True)
print large_deltaData.dropna().head(n=10).to_string()


    index     dHIVD      sHIVD DrugStatus        Age  TimeSeropositive    ART  iHIVD Patient ID  Year   Race
3      42 -0.250000   5.250000         PN  56.128767         16.967123     on    4.0      A0010     4  Black
6      73  0.000000  10.000000         PN  32.654795          8.504110     on   10.0      A0017     2  Black
7      74 -0.333333   9.666667         PN  33.290411          8.556164     on   10.0      A0017     3  Black
8      76 -1.416667   8.250000         PN  34.786301         10.052055     on   10.0      A0017     5  Black
9      77  0.125000   8.375000         PN  36.336986         12.186301     on   10.0      A0017     6  Black
10     78  0.541667   8.916667         PN  36.838356         12.687671     on   10.0      A0017     7  Black
13    148  1.500000  11.000000         PN  38.046575          7.890411     on    9.5      A0032     7  Black
16    333 -0.500000  11.500000         PN  63.901370         25.739726     on   12.0      A0078     4  Black
19    451  1.000000  11.000000         PN  34.769863          2.613699  naive   10.0      A0100     2  Black
20    452  0.333333  11.333333         PN  35.709589          3.553425  naive   10.0      A0100     3  Black

In [114]:
import statsmodels.api as sm
from patsy import dmatrices, dmatrix
group_data = large_deltaData.dropna()
y, X = dmatrices('sHIVD ~ Year+iHIVD+TimeSeropositive+Age+C(DrugStatus, Treatment("PN"))+C(ART, Treatment("naive"))+C(Race, Treatment("White"))', group_data, return_type = 'dataframe')
res = sm.GLM(y,X).fit()
print res.summary()
print res.pvalues
print 
print res.params[res.pvalues < 0.05]
group_data['cHIVD'] = res.fittedvalues
result_objs.append(('Entire', 'Grouping', deepcopy(res)))


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  sHIVD   No. Observations:                  177
Model:                            GLM   Df Residuals:                      162
Model Family:                Gaussian   Df Model:                           14
Link Function:               identity   Scale:                   1.35749950916
Method:                          IRLS   Log-Likelihood:                -270.36
Date:                Mon, 24 Jun 2013   Deviance:                       219.91
Time:                        15:20:37   Pearson chi2:                     220.
No. Iterations:                     3                                         
==============================================================================================================
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------------
Intercept                                      4.9721      0.845      5.884      0.000         3.316     6.628
C(DrugStatus, Treatment("PN"))[T.MDU]          1.0000      0.329      3.038      0.003         0.355     1.645
C(DrugStatus, Treatment("PN"))[T.PC]           1.0818      0.287      3.770      0.000         0.519     1.644
C(DrugStatus, Treatment("PN"))[T.PO]           1.2153      0.653      1.862      0.064        -0.064     2.494
C(DrugStatus, Treatment("PN"))[T.SBe]          1.2848      0.418      3.076      0.002         0.466     2.104
C(DrugStatus, Treatment("PN"))[T.SCa]          1.1937      0.288      4.150      0.000         0.630     1.757
C(ART, Treatment("naive"))[T.non-adherent]    -1.3115      0.677     -1.936      0.055        -2.639     0.016
C(ART, Treatment("naive"))[T.off]             -1.5682      0.571     -2.747      0.007        -2.687    -0.449
C(ART, Treatment("naive"))[T.on]              -0.6309      0.401     -1.574      0.117        -1.416     0.154
C(Race, Treatment("White"))[T.Black]           0.3830      0.436      0.877      0.382        -0.473     1.238
C(Race, Treatment("White"))[T.Indian]          0.4848      0.945      0.513      0.609        -1.367     2.337
Year                                           0.1167      0.060      1.949      0.053        -0.001     0.234
iHIVD                                          0.6154      0.038     16.323      0.000         0.541     0.689
TimeSeropositive                               0.0253      0.013      1.915      0.057        -0.001     0.051
Age                                           -0.0440      0.010     -4.219      0.000        -0.064    -0.024
==============================================================================================================
Intercept                                     2.236405e-08
C(DrugStatus, Treatment("PN"))[T.MDU]         2.779598e-03
C(DrugStatus, Treatment("PN"))[T.PC]          2.284452e-04
C(DrugStatus, Treatment("PN"))[T.PO]          6.438353e-02
C(DrugStatus, Treatment("PN"))[T.SBe]         2.466402e-03
C(DrugStatus, Treatment("PN"))[T.SCa]         5.359000e-05
C(ART, Treatment("naive"))[T.non-adherent]    5.456615e-02
C(ART, Treatment("naive"))[T.off]             6.689743e-03
C(ART, Treatment("naive"))[T.on]              1.173230e-01
C(Race, Treatment("White"))[T.Black]          3.815419e-01
C(Race, Treatment("White"))[T.Indian]         6.086501e-01
Year                                          5.302109e-02
iHIVD                                         4.841333e-36
TimeSeropositive                              5.730893e-02
Age                                           4.080997e-05
dtype: float64

Intercept                                4.972123
C(DrugStatus, Treatment("PN"))[T.MDU]    0.999951
C(DrugStatus, Treatment("PN"))[T.PC]     1.081803
C(DrugStatus, Treatment("PN"))[T.SBe]    1.284810
C(DrugStatus, Treatment("PN"))[T.SCa]    1.193719
C(ART, Treatment("naive"))[T.off]       -1.568190
iHIVD                                    0.615385
Age                                     -0.043999
dtype: float64

In [115]:
fig, axs = plt.subplots(2,2, sharex=True, sharey=True, figsize = (10, 10))
for year, ax in enumerate(axs.flatten(), 1):
    if year == 4:
        ymask = True
        title = 'All Years'
    else:
        title = 'Score at year %i' % year
        ymask = group_data['Year'] == year
    tdata = []
    for group in drug_groups:
        gmask = group_data['DrugStatus'] == group
        tdata.append(group_data['cHIVD'][ymask & gmask].dropna().values)
    plt.sca(ax)
    plt.boxplot(tdata)
    plt.xticks(range(1,len(drug_groups)+1), drug_groups, rotation=90)
    plt.title(title)
    plt.ylabel('Adjusted HIVD')
plt.savefig('figures/NeuroSpecificFigures/large-Adj-HIVD-Group.png', dpi = 200)



In [116]:
large_linear_data = date_data.dropna()
y, X = dmatrices('sHIVD ~ iHIVD+Cannabinoid+Cocaine+Year+TimeSeropositive+Age+C(ART, Treatment("naive"))+C(Race, Treatment("White"))', 
                    large_linear_data, return_type = 'dataframe')

res = sm.GLM(y,X).fit()
print res.summary()
print res.pvalues, '\n'
print res.params[res.pvalues < 0.05]
large_linear_data['cHIVD'] = res.fittedvalues
result_objs.append(('Entire', 'LCM', deepcopy(res)))


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  sHIVD   No. Observations:                   66
Model:                            GLM   Df Residuals:                       55
Model Family:                Gaussian   Df Model:                           10
Link Function:               identity   Scale:                    1.3268127052
Method:                          IRLS   Log-Likelihood:                -96.965
Date:                Mon, 24 Jun 2013   Deviance:                       72.975
Time:                        15:21:06   Pearson chi2:                     73.0
No. Iterations:                     3                                         
==============================================================================================================
                                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------------
Intercept                                      6.3817      1.362      4.684      0.000         3.712     9.052
C(ART, Treatment("naive"))[T.non-adherent]    -5.0612      1.314     -3.850      0.000        -7.638    -2.485
C(ART, Treatment("naive"))[T.off]             -2.4975      0.989     -2.526      0.014        -4.435    -0.560
C(ART, Treatment("naive"))[T.on]              -0.8622      0.513     -1.681      0.098        -1.867     0.143
C(Race, Treatment("White"))[T.Black]          -0.7427      0.728     -1.020      0.312        -2.169     0.684
iHIVD                                          0.6743      0.056     12.128      0.000         0.565     0.783
Cannabinoid                                    0.4836      0.467      1.036      0.305        -0.431     1.398
Cocaine                                        0.9377      0.348      2.698      0.009         0.256     1.619
Year                                           0.0500      0.116      0.433      0.667        -0.176     0.277
TimeSeropositive                               0.0296      0.020      1.506      0.138        -0.009     0.068
Age                                           -0.0526      0.015     -3.416      0.001        -0.083    -0.022
==============================================================================================================
Intercept                                     1.883441e-05
C(ART, Treatment("naive"))[T.non-adherent]    3.098770e-04
C(ART, Treatment("naive"))[T.off]             1.445288e-02
C(ART, Treatment("naive"))[T.on]              9.837268e-02
C(Race, Treatment("White"))[T.Black]          3.120551e-01
iHIVD                                         3.572171e-17
Cannabinoid                                   3.046856e-01
Cocaine                                       9.254518e-03
Year                                          6.667896e-01
TimeSeropositive                              1.378115e-01
Age                                           1.200806e-03
dtype: float64 

Intercept                                     6.381690
C(ART, Treatment("naive"))[T.non-adherent]   -5.061222
C(ART, Treatment("naive"))[T.off]            -2.497494
iHIVD                                         0.674338
Cocaine                                       0.937741
Age                                          -0.052593
dtype: float64

In [117]:
fig, axs = plt.subplots(2,3, sharey=True, figsize = (15, 10))
groups = [('DrugFree', np.arange(0,1.1,0.1), 0.1),
            ('Cannabinoid', np.arange(0,1.1,0.1), 0.1), 
            ('Cocaine', np.arange(0,1.1,0.1), 0.1),
            ('TimeSeropositive', np.arange(0,30,5), 5),
            ('ART', ['on', 'off', 'naive', 'non-adherent'], None),
            ('Age', np.arange(20,70, 10), 10),]
for ax, (group, bins, width) in zip(axs.flatten(), groups):
    
    tdata = large_linear_data[[group, 'cHIVD']].dropna()
    
   
    if type(bins[0]) != StringType:
        pos = Series(map(get_mid, cut(tdata[group], bins)), 
                index = tdata.index)
        tdata['Pos'] = pos
        tdata = tdata.dropna().sort('Pos')
        m, b, r, p, _ = linregress(tdata[group].values, y = tdata['cHIVD'].values)
        x = np.linspace(bins[0]-width, bins[-1]+width)
        y = m*x + b
    else:
        tdata['Pos'] = tdata[group]
    #tdata.boxplot(column = 'cHIVD', by = 'Pos', ax = ax)
    tmp = []
    for b in bins:
        tmp.append(tdata['cHIVD'][tdata['Pos'] == b].values)
    plt.sca(ax)
    plt.hold(True)
    if type(bins[0]) != StringType:
        plt.plot(x, y, lw=10, alpha=0.2, color = 'r')
        plt.boxplot(tmp, positions = bins, widths=width*0.7)
        plt.xlim([bins[0]-width, bins[-1]+width])
    else:
        plt.boxplot(tmp)
        plt.xticks(range(1, len(bins)+1), bins)
    plt.title(group)
    plt.ylabel('Adj-HIVD')
    plt.hold(False)
       
plt.savefig('figures/NeuroSpecificFigures/large-Adj-HIVD-Trends.png', dpi = 200)



In [118]:
nindex = []
ndata = []
for cohort, anal, res in result_objs:
    nindex.append((cohort, anal, 'pvalues'))
    nindex.append((cohort, anal, 'effects'))
    ndata.append(res.pvalues.copy())
    ndata.append(res.params.copy())
    
mi = MultiIndex.from_tuples(nindex, names = ['Cohort', 'Analysis', 'Type'])
df = DataFrame(ndata, index = mi).reset_index()

In [119]:
df.reset_index().T.to_excel('figures/NeuroSpecificFigures/regression_results.xlsx')

In [28]:
def normalize_by_date_nosample(indf):
    tdf = indf.reset_index().set_index('Date').drop(['Patient ID', 'VisitNum'], axis = 1)
    tdf[drug_names + ['DrugFree']] = tdf[drug_names + ['DrugFree']].applymap(float)
    tdata = rolling_mean(tdf[['nHIVD','DrugFree']+drug_names], 2, min_periods=1, freq = '6M', axis = 0)#.reset_index()
    odf = tdf.resample('6M', how = 'last')
    odf[['DrugFree']+drug_names] = tdata[['DrugFree']+drug_names]
    odf['sHIVD'] = tdata['nHIVD']
    try:
        odf['iHIVD'] = tdf['nHIVD'].dropna().values[0]
    except IndexError:
        odf['iHIVD'] = np.nan
    odf['LsHIVD'] = odf['sHIVD'].shift(1)
    odf['dsHIVD'] = rolling_apply(odf['sHIVD'], 2, np.diff)
    odf['LnHIVD'] = odf['nHIVD'].shift(1)
    #odf[]
    #odf['ART'] = tdf['ART'].resample('12M', how = 'last')
    odf = odf.reset_index()
    odf.index.name = 'Year'
    return odf

new_date_data = data.groupby(level = 'Patient ID').apply(normalize_by_date_nosample).reset_index()
print new_date_data


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2302 entries, 0 to 2301
Data columns (total 22 columns):
Patient ID          2302  non-null values
Year                2302  non-null values
Date                2302  non-null values
ART                 1360  non-null values
nHIVD               1203  non-null values
Amphetamines        760  non-null values
Barbiturates        760  non-null values
Benzodiazepines     1153  non-null values
Cannabinoid         1153  non-null values
Cocaine             1153  non-null values
Opiates             1153  non-null values
Phencyclidine       760  non-null values
DrugFree            1153  non-null values
Age                 1373  non-null values
TimeSeropositive    1380  non-null values
Race                1389  non-null values
NizGroupings        351  non-null values
sHIVD               1539  non-null values
iHIVD               2244  non-null values
LsHIVD              1083  non-null values
dsHIVD              941  non-null values
LnHIVD              759  non-null values
dtypes: datetime64[ns](1), float64(13), int64(1), object(7)

In [29]:
tmp_data = new_date_data.dropna(subset = ['nHIVD','LnHIVD','dsHIVD','sHIVD', 'Cannabinoid', 'Cocaine', 'TimeSeropositive', 'Age', 'ART'])
tmp_data = tmp_data[tmp_data['NizGroupings'] != 'PO']
tmp_data = tmp_data[tmp_data['Year']<6]

y, X = dmatrices('sHIVD ~ iHIVD+Year+Cocaine+Cannabinoid+TimeSeropositive+Age+ART', 
                    tmp_data, return_type = 'dataframe')
res = sm.GLM(y, X).fit()
#print tmp_data
print res.summary()


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  sHIVD   No. Observations:                  143
Model:                            GLM   Df Residuals:                      133
Model Family:                Gaussian   Df Model:                            9
Link Function:               identity   Scale:                   2.22860907688
Method:                          IRLS   Log-Likelihood:                -255.02
Date:                Mon, 24 Jun 2013   Deviance:                       296.41
Time:                        11:18:31   Pearson chi2:                     296.
No. Iterations:                     3                                         
=======================================================================================
                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------
Intercept               6.6026      1.145      5.768      0.000         4.359     8.846
ART[T.non-adherent]     3.4239      1.625      2.107      0.037         0.240     6.608
ART[T.off]              0.3315      0.733      0.452      0.652        -1.105     1.768
ART[T.on]               0.3670      0.595      0.617      0.538        -0.798     1.532
iHIVD                   0.5681      0.052     11.027      0.000         0.467     0.669
Year                   -0.0977      0.091     -1.070      0.286        -0.276     0.081
Cocaine                 0.0384      0.324      0.118      0.906        -0.596     0.673
Cannabinoid             0.2825      0.347      0.814      0.417        -0.398     0.963
TimeSeropositive       -0.0188      0.019     -0.994      0.322        -0.056     0.018
Age                    -0.0457      0.016     -2.771      0.006        -0.078    -0.013
=======================================================================================

In [30]:
def resolve_date_last_used(inval):
    
    if '/' in inval:
        parts = [x for x in inval.split('/')]
        if len(parts) == 2:
            month = int(parts[0])
            year = parts[1]
            day = None
        elif len(parts) == 3:
            day = int(parts[1])
            month = int(parts[0])
            year = parts[2]
        else:
            print parts, inval
            raise ValueError
    elif '-' in inval:
        parts = [x for x in inval.split('-')]
        if len(parts) == 2:
            month = int(parts[1])
            year = parts[0]
            day = None
        elif len(parts) == 3:
            day = int(parts[2])
            month = int(parts[1])
            year = parts[0]
        else:
            print parts, inval
            raise ValueError
    else:
        day = None
        month = None
        year = inval
        
    if len(year) == 2:
        if int(year) > 20:
            year = '19'+year
        else:
            year = '20'+year
    freq = 'D'
    if day is None:
        day = 1
        freq = 'M'
    if month is None:
        month = 1
        freq = 'Y'
    
    #try:
    #print year, month, day, freq
    guess_date = Period(year = int(year), month = month, day = day, freq = freq)
    #except ValueError:
    #    print inval
        #rise ValueError
    #    return np.nan
    return guess_date
get_days = lambda x: x/np.timedelta64(1,'D')
drug_started_data = read_csv('DrugStartedData.tsv', sep = '\t', parse_dates=['Date of visit', 'HIV seropositive date']).set_index(['Patient ID', 'VisitNum'])
drug_started_data['DateLastUsedDrugs'] = drug_started_data['Date last used drugs'].dropna().map(resolve_date_last_used)
drug_started_data['DateLastUsedDrugs'] = drug_started_data['DateLastUsedDrugs'].dropna().map(lambda x: x.to_timestamp('D', how='s'))
drug_started_data['DaysSinceLastDrugUse'] = (drug_started_data['Date of visit'] - drug_started_data['DateLastUsedDrugs']).dropna().map(get_days)

In [31]:
tested_pos = data[all_drug_names].dropna().any(axis = 1)
pos_tests, last_drug = tested_pos.align(drug_started_data['DaysSinceLastDrugUse'].dropna(), join = 'inner')
last_drug_year = last_drug/365
fig, axs = plt.subplots(2,1, figsize=(10,10))

plt.sca(axs.flatten()[0])
plt.hold(True)
last_drug.hist(bins = np.linspace(0, 30, 30), ax = plt.gca())
last_drug[pos_tests].hist(bins = np.linspace(0, 30, 30), ax = plt.gca())
plt.legend(['Neg', 'Pos'], 'upper right')
plt.xlabel('Days since last drug use - Admit')
plt.ylabel('#Visits')
plt.hold(False)

plt.sca(axs.flatten()[1])
plt.hold(True)
last_drug_year.hist(bins = np.linspace(0,10, 20), ax = plt.gca())
last_drug_year[pos_tests].hist(bins = np.linspace(0,10, 20), ax = plt.gca())
plt.legend(['Neg', 'Pos'], 'upper right')
plt.xlabel('Years since last drug use - Admit')
plt.ylabel('#Visits')
plt.hold(False)

plt.savefig('figures/NeuroSpecificFigures/lying_admitters.png')



In [121]:
cols = [('Initial CD4 count (cells/uL)','CD4','Date of initial CD4 count'),
        ('Nadir CD4 count (cells/uL)', 'CD4', 'Date of nadir CD4 count'),
        ('Latest CD4 count (cells/uL)', 'CD4', 'Date of latest CD4 count'),
        ('Initial CD8 count (cells/uL)','CD8', 'Date of initial CD8 count'),
        ('Nadir CD8 count (cells/uL)','CD8', 'Date of nadir CD8 count'),
        ('Latest CD8 count (cells/uL)','CD8', 'Date of latest CD8 count'),
        ('Initial viral load (copies/mL)','VL', 'Date of initial viral load'),
        ('Peak viral load (copies/mL)','VL', 'Date of peak viral load'),
        ('Latest viral load','VL', 'Date of latest viral load'),
        ('Amphetamines', 'Test-Amphetamines'),
        ('Barbiturates', 'Test-Barbiturates'),
        ('Benzodiazepines', 'Test-Benzodiazepines'),
        ('Cannabinoid', 'Test-Cannabinoid'),
        ('Cocaine + metabolite', 'Test-Cocaine'),
        ('Opiates','Test-Opiates'),
        ('Phencyclidine', 'Test-Phencyclidine'),
        ("Race (choice='Asian')",'Asian'),
        ("Race (choice='American Indian/Alaska Native')",'Indian'),
        ("Race (choice='Black or African American')", 'Black'),
        ("Race (choice='Native Hawaiian or other Pacific Islander')", 'Hawaiian'),
        ("Race (choice='White')", 'White'),
        ("Race (choice='More than one race')",'Multi-Race'),
        ("Race (choice='Unknown')", 'Unknown'),
        ("Drugs used (choice='Marijuana')",'Admit-Cannabinoid'),
        ("Drugs used (choice='Cocaine (crack, nasal, smoke, inject)')", 'Admit-Cocaine'),
        ("Drugs used (choice='Heroin (nasal, inject)')",'Admit-Opiates'),
        ("Drugs used (choice='Methamphetamine (smoke, nasal, inject)')",'Amphetamines'),
        ("Drugs used (choice='Benzodiazapine (i.e. valium, ativan, xanax, klonipin, etc)')",'Admit-Benzodiazepines'),
        ("Drugs used (choice='Narcotics')",'Admit-Opiates'),
        ("Drugs used (choice='Ecstasy')",'Admit-Phencyclidine'),
        ("Drugs used (choice='PCP')",'Admit-Phencyclidine'),
        ("Drugs used (choice='Ritalin')",'Admit-Ritalin'),
        ("Drugs used (choice='Other')", 'Admit-Other'),
        ('Year of Birth', 'Year of Birth'),
        ('HIV seropositive date', 'HIV seropositive date'),
        ('Current ART status', 'ART'),
        ('Total Modified Hopkins Dementia Score', 'HIVD'),
        ('Drug Use and HIV Status', 'Drug Use and HIV Status'),
        ('Age first used drug', 'Age first used drug'),
        ('VisitNum', 'VisitNum')]
tmp_list = []
for tup in cols:
    if len(tup) == 2:
        col, typ = tup
        date_col = 'Date of visit'
    else:
        col, typ, date_col = tup
    tmp = redcap_data[['Patient ID', col, date_col]].dropna()
    tmp['Type'] = typ
    tmp = tmp.rename(columns={col:'Measure', date_col:'Date'})
    tmp_list.append(tmp)

drug_started_data = read_csv('DrugStartedData.tsv', sep = '\t', parse_dates=['Date of visit', 'HIV seropositive date'])
drug_started_data['DateLastUsedDrugs'] = drug_started_data['Date last used drugs'].dropna().map(resolve_date_last_used)
drug_started_data['DateLastUsedDrugs'] = drug_started_data['DateLastUsedDrugs'].dropna().map(lambda x: x.to_timestamp('D', how='s'))
drug_started_data['DaysSinceLastDrugUse'] = (drug_started_data['Date of visit'] - drug_started_data['DateLastUsedDrugs']).dropna().map(get_days)

tmp = drug_started_data[['Patient ID', 'Date of visit', 'DaysSinceLastDrugUse']]
tmp['Type'] = 'DaysSinceLastDrugUse'
tmp_list.append(tmp.rename(columns = {'Date of visit':'Date', 'DaysSinceLastDrugUse':'Measure'}))
    
clinical_params = concat(tmp_list, axis = 0, ignore_index=True)
clinical_params = pivot_table(clinical_params, rows = ['Patient ID', 'Date'], cols = 'Type', values='Measure', aggfunc='first')
clinical_params.head(n=10)


Out[121]:
&ltclass 'pandas.core.frame.DataFrame'>
MultiIndex: 10 entries, (A0001, 1996-12-16 00:00:00) to (A0001, 2008-02-27 00:00:00)
Data columns (total 33 columns):
ART                        2  non-null values
Admit-Benzodiazepines      2  non-null values
Admit-Cannabinoid          2  non-null values
Admit-Cocaine              2  non-null values
Admit-Opiates              2  non-null values
Admit-Other                2  non-null values
Admit-Phencyclidine        2  non-null values
Admit-Ritalin              2  non-null values
Age first used drug        2  non-null values
Amphetamines               2  non-null values
Asian                      2  non-null values
Black                      2  non-null values
CD4                        5  non-null values
CD8                        5  non-null values
DaysSinceLastDrugUse       2  non-null values
Drug Use and HIV Status    0  non-null values
HIV seropositive date      2  non-null values
HIVD                       0  non-null values
Hawaiian                   2  non-null values
Indian                     2  non-null values
Multi-Race                 2  non-null values
Test-Amphetamines          2  non-null values
Test-Barbiturates          2  non-null values
Test-Benzodiazepines       2  non-null values
Test-Cannabinoid           2  non-null values
Test-Cocaine               2  non-null values
Test-Opiates               2  non-null values
Test-Phencyclidine         2  non-null values
Unknown                    2  non-null values
VL                         5  non-null values
VisitNum                   2  non-null values
White                      2  non-null values
Year of Birth              2  non-null values
dtypes: object(33)

In [66]:
from datetime import timedelta
def safe_mean(inser):
    try:
        return inser.mean()
    except TypeError:
        return np.nan
        
def safe_max(inser):
    try:
        return inser.max()
    except TypeError:
        return np.nan

def safe_min(inser):
    try:
        return inser.min()
    except TypeError:
        return np.nan
        
def most_common(inser):
    nser = inser.dropna().value_counts()
    try:
        return nser.index[0]
    except IndexError:
        return np.nan

first_visit = redcap_data[['Patient ID', 'Date of visit']].groupby('Patient ID').first()['Date of visit']
last_visit = redcap_data[['Patient ID', 'Date of visit']].groupby('Patient ID').last()['Date of visit']
date_seropositive = redcap_data[['Patient ID', 'HIV seropositive date']].groupby('Patient ID').agg(most_common)

test_cols = ['Test-Amphetamines',
             'Test-Barbiturates',
             'Test-Benzodiazepines',
             'Test-Cannabinoid',
             'Test-Cocaine',
             'Test-Opiates',
             'Test-Phencyclidine']

admit_cols = ['Admit-Benzodiazepines',
             'Admit-Cannabinoid',
             'Admit-Cocaine',
             'Admit-Opiates',
             'Admit-Other',
             'Admit-Phencyclidine',
             'Admit-Ritalin']
other_admit_cols = ['DaysSinceLastDrugUse']
clinical_cols = ['VL', 'CD4', 'CD8', 'HIVD']

param_groups = [('mean-', safe_mean, test_cols+admit_cols+clinical_cols+other_admit_cols),
                ('max-', safe_max, clinical_cols+other_admit_cols),
                ('min-', safe_min, clinical_cols),
                ('', most_common, ['ART'])]


def calc_params(indf):
    
    pat_id = indf.index[0][0]
    indf = indf.ix[pat_id]
    fday = min(first_visit.ix[pat_id], last_visit.ix[pat_id])
    lday = max(first_visit.ix[pat_id], last_visit.ix[pat_id])
    
    pers = date_range(fday, lday, freq = '12M')
    indf = indf.truncate(fday-timedelta(days = 1), lday)
    
    out_data = []
    for prepend, func, cols in param_groups:
        ndf = indf[cols].resample('12M', closed = 'right', label = 'right', how = func)
        ncols = dict([(name, prepend+name) for name in cols])
        ndf.rename(columns = ncols, inplace = True)
        out_data.append(ndf.copy())
    odf = concat(out_data, axis = 1)
    #print 'date', date_seropositive.ix[pat_id]
    #print 'series', Series(odf.index)
    #print date_seropositive.ix[pat_id].values - np.array(list(odf.index), np.dtype('<M8[ns]'))
    if date_seropositive.ix[pat_id].dropna():
        odf['DaysSinceSeropositive'] = np.array(list(odf.index), np.dtype('<M8[ns]')) - date_seropositive.ix[pat_id].values
    else:
        odf['DaysSinceSeropositive'] = np.nan
        odf['DaysSinceSeropositive'] = odf['DaysSinceSeropositive'].astype('datetime64[ns]')
    return odf

nres = []
for pat, group in clinical_params.groupby(level = 'Patient ID'):
    tres = calc_params(group).reset_index()
    tres['Patient ID'] = pat
    tres['VisitYear'] = range(len(tres))
    nres.append(tres.copy())

res = concat(nres, axis = 0, ignore_index = True)
res = res.set_index(['Patient ID', 'VisitYear'])
#pat_df['Cannabinoid'].resample('12M', closed = 'right', label = 'right', how = safe_mean)#.truncate(fday, lday)

In [84]:
ban_cols = set(['mean-Test-Amphetamines',
            'mean-Test-Barbiturates',
            'mean-Test-Benzodiazepines',
            'mean-Test-Cocaine',
            'mean-Test-Opiates',
            'mean-Test-Phencyclidine',
            'mean-Test-Cannabinoid',
            'mean-Admit-Benzodiazepines',
            'mean-Admit-Cocaine',
            'mean-Admit-Cannabinoid',
            'mean-Admit-Opiates',
            'mean-Admit-Other',
            'mean-Admit-Phencyclidine',
            'mean-Admit-Ritalin'])
require_cols = [(set(), 'PN'),
                (set(['mean-Test-Cocaine', 'mean-Admit-Cocaine']), 'Cocaine'),
                (set(['mean-Test-Cannabinoid', 'mean-Admit-Cannabinoid']), 'Cannabinoid'),
                (set(['mean-Test-Cocaine', 'mean-Admit-Cocaine', 'mean-Test-Cannabinoid', 'mean-Admit-Cannabinoid']), 'Co-Ca')]

tmp_drug_cols = []
for allowed, name in require_cols:
    zero_cols = list(ban_cols - allowed)
    one_cols = list(allowed)
    take_nothing_but_alowed = (res[zero_cols].fillna(0)==0).groupby(level = 'Patient ID').all().all(axis = 1)
    take_allowed = (res[one_cols].fillna(1)==1).groupby(level = 'Patient ID').all().all(axis = 1)
    
    wanted_pats = (take_nothing_but_alowed & take_allowed)
    wanted_pats.name = name
    tmp_drug_cols.append(wanted_pats.copy())

pure_pats = concat(tmp_drug_cols, axis = 1)
print pure_pats.sum()


PN             41
Cocaine         8
Cannabinoid    19
Co-Ca          13
dtype: int64

In [101]:
years = range(4)
cols = [('PN', 'k'), 
        ('Cocaine', 'b'), 
        ('Cannabinoid' ,'g'), 
        ('Co-Ca', 'r')]
plt.figure(figsize = (10,10))
plt.hold(True)
tdata = merge(res[['min-HIVD']].reset_index(), pure_pats,
                left_on = 'Patient ID', right_index = True,
                how = 'inner')
for col, color in cols:
    wdata = tdata[tdata[col]]
    for pat, group in wdata.groupby('Patient ID'):
        plt.plot(group['VisitYear'], group['min-HIVD'], color = color, alpha = 0.5, lw = 10)



In [ ]: