Cytokine Analysis

This analysis uses data from Luminex plates measuring the amount of particular cytokines from patient samples. These patiens were chosen such that they are either Pure Cocaine Users (PC), Pure Benzo (PB), Cocaine + other drug (MDU) or pure NonUsers (PN). These drugs of abuse were tested using blood tests.

Data Extraction


In [1]:
from __future__ import division
import os, os.path
from pandas import *
import numpy as np
import csv
from itertools import groupby
from collections import defaultdict
from copy import deepcopy
from types import TupleType
import matplotlib.pyplot as plt
from itertools import product, imap, islice
from patsy import dmatrices
from patsy.contrasts import Treatment
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests
import shutil
import glob
import shlex
from subprocess import check_call, CalledProcessError
from tempfile import mkdtemp
from concurrent.futures import ProcessPoolExecutor
from types import TupleType
from rpy2.robjects import Formula
import rpy2.robjects as robjects
import pandas.rpy.common as com
from rpy2.rinterface import RRuntimeError
import sys


sys.path.append('/home/will/PySeqUtils/')
os.chdir('/home/will/HIVSystemsBio/NewCytokineAnalysis/')

import Rtools

SAVE_FIGURES = False

In [2]:
norm_cyto_data = read_csv('CytoRawDataNorm.csv', 
                            sep = '\t')
known_pat_data = read_csv('CytoPatData.csv', 
                            index_col=[0,1], 
                            sep = '\t')

norm_cyto_data['Th1'] = norm_cyto_data['IFN.gamma'] + \
                            norm_cyto_data['IL.2']+norm_cyto_data['TNF.alpha']
norm_cyto_data['Th2'] = norm_cyto_data['IL.4'] + \
                            norm_cyto_data['IL.5']+norm_cyto_data['IL.10']

raw_cyto_data = read_csv('CytoRawData.csv', 
                            index_col=[0,1,2], 
                            sep = '\t')
raw_cyto_data['Th1'] = raw_cyto_data['IFN.gamma'] + \
                        raw_cyto_data['IL.2'] + \
                        raw_cyto_data['TNF.alpha']
raw_cyto_data['Th2'] = raw_cyto_data['IL.4'] + \
                        raw_cyto_data['IL.5'] + \
                        raw_cyto_data['IL.10']

In [3]:
cytos = sorted(['IL.8','VEGF','IL.1beta',
        'G.CSF','EGF','IL.10','HGF',
        'FGF.basic','IFN.alpha','IL.6',
        'IL.12','Rantes','Eotaxin',
        'GM.CSF','MIP.1beta',
        'MCP.1','IL.5','IL.13', 'IFN.gamma','TNF.alpha',
        'IL.RA','IL.2','IL.7','IP.10',
        'IL.2R','MIG','IL.4','IL.15',
        'IL.17','MIP.1alpha']) + ['Th1', 'Th2']

The quantile normalization has placed all of the data into a normal form with the same distribution across all cytokines.


In [4]:
pat_cyto_data = merge(known_pat_data, norm_cyto_data.reset_index(),
                        left_index = True, 
                        right_on = ['Patient ID', 'VisitNum'],
                        how = 'outer')

pat_cyto_data = pat_cyto_data.set_index(['Patient ID', 
                                        'VisitNum', 
                                        'SampleNum'])

In [5]:
from functools import partial

pat_cyto_data['CD4:CD8'] = pat_cyto_data['Latest CD4 count'] / \
                            pat_cyto_data['Latest CD8 count']
known_pat_data['CD4:CD8'] = known_pat_data['Latest CD4 count'] / \
                            known_pat_data['Latest CD8 count']

pat_cyto_data['Log10 Viral Load'] = pat_cyto_data['Latest Viral load'].map(np.log10)
known_pat_data['Log10 Viral Load'] = known_pat_data['Latest Viral load'].map(np.log10)

pat_cyto_data['Th1:Th2'] = pat_cyto_data['Th1']/pat_cyto_data['Th2']
raw_cyto_data['Th1:Th2'] = raw_cyto_data['Th1']/raw_cyto_data['Th2']

pat_cyto_data['HIV-Healthy'] = (pat_cyto_data['Latest Viral load'] <= 100) & \
                                (pat_cyto_data['Latest CD4 count'] >= 250)
pat_cyto_data['Drug Status'] = pat_cyto_data['Grouping']
pat_cyto_data['OlderThan50'] = pat_cyto_data['Age']>50
pat_cyto_data['OlderThan60'] = pat_cyto_data['Age']>60
pat_cyto_data['AgePast50'] = (pat_cyto_data['Age'] - 50).map(partial(max, 0))
pat_cyto_data['AgePast60'] = (pat_cyto_data['Age'] - 60).map(partial(max, 0))

Quality Control

Normalize Data


In [6]:
from collections import defaultdict

fig, axes = plt.subplots(6,6, figsize = (10,10), sharex = True)
fig.tight_layout()
masks = [('PN', pat_cyto_data['Grouping'] == 'PN'),
         ('PC', pat_cyto_data['Grouping'] == 'PC'),
         ('SCa', pat_cyto_data['Grouping'] == 'SCa'),
         ('SBe', pat_cyto_data['Grouping'] == 'SBe'),
         ('MDU', pat_cyto_data['Grouping'] == 'MDU'),
        ('HCV+', pat_cyto_data['Hepatitis C status (HCV)'] == 'True'),
        ('HCV-', pat_cyto_data['Hepatitis C status (HCV)'] == 'False'),
        ('AA', pat_cyto_data['Race'] == 'Black/AA'),
        ('N/AA', pat_cyto_data['Race'] != 'Black/AA')]
for ax, cyto in zip(axes.flatten(), cytos):
    
    items = []
    for key, mask in masks:
        items.append(pat_cyto_data[cyto][mask].dropna())
    
    plt.sca(ax)
    plt.boxplot(items, sym = '')
    plt.xticks(range(1, len(masks)+1), [name for name, _ in masks], rotation = 90)
    plt.title(cyto)
    plt.ylabel('NormUnits')
    
if SAVE_FIGURES:
    plt.savefig('figures/QCFigures/normalized_relative_cytokine_levels.png')


Raw Cytokine Values


In [7]:
def drop_outliers(series, rep = 0):
    if rep == 3:
        return series
    mval = series.mean()
    stdval = series.std()
    lval = mval -2*stdval
    hval = mval +2*stdval
    mask = (series < hval) & (series > lval)
    #print mask.isnull().sum()
    return drop_outliers(series[mask], rep = rep+1)
    
    
    
    
fig, axes = plt.subplots(6,6, figsize = (10,10), sharex = True)
fig.tight_layout()
for ax, cyto in zip(axes.flatten(), cytos):
    
    items = []
    for key, mask in masks:
        items.append(drop_outliers(pat_cyto_data[cyto][mask].dropna()))
    
    plt.sca(ax)
    plt.boxplot(items, sym = '')
    plt.xticks(range(1, len(masks)+1), [name for name, _ in masks], rotation = 90)
    plt.title(cyto)
    plt.ylabel('RealUnits')
    
if SAVE_FIGURES:
    plt.savefig('figures/QCFigures/raw_relative_cytokine_levels.png')


Missing Data


In [8]:
from pylab import get_cmap
from PlottingTools import make_heatmap_df

have_values = pat_cyto_data.apply(lambda x: x.notnull())
grouped_vals = have_values.groupby(level = 'Patient ID').mean()


#plt.figure(figsize = (10,20))
#plt.imshow(grouped_vals.T.values, 
#            interpolation = 'nearest', 
#            cmap = get_cmap('gray'), 
#            aspect='auto')

fig = make_heatmap_df(grouped_vals.T, colormap = 'gray', figsize = (10,20));

plt.xticks(range(len(grouped_vals.index)),[]);
plt.clim([0,1]);
plt.xlabel('Patient');

if SAVE_FIGURES:
    plt.savefig('figures/QCFigures/missing_values.png')


Statisical Analysis

Data Extraction

Adding some derived data into the mix.


In [9]:
from scipy.stats import ks_2samp, ttest_ind, mannwhitneyu
from scipy.stats import kruskal
from types import ListType
def test_categoical(cyto_series, cat_series):
    
    data = DataFrame({
                        'cyto':cyto_series,
                        'cat':cat_series
                    })
    test_data = []
    for _, group in data.groupby('cat'):
        test_data.append(group['cyto'].values)
    if len(test_data) < 2:
        return 1
    elif len(test_data) == 2:
        _, pval = mannwhitneyu(*test_data)
    else:
        _, pval = kruskal(*test_data)
    return pval
    
def test_multibinary(cyto_series, binary_df):
    pvals = []
    for col in binary_df.columns:
        pvals.append(test_categoical(cyto_series, binary_df[col]))
    return min(pvals)

def test_continous(cyto_series, value_series):
    
    tmp = DataFrame({'cyto':cyto_series,
                    'value':value_series}).dropna().reset_index(drop = True)
    
    try:
        res = ols(y=tmp['cyto'], x = tmp['value'])
        return res.f_stat['p-value']
    except:
        return None
    


confounders = [('Age', test_continous),
('Latest CD4 count', test_continous),
('Current Alcohol use', test_categoical),
('Current Tobacco use', test_categoical),
('Days since baseline', test_continous),
('Gender', test_categoical),
('Grouping', test_categoical),
('Race', test_categoical),
('HAART', test_categoical),
('Hepatitis C status (HCV)', test_categoical),
('HIVD score', test_continous),
('HIVD.I', test_categoical),
('Hepatitis B status (HBV)', test_categoical),
('Latest CD8 count', test_continous),
('Nadir CD4 count', test_continous),
('Peak viral load', test_continous),
('Nadir CD8 count', test_continous),
('Latest Viral load', test_continous),
('Years Seropositive', test_continous),
('CD4:CD8', test_continous),
('Log10 Viral Load', test_continous),
('Drug Status', test_categoical),
('HIV-Healthy', test_categoical),
]

In [10]:
from itertools import combinations

def mask2df(cyto_data, mask_tupA, mask_tupB):
    nameA, maskA = mask_tupA
    nameB, maskB = mask_tupB
    valid = maskA | maskB
    wdata = cyto_data.ix[valid]
    yield nameA, nameB, 'All Race',wdata, maskA.ix[valid]
        
    black_mask = cyto_data['Race'] == 'Black/AA'
    valid = (maskA | maskB) & black_mask
    wdata = cyto_data.ix[valid]
    yield nameA, nameB, 'AA', wdata, maskA.ix[valid]
    

gp_names = [(gp, pat_cyto_data['Grouping']==gp) for gp in pat_cyto_data['Grouping'].dropna().unique()]
drug_groups = list(combinations(gp_names, 2)) + [(('HCV-', pat_cyto_data['Hepatitis C status (HCV)']=='False'), ('HCV+', pat_cyto_data['Hepatitis C status (HCV)']=='True'))]

#nameA, nameB, race, cyto, confounder, p-val
test_res = []

for tupA, tupB in drug_groups:
    
    for nameA, nameB, race, data, mask in mask2df(pat_cyto_data, tupA, tupB):
        for cyto in cytos:
            for conf, test in confounders:
                pval = test(data[cyto], data[conf])
                test_res.append((nameA, nameB, race, cyto, conf, pval))
association_results_df = DataFrame(test_res, columns = ['GroupA', 'GroupB', 'Race', 'Cytokine', 'Column', 'p-value'])

In [11]:
pivoted_data = pivot_table(association_results_df, 
                            rows = ['Race', 'GroupA', 'GroupB', 'Column'], 
                            cols = 'Cytokine', 
                            values = 'p-value')
filtered_data = pivoted_data[pivoted_data<0.05]

They want a specific order to the heatmaps.


In [12]:
drug_order = [
'Age', 
'Gender',
'Race', 
'Current Alcohol use', 
'Current Tobacco use', 
'Latest CD4 count', 
'Nadir CD4 count', 
'Latest CD8 count',
'Nadir CD8 count', 
'CD4:CD8', 
'Latest Viral load', 
'Peak viral load', 
'Log10 Viral Load', 
'HIV-Healthy',
'HAART', 
'Hepatitis B status (HBV)', 
'Hepatitis C status (HCV)', 
'HIVD-I', 
'HIVD score', 
'Years Seropositive', 
'Days since baseline'] 

hcv_order = [
'Age', 
'Gender', 
'Race', 
'Current Alcohol use', 
'Current Tobacco use', 
'Drug Status', 
'Latest CD4 count', 
'Nadir CD4 count', 
'Latest CD8 count', 
'Nadir CD8 count', 
'CD4:CD8', 
'Latest Viral load', 
'Peak viral load', 
'Log10 Viral Load', 
'HIV-Healthy',
'HAART', 
'Hepatitis B status (HBV)', 
'HIVD-I', 
'HIVD score', 
'Years Seropositive', 
'Days since baseline']


presentation_order = {
 ('PN', 'MDU'):drug_order,
 ('PN', 'PC'):drug_order,
 ('PN', 'SBe'):drug_order,
 ('MDU', 'SCa'):drug_order,
 ('MDU', 'PC'):drug_order,
 ('MDU', 'SBe'):drug_order,
 ('SCa', 'PC'):drug_order,
 ('SCa', 'SBe'):drug_order,
 ('PC', 'SBe'):drug_order,
    ('HCV-','HCV+'):hcv_order,
}

In [13]:
def convert_for_R(indf, cyto, items, factor_refs):
    """Converts a dataframe into an R dataframe"""
    
    
    fcols = {
                'Hepatitis C status (HCV)':'HCV',
                'HIVD-I':'HIVDI',
                'Log10 Viral Load':'LVL',
                'TOSample-Benzodiazepines':'TOSample.Benzodiazepines',
                'TOSample-Cannabinoid':'TOSample.Cannabinoid',
                'TOSample-Cocaine':'TOSample.Cocaine',
                'TOSample-Opiates':'TOSample.Opiates',
                'ALL-Benzodiazepines':'ALL.Benzodiazepines',
                'ALL-Cannabinoid':'ALL.Cannabinoid',
                'ALL-Cocaine':'ALL.Cocaine',
                'ALL-Opiates':'ALL.Opiates'
            }

    
    wanted_cols = [col for col, _ in items] + [cyto]
    if 'Age' not in set(wanted_cols):
        wanted_cols.append('Age')
    ndf = indf.rename(columns = fcols)[wanted_cols].dropna()
    pat_index = ndf.index
    ndf = ndf.reset_index()
    #since RPY2 chokes on boolean columns
    bool_cols = ['HCV', 'HIVDI', 'OlderThan50', 'OlderThan60']
                    
    for col in bool_cols:
        try:
            ndf[col] = ndf[col].apply(str)
        except KeyError:
            pass
    #print ndf
    rpy_ndf = com.convert_to_r_dataframe(ndf)
    factor_convert_cols = []
    neqn = []
    for col, term in items:
        if len(ndf[col].dropna().unique()) > 1:
            neqn.append(term)
        if (col in factor_refs) and (factor_refs[col] in set(ndf[col])):
            factor_convert_cols.append((col, factor_refs[col]))
    
    rpy_ndf = Rtools.convert_columns_to_factors(rpy_ndf, factor_convert_cols)
            
    res_eqn = ' + '.join(neqn)
    
    formula = Formula(cyto + ' ~ ' + res_eqn)
    return rpy_ndf, formula, pat_index

In [14]:
def convert_output_rows(tTable):
    """Converts all the different representations of drug columns into a standard setup."""
       
    
    change_rules = [('PC', 'Cocaine'),
                    ('Cocaine', 'Cocaine'),
                    ('MDU', 'MDU'),
                    ('SCa', 'Cannabinoid'),
                    ('Cannabinoid', 'Cannabinoid'),
                    ('SBe', 'Benzodiazepines'),
                    ('Benzodiazepines', 'Benzodiazepines'),
                    ('Opiates', 'Opiates'),
                    ('HCV', 'HCV'),
                    ('Age', 'Age')]
    
    change_rows = {}
    for row, (check_term, res_row) in product(tTable.index, change_rules):
        if check_term.lower() in row.lower():
            change_rows[row] = res_row
            
    return tTable.rename(index = change_rows)

def RuiRegressTest(indf, items, factor_refs, cyto, debug = False):
    """Runs the Linear Mixed Effects model."""
    
    rpy_ndf, formula, pat_index = convert_for_R(indf, cyto, items, factor_refs)
    rand_formula = Formula('~1|Patient.ID')
    
    class Blackhole(object):

        def write(self, string):
            pass
        
        def flush(self, *args, **kwargs):
            pass
    
    if debug:
        result_dict = Rtools.R_linear_mixed_effects_model(rpy_ndf, 
                                                            formula, 
                                                            rand_formula)
    else:
        stdout = sys.stdout
        try:
            sys.stdout = Blackhole()
            result_dict = Rtools.R_linear_mixed_effects_model(rpy_ndf, 
                                                                formula, 
                                                                rand_formula)
        except RRuntimeError:
            return None
        finally:
            sys.stdout = stdout
       
        
    tTable = convert_output_rows(result_dict['tTable'])
    return tTable.T, pat_index

Subcohorts and correction equations we want to look at:


In [15]:
rui_groups = set(['PN', 'PC', 'MDU'])
subcohorts = [  ('AA', pat_cyto_data['Race'] == 'Black/AA'),
                ('AA-Restricted', (pat_cyto_data['Race'] == 'Black/AA') & pat_cyto_data['Grouping'].notnull()),
                #('All', pat_cyto_data['Age'] > 0),
                #('HIV-Healthy', pat_cyto_data['HIV-Healthy'] == True),
                #('HIV-NONHealthy', pat_cyto_data['HIV-Healthy'] == False),
                #('HIV-LowCD4', pat_cyto_data['Latest CD4 count'] < 250),
                #('HIV-HighCD4', pat_cyto_data['Latest CD4 count'] > 250),
                #('HIV-HighVL', pat_cyto_data['Latest Viral load'] > 100),
                #('HIV-LowVL', pat_cyto_data['Latest Viral load'] <= 100),
                #('High-CD4-CD8', pat_cyto_data['CD4:CD8'] > 1),
                #('Low-CD4-CD8', pat_cyto_data['CD4:CD8'] <= 1)
                ]

test_cols = [tup for tup in combinations(pat_cyto_data['Grouping'].dropna().unique(),2)]+[('HCV-', 'HCV+')]
test_cytos = [(tup[0], tup[1], cyto) for tup, cyto in product(test_cols, cytos)]
columns = MultiIndex.from_tuples(test_cytos)

final_test_res = []
final_resid = []
drug_groups = list(pat_cyto_data['Grouping'].dropna().unique())
eqns = [
('HCVOnlyCorrection', (('Gender', 'as.factor(Gender)'), 
                    ('Race', 'as.factor(Race)'), 
                    ('HAART', 'as.factor(HAART)'), 
                    ('HCV', 'as.factor(HCV)'),
                    ('LVL', 'LVL'))),
('GroupingCorrection', (('Grouping', 'as.factor(Grouping)'),
                    ('Gender', 'as.factor(Gender)'), 
                    ('Race', 'as.factor(Race)'), 
                    ('HAART', 'as.factor(HAART)'), 
                    ('HCV', 'as.factor(HCV)'),
                    ('LVL', 'LVL'))),
('HCV_AtSample', (('ATSample.Benzodiazepines', 'ATSample.Benzodiazepines'),
                  ('ATSample.Cannabinoid', 'ATSample.Cannabinoid'),
                  ('ATSample.Cocaine', 'ATSample.Cocaine'),
                  ('ATSample.Opiates', 'ATSample.Opiates'),
                ('Gender', 'as.factor(Gender)'), 
                ('Race', 'as.factor(Race)'), 
                ('HAART', 'as.factor(HAART)'), 
                ('HCV', 'as.factor(HCV)'),
                ('LVL', 'LVL'))),
('HCV_AllSample', (('ALL.Benzodiazepines', 'ALL.Benzodiazepines'),
                  ('ALL.Cannabinoid', 'ALL.Cannabinoid'),
                  ('ALL.Cocaine', 'ALL.Cocaine'),
                  ('ALL.Opiates', 'ALL.Opiates'),
                ('Gender', 'as.factor(Gender)'), 
                ('Race', 'as.factor(Race)'), 
                ('HAART', 'as.factor(HAART)'), 
                ('HCV', 'as.factor(HCV)'),
                ('LVL', 'LVL'))),
('HCV_ToSample', (('TOSample.Benzodiazepines', 'TOSample.Benzodiazepines'),
                  ('TOSample.Cannabinoid', 'TOSample.Cannabinoid'),
                  ('TOSample.Cocaine', 'TOSample.Cocaine'),
                  ('TOSample.Opiates', 'TOSample.Opiates'),
                ('Gender', 'as.factor(Gender)'), 
                ('Race', 'as.factor(Race)'), 
                ('HAART', 'as.factor(HAART)'), 
                ('HCV', 'as.factor(HCV)'),
                ('LVL', 'LVL'))),
]

eqns += [(name+'_NVL', vals[:-1]) for name, vals in eqns]

age_factors = [('Age', 'Age')]
group_references = ['PN']
    
    

def linker_fun(tup):
    
    (eqn_name, items), (cohort_name, cohort_mask), cyto, age_factor, group_ref = tup
    data = pat_cyto_data.ix[cohort_mask]
    #print len(data.index)
    #print eqn_name
    if ('Sample' in eqn_name) and (group_ref != 'PN'):
        return None
    #print 'after', eqn_name
    factor_refs = {
                    'HCV':'False',
                    'Grouping':group_ref,
                    'Race':'Black/AA',
                    'HIVDI':'False',
                  }
    #print 'testing'
    output = RuiRegressTest(data, items+(age_factor,), factor_refs, cyto, debug = False)
    if output is None:
        #print 'wrong'
        return None
    #print 'yes'
    ttables, pat_index = output
    ttables['Cytokine'] = cyto
    ttables['Correction'] = eqn_name
    ttables['CohortName'] = cohort_name
    ttables['DrugReference'] = group_ref
    ttables['AgeFactor'] = age_factor[0]
    pat_groups = pat_cyto_data['Grouping'].ix[pat_index].groupby(level = 'Patient ID').first()
    pat_groups = pat_groups.fillna('None')
    ttables['NumSamples'] = len(pat_index)
    ttables['NumPatients'] = len(pat_groups)
    counts = pat_groups.value_counts()
    for ind, count in zip(counts.index, counts.values):
        ttables['NumPatients-'+ind] = count
    return ttables
    
test_items = product(eqns, subcohorts, cytos, age_factors, group_references)
num_to_do = len(eqns)*len(subcohorts)*len(cytos)*len(age_factors)*len(group_references)

#with ProcessPoolExecutor(max_workers = 20) as E:
    #results = E.map(linker_fun, test_items)
results = imap(linker_fun, test_items)
for num, res in enumerate(results):
    if res is None:
        #print 'wrongness'
        continue
    #print 'rightness'
    #print res.T.to_string()
    #raise KeyError
    final_test_res.append(res)
    if num % 1000 == 0:
        print num, num_to_do


0 640

In [16]:
print final_test_res[0].T.to_string()


                                Value          Std.Error                 DF            t-value            p-value
(Intercept)                  176.6806           47.94631                303           3.684967        0.000270886
as.factor(Gender)M          -0.933958           16.24052                 92        -0.05750788          0.9542653
as.factor(HAART)dH           -20.5657           42.57718                 92         -0.4830216          0.6302278
as.factor(HAART)nH          -12.90511            26.8966                 92         -0.4798045          0.6325051
HCV                         -17.25132           18.79521                 92         -0.9178576          0.3610934
LVL                         -10.61765           8.019727                 92          -1.323942          0.1888025
Age                         0.1112264          0.9443191                 92          0.1177848           0.906495
Cytokine                          EGF                EGF                EGF                EGF                EGF
Correction          HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection
CohortName                         AA                 AA                 AA                 AA                 AA
DrugReference                      PN                 PN                 PN                 PN                 PN
AgeFactor                         Age                Age                Age                Age                Age
NumSamples                        402                402                402                402                402
NumPatients                        99                 99                 99                 99                 99
NumPatients-PN                     29                 29                 29                 29                 29
NumPatients-PC                     25                 25                 25                 25                 25
NumPatients-None                   23                 23                 23                 23                 23
NumPatients-SCa                    10                 10                 10                 10                 10
NumPatients-MDU                     7                  7                  7                  7                  7
NumPatients-SBe                     5                  5                  5                  5                  5

In [17]:
adj_data = concat([df.reset_index() for df in final_test_res], axis = 0, ignore_index=True)

In [18]:
#adj_data = concat(final_test_res, axis = 0).reset_index().rename(columns = {'level_1':'ResultType'})
print adj_data.head(n=6).T.to_string()


                                    0                  1                  2                  3                  4                  5
(Intercept)                  176.6806           47.94631                303           3.684967        0.000270886           71.97722
Age                         0.1112264          0.9443191                 92          0.1177848           0.906495           1.421886
AgeFactor                         Age                Age                Age                Age                Age                Age
Benzodiazepines                   NaN                NaN                NaN                NaN                NaN                NaN
Cannabinoid                       NaN                NaN                NaN                NaN                NaN                NaN
Cocaine                           NaN                NaN                NaN                NaN                NaN                NaN
CohortName                         AA                 AA                 AA                 AA                 AA                 AA
Correction          HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection  HCVOnlyCorrection
Cytokine                          EGF                EGF                EGF                EGF                EGF            Eotaxin
DrugReference                      PN                 PN                 PN                 PN                 PN                 PN
HCV                         -17.25132           18.79521                 92         -0.9178576          0.3610934          -26.71107
LVL                         -10.61765           8.019727                 92          -1.323942          0.1888025           -3.36522
MDU                               NaN                NaN                NaN                NaN                NaN                NaN
NumPatients                        99                 99                 99                 99                 99                100
NumPatients-MDU                     7                  7                  7                  7                  7                  7
NumPatients-None                   23                 23                 23                 23                 23                 23
NumPatients-PC                     25                 25                 25                 25                 25                 25
NumPatients-PN                     29                 29                 29                 29                 29                 29
NumPatients-SBe                     5                  5                  5                  5                  5                  5
NumPatients-SCa                    10                 10                 10                 10                 10                 11
NumSamples                        402                402                402                402                402                406
Opiates                           NaN                NaN                NaN                NaN                NaN                NaN
as.factor(Gender)M          -0.933958           16.24052                 92        -0.05750788          0.9542653           5.440777
as.factor(HAART)dH           -20.5657           42.57718                 92         -0.4830216          0.6302278            16.0457
as.factor(HAART)nH          -12.90511            26.8966                 92         -0.4798045          0.6325051          -5.036216
index                           Value          Std.Error                 DF            t-value            p-value              Value

Now we need to aggregate the data into something more useful.


In [19]:
test_cols = ['Opiates',
'Cocaine',
'Benzodiazepines',
'Cannabinoid',
'MDU',
'HCV',
'LVL',
'Age']
sample_cols = [
'NumPatients',
'NumPatients-MDU',
'NumPatients-None',
'NumPatients-PC',
'NumPatients-PN',
'NumPatients-SBe',
'NumPatients-SCa',
'NumSamples']

wanted_cols = test_cols + sample_cols

grouping_key = ['CohortName', 'Correction','Cytokine']
masks = [('Pval', adj_data['index'] == 'p-value'),
        ('Effect', adj_data['index'] == 'Value')]
concat_data = []
for name, mask in masks:
    ndata = adj_data.ix[mask]
    odata = {}
    for idx, group in ndata.groupby(grouping_key):
        odata[idx] = group[test_cols].mean()
    grouped_data = DataFrame(odata).T
    grouped_data.index = MultiIndex.from_tuples(grouped_data.index, names = grouping_key)
    grouped_data.columns = MultiIndex.from_tuples([(name, col) for col in grouped_data.columns])
    concat_data.append(grouped_data.copy())

num_samples = adj_data.groupby(grouping_key).agg(dict([(col, 'max') for col in sample_cols])).fillna(0)
num_samples.columns = MultiIndex.from_tuples([('NumSamples', col) for col in num_samples.columns])
concat_data.append(num_samples)
big_table = concat(concat_data, axis = 1)

In [20]:
from functools import partial

def do_col_analysis(typ, indf):
    odf = indf.copy()
    func = do_bh if typ == 'BH' else do_bon
    #print indf
    for col in indf.columns:
        #print indf[col]
        #print indf[col].notnull().mean()
        if indf[col].notnull().mean()>0.2:
            #print 'doing'
            odf[col] = func(indf[col])
        else:
            #print 'skipping'
            odf[col] = np.nan
    return odf

def do_bh(ser):
    
    nser = ser.copy()
    nval = nser.values
    mask = ~np.isnan(nval)
    _, bh_pvals, _, _ = multipletests(nval[mask], alpha = 0.05, method = 'fdr_bh')
    nser.values[mask] = bh_pvals
    return nser

def do_bon(ser):
    
    nser = ser.copy()
    nval = nser.values
    mask = ~np.isnan(nval)
    _, bh_pvals, _, _ = multipletests(nval[mask], alpha = 0.05, method = 'bonferroni')
    nser.values[mask] = bh_pvals
    return nser

In [21]:
bh_qvals = big_table['Pval'].groupby(level = ['CohortName', 'Correction']).apply(partial(do_col_analysis, 'BH'))
bon_qvals = big_table['Pval'].groupby(level = ['CohortName', 'Correction']).apply(partial(do_col_analysis, 'Bon'))
 
bh_qvals.columns = MultiIndex.from_tuples([('BH-Qval', col) for col in bh_qvals.columns])
bon_qvals.columns = MultiIndex.from_tuples([('Bon-Qval', col) for col in bon_qvals.columns])

final_table = concat([big_table, bh_qvals, bon_qvals], axis = 1)
final_table.to_excel('tables/ModelBasedResults.xlsx')

In [22]:
store = HDFStore('ModelOutputs/results.hdf')
store['final_table'] = final_table
store['pat_cyto_data'] = pat_cyto_data
store['raw_cyto_data'] = raw_cyto_data
store.close()


/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py:1795: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot map
directly to c-types [inferred_type->mixed,key->block3_values]

  warnings.warn(ws, PerformanceWarning)

In [23]:
from types import StringType, ListType
def drop_data(inp):
    if inp < 0.1:
        return -np.log10(inp)
    else:
        return np.nan
    

def make_pval_heatmap(input_data, pval_col, wanted_corrections, wanted_cohorts, wanted_reference, 
                        filename = None, colormap = 'copper_r', labels = 'log'):
    
    if type(pval_col) == StringType:
        pval_data = input_data[[pval_col]]
        nwanted_corrections = list(wanted_corrections)
    elif type(pval_col) == ListType:
        pval_data = input_data[pval_col]
        nwanted_corrections = list(product(pval_col, wanted_corrections))
    else:
        raise TypeError
        
    corrected_data = pval_data.ix[wanted_corrections]
    tmp_data = corrected_data.reset_index()
    #print tmp_data
    
    results = pivot_table(tmp_data, rows = 'Cytokine', cols = 'Correction', values = pval_col)
    #print results
    #plt.figure(figsize = (10,10))
    tdata = results.applymap(drop_data)[nwanted_corrections].ix[cytos].dropna(axis = 1, how='all')
    
    try:
        fig = make_heatmap_df(tdata, colormap = colormap, 
                        figsize = (0.75*len(nwanted_corrections),10), 
                        grid_kwargs=grid_kwargs)
        
    except ValueError:
        return
    #fig.tight_layout()
    #plt.imshow(tdata.values, interpolation = 'nearest', cmap = get_cmap('copper_r'))
    #plt.xticks(range(len(tdata.columns)), tdata.columns, rotation = 90, fontsize = 16)
    #plt.yticks(range(len(tdata.index)), tdata.index, fontsize = 16)
    #print -np.log10([1, 0.1, 0.01, 0.001, 0.0001])
    cbar = plt.colorbar(ticks = -np.log10([0.1, 0.05, 0.01, 0.001, 0.0001]))
    plt.clim([0, 4])
    if labels != 'log':
        cbar.ax.set_yticklabels(['0.1', '0.05', '0.01', '0.001', '0.0001'])
        cbar.set_label('P-value')
    else:
        cbar.set_label('-log10(P-value)')
    plt.title(pval_col)
    if filename:
        plt.savefig(filename, pad_inches = 2)

        
cohorts = ['AA']
wanted = [('HCV', 'HCV'),
          ('Cocaine', 'Cocaine'),
          ('Cannabinoid', 'Cannabinoid')]
color_maps = ['copper_r']
label_types = ['norm']
grid_kwargs = {'lw':2, 'color':'w'}
new_wanted_corrections = [('HCV', ['HCVOnlyCorrection', 'HCV_AllSample']),
                          ('Cocaine', ['HCVOnlyCorrection', 
                                        'HCV_AllSample', 
                                        'HCV_ToSample',
                                        'HCV_AtSample']),
                          ('Cannabinoid', ['HCVOnlyCorrection', 
                                           'HCV_AllSample', 
                                           'HCV_ToSample',
                                           'HCV_AtSample']),
                          ]
iterable = product(cohorts, wanted, color_maps, label_types, new_wanted_corrections)
for cohort, (name, cols), cmap, label_type, (cor_name, wanted_corrections) in iterable:
    fname = 'figures/HeatMaps/%s-%s-%s-PvalFigure-fixed-AA-%s-%s.png' % (cohort, name, cor_name, cmap, label_type)
    #fname = None
    make_pval_heatmap(big_table.ix[cohort]['Pval'], cols, wanted_corrections, 
    'AA', 'PN', filename = fname, colormap = cmap, labels = label_type)
    plt.close()

In [24]:
def fix_nums(inp):
    if inp:
        return 0.05
    else:
        return 1
    
def nfix_nums(inp):
    if (inp == 1).all():
        return 1
    else:
        return -np.log10(np.mean(inp))

lit_review = read_csv('FormattedInput/NewLitReviewTable.csv', sep = ',')


lit_review['IsSig'] = (lit_review['WasSig']==1).map(fix_nums).combine_first(lit_review['Pval'])
lit_review['IsSig'][lit_review['WasSig']==0] = 1

print lit_review.columns
tmp = [('Cocaine', 'WithCocaine'),
        ('Cannabinoid', 'WithCannabinoid'),
        ('HCV', 'WithCoinfecHCV'),
        ('HCV', 'WithCoinfecHIV')]
big_lit = lit_review
for cohort, (col, gp) in product(['AA', 'AA-Restricted'], tmp):
    niz_res = big_table.ix[cohort]['Pval'][col].reset_index().rename(columns = {col:'IsSig'})
    niz_res['Author'] = 'Niz'
    niz_res['Year'] = 2013
    niz_res['WithHIV'] = 1.0
    niz_res[gp] = 1.0
    niz_res['Grouping'] = niz_res['Correction'].map(lambda x: cohort+'-'+x)
    niz_res['Sample'] = 'Plasma'
    niz_res['Cohort'] = cohort
    big_lit = concat([niz_res, big_lit], axis = 0, ignore_index=True)

skip = set(['GroupingCorrection', 'HCV_AllSample', 'HCV_ToSample'])
drop_mask = big_lit['Correction'].map(lambda x: x in skip) & (big_lit['Cohort'] == 'AA-Restricted')

big_lit = big_lit[~drop_mask]

big_lit['Label'] = big_lit[['Author', 'Grouping', 'Sample']].apply(lambda x: ', '.join(x), axis = 1)

col_order = {
             'Cocaine':['EGF', 'Eotaxin', 'FGF.basic', 'G.CSF', 'GM.CSF', 'HGF', 'IFN.alpha', 'IFN.gamma',
                           'IL.1', 'IL.10', 'IL.12', 'IL.13', 'IL.15', 'IL.17', 'IL.18', 'IL.1RA', 'IL.1beta',
                           'IL.2', 'IL.2R', 'IL.3', 'IL.4', 'IL.5', 'IL.6', 'IL.7', 'IL.8', 'IL.RA', 'IP.10',
                           'MCP.1', 'MIG', 'MIP.1alpha', 'MIP.1beta', 'MPA', 'NAP.2', 'Rantes', 'TGF.beta',
                           'TNF.alpha', 'TNF.beta', 'VEGF', 'sCD14', 'sCD40L', 'Th1', 'Th2'],
             'Cannabinoid':['EGF', 'Eotaxin', 'FGF.basic', 'G.CSF', 'GM.CSF', 'HGF', 'IFN.alpha', 'IFN.gamma',
                               'IL.10', 'IL.12', 'IL.13', 'IL.15', 'IL.17', 'IL.18', 'IL.1RA', 'IL.1beta',
                               'IL.2', 'IL.2R', 'IL.3', 'IL.4', 'IL.5', 'IL.6', 'IL.7', 'IL.8', 'IL.RA', 'IP.10',
                               'MCP.1', 'MIG', 'MIP.1alpha', 'MIP.1beta', 'Rantes', 'TGF.beta',
                               'TNF.alpha', 'TNF.beta', 'VEGF', 'sCD14', 'Th1', 'Th2'],
             'Coinfect':['EGF', 'Eotaxin', 'FGF.basic', 'G.CSF', 'GM.CSF', 'HGF', 'IFN.alpha', 'IFN.gamma',
                               'IL.10', 'IL.12', 'IL.13', 'IL.15', 'IL.17', 'IL.18', 'IL.1beta',
                               'IL.2', 'IL.2R', 'IL.3', 'IL.4', 'IL.5', 'IL.6', 'IL.7', 'IL.8', 'IL.RA', 'IP.10',
                               'MCP.1', 'MIG', 'MIP.1alpha', 'MIP.1beta', 'Rantes', 
                               'TNF.alpha',  'VEGF', 'sCD27', 'sTNFRII', 'Th1', 'Th2'],
             }


row_order = []
found = set()
for it in big_lit['Label'].values:
    if it not in found:
        row_order.append(it)
        found.add(it)

niz_order = ['Niz, AA-GroupingCorrection, Plasma', 
            'Niz, AA-Restricted-HCV_AtSample, Plasma', 
            'Niz, AA-HCV_AtSample, Plasma', 
            'Niz, AA-HCV_ToSample, Plasma',
            'Niz, AA-HCV_AllSample, Plasma', 
            ]
        
review_figs = [('Coinfect', ((big_lit['WithCoinfecHIV']==1 )| (big_lit['WithCoinfecHCV']==1))),
               ('Cannabinoid', ((big_lit['WithHIV']==1) | (big_lit['WithCannabinoid']==1)) & (big_lit['WithCocaine'].fillna(0)==0)),
                ('Cocaine', ((big_lit['WithHIV']==1) | (big_lit['WithCocaine']==1)) & (big_lit['WithCannabinoid'].fillna(0)==0))]
for name, mask in review_figs:
    #print big_lit.ix[mask].tail().T.to_string()
    lit_table = pivot_table(big_lit.ix[mask], 
                            rows = 'Label', 
                            cols = 'Cytokine', 
                            values = 'IsSig', 
                            aggfunc=nfix_nums).ix[row_order]
    row_order = [ind for ind in lit_table.index if not ind.startswith('Niz,')]
    row_order = niz_order + row_order

    
    for col in col_order:
        if col not in lit_table:
            lit_table[col] = np.nan
    tmpa = lit_table.ix[row_order]
    tmpb = col_order[name]
    out_lit = lit_table.ix[row_order][col_order[name]].abs().dropna(axis = 0, how = 'all')
    make_heatmap_df(out_lit, figsize = (30,10), colormap = 'copper_r', grid_kwargs=grid_kwargs)
    #print lit_table[col_order].ix['Katona, Cannabinoids, MS patients who used marijuana serum samples']
    #raise KeyError
    plt.xticks(range(len(col_order[name])), col_order[name], rotation = 90, fontsize = 16);
    plt.yticks(range(len(out_lit.index)), out_lit.index, fontsize = 16);
    plt.hlines([4.5], -0.5, len(col_order[name]), color = 'r', linewidth = 5);
    last = None
    for num, val in enumerate(out_lit.index):
        if num > 4:
            nval = tuple(val.split(', ')[1:])
            if nval != last:
                plt.hlines([num-0.5], -0.5, len(col_order[name]), color = 'g', linewidth = 5)
                last = nval
            
    #plt.hlines([4.5, 8.5, 9.5, 23.5], -0.5, len(col_order), color = 'g', linewidth = 2);
    cbar = plt.colorbar(ticks = -np.log10([1, 0.1, 0.05, 0.01, 0.001, 0.0001]))
    plt.clim([0, 4])
    cbar.ax.set_yticklabels(['1', '0.1', '0.05', '0.01', '0.001', '0.0001'])
    cbar.set_label('P-value')
    #plt.gcf().tight_layout()
    
    plt.savefig('figures/LitReviewHeatmaps/%s-litreview.png' % name)
    plt.close()


Index([Author, Year, WithHIV, WithCocaine, WithCannabinoid, WithOpiate, WithCoinfecHIV, WithCoinfecHCV, Grouping, Sample, Cytokine, WasSig, Pval, IsSig], dtype=object)

This plot shows the cytokines which were tested in our dataset or what we found in the literature. In this figure the Copper boxes we tested but are not significant and the black boxes are significant. White boxes were not tested.


In [25]:
def drop_outliers(series):
    mu = series.mean()
    std = series.std()
    
    mask = (series>(mu+1.5*std)) | (series<(mu-1.5*std))
    series[mask] = np.nan
    return series

nraw_cyto_data = raw_cyto_data.groupby(level = ['Patient ID', 'VisitNum']).median()

raw_cyto_data_no_outliers = nraw_cyto_data.apply(drop_outliers)

In [26]:
have_values = raw_cyto_data_no_outliers.apply(lambda x: x.notnull())
grouped_vals = have_values[cytos].groupby(level = 'Patient ID').mean()
grouped_vals.mean()


Out[26]:
EGF           0.982301
Eotaxin       0.973451
FGF.basic     0.955752
G.CSF         0.893805
GM.CSF        0.646018
HGF           0.955752
IFN.alpha     0.973451
IFN.gamma     0.893805
IL.10         0.955752
IL.12         0.982301
IL.13         0.902655
IL.15         0.973451
IL.17         0.831858
IL.1beta      0.991150
IL.2          0.849558
IL.2R         0.964602
IL.4          0.946903
IL.5          0.964602
IL.6          0.973451
IL.7          0.946903
IL.8          0.911504
IL.RA         0.982301
IP.10         0.920354
MCP.1         0.973451
MIG           0.964602
MIP.1alpha    0.973451
MIP.1beta     0.982301
Rantes        0.778761
TNF.alpha     0.991150
VEGF          0.982301
Th1           0.823009
Th2           0.955752
dtype: float64

In [27]:
from functools import partial
from scipy.stats import linregress

def regress_pvals(x,y, nreps = 30000):
    
    _, _, r, _, _ = linregress(x, y = y)
    lx = list(x)
    r = r**2
    num = 0
    for n in range(nreps):
        shuffle(lx)
        _, _, nr, _, _ = linregress(lx, y = y)
        num += nr**2 > r
    #print num, nreps, float(num)/float(nreps)
    return float(num)/float(nreps)

def check_outliers(data, nstd=5):
    
    med = np.median(data.unique())
    mad = (data-med).abs().mean()
    #print med, mad
    
    high_mask = (data > med+nstd*mad)
    
    while high_mask.any() & (data[~high_mask].max()*1.3 >= data[high_mask].min()):
        #print 'reclaiming', data[~high_mask].max(), data[high_mask].min()
        high_mask[data[high_mask].idxmin()] = False
        
    return high_mask

def prep_broken_figure(tmp_cyto_data, main_min, main_max):
    
    cout = tmp_cyto_data['out']
    main_ax = None
    out_ax = None
    #main_min = tmp_cyto_data['cyto'][~cout].min()
    #main_max = tmp_cyto_data['cyto'][~cout].max()
    if cout.any():
        out_min = tmp_cyto_data['cyto'][cout].min()
        out_max = tmp_cyto_data['cyto'][cout].max()

        if (out_min > main_max) & (main_min < out_min):
            #outliers are above
            top_ax = plt.axes([0.1, 0.75, 0.8, 0.2])
            bot_ax = plt.axes([0.1, 0.05, 0.8, 0.65])
            
            main_ax = bot_ax
            out_ax = top_ax
            
            out_ax.hold(True)
            out_ax.set_ylim([out_min*0.9, out_max*1.1])
            out_ax.set_xlim([-0.05,1.05])
            out_ax.set_yticks(out_ax.get_yticks()[::3])
            
            top_ax.spines['bottom'].set_visible(False)
            top_ax.set_xticks([])
            bot_ax.spines['top'].set_visible(False)
            bot_ax.tick_params(labeltop='off') # don't put tick labels at the top
            bot_ax.xaxis.tick_bottom()
    
    if main_ax is None:
        main_ax = plt.axes()
    
    main_ax.hold(True)
    main_ax.set_ylim([main_min, main_max])
    main_ax.set_xlim([-0.05,1.05])
    main_ax.set_yticks(main_ax.get_yticks()[::2])
    
    return main_ax, out_ax

In [28]:
import PlottingTools

def make_imagesc_figure(name, column, x_data_col, valid_pats, 
                        filename = None, draw_annotations = True, 
                        mdu_pats = None, nreps = 100, with_vl = True):
    wanted_cuts = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    wanted_x = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1])
    
    results = []
    valid_pats = valid_pats.groupby(level = ['Patient ID', 'VisitNum']).any().dropna()
        
    tcolumn = column.groupby(level = ['Patient ID', 'VisitNum']).median()
    drug_frac, cyto_data = known_pat_data[x_data_col][valid_pats].dropna().align(tcolumn[valid_pats].dropna())
    if with_vl:
        vl, cyto_data = known_pat_data['Latest Viral load'].align(cyto_data, join='right')
        cyto_data += vl.map(np.log10)
    tmp = DataFrame({'drug':drug_frac, 'cyto':cyto_data}).dropna()
    
    cout = check_outliers(tmp['cyto'], nstd=5)
    tmp['out'] = cout
    cyto_bins = np.linspace(tmp['cyto'][~cout].min(), tmp['cyto'][~cout].max(), 15)
    H, x, y = np.histogram2d(tmp['drug'][~cout].values, tmp['cyto'][~cout].values,
                                bins = [wanted_x, cyto_bins])
    
    m, b, r, sp, _ = linregress(tmp['drug'].values, y = tmp['cyto'].values)
    p = regress_pvals(tmp['drug'].values, tmp['cyto'].values, nreps=nreps)
    
    ny = m*wanted_x + b
    
    wanted_cmap = get_cmap('copper_r')
    wanted_cmap.set_under('w')
    fig = plt.figure(figsize = (8,8), dpi = 200)
    main_ax, out_ax = prep_broken_figure(tmp, y[0], y[-1])
        
    main_ax.pcolormesh(x-0.05,y,H.transpose(), cmap = wanted_cmap, vmax = 10, vmin=1)
       
    PlottingTools.add_grid(main_ax, x-0.05,y, color = 'w', lw = 2)
    
    for low, hi in zip(wanted_cuts, list(wanted_cuts[1:])+[1.1]):
        tmask = (tmp['drug'] >= low) & (tmp['drug'] < hi)
        med_val = tmp['cyto'][tmask].median()
        main_ax.hlines(med_val, low-0.05, hi-0.05, color = 'b', lw = 3)
    main_ax.plot(wanted_x, ny, color = 'g', lw = 3) 
    main_ax.set_ylabel(name)
    if out_ax is not None:
        out_ax.scatter(tmp['drug'], tmp['cyto'])
    return m, b, r**2, p, sp, len(tmp)

tcols = [(cyto, pat_cyto_data[cyto]) for cyto in cytos]
wanted_cols = [('Log-Viral-Load', pat_cyto_data['Log10 Viral Load']),
               ('Th1', pat_cyto_data['Th1']),
               ('Th2', pat_cyto_data['Th2']),
               ('Th1:Th2', pat_cyto_data['Th1:Th2']),
               ('CD4', pat_cyto_data['Latest CD4 count']),
               ('CD8', pat_cyto_data['Latest CD8 count']),
               ('CD4:CD8', pat_cyto_data['CD4:CD8'])
                ] + tcols

fname = 'figures/TrendingFigures/%(drug)s_%(sample)s.png'

checks = ['ALL']
drugs = ['Cocaine', 'Cannabinoid']
two_cuts = set(['Cannabinoid', 'Opiates', 'Benzodiazepines'])
with_vl = [('VL', True),
           ('NVL', False),]
iterable = product(drugs, wanted_cols, checks, with_vl)
fname = 'figures/TrendingFigures/%(drug)s_%(sample)s_%(VL)s.png'
trend_results = []
for drug, (name, col), check, (vlabel, vl) in iterable:
    
    tcols = set(['ALL.Benzodiazepines', 'ALL.Cannabinoid', 'ALL.Opiates', 'ALL.Cocaine'])
    tcols.discard('ALL.'+drug)
    check_col = check + '.' + drug
    mask = (pat_cyto_data['Race'] == 'Black/AA') & (~(pat_cyto_data[list(tcols)]>0).any(axis = 1))
    mdu_mask = (pat_cyto_data['Race'] == 'Black/AA') & (pat_cyto_data[list(tcols)]>0).any(axis = 1)
    
    tup = make_imagesc_figure(name, col, check_col, mask, nreps = 10000)
    tup += (name, drug, vlabel)
    trend_results.append(tup)
    oname = fname % {'drug':drug, 'sample':name, 'VL':vlabel}
    plt.savefig(oname, dpi = 200)
    plt.close()
    
pval_res = DataFrame(trend_results, columns = ['m', 'b', 'r^2', 'p-val', 'm-pval', 'Npats', 'Col', 'Drug', 'VL'])

In [29]:
def do_bh(ser):
    
    _, bh_pvals, _, _ = multipletests(ser, alpha = 0.05, method = 'fdr_bh')
    return Series(bh_pvals, index = ser.index)

def do_bon(ser):
    _, bh_pvals, _, _ = multipletests(ser, alpha = 0.05, method = 'bonferroni')
    return Series(bh_pvals, index = ser.index)


_, bon_pvals, _, _ = multipletests(pval_res['p-val'], alpha = 0.05, method = 'bonferroni')
pval_res['BH-qvals'] = pval_res.groupby(['Drug', 'VL'])['p-val'].transform(do_bh)
pval_res['Bon-qvals'] = pval_res.groupby(['Drug', 'VL'])['p-val'].transform(do_bon)

wcols = ['Drug', 'Col', 'Npats', 'm', 'b', 'r^2', 'p-val', 'BH-qvals', 'Bon-qvals']
pval_res[wcols].to_excel('tables/TrendingPvals.xlsx', index=False)

In [30]:
sens_cohorts = []
for num in range(1,4):
    sens_cohorts.append((str(num), pat_cyto_data['NumTotalVisits'] >= num))

drug_groups = list(pat_cyto_data['Grouping'].dropna().unique())
eqns = [
('HCV_AllSample', (('ALL.Benzodiazepines', 'ALL.Benzodiazepines'),
                  ('ALL.Cannabinoid', 'ALL.Cannabinoid'),
                  ('ALL.Cocaine', 'ALL.Cocaine'),
                  ('ALL.Opiates', 'ALL.Opiates'),
                ('Gender', 'as.factor(Gender)'), 
                ('Race', 'as.factor(Race)'), 
                ('HAART', 'as.factor(HAART)'), 
                ('HCV', 'as.factor(HCV)'))),
]

age_factors = [('AgePast50', 'AgePast50')]
    
group_references = ['PN']
    
    

def linker_fun(tup):
    
    (eqn_name, items), (cohort_name, cohort_mask), cyto, age_factor, group_ref = tup
    data = pat_cyto_data.ix[cohort_mask]
    
    #print eqn_name
    if ('Sample' in eqn_name) and (group_ref != 'PN'):
        return None
    #print 'after', eqn_name
    factor_refs = {
                    'HCV':'False',
                    'Grouping':group_ref,
                    'Race':'Black/AA',
                    'HIVDI':'False',
                  }
    #print 'testing'
    output = RuiRegressTest(data, items+(age_factor,), factor_refs, cyto, debug = False)
    if output is None:
        #print 'wrong'
        return None
    #print 'yes'
    ttables, nitems = output
    ttables['Cytokine'] = cyto
    ttables['Correction'] = eqn_name
    ttables['CohortName'] = cohort_name
    ttables['DrugReference'] = group_ref
    ttables['AgeFactor'] = age_factor[0]
    ttables['NumSamples'] = len(nitems)
    return ttables
    
test_items = product(eqns, sens_cohorts, cytos, age_factors, group_references)
num_to_do = len(eqns)*len(subcohorts)*len(cytos)*len(age_factors)*len(group_references)

sense_test_res = []

#with ProcessPoolExecutor(max_workers = 20) as E:
results = imap(linker_fun, test_items)
for num, res in enumerate(results):
    if res is None:
        #print 'wrongness'
        continue
    #print res.T.to_string()
    #raise KeyError
    sense_test_res.append(res)
    if num % 1000 == 0:
        print num, num_to_do

sens_data = concat([df.reset_index() for df in sense_test_res], axis = 0, ignore_index=True)


0 64

In [31]:
#['Correction', 'CohortName', 'AgeFactor', 'DrugReference', 'Cytokine']
for drug in drugs + ['HCV']:
    res = pivot_table(sens_data[sens_data['index'] == 'p-value'], rows = 'CohortName', cols = 'Cytokine', values = drug)<0.1
    res = res[cytos]
    plt.figure(figsize = (20, 20))
    plt.imshow(res.values, interpolation = 'nearest', cmap = get_cmap('copper_r'))
    _ = plt.xticks(range(len(res.columns)), res.columns, rotation = 90, fontsize = 16);
    _ = plt.yticks(range(len(res.index)), res.index, fontsize = 16);
    xpos = np.arange(len(res.columns))+0.5
    ypos = np.arange(len(res.index))+0.5
    PlottingTools.add_grid(plt.gca(), xpos, ypos, color = 'w', lw = 2)
    plt.ylabel('# Required Visits')
    plt.savefig('figures/HeatMaps/%s_sensitivity_analysis.png' % drug)



In [32]:
res = pat_cyto_data['NumTotalVisits'].groupby(level = ['Patient ID', 'VisitNum']).max()
for num in range(10):
    print num, (res >= num).sum()


0 113
1 113
2 95
3 87
4 66
5 47
6 34
7 22
8 15
9 8

In [33]:
from scipy.stats import ttest_ind

def group_fun(series):
    if not series.any():
        return 'PN'
    elif series.sum()>1:
        return 'MDU'
    elif series['ALL.Benzodiazepines']:
        return 'PBe'
    elif series['ALL.Cannabinoid']:
        return 'PCa'
    elif series['ALL.Cocaine']:
        return 'PC'
    elif series['ALL.Opiates']:
        return 'PO'
    
    
wanted_cols = ['Latest CD4 count', 'Log10 Viral Load', 'CD4:CD8', 'Latest CD8 count']
res = pat_cyto_data[wanted_cols].groupby(level = ['Patient ID', 'VisitNum']).max()
groupings = (pat_cyto_data[['ALL.Benzodiazepines', 
                            'ALL.Cannabinoid', 
                            'ALL.Cocaine', 
                            'ALL.Opiates']]>0).groupby(level = ['Patient ID', 'VisitNum']).any()
grouping_res = groupings.apply(group_fun, axis = 1)
check_data = res.copy()
check_data['Grouping'] = grouping_res

all_groups = grouping_res.unique()
out_checks = []
for col in wanted_cols:
    for A, B in combinations(all_groups, 2):
        maskA = check_data['Grouping'] == A
        maskB = check_data['Grouping'] == B
        dataA = check_data[col].ix[maskA].dropna().values
        dataB = check_data[col].ix[maskB].dropna().values
        _, pval = ttest_ind(dataA, dataB)
        out_checks.append((col, A, B, pval))

out_df = DataFrame(out_checks, columns = ['Data', 'GroupA', 'GroupB', 'pval'])
out_df.to_excel('tables/group_pval_table.xlsx')

In [34]:
tA = big_table['Pval'][['Cocaine', 'Cannabinoid']].ix['AA'].ix['HCV_AllSample'].ix[cytos]
fig = make_heatmap_df(-tA.applymap(np.log10), figsize = (2,10), colormap = 'copper_r', grid_kwargs=grid_kwargs)
cbar = plt.colorbar(ticks = -np.log10([1, 0.1, 0.05, 0.01, 0.001, 0.0001]))
plt.clim([0, 4])
cbar.ax.set_yticklabels(['1', '0.1', '0.05', '0.01', '0.001', '0.0001']);
plt.savefig('figures/HeatMaps/Coc_canab_cyto.png')



In [35]:
tB = big_table['Pval'][['Cocaine', 'Cannabinoid', 'HCV']].ix['AA'].ix['HCV_AllSample'].ix[cytos]
fig = make_heatmap_df(-tB.applymap(np.log10), figsize = (2,10), colormap = 'copper_r', grid_kwargs=grid_kwargs)
cbar = plt.colorbar(ticks = -np.log10([1, 0.1, 0.05, 0.01, 0.001, 0.0001]))
plt.clim([0, 4])
cbar.ax.set_yticklabels(['1', '0.1', '0.05', '0.01', '0.001', '0.0001']);
plt.savefig('figures/HeatMaps/HCV_Coc_canab_cyto.png')



In [36]:
wanted = [('GroupingCorrection', 'AA', 'CCM'),
          ('HCV_AtSample', 'AA-Restricted', 'Restricted-wLCM'),
          ('HCV_AtSample', 'AA', 'At-Sample'),
          ('HCV_ToSample', 'AA', 'To-Sample'),
          ('HCV_AllSample', 'AA', 'All-Sample')]
cols = ['Cocaine', 'Cannabinoid']
order = ['CCM', 'Restricted-wLCM', 'At-Sample',
         'To-Sample', 'All-Sample']

for typ, colormap in [('Effect', 'RdYlGn'), ('Pval', 'copper_r')]:
    for drug in cols:
        tdict = {}
        for check, cohort, name in wanted:
            tdict[name] =  big_table[typ][drug].ix[cohort].ix[check].ix[cytos]
        tdf = DataFrame(tdict)
        if typ == 'Pval':
            tdf = tdf.applymap(np.log10)
        else:
            tdf = tdf.applymap(np.sign)
        fig = make_heatmap_df(-tdf[order], 
                              figsize = (2,10), 
                              colormap = colormap, 
                              grid_kwargs=grid_kwargs)
        if typ == 'Pval':
            cbar = plt.colorbar(ticks = -np.log10([1, 0.1, 0.05, 0.01, 0.001, 0.0001]))
        else:
            cbar = plt.colorbar()
            
        plt.savefig('figures/HeatMaps/drexel_cyto_heatmap_%s_%s.png' % (drug, typ))
        plt.close()

In [37]:
from subprocess import call
import shlex

cmd = 'rsync -rvh /home/will/HIVSystemsBio/NewCytokineAnalysis/ /home/will/Dropbox/Wigdahl\ HIV\ Lab/NewCytokineAnalysis/'
call(shlex.split(cmd))


Out[37]:
0

In [37]: