In [1]:
from __future__ import division
from pandas import *
import os, os.path
import sys
import numpy as np

sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')

In [2]:
store = HDFStore('/home/will/HIVReportGen/Data/SplitRedcap/2013-01-16/EntireCohort.hdf')

In [3]:
redcap_data = store['redcap']
seq_data = store['seq_data']

t = redcap_data['Event Name'].dropna().apply(lambda x: x.split(' - ')[0])
t.unique()
redcap_data['Patient visit number'] = redcap_data['Patient visit number'].combine_first(t)

In [6]:
wanted_cols = ['Patient ID', 'Patient visit number', 'Date of visit', 'Latest CD4 count (cells/uL)', 'Latest viral load', 'Current ART status']
wanted_redcap = redcap_data[wanted_cols]

data = merge(wanted_redcap, seq_data[['LTR-bin-align']],
            left_on = ['Patient ID', 'Patient visit number'],
            right_index = True, how = 'inner').dropna()
data = data.rename(columns= {
                                'Patient visit number':'VisitNum',
                                'Date of visit':'Date',
                                'Latest CD4 count (cells/uL)':'CD4',
                                'Latest viral load':'VL',
                                'Current ART status':'ART'
                            })
print data


<class 'pandas.core.frame.DataFrame'>
Int64Index: 954 entries, 1 to 1397
Data columns:
Patient ID       954  non-null values
VisitNum         954  non-null values
Date             954  non-null values
CD4              954  non-null values
VL               954  non-null values
ART              954  non-null values
LTR-bin-align    954  non-null values
dtypes: float64(2), object(5)

In [7]:
data['ART'].unique()


Out[7]:
array([on, non-adherent, off, naive], dtype=object)

In [53]:
def QuantitateChange(seq_list):
    
    delta = 0
    for a, b in zip(seq_list, seq_list[1:]):
        delta =+ (np.array(a[200:-100]) == np.array(b[200:-100])).sum()
    return delta/len(seq_list)

In [54]:
def ProcessPatient(df):
    
    if len(df) < 2:
        return Series({
                    'NiaveVisits':np.nan,
                    'ARTVisits':np.nan,
                    'NumWellControlled':np.nan,
                    'LongestWellControlled':np.nan,
                    'NumUnControlled':np.nan,
                    'ControlledVariation':np.nan,
                    'UnControlledVariation':np.nan})
    
    num_niave = (df['ART'] == 'naive').sum()
    num_on = (df['ART'] == 'on').sum()
    
    well_controlled = (df['VL'] <= 100) & (df['CD4'] >= 250)
    num_well = well_controlled.sum()
    num_uncontrolled = (~well_controlled).sum()
    
    run = 0
    longest_run = 0
    for well in well_controlled.values:
        if well:
            run += 1
        else:
            longest_run = max(longest_run, run)
            run = 0
        
    longest_well_controlled = longest_run
    
    well_controlled_variation = 0
    if num_well > 1:
        well_controlled_variation = QuantitateChange(df['LTR-bin-align'][well_controlled])
    
    uncontrolled_variation = 0
    if num_uncontrolled > 1:
        uncontrolled_variation = QuantitateChange(df['LTR-bin-align'][~well_controlled])
        
        
        
    return Series({
                    'NiaveVisits':num_niave,
                    'ARTVisits':num_on,
                    'NumWellControlled':num_well,
                    'LongestWellControlled':longest_well_controlled,
                    'NumUnControlled':num_uncontrolled,
                    'ControlledVariation':well_controlled_variation,
                    'UnControlledVariation':uncontrolled_variation})
    

ndata = data.groupby('Patient ID').apply(ProcessPatient)
print ndata


<class 'pandas.core.frame.DataFrame'>
Index: 437 entries, A0001 to A0499
Data columns:
ARTVisits                224  non-null values
ControlledVariation      224  non-null values
LongestWellControlled    224  non-null values
NiaveVisits              224  non-null values
NumUnControlled          224  non-null values
NumWellControlled        224  non-null values
UnControlledVariation    224  non-null values
dtypes: float64(7)

In [55]:
niave_to_art = (ndata['NiaveVisits']>0) & (ndata['ARTVisits']>1)

print ndata.ix[niave_to_art].to_string()

uncontrolled_to_well_controlled = (ndata['LongestWellControlled'] >= 2) & (ndata['NumUnControlled'] >= 1)
print ndata.ix[uncontrolled_to_well_controlled].to_string()


            ARTVisits  ControlledVariation  LongestWellControlled  NiaveVisits  NumUnControlled  NumWellControlled  UnControlledVariation
Patient ID                                                                                                                               
A0058               4                 81.5                      0            1                1                  4               0.000000
A0120               2                161.0                      0            2                3                  2              62.333333
A0198               2                  0.0                      1            1                2                  1             141.500000
A0199               4                 56.5                      3            3                3                  4             110.000000
A0209               3                163.5                      1            1                3                  2             108.333333
A0312               2                162.0                      0            2                2                  2             140.000000
            ARTVisits  ControlledVariation  LongestWellControlled  NiaveVisits  NumUnControlled  NumWellControlled  UnControlledVariation
Patient ID                                                                                                                               
A0005               4            44.000000                      2            0                1                  3               0.000000
A0008               6            65.800000                      3            0                1                  5               0.000000
A0013               5            86.000000                      3            0                2                  3             154.500000
A0025               3            76.666667                      3            0                2                  3             163.500000
A0047               3            67.000000                      2            0                2                  2             126.500000
A0062               8            82.500000                      2            0                4                  4              59.000000
A0067               8            74.000000                      2            0                5                  3              39.800000
A0074               3           164.000000                      2            0                1                  2               0.000000
A0117               6            60.600000                      5            0                1                  5               0.000000
A0192               4            99.333333                      3            0                1                  3               0.000000
A0199               4            56.500000                      3            3                3                  4             110.000000
A0257               3           155.500000                      2            0                1                  2               0.000000
A0274               3           140.500000                      2            0                2                  2             152.000000
A0284               7            82.250000                      2            0                3                  4             109.666667
A0305               6            63.400000                      4            0                1                  5               0.000000

In [210]:
max_day = ndata['DateFromImpaired'].max()
min_day = ndata['DateFromImpaired'].min()

time_axis = np.arange(min_day, max_day)

good_max = ndata['DateFromImpaired'][ndata['DateFromImpaired']>0].median()
good_min = ndata['DateFromImpaired'][ndata['DateFromImpaired']<0].median()
print good_max, good_min


11.8666666667 -18.2

In [211]:
npats = len(ndata['Patient ID'].unique())
ltr_len = len(data['LTR-bin-align'].values[0])
img_mat = np.ones((len(time_axis), ltr_len, npats))*np.nan


for depth, (pat, df) in enumerate(ndata.groupby('Patient ID')):
    
    ndf = df.copy()
    ndf.sort('DateFromImpaired')
    #ltr_mat = np.vstack(ndf['LTR-bin-align'].values)
    seqs = list(ndf['LTR-bin-align'].values)
    nseqs = [seqs[0]] + seqs + [seqs[-1]]
    times = list(ndf['DateFromImpaired'].values)
    ntimes = [min_day] + times + [max_day]
    
    for start_time, stop_time, ltr in zip(ntimes, ntimes[1:], nseqs):
        start_ind = int(start_time - min_day)
        stop_ind = int(stop_time - min_day)
        #print ntimes, start_time, start_ind, stop_time, stop_ind, ltr
        #raise KeyError
        img_mat[start_ind:stop_ind,:,depth] = ltr

    #print ltr_mat
    #raise KeyError

In [212]:
from scipy.stats import ttest_ind
t_img = nanmean(img_mat, axis = 2)
pre_mask = time_axis>0

pvals = []
for n in range(ltr_len):
    pre_data = t_img[pre_mask, n]
    post_data = t_img[~pre_mask, n]
    if len(post_data)>5 and len(pre_data)>5:
        _, pval = ttest_ind(pre_data[pre_data == pre_data], 
                            post_data[post_data == post_data])
    else:
        pval = np.nan
    pvals.append(pval)
pvals = np.array(pvals)

In [224]:
from pylab import get_cmap
plt.figure(figsize = (10,10))
good_min = -24
good_max = 24
plot_mask = (time_axis > good_min) & (time_axis < good_max)

tmp = pvals < 1e-30
n_img = t_img.copy()
n_img = n_img[plot_mask, :]
n_img = n_img[:, tmp]
ticks = np.where(tmp)

plt.imshow(np.flipud(n_img), 
                interpolation = 'nearest', 
                cmap = get_cmap('gray'), 
                aspect = 'auto', vmin = 0, vmax = 1.0,
                extent = (0, len(ticks[0]), good_max, good_min))
plt.xticks(np.arange(0, len(ticks[0]), 20), ticks[0][::20], rotation=90)
plt.colorbar()
plt.ylabel('Months from Last Impaired visit');
plt.xlabel('LTR pos');
plt.title('Significant LTR')


Out[224]:
<matplotlib.text.Text at 0x5e3b7150>

In [179]:
tmp = np.abs(np.diff(t_img, axis = 0)).sum(axis = 0)
#print tmp
plt.hist(tmp, bins = 50)


Out[179]:
(array([43, 35,  0, 19,  3, 34, 23,  0, 35, 15, 14, 23,  0, 15,  6, 11, 35,
        0, 24, 16, 12, 17,  0, 43, 12, 22, 31,  0, 30,  1, 37, 17,  0, 16,
        4,  2,  5,  0,  3,  0,  6,  2,  0,  8,  4,  0,  5,  0,  1,  1]),
 array([ 0.        ,  0.01666667,  0.03333333,  0.05      ,  0.06666667,
        0.08333333,  0.1       ,  0.11666667,  0.13333333,  0.15      ,
        0.16666667,  0.18333333,  0.2       ,  0.21666667,  0.23333333,
        0.25      ,  0.26666667,  0.28333333,  0.3       ,  0.31666667,
        0.33333333,  0.35      ,  0.36666667,  0.38333333,  0.4       ,
        0.41666667,  0.43333333,  0.45      ,  0.46666667,  0.48333333,
        0.5       ,  0.51666667,  0.53333333,  0.55      ,  0.56666667,
        0.58333333,  0.6       ,  0.61666667,  0.63333333,  0.65      ,
        0.66666667,  0.68333333,  0.7       ,  0.71666667,  0.73333333,
        0.75      ,  0.76666667,  0.78333333,  0.8       ,  0.81666667,
        0.83333333]),
 <a list of 50 Patch objects>)

In [182]:
t_img.shape


Out[182]:
(48, 35)

In [ ]: