In [7]:
import os, os.path
import sys
import pandas as pd
import numpy as np
sys.path.append('/home/will/PatientPicker/')
sys.path.append('/home/will/PySeqUtils/')
import LoadingTools

In [11]:
# Load the full REDCap clinical dataset via the project-local helper
# (presumably indexed by Patient ID / VisitNum -- later cells groupby on
# those levels; confirm the index layout in LoadingTools).
redcap_data = LoadingTools.load_redcap_data()

In [13]:
def get_months(dt):
    """Convert a single timedelta into approximate months.

    The value is first truncated to whole days (the timedelta64[D] cast
    drops sub-day precision), then divided by a 30-day month.
    """
    whole_days = dt.astype('timedelta64[D]') / np.timedelta64(1, 'D')
    return whole_days / 30

# For every patient in each joined pair, collect visit dates and compute
# months elapsed since that patient's earliest (lowest VisitNum) visit,
# then export the combined table for manual review.
# NOTE(review): `pat_joins` is defined in the In[8] cell below -- this cell
# only works because of out-of-order execution; move In[8] above this cell.
date_data = redcap_data[['Patient ID', 'VisitNum','Date Of Visit']]
frames = []
for tup in pat_joins:
    
    for pat in tup:
        mask = date_data['Patient ID'] == pat
        tmp = date_data[mask].copy()
        # BUG FIX: DataFrame.sort is not in-place; the original discarded the
        # sorted result, so visits were never actually ordered.
        tmp = tmp.sort('VisitNum')
        # coerce=True is this pandas vintage's spelling of errors='coerce'
        # (unparseable dates become NaT).
        tmp['Date'] = pd.to_datetime(tmp['Date Of Visit'], coerce = True)
        tmp = tmp.drop(['Date Of Visit'], axis = 1)
        try:
            # Elapsed time from this patient's first visit, in ~30-day months.
            tmp['Months from R00'] = (tmp['Date'] - tmp['Date'].iloc[0]).map(get_months)
        except IndexError:
            # Patient has no visits in redcap_data; keep the empty frame.
            pass
        frames.append(tmp.copy())
    
    
extra_res = pd.concat(frames, axis = 0, ignore_index = True)
extra_res.to_excel('/home/will/Downloads/joining_patients.xlsx')

In [8]:
# Hand-curated pairs of REDCap patient IDs believed to be the same person;
# the first ID in each tuple is treated as the canonical one elsewhere.
pat_joins = [('A0138', 'A0360'),
             ('A0017', 'A0054'),
             ('A0418', 'A0423'),
             ('A0234', 'A0266'),
             ('A0410', 'A0424'),
             ('A0082', 'A0346'),
             ('A0077', 'A0370'),
             ('A0098', 'A0126'),
             ('A0023', 'A0114'),
             ('A0131', 'A0157'),
             ('A0059', 'A0403'),
             ('A0016', 'A0053'),
             ('A0219', 'A0442')]
# Scratch pass (disabled) that rewrote joined IDs onto the canonical
# 'Patient ID' -- Python 2 print syntax; kept for reference.
#pat_index = redcap_data['Patient ID']
#orig = pat_index.copy()
#for joins in pat_joins:
#    base_pat = joins[0]
#    for join_pat in joins[1:]:
#        print base_pat, join_pat, (pat_index == join_pat).sum()
#        pat_index[pat_index == join_pat] = base_pat
#(orig != pat_index).sum()

In [2]:
# Load every LANL search-result dump (tab-separated, first column is the row
# index) and stack them into one frame.  NOTE: Python 2 print statements.
import glob
ld = []
for f in glob.glob('/home/will/SubCData/LANLRes/LANLResults_*.txt'):
    print f
    ld.append(pd.read_csv(f, sep='\t', index_col=0))
print 'concating'    
lanl_data = pd.concat(ld, axis = 0, ignore_index=True)


/home/will/SubCData/LANLRes/LANLResults_SubB_pre2000.txt
/home/will/SubCData/LANLRes/LANLResults_SubC.txt
/home/will/SubCData/LANLRes/LANLResults_F.txt
/home/will/SubCData/LANLRes/LANLResults_M.txt
/home/will/SubCData/LANLRes/LANLResults_SubB_post2000.txt
/home/will/SubCData/LANLRes/LANLResults_O.txt
concating

In [3]:
from types import StringType
def norm_id(inser):
    """Return the versionless sequence identifier for one LANL row.

    Prefers the 'Accession' column, falling back to 'GI number'; whichever
    holds a string has its version suffix stripped ('AB123.1' -> 'AB123').

    Raises ValueError when neither column holds a string.  (The original
    printed the row and raised KeyboardInterrupt to halt the notebook --
    an exception misuse replaced here with a proper error.)
    """
    cols = ['Accession', 'GI number']
    for col in cols:
        value = inser[col]
        if isinstance(value, str):
            return value.split('.')[0]
    raise ValueError('no usable string identifier in row: %r' % (inser,))
    

# Normalize every row to a versionless GenBank id, then keep the first row
# seen per id (the dumps overlap across subtype/gender queries).
lanl_data['GBid'] = lanl_data.apply(norm_id, axis = 1)
agg_lanl_data = lanl_data.groupby('GBid').first()

In [4]:
import glob
from Bio import SeqIO
from concurrent.futures import ProcessPoolExecutor
from itertools import islice, imap
import os, os.path
import csv

def get_gi_acc(fname):
    """Extract (GI id, accession) from one GenBank flat file.

    The GI id comes from the file name ('<gi>.gb'); the accession is the
    last token of the record's ACCESSION line.  Raises AssertionError when
    no ACCESSION line is present.
    """
    gi_name = fname.split('/')[-1].split('.')[0]
    with open(fname) as records:
        for raw_line in records:
            if raw_line.startswith('ACCESSION'):
                return gi_name, raw_line.strip().split()[-1]
    raise AssertionError



# Build (or reload from a CSV cache) the GI-id -> accession lookup table.
gi_to_acc_dict = {}

fname = '/home/will/WLAHDB_data/gi_to_acc.csv'
if os.path.exists(fname):
    # Fast path: mapping was generated by a previous run.
    with open(fname) as handle:
        for row in csv.reader(handle):
            gi_to_acc_dict[row[0]] = row[1]
else:
    # Slow path: scan every downloaded GenBank record once and persist the
    # mapping as it is built.  NOTE: Python 2 (imap, print statement).
    gb_files = glob.glob('/home/will/WLAHDB_data/GenbankDL/*.gb')
    with open(fname, 'w') as handle:
        writer = csv.writer(handle)
        for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
            # Coarse progress report: once at 100, then every 50k files.
            if (num == 100) or (num % 50000 == 0):
                print num
            gi_to_acc_dict[gbm] = acc
            writer.writerow((gbm, acc))

In [8]:
import GeneralSeqTools
# Read per-subtype FASTA dumps.  File names look like '<subtype>_<prot>.<ext>',
# so the protein/region label is recovered from the base name.
files = [('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))),
         ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0]
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                seqs.append({
                             'Seq':seq,
                             # FASTA headers carry GI ids; translate to accessions.
                             'ID':gi_to_acc_dict[name],
                             'Prot':prot,
                             'Subtype':sub
                             })
            
seqdf = pd.DataFrame(seqs)

In [10]:
# Pivot to one row per (Subtype, accession) with one column per region;
# cells hold the number of sequences present.  NOTE: rows=/cols= and .ix
# are pre-0.14 pandas spellings (index=/columns= and .loc in modern pandas).
seqdf['HasSeq'] = 1.0
cols = ['CD4 count', 'Viral load']
seq_counts = pd.pivot_table(seqdf,
                            rows = ['Subtype', 'ID'],
                            cols = 'Prot',
                            values = 'HasSeq',
                            aggfunc = 'sum')
# Right-join the LANL clinical columns onto the subtype-C accessions so both
# frames share the same index.
ex_data, seq_count_aln = agg_lanl_data[cols].align(seq_counts.ix['C'], axis = 0, join = 'right')

In [26]:
# Regions whose joint presence defines the 'Has44' flag -- presumably the
# ~4.4kb vpr..LTR amplicon; confirm against the sequencing protocol.
fourkb_cols = ['vpr', 'tat', 'vpu', 'env', 'ltr']

ex_data['CD4'] = ex_data['CD4 count'].notnull()
ex_data['VL'] = ex_data['Viral load'].notnull()
ex_data['HasV3'] = seq_count_aln['v3'].notnull()
# Sum of per-region counts == 5 means all five regions present -- assumes at
# most one sequence per region per accession; TODO confirm that assumption.
ex_data['Has44'] = seq_count_aln[fourkb_cols].fillna(0).sum(axis = 1)==5

# Boolean presence flag for every individual region column.
for col in seq_count_aln.columns:
    ex_data[col] = seq_count_aln[col].notnull()

In [37]:
# Scratch check: metadata-flag coverage per accession.  NOTE(review):
# `other_cols` is defined in the In[44] cell below -- this cell relies on
# out-of-order execution.
ex_data[other_cols].sum(axis=1)


Out[37]:
ID
AB002978    1
AB002981    1
AB012987    1
AB012990    1
AB013119    1
AB013123    1
AB014776    1
AB014780    1
AB014783    1
AB014789    1
AB014793    1
AB014794    1
AB014813    1
AB014819    1
AB014825    1
...
Z76295    1
Z76296    1
Z76297    1
Z76298    1
Z76299    1
Z76301    1
Z76304    1
Z76306    1
Z76310    1
Z76342    1
Z76734    1
Z92584    2
Z95462    1
Z95464    1
Z98033    0
Length: 46111, dtype: int64

In [44]:
from itertools import product
# Cross-tabulate: for each region, how many accessions also carry each piece
# of metadata.  'ASeqs' = total accessions with the region; '_All' = region
# present AND all four flags set (sum(axis=1)==4 requires CD4, VL, HasV3 and
# Has44 simultaneously -- the Out[45] table shows this is always 0 here).
ncounts = []
other_cols = ['CD4', 'VL', 'HasV3', 'Has44']
for col, ocol in product(seq_count_aln.columns, other_cols+['ASeqs', '_All']):
    if ocol == 'ASeqs':
        ncounts.append({
                    'Prot': col,
                    'Col': ocol,
                    'Count': ex_data[col].sum()
                    })
    elif ocol == '_All':
        ncounts.append({
                    'Prot': col,
                    'Col': ocol,
                    'Count': (ex_data[col]*(ex_data[other_cols].sum(axis=1)==4)).sum()
                    })
    else:
        # Boolean multiplication == logical AND of the two flags.
        ncounts.append({
                        'Prot': col,
                        'Col': ocol,
                        'Count': (ex_data[col]*ex_data[ocol]).sum()
                        })
# rows=/cols= are pre-0.14 pandas keyword spellings (index=/columns=).
table = pd.pivot_table(pd.DataFrame(ncounts),
               rows = 'Prot',
               cols = 'Col',
               values = 'Count',
               aggfunc = 'sum')
table.to_excel('/home/will/Dropbox/Wigdahl HIV Lab/SubCAnalysis/new_count_table.xlsx')

In [45]:
# Display the per-region count table (rendered below as Out[45]).
table


Out[45]:
Col ASeqs CD4 Has44 HasV3 VL _All
Prot
env 11834 114 268 11823 4429 0
gag 7826 136 10 2452 724 0
gp120 12620 111 262 12610 4412 0
gp41 12423 120 268 12225 4648 0
ltr 3146 120 268 2384 909 0
nef 4125 173 267 2542 869 0
pol 2236 95 10 2201 597 0
rev 2223 22 0 2197 698 0
tat 854 5 268 342 135 0
v3 37204 123 268 37204 6518 0
vif 3564 112 263 2931 992 0
vpr 3458 111 268 2888 951 0
vpu 3796 114 268 3144 995 0

In [ ]:


In [82]:
# Hand-curated (patient, visit) pairs wanted in the final export.
wanted_pvs = [('A0001','R02'),
              ('A0004','R02'),('A0004','R07'),
              ('A0023','R01'),('A0041','R02'),
              ('A0046','R02'),('A0048','R02'),
              ('A0052','R01'),('A0070','R01'),
              ('A0071','R01'),('A0072','R01'),
              ('A0107','R05'),('A0110','R02'),
              ('A0111','R01'),('A0127','R01'),
              ('A0128','R00'),('A0139','R00'),
              ('A0143','R00'),('A0152','R00'),
              ('A0164','R00'),('A0165','R00'),
              ('A0172','R00'),('A0208','R00'),
              ('A0213','R00'),('A0225','R00'),
              ('A0252','R00'),('A0259','R00'),
              ('A0278','R00'),('A0281','R00'),
              ('A0310','R00'),('A0326','R00'),
              ('A0365','R00'),('A0367','R05'),
              ('A0377','R00'),('A0378','R00'),
              ('A0396','R00'),('A0403','R01'),
              ('A0421','R00')]

# Output column layout; every name after the first two must exist in
# `outredcap` (note the KeyError on 'Days since baseline' in cell In[89]).
col_order = ['Patient ID','VisitNum','Age','NumTotalVisits','Latest CD4 count','Current Alcohol Use',
             'Current Tobacco Use','Days since baseline','Gender','Race','HAART','Hepatitis C status (HCV)',
             'HIVD score','HIVD.I','Hepatitis B status (HBV)','Latest CD8 count','Nadir CD4 count',
             'Nadir CD8 count','Peak viral load','Latest Viral load','Years Seropositive',
             'TOSample.Benzodiazepines','TOSample.Cannabinoid','TOSample.Cocaine','TOSample.Opiates',
             'ALL.Benzodiazepines','ALL.Cannabinoid','ALL.Cocaine','ALL.Opiates',
             'ATSample.Amphetamines','ATSample.Barbiturates','ATSample.Benzodiazepines',
             'ATSample.Cannabinoid','ATSample.Cocaine','ATSample.Opiates','ATSample.Phencyclidine']

wanted_pvs = pd.DataFrame(wanted_pvs, columns = ['Patient ID', 'VisitNum'])

In [83]:
# Per-visit drug-test columns, plus expanding (to-sample) and overall means
# per patient, each renamed with a distinguishing prefix.
# BUG FIX: the original built a one-pair dict from the comprehension's leaked
# last `col` and passed it *positionally* to .rename(), which renames the
# index (first positional arg), not the columns.  Build the full mapping and
# pass it as columns= instead.
test_cols = [col for col in redcap_data.columns if col.startswith('Test-')]

def _prefix_map(prefix):
    # Map every raw 'Test-*' column onto its prefixed output name.
    return dict((c, c.replace('Test-', prefix)) for c in test_cols)

at_sample = redcap_data[test_cols].rename(columns=_prefix_map('ATSample.'))
# Expanding mean = cumulative average of all tests up to and including each visit.
to_sample = (redcap_data.groupby(level = ['Patient ID'])[test_cols]
             .transform(pd.expanding_mean)
             .rename(columns=_prefix_map('TOSample.')))
# Patient-wide mean broadcast back to every visit row.
all_sample = (redcap_data.groupby(level = ['Patient ID'])[test_cols]
              .transform(lambda x: x.mean())
              .rename(columns=_prefix_map('ALL.')))

In [84]:
def mark_haart(inrow):
    """Collapse the four HAART indicator columns into one status code.

    Returns 'nH' (naive), 'dH' (non-adherent or off), 'cH' (currently on),
    or NaN when no indicator is set.  Precedence follows the original
    column order: Naive, Non-Adherent, Off, On.
    """
    status_order = [('HAART-Naive', 'nH'),
                    ('HAART-Non-Adherent', 'dH'),
                    ('HAART-Off', 'dH'),
                    ('HAART-On', 'cH')]
    for col_name, code in status_order:
        if inrow[col_name]:
            return code
    return np.nan
    
    
haart_cols = [col for col in redcap_data.columns if col.startswith('HAART')]
# NOTE(review): groupby(...).apply(mark_haart) hands mark_haart a per-group
# DataFrame, so `inrow[col]` is a (one-element) Series and the `if` relies on
# old-pandas truthiness of single-element Series -- confirm before upgrading.
haart_data = redcap_data.groupby(level = [0,1])[haart_cols].apply(mark_haart)
# Total visit count per patient, broadcast back to every row via transform.
num_data = redcap_data.groupby(level = [0])[['Age']].transform(len).rename(columns={'Age':'NumTotalVisits'})

In [87]:
def make_impaired_call(val):
    """Label a score: below 10 is 'Impaired', 10 or above is 'Not Impaired'.

    NaN scores fail both comparisons and fall through to NaN.
    """
    if val >= 10:
        return 'Not Impaired'
    if val < 10:
        return 'Impaired'
    return np.nan

# TMHDS score (presumably the Total Modified HIV Dementia Scale -- confirm)
# plus a categorical impaired/not-impaired call at the cutoff of 10.
hivd_data = pd.DataFrame({'HIVD score':redcap_data['TMHDS'],
                          'HIVD.I':redcap_data['TMHDS'].map(make_impaired_call)})
# NOTE(review): placeholder -- no 'Days since baseline' column is ever
# computed or merged, which is why cell In[89] below raises a KeyError.
days_data = redcap_data

In [88]:
# Map verbose REDCap column names onto the short names used in col_order.
# BUG FIX: the original dict listed 'Nadir CD4 count (cells/uL)' as a key
# twice; the second entry silently overwrote the first, so nadir CD4 was
# renamed to 'Nadir CD8 count' and no 'Nadir CD4 count' column existed.
rename_dict = {'Latest CD4 count (cells/uL)':'Latest CD4 count',
               'Latest viral load': 'Latest Viral load',
               'Latest CD8 count (cells/uL)':'Latest CD8 count',
               'Nadir CD4 count (cells/uL)':'Nadir CD4 count',
               # presumably the intended CD8 source column -- confirm it exists
               'Nadir CD8 count (cells/uL)':'Nadir CD8 count',
               'Peak viral load (copies/mL)':'Peak viral load',
               }
# Column-wise concat: all pieces share the redcap_data row index.
outredcap = pd.concat([redcap_data.rename(columns=rename_dict), at_sample, to_sample, all_sample, num_data,
                       pd.DataFrame({'HAART':haart_data}), hivd_data], 
                      axis=1)

In [89]:
# Select the export columns.  This raises the KeyError shown below because
# 'Days since baseline' was never added to outredcap (`days_data` above is
# just redcap_data and is never merged in).
outredcap[col_order[2:]]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-89-182132bdaeea> in <module>()
----> 1 outredcap[col_order[2:]]

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1919         if isinstance(key, (np.ndarray, list)):
   1920             # either boolean or fancy integer index
-> 1921             return self._getitem_array(key)
   1922         elif isinstance(key, DataFrame):
   1923             return self._getitem_frame(key)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
   1951             return self.take(indexer, axis=0, convert=False)
   1952         else:
-> 1953             indexer = self.ix._convert_to_indexer(key, axis=1)
   1954             return self.take(indexer, axis=1, convert=True)
   1955 

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _convert_to_indexer(self, obj, axis)
    574                             to_or = lib.map_infer(lvalues, x.__eq__)
    575                             if not to_or.any():
--> 576                                 raise KeyError('%s not in index' % str(x))
    577                             mask |= to_or
    578 

KeyError: 'Days since baseline not in index'

In [46]:
# Leftover IPython help lookup (`?` syntax); remove from the final notebook.
pd.rolling_mean?

In [ ]: