In [1]:
# Core analysis stack (Python 2.7 notebook).
from pandas import DataFrame, Series, merge, read_csv, MultiIndex, Index, concat
from subprocess import check_call
from tempfile import NamedTemporaryFile as NTF
import os, os.path
import numpy as np
from scipy.stats import ttest_ind
from itertools import groupby,combinations, islice
from operator import itemgetter
from Bio import Phylo
import networkx
import sys
import pickle

from random import shuffle
import csv, shlex, shutil

# NOTE(review): hardcoded absolute paths tie this notebook to one machine;
# consider a configurable base directory. (Double slash in the path is
# harmless to os.chdir.)
os.chdir('/home/will/HIVTropism//')
sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')

In [2]:
from SeqProcessTools import read_pat_seq_data, load_training_seq_data, align_seq_data_frame
from GeneralSeqTools import fasta_reader, WebPSSM_V3_fasta, yield_chunks
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import glob
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from itertools import chain, product


/usr/local/lib/python2.7/dist-packages/futures/__init__.py:24: DeprecationWarning: The futures package has been deprecated. Use the concurrent.futures package instead.
  DeprecationWarning)

In [3]:
# Load the tropism labels and the per-patient sequence table from disk.
# CAUTION: pickle.load on files is only safe because these are locally
# generated artifacts.
with open('trop_dict.pkl') as handle:
    trop_dict = pickle.load(handle)

with open('wanted_data.pkl') as handle:
    wanted_data = pickle.load(handle)

# Re-key trop_dict from the frame's current index to the patient 'Name'
# column, so lookups downstream can use names directly.
trans_dict = wanted_data['Name'].to_dict()
ntrop_dict = {trans_dict[old_key]: trop for old_key, trop in trop_dict.items()}
trop_dict = ntrop_dict

# From here on the frame is indexed by patient name.
wanted_data = wanted_data.set_index('Name')

In [ ]:


In [25]:
# Export one FASTA per region for the "4kb" patient set (patients with all
# six regions present), restricted to the IDs in Benj's tropism-label CSV.
# NOTE(review): near-duplicate of the In[26] cell below -- candidate for a
# shared helper function.
from GeneralSeqTools import fasta_writer
fourkb_cols = ['gp120-seq-align', 'Nef-seq-align', 'Vpr-seq-align', 
                 'Tat-1-seq-align', 'Tat-2-seq-align', 'LTR-seq-align']
# Keep only patients with every region aligned.
four = wanted_data[fourkb_cols].dropna()
wseqs = set()
with open('/home/will/Dropbox/HIVseqs/BensTropismLabels.csv') as handle:
    for row in csv.DictReader(handle, delimiter=','):
        wseqs.add(row['Patient ID'])

        
for col in four.columns:
    found = set()  # guards against duplicate patient names in the index
    prot = col.rsplit('-', 2)[0]  # e.g. 'Tat-1-seq-align' -> 'Tat-1'
    fname = 'AlignForBenj/fourKB_%s.fasta' % prot
    with open(fname, 'w') as handle:
        for seq, name in zip(four[col], four.index):
            if name in wseqs and name not in found:
                # FASTA header carries the tropism call: '<name>-<tropism>'
                fasta_writer(handle, [(name+'-'+trop_dict[name], ''.join(seq))])
                found.add(name)
    print prot, len(found)


gp120 25
Nef 25
Vpr 25
Tat-1 25
Tat-2 25
LTR 25

In [26]:
# Same per-region FASTA export as above, but for every patient that has a
# gp120 sequence (rather than Benj's labelled subset).
wseqs = set(wanted_data['gp120'].dropna().index)
cols = ['gp120-seq-align', 'Nef-seq-align', 'Vpr-seq-align', 
                 'Tat-1-seq-align', 'Tat-2-seq-align', 'LTR-seq-align']
for col in cols:
    prot = col.rsplit('-', 2)[0]  # e.g. 'Tat-1-seq-align' -> 'Tat-1'
    fname = 'AlignForBenj/has_env_%s.fasta' % prot
    seq_ser = wanted_data[col].dropna()
    found = set()  # skip duplicate patient names in the index
    with open(fname, 'w') as handle:
        for name, seq in zip(seq_ser.index, seq_ser):
            if (name in wseqs) and (name not in found):
                # FASTA header carries the tropism call: '<name>-<tropism>'
                record = (name + '-' + trop_dict[name], ''.join(seq))
                fasta_writer(handle, [record])
                found.add(name)

In [4]:
def yield_regions(trop_dict):
    """Generate sliding-window sub-alignments for every region/patient group.

    Yields tuples (group_name, subtype, prot, start, win_size, nseqs,
    trop_dict) suitable for calculate_region(), where nseqs is a list of
    (patient_name, windowed_sequence) pairs.

    Reads the module-level `wanted_data` frame (indexed by patient name).
    """
    
    regions = ['LTR-seq-align',
               'gp41-seq-align',
               'gp120-seq-align',
               'Nef-seq-align',
               'Vpr-seq-align',
               'Tat-1-seq-align',
               'Tat-2-seq-align',
                ]
    tail_cols = ['gp120', 'gp41', 'Nef', 'Vpr', 
                 'Tat-1', 'Tat-2', 'LTR']
    fourkb_cols = ['gp120', 'Nef', 'Vpr', 
                 'Tat-1', 'Tat-2', 'LTR']
    # Each group is (label, index of patients that have ALL listed columns).
    groups = [('fourkb', wanted_data[fourkb_cols].dropna().index),
              ('full_env', wanted_data[['gp120', 'gp41']].dropna().index),
              ('full_tail', wanted_data[tail_cols].dropna().index),
              ]
    subs = ['SubB']
    win_sizes = [5, 10, 15, 20, 30, 35]
    
    for region, (gname, ind), sub in product(regions, groups, subs):
        prot = region.split('-')[0]
        # NOTE(review): .ix is removed in modern pandas; fine for the
        # py2-era pandas this notebook targets.
        gwanted = wanted_data.ix[ind]
        mask = gwanted['Sub'] == sub
        seq_ser = gwanted[mask][region].dropna()
        print prot, gname, sub, len(seq_ser)
        seqs = [(name, ''.join(list(seq))) for name, seq in zip(seq_ser.index, seq_ser.values)]
        # Assumes all sequences in an alignment column share one length;
        # raises IndexError if the group is empty -- TODO confirm acceptable.
        seq_len = len(seqs[0][1])
        
        for win, start in product(win_sizes, range(seq_len)):
            stop = start+win
            # NOTE(review): '<' (not '<=') drops the final window that ends
            # exactly at the sequence end -- confirm this is intentional.
            if stop < seq_len:
                nseqs = [(name, seq[start:stop]) for name, seq in seqs]
                yield gname, sub, prot, start, win, nseqs, trop_dict

In [5]:
import dendropy

In [6]:
from Bio.Alphabet import generic_dna, generic_protein
import TreeingTools
def calculate_region(arg):
    """Build (or reload from cache) a phylip tree for one sequence window and
    test whether the tropism groups separate on it.

    arg is a single packed tuple (so the function works with
    ProcessPoolExecutor.map): (gname, sub, prot, start, win, nseqs, trop_dict).

    Returns (gname, sub, prot, win, start, benj_res) where benj_res is either
    a dict of test statistics (possibly with 'AI' keys added) or a string
    describing why the window was skipped.
    """
    gname, sub, prot, start, win, nseqs, trop_dict = arg
    
    treename = 'newphyliptrees/%s-%s-%s-%i-%i.tree' % (gname, sub, prot, start, win)
    matfname = 'newphyliptrees/%s-%s-%s-%i-%i.pkl' % (gname, sub, prot, start, win)
    
    # Reuse cached results only when BOTH files are present: a crash between
    # writing the pkl and the tree would otherwise raise IOError below.
    if os.path.exists(treename) and os.path.exists(matfname):
        with open(matfname) as handle:
            dmat = pickle.load(handle)
            
        with open(treename) as handle:
            tree = dendropy.Tree.get_from_stream(handle, 'newick')
        
    else:
        
        # The LTR is nucleotide sequence; every other region is protein.
        is_aa = prot != 'LTR'
        alphabet = generic_protein if is_aa else generic_dna
        
        try:
            tree, dmat = TreeingTools.phylip_tree_collapse_unique(nseqs, alphabet=alphabet)
        except ValueError:
            benj_res = 'Too few unique sequences to process'
            return gname, sub, prot, win, start, benj_res
        except Exception:
            # Best-effort: report the failure but keep the worker alive.
            # (Was a bare except, which also swallowed KeyboardInterrupt.)
            benj_res = 'uncaught exception in dist-mat'
            return gname, sub, prot, win, start, benj_res
        with open(matfname, 'w') as handle:
            pickle.dump(dmat, handle)
        with open(treename, 'w') as handle:
            tree.write_to_stream(handle, 'newick')
    
    try:
        benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps = 50)
    except AssertionError:
        # check_distance_pvals asserts there are enough labelled groups.
        benj_res = 'too few groups'
        return  gname, sub, prot, win, start, benj_res
    except Exception:
        benj_res = 'uncaught exception'
        return  gname, sub, prot, win, start, benj_res
    
    
    # Association index is optional extra info; mark errors rather than fail.
    try:
        out = TreeingTools.evaluate_association_index(tree, trop_dict)
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = out
    except Exception:
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = ('error', 'error', 'error')
    
    return gname, sub, prot, win, start, benj_res

In [7]:
from itertools import groupby, imap
from operator import itemgetter
from types import StringType
from concurrent.futures import ThreadPoolExecutor


benj_fields = ['GroupName',
               'Subtype',
               'Prot',
                'Start',
                'WinSize',
                'Group2Mean',
                'Group2Std',
                'Group2Name',
                'Group1Mean',
                'Group1Std',
                'RawPval',
                'AdjPval',
                'Group1Name',
                'AI',
                'AI-pval',
                'AI-null']
fname = 'more_phylip_BenjRes.tsv'
benj_writer = csv.DictWriter(open(fname, 'w'), benj_fields, delimiter = '\t')
   

benj_writer.writeheader()

multi = True
print 'Starting multiprocessing!'
if multi:
    pool = ProcessPoolExecutor(max_workers = 30)
    results = pool.map(calculate_region, yield_regions(trop_dict))
else:
    results = imap(calculate_region, islice(yield_regions(trop_dict), 0,35))

for gname, sub, prot, win, start, benj_res in results:
    
    #print prot, start, win
    tdict = {
             'Prot':prot,
             'Start':start,
             'WinSize':win,
             'GroupName':gname,
             'Subtype':sub,
             }
    if type(benj_res) is StringType:
        if (benj_res == 'Already Processed') or benj_res.startswith('Too few unique sequences'):
            continue
        print benj_res, prot, start, win
    else:
        benj_res.update(tdict)
        benj_writer.writerow(benj_res)
    
        
if multi:
    pool.shutdown()


Starting multiprocessing!
LTR fourkb SubB 304
gp41 fourkb SubB 304
gp120 fourkb SubB 304
Nef fourkb SubB 304
Vpr fourkb SubB 304
Tat fourkb SubB 304
Tat fourkb SubB 304
uncaught exception gp41 156 5
uncaught exception gp41 157 5
uncaught exception gp41 158 5
uncaught exception gp41 302 5
uncaught exception gp41 303 5
uncaught exception gp41 304 5
uncaught exception gp41 305 5
uncaught exception gp41 306 5
uncaught exception gp41 307 5
uncaught exception gp41 302 10

In [ ]: