In [1]:
from pandas import DataFrame, Series, merge, read_csv, MultiIndex, Index, concat
from subprocess import check_call
from tempfile import NamedTemporaryFile as NTF
import os, os.path
import numpy as np
from scipy.stats import ttest_ind
from itertools import groupby, combinations, islice
from operator import itemgetter
from Bio import Phylo
import networkx
import sys
import pickle
from random import shuffle
import csv, shlex, shutil
os.chdir('/home/will/HIVTropism/')
sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')
In [2]:
from SeqProcessTools import read_pat_seq_data, load_training_seq_data, align_seq_data_frame
from GeneralSeqTools import fasta_reader, WebPSSM_V3_fasta, yield_chunks
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import glob
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from itertools import chain, product
In [3]:
# Load the tropism calls and the per-patient sequence table,
# then re-key trop_dict by patient Name instead of row key.
with open('trop_dict.pkl', 'rb') as handle:
    trop_dict = pickle.load(handle)
with open('wanted_data.pkl', 'rb') as handle:
    wanted_data = pickle.load(handle)
trans_dict = wanted_data['Name'].to_dict()
ntrop_dict = dict((trans_dict[key], val) for key, val in trop_dict.items())
trop_dict = ntrop_dict
wanted_data = wanted_data.set_index('Name')
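The re-keying above simply swaps each row key for its patient Name; a toy sketch of the same idiom with made-up values (not the real data):
# Hypothetical keys and values, for illustration only
trans = {0: 'A0001', 1: 'A0002'}   # row key -> patient Name
trop = {0: 'R5', 1: 'X4'}          # row key -> tropism call
assert dict((trans[k], v) for k, v in trop.items()) == {'A0001': 'R5', 'A0002': 'X4'}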
In [25]:
from GeneralSeqTools import fasta_writer

fourkb_cols = ['gp120-seq-align', 'Nef-seq-align', 'Vpr-seq-align',
               'Tat-1-seq-align', 'Tat-2-seq-align', 'LTR-seq-align']
four = wanted_data[fourkb_cols].dropna()

# Patient IDs that already have a tropism label from Ben
wseqs = set()
with open('/home/will/Dropbox/HIVseqs/BensTropismLabels.csv') as handle:
    for row in csv.DictReader(handle, delimiter=','):
        wseqs.add(row['Patient ID'])

# Write one FASTA per region; each header carries the tropism call
for col in four.columns:
    found = set()
    prot = col.rsplit('-', 2)[0]  # 'gp120-seq-align' -> 'gp120', 'Tat-1-seq-align' -> 'Tat-1'
    fname = 'AlignForBenj/fourKB_%s.fasta' % prot
    with open(fname, 'w') as handle:
        for seq, name in zip(four[col], four.index):
            if name in wseqs and name not in found:
                fasta_writer(handle, [(name + '-' + trop_dict[name], ''.join(seq))])
                found.add(name)
    print prot, len(found)
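For reference, fasta_writer is handed headers of the form name + '-' + tropism, so a record in fourKB_gp120.fasta looks roughly like this (hypothetical patient ID and truncated sequence):
>A0001-R5
MRVKGIRKNYQHLWRWGTMLLGML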
In [26]:
wseqs = set(wanted_data['gp120'].dropna().index)
cols = ['gp120-seq-align', 'Nef-seq-align', 'Vpr-seq-align',
        'Tat-1-seq-align', 'Tat-2-seq-align', 'LTR-seq-align']
for col in cols:
    found = set()
    prot = col.rsplit('-', 2)[0]
    fname = 'AlignForBenj/has_env_%s.fasta' % prot
    df = wanted_data[col].dropna()
    with open(fname, 'w') as handle:
        for seq, name in zip(df, df.index):
            if name in wseqs and name not in found:
                fasta_writer(handle, [(name + '-' + trop_dict[name], ''.join(seq))])
                found.add(name)
In [4]:
def yield_regions(trop_dict):
    regions = ['LTR-seq-align',
               'gp41-seq-align',
               'gp120-seq-align',
               'Nef-seq-align',
               'Vpr-seq-align',
               'Tat-1-seq-align',
               'Tat-2-seq-align',
               ]
    tail_cols = ['gp120', 'gp41', 'Nef', 'Vpr',
                 'Tat-1', 'Tat-2', 'LTR']
    fourkb_cols = ['gp120', 'Nef', 'Vpr',
                   'Tat-1', 'Tat-2', 'LTR']
    # Patient groups: 4kb amplicons, full-env, and full-tail coverage
    groups = [('fourkb', wanted_data[fourkb_cols].dropna().index),
              ('full_env', wanted_data[['gp120', 'gp41']].dropna().index),
              ('full_tail', wanted_data[tail_cols].dropna().index),
              ]
    subs = ['SubB']
    win_sizes = [5, 10, 15, 20, 30, 35]
    for region, (gname, ind), sub in product(regions, groups, subs):
        # rsplit keeps 'Tat-1'/'Tat-2' distinct (a plain split('-')[0] would
        # collapse both to 'Tat' and collide in the cache filenames below)
        prot = region.rsplit('-', 2)[0]
        gwanted = wanted_data.ix[ind]
        mask = gwanted['Sub'] == sub
        seq_ser = gwanted[mask][region].dropna()
        print prot, gname, sub, len(seq_ser)
        seqs = [(name, ''.join(seq)) for name, seq in zip(seq_ser.index, seq_ser.values)]
        seq_len = len(seqs[0][1])
        # Slide every window size along the alignment
        for win, start in product(win_sizes, range(seq_len)):
            stop = start + win
            if stop < seq_len:
                nseqs = [(name, seq[start:stop]) for name, seq in seqs]
                yield gname, sub, prot, start, win, nseqs, trop_dict
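A quick sanity check of the window arithmetic above, with toy numbers rather than the real alignment lengths:
# Toy numbers: a 50-column alignment scanned with two window sizes
from itertools import product
seq_len, win_sizes = 50, [5, 10]
windows = [(win, start) for win, start in product(win_sizes, range(seq_len))
           if start + win < seq_len]
print len(windows)   # 45 five-mers + 40 ten-mers = 85 windows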
In [5]:
import dendropy
In [6]:
from Bio.Alphabet import generic_dna, generic_protein
import TreeingTools

def calculate_region(arg):
    gname, sub, prot, start, win, nseqs, trop_dict = arg
    treename = 'newphyliptrees/%s-%s-%s-%i-%i.tree' % (gname, sub, prot, start, win)
    matfname = 'newphyliptrees/%s-%s-%s-%i-%i.pkl' % (gname, sub, prot, start, win)
    if os.path.exists(treename):
        # Cached from a previous run: re-load the distance matrix and tree
        # instead of short-circuiting with benj_res = 'Already Processed'
        with open(matfname, 'rb') as handle:
            dmat = pickle.load(handle)
        with open(treename) as handle:
            tree = dendropy.Tree.get_from_stream(handle, 'newick')
    else:
        # The LTR is nucleotide sequence; every other region is protein
        is_aa = prot != 'LTR'
        alphabet = generic_protein if is_aa else generic_dna
        try:
            tree, dmat = TreeingTools.phylip_tree_collapse_unique(nseqs, alphabet=alphabet)
        except ValueError:
            benj_res = 'Too few unique sequences to process'
            return gname, sub, prot, win, start, benj_res
        except Exception:
            benj_res = 'uncaught exception in dist-mat'
            return gname, sub, prot, win, start, benj_res
        with open(matfname, 'wb') as handle:
            pickle.dump(dmat, handle)
        with open(treename, 'w') as handle:
            tree.write_to_stream(handle, 'newick')
    try:
        benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps=50)
    except AssertionError:
        benj_res = 'too few groups'
        return gname, sub, prot, win, start, benj_res
    except Exception:
        benj_res = 'uncaught exception'
        return gname, sub, prot, win, start, benj_res
    try:
        out = TreeingTools.evaluate_association_index(tree, trop_dict)
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = out
    except Exception:
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = ('error', 'error', 'error')
    return gname, sub, prot, win, start, benj_res
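To debug a single window without the process pool, one tuple from the generator can be fed to calculate_region by hand (a sketch, assuming the data cells above have run and the newphyliptrees/ directory exists):
# Pull one (gname, sub, prot, start, win, nseqs, trop_dict) tuple and score it
args = next(iter(yield_regions(trop_dict)))
print calculate_region(args)[-1]   # either a result dict or an error string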
In [7]:
from itertools import imap

# Column order for the results table
benj_fields = ['GroupName',
               'Subtype',
               'Prot',
               'Start',
               'WinSize',
               'Group2Mean',
               'Group2Std',
               'Group2Name',
               'Group1Mean',
               'Group1Std',
               'RawPval',
               'AdjPval',
               'Group1Name',
               'AI',
               'AI-pval',
               'AI-null']

fname = 'more_phylip_BenjRes.tsv'
benj_writer = csv.DictWriter(open(fname, 'w'), benj_fields, delimiter='\t')
benj_writer.writeheader()

multi = True
print 'Starting multiprocessing!'
if multi:
    pool = ProcessPoolExecutor(max_workers=30)
    results = pool.map(calculate_region, yield_regions(trop_dict))
else:
    # Serial path for debugging: only the first 35 windows
    results = imap(calculate_region, islice(yield_regions(trop_dict), 0, 35))

for gname, sub, prot, win, start, benj_res in results:
    tdict = {'Prot': prot,
             'Start': start,
             'WinSize': win,
             'GroupName': gname,
             'Subtype': sub,
             }
    # String results are status/error messages rather than stat dicts
    if isinstance(benj_res, str):
        if (benj_res == 'Already Processed') or benj_res.startswith('Too few unique sequences'):
            continue
        print benj_res, prot, start, win
    else:
        benj_res.update(tdict)
        benj_writer.writerow(benj_res)

if multi:
    pool.shutdown()
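Once the scan finishes, the TSV can be pulled back into pandas for a quick look (a sketch; read_csv was imported in the first cell, and the columns are those in benj_fields):
res = read_csv('more_phylip_BenjRes.tsv', sep='\t')
print res.head()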