In [1]:
import os, os.path
from pandas import DataFrame, Series, MultiIndex
from collections import defaultdict
from glob import glob
from itertools import groupby, product
In [2]:
def fasta_reader(handle):
for key, lines in groupby(handle, lambda x: x.startswith('>')):
if key:
name = next(lines)[1:].strip()
else:
seq = ''.join(line.strip() for line in lines)
yield name, seq
In [8]:
infiles = glob('/home/will/HIVReportGen/Data/TrainingSequences/*')
seq_data = defaultdict(dict)
for f in infiles:
fname = f.split(os.sep)[-1].split('.')[0]
prot, name = fname.split('-',1)
if 'multi' in name:
with open(f) as handle:
for name, seq in fasta_reader(handle):
seq_data[prot][name] = ''.join(s for s in seq if s.isalpha())
else:
with open(f) as handle:
_, seq = next(fasta_reader(handle))
seq_data[prot][name] = seq
In [17]:
rmkeys = set()
for key, seq in seq_data['V3'].items():
if len(seq) != 35:
rmkeys.add(key)
print(len(rmkeys))
In [18]:
for key in rmkeys:
seq_data['V3'].pop(key);
In [20]:
for key in seq_data.keys():
with open('/home/will/HIVReportGen/Data/TrainingSequences/' + key + '.fasta', 'w') as handle:
for sname, seq in seq_data[key].items():
handle.write('>%s\n%s\n' % (sname, seq))
In [27]:
import re
reg_obj = re.compile('(A\d+)_(R\d+)-(Primer\d+)')
nseq_file = '/home/will/HIVReportGen/Data/PatientFasta/pbmc_new_all[1].fasta'
path_func = lambda x,y,z: '/home/will/HIVReportGen/Data/PatientFasta/%s-%s-%s.fasta' % (x,y,z)
with open(nseq_file) as handle:
for name, seq in fasta_reader(handle):
pat, visit, primer = reg_obj.findall(name)[0]
nfname = path_func(pat, visit, primer)
with open(nfname, 'w') as handle:
handle.write('>%s-%s-%s\n%s' % (pat, visit, primer, seq))
In [5]:
import pickle
import StringIO
with open('/home/will/HIVReportGen/Data/TrainingSequences/training_seqs.pkl', 'r') as handle:
training_seqs = pickle.load(handle)
with open('/home/will/HIVReportGen/Data/TrainingSequences/training_pssm.pkl') as handle:
training_scores = pickle.load(handle)
In [7]:
training_scores.to_csv('/home/will/Downloads/PSSMscores.csv')
In [8]:
from pandas import *
fields = ['name','score','pred','x4.pct','r5.pct','geno','pos.chg','net.chg','percentile']
PSSMscores = read_csv('/home/will/Dropbox/HIVseqs/output_data.tsv', names = fields, sep = '\t')
In [19]:
r5_cut = -6.95
x4_cut = -2.88
def BenMeth(val):
if val < r5_cut:
return 'R5'
elif val > x4_cut:
return 'X4'
else:
return 'None'
In [18]:
PSSMscores['n-name'] = 'name'
scores = PSSMscores.groupby('name', as_index = False).agg({'score':np.mean, 'n-name':'count'})
In [20]:
scores['Tropism'] = scores['score'].map(BenMeth)
In [24]:
bscores = scores[scores['n-name'] == 1].drop(['n-name'], axis = 1)
bscores.to_csv('/home/will/Downloads/BiggerPSSMscores.csv')
In [33]:
bscores['Tropism'].value_counts()
Out[33]:
In [ ]: