In [1]:
import os
import sys
# Make the local PySeqUtils checkout importable (presumably where
# GeneralSeqTools lives -- confirm). The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
PYSEQUTILS_DIR = '/home/will/PySeqUtils/'
if PYSEQUTILS_DIR not in sys.path:
    sys.path.append(PYSEQUTILS_DIR)

In [2]:
import GeneralSeqTools
import glob

In [3]:
import pandas as pd
# Every FASTA file matching SubB*.fasta, sorted so the processing order
# (and the printed protein list) is stable between runs.
files = sorted(glob.glob('/home/will/HIVTropism/LANLdata/SubB*.fasta'))

# One dict per FASTA record; turned into a DataFrame in a later cell.
seqs = []
for f in files:
    # The protein name is the second '-'-separated field of the file name,
    # taken from the text before the first '.'.
    prot_name = os.path.basename(f).split('.')[0].split('-')[1]
    # Parenthesised form prints the same text on Python 2 and also parses
    # on Python 3 (the bare `print x` statement does not).
    print(prot_name)
    with open(f) as handle:
        for name, seq in GeneralSeqTools.fasta_reader(handle):
            seqs.append({
                         'GI':name,
                         # Strip '-' gap characters and normalise case.
                         'Seq':seq.replace('-', '').upper(),
                         'Prot':prot_name
                         })


Int
LTR
Nef
PR
RT
Tat_1
Tat_2
V3
Vif
Vpr
gp120
gp41

In [4]:
# Reshape the long record list into a wide table: one row per 'GI', one
# column per 'Prot', each cell holding the sequence string. aggfunc='first'
# keeps the first sequence when a (GI, Prot) pair occurs more than once.
# `index`/`columns` replace the deprecated (and since removed) `rows`/`cols`
# keywords of pivot_table.
seq_df = pd.pivot_table(pd.DataFrame(seqs),
                        index = 'GI',
                        columns = 'Prot',
                        values = 'Seq',
                        aggfunc = 'first')

In [14]:
from Bio import Seq
from Bio.Alphabet import generic_dna
# Sanity check: translating the single codon 'ATG' should give 'M'.
res = Seq.Seq('ATG', alphabet=generic_dna).translate()
# str() is the supported way to get the plain string from a Seq;
# Seq.tostring() is deprecated in Biopython.
str(res)


Out[14]:
'M'

In [16]:
def translate(inseq):
    """Translate a nucleotide sequence string into a protein string.

    Parameters
    ----------
    inseq : str
        Sequence text, wrapped in a Bio.Seq.Seq with the generic DNA
        alphabet before translation.

    Returns
    -------
    str
        The result of Seq.translate() converted to a plain string.
    """
    # str() replaces the deprecated Seq.tostring() and returns the same text.
    return str(Seq.Seq(inseq, alphabet=generic_dna).translate())
# Keep only rows that have a sequence for every one of these five regions,
# then translate the Tat_2 nucleotide sequence of each remaining row.
benj_seqs = (
    seq_df
    .dropna(subset=['LTR', 'Tat_1', 'Tat_2', 'Vpr', 'V3'])
    ['Tat_2']
    .map(translate)
)

In [18]:
# Dump the translated Tat_2 sequences as (index label, protein string) pairs
# to a FASTA file via GeneralSeqTools.fasta_writer.
with open('/home/will/Downloads/tat2_for_benj.fasta', 'w') as handle:
    tat2_records = benj_seqs.to_dict().items()
    GeneralSeqTools.fasta_writer(handle, tat2_records)

In [ ]: