In [10]:
import sys
import os, os.path
import pandas
sys.path.append('/home/will/PySeqUtils/')
from GeneralSeqTools import fasta_reader, seq_align_to_ref
os.chdir('/home/will/HIVRate/')

In [94]:
ltr_seqs = []
with open('LTRsearch.fasta') as handle:
    for name, seq in fasta_reader(handle):
        gi = name.split('|')[1]
        ltr_seqs.append((gi, seq.replace('-', '')))

In [95]:
from Bio import SeqIO
from dateutil import parser


gi2date = {}
with open('ltrsequence.gbx.xml') as handle:
    for rec in SeqIO.parse(handle, 'genbank'):
        tdate = parser.parse(rec.annotations['date'])
        gi = rec.annotations['gi']
        gi2date[gi] = tdate

In [96]:
hxb2 = """TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCT
GATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGT
TGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATG
ACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTT--CATCACATGGCCCGAG------------
------------AGCTGCATCCGGAGTACTTC---------AAGAACTGCT-----------------------------
-------GACATCGA------------------------GCTTG---CT----------------------ACAA---GG
GACTTTCCGCTGGGGACTTTCCAG-------------GGAGGCGTGGCCTGGGCGGGACT---GGGGAGTGGCGA---GC
CCTCAGATCCTGCATATAAGCAGC---TGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAG
CTCTCTGGCT---AACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCT---TGAGTGCTTC---AAGTAGTGTG
TGC---CCGTCTG---TTGTGTGACTCTGGT---AACTAGAGATCCC---TCAGAC---CCT---TTTAGTCAGTGTGG-
--AAAATCTCT""".replace('\n', '').replace('-', '')

In [97]:
aln_seqs = []
for gi, aln_seq in seq_align_to_ref(ltr_seqs, hxb2, max_workers = 50):
    try:
        aln_seqs.append((gi2date[gi], aln_seq))
    except KeyError:
        pass

In [98]:
sorted_alns = sorted(aln_seqs, key = lambda x: x[0])

In [99]:
from itertools import groupby
from collections import defaultdict

found_seqs = set()
nums = []


for date, seq in sorted_alns:
    isNew = False
    if seq not in found_seqs:
        new_seq_num += 1
        found_seqs.add(seq)
        isNew = True
    nums.append((date, 1, isNew))
    
ltr_df = pandas.DataFrame(nums, columns = ['Date', 'isSeq', 'isNew'])

In [107]:
nltr = ltr_df.groupby('Date').sum()
nltr.cumsum().plot(legend = False)


Out[107]:
<matplotlib.axes.AxesSubplot at 0x7f69ac089490>

In [45]:
hxb2


Out[45]:
'TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCT'

In [46]:
sorted_alns[-1]


Out[46]:
('2011',
 'MGARASVLSGGELDSWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLHPSLQTGSEELKSLYNTIAVLYCVHQRISIKDTKEALDQIEEEQNKCKKKAQQAAADTENNSQVSQNYPIVQNLQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRLHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPTSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTACQGVGGPGHKARVLAEAMSQATNSAAIMMQRGNFKNQRKSVKCFNCGKEGHIARNCRAPRKKGCWKCGREGHQMKDCTERQANFLGKIWPSHKGRPGNFPQGRLEPTAPPEESFRFGGETATPSPRQEPIDKELYPTTSLRSLFGNDPSSQ')

In [ ]: