notebook.community

Edit and run



In [3]:

    
import sys
sys.path.append('/home/will/PySeqUtils/')
from GeneralSeqTools import fasta_reader, fasta_writer
import os
os.chdir('/home/will/PySeqUtils/TransToolStuff/')



In [13]:

    
from itertools import islice
start = 806
stop = -1
path = 'HIV1_ALL_2012_env_PRO.fasta'
outpath = 'HIV1_ALL_2012_gp41_PRO.fasta'
with open(path) as handle:
    for name, seq in islice(fasta_reader(handle), 20):
        tseq = seq[start:stop] 
        print tseq[:5], tseq[-5:]









    



A-VG- I--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LV
A-VG- A--LL
A-VG- A--WL
A-VVG A--LI
A-VVG A--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LL
A-G-- L--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--LL
A-VG- A--WL



In [14]:

    
seqs = []
with open(path) as handle:
    for name, seq in fasta_reader(handle):
        seqs.append((name, seq[start:stop]))
with open(outpath, 'w') as handle:
    fasta_writer(handle, seqs)



In [16]:

    
from Bio import Entrez
from Bio import SeqIO
ids = '544451412,544451410,544451408,544451406,544451404,544451402,544451400,544451398,544451396'

fetch_handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                             id=ids)
records = list(SeqIO.parse(fetch_handle, "gb"))









    



/usr/local/lib/python2.7/dist-packages/Bio/Entrez/__init__.py:442: UserWarning: 
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.
  E-utilities.""", UserWarning)



In [18]:

    
rec = records[0]



In [24]:

    
rec.annotations['gi']









    Out[24]:





'544451412'



In [28]:

    
batch_size = 1000
num_res = 1300
inds = range(0, num_res, batch_size)+[num_res]
start_inds = inds
stop_inds = inds[1:]
zip(start_inds, stop_inds)









    Out[28]:





[(0, 1000), (1000, 1300)]



In [62]:

    
from collections import defaultdict

counts = defaultdict(int)
seqs = []
with open('/home/will/WLAHDB_data/RegionDBs/LTR/HIV1_ALL_2012_ltr_DNA.fasta') as handle:
    for name, seq in fasta_reader(handle):
        seqs.append((name, seq.replace('-', '')))

with open('/home/will/WLAHDB_data/RegionDBs/LTR/LTR.fasta', 'w') as handle:
    fasta_writer(handle, seqs)



In [65]:

    
max(len(s) for _, s in seqs)









    Out[65]:





675



In [76]:

    
from Bio import SeqIO
from Bio.Alphabet import generic_dna
with open('/home/will/WLAHDB_data/SubtypeDB/HIV1_genome_DNA.fasta') as handle:
    seqs = list(SeqIO.parse(handle, 'fasta', generic_dna))



In [80]:

    
import glob

files = glob.glob('/home/will/WLAHDB_data/GenbankDL/*.gb')

counts = []
for fnum, f in enumerate(files):
    if fnum % 10000 == 0:
        print fnum, sum(counts)
    with open(f) as handle:
        for num, seq in enumerate(SeqIO.parse(handle, 'gb'), 1):
            pass
        counts.append(num)









    



0 0.0
10000 10000
20000 20000
30000 30000
40000 40000
50000 50000
60000 60000
70000 70000
80000 80000
90000 90000
100000 100000
110000 110000
120000 120000
130000 130000
140000 140000
150000 150000
160000 160000
170000 170000
180000 180000
190000 190000
200000 200000
210000 210000
220000 220000
230000 230000
240000 240000
250000 250000
260000 260000
270000 270000
280000 280000
290000 290000
300000 300000
310000 310000
320000 320000
330000 330000
340000 340000
350000 350000
360000 360000
370000 370000
380000 380000
390000 390000
400000 400000
410000 410000
420000 420000
430000 430000
440000 440000
450000 450000
460000 460000
470000 470000
480000 480000
490000 490000
500000 500000
510000 510000



In [54]:

    
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastxCommandline, NcbiblastnCommandline
from tempfile import NamedTemporaryFile
from StringIO import StringIO

window_size = 500
inds = range(0, len(seq), window_size) + [len(seq)]
seq = seqs[0]
blocks = []
for start, stop in zip(inds, inds[1:]):
    blocks.append(seq[start:stop])

with NamedTemporaryFile(suffix='.fasta', delete=False) as handle:
    with NamedTemporaryFile() as ohandle:
    
        SeqIO.write(blocks, handle, 'fasta')
        handle.flush()
        os.fsync(handle.fileno())
    
        cline = NcbiblastnCommandline(db='/home/will/WLAHDB_data/SubtypeDB/HIV1_genome_DNA.fasta',
                                      query=handle.name,
                                      out=ohandle.name,
                                      outfmt=5)
        _, _ = cline()
        records = list(NCBIXML.parse(ohandle))



In [57]:

    
rec = records[0]
align = rec.alignments[0]



In [56]:

    
rec.query









    Out[56]:





u'A1-1'



In [61]:

    
align.hit_def









    Out[61]:





u'A1-1'



In [ ]: