# -*- coding: utf-8 -*- """ Created on Wed Jul 23 11:12:40 2014 @author: andylane Heald Lab, UC Berkeley Steps: 1) load in abi files 2) extract out the full sgRNA sequences: - first trim out everything from 3' pUC19 on - match on sgRNA region (~85% identity?) - pick out intervening 21mers 3) check results against existing target Version 2.0: - Matching on amps144 amplicons is using BLAST; this allows some error in sequencing etc. Next: - figure out how many of the sgRNA adjacent sequences are found using BLAST in Amps300: looks like 42 - Which are right at primer termini? looks like 25 x Replace Ns with stars... [not necessary; BLAST respects Ns] - update printfeat to print multiple features on a map... (include HpaII, BfaI, ScrFI). - How many are next to PAMs as they should be? 14/17, or 82%. Not bad! """

In [879]:
import os
import Bio 
import re
import timeit
import copy
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction 
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import pandas

In [880]:
lib5pr = []

for filename in os.listdir("abifiles/5prlib"):
    handle = open("abifiles/5prlib" + "/" + filename, 'rb')
    record = SeqIO.read(handle, "abi", alphabet=IUPACAmbiguousDNA())    
    lib5pr.append(record)

In [881]:
#sgRNAconst = SeqRecord(Seq("gttt"))
sgRNAconst = SeqRecord(Seq("GTTTAAGAG"))

# An alternative approach in which, rather than the T7 promoter being used, the first 9 nt of the sgRNA hairpin is used
# to find 20/21mers. This gets 45 targets...

sgRNAfiltlib5pr = copy.deepcopy(lib5pr)

for seqrecord in sgRNAfiltlib5pr:
    fwdlocs = []  
    revlocs = []  
    fwdlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.seq), str(seqrecord.seq))]
    #print fwdlocs
    for item in fwdlocs:
        start = ExactPosition(int(item))
        end = ExactPosition(int((item) + len(sgRNAconst) + 1))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type="sgRNAconst", strand = +1)
        seqrecord.features.append(feature)
    #print fwdlocs
    revlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.reverse_complement().seq), str(seqrecord.seq))]
    for item in revlocs:
        start = ExactPosition(int(item) - 1)
        end = ExactPosition(start + len(sgRNAconst))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type="sgRNAconst", strand = -1)
        seqrecord.features.append(feature)

#pick out 21mers before first 9nt of sgRNA hairpin
alltgts = []
for seqrecord in sgRNAfiltlib5pr:
    for feat in seqrecord.features:
        if feat.strand == 1:
            tgtstart = int(feat.location.start) - 21
            tgtend = int(feat.location.start)
            sgtgt = seqrecord[tgtstart:tgtend]
            alltgts.append(sgtgt)
            #print "pos \n \n"
        if feat.strand == -1:
            tgtend = int(feat.location.end) + 21
            tgtstart = int(feat.location.end)
            sgtgt = seqrecord[tgtstart:tgtend].reverse_complement()
            sgtgt.name=seqrecord.name
            alltgts.append(sgtgt)

In [883]:
# search against amps300; get hits:
allamps = []
#for item in SeqIO.parse("amps144masked_iter0.fasta", "fasta"):
    #allamps.append(item)

for item in SeqIO.parse("amps144.fasta", "fasta"):
    allamps.append(item)
    
for item in SeqIO.parse("theextraamps.fasta", "fasta"):
    allamps.append(item)

#for record in allamps:
    #record.seq = "GGGGGGGGGGGGGGGGGGGGGGGGGG"

In [884]:
len(allamps[0])


Out[884]:
14642

Next: re-generate that list of cut guides that you used to BLAST before. Are the hits (within amps300) actually within the expected PAM-adjacent cut sites?


In [32]:
# This part cuts the amps300 fragments and makes the theoretical guide list
supercutslist = []
for individual_sequence in allamps:
    cutslist = []
    amp = individual_sequence
    substrate_name = amp.id 
    substrate = amp.seq
    pos = HpaII.search(substrate)
    # Positions in this list correspond to the right boundaries of fragments; 
    # last one is thus the sequence end
    pos.append(len(substrate))
    pos = iter(pos)
    cuts = HpaII.catalyze(substrate)
    for item in cuts:
        cutslist.append([item, "HpaII", int(pos.next())])
        
    cuts = BfaI.catalyze(substrate)
    pos = BfaI.search(substrate)
    pos.append(len(substrate))
    pos = iter(pos)
    for item in cuts:
        cutslist.append([item, "BfaI", int(pos.next())])
        
    cuts = ScrFI.catalyze(substrate)
    pos = ScrFI.search(substrate)
    pos.append(len(substrate))
    pos = iter(pos)
    for item in cuts:
        cutslist.append([item, "ScrFI", int(pos.next())])
        
    #The above is all to get the results of a catalyze operation (i.e. tuples) intp
    # a list format. Next part makes them into SeqRecords.
    
    i = 0
    cutslistrecords = []
    for item in cutslist:     
        cutslistrecords.append(SeqRecord(item[0], id = str(i), description = str(item[1]), name=str(item[2]), dbxrefs=[str(substrate_name)]))
        i = i+1
    
    cutslist = []
    cutslist = cutslistrecords
    
    # This part takes the 3' 20nt of each fragment and makes a new sequence with it.
    # For the 5' end, the Mung-Bean treatment is simulated by removing two more nt (for HpaII and BfaI), or one nt for ScrFI;
    # these would be the 5' overhang. Then we take the reverse-complement of the sequence. 
    # The Restriction module just returns sequences as if the top strand only was being cut. In other words,
    # no bases are deleted from consecutive fragments. 
    
    # Eventually, this might be better re-implemented as SeqFeatures on the original sequence..
    
    from Bio.Seq import MutableSeq
    twentymers = []
    record2 = []
    for record2 in cutslist:
            try: # This is because a second run of this code on already mutable seqs seems to fail. Not sure how to flush out and revert back to non-mutables...
                record2.seq = record2.seq.tomutable()
            except:
                pass
            if record2.description == "ScrFI":
                #offset here (e.g. 1:21 is for simulating MBN digeston)
                # because the entry.names are rooted on the right of each fragment, the length
                # of the entry.name has to be subtracted to get the desired left position for the "reverse"
                # tgts
                entry = record2[1:21].reverse_complement\
                (description=True, id=True, name=True)
                entry.name = int(record2.name)+1 - len(record2.seq)
                twentymers.append(entry)
            else: # Should work for HpaII/BfaI
                entry = record2[2:22].reverse_complement\
                (description=True, id=True, name=True)
                entry.name = int(record2.name)+2 - len(record2.seq)
                twentymers.append(entry)
            record2.id = str("%s_fwd" % record2.id)
            entry = record2[-20:]
            entry.name = int(record2.name)-20
            twentymers.append(entry)
    
    for item in twentymers:
        item.dbxrefs = [substrate_name]
    
    # The ends of the fragments aren't bonafide CRISPR targets; these can be removed:
    noends = []
    twentymerstr = [item for item in twentymers if item.description == "HpaII"]
    trimmed = twentymerstr[1:-1] # removes first and last 20mer
    noends.append(trimmed)
    twentymerstr = [item for item in twentymers if item.description == "BfaI"]
    trimmed = twentymerstr[1:-1]
    noends.append(trimmed)
    twentymerstr = [item for item in twentymers if item.description == "ScrFI"]
    trimmed = twentymerstr[1:-1]
    noends.append(trimmed)
    
    cutslist = [item for sublist in noends for item in sublist]
    supercutslist.append(cutslist)

In [33]:
#dude, flatten me
supercutslist2 = []
for item in supercutslist:
    for subitem in item:
        supercutslist2.append(subitem)
        
supercutslist = supercutslist2[:]

In [420]:
len(supercutslist) # These are all the calculated 20mers from the input substrate sequence


Out[420]:
25022

In [35]:
# hitsumm is the subset of sequenced 20mers that find a hit in amps144 (and the extra amps)
yesitsok = [] # This will be a tuple of (sequenced 20mer, substrate 20mer) of matching items
notok = [] # This will be the sequenced 20mers that don't match a predicted cutsite. Probably the ends of DNA fragments.
for item in hitsumm:
    madeahit = 0
    for cut in supercutslist:
        if item[1].seq[2:20] in cut.seq:
            yesitsok.append((cut, item))
            madeahit = 1
        if item[1].reverse_complement().seq[2:20] in cut.seq:
            yesitsok.append((cut, item))
            madeahit = 1
    if madeahit == 0:
        notok.append(item)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-35-97b975240f64> in <module>()
      2 yesitsok = [] # This will be a tuple of (sequenced 20mer, substrate 20mer) of matching items
      3 notok = [] # This will be the sequenced 20mers that don't match a predicted cutsite. Probably the ends of DNA fragments.
----> 4 for item in hitsumm:
      5     madeahit = 0
      6     for cut in supercutslist:

NameError: name 'hitsumm' is not defined

In [ ]:
yesitsok[1][1][1]

In [ ]:
nohit = []
hitsumm = []
for item in notok:
    tgt = item[1]
    hitout = []
    nohits = 0
    for amp in allamps:
        hit = [tloc.start() for tloc in re.finditer(str(tgt.seq[1:-1]), str(amp.seq))]
        if len(hit) > 0:        
            hitout = (hit, tgt, tgt.reverse_complement().seq, amp.id, "fwd", amp.seq)
            hitsumm.append(hitout)
    if len(hitout) == 0: #make a marker of nohits =1 if there are no hits
        nohits = 1
    # and search again for the reverse complement of the target
    for amp in allamps:
        hit = [tloc.start() for tloc in re.finditer(str(tgt.reverse_complement().seq[1:-1]), str(amp.seq))]
        if len(hit) > 0:        
            hitout = (hit, tgt, tgt.reverse_complement().seq, amp.id, "rev", amp.seq)
            hitsumm.append(hitout)
    if len(hitout) == 0: # if the rc search has no hits, and if the previous search also had no hits, add this target to a list
        if nohits == 1:
            nohit.append(tgt)

This is somewhat of a reboot: starting way back from the 21mers after T7 in sequencing data, BLAST against a newly made Amps144 db. The goal is to get the match locations and visualize them.


In [885]:
Bio.SeqIO.write(alltgts, "alltgtstemp.fa", "fasta")
blastn_cline = NcbiblastnCommandline(query="alltgtstemp.fa", db="amps144", \
task = "blastn-short",outfmt=5, out="alltgts.blast", max_target_seqs=100, num_threads = 7, evalue = 0.005)
timeit.timeit(blastn_cline, number =1)


Out[885]:
0.13829612731933594

In [886]:
result_handle = open("alltgts.blast")
blast_records = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here

blast_records_list = []
for blast_record in blast_records:
    blast_records_list.append(blast_record)
result_handle.close()

In [887]:
blastsandrecords = []
for i,j in enumerate(alltgts):
    blastsandrecords.append((j, blast_records_list[i]))

In [887]:


In [888]:
i = 0
for item in blastsandrecords:
    print(item[0].name + " " + item[0].seq) # Print out the query seq and its title, basically
    for alignment in [item[1]]:
        for item in alignment.alignments: 
            print item.title # Print out each Amp of amps144+extra amps that a match is made on
            i= i+1
            for hit in item.hsps:
                print("--" + str(hit)) # Within each hit amp, print out each specific hit sequence
    print "\n"

print i


02-5-1_B01_013 ACCATGCTACTTGAGGGCATT
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 ACCATGCTACTTGAGGGCATT 21
               |||||||||||||||||||||
Sbjct:    3381 ACCATGCTACTTGAGGGCATT 3361


02-5-1_B01_013 GGAGCTGGCATTGGCCTGTCC
gnl|BL_ORD_ID|54 6229761 <unknown description>
--Score 16 (32 bits), expectation 3.9e-03, alignment length 20
Query:       2 GAGCTGGCATTGGCCTGTCC 21
               |||||||||||| |||||||
Sbjct:    1275 GAGCTGGCATTGACCTGTCC 1256


04-5-3_D01_009 NCATGCATTAAAGCCATTGCC
gnl|BL_ORD_ID|171 3385262 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 CATGCATTAAAGCCATTGCC 21
               ||||||||||||||||||||
Sbjct:    1093 CATGCATTAAAGCCATTGCC 1074


09-5-4_A02_016 TAATGGTCTGGTTGGCTGGTG
gnl|BL_ORD_ID|104 2901440 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TAATGGTCTGGTTGGCTGGT 20
               ||||||||||||||||||||
Sbjct:     746 TAATGGTCTGGTTGGCTGGT 727


10-5-5_B02_014 CGTANGNNNNTNNNANNNNNG


11-5-6_C02_012 AGCTAATATTTTGGGGGCCCC
gnl|BL_ORD_ID|143 3110360 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 AGCTAATATTTTGGGGGCCCC 21
               |||||||||||||||||||||
Sbjct:    2628 AGCTAATATTTTGGGGGCCCC 2608


12-5-7_D02_010 CCATCCATATGCAACTCATGA
gnl|BL_ORD_ID|171 3385262 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CCATCCATATGCAACTCATGA 21
               |||||||||||||||||||||
Sbjct:     147 CCATCCATATGCAACTCATGA 127


17-5-8_A03_031 NNNNNNNNNNTTNNNANNNCC


17-5-8_A03_031 GATGGCCACTACAGGGACCCC
gnl|BL_ORD_ID|161 9981706 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GATGGCCACTACAGGGACCCC 21
               |||||||||||||||||||||
Sbjct:    4433 GATGGCCACTACAGGGACCCC 4413


18-5-9_B03_029 TGTGCATGGGAGCCAGATCTG
gnl|BL_ORD_ID|312 3476947 18
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TGTGCATGGGAGCCAGATCTG 21
               |||||||||||||||||||||
Sbjct:    1428 TGTGCATGGGAGCCAGATCTG 1448
gnl|BL_ORD_ID|10 3476947 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TGTGCATGGGAGCCAGATCTG 21
               |||||||||||||||||||||
Sbjct:    1428 TGTGCATGGGAGCCAGATCTG 1448


19-5-10_C03_027 TACAGCACGGCTACAATGGAG
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TACAGCACGGCTACAATGGAG 21
               |||||||||||||||||||||
Sbjct:   10583 TACAGCACGGCTACAATGGAG 10563


20-5-11_D03_025 AGGAGGCACTTTGAAGAGCAG
gnl|BL_ORD_ID|333 8426516 20
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 AGGAGGCACTTTGAAGAGCAG 21
               |||||||||||||||||||||
Sbjct:    1693 AGGAGGCACTTTGAAGAGCAG 1713
gnl|BL_ORD_ID|283 8424649 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 AGGAGGCACTTTGAAGAGCAG 21
               |||||||||||||||||||||
Sbjct:    3560 AGGAGGCACTTTGAAGAGCAG 3580


20-5-11_D03_025 GGTGAAGGACAGCCCAATTAG
gnl|BL_ORD_ID|59 8480056 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 GGTGAAGGACAGCCCAATTA 20
               ||||||||||||||||||||
Sbjct:    3855 GGTGAAGGACAGCCCAATTA 3874


25-5-12_A04_032 CCATATCTGAGTAGTGGGCCG
gnl|BL_ORD_ID|143 3110360 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CCATATCTGAGTAGTGGGCC 20
               ||||||||||||||||||||
Sbjct:    3047 CCATATCTGAGTAGTGGGCC 3028


25-5-12_A04_032 TACAGCACGGCTACAATGGAG
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TACAGCACGGCTACAATGGAG 21
               |||||||||||||||||||||
Sbjct:   10583 TACAGCACGGCTACAATGGAG 10563


26-5-13_B04_030 CACAAAAGGGGATCTGAACCG
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CACAAAAGGGGATCTGAACC 20
               ||||||||||||||||||||
Sbjct:   10986 CACAAAAGGGGATCTGAACC 11005


27-5-14_C04_028 CCATGCTACTTGAGGGCATTG
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CCATGCTACTTGAGGGCATT 20
               ||||||||||||||||||||
Sbjct:    3380 CCATGCTACTTGAGGGCATT 3361


28-5-15_D04_026 TATAAAAAAGTGAAGGACCTG
gnl|BL_ORD_ID|250 9278465 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TATAAAAAAGTGAAGGACCTG 21
               |||||||||||||||||||||
Sbjct:     225 TATAAAAAAGTGAAGGACCTG 205


34-5-17_B05_045 NNNNNCTGNNNNNNNCNTNGN


35-5-18_C05_043 CACAGAATAAGGTCACTTACG
gnl|BL_ORD_ID|33 7685624 <unknown description>
--Score 16 (32 bits), expectation 3.9e-03, alignment length 20
Query:       1 CACAGAATAAGGTCACTTAC 20
               |||||||||| |||||||||
Sbjct:    1502 CACAGAATAACGTCACTTAC 1521


36-5-19_D05_041 NANGAGGCACTTTGAAGAGCA
gnl|BL_ORD_ID|333 8426516 20
--Score 18 (36 bits), expectation 2.5e-04, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:    1695 GAGGCACTTTGAAGAGCA 1712
gnl|BL_ORD_ID|283 8424649 <unknown description>
--Score 18 (36 bits), expectation 2.5e-04, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:    3562 GAGGCACTTTGAAGAGCA 3579


41-5-20_A06_048 GTCTTACCAAGGCACATGTGG
gnl|BL_ORD_ID|154 270146 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GTCTTACCAAGGCACATGTGG 21
               |||||||||||||||||||||
Sbjct:     136 GTCTTACCAAGGCACATGTGG 116


41-5-20_A06_048 GCTCAAGTAAGGCATTCCTGA
gnl|BL_ORD_ID|41 9593441 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 CTCAAGTAAGGCATTCCTGA 21
               ||||||||||||||||||||
Sbjct:    5151 CTCAAGTAAGGCATTCCTGA 5132


43-5-22_C06_044 GCCCCATGGGAGTGACAGCAC
gnl|BL_ORD_ID|104 2901440 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GCCCCATGGGAGTGACAGCAC 21
               |||||||||||||||||||||
Sbjct:    6504 GCCCCATGGGAGTGACAGCAC 6484


44-5-23_D06_042 CTATGTGNNNCNNGNANNNCC


50-5-25_B07_061 CTGTGCATGGGAGCCAGATCT
gnl|BL_ORD_ID|312 3476947 18
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CTGTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||||
Sbjct:    1427 CTGTGCATGGGAGCCAGATCT 1447
gnl|BL_ORD_ID|10 3476947 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CTGTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||||
Sbjct:    1427 CTGTGCATGGGAGCCAGATCT 1447


51-5-26_C07_059 GTTGCAGGTGTGGGAGAGCCG
gnl|BL_ORD_ID|17 7562871 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 GTTGCAGGTGTGGGAGAGCC 20
               ||||||||||||||||||||
Sbjct:    7569 GTTGCAGGTGTGGGAGAGCC 7550


52-5-27_D07_057 TGCTCCTTGTCTTTCCCTCCG
gnl|BL_ORD_ID|312 3476947 18
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TGCTCCTTGTCTTTCCCTCC 20
               ||||||||||||||||||||
Sbjct:     703 TGCTCCTTGTCTTTCCCTCC 722
gnl|BL_ORD_ID|10 3476947 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TGCTCCTTGTCTTTCCCTCC 20
               ||||||||||||||||||||
Sbjct:     703 TGCTCCTTGTCTTTCCCTCC 722


52-5-27_D07_057 TAACGACGGCAGTAGCTTCCG
gnl|BL_ORD_ID|277 4995367 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TAACGACGGCAGTAGCTTCC 20
               ||||||||||||||||||||
Sbjct:      88 TAACGACGGCAGTAGCTTCC 69


57-5-28_A08_064 GGCAGGGTCTCCTCTTCAATT
gnl|BL_ORD_ID|13 5085511 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 GCAGGGTCTCCTCTTCAATT 21
               ||||||||||||||||||||
Sbjct:    4050 GCAGGGTCTCCTCTTCAATT 4069


58-5-29_B08_062 CTGTGTCACGGTCTTGTCTCC
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CTGTGTCACGGTCTTGTCTCC 21
               |||||||||||||||||||||
Sbjct:   12262 CTGTGTCACGGTCTTGTCTCC 12282


60-5-31_D08_058 TAACGACGGCAGTAGCTTCCG
gnl|BL_ORD_ID|277 4995367 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TAACGACGGCAGTAGCTTCC 20
               ||||||||||||||||||||
Sbjct:      88 TAACGACGGCAGTAGCTTCC 69


65-5-32_A09_079 CACTTGCCCTAATTAAATCCG
gnl|BL_ORD_ID|161 9981706 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CACTTGCCCTAATTAAATCC 20
               ||||||||||||||||||||
Sbjct:    4120 CACTTGCCCTAATTAAATCC 4101


66-5-33_B09_077 NNNNGNGGNNAANNNNNNGNC


67-5-34_C09_075 TTTACCAAGGCATCAGGTGAG
gnl|BL_ORD_ID|176 1639848 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TTTACCAAGGCATCAGGTGAG 21
               |||||||||||||||||||||
Sbjct:    3336 TTTACCAAGGCATCAGGTGAG 3356


67-5-34_C09_075 NNNNNNNACAGATTCNNNNNG


68-5-35_D09_073 GGTTCAGAAAGCAACAACCCC
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 GTTCAGAAAGCAACAACCCC 21
               ||||||||||||||||||||
Sbjct:    5415 GTTCAGAAAGCAACAACCCC 5396


73-5-36_A10_080 NNNNNTANNNNNNANTANNGN


76-5-39_D10_074 GCAGAAGTCACAAGTGAACCC
gnl|BL_ORD_ID|192 895665 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 CAGAAGTCACAAGTGAACCC 21
               ||||||||||||||||||||
Sbjct:     913 CAGAAGTCACAAGTGAACCC 894


81-5-40_A11_095 GCGTTGGCGAGTCTCTGATTA
gnl|BL_ORD_ID|54 6229761 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GCGTTGGCGAGTCTCTGATTA 21
               |||||||||||||||||||||
Sbjct:    2616 GCGTTGGCGAGTCTCTGATTA 2596


82-5-41_B11_093 GAGGACAACAGGGGGAGCCCC
gnl|BL_ORD_ID|12 5570943 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 AGGACAACAGGGGGAGCCCC 21
               ||||||||||||||||||||
Sbjct:    2484 AGGACAACAGGGGGAGCCCC 2465


83-5-42_C11_091 GGCCTGTATGTGAACATCAGC
gnl|BL_ORD_ID|171 3385262 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 GCCTGTATGTGAACATCAGC 21
               ||||||||||||||||||||
Sbjct:     844 GCCTGTATGTGAACATCAGC 825


89-5-44_A12_096 CCATGCTACTTGAGGGCATTG
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CCATGCTACTTGAGGGCATT 20
               ||||||||||||||||||||
Sbjct:    3380 CCATGCTACTTGAGGGCATT 3361


90-5-45_B12_094 AGAAGGTGGTATGCTGATGGG
gnl|BL_ORD_ID|139 4286355 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 AGAAGGTGGTATGCTGATGGG 21
               |||||||||||||||||||||
Sbjct:    1662 AGAAGGTGGTATGCTGATGGG 1682


91-5-46_C12_092 NCNNNTGANTCCNNNNANNGG


42

In [889]:
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_start
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_end
#blastsandrecords[3][1].alignments[0].title.split()[1] # splits the generated BLAST alignment title on \
                                                      # whitespace; extracts the second element which \
                                                      # corresponds to the position on the scaffold

In [890]:
mappable = []
for tgt, blast in blastsandrecords:
    for i, j in enumerate(blast.alignments):
        try:
            pcr = blast.alignments[i].title.split()[1]
            start = blast.alignments[i].hsps[0].sbjct_start
            end = blast.alignments[i].hsps[0].sbjct_end
            match = blast.alignments[i].hsps[0].match
            query = blast.alignments[i].hsps[0].query
            print blast.alignments[i].title
            print pcr
            print i
            print(" " * (blast.alignments[i].hsps[0].query_start - 1) + query)
            print (" " * (blast.alignments[i].hsps[0].query_start - 1) + match)
            print tgt.seq + "\n"
            print 
            mapstring = (pcr, start, end)
            mappable.append((tgt, mapstring, match))
        except:
            pcr, start, end = 0,0,0


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
ACCATGCTACTTGAGGGCATT
|||||||||||||||||||||
ACCATGCTACTTGAGGGCATT


gnl|BL_ORD_ID|54 6229761 <unknown description>
6229761
0
 GAGCTGGCATTGGCCTGTCC
 |||||||||||| |||||||
GGAGCTGGCATTGGCCTGTCC


gnl|BL_ORD_ID|171 3385262 <unknown description>
3385262
0
 CATGCATTAAAGCCATTGCC
 ||||||||||||||||||||
NCATGCATTAAAGCCATTGCC


gnl|BL_ORD_ID|104 2901440 <unknown description>
2901440
0
TAATGGTCTGGTTGGCTGGT
||||||||||||||||||||
TAATGGTCTGGTTGGCTGGTG


gnl|BL_ORD_ID|143 3110360 <unknown description>
3110360
0
AGCTAATATTTTGGGGGCCCC
|||||||||||||||||||||
AGCTAATATTTTGGGGGCCCC


gnl|BL_ORD_ID|171 3385262 <unknown description>
3385262
0
CCATCCATATGCAACTCATGA
|||||||||||||||||||||
CCATCCATATGCAACTCATGA


gnl|BL_ORD_ID|161 9981706 <unknown description>
9981706
0
GATGGCCACTACAGGGACCCC
|||||||||||||||||||||
GATGGCCACTACAGGGACCCC


gnl|BL_ORD_ID|312 3476947 18
3476947
0
TGTGCATGGGAGCCAGATCTG
|||||||||||||||||||||
TGTGCATGGGAGCCAGATCTG


gnl|BL_ORD_ID|10 3476947 <unknown description>
3476947
1
TGTGCATGGGAGCCAGATCTG
|||||||||||||||||||||
TGTGCATGGGAGCCAGATCTG


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
TACAGCACGGCTACAATGGAG
|||||||||||||||||||||
TACAGCACGGCTACAATGGAG


gnl|BL_ORD_ID|333 8426516 20
8426516
0
AGGAGGCACTTTGAAGAGCAG
|||||||||||||||||||||
AGGAGGCACTTTGAAGAGCAG


gnl|BL_ORD_ID|283 8424649 <unknown description>
8424649
1
AGGAGGCACTTTGAAGAGCAG
|||||||||||||||||||||
AGGAGGCACTTTGAAGAGCAG


gnl|BL_ORD_ID|59 8480056 <unknown description>
8480056
0
GGTGAAGGACAGCCCAATTA
||||||||||||||||||||
GGTGAAGGACAGCCCAATTAG


gnl|BL_ORD_ID|143 3110360 <unknown description>
3110360
0
CCATATCTGAGTAGTGGGCC
||||||||||||||||||||
CCATATCTGAGTAGTGGGCCG


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
TACAGCACGGCTACAATGGAG
|||||||||||||||||||||
TACAGCACGGCTACAATGGAG


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
CACAAAAGGGGATCTGAACC
||||||||||||||||||||
CACAAAAGGGGATCTGAACCG


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
CCATGCTACTTGAGGGCATT
||||||||||||||||||||
CCATGCTACTTGAGGGCATTG


gnl|BL_ORD_ID|250 9278465 <unknown description>
9278465
0
TATAAAAAAGTGAAGGACCTG
|||||||||||||||||||||
TATAAAAAAGTGAAGGACCTG


gnl|BL_ORD_ID|33 7685624 <unknown description>
7685624
0
CACAGAATAAGGTCACTTAC
|||||||||| |||||||||
CACAGAATAAGGTCACTTACG


gnl|BL_ORD_ID|333 8426516 20
8426516
0
   GAGGCACTTTGAAGAGCA
   ||||||||||||||||||
NANGAGGCACTTTGAAGAGCA


gnl|BL_ORD_ID|283 8424649 <unknown description>
8424649
1
   GAGGCACTTTGAAGAGCA
   ||||||||||||||||||
NANGAGGCACTTTGAAGAGCA


gnl|BL_ORD_ID|154 270146 <unknown description>
270146
0
GTCTTACCAAGGCACATGTGG
|||||||||||||||||||||
GTCTTACCAAGGCACATGTGG


gnl|BL_ORD_ID|41 9593441 <unknown description>
9593441
0
 CTCAAGTAAGGCATTCCTGA
 ||||||||||||||||||||
GCTCAAGTAAGGCATTCCTGA


gnl|BL_ORD_ID|104 2901440 <unknown description>
2901440
0
GCCCCATGGGAGTGACAGCAC
|||||||||||||||||||||
GCCCCATGGGAGTGACAGCAC


gnl|BL_ORD_ID|312 3476947 18
3476947
0
CTGTGCATGGGAGCCAGATCT
|||||||||||||||||||||
CTGTGCATGGGAGCCAGATCT


gnl|BL_ORD_ID|10 3476947 <unknown description>
3476947
1
CTGTGCATGGGAGCCAGATCT
|||||||||||||||||||||
CTGTGCATGGGAGCCAGATCT


gnl|BL_ORD_ID|17 7562871 <unknown description>
7562871
0
GTTGCAGGTGTGGGAGAGCC
||||||||||||||||||||
GTTGCAGGTGTGGGAGAGCCG


gnl|BL_ORD_ID|312 3476947 18
3476947
0
TGCTCCTTGTCTTTCCCTCC
||||||||||||||||||||
TGCTCCTTGTCTTTCCCTCCG


gnl|BL_ORD_ID|10 3476947 <unknown description>
3476947
1
TGCTCCTTGTCTTTCCCTCC
||||||||||||||||||||
TGCTCCTTGTCTTTCCCTCCG


gnl|BL_ORD_ID|277 4995367 <unknown description>
4995367
0
TAACGACGGCAGTAGCTTCC
||||||||||||||||||||
TAACGACGGCAGTAGCTTCCG


gnl|BL_ORD_ID|13 5085511 <unknown description>
5085511
0
 GCAGGGTCTCCTCTTCAATT
 ||||||||||||||||||||
GGCAGGGTCTCCTCTTCAATT


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
CTGTGTCACGGTCTTGTCTCC
|||||||||||||||||||||
CTGTGTCACGGTCTTGTCTCC


gnl|BL_ORD_ID|277 4995367 <unknown description>
4995367
0
TAACGACGGCAGTAGCTTCC
||||||||||||||||||||
TAACGACGGCAGTAGCTTCCG


gnl|BL_ORD_ID|161 9981706 <unknown description>
9981706
0
CACTTGCCCTAATTAAATCC
||||||||||||||||||||
CACTTGCCCTAATTAAATCCG


gnl|BL_ORD_ID|176 1639848 <unknown description>
1639848
0
TTTACCAAGGCATCAGGTGAG
|||||||||||||||||||||
TTTACCAAGGCATCAGGTGAG


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
 GTTCAGAAAGCAACAACCCC
 ||||||||||||||||||||
GGTTCAGAAAGCAACAACCCC


gnl|BL_ORD_ID|192 895665 <unknown description>
895665
0
 CAGAAGTCACAAGTGAACCC
 ||||||||||||||||||||
GCAGAAGTCACAAGTGAACCC


gnl|BL_ORD_ID|54 6229761 <unknown description>
6229761
0
GCGTTGGCGAGTCTCTGATTA
|||||||||||||||||||||
GCGTTGGCGAGTCTCTGATTA


gnl|BL_ORD_ID|12 5570943 <unknown description>
5570943
0
 AGGACAACAGGGGGAGCCCC
 ||||||||||||||||||||
GAGGACAACAGGGGGAGCCCC


gnl|BL_ORD_ID|171 3385262 <unknown description>
3385262
0
 GCCTGTATGTGAACATCAGC
 ||||||||||||||||||||
GGCCTGTATGTGAACATCAGC


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
CCATGCTACTTGAGGGCATT
||||||||||||||||||||
CCATGCTACTTGAGGGCATTG


gnl|BL_ORD_ID|139 4286355 <unknown description>
4286355
0
AGAAGGTGGTATGCTGATGGG
|||||||||||||||||||||
AGAAGGTGGTATGCTGATGGG



In [891]:
len(mappable)


Out[891]:
42

In [892]:
ampsdict = {}
for item in allamps:
    ampsdict[item.id] = item.seq

Problem: not all BLAST hit titles (sequences) seem to map to amps300 dict entries...


In [893]:
def printloc(queryseq, ampsdict): #lib5pr is subjectseq; t7 is queryseq
    '''
    This function accepts a query seq and a dictionary of subjectseqs, where the key (amp)
    is contained in a field in queryseq, highlighting the location of queryseq in it. 
    Returns a string.
    '''
    subjectseq = SeqRecord(ampsdict[queryseq[1][0]])
    #for seqrecord in subjectseq:
    locstart = queryseq[1][1]
    #print queryseq
    locend = queryseq[1][2]
    fwdlocs = []  
    revlocs = []
    # Figure out which strand the BLAST hit is on
    if locstart <= locend:
        fwdlocs.append(locstart)
    if locstart > locend:
        revlocs.append(locend)
    
    for item in fwdlocs:
        start = ExactPosition(int(item))
        end = ExactPosition(int((item) + len(queryseq[0].seq) + 1))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type=str("cutsite_fwd"), strand = +1)
        subjectseq.features.append(feature)

    for item in revlocs:
        start = ExactPosition(int(item) - 2)
        end = ExactPosition(start + len(queryseq[0].seq) -1)
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type=str("cutsite_rev"), strand = -1)
        subjectseq.features.append(feature)
    #print subjectseq.features

    mask = list((("-" * 9) + "^" )* int(round(len(subjectseq.seq)/10.0)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        if feature.strand == 1:
            mask = mask[:featstart] + [">"] * int(featend-1 - featstart) + mask[featend-1:]
            #context = subjectseq[featstart+1:featend+4]
            context = subjectseq[featstart-10:featend+10]
        if feature.strand == -1:
            mask = mask[:featstart+1] + ["<"] * int(featend+1 - featstart) + mask[featend+1:]
            #context = subjectseq[featstart-2:featend+2]
            context = subjectseq[featstart-10:featend+10]
    mask = "".join(mask)
    # Add labels
    masklab = list(" " * (len(subjectseq.seq)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        featname = str(feature.type)
        masklab = masklab[:featstart] + list(str(featname)) + list(" " * (featend-1 - featstart - len(featname))) + masklab[featend-1:]
    masklab = "".join(masklab)
    #print subjectseq.name
    lines = int(round(len(subjectseq.seq) / 100))
    i = 0
    fullstring = []
    # Draw out the map, with three lines: subject seq, a marker/counter line with chevrons over features, then a 
    # feature label
    while i <= lines:
        indexstart = i*100
        indexend = (i+1) * 100
        if indexend > len(subjectseq.seq):
            indexend = len(subjectseq.seq)
        outstring = list(str(indexstart+1) + "  " + subjectseq.seq[indexstart:indexend] + "   " + str(indexend) + "\n" + \
        str(indexstart + 1) + "  " + mask[indexstart:indexend] + "   " + str(indexend) + "\n" + \
        str(indexstart +1) + "  " + masklab[indexstart:indexend] + "   " + str(indexend) + "\n")
        i = i + 1
        fullstring.extend(outstring)
    fullstring = "".join(fullstring)
    return (fullstring, context, subjectseq, fwdlocs, start, end, feature)

In [894]:
#t = printloc(mappable[3], ampsdict[mappable[3][1][0]])
t = printloc(mappable[6], ampsdict)

In [895]:
t[1]


Out[895]:
SeqRecord(seq=Seq('AAATTTAACCAGGGGTCCCTGTAGTGGCCATCTCCCCTCT', SingleLetterAlphabet()), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [896]:
len(mappable)


Out[896]:
42

next up: trim the sequence files properly based on being followed by hairpin start. and include a map of the expected cut sites in that amplicon.


In [897]:
mappable[1]


Out[897]:
(SeqRecord(seq=Seq('GGAGCTGGCATTGGCCTGTCC', IUPACAmbiguousDNA()), id='02-5-1', name='02-5-1_B01_013', description='', dbxrefs=[]),
 (u'6229761', 1275, 1256),
 u'|||||||||||| |||||||')

In [898]:
for number, item in enumerate(mappable):
    if str("N") in item[0]:
        print number
        print item


2
(SeqRecord(seq=Seq('NCATGCATTAAAGCCATTGCC', IUPACAmbiguousDNA()), id='04-5-3', name='04-5-3_D01_009', description='', dbxrefs=[]), (u'3385262', 1093, 1074), u'||||||||||||||||||||')
19
(SeqRecord(seq=Seq('NANGAGGCACTTTGAAGAGCA', IUPACAmbiguousDNA()), id='36-5-19', name='36-5-19_D05_041', description='', dbxrefs=[]), (u'8426516', 1695, 1712), u'||||||||||||||||||')
20
(SeqRecord(seq=Seq('NANGAGGCACTTTGAAGAGCA', IUPACAmbiguousDNA()), id='36-5-19', name='36-5-19_D05_041', description='', dbxrefs=[]), (u'8424649', 3562, 3579), u'||||||||||||||||||')

In [899]:
mappable[7]


Out[899]:
(SeqRecord(seq=Seq('TGTGCATGGGAGCCAGATCTG', IUPACAmbiguousDNA()), id='<unknown id>', name='18-5-9_B03_029', description='<unknown description>', dbxrefs=[]),
 (u'3476947', 1428, 1448),
 u'|||||||||||||||||||||')

In [900]:
t = printloc(mappable[10], ampsdict)
#t = printloc(mappable[10], ampsdict["9981706"])

In [901]:
print t[1].seq


TTTATTCATAGGAGGCACTTTGAAGAGCAGACATCTGCAGTA

In [902]:
mappable[-1]


Out[902]:
(SeqRecord(seq=Seq('AGAAGGTGGTATGCTGATGGG', IUPACAmbiguousDNA()), id='90-5-45', name='90-5-45_B12_094', description='', dbxrefs=[]),
 (u'4286355', 1662, 1682),
 u'|||||||||||||||||||||')

In [906]:
primers_fwd = '''\
Fwd
TCCCTTTCTTTCCCGTTACC
AGAATAGGGACCGCATTGAC
GAGAGTCGGCGAGTCCATAA
AGAGCTGAAGCACCACAGGT
GCCGGATTCACTCAGATCA
ACCAGCCAACCAGACCATTA
GCCCGCTCCATTATAACAAG
TTCGTGCAATGGACAAGTAGA
GTTGACCCAAATGACCCATC
TGGTGAAGTGTGACGTAGCC
GGAATTGACATGGACAATGG
GGCTAAACCGGAATGAACAA
TGGGTCCCAATGATGAGTCT
CAGAGCTGCCTCATGACACT
GCAATGTGATGCGAAGTGTA
TAATCAGAGACTCGCCAACG
AAAGGATTGTGGACTCATGG
TGGACGCAGTTGTCATGATT
CGAAGAGGGATGCAGGACTA
AAGCGCCCTTCTTTCCTAAT
TAACAAGCCATTTGCCACCT
TTTCTGGAAGTTTGGACACTG
TGACTCAGAGGGTAGCAACT
ACCAAGAGTCCTCATGCACT
ACAATTCGCCAATCATTGCT
TCACACTTAACTGGGAGAATG
ATCATCCTGCGCCTAAGGTT
GCACTCTACACAAAGTTCTCG
CTGGGCCCAAAGTATCTCAT
TGTCACCCACTAATGTTTCAGG
CAAAGCTCACGTCAAATAAACG
GAACCAGAATGAGTGCTGTCC
AAAGACGGCCAGTATGCAGT
CAGTGTTCATCGGAACAAAGC
AGATCTTGGAGGCCCTGTTT
TGTGATTATGCAGAGGACAACC
TGCTCAATTACGGGTTTGGT
TTGGCCATACTTCAGCCAAT
CCGACCTGAACCCTCCTAAT
CTGTCTGTCTCTACCAATCACC
TGTGCTCTGTTGATGCGTCT
TCCTTCTCAGTATGCGCTGA
TGCAAGAGCGTCTGAATTTG
TATATTGCCTGGGCGCTAAC
TGTCACAACCCACTGATTCC
CCACTGATATAGTGTGGGCTAA
GTTACTGCCGTGAGGGATGA
AGTGATGGGTCTGCCAGAAT
AAACATGGTAAGCATCTGTGG
CAGTTATGGCTGCCTCGAA
GGGATTAGGGAGGATCAGGA
GCCAGGAATTGGCAGTAGTC
GACACGGGAAAGAAACATGA
TTTCAGTAGCCGCATCAGTG
CCAATTAAGCAGATTGGAGTTC
CCTTGTAATCCTACTGTGCCTA
GGCTTGCTCTGAGAAGGCTAT
TGCTGGAGTCCACCTGATTA
GACTGAACCGTCATTCCGATA
CGCCCACCAACTGAACTTAG
AGTGTGACGTCAGAGGCAAG
TTGCATTATTATGCGCTACTGG
GTTGTGAAATCTATTGCCTCCA
AGACACAATCTAATGAGGGATG
TTGGATGAGGTTGAGGCTTA
AGACTCCTGAGAGCCCATTT
CGTGCGATTGTTTCAGGTTT
GGAACGGTGTGTATGTCCAA
CCAAACCTAGGTGGTTCTCG
GGAACACTCATTAGGGAGCA
TCTTTACAGCACCTGCTTCTGA
GAGCCGAATAAAGTGACAAA
TGTGAATCAATCTGTCTTACGC
TATGATTGAGGGCCTTGTGG
CCAGTTCCAGGTGTGCCTA
CAGTGCCCACAAGGAGTAGG
AGAATAGGTGGATTCACTGAGG
AGTTGGGCAGGCCTAACATT
CCAATGGGCAGGAACTTATG
TCATCAACAACTGGAGTCTGC
ACGATGCAGCAATTCCCTAC
GGTACTGCCATCACCCTTGT
GCAGTGTGAGCCCAACAGTA
AGCCTGGACCTCTCCTTGAT
CCCATAAGTGCCGACTTCA
CCAGAAAGTAGGAGCCGATG
TCCCGGCTCTAAAGTAGTCTTG
AAAGTCAAGGGCTGCCATC
GGGAGAGCCCTTGGAATAAA
AACGATGTACAACACCAGTTGC
GCCAAGGATGAAACCAAATC
AATGGATCAATACCCTGTCC
CCACATAGCTTCCCTGTTCTTT
TGGTCATACCACACCAATGAA
CCAAGCTAGGCTTGAACTGG
TAGCCGCTTCGCAGTTTAAT
CCACCCTTCAGACTGGCTAC
AGGAAGGACATGGAATTAACTG
AATGCCCTCAAGTAGCATGG
GTCTTGAGGAAGCAGCAACC
TTTGCCCGGTGATAGAATGT
TCATGAGTTGCATATGGATGG
GTTCATTGATGGGTGCCAGT
TGGTGAACCTGTATCAAATACG
TCAGGAATGCCTTACTTGAGA
CTTGCAGGAACTTATGAACACA
TGAATGGATCCACCACAGAA
ATCCCAAGGGAACACGTAAG
GCCCACAGATTGCATTCAC
CGGCCCTGTCTCACAGTAA
CCAGGGTATTCTAACCCTATGC
TGGAGAATCCCAAGGATGTT
AACGTGCAACCTTTGAGTCC
TCCTCCTAAAGAAACGACGTG
TCCAAGCACTCCAACCTTGT
TTTCTGATGGGCCTCTGG
TCCTCGTAAGAGGTGTTTCCA
CACCCAACTCTTATGGTGGAA
ACCCGCCTCAATACCAAAGT
TCAGAATGGCTATGGCTGTG
CTAGCGGTTTATGAGCGTCAC
CCAACTCACACTCCAATAATCA
TCCATTGTAGCCGTGCTGTA
GCATTGCAGTTCCAATCAGA
GGCTGGACAAATACCACTGC
TCGTCAGAAGTTGTCCAAGG
CCACTATGGCCAACAAGAGAG
CCTGTGGGAAGTTATGAGACG
CTATTTGACCCGCAGTTTCC
TGGTTGCTCACATCACTGAA
CTCTTTGCAGATGAGCGTGA
GGAAGCTACTGCCGTCGTTA
TCCTTTATTGTCCCGCCATA
CCACATGTGCCTTGGTAAGA
CAACAGCAATCACCCTTCAA
AGGAGGATTATTGCACCCATA
TGTGCATCACACACTCTGGA
CTGGGACCACAGGGATAAAG
GTCGCACACATAAACGCAGT
AACAGGATCGGAGAGCATTG
ACACTCATGATAGTGACCTGCT
GGGAACCGTAGAGTTTATTGTG
AGGTTCCACAAGGAGGGAGT
ACATTGGCCTTGATCCTGAG
'''

In [907]:
primers_rev = '''\
Rev
GCGCCTAAGTGTCTTTGCAT
CAAGTGCAGAGCACCTTGAC
TGCCAACGTTTGTCTCTGAC
ATCACATGTGTCTCCAGGAA
GGCAGTTGGGACGTATTTGT
GCCAGTACCTGCCAGTAACC
TTGCTGGCACATTACCACTC
TTGGCTCCTGGACTGTCTTC
GAAACAGCCGTGTCCAGAT
TTCTGCAACGAACTGTCTCTG
GTTTCGGACCCACAATGG
TGTCACTAAAGCCTAGCAGAAA
CCCAGGAGATGGTCATAATC
GCCCTGAGTATCGGCATACA
TTTGTCAGCTTGTGGACCTG
AGGAATCCTATGCTATTTCTCG
CCCATGGTCCTTACAGACTGA
GAGCGTAGCACCACTTACGG
AACTGGTATGAATGCGCAAC
CCTGCTATCTCATCTTCCTTCA
CGAAATCCGGAAATCTCTGT
TCGGTCAGAATCACATCTGC
GCACTGGGATCTCAGGTTTG
CATTCTTCACGCTTGTTCCA
GCTGAAAGATACCTGCCAACA
GGGCAGGCTCTCTTAGTCAA
GCTGCTTGAATAATTCGTCTGC
TGACCGGAAATGTTGGAAAC
CTTGCACAAGTTGCTTCACA
GCTGTACCCGTGTAGGCTTT
GGCAAAGGGCTCCAGATATAA
CCCAGGAATAGAAGTCACGTTT
CTAATTGAATGCGTTCATGC
TAGGATGCTGCCCTATGGTC
GTCACCGACCATTCATTTCA
TTTCATCCCTTGTCATCCAT
GGGCTCTGGTCAAATGAT
CTTCTGCTCATGGGTTTGGT
CCAGTCTAGTGGCCAGGATT
AATTGAAGAGGAGACCCTGCT
AGGGCATCTCCAATGGTGTA
CTGCTGTACATCCAGGCTGA
CCATTACGGATGTAGTTCAGCA
GCAGTCGAGGCTTTGAGTCT
GCAAACCTTCAGGAGCATGT
AAAGACCCAGCAGGAATTGA
GGGTGTCTTAGAGGGTAACAAA
CACTCCATACAAAGCGCTCA
AATGGAAATCGCCACTATACG
ACAGACCCGCCCTGATGAT
CATTGACATGACACATTTCTCG
AGCTCTGTCCCAGGGTATAGT
AAGCCGTAAAGTGGAAGCAG
TGAGCTAACATTCTCAAGTCCA
TAATTGGGCTGTCCTTCACC
CCAGCTCAAGTTCGAGGAAA
ACATTCGCCGTAAAGCAAAG
AAATCCATTGGGCCTGCT
AAGGGACCATCTGGGTATGT
TGACCTGTACAACACCTTGTGA
TGTGCTACTGCCATGTACCC
CCATCTTAGGCCAACTTCCA
ACCCATCCTGGCACACTGTA
ACCCAAGGGTCTCACACTTC
GGGATTGAGTCAGGTGGGTTA
ATCCAAGTCCTGCCTGAGGT
TTTCATGTGAGGTTGCCAAT
AACACTTGTGTATCGGCCATC
TGAGCAACTTATTGAGGCACA
AGGGTTAGACGACTGCCAAG
TGCATTTAGACGTTTGGTTG
TCGAACTATCATCCCGCAGT
TCACCTGATGCCTTGGTAAA
CCCTCCAAATGAAGTGACCT
CCACCCAGGATCTATTTAGAGG
TTCGGCATCGCTTATTTACG
CTTTACGGATTGGGCAAGAA
TGACCCACTCAGCATAATGAA
TTTCTGGCAAGCACTCAGAA
AAACAAGGACATGCCACACA
TTCAATCCAAACGATGCAGA
TGCACCAGTCTATTCGGTCA
CCTGCATGCCTAGGGTATATT
CCCTGTGGTTGTCTAGCGTA
CCTAAGGCGCAATAGTGTGG
CTATCCAGAACCTCCCAGCA
AGAATACCACTGCTTGCTGAGA
CCTCTGCTGGCTACAGTTTG
CCATAAACCTTGGACGCAAC
TTTGAGTTGCCTGAACGTGA
GGTCTTCTTGGCCTTCCTAAA
GGTAGATACCCGTGGAATGC
GGGAGGGTATCCACATGAGA
GGAAGTGTAAGCTAAGGCTCA
ATTTCACGGCAAGCCAATTA
TTTGTCGCGCATCACTTT
CATTCCCTAAGGCATTTGTTTC
CCCTTAGAGGACAACGGAGA
TCGAGCATGGTCTGCATTAG
ACCTCTGTTGGTCCCTATGC
GGATTACAGTGGCCATATCGTT
TTAAGGAGCTGATGATTCCAG
CTACATGCCTTGGGCTTAGG
CTCAGGGTTCCTGTGCTCTC
CCCTCTTAGGGTATACGGGTTA
AATTTGGGTCGTGCGTATGT
ACTCCACTGAGGCCCAGATA
TTGGAAGGGCCATGTATAGG
AAGGGACGGTTTAGGGTCAG
CACGTGAGCTTCGGATGTTA
TTGAATGCATAGCACCTTTG
TGGCCTTGATCCTTCAGTTC
ATTTCAAATGCCCAAACGAC
TAACACCATGGCCGAGATTT
AAGTTCTCCAGGCGAATCAG
AGGTGTTTACCGAAGGCAGA
CGAGCTGTTGGTCATTGCTA
TCCAACCATTCCAAAGTCAA
CATCAGCACAAGCAGTCGTT
TGGTCTGATGTGACGAAAGC
TCTATCCATGGAGTCATTTGG
CTGGATGGCCAACTTCTGTC
GGAGACAAGACCGTGACACA
AACCTTGGCCAGGTATTATG
CCACATTTGTAAACGGCTCA
CCCATATTTGCGACATGTGTT
CCCATCAGCATACCACCTTC
CCCAGATTCCTGCCCATT
GATTTCGGGTGCATTGTCTT
GGACAACAGCTATGGCTTGC
TGTGTGTTATGGCGATGTCC
TTGCCTATAATTGAGCCAGAGA
AAACCATAGATCCTGGTTCAG
TTCAGATACTTCATCCTCAACC
ATGTATATTCACGCCTGTGG
GCTGCACAGAGATTCGATGA
CATGTCCAGGCAGTCCAAT
GGCAGGGTCCATCTACAGTT
CCCTCTCTCGGCTCCTATCT
TGCTCTTCAAAGTGCCTCCT
GGCCATCTGAGACTTTGCAC
CTGCAGCAATGGCCTTAAAT
AACTTGAGCGCAGGGAACT
AGATCTGGCTCCCATGCAC
'''

In [908]:
primers_fwd = pandas.read_table(io.BytesIO(primers_fwd))
primers_rev = pandas.read_table(io.BytesIO(primers_rev))

In [909]:
fwdprimerlist = []
for index,item in [list(x) for x in primers_fwd.itertuples()]:
    fwdprimerlist.append(item)

revprimerlist = []
for index,item in [list(x) for x in primers_rev.itertuples()]:
    revprimerlist.append(item)

In [910]:
revprimersreversed = []
for item in revprimerlist:
    item = Seq(item, IUPACAmbiguousDNA()).reverse_complement()
    revprimersreversed.append(item)
revprimerslist = revprimersreversed

In [911]:
revprimerlist = copy.deepcopy(revprimersreversed)

In [912]:
len(revprimersreversed)


Out[912]:
144

In [913]:
f = []
for item in fwdprimerlist:
    item = Seq(item, IUPACAmbiguousDNA())
    f.append(item)
fwdprimerlist = copy.deepcopy(f)

In [914]:
?item.


Object `item.` not found.

In [915]:
fwdprimerlist


Out[915]:
[Seq('TCCCTTTCTTTCCCGTTACC', IUPACAmbiguousDNA()),
 Seq('AGAATAGGGACCGCATTGAC', IUPACAmbiguousDNA()),
 Seq('GAGAGTCGGCGAGTCCATAA', IUPACAmbiguousDNA()),
 Seq('AGAGCTGAAGCACCACAGGT', IUPACAmbiguousDNA()),
 Seq('GCCGGATTCACTCAGATCA', IUPACAmbiguousDNA()),
 Seq('ACCAGCCAACCAGACCATTA', IUPACAmbiguousDNA()),
 Seq('GCCCGCTCCATTATAACAAG', IUPACAmbiguousDNA()),
 Seq('TTCGTGCAATGGACAAGTAGA', IUPACAmbiguousDNA()),
 Seq('GTTGACCCAAATGACCCATC', IUPACAmbiguousDNA()),
 Seq('TGGTGAAGTGTGACGTAGCC', IUPACAmbiguousDNA()),
 Seq('GGAATTGACATGGACAATGG', IUPACAmbiguousDNA()),
 Seq('GGCTAAACCGGAATGAACAA', IUPACAmbiguousDNA()),
 Seq('TGGGTCCCAATGATGAGTCT', IUPACAmbiguousDNA()),
 Seq('CAGAGCTGCCTCATGACACT', IUPACAmbiguousDNA()),
 Seq('GCAATGTGATGCGAAGTGTA', IUPACAmbiguousDNA()),
 Seq('TAATCAGAGACTCGCCAACG', IUPACAmbiguousDNA()),
 Seq('AAAGGATTGTGGACTCATGG', IUPACAmbiguousDNA()),
 Seq('TGGACGCAGTTGTCATGATT', IUPACAmbiguousDNA()),
 Seq('CGAAGAGGGATGCAGGACTA', IUPACAmbiguousDNA()),
 Seq('AAGCGCCCTTCTTTCCTAAT', IUPACAmbiguousDNA()),
 Seq('TAACAAGCCATTTGCCACCT', IUPACAmbiguousDNA()),
 Seq('TTTCTGGAAGTTTGGACACTG', IUPACAmbiguousDNA()),
 Seq('TGACTCAGAGGGTAGCAACT', IUPACAmbiguousDNA()),
 Seq('ACCAAGAGTCCTCATGCACT', IUPACAmbiguousDNA()),
 Seq('ACAATTCGCCAATCATTGCT', IUPACAmbiguousDNA()),
 Seq('TCACACTTAACTGGGAGAATG', IUPACAmbiguousDNA()),
 Seq('ATCATCCTGCGCCTAAGGTT', IUPACAmbiguousDNA()),
 Seq('GCACTCTACACAAAGTTCTCG', IUPACAmbiguousDNA()),
 Seq('CTGGGCCCAAAGTATCTCAT', IUPACAmbiguousDNA()),
 Seq('TGTCACCCACTAATGTTTCAGG', IUPACAmbiguousDNA()),
 Seq('CAAAGCTCACGTCAAATAAACG', IUPACAmbiguousDNA()),
 Seq('GAACCAGAATGAGTGCTGTCC', IUPACAmbiguousDNA()),
 Seq('AAAGACGGCCAGTATGCAGT', IUPACAmbiguousDNA()),
 Seq('CAGTGTTCATCGGAACAAAGC', IUPACAmbiguousDNA()),
 Seq('AGATCTTGGAGGCCCTGTTT', IUPACAmbiguousDNA()),
 Seq('TGTGATTATGCAGAGGACAACC', IUPACAmbiguousDNA()),
 Seq('TGCTCAATTACGGGTTTGGT', IUPACAmbiguousDNA()),
 Seq('TTGGCCATACTTCAGCCAAT', IUPACAmbiguousDNA()),
 Seq('CCGACCTGAACCCTCCTAAT', IUPACAmbiguousDNA()),
 Seq('CTGTCTGTCTCTACCAATCACC', IUPACAmbiguousDNA()),
 Seq('TGTGCTCTGTTGATGCGTCT', IUPACAmbiguousDNA()),
 Seq('TCCTTCTCAGTATGCGCTGA', IUPACAmbiguousDNA()),
 Seq('TGCAAGAGCGTCTGAATTTG', IUPACAmbiguousDNA()),
 Seq('TATATTGCCTGGGCGCTAAC', IUPACAmbiguousDNA()),
 Seq('TGTCACAACCCACTGATTCC', IUPACAmbiguousDNA()),
 Seq('CCACTGATATAGTGTGGGCTAA', IUPACAmbiguousDNA()),
 Seq('GTTACTGCCGTGAGGGATGA', IUPACAmbiguousDNA()),
 Seq('AGTGATGGGTCTGCCAGAAT', IUPACAmbiguousDNA()),
 Seq('AAACATGGTAAGCATCTGTGG', IUPACAmbiguousDNA()),
 Seq('CAGTTATGGCTGCCTCGAA', IUPACAmbiguousDNA()),
 Seq('GGGATTAGGGAGGATCAGGA', IUPACAmbiguousDNA()),
 Seq('GCCAGGAATTGGCAGTAGTC', IUPACAmbiguousDNA()),
 Seq('GACACGGGAAAGAAACATGA', IUPACAmbiguousDNA()),
 Seq('TTTCAGTAGCCGCATCAGTG', IUPACAmbiguousDNA()),
 Seq('CCAATTAAGCAGATTGGAGTTC', IUPACAmbiguousDNA()),
 Seq('CCTTGTAATCCTACTGTGCCTA', IUPACAmbiguousDNA()),
 Seq('GGCTTGCTCTGAGAAGGCTAT', IUPACAmbiguousDNA()),
 Seq('TGCTGGAGTCCACCTGATTA', IUPACAmbiguousDNA()),
 Seq('GACTGAACCGTCATTCCGATA', IUPACAmbiguousDNA()),
 Seq('CGCCCACCAACTGAACTTAG', IUPACAmbiguousDNA()),
 Seq('AGTGTGACGTCAGAGGCAAG', IUPACAmbiguousDNA()),
 Seq('TTGCATTATTATGCGCTACTGG', IUPACAmbiguousDNA()),
 Seq('GTTGTGAAATCTATTGCCTCCA', IUPACAmbiguousDNA()),
 Seq('AGACACAATCTAATGAGGGATG', IUPACAmbiguousDNA()),
 Seq('TTGGATGAGGTTGAGGCTTA', IUPACAmbiguousDNA()),
 Seq('AGACTCCTGAGAGCCCATTT', IUPACAmbiguousDNA()),
 Seq('CGTGCGATTGTTTCAGGTTT', IUPACAmbiguousDNA()),
 Seq('GGAACGGTGTGTATGTCCAA', IUPACAmbiguousDNA()),
 Seq('CCAAACCTAGGTGGTTCTCG', IUPACAmbiguousDNA()),
 Seq('GGAACACTCATTAGGGAGCA', IUPACAmbiguousDNA()),
 Seq('TCTTTACAGCACCTGCTTCTGA', IUPACAmbiguousDNA()),
 Seq('GAGCCGAATAAAGTGACAAA', IUPACAmbiguousDNA()),
 Seq('TGTGAATCAATCTGTCTTACGC', IUPACAmbiguousDNA()),
 Seq('TATGATTGAGGGCCTTGTGG', IUPACAmbiguousDNA()),
 Seq('CCAGTTCCAGGTGTGCCTA', IUPACAmbiguousDNA()),
 Seq('CAGTGCCCACAAGGAGTAGG', IUPACAmbiguousDNA()),
 Seq('AGAATAGGTGGATTCACTGAGG', IUPACAmbiguousDNA()),
 Seq('AGTTGGGCAGGCCTAACATT', IUPACAmbiguousDNA()),
 Seq('CCAATGGGCAGGAACTTATG', IUPACAmbiguousDNA()),
 Seq('TCATCAACAACTGGAGTCTGC', IUPACAmbiguousDNA()),
 Seq('ACGATGCAGCAATTCCCTAC', IUPACAmbiguousDNA()),
 Seq('GGTACTGCCATCACCCTTGT', IUPACAmbiguousDNA()),
 Seq('GCAGTGTGAGCCCAACAGTA', IUPACAmbiguousDNA()),
 Seq('AGCCTGGACCTCTCCTTGAT', IUPACAmbiguousDNA()),
 Seq('CCCATAAGTGCCGACTTCA', IUPACAmbiguousDNA()),
 Seq('CCAGAAAGTAGGAGCCGATG', IUPACAmbiguousDNA()),
 Seq('TCCCGGCTCTAAAGTAGTCTTG', IUPACAmbiguousDNA()),
 Seq('AAAGTCAAGGGCTGCCATC', IUPACAmbiguousDNA()),
 Seq('GGGAGAGCCCTTGGAATAAA', IUPACAmbiguousDNA()),
 Seq('AACGATGTACAACACCAGTTGC', IUPACAmbiguousDNA()),
 Seq('GCCAAGGATGAAACCAAATC', IUPACAmbiguousDNA()),
 Seq('AATGGATCAATACCCTGTCC', IUPACAmbiguousDNA()),
 Seq('CCACATAGCTTCCCTGTTCTTT', IUPACAmbiguousDNA()),
 Seq('TGGTCATACCACACCAATGAA', IUPACAmbiguousDNA()),
 Seq('CCAAGCTAGGCTTGAACTGG', IUPACAmbiguousDNA()),
 Seq('TAGCCGCTTCGCAGTTTAAT', IUPACAmbiguousDNA()),
 Seq('CCACCCTTCAGACTGGCTAC', IUPACAmbiguousDNA()),
 Seq('AGGAAGGACATGGAATTAACTG', IUPACAmbiguousDNA()),
 Seq('AATGCCCTCAAGTAGCATGG', IUPACAmbiguousDNA()),
 Seq('GTCTTGAGGAAGCAGCAACC', IUPACAmbiguousDNA()),
 Seq('TTTGCCCGGTGATAGAATGT', IUPACAmbiguousDNA()),
 Seq('TCATGAGTTGCATATGGATGG', IUPACAmbiguousDNA()),
 Seq('GTTCATTGATGGGTGCCAGT', IUPACAmbiguousDNA()),
 Seq('TGGTGAACCTGTATCAAATACG', IUPACAmbiguousDNA()),
 Seq('TCAGGAATGCCTTACTTGAGA', IUPACAmbiguousDNA()),
 Seq('CTTGCAGGAACTTATGAACACA', IUPACAmbiguousDNA()),
 Seq('TGAATGGATCCACCACAGAA', IUPACAmbiguousDNA()),
 Seq('ATCCCAAGGGAACACGTAAG', IUPACAmbiguousDNA()),
 Seq('GCCCACAGATTGCATTCAC', IUPACAmbiguousDNA()),
 Seq('CGGCCCTGTCTCACAGTAA', IUPACAmbiguousDNA()),
 Seq('CCAGGGTATTCTAACCCTATGC', IUPACAmbiguousDNA()),
 Seq('TGGAGAATCCCAAGGATGTT', IUPACAmbiguousDNA()),
 Seq('AACGTGCAACCTTTGAGTCC', IUPACAmbiguousDNA()),
 Seq('TCCTCCTAAAGAAACGACGTG', IUPACAmbiguousDNA()),
 Seq('TCCAAGCACTCCAACCTTGT', IUPACAmbiguousDNA()),
 Seq('TTTCTGATGGGCCTCTGG', IUPACAmbiguousDNA()),
 Seq('TCCTCGTAAGAGGTGTTTCCA', IUPACAmbiguousDNA()),
 Seq('CACCCAACTCTTATGGTGGAA', IUPACAmbiguousDNA()),
 Seq('ACCCGCCTCAATACCAAAGT', IUPACAmbiguousDNA()),
 Seq('TCAGAATGGCTATGGCTGTG', IUPACAmbiguousDNA()),
 Seq('CTAGCGGTTTATGAGCGTCAC', IUPACAmbiguousDNA()),
 Seq('CCAACTCACACTCCAATAATCA', IUPACAmbiguousDNA()),
 Seq('TCCATTGTAGCCGTGCTGTA', IUPACAmbiguousDNA()),
 Seq('GCATTGCAGTTCCAATCAGA', IUPACAmbiguousDNA()),
 Seq('GGCTGGACAAATACCACTGC', IUPACAmbiguousDNA()),
 Seq('TCGTCAGAAGTTGTCCAAGG', IUPACAmbiguousDNA()),
 Seq('CCACTATGGCCAACAAGAGAG', IUPACAmbiguousDNA()),
 Seq('CCTGTGGGAAGTTATGAGACG', IUPACAmbiguousDNA()),
 Seq('CTATTTGACCCGCAGTTTCC', IUPACAmbiguousDNA()),
 Seq('TGGTTGCTCACATCACTGAA', IUPACAmbiguousDNA()),
 Seq('CTCTTTGCAGATGAGCGTGA', IUPACAmbiguousDNA()),
 Seq('GGAAGCTACTGCCGTCGTTA', IUPACAmbiguousDNA()),
 Seq('TCCTTTATTGTCCCGCCATA', IUPACAmbiguousDNA()),
 Seq('CCACATGTGCCTTGGTAAGA', IUPACAmbiguousDNA()),
 Seq('CAACAGCAATCACCCTTCAA', IUPACAmbiguousDNA()),
 Seq('AGGAGGATTATTGCACCCATA', IUPACAmbiguousDNA()),
 Seq('TGTGCATCACACACTCTGGA', IUPACAmbiguousDNA()),
 Seq('CTGGGACCACAGGGATAAAG', IUPACAmbiguousDNA()),
 Seq('GTCGCACACATAAACGCAGT', IUPACAmbiguousDNA()),
 Seq('AACAGGATCGGAGAGCATTG', IUPACAmbiguousDNA()),
 Seq('ACACTCATGATAGTGACCTGCT', IUPACAmbiguousDNA()),
 Seq('GGGAACCGTAGAGTTTATTGTG', IUPACAmbiguousDNA()),
 Seq('AGGTTCCACAAGGAGGGAGT', IUPACAmbiguousDNA()),
 Seq('ACATTGGCCTTGATCCTGAG', IUPACAmbiguousDNA())]

In [916]:
ampsdict["10206893"]


Out[916]:
Seq('GGAGAATTCAACCCATAATAAAAAAAAGGCTCCCCCCTACCCTGGGTAGACTCC...CGC', SingleLetterAlphabet())

In [917]:
fwdprimerlist.extend(revprimerlist)

In [918]:
f = []
for index, item in enumerate(fwdprimerlist):
    item = SeqRecord(item)
    item.id = str(index)
    item.description = str(index)
    item.name = str(index)
    f.append(item)

In [919]:
with open("288primers.fasta", "w") as handle:
        SeqIO.write(f, handle, "fasta")

Made a BLAST database from 288 primers


In [920]:
# To match against the primer BLAST database, which of the mappable 20mers are basically fragment ends?
# First, write out a query fasta containing the mappable targets:
m = [j for j,k,l in copy.deepcopy(mappable)]

In [921]:
for item in m:
    item.id = item.name

In [922]:
Bio.SeqIO.write(m, "mappablesequencedtgts.fasta", "fasta")


Out[922]:
42

In [923]:
blastn_cline = NcbiblastnCommandline(query="mappablesequencedtgts.fasta", db="288prim", \
task = "blastn-short",outfmt=5, out="288prim.blast", max_target_seqs=100, num_threads = 7, evalue = 0.01)
timeit.timeit(blastn_cline, number =1)


Out[923]:
0.22045302391052246

In [924]:
result_handle = open("288prim.blast")
prim_blast_records = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here

prim_blast_records_list = []
for blast_record in prim_blast_records:
    prim_blast_records_list.append(blast_record)
result_handle.close()

In [925]:
counter = 0
for item in prim_blast_records_list:
    for i, h in enumerate(item.alignments):
        print i
        for j in h.hsps:
            print j
            counter = counter + 1
print counter


0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 CCATGCTACTTGAGGGCATT 21
               ||||||||||||||||||||
Sbjct:      20 CCATGCTACTTGAGGGCATT 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TAATGGTCTGGTTGGCTGGT 20
               ||||||||||||||||||||
Sbjct:      20 TAATGGTCTGGTTGGCTGGT 1
0
Score 21 (42 bits), expectation 1.1e-08, alignment length 21
Query:       1 CCATCCATATGCAACTCATGA 21
               |||||||||||||||||||||
Sbjct:      21 CCATCCATATGCAACTCATGA 1
0
Score 19 (38 bits), expectation 1.8e-07, alignment length 19
Query:       2 GTGCATGGGAGCCAGATCT 20
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 19 (38 bits), expectation 1.8e-07, alignment length 19
Query:       2 GTGCATGGGAGCCAGATCT 20
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TACAGCACGGCTACAATGGA 20
               ||||||||||||||||||||
Sbjct:      20 TACAGCACGGCTACAATGGA 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 AGGAGGCACTTTGAAGAGCA 20
               ||||||||||||||||||||
Sbjct:       1 AGGAGGCACTTTGAAGAGCA 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 AGGAGGCACTTTGAAGAGCA 20
               ||||||||||||||||||||
Sbjct:       1 AGGAGGCACTTTGAAGAGCA 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 GGTGAAGGACAGCCCAATTA 20
               ||||||||||||||||||||
Sbjct:       1 GGTGAAGGACAGCCCAATTA 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TACAGCACGGCTACAATGGA 20
               ||||||||||||||||||||
Sbjct:      20 TACAGCACGGCTACAATGGA 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 CCATGCTACTTGAGGGCATT 20
               ||||||||||||||||||||
Sbjct:      20 CCATGCTACTTGAGGGCATT 1
0
Score 18 (36 bits), expectation 6.9e-07, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:       3 GAGGCACTTTGAAGAGCA 20
0
Score 18 (36 bits), expectation 6.9e-07, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:       3 GAGGCACTTTGAAGAGCA 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 TCTTACCAAGGCACATGTGG 21
               ||||||||||||||||||||
Sbjct:      20 TCTTACCAAGGCACATGTGG 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 CTCAAGTAAGGCATTCCTGA 21
               ||||||||||||||||||||
Sbjct:      20 CTCAAGTAAGGCATTCCTGA 1
0
Score 19 (38 bits), expectation 1.8e-07, alignment length 19
Query:       3 GTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 19 (38 bits), expectation 1.8e-07, alignment length 19
Query:       3 GTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TAACGACGGCAGTAGCTTCC 20
               ||||||||||||||||||||
Sbjct:      20 TAACGACGGCAGTAGCTTCC 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 GCAGGGTCTCCTCTTCAATT 21
               ||||||||||||||||||||
Sbjct:       2 GCAGGGTCTCCTCTTCAATT 21
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 TGTGTCACGGTCTTGTCTCC 21
               ||||||||||||||||||||
Sbjct:       1 TGTGTCACGGTCTTGTCTCC 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TAACGACGGCAGTAGCTTCC 20
               ||||||||||||||||||||
Sbjct:      20 TAACGACGGCAGTAGCTTCC 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TTTACCAAGGCATCAGGTGA 20
               ||||||||||||||||||||
Sbjct:       1 TTTACCAAGGCATCAGGTGA 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 CGTTGGCGAGTCTCTGATTA 21
               ||||||||||||||||||||
Sbjct:      20 CGTTGGCGAGTCTCTGATTA 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 CCATGCTACTTGAGGGCATT 20
               ||||||||||||||||||||
Sbjct:      20 CCATGCTACTTGAGGGCATT 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 GAAGGTGGTATGCTGATGGG 21
               ||||||||||||||||||||
Sbjct:       1 GAAGGTGGTATGCTGATGGG 20
25

In [ ]:


In [940]:
# Generate a list of the mappable targets that didn't get matches against the primer library
mappable_noends = []
for item in prim_blast_records_list:
    if len(item.alignments) == 0:
        mappable_noends.append(item.query.split()[0])

So, 25 of the 42 mappable matches are the ends of PCR products. Next, filter the mappable list to remove these. Then check PAM-adjacency.


In [941]:
mappable_noends


Out[941]:
[u'02-5-1_B01_013',
 u'04-5-3_D01_009',
 u'11-5-6_C02_012',
 u'17-5-8_A03_031',
 u'25-5-12_A04_032',
 u'26-5-13_B04_030',
 u'28-5-15_D04_026',
 u'35-5-18_C05_043',
 u'43-5-22_C06_044',
 u'51-5-26_C07_059',
 u'52-5-27_D07_057',
 u'52-5-27_D07_057',
 u'65-5-32_A09_079',
 u'68-5-35_D09_073',
 u'76-5-39_D10_074',
 u'82-5-41_B11_093',
 u'83-5-42_C11_091']

In [950]:
len(mappable[0])


Out[950]:
3

In [1002]:
d = []
for item, j, k in mappable:
    if item.name in mappable_noends:
        d.append((item, j, k))

In [1013]:


In [1049]:
pamlist = []
for item in d:
    t = printloc(item, ampsdict)
    if t[1].features[0].strand == 1:
        pamlist.append((t, t[2][t[-2]-1:t[-2]+2]))
    if t[1].features[0].strand == -1:
        pamlist.append((t, t[2][t[-3]-2:t[-3]+1].reverse_complement()))

In [1047]:
print(t[0])


1  GCATACAGACTTATAGCATATTCATCAAGCCAAGTCATGTAATATTAAGGGAGCAGGATTAAGAGCAGATGAGATGGCAGTGAGGAAATGACATGCCAGT   100
1  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   100
1                                                                                                         100
101  TTGTTATCTATTACCCAGTCTGATTATCATGAGTTGCATATGGATGGGGACACACATGCTGTCTGGTGGGCAGAGAAGTTGAACCATCAGCTGTCATGGA   200
101  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   200
101                                                                                                         200
201  GTTGTACTTAAACAACAGCTGCGGGACCACAAGTTGCATATTGTTACTGTTTTCATCTTGTCCTACTGTCTCCATCTTGACCTATTTTTGCCATCTTGTT   300
201  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   300
201                                                                                                         300
301  CTACTGTCTCCATCTTCCTTACTGTTACTATCTTGCTCTACTGCCTCCATTGTTAAAAGGCCCACATTCAACATAGTCTTAATGATCCCTCTATTTATTT   400
301  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   400
301                                                                                                         400
401  CTCCACAGTGTAGTTCTCCAACCCAGGGTACAGGATGCCATTTTTTGGGAGGGGTCATTGCATATATATGATAAGCCCCAGGCTTGGCATTTTGATAAGC   500
401  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   500
401                                                                                                         500
501  AGGGGTTGCAAAGAGGACTACAACATAGCACATCAATACATGGCTTATTTTATCATGTAGTAGAGCCACCGGCCATGATGTATGTGCCATGTGCTGCCAT   600
501  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   600
501                                                                                                         600
601  AAGCTGTAAAGTGCCAGTACCCAACTCACACTGCAAATGCCCTTAGTCTGAGGATTTAAGGGAGAAGGCAGAACTAAAGAAAGGGGCTGTGAGAAAGTAC   700
601  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   700
601                                                                                                         700
701  TGGCGCACACAACATATAATGTAATATATCGTACTAATAAAACAAAGGCTTTATTTACTGTTCGCTTTACTTTCCCTTTAAGCCACCAAAGATAAATTAG   800
701  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   800
701                                                                                                         800
801  TATCTCTCTAAAGACAGAATACTAGCTGATGTTCACATACAGGCATCTTTCCACTATTCATTCTTCCCACAAAGTCCTCTCTGCCCTACGGCCATATGCC   900
801  ---------^---------^----<<<<<<<<<<<<<<<<<<<<<-----^---------^---------^---------^---------^---------   900
801                         cutsite_rev                                                                     900
901  TCCTGACAGTGGTATGATGATACAATGCCCAGCACAGGGTCTAAAAACCATCTGCAAAAGAATAGACTGGAATAAACAAAGCTGCCATATTCCCCATGCA   1000
901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1000
901                                                                                                         1000
1001  ACTTTCCACCCACCCCCATAAATCCCACCCTCCCCATATTTTTGGCCAAGGGTAGAGAAGCTAAGTTGACCCTGGCAATGGCTTTAATGCATGGGGAATG   1100
1001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1100
1001                                                                                                         1100
1101  GGAAGCTCTTGGATGCTCTGCGGGAAGTGTGCAGGTTGCAGACACTGCAGAATCCTTTGCATTGCTGCTAAGTACAACCGGGCCGGCCGTTTGCCCACCA   1200
1101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1200
1101                                                                                                         1200
1201  GGTAGTTAAATTTGTCCTCTCCCAGGAGGGGACTCCAAAACCCTGGGTTGTCTCTCAGCATCTCTCTCATTTTAAAGTTGAGGGAAGATGAGAAAAGCAA   1300
1201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1300
1201                                                                                                         1300
1301  GAGATAATCCAGGCAGAGCTTTTTGGACTCCACGTTGACTTTGGAACTCCACAGCTGCTCCAAAATGACGTCCAAGGGGGTTTCCCCTTTGCGGTCAATT   1400
1301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1400
1301                                                                                                         1400
1401  ATTTTGGGTGAGGCCCCGTTGCCTAGAAGAGTCAAGACAGTTTCGGCGCTCAGAAGTTCACAGGCCAAGTGCAAAGGTGTTTTACCATCCGACAAATGAA   1500
1401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1500
1401                                                                                                         1500
1501  AACAACTCGCCCTGTTGATGTAGGACTGAAGGCTGGGCAGCTCATGGATCACCTTAATGATCATGACTAAAATGTCCCTCCGATCATAAGTAACCGCCAG   1600
1501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1600
1501                                                                                                         1600
1601  GGCTAGATGAGGTGCAGAGGGTGGACAGCAGCAGAATTTGATGCCTGGCACCTTCAAGGCTTCCTCTGCATAGTGAGACAGCAGATACTGAGCATAGCGT   1700
1601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1700
1601                                                                                                         1700
1701  AGGTGGTTATGCACTACAGCGTACATGAGGGCGTCGGATGGTAAGTAGCACCTAAGACTTGCGTCTTCCTCCAAATGAAAATACTCCATAGTCCTCATTT   1800
1701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1800
1701                                                                                                         1800
1801  CTTCCAGTTTCCACACCGGCTCCTGGTCCCGCACCGCCTGGTAGAACATATAGGCGTAGTACTTGCACTGCTCATCCTGCAAGCTGACTTGATTGCTGCA   1900
1801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1900
1801                                                                                                         1900
1901  CATGTCTCCTGCCCAGGGAGAGGGAAACAAAGAGAAGAGTCAATGGTTTCCCTGCAGGGTTCCACGAGGAACGTCTTTCATGTGTCGCCGCCGTCACGCA   2000
1901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2000
1901                                                                                                         2000
2001  GTCCTTCCAGGGAGCAGAGCCTGGTCAGATGCCACCATCACTGCTGATTTATTCCTGGAGAACAAGCCTCCTGCTTGAGTTAACACTTGTCTCACCGATT   2100
2001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2100
2001                                                                                                         2100
2101  CCTCCCAATTCCTTCTGCCCGGCAGCCTAAAGTATGGGCACTGGTTACAGAATGGCATGGGATCCTATCATATAAAGGATGATCAGTCCATAGGGTTGAT   2200
2101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2200
2101                                                                                                         2200
2201  TGAATCCGATAACCTTGAACCTTCCTCTTTCTTCTCCACCCACCTGCCCCTCACGGTTTCCCTGCAGCCTTTTGCTGCATTTGGATGATGGGAAGCTGCT   2300
2201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2300
2201                                                                                                         2300
2301  GTCGGCTCACTAGCCTGCTGCTCCCTGCAGTGGTGAATGGAAGGGTGGGGGTTGCAATCTATTGAGGGTTTTGCTGTGTAGGAAGAAAAATACTAAGTAT   2400
2301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2400
2301                                                                                                         2400
2401  ATATCAAGTGGATCTCCGGTAGAGTCAGAGGCTTTCTGATAGGGAGGAGACTAGCAGCAGCCCCTCCGTTCTGCACCCACAGTGCAATTATAGCACAAAT   2500
2401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2500
2401                                                                                                         2500
2501  CTGCAGACACTGGAGCGCTGGAATCATCAGCTCCTTAATAGTGACTGAATATTAAAATGAACACAGGATTCAGCCAAACCCCAGTACTTTCTGCAGGATT   2600
2501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2600
2501                                                                                                         2600
2601  CTTTAGTTTTTAGCCAAGCCCAGTCCAAACTCATGCAAATTTAAAGCAGTGGTCAAGCAAATATAACACTGTTTTTGTTCACAATTGATTGACAAATGTG   2700
2601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2700
2601                                                                                                         2700
2701  AATATGCAAATTAGGAGTCCATTCAGGAACTGTATGCACCAAACATATGTTCATCCGAATCCATATTGCCACACTGCACACGAGTCGTGGTGTACACGAT   2800
2701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2800
2701                                                                                                         2800
2801  CCCCCTGTGTGGTTTTGAACTGGGCATCAACACTTTAAAACATTGGATAGAAATTCAGCCAGTTGCATCAAAGTGATGCAATAACCTTGCCCTGGTGTCA   2900
2801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2900
2801                                                                                                         2900
2901  TAACTCCACCAACATTATTGTGCCCACCTCTGATGTCATCATGCCTGTTCCTACATCACTGCCCCACTCCCAATGTTATCTGCCCCGCCTAGGGTTGCCA   3000
2901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3000
2901                                                                                                         3000
3001  CCTGGCCGTTATTTTACCGGCCTGGTCGATAAAAATGATGGTTGATCCCAATGTTATTAATAGGGAAAAAAGATAAATATATAGGAAGGCCGGTATTTTT   3100
3001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3100
3001                                                                                                         3100
3101  TTCCAGAAAAGGGGGCAACCCTAGCCCCACCCCCCATGTTCGGGTTTAGCCATCAGCAAAGGTGGCAACTCTAGTACAGCTCTTCTAATAATGTTGGAGT   3200
3101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3200
3101                                                                                                         3200
3201  ACAACCCTGTTGGCTGTCATGAATGGTTTGAGAGTTGTAGTTTAACAGCTGAAGGGTCATAGTTATCCATCATTCTGTGTGTATGTCACATGGTGTAACT   3300
3201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3300
3201                                                                                                         3300
3301  GAATAAACAAAGGCAAAACACACATGGATATGGGAACACCTGATATAAACTACTTGTCACTAGCCCCTCCCACCTATACACAGTCAGTAATAGCAAAGCA   3400
3301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3400
3301                                                                                                         3400
3401  AGGGTTTGTTGAAAGTTGCAGTTCAGCTGGAAGGCTACAGACTAGACAGCATTGCTCTATATGAATAATAGGGTTGTATATATGGTTGCCACCTGGCCGG   3500
3401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3500
3401                                                                                                         3500
3501  TATTTTACTGGCCTGGCCGGTAAAAATGATGGTTGATCCCAATGTTATTAATAGGGGAAAAAGATAAATATAAAGGAAGGCCGGTGGCAACCCTAGTTGT   3600
3501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3600
3501                                                                                                         3600
3601  ATATTGGAAGTGGGGACTCTGGGACCTCGTCTTCTGGTTATTGACTAATGTACTACTGTAAGGAAGGCAGGAGTTGATGGGAGTTCTGTGCAGTGAATGT   3700
3601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3700
3601                                                                                                         3700
3701  GAAAATGTGTGATATATTCAGGGAAGTGTTCCCCTGTGAGACTGTACAAAGGAGTGAGGACATCAGTGTTCTTATTAACTATGCGCCGATACAGATGGAT   3800
3701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3800
3701                                                                                                         3800
3801  CCATCAGTTTTGGTGTAAATTATGCATAGGACCTGCATGAATTATTCAACACCCCTCTTGAAAATCACTGTATTCTGTGCTTAGTCCTCAAATCCACAAT   3900
3801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3900
3801                                                                                                         3900
3901  AGGATTTTTTTCCAGTAGTTTAAATTCTCTCCCTCTTTTTACTGCTAAGGGGAAGCCCCAGGGGCTAATATACAGCTGTGGGTTAAAAACACCCCATTCC   4000
3901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4000
3901                                                                                                         4000
4001  CTTTATGTATCAACTACTGAACCACTATACCTTCAGATTGGCCTGCTGTCATAGTTTTATGGTATCTCTCTGTACAGACTATGAGCAAACTTAGGGGCTG   4100
4001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4100
4001                                                                                                         4100
4101  TTCCTGCTGAATTGTGCTTAGTACAGGGGGATACCTATGCTGCCATAGTTTTATGGTATCTCTCTGTACAGACTATGAGCAAACTTAGGGGCTGTTCCTG   4200
4101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4200
4101                                                                                                         4200
4201  CTGAATTGTGCTTAGTACATGGGAATACCTATGCTGCCACAGTTTTATGGTATCTCTCTGTACAGACTATGAGCAAACTTAGGGGCTGTTCCTGCTGAAT   4300
4201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4300
4201                                                                                                         4300
4301  TGTGCTTAGTACAGAGGAATACCTATGCTGCCATAGTTTTATGGTATCTCTCTGTACAGGCTATGAACCCTACCTAATGAGCTGTTACTGCTGAATTGTG   4400
4301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4400
4301                                                                                                         4400
4401  CCAAGTACTGTAAAGGGAAATACTTATGCTGTCATAGTTGTCTCAATGTGACAAATGATGGAATTATCTTTGCTATGATAAGTCCCAGAACTGCCGCTGG   4500
4401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4500
4401                                                                                                         4500
4501  GTCTGCTGTATCTAGTGTTATGTCCCTGAACCACACTTGCTTGATTGCCCAGGGAATACTTGGGTACAGATGTCCACTGCTACTCTGTCATTACTGGACA   4600
4501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4600
4501                                                                                                         4600
4601  TGAAATAGATCATTTATTGACAGCGGTGCCGTGTTTTGCACTGAGCTCTCCTGCTATAGCAGCAAGTATCATTTCAGCCCTAGGAGCTGACACTGTTGTC   4700
4601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4700
4601                                                                                                         4700
4701  TCTGTGCTGTAGTTCTCCACTTTTGTCACAGGAAATTAATCCATATCAACATGGTCACCCATCCACTTGTGCCCAGAAAAAGACTGTTAGGCATGGAAAA   4800
4701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4800
4701                                                                                                         4800
4801  CAAGAGTTTTAGATAGAATCTGTGCAGCCACTGGGACAGAATGCTCTGTTATACAGATAGCCATAAAGCAGGACAGGACTGCTGCTTACAATGGGGATCA   4900
4801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4900
4801                                                                                                         4900
4901  GACAGGATTATTGCTCATTTACTACATATCATACATAGTTTCAAGGCACAAATAAAAATCATGTATTCTGATATATCGTCTCTCTGTTGGAGGCATTTCT   5000
4901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5000
4901                                                                                                         5000
5001  ATCAGAGCTGCACAAAGACTTCTTCCCAGCAGACTTCATCCCAAGAGACTTTCCCTCATACAATGAGTATTCAGCTCCCCCTATCGGCAAAACACGAGCA   5100
5001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5100
5001                                                                                                         5100
5101  CAGATAATCTCTCAGGCTCCTCCAAAAGACTAAAGCGCTCACTGTGCGCCCTCTACAGGTTGGAACGGCTGACAACAGGGTCACGTGACTAGAACGCGGC   5200
5101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5200
5101                                                                                                         5200
5201  ACTATTTAATTGCCGGTGCGGATTCCATCACGTGACGAAATAGAATTACACAGCTAGTGCCACATACAAGAATAAGGCGGTCTTTATCGCCGGAGCATTA   5300
5201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5300
5201                                                                                                         5300
5301  CAAGGAGTAAAAACTGCCGAGTTAATTAACAGGAATTCTTTTTGAAGGTATTTCCAGCGTTACTTTTCCTTTAACCATCTATACGTAGGACACACACACA   5400
5301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5400
5301                                                                                                         5400
5401  AACAAGCCCTCCTCCTTTATTCGATCCGTTACTATCCCTTTAAGACAGCATCGATAGGGTTAAGAAGCCCTCGTGGCTGGAATGTTGTGCTGTCACAGGG   5500
5401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5500
5401                                                                                                         5500
5501  GTGGAGGCGGGGATTGGTGACGTCAGAAGGTCCCGCCTCCTCTCGCTGCGGCTCCTGGGTGGGCGGGACTTTGTGCCGGATGATTGACGGCTTCAGTAGG   5600
5501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5600
5501                                                                                                         5600
5601  GAAGAGGGAGGGGCCTAGTCAGACAGAACATTTGGTGTCGGTGCAGCAGTTCGTGTCTCGGGACCAGGGAGCGGCCGTAGGGAGTTCTGCGCTGGGAGCA   5700
5601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5700
5601                                                                                                         5700
5701  ACTAGAGGCTGGTGGAGAGGAGAGGCCTGTGCTTGAGACGAGGAGACCCCCTTTCTGTGTGCGGCCTGACTGGTACGGCTTGTGCCCCGCTCGCTCGTAG   5800
5701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5800
5701                                                                                                         5800
5801  CTTCTCCTGGGGTCGGGGAGGCGCCTCCAGGCTGACAGCGGAGTTTTGGGTGTCCGAGGGGCGGGATAATGACCTCATGGCTTGAGGGGGCTCTTAGGGC   5900
5801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5900
5801                                                                                                         5900
5901  CATGGGAAGCTTCAATCTGGATCTGACCAGACTGGGGGAGGGGCGTTGAAGAATGTGGGCGTGGCTAAATACTGCTGGGTCAGTTTGGGGGGAGGGGCAT   6000
5901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   6000
5901                                                                                                         6000
6001  AGTAAAGTTGATGAGTGTAAGAATAGGGGACAGGAGCAGGGCCGGACAGAGAGAAGGTGATGAGACAATAGAGAAGAGACACGTCCTGGCCAGATGGGCA   6100
6001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   6100
6001                                                                                                         6100
6101  CAGAACCCAAAGTTTGTTTGTGTGTTGGGGGGGGGCACATGGGCTGAGTTTGGGGCAGTTTCTTCTCACTAATCCTAGAAATGGTCTCATGTTATAGTAT   6200
6101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   6200
6101                                                                                                         6200
6201  TGTATGAAAGTTGTCTCCTGCTCATGTCTCCTC   6233
6201  ^---------^---------^---------^   6233
6201                                      6233


In [1037]:
print(t[1])


ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 1
Seq('ACAGAATACTAGCTGATGTTCACATACAGGCATCTTTCCA', SingleLetterAlphabet())

In [1064]:
for item, (j, otherjunk) in enumerate(pamlist):
    print item
    print otherjunk
    print "\n"


0
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AAA', SingleLetterAlphabet())


1
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


2
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


3
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


4
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


5
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


6
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('ATT', SingleLetterAlphabet())


7
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAA', SingleLetterAlphabet())


8
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAT', SingleLetterAlphabet())


9
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GTG', SingleLetterAlphabet())


10
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('CGG', SingleLetterAlphabet())


11
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


12
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAG', SingleLetterAlphabet())


13
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAG', SingleLetterAlphabet())


14
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('ATA', SingleLetterAlphabet())


15
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


16
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


17
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


18
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


19
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TAG', SingleLetterAlphabet())



In [1070]:
print pamlist[14][0][0]


1  TTTTACCCCAACAAAATGACAGTTGCATATTGCAGATTTGATTAACAATGTTACATCCCGTATTATATGGAAGCTACTGCCGTCGTTAATTGAATCCCGA   100
1  ---------^---------^---------^---------^---------^---------^--------<<<<<<<<<<<<<<<<<<<<<-^---------   100
1                                                                     cutsite_rev                         100
101  GTGCATTGTCTTCAGTTGTGTGTCTAGAACATTCCCAACCATGTATTGCTGTAAGGAAGGACTAATGGCTGTCAAGGTGGTGACGCACAATCACACACAA   200
101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   200
101                                                                                                         200
201  GTGTATAGCGAGGATGAGGCCTAAGGCACATGGGGGGTATTCTCAGAAGGCCACATTGTTGCTTCTAGGCTAATAAGATGCCCTTCTGCCTTTAAGATGC   300
201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   300
201                                                                                                         300
301  ATTTACCTGTTGAGTGCCCTTACTTTTTGAGTTGACACCCGTCACTTTGTCACCCACCATGAAATCCGTCCAATAGTATTGCATGTGTGGGGCTAATGGC   400
301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   400
301                                                                                                         400
401  ACTAGATGTTCAAAGGACACATAGTGTTGCTATGGTATAGTTCAGCTGCAGTGCATTTCACTAGGGTGTACATAACATTACAACCAGTATGGCAGCAGCT   500
401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   500
401                                                                                                         500
501  AGTGCTTTTGGCTGTAGGAACCCCCTGTTTCAACCAATCATGCAAACCCCAAGATGTGGCACCTGACAAGTCTTGTGTTGTCTTTAGCCTTGGCAAGATC   600
501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   600
501                                                                                                         600
601  TTTGGGTGATGTTTAATCGAGCCATGGTTAAGTATTCGTGCAGAAGGGCTCAAGGAAATGGCCTTAAAATCCTTCTACTCTTCTTGTCTTTGCAAAACCT   700
601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   700
601                                                                                                         700
701  CTGTCAAATTCAGAAATATATGGAGGTAACTTGCTCAATGGTAACAAGTCAGTAAATGATGACTGAAGCCTTTGCAGCGTTTCCATTAAGCAAGGACTTG   800
701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   800
701                                                                                                         800
801  TAGCAGTCAGTGGTCTCTGCTGATCACTTCCAGTTCATGCTCCACCTAGTGGCAGAATGCAGAAGTACGACACATTGATTTGTTATATGTTATGGGTTCT   900
801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   900
801                                                                                                         900
901  AATCAGAGGACACACAAGTGACTGTGGGACTAGTAGTCTTATTGGAGATGGGGGAGAATAAAGGGAGAATGGGACAGTTATTTATAATAAACACGATCAA   1000
901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1000
901                                                                                                         1000
1001  CGTGACATTTTGCCTGGGGTAAATTAAGAAAGTTTGAGATTGGCTGATTTATGTTGCACTGACTGGTAGTTAATTGTTCACTTTCTCTTCTCCGTGTGAT   1100
1001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1100
1001                                                                                                         1100
1101  TTACCCTTTAATCTTTTTTGTTTGGCCTGTGTTGGTTACTCCTTTCCATTTGCTTCTTTACCAAACCAGGCCGAGAGCGCAGGCCTTTTAGGAATCAACC   1200
1101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1200
1101                                                                                                         1200
1201  TTTGTGTGTCCCTTTATGTTGTTCCACTATAATTATATATATCAACCCCTGTCGGTTGCAATGCAGTGTGTGTGCTCCACTCACCCGAGGTGGGGGATGT   1300
1201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1300
1201                                                                                                         1300
1301  AGAAAAGTAGCAGTTTCTTAAAGGTTTTTAATATCTCAATCTGCTCGATGCATGGATTTGTACATATATATATTAATGTTATCTTTATTTGATTTTTCTG   1400
1301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1400
1301                                                                                                         1400
1401  CTTGGGAAAGAAATGAATGAGCCAAGGTCGATAAATCGGCAGCTCCGGGGGTTGCACTACAGCTACTGGGGCTGAAAATTAAAAGTCTCTGGCTCAATTA   1500
1401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1500
1401                                                                                                         1500
1501  TAGGCAAGTCAGGCTTTTCCATTTGCAGCTGTTGCCAGGGTTGCCAGGTCTAATTTTGAAAAACAGCCAAAGTCGGCTACAAAACTAGCCAAGAGGCACT   1600
1501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1600
1501                                                                                                         1600
1601  TAAAAGTAGCCCAAAAATAGCGCAATATGTGCAGTGAAAAAAATTCTAAAGAAATGAATGAATTTATGTGAAAAAATGCCTTTTTGAATTGTTCACTGTT   1700
1601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1700
1601                                                                                                         1700
1701  TAACAAATGTTCCCATGAATTCGCCCGTCACCGGGGATGTAACAACAGAAGGGGCCCAGGAGATATAGGGGCCCCGTAATACATATACAATTTCAATAAA   1800
1701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1800
1701                                                                                                         1800
1801  TATTGGTGAAAAAGCTCAACCTCTAGACATTTTGGTGGCCAGCAGATTTTTGCTGAAATTGAGGAAAGTCACCGGAATGGGAGACGGGGGGGACAGAGCC   1900
1801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1900
1801                                                                                                         1900
1901  CAAAATCTGATAACTCCCATCTTCTACACAAGTCAGTCCCATTGCATGCTGGGTATTGTAGTCTTACATTAAACTTGGCAGTTGGCAAATTGTTAAACAA   2000
1901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2000
1901                                                                                                         2000
2001  AAACTACATTACCCAGCATGCATTGGGAGACGCAGGCTTGGCTCATTAGGCAAGAAAAAACTGTGGCTGGTTTCCAAATTACAAACCAGCTAAAGGCCCA   2100
2001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2100
2001                                                                                                         2100
2101  AAAAGTAGCCCAAATCCTTACCTTGGCTAGTTTGTACTTTTAAAACCTGCCTGGGCTTTAAATTAGTAGCCCAGTTTGGCTGGAAACCTGCCGACCGTGG   2200
2101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2200
2101                                                                                                         2200
2201  CAACCCTGGCCGCTGCTTCAGATACCGTCGGACGCGGCTACTTGTATGATTTTGTATAAACGCATCGGAACAGGGCGGAGCTACAAAGAATAATGTAAAT   2300
2201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2300
2201                                                                                                         2300
2301  ACGTCACAGACACAAAGGAGAAAAGCACTTTCGGAATCCTCGGAACGCAGACTGTCGTCCCTCCTGGTCATCGTTCTGCTCCTTGTTCTTCTGACTTCTC   2400
2301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2400
2301                                                                                                         2400
2401  TGTCTCATTGCCTTCAAGCTGTCGGGGGACGCAGCACAAGTGCAATTGAATCTTTCAATAATGTATTTAACGGTAATGAAACCTCCAAGTCTTGGGCTCT   2500
2401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2500
2401                                                                                                         2500
2501  TTCCTCTTTTGTTTTCATTTCTGTTTGATACGTGTTGTGGTGTCGCCGAGGGGCGGCCGCGTCTCACGCTGTACATTGTTCAGCTCTTACAGAAATGACC   2600
2501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2600
2501                                                                                                         2600
2601  TTTGTCCTTATTAAAACAGATTTCTCTCCTTTTATCAAATCATTTCAATGTGTCGTTGGTCTGTCTGTGAGCTCGGTAATTTAATAAGGGCAACGGAATA   2700
2601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2700
2601                                                                                                         2700
2701  CTTGCTGGCAATTGTCGCCAGTTACATCGGGCCCCATACAACAACATTTTCTGGGCTCCTCCCAGGCCAGACCCCACCTACCCCATTGGTAACTATGGCC   2800
2701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2800
2701                                                                                                         2800
2801  CCCTACTACAAGTTAAAAACATCATTGGTGGTCAAGCCCCCCAATTCAAGTAAAAAAATAATCATAGGTGACCAGTGCCCCCCCCTACAAGTTAAAAAAA   2900
2801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2900
2801                                                                                                         2900
2901  AAAAATGTTGGTGCCCAGGGTTCCCCCTACAAGTTAAAAAAACAGGAAAAGCAACTGTAACTTGATTTATTTTATTGTAAAACAATTAGAATTAAAAAAA   3000
2901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3000
2901                                                                                                         3000
3001  TATTGGAGACCAGATTTCTGTCCTACAAGTTAAAAAACAAAAATATTGGTTGGAAAACAGGAACAGCAACTGTAACTAGATTTCTTTTATTGTAAAACAA   3100
3001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3100
3001                                                                                                         3100
3101  TAAAAAATATTGGTGACCAGGGTTCCCCCTACAAGTTAAAAAAAAATTATTGGTGGCCTTTCTTATGCACAGATTGCCAGCTATGACATCACAGGTGGGA   3200
3101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3200
3101                                                                                                         3200
3201  TTGACATGACAGAGCCATCTTGGTAGAGAAAATCTGTTTAGGTGGATCTGGGTTTCTCAACAATTCCTTGGTGAATAGCAGAGGGGCCAGACTATTCTGC   3300
3201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3300
3201                                                                                                         3300
3301  TTAGTCTCTAGACAGCAGGACCACATCCTGGGCATCTCAATCGAAGGCCGTCTCTTTTCCTTGACACTCGTCCTTCTCTCAGGTGGTCTCCCAAGATGAC   3400
3301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3400
3301                                                                                                         3400
3401  CAGAGCTGCAGACAGTGTTCTTAGCCAAGTGCTATTGGTGGTCTCACCATCTGGCTTGGGCCGGTTTGAGTTCAATGCAAGAATGTCACAGCAGATCTTA   3500
3401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3500
3401                                                                                                         3500
3501  TTATGGGTAGAACACAACAAGGCAATCCTGTCTACTATTCACATTCCAGGGGTTTCCAACACAGAGGTAGACTTGACAGTTGCAATCAGCAAGACCTTGG   3600
3501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3600
3501                                                                                                         3600
3601  AGATGAGGAGCTTCATCAGAATGTATTTGGTGGGGCTGACCAAAATCCTTACCCCACCATGGTCAATACAATGATTCAAGCCTGGCCATTTTGTCTGGAC   3700
3601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3700
3601                                                                                                         3700
3701  TACATATTACCTTCACTCCCATTGCTGCTCAAAGTTCTAAAGAATGTCCAAAATTGACATAGAGCATTCTGGCCTTGGTTTATTTTTATGGAAAGGCTTT   3800
3701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3800
3701                                                                                                         3800
3801  TGAAGCTGCTATTTGGAGAAGAAATGGAATAACCGAGGAGGTAATAACTATGTGAAAAGCCTGAAACCCGCTGCATATGAAGATGCCTACCATAGAGTTT   3900
3801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3900
3801                                                                                                         3900
3901  GGGGCTGGTGCCAGTAATACAATGTTTCTACAGCGAGGTTTAGAAGTCGGCTCTAGAAAGGTACAGGTGTTGCACTGTCAGTGCTTTTTCAATCTCCCAT   4000
3901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4000
3901                                                                                                         4000
4001  TACCTGGACTGATTGAACTCCTTTTAAATCAGACACCTGAGCCTATGATTAAGTGTTTGAAAGATACGAAACGTGTACCGAAATAAACTACTAATTATTT   4100
4001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4100
4001                                                                                                         4100
4101  TACAAGTCTGTCTGGAAGAATGGCCTTTGGAAGTGCCTGCTCTTTTTCAATAAAGTTATCTGCTCCATCAGCTGAAGGTTTAGGGCGGTGCACCTGGACC   4200
4101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4200
4101                                                                                                         4200
4201  TCTTCTTGCAACTAGTAAGTTTTTACATTTTGAGACCTGAGAACTTCAATATACCTGTTATCAGCTTGTGTTCATTATATATTTTGAGATCCTCTCCCCG   4300
4201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4300
4201                                                                                                         4300
4301  ATATGTCCACCTTCAGGTGGGCAATATCATGCTGATCCGATCGTGGGCCCTAGGGCACAAAGAGAAGAATACGGGGGGTCAGATCGATGACCGCATTAAA   4400
4301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4400
4301                                                                                                         4400
4401  AAAACCAATGCCATCCTCCATCCCATGGGATTTTTAAACCCGCCCGACCGACATCTGGCCAACTTTAGGCCAGAATTGATAGGGGAATCTGTCGGCAGCT   4500
4401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4500
4401                                                                                                         4500
4501  TATATCTGCCCCTGTATGGGGTCCTTTAAAAAACAGATCAGTACAATTGCACCCCTCTGGTTATTGGATCGTGTACACATGTAAGGGTTCTGTATGTATT   4600
4501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4600
4501                                                                                                         4600
4601  AAATACAATGTACCGTTACCCATTCTCTAGTGAAGACCGTGTCCTCACGTCAGAAGCTCTAATATTGTTTAGCCATGGGAGCTTCCACTGAGAGTTCCAG   4700
4601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4700
4601                                                                                                         4700
4701  CAAGGCCCAAATATTACTTTACCTTTAATATAGAATGAGCATATAAATCAGTTCTGAGCCCTGTTTGGCCACTCTACTATGGTCATTTCTTCAGCTGTAA   4800
4701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4800
4701                                                                                                         4800
4801  GCAAAAGAATTTCCAGACTCTATTATACAGTATTGTGGATAGAAAACTTCACTAGACATAATTTAGGATCCGACTTGGAGATGAGATTCTAAAAGGCACA   4900
4801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4900
4801                                                                                                         4900
4901  GGAAAGCAATTAAAGTGTATATAACGGCTGAGCTCAGGCCCGAACTGGCAATCTGTGGGTTCTGGCAAATGCCACATGGGCTGCTGTAAGTTGCCATAGA   5000
4901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5000
4901                                                                                                         5000
5001  CAGTCACTATTTATTGGGCTGGTGGAGGGCTGCTTGGGCCTCTGTGTACTTGGAATGCTATTGTCTATTGTAAATCCCAGTCCAGACCTGGCTAAGCTCA   5100
5001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5100
5001                                                                                                         5100
5101  TGCAAATGCTGATACACAGTGATGTCATATTTCCCTTAGTGAGTCTGTGACATCATCAGGTATCAAGGCAGCAATCATACATGGACTTAGTTATACCAGG   5200
5101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5200
5101                                                                                                         5200
5201  ACCCTATAAAGCTAAGTGGTCAGTCCAGGGGTCAAAGACATGGGACAGCCTCCAAGCTTGATATCCAAAAATATAAGCAATGTTGTTGAAACGCTAACAC   5300
5201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5300
5201                                                                                                         5300
5301  CCAGACTTTGCTGGACTACAACTCCCAGACTTCGTAACTGCTCAATTTCATGTTGCAGAATGCTGGGAGATTCCAGACAACCTCAAATTCCAGTCTCCTA   5400
5301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5400
5301                                                                                                         5400
5401  TGGTTTATCAGTACAACAGAA   5421
5401  ^---------^---------^   5421
5401                          5421

In this analysis, 6 items are not next to NAG/NGG. But 3 are probably leftover fragment end dupes, leaving 14/17 correct, or an 82% yield.

1 is in a primer (989347). 

6 (141439)

7 (not in primer...), 

8 (not in primer),

9 (not in primer)

14. (is in 4995367)

In [ ]: