In [5]:
import os
import Bio
import re
import sys
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import random
import itertools
import multiprocessing
In [1]:
from collections import Counter
import operator
def al_plot_complexity(list):
    '''
    Plot a library-complexity curve for a list of sequences.

    For each cumulative fraction p of unique sequences (taking the most
    abundant ones first), computes the percent of total reads (molecules)
    contributed by that top fraction.

    Arguments:
        list: iterable of hashable sequences, one entry per read.
    Returns:
        (rangelist, y): x values (fractions 0.0-0.98 in steps of 0.02) and
        y values (percent of total reads).  Also draws the curve if
        matplotlib is importable; otherwise just returns the data.
    '''
    from collections import Counter
    reads_per_unique = Counter(list)
    # Per-unique-sequence read counts, ascending, so the tail of the list
    # holds the most abundant sequences.
    counts = sorted(reads_per_unique.values())
    total = sum(counts)
    global rangelist  # kept global for interactive inspection, as before
    rangelist = [a / 100.0 for a in range(0, 100, 2)]
    y = []
    for p in rangelist:
        top_n = int(len(counts) * p)
        if total:
            # Percent of all reads contributed by the top_n most-abundant uniques.
            y.append(float(sum(counts[len(counts) - top_n:])) / total * 100)
        else:
            # Empty input library: no reads, report 0 everywhere instead of
            # raising ZeroDivisionError.
            y.append(0.0)
    try:
        # Plot via an explicit pyplot import rather than relying on the
        # %pylab magic having populated the namespace.
        import matplotlib.pyplot as plt
        plt.figure()
        plt.plot(rangelist, y, "o")
        plt.xlabel("Cumulative fraction of unique spacers")
        plt.ylabel("Percent of total reads (molecules) in library")
    except ImportError:
        pass  # headless environment without matplotlib: still return the data
    return rangelist, y
In [ ]:
def al_string2feat(queryseq, ampsdict):
    '''
    Annotate a subject amplicon with the location of a BLAST hit.

    queryseq is expected to be a pair: queryseq[0] is a SeqRecord holding the
    hit sequence, and queryseq[1] is (amp_key, hit_start, hit_end), where
    amp_key selects the subject sequence out of ampsdict.  A forward hit
    (start <= end) is marked with a "cutsite_fwd" feature on the plus strand;
    a reverse hit (start > end) with a "cutsite_rev" feature on the minus
    strand.  Returns the annotated SeqRecord.
    '''
    amp_key = queryseq[1][0]
    hit_start = queryseq[1][1]
    hit_end = queryseq[1][2]
    annotated = SeqRecord(ampsdict[amp_key])
    hit_len = len(queryseq[0].seq)
    # BLAST reports reverse-strand hits with start > end, so the coordinate
    # ordering tells us which strand the hit is on.
    if hit_start <= hit_end:
        # Forward-strand hit: feature spans hit_len + 1 bases from the start.
        left = ExactPosition(int(hit_start))
        right = ExactPosition(int(hit_start + hit_len + 1))
        annotated.features.append(
            SeqFeature(FeatureLocation(left, right), type="cutsite_fwd", strand=+1))
    else:
        # Reverse-strand hit: anchor on the smaller (end) coordinate.
        left = ExactPosition(int(hit_end))
        right = ExactPosition(left + hit_len)
        annotated.features.append(
            SeqFeature(FeatureLocation(left, right), type="cutsite_rev", strand=-1))
    return annotated
In [ ]:
def al_print_features(inputseq, addpamcutters, directlabel):
    '''
    Render an ASCII feature map of a SeqRecord, 100 bases per row.

    Takes 3 arguments:
    inputseq: SeqRecord to draw
    addpamcutters: int - 0 or 1. If 1, also draw HpaII/BfaI/ScrFI sites on map.
    directlabel: int, 0 or 1. Recommend 1. Changes position of feature labels (1 = on markings, 0 = below them)

    Returns a printable multi-line string.
    '''
    # Optional "cutline" strand marking each HpaII/BfaI/ScrFI recognition
    # site (Restriction .search returns 1-based positions) with e.g. "<HpaII".
    if addpamcutters == 1:
        cutline = list(" " * len(inputseq))
        HpaIIsites = HpaII.search(inputseq.seq)
        BfaIsites = BfaI.search(inputseq.seq)
        ScrFIsites = ScrFI.search(inputseq.seq)
        for cut in HpaIIsites:
            cutline[cut-1:cut + len("HpaII")] = "<HpaII"
        for cut in BfaIsites:
            cutline[cut-1:cut + len("BfaI")] = "<BfaI"
        for cut in ScrFIsites:
            cutline[cut-1:cut + len("ScrFI")] = "<ScrFI"
        cutline = "".join(cutline)
    # Marker ("mask") strands: a "---------^" ruler every 10 bases, with
    # features drawn over it as runs of ">" or "<".  mask holds one or more
    # such strands; a new strand is started when features would overlap.
    mask = [list((("-" * 9) + "^" )* int(round(len(inputseq.seq)/10.0)))]
    newmaskline = list((("-" * 9) + "^" )* int(round(len(inputseq.seq)/10.0)))
    for feature in inputseq.features:
        # Make a new marker strand if any features overlap. All marker strands can be elements of a list.
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        if featstart > featend:
            print("Error! Feature end must be after feature start. Use strand to specify direction! Feature " + feature.type + \
            " will not be displayed!")
        #if "<" in mask[-1][featstart:featend] or ">" in mask[-1][featstart:featend]:
        #mask.append(newmaskline)
        # Occupancy scan: any character under this feature that is neither
        # "-" nor "^" means the current strand is occupied -> open a new one.
        # NOTE(review): mask.append(newmaskline) appends the SAME list object
        # each time; this appears safe only because mask[-1] is later
        # reassigned by slice-concatenation rather than mutated in place --
        # confirm if more than two overlapping strands are ever needed.
        clean = 0
        for item in mask[-1][featstart:featend]:
            if item == "-":
                clean = 1
            elif item == "^":
                clean = 1
            else:
                clean = 0
                mask.append(newmaskline)
                break
        #print mask[-1][0:50]
        if feature.strand == 1:
            # Plus-strand feature: run of ">" arrows; optionally overwrite the
            # start of the run with the feature type as an inline label.
            mask[-1] = mask[-1][:featstart-1] + [">"] * int(featend - featstart + 1) + mask[-1][featend:]
            if directlabel == 1:
                mask[-1] = mask[-1][:featstart] + list(str(feature.type)) + mask[-1][featstart + len(str(feature.type)):]
        if feature.strand == -1:
            # Minus-strand feature: run of "<" arrows, label offset by one.
            mask[-1] = mask[-1][:featstart-1] + ["<"] * int(featend+1 - featstart) + mask[-1][featend:]
            if directlabel == 1:
                mask[-1] = mask[-1][:featstart+1] + list(str(feature.type)) + mask[-1][featstart+2 + len(str(feature.type)):]
    #if addpamcutters = 1:
    #cutline = list(" " * len(inputseq)
    #HpaIIsites = HpaII.search(inputseq.seq)
    # Collapse each marker strand from a char list into a string.
    for index, maskline in enumerate(mask):
        maskline = "".join(maskline)
        mask[index] = maskline
    # add labels on a separate "masklab" strand when not labelling directly
    if directlabel == 0:
        masklab = list(" " * (len(inputseq.seq)))
        for feature in inputseq.features:
            featstart = int(feature.location.start)
            featend = int(feature.location.end)
            featname = str(feature.type)
            masklab = masklab[:featstart] + list(str(feature.type)) + list(" " * (featend-1 - featstart - len(feature.type))) + masklab[featend-1:]
        masklab = "".join(masklab)
    # Emit the map 100 bases per row: optional cutline, then the sequence,
    # then every marker strand, then the optional label strand.
    lines = int(round(len(inputseq.seq) / 100)) + 1
    i = 0
    outstring = list(inputseq.name + "\n")
    #print inputseq.name
    outstring = []  # NOTE(review): this discards the name header built on the line above
    while i < lines:
        indexstart = i*100
        indexend = (i+1) * 100
        if indexend > len(inputseq.seq):
            indexend = len(inputseq.seq)
        if addpamcutters ==1:
            outstring.extend((str(indexstart + 1)) + " " + cutline[indexstart:indexend] + " " + str(indexend)+ "\n")
            #print (str(indexstart + 1)) + " " + cutline[indexstart:indexend] + " " + str(indexend)
        outstring.extend(str(indexstart+1) + " " + inputseq.seq[indexstart:indexend] + " " + str(indexend)+ "\n")
        #print str(indexstart+1) + " " + inputseq.seq[indexstart:indexend] + " " + str(indexend)
        for maskline in mask:
            outstring.extend((str(indexstart + 1)) + " " + maskline[indexstart:indexend] + " " + str(indexend)+ "\n")
            #print (str(indexstart + 1)) + " " + maskline[indexstart:indexend] + " " + str(indexend)
        if directlabel == 0:
            outstring.extend(str(indexstart +1) + " " + masklab[indexstart:indexend] + " " + str(indexend)+ "\n")
            #print str(indexstart +1) + " " + masklab[indexstart:indexend] + " " + str(indexend)
        outstring.extend("\n")
        #print "\n"
        i = i + 1
    outstring = "".join(outstring)
    return outstring
In [4]:
def al_digesttarget(list_of_targets, process_to_file=0):
    '''
    Substrate should be a list of SeqRecords (i.e. scaffolds, or whatever).
    Process_to_file [optional] is an integer; if 1 will avoid building list in memory and will write to file instead.
    Defaults to 0, building list in memory.
    Output list has information in it:
    ID is a count of the potential spacer along the scaffold, starting from zero.
    Name is the base position on the scaffold.
    Description is the enzyme that generated the spacer-end (ScrFI/HpaII/BfaI)
    dbxrefs is the name the scaffold was given in the input SeqRecord
    '''
    # Truncates any previous run's output file; per-scaffold writes below
    # re-open the same path in append mode.  NOTE(review): this initial
    # handle is shadowed later and never explicitly closed.
    f = open("genome_allspacers.fa","w")
    global substrate  # left global for interactive inspection in the notebook
    scaffoldcuts = []
    for substrate in list_of_targets:
        cutslist = []
        pos = HpaII.search(substrate.seq)
        # Positions in this list correspond to the right boundaries of fragments;
        # last one is thus the sequence end
        pos.append(len(substrate.seq))
        pos = iter(pos)
        cuts = HpaII.catalyze(substrate.seq)
        # NOTE: pos.next() is Python 2 iterator syntax (next(pos) in Python 3).
        for item in cuts:
            cutslist.append([item, "HpaII", int(pos.next())])
        cuts = BfaI.catalyze(substrate.seq)
        pos = BfaI.search(substrate.seq)
        pos.append(len(substrate.seq))
        pos = iter(pos)
        for item in cuts:
            cutslist.append([item, "BfaI", int(pos.next())])
        cuts = ScrFI.catalyze(substrate.seq)
        pos = ScrFI.search(substrate.seq)
        pos.append(len(substrate.seq))
        pos = iter(pos)
        for item in cuts:
            cutslist.append([item, "ScrFI", int(pos.next())])
        #The above is all to get the results of a catalyze operation (i.e. tuples) into
        # a list format. Next part makes them into SeqRecords.
        cutslistrecords = []
        for i, item in enumerate(cutslist):
            cutslistrecords.append(SeqRecord(item[0], id = str(i), description = str(item[1]), name=str(item[2]), \
                dbxrefs=[str(substrate.id)]))
        cutslist = cutslistrecords[:]
        # This part takes the 3' 20nt of each fragment and makes a new sequence with it.
        # For the 5' end, the Mung-Bean treatment is simulated by removing two more nt (for HpaII and BfaI), or one nt for ScrFI;
        # these would be the 5' overhang. Then we take the reverse-complement of the sequence.
        # The Restriction module just returns sequences as if the top strand only was being cut. In other words,
        # no bases are deleted from consecutive fragments.
        from Bio.Seq import MutableSeq
        twentymers = []
        #record2 = []
        for record2 in cutslist:
            try: # This is because a second run of this code on already mutable seqs seems to fail. Not sure how to flush out and revert back to non-mutables...
                record2.seq = record2.seq.tomutable()
            except:
                # NOTE(review): bare except silently swallows everything, not
                # just the "already mutable" case -- deliberate best-effort.
                pass
            if record2.description == "ScrFI":
                #offset here (e.g. 1:21 is for simulating MBN digeston)
                # because the entry.names are rooted on the right of each fragment, the length
                # of the entry.name has to be subtracted to get the desired left position for the "reverse"
                # tgts
                entry = record2[1:21].reverse_complement\
                    (description=True, id=True, name=True)
                entry.name = str(int(record2.name)+1 - len(record2.seq))
                entry.id = str("%s_R" % record2.id) ##
                twentymers.append(entry)
            else: # Should work for HpaII/BfaI
                entry = record2[2:22].reverse_complement\
                    (description=True, id=True, name=True)
                entry.name = str(int(record2.name)+2 - len(record2.seq))
                entry.id = str("%s_R" % record2.id) ##
                twentymers.append(entry)
            # Forward 20mer: the 3'-most 20 nt of the fragment's top strand.
            record2.seq = record2.seq.toseq()
            record2.id = str("%s_F" % record2.id)
            entry = record2[-20:]
            entry.name = str(int(record2.name)-20)
            twentymers.append(entry)
        for item in twentymers:
            item.dbxrefs = [substrate.id]
        # The ends of the fragments aren't bonafide CRISPR targets; these can be removed:
        noends = []
        twentymerstr = [item for item in twentymers if item.description == "HpaII"]
        trimmed = twentymerstr[1:-1] # removes first and last 20mer
        noends.append(trimmed)
        twentymerstr = [item for item in twentymers if item.description == "BfaI"]
        trimmed = twentymerstr[1:-1]
        noends.append(trimmed)
        twentymerstr = [item for item in twentymers if item.description == "ScrFI"]
        trimmed = twentymerstr[1:-1]
        noends.append(trimmed)
        index = str(substrate.id + str(random.random()))  # NOTE(review): unused
        if process_to_file != 1:
            # Appends a lazy generator per scaffold; flattened on return below.
            scaffoldcuts.append((item for sublist in noends for item in sublist))
        if process_to_file == 1:
            f = open("genome_allspacers.fa","a") #opens file with name of "test.txt"
            for item in [item for sublist in noends for item in sublist]:
                f.write(">lcl|" + str(substrate.id) + "|" + str(item.description) + "|" + str(item.name) + "|" + str(item.id) + "\n")
                f.write(str(item.seq) + "\n")
            f.close()
    if process_to_file != 1:
        return itertools.chain.from_iterable(scaffoldcuts) #dammit, the from_iterable part is important here. that took a while.
In [1]:
def al_scoreguide(guide, genome, genomedict, hits=50):
    '''
    To produce a score for a given guide in a given BLAST database.
    Only evaluates a max of 500 hit sequences (i.e. really bad guides will not necessarily have a super accurate score.)
    This version only searches the plus strand of the BLAST db, meant to be used with PAM BLAST DBs generated such that
    all potential guide hits are on the plus strand.
    Arguments: guide = a SeqRecord object containing a proposed guide sequence
    genome = a BLAST database set up on this machine.
    genomedict = a dict made from a FASTA file identical to the one used to make the BLAST DB. Dict keys should be
    BLAST db hit_def.
    hits = unused here; kept for interface compatibility.
    Returns: the guide SeqRecord with annotations["score"] (int; higher means more specific) and
    annotations["blastdata"] (per-hit score dicts with match_score > 0) filled in.
    '''
    name = multiprocessing.current_process().name
    # Position-dependent mismatch penalty weights (Zhang lab / Hsu et al.),
    # indexed 5'->3' over the 20 nt guide.
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0 ,0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828, 0.615, 0.804, 0.685, 0.583]
    #import random
    # NOTE: random.jumpahead exists only in Python 2 (removed in Python 3).
    random.jumpahead(1)
    #filesuffix = str(int(random.random()*100000000))
    # Temp files are named after the guide id so parallel workers don't collide.
    filesuffix = str(guide.id)
    filename = str("currseq" + filesuffix + ".tmp")
    Bio.SeqIO.write(guide, filename, "fasta")
    # Added dust="no" to stop inflating scores from polyN repeats...
    blastn_cline = NcbiblastnCommandline(query=filename, db=genome, \
        task = "blastn-short",outfmt=5, out=filename + ".blast", max_target_seqs=100, num_threads = 7, evalue = 10, dust="no")
    #timeit.timeit(blastn_cline, number =1)
    blastn_cline()
    result_handle = open(filename + ".blast")
    hits = NCBIXML.parse(result_handle)
    firstmatch = 0
    notfound = 1
    # One BLAST record per query; a single guide was written, so this loop
    # effectively runs once.
    for item in hits:
        scorelist = []
        pamlist = []
        for scaffold in item.alignments:
            for pair in scaffold.hsps:
                # Distance from the aligned region's end to the guide's 3' end.
                hit_threeprime_offset = len(guide) - pair.query_end
                start = pair.sbjct_start
                end = pair.sbjct_end
                if end > start:
                    # Plus-strand hit: PAM lies immediately 3' of the hit.
                    pamstart = end + hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart:pamstart+3]
                elif start > end:
                    # Minus-strand hit: PAM lies upstream; reverse-complement it.
                    pamstart = end - hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart-4:pamstart-1].reverse_complement()
                # Construct a bar-based match string, padded to query length,
                # where bar = match position and space is non-match
                mmloc = []
                mmstr = list(pair.match)
                if pair.query_start > 1:
                    mmstr = list(" " * (pair.query_start - 1)) + mmstr
                if pair.query_end < 20:
                    mmstr = mmstr + list(" " * (20 - pair.query_end))
                mmstr = "".join(mmstr)
                # Test for PAM adjacency (N[AG]G, case-insensitive)
                if len(pam) == 3:
                    if (pam[1] == "G" or pam[1] == "A" or pam[1] == "g" or pam[1] == "a") \
                    and (pam[2] == "G" or pam[2] == "g"):
                        # Partial match (17-19 identities): apply Zhang lab scoring.
                        if (pair.positives >16 and pair.positives < 20):
                            # Record mismatch positions, numbered 20 (5') down to 1 (3').
                            pos = 20
                            for linestring in mmstr:
                                if linestring != "|":
                                    mmloc.append(pos)
                                pos = pos - 1
                            # Actually implement Zhang lab algorithm
                            mmscore = [21 -x for x in mmloc]
                            t1 = 1
                            for mismatchlocation in mmscore:
                                t1 = t1 * (1.0 - float(M[mismatchlocation - 1]))
                            if len(mmscore) > 1:
                                # Mean pairwise distance between mismatch positions.
                                d = (float(max(mmscore)) - float(min(mmscore))) / float((len(mmscore) - 1))
                            else:
                                d = 19
                            t2 = 1 / ( (((19.0 - d)/19.0) * 4) + 1)
                            t3 = float(1)/ float(pow(len(mmloc), 2))
                            scorelist.append({"match_score": float(t1 * t2 * t3 * 100), "scaffold": scaffold.hit_def, \
                                "hit_location":pair.sbjct_start, "hit_sequence": pair.sbjct, "pam":str(pam.seq),\
                                "match_bars":mmstr})
                            #pamlist.append(pam)
                        # Zhang lab algorithm doesn't handle perfect matches (I think?): give it a 50 if it's perfect.
                        # The FIRST perfect match is assumed to be the intended target site and is not penalized.
                        if pair.positives >= 20: #changed from == 20; miight be worth keeping in mind for bugs
                            if firstmatch != 0:
                                scorelist.append({"match_score": float(50), "scaffold": scaffold.hit_def, \
                                    "hit_location":pair.sbjct_start, "hit_sequence": pair.sbjct, "pam":str(pam.seq),\
                                    "match_bars":mmstr})
                                #pamlist.append(pam)
                            firstmatch = 1
                            notfound = 0
                    else:
                        # PAM present but not N[AG]G: hit cannot be cut, contributes 0.
                        scorelist.append({"match_score": float(0), "scaffold": scaffold.hit_def, \
                            "hit_location":pair.sbjct_start, "hit_sequence": pair.sbjct, "pam":str(pam.seq),\
                            "match_bars":mmstr})
                        #pamlist.append(pam)
    os.remove(filename) # want to keep BLAST results? comment out these two lines.
    os.remove(filename + ".blast")
    # Only include the scorelist items that have a score greater than zero
    guide.annotations["blastdata"] = [s for s in scorelist if s["match_score"] > 0]
    finalscore = int(10000.000 / (100.000 + float(sum(item["match_score"] for item in scorelist))))
    guide.annotations["score"] = finalscore
    return guide
In [ ]:
def al_scoremanyguides(guides, genome, genomedict, hits=50, return_blast=1):
    '''
    Score a batch of candidate guides against a BLAST database with a single
    blastn invocation (batched counterpart of al_scoreguide).
    This version only searches the plus strand of the BLAST db, meant to be
    used with PAM BLAST DBs generated such that all potential guide hits are
    on the plus strand.

    Arguments:
        guides: list of SeqRecord objects, each a proposed guide sequence.
        genome: a BLAST database set up on this machine.
        genomedict: dict made from a FASTA file identical to the one used to
            make the BLAST DB; keys should be BLAST db hit_def.  Used to look
            up the PAM sequence adjacent to each hit.
        hits: unused here; kept for interface compatibility.
        return_blast: if 1, attach the per-hit score breakdown to each
            guide's annotations["blastdata"].
    Returns:
        The same guides list, with annotations["score"] (int; higher means
        more specific) and optionally annotations["blastdata"] set on each.
    '''
    name = multiprocessing.current_process().name
    # Position-dependent mismatch penalty weights (Zhang lab / Hsu et al.),
    # indexed 5'->3' over the 20 nt guide.
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0, 0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828, 0.615, 0.804, 0.685, 0.583]
    # NOTE: random.jumpahead exists only in Python 2; decorrelates RNG across workers.
    random.jumpahead(1)
    # Temp files are named after the first guide's id so parallel workers don't collide.
    filesuffix = str(guides[0].id)
    filename = str("currseq" + filesuffix + ".tmp")
    Bio.SeqIO.write(guides, filename, "fasta")
    # dust="no" stops BLAST's low-complexity filter from inflating scores on polyN repeats.
    blastn_cline = NcbiblastnCommandline(query=filename, db=genome,
        task="blastn-short", outfmt=5, out=filename + ".blast", max_target_seqs=100,
        num_threads=7, evalue=10, dust="no")
    blastn_cline()
    result_handle = open(filename + ".blast")
    blasts = NCBIXML.parse(result_handle)
    # NCBIXML.parse yields one BLAST record per query, in the same order the
    # guides were written, so guideindex maps records back onto guides.
    for guideindex, item in enumerate(blasts):
        firstmatch = 0
        notfound = 1
        scorelist = []
        # Within each queried guide, loop through the aligned scaffolds,
        # and within each scaffold through its individual HSPs.
        for scaffold in item.alignments:
            for pair in scaffold.hsps:
                # BUGFIX: use THIS guide's length (was guides[0]) when locating
                # the PAM 3' of the hit; matters if guides differ in length.
                hit_threeprime_offset = len(guides[guideindex]) - pair.query_end
                start = pair.sbjct_start
                end = pair.sbjct_end
                if end > start:
                    # Plus-strand hit: PAM lies immediately 3' of the hit.
                    pamstart = end + hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart:pamstart+3]
                elif start > end:
                    # Minus-strand hit: PAM lies upstream; reverse-complement it.
                    pamstart = end - hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart-4:pamstart-1].reverse_complement()
                # Construct a bar-based match string, padded to the 20 nt query
                # length, where "|" = match and " " = mismatch/unaligned.
                mmloc = []
                mmstr = list(pair.match)
                if pair.query_start > 1:
                    mmstr = list(" " * (pair.query_start - 1)) + mmstr
                if pair.query_end < 20:
                    mmstr = mmstr + list(" " * (20 - pair.query_end))
                mmstr = "".join(mmstr)
                # Test for PAM adjacency (N[AG]G, case-insensitive): only
                # PAM-adjacent hits can be cut and contribute to the penalty.
                if len(pam) == 3:
                    if (pam[1] == "G" or pam[1] == "A" or pam[1] == "g" or pam[1] == "a") \
                            and (pam[2] == "G" or pam[2] == "g"):
                        # Partial match (17-19 identities): apply Zhang lab scoring.
                        if (pair.positives > 16 and pair.positives < 20):
                            # Record mismatch positions, numbered 20 (5') down to 1 (3').
                            pos = 20
                            for linestring in mmstr:
                                if linestring != "|":
                                    mmloc.append(pos)
                                pos = pos - 1
                            # Zhang lab off-target score: positional weight
                            # product (t1) * mismatch-spacing term (t2) *
                            # mismatch-count term (t3).
                            mmscore = [21 - x for x in mmloc]
                            t1 = 1
                            for mismatchlocation in mmscore:
                                t1 = t1 * (1.0 - float(M[mismatchlocation - 1]))
                            if len(mmscore) > 1:
                                d = (float(max(mmscore)) - float(min(mmscore))) / float((len(mmscore) - 1))
                            else:
                                d = 19
                            t2 = 1 / ((((19.0 - d) / 19.0) * 4) + 1)
                            t3 = float(1) / float(pow(len(mmloc), 2))
                            scorelist.append({"match_score": float(t1 * t2 * t3 * 100), "scaffold": scaffold.hit_def,
                                              "hit_location": pair.sbjct_start, "hit_sequence": pair.sbjct,
                                              "pam": str(pam.seq), "match_bars": mmstr})
                        # Perfect matches fall outside the Zhang formula; the FIRST
                        # one is assumed to be the intended site (not penalized),
                        # subsequent ones get a flat 50.
                        if pair.positives >= 20:  # changed from == 20; might be worth keeping in mind for bugs
                            if firstmatch != 0:
                                scorelist.append({"match_score": float(50), "scaffold": scaffold.hit_def,
                                                  "hit_location": pair.sbjct_start, "hit_sequence": pair.sbjct,
                                                  "pam": str(pam.seq), "match_bars": mmstr})
                            firstmatch = 1
                            notfound = 0
                    else:
                        # PAM present but not N[AG]G: hit cannot be cut, contributes 0.
                        scorelist.append({"match_score": float(0), "scaffold": scaffold.hit_def,
                                          "hit_location": pair.sbjct_start, "hit_sequence": pair.sbjct,
                                          "pam": str(pam.seq), "match_bars": mmstr})
        if return_blast == 1:
            # Only keep hits that actually contribute to the penalty.
            guides[guideindex].annotations["blastdata"] = [s for s in scorelist if s["match_score"] > 0]
        finalscore = int(10000.000 / (100.000 + float(sum(hit["match_score"] for hit in scorelist))))
        guides[guideindex].annotations["score"] = finalscore
    result_handle.close()  # fix: close the XML handle before deleting it
    os.remove(filename)  # want to keep BLAST results? comment out these two lines.
    os.remove(filename + ".blast")
    return guides
In [1]:
def al_scoreguide_density(guide, genome, genomedict, hits=50):
    '''
    To produce a score for a given guide in a given BLAST database.
    Variant of al_scoreguide with a looser identity threshold (>12 positives)
    and a more permissive evalue, intended for hit-density estimation.
    Only evaluates a max of 500 hit sequences (i.e. really bad guides will not necessarily have a super accurate score.)
    This version only searches the plus strand of the BLAST db, meant to be used with PAM BLAST DBs generated such that
    all potential guide hits are on the plus strand.
    Arguments: guide = a SeqRecord object containing a proposed guide sequence
    genome = a BLAST database set up on this machine.
    genomedict = a dict made from a FASTA file identical to the one used to make the BLAST DB. Dict keys should be
    BLAST db hit_def.
    hits = unused here; kept for interface compatibility.
    Returns: Tuple containing (finalscore, the guide, a list of the individual hits, their locations and their subscores)
    '''
    name = multiprocessing.current_process().name
    # Position-dependent mismatch penalty weights (Zhang lab / Hsu et al.), 5'->3'.
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0 ,0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828, 0.615, 0.804, 0.685, 0.583]
    import random
    # NOTE: random.jumpahead exists only in Python 2 (removed in Python 3).
    random.jumpahead(1)
    #filesuffix = str(int(random.random()*100000000))
    # Temp files are named after the guide id so parallel workers don't collide.
    filesuffix = str(guide.id)
    filename = str("currseq" + filesuffix + ".tmp")
    Bio.SeqIO.write(guide, filename, "fasta")
    blastn_cline = NcbiblastnCommandline(query=filename, db=genome, \
        task = "blastn-short",outfmt=5, out=filename + ".blast", max_target_seqs=100, num_threads = 7, evalue = 100)
    #timeit.timeit(blastn_cline, number =1)
    blastn_cline()
    result_handle = open(filename + ".blast")
    hits = NCBIXML.parse(result_handle)
    firstmatch = 0
    notfound = 1
    # One BLAST record per query; a single guide was written, so this loop
    # effectively runs once.
    for item in hits:
        scorelist = []
        pamlist = []
        for scaffold in item.alignments:
            for pair in scaffold.hsps:
                # Distance from the aligned region's end to the guide's 3' end.
                hit_threeprime_offset = len(guide) - pair.query_end
                start = pair.sbjct_start
                end = pair.sbjct_end
                if end > start:
                    # Plus-strand hit: PAM lies immediately 3' of the hit.
                    pamstart = end + hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart:pamstart+3]
                elif start > end:
                    # Minus-strand hit: PAM lies upstream; reverse-complement it.
                    pamstart = end - hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart-4:pamstart-1].reverse_complement()
                if len(pam) == 3:
                    t1 = 0
                    t2 = 0
                    t3=0
                    # PAM adjacency test (N[AG]G, case-insensitive).
                    if (pam[1] == "G" or pam[1] == "A" or pam[1] == "g" or pam[1] == "a") \
                    and (pam[2] == "G" or pam[2] == "g"):
                        # Partial match (13-19 identities): apply Zhang lab scoring.
                        if (pair.positives >12 and pair.positives < 20):
                            mmloc = []
                            mmstr = list(pair.match)
                            if pair.query_start > 1:
                                mmstr = list(" " * (pair.query_start - 1)) + mmstr
                            if pair.query_end < 20:
                                mmstr = mmstr + list(" " * (20 - pair.query_end))
                            # NOTE(review): result of join is discarded, so mmstr
                            # stays a list; iteration below still sees each char.
                            "".join(mmstr)
                            # Record mismatch positions, numbered 20 (5') down to 1 (3').
                            pos = 20
                            for linestring in mmstr:
                                if linestring != "|":
                                    mmloc.append(pos)
                                pos = pos - 1
                            # Zhang lab off-target score terms t1/t2/t3.
                            mmscore = [21 -x for x in mmloc]
                            t1 = 1
                            for mismatchlocation in mmscore:
                                t1 = t1 * (1.0 - float(M[mismatchlocation - 1]))
                            if len(mmscore) > 1:
                                d = (float(max(mmscore)) - float(min(mmscore))) / float((len(mmscore) - 1))
                            else:
                                d = 19
                            t2 = 1 / ( (((19.0 - d)/19.0) * 4) + 1)
                            t3 = float(1)/ float(pow(len(mmloc), 2))
                            scorelist.append([float(t1 * t2 * t3 * 100), ((t1, t2, t3), scaffold.hit_def, pair.sbjct_start, pair.sbjct)])
                            #pamlist.append(pam)
                        # Perfect matches get a flat 100 in this density variant.
                        if pair.positives >= 20: #changed from == 20; miight be worth keeping in mind for bugs
                            scorelist.append([float(100), ((t1, t2, t3), scaffold.hit_def, pair.sbjct_start, pair.sbjct)])
                            notfound = 0
                    else:
                        # PAM present but not N[AG]G: hit cannot be cut, contributes 0.
                        scorelist.append([float(0), ((t1, t2, t3), scaffold.hit_def, pair.sbjct_start, pair.sbjct)])
                        #pamlist.append(pam)
    os.remove(filename) # want to keep BLAST results? comment out these two lines.
    os.remove(filename + ".blast")
    finalscore = int(10000.000 / (100.000 + float(sum(score for score, details in scorelist))))
    return (finalscore, guide, scorelist)
In [ ]: