In [5]:
import os
import Bio
import re
import sys
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import random
import itertools
import multiprocessing
In [1]:
from collections import Counter
import operator
def al_plot_complexity(list):
    '''
    Plot a library-complexity curve for a list of sequences.

    For each cumulative fraction p of unique sequences (taking the most
    abundant ones first), computes the percent of total reads (molecules)
    contributed by that top fraction.

    Arguments:
        list: iterable of hashable sequences, one entry per read.
    Returns:
        (rangelist, y): x values (fractions 0.0-0.98 in steps of 0.02) and
        y values (percent of total reads).  Also draws the curve if
        matplotlib is importable; otherwise just returns the data.
    '''
    from collections import Counter
    reads_per_unique = Counter(list)
    # Per-unique-sequence read counts, ascending, so the tail of the list
    # holds the most abundant sequences.
    counts = sorted(reads_per_unique.values())
    total = sum(counts)
    global rangelist  # kept global for interactive inspection, as before
    rangelist = [a / 100.0 for a in range(0, 100, 2)]
    y = []
    for p in rangelist:
        top_n = int(len(counts) * p)
        if total:
            # Percent of all reads contributed by the top_n most-abundant uniques.
            y.append(float(sum(counts[len(counts) - top_n:])) / total * 100)
        else:
            # Empty input library: no reads, report 0 everywhere instead of
            # raising ZeroDivisionError.
            y.append(0.0)
    try:
        # Plot via an explicit pyplot import rather than relying on the
        # %pylab magic having populated the namespace.
        import matplotlib.pyplot as plt
        plt.figure()
        plt.plot(rangelist, y, "o")
        plt.xlabel("Cumulative fraction of unique spacers")
        plt.ylabel("Percent of total reads (molecules) in library")
    except ImportError:
        pass  # headless environment without matplotlib: still return the data
    return rangelist, y
In [ ]:
def al_string2feat(queryseq, ampsdict):
    '''
    Annotate a subject amplicon with the location of a BLAST hit.

    queryseq is expected to be a pair: queryseq[0] is a SeqRecord holding the
    hit sequence, and queryseq[1] is (amp_key, hit_start, hit_end), where
    amp_key selects the subject sequence out of ampsdict.  A forward hit
    (start <= end) is marked with a "cutsite_fwd" feature on the plus strand;
    a reverse hit (start > end) with a "cutsite_rev" feature on the minus
    strand.  Returns the annotated SeqRecord.
    '''
    amp_key = queryseq[1][0]
    hit_start = queryseq[1][1]
    hit_end = queryseq[1][2]
    annotated = SeqRecord(ampsdict[amp_key])
    hit_len = len(queryseq[0].seq)
    # BLAST reports reverse-strand hits with start > end, so the coordinate
    # ordering tells us which strand the hit is on.
    if hit_start <= hit_end:
        # Forward-strand hit: feature spans hit_len + 1 bases from the start.
        left = ExactPosition(int(hit_start))
        right = ExactPosition(int(hit_start + hit_len + 1))
        annotated.features.append(
            SeqFeature(FeatureLocation(left, right), type="cutsite_fwd", strand=+1))
    else:
        # Reverse-strand hit: anchor on the smaller (end) coordinate.
        left = ExactPosition(int(hit_end))
        right = ExactPosition(left + hit_len)
        annotated.features.append(
            SeqFeature(FeatureLocation(left, right), type="cutsite_rev", strand=-1))
    return annotated
In [ ]:
def al_print_features(inputseq, addpamcutters, directlabel):
    '''
    Render an ASCII feature map of a SeqRecord, 100 bases per row.

    Takes 3 arguments:
    inputseq: SeqRecord to draw
    addpamcutters: int - 0 or 1. If 1, also draw HpaII/BfaI/ScrFI sites on map.
    directlabel: int, 0 or 1. Recommend 1. Changes position of feature labels (1 = on markings, 0 = below them)

    Returns a printable multi-line string.
    '''
    # Optional "cutline" strand marking each HpaII/BfaI/ScrFI recognition
    # site (Restriction .search returns 1-based positions) with e.g. "<HpaII".
    if addpamcutters == 1:
        cutline = list(" " * len(inputseq))
        HpaIIsites = HpaII.search(inputseq.seq)
        BfaIsites = BfaI.search(inputseq.seq)
        ScrFIsites = ScrFI.search(inputseq.seq)
        for cut in HpaIIsites:
            cutline[cut-1:cut + len("HpaII")] = "<HpaII"
        for cut in BfaIsites:
            cutline[cut-1:cut + len("BfaI")] = "<BfaI"
        for cut in ScrFIsites:
            cutline[cut-1:cut + len("ScrFI")] = "<ScrFI"
        cutline = "".join(cutline)
    # Marker ("mask") strands: a "---------^" ruler every 10 bases, with
    # features drawn over it as runs of ">" or "<".  mask holds one or more
    # such strands; a new strand is started when features would overlap.
    mask = [list((("-" * 9) + "^" )* int(round(len(inputseq.seq)/10.0)))]
    newmaskline = list((("-" * 9) + "^" )* int(round(len(inputseq.seq)/10.0)))
    for feature in inputseq.features:
        # Make a new marker strand if any features overlap. All marker strands can be elements of a list.
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        if featstart > featend:
            print("Error! Feature end must be after feature start. Use strand to specify direction! Feature " + feature.type + \
            " will not be displayed!")
        #if "<" in mask[-1][featstart:featend] or ">" in mask[-1][featstart:featend]:
        #mask.append(newmaskline)
        # Occupancy scan: any character under this feature that is neither
        # "-" nor "^" means the current strand is occupied -> open a new one.
        # NOTE(review): mask.append(newmaskline) appends the SAME list object
        # each time; this appears safe only because mask[-1] is later
        # reassigned by slice-concatenation rather than mutated in place --
        # confirm if more than two overlapping strands are ever needed.
        clean = 0
        for item in mask[-1][featstart:featend]:
            if item == "-":
                clean = 1
            elif item == "^":
                clean = 1
            else:
                clean = 0
                mask.append(newmaskline)
                break
        #print mask[-1][0:50]
        if feature.strand == 1:
            # Plus-strand feature: run of ">" arrows; optionally overwrite the
            # start of the run with the feature type as an inline label.
            mask[-1] = mask[-1][:featstart-1] + [">"] * int(featend - featstart + 1) + mask[-1][featend:]
            if directlabel == 1:
                mask[-1] = mask[-1][:featstart] + list(str(feature.type)) + mask[-1][featstart + len(str(feature.type)):]
        if feature.strand == -1:
            # Minus-strand feature: run of "<" arrows, label offset by one.
            mask[-1] = mask[-1][:featstart-1] + ["<"] * int(featend+1 - featstart) + mask[-1][featend:]
            if directlabel == 1:
                mask[-1] = mask[-1][:featstart+1] + list(str(feature.type)) + mask[-1][featstart+2 + len(str(feature.type)):]
    #if addpamcutters = 1:
    #cutline = list(" " * len(inputseq)
    #HpaIIsites = HpaII.search(inputseq.seq)
    # Collapse each marker strand from a char list into a string.
    for index, maskline in enumerate(mask):
        maskline = "".join(maskline)
        mask[index] = maskline
    # add labels on a separate "masklab" strand when not labelling directly
    if directlabel == 0:
        masklab = list(" " * (len(inputseq.seq)))
        for feature in inputseq.features:
            featstart = int(feature.location.start)
            featend = int(feature.location.end)
            featname = str(feature.type)
            masklab = masklab[:featstart] + list(str(feature.type)) + list(" " * (featend-1 - featstart - len(feature.type))) + masklab[featend-1:]
        masklab = "".join(masklab)
    # Emit the map 100 bases per row: optional cutline, then the sequence,
    # then every marker strand, then the optional label strand.
    lines = int(round(len(inputseq.seq) / 100)) + 1
    i = 0
    outstring = list(inputseq.name + "\n")
    #print inputseq.name
    outstring = []  # NOTE(review): this discards the name header built on the line above
    while i < lines:
        indexstart = i*100
        indexend = (i+1) * 100
        if indexend > len(inputseq.seq):
            indexend = len(inputseq.seq)
        if addpamcutters ==1:
            outstring.extend((str(indexstart + 1)) + " " + cutline[indexstart:indexend] + " " + str(indexend)+ "\n")
            #print (str(indexstart + 1)) + " " + cutline[indexstart:indexend] + " " + str(indexend)
        outstring.extend(str(indexstart+1) + " " + inputseq.seq[indexstart:indexend] + " " + str(indexend)+ "\n")
        #print str(indexstart+1) + " " + inputseq.seq[indexstart:indexend] + " " + str(indexend)
        for maskline in mask:
            outstring.extend((str(indexstart + 1)) + " " + maskline[indexstart:indexend] + " " + str(indexend)+ "\n")
            #print (str(indexstart + 1)) + " " + maskline[indexstart:indexend] + " " + str(indexend)
        if directlabel == 0:
            outstring.extend(str(indexstart +1) + " " + masklab[indexstart:indexend] + " " + str(indexend)+ "\n")
            #print str(indexstart +1) + " " + masklab[indexstart:indexend] + " " + str(indexend)
        outstring.extend("\n")
        #print "\n"
        i = i + 1
    outstring = "".join(outstring)
    return outstring
In [4]:
def al_digesttarget(list_of_targets, process_to_file=0):
    '''
    Substrate should be a list of SeqRecords (i.e. scaffolds, or whatever).
    Process_to_file [optional] is an integer; if 1 will avoid building list in memory and will write to file instead.
    Defaults to 0, building list in memory.
    Output list has information in it:
    ID is a count of the potential spacer along the scaffold, starting from zero.
    Name is the base position on the scaffold.
    Description is the enzyme that generated the spacer-end (ScrFI/HpaII/BfaI)
    dbxrefs is the name the scaffold was given in the input SeqRecord
    '''
    # Truncates any previous run's output file; per-scaffold writes below
    # re-open the same path in append mode.  NOTE(review): this initial
    # handle is shadowed later and never explicitly closed.
    f = open("genome_allspacers.fa","w")
    global substrate  # left global for interactive inspection in the notebook
    scaffoldcuts = []
    for substrate in list_of_targets:
        cutslist = []
        pos = HpaII.search(substrate.seq)
        # Positions in this list correspond to the right boundaries of fragments;
        # last one is thus the sequence end
        pos.append(len(substrate.seq))
        pos = iter(pos)
        cuts = HpaII.catalyze(substrate.seq)
        # NOTE: pos.next() is Python 2 iterator syntax (next(pos) in Python 3).
        for item in cuts:
            cutslist.append([item, "HpaII", int(pos.next())])
        cuts = BfaI.catalyze(substrate.seq)
        pos = BfaI.search(substrate.seq)
        pos.append(len(substrate.seq))
        pos = iter(pos)
        for item in cuts:
            cutslist.append([item, "BfaI", int(pos.next())])
        cuts = ScrFI.catalyze(substrate.seq)
        pos = ScrFI.search(substrate.seq)
        pos.append(len(substrate.seq))
        pos = iter(pos)
        for item in cuts:
            cutslist.append([item, "ScrFI", int(pos.next())])
        #The above is all to get the results of a catalyze operation (i.e. tuples) into
        # a list format. Next part makes them into SeqRecords.
        cutslistrecords = []
        for i, item in enumerate(cutslist):
            cutslistrecords.append(SeqRecord(item[0], id = str(i), description = str(item[1]), name=str(item[2]), \
                dbxrefs=[str(substrate.id)]))
        cutslist = cutslistrecords[:]
        # This part takes the 3' 20nt of each fragment and makes a new sequence with it.
        # For the 5' end, the Mung-Bean treatment is simulated by removing two more nt (for HpaII and BfaI), or one nt for ScrFI;
        # these would be the 5' overhang. Then we take the reverse-complement of the sequence.
        # The Restriction module just returns sequences as if the top strand only was being cut. In other words,
        # no bases are deleted from consecutive fragments.
        from Bio.Seq import MutableSeq
        twentymers = []
        #record2 = []
        for record2 in cutslist:
            try: # This is because a second run of this code on already mutable seqs seems to fail. Not sure how to flush out and revert back to non-mutables...
                record2.seq = record2.seq.tomutable()
            except:
                # NOTE(review): bare except silently swallows everything, not
                # just the "already mutable" case -- deliberate best-effort.
                pass
            if record2.description == "ScrFI":
                #offset here (e.g. 1:21 is for simulating MBN digeston)
                # because the entry.names are rooted on the right of each fragment, the length
                # of the entry.name has to be subtracted to get the desired left position for the "reverse"
                # tgts
                entry = record2[1:21].reverse_complement\
                    (description=True, id=True, name=True)
                entry.name = str(int(record2.name)+1 - len(record2.seq))
                entry.id = str("%s_R" % record2.id) ##
                twentymers.append(entry)
            else: # Should work for HpaII/BfaI
                entry = record2[2:22].reverse_complement\
                    (description=True, id=True, name=True)
                entry.name = str(int(record2.name)+2 - len(record2.seq))
                entry.id = str("%s_R" % record2.id) ##
                twentymers.append(entry)
            # Forward 20mer: the 3'-most 20 nt of the fragment's top strand.
            record2.seq = record2.seq.toseq()
            record2.id = str("%s_F" % record2.id)
            entry = record2[-20:]
            entry.name = str(int(record2.name)-20)
            twentymers.append(entry)
        for item in twentymers:
            item.dbxrefs = [substrate.id]
        # The ends of the fragments aren't bonafide CRISPR targets; these can be removed:
        noends = []
        twentymerstr = [item for item in twentymers if item.description == "HpaII"]
        trimmed = twentymerstr[1:-1] # removes first and last 20mer
        noends.append(trimmed)
        twentymerstr = [item for item in twentymers if item.description == "BfaI"]
        trimmed = twentymerstr[1:-1]
        noends.append(trimmed)
        twentymerstr = [item for item in twentymers if item.description == "ScrFI"]
        trimmed = twentymerstr[1:-1]
        noends.append(trimmed)
        index = str(substrate.id + str(random.random()))  # NOTE(review): unused
        if process_to_file != 1:
            # Appends a lazy generator per scaffold; flattened on return below.
            scaffoldcuts.append((item for sublist in noends for item in sublist))
        if process_to_file == 1:
            f = open("genome_allspacers.fa","a") #opens file with name of "test.txt"
            for item in [item for sublist in noends for item in sublist]:
                f.write(">lcl|" + str(substrate.id) + "|" + str(item.description) + "|" + str(item.name) + "|" + str(item.id) + "\n")
                f.write(str(item.seq) + "\n")
            f.close()
    if process_to_file != 1:
        return itertools.chain.from_iterable(scaffoldcuts) #dammit, the from_iterable part is important here. that took a while.
In [1]:
def al_scoreguide(guide, genome, genomedict, hits=50):
    '''
    To produce a score for a given guide in a given BLAST database.
    Only evaluates a max of 500 hit sequences (i.e. really bad guides will not necessarily have a super accurate score.)
    This version only searches the plus strand of the BLAST db, meant to be used with PAM BLAST DBs generated such that
    all potential guide hits are on the plus strand.
    Arguments: guide = a SeqRecord object containing a proposed guide sequence
    genome = a BLAST database set up on this machine.
    genomedict = a dict made from a FASTA file identical to the one used to make the BLAST DB. Dict keys should be
    BLAST db hit_def.
    hits = unused here; kept for interface compatibility.
    Returns: the guide SeqRecord with annotations["score"] (int; higher means more specific) and
    annotations["blastdata"] (per-hit score dicts with match_score > 0) filled in.
    '''
    name = multiprocessing.current_process().name
    # Position-dependent mismatch penalty weights (Zhang lab / Hsu et al.),
    # indexed 5'->3' over the 20 nt guide.
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0 ,0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828, 0.615, 0.804, 0.685, 0.583]
    #import random
    # NOTE: random.jumpahead exists only in Python 2 (removed in Python 3).
    random.jumpahead(1)
    #filesuffix = str(int(random.random()*100000000))
    # Temp files are named after the guide id so parallel workers don't collide.
    filesuffix = str(guide.id)
    filename = str("currseq" + filesuffix + ".tmp")
    Bio.SeqIO.write(guide, filename, "fasta")
    # Added dust="no" to stop inflating scores from polyN repeats...
    blastn_cline = NcbiblastnCommandline(query=filename, db=genome, \
        task = "blastn-short",outfmt=5, out=filename + ".blast", max_target_seqs=100, num_threads = 7, evalue = 10, dust="no")
    #timeit.timeit(blastn_cline, number =1)
    blastn_cline()
    result_handle = open(filename + ".blast")
    hits = NCBIXML.parse(result_handle)
    firstmatch = 0
    notfound = 1
    # One BLAST record per query; a single guide was written, so this loop
    # effectively runs once.
    for item in hits:
        scorelist = []
        pamlist = []
        for scaffold in item.alignments:
            for pair in scaffold.hsps:
                # Distance from the aligned region's end to the guide's 3' end.
                hit_threeprime_offset = len(guide) - pair.query_end
                start = pair.sbjct_start
                end = pair.sbjct_end
                if end > start:
                    # Plus-strand hit: PAM lies immediately 3' of the hit.
                    pamstart = end + hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart:pamstart+3]
                elif start > end:
                    # Minus-strand hit: PAM lies upstream; reverse-complement it.
                    pamstart = end - hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart-4:pamstart-1].reverse_complement()
                # Construct a bar-based match string, padded to query length,
                # where bar = match position and space is non-match
                mmloc = []
                mmstr = list(pair.match)
                if pair.query_start > 1:
                    mmstr = list(" " * (pair.query_start - 1)) + mmstr
                if pair.query_end < 20:
                    mmstr = mmstr + list(" " * (20 - pair.query_end))
                mmstr = "".join(mmstr)
                # Test for PAM adjacency (N[AG]G, case-insensitive)
                if len(pam) == 3:
                    if (pam[1] == "G" or pam[1] == "A" or pam[1] == "g" or pam[1] == "a") \
                    and (pam[2] == "G" or pam[2] == "g"):
                        # Partial match (17-19 identities): apply Zhang lab scoring.
                        if (pair.positives >16 and pair.positives < 20):
                            # Record mismatch positions, numbered 20 (5') down to 1 (3').
                            pos = 20
                            for linestring in mmstr:
                                if linestring != "|":
                                    mmloc.append(pos)
                                pos = pos - 1
                            # Actually implement Zhang lab algorithm
                            mmscore = [21 -x for x in mmloc]
                            t1 = 1
                            for mismatchlocation in mmscore:
                                t1 = t1 * (1.0 - float(M[mismatchlocation - 1]))
                            if len(mmscore) > 1:
                                # Mean pairwise distance between mismatch positions.
                                d = (float(max(mmscore)) - float(min(mmscore))) / float((len(mmscore) - 1))
                            else:
                                d = 19
                            t2 = 1 / ( (((19.0 - d)/19.0) * 4) + 1)
                            t3 = float(1)/ float(pow(len(mmloc), 2))
                            scorelist.append({"match_score": float(t1 * t2 * t3 * 100), "scaffold": scaffold.hit_def, \
                                "hit_location":pair.sbjct_start, "hit_sequence": pair.sbjct, "pam":str(pam.seq),\
                                "match_bars":mmstr})
                            #pamlist.append(pam)
                        # Zhang lab algorithm doesn't handle perfect matches (I think?): give it a 50 if it's perfect.
                        # The FIRST perfect match is assumed to be the intended target site and is not penalized.
                        if pair.positives >= 20: #changed from == 20; miight be worth keeping in mind for bugs
                            if firstmatch != 0:
                                scorelist.append({"match_score": float(50), "scaffold": scaffold.hit_def, \
                                    "hit_location":pair.sbjct_start, "hit_sequence": pair.sbjct, "pam":str(pam.seq),\
                                    "match_bars":mmstr})
                                #pamlist.append(pam)
                            firstmatch = 1
                            notfound = 0
                    else:
                        # PAM present but not N[AG]G: hit cannot be cut, contributes 0.
                        scorelist.append({"match_score": float(0), "scaffold": scaffold.hit_def, \
                            "hit_location":pair.sbjct_start, "hit_sequence": pair.sbjct, "pam":str(pam.seq),\
                            "match_bars":mmstr})
                        #pamlist.append(pam)
    os.remove(filename) # want to keep BLAST results? comment out these two lines.
    os.remove(filename + ".blast")
    # Only include the scorelist items that have a score greater than zero
    guide.annotations["blastdata"] = [s for s in scorelist if s["match_score"] > 0]
    finalscore = int(10000.000 / (100.000 + float(sum(item["match_score"] for item in scorelist))))
    guide.annotations["score"] = finalscore
    return guide
In [ ]:
def al_scoremanyguides(guides, genome, genomedict, hits=50, return_blast=1):
    '''
    Score a batch of candidate guides against a BLAST database with a single
    blastn invocation (batched counterpart of al_scoreguide).
    This version only searches the plus strand of the BLAST db, meant to be
    used with PAM BLAST DBs generated such that all potential guide hits are
    on the plus strand.

    Arguments:
        guides: list of SeqRecord objects, each a proposed guide sequence.
        genome: a BLAST database set up on this machine.
        genomedict: dict made from a FASTA file identical to the one used to
            make the BLAST DB; keys should be BLAST db hit_def.  Used to look
            up the PAM sequence adjacent to each hit.
        hits: unused here; kept for interface compatibility.
        return_blast: if 1, attach the per-hit score breakdown to each
            guide's annotations["blastdata"].
    Returns:
        The same guides list, with annotations["score"] (int; higher means
        more specific) and optionally annotations["blastdata"] set on each.
    '''
    name = multiprocessing.current_process().name
    # Position-dependent mismatch penalty weights (Zhang lab / Hsu et al.),
    # indexed 5'->3' over the 20 nt guide.
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0, 0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828, 0.615, 0.804, 0.685, 0.583]
    # NOTE: random.jumpahead exists only in Python 2; decorrelates RNG across workers.
    random.jumpahead(1)
    # Temp files are named after the first guide's id so parallel workers don't collide.
    filesuffix = str(guides[0].id)
    filename = str("currseq" + filesuffix + ".tmp")
    Bio.SeqIO.write(guides, filename, "fasta")
    # dust="no" stops BLAST's low-complexity filter from inflating scores on polyN repeats.
    blastn_cline = NcbiblastnCommandline(query=filename, db=genome,
        task="blastn-short", outfmt=5, out=filename + ".blast", max_target_seqs=100,
        num_threads=7, evalue=10, dust="no")
    blastn_cline()
    result_handle = open(filename + ".blast")
    blasts = NCBIXML.parse(result_handle)
    # NCBIXML.parse yields one BLAST record per query, in the same order the
    # guides were written, so guideindex maps records back onto guides.
    for guideindex, item in enumerate(blasts):
        firstmatch = 0
        notfound = 1
        scorelist = []
        # Within each queried guide, loop through the aligned scaffolds,
        # and within each scaffold through its individual HSPs.
        for scaffold in item.alignments:
            for pair in scaffold.hsps:
                # BUGFIX: use THIS guide's length (was guides[0]) when locating
                # the PAM 3' of the hit; matters if guides differ in length.
                hit_threeprime_offset = len(guides[guideindex]) - pair.query_end
                start = pair.sbjct_start
                end = pair.sbjct_end
                if end > start:
                    # Plus-strand hit: PAM lies immediately 3' of the hit.
                    pamstart = end + hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart:pamstart+3]
                elif start > end:
                    # Minus-strand hit: PAM lies upstream; reverse-complement it.
                    pamstart = end - hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart-4:pamstart-1].reverse_complement()
                # Construct a bar-based match string, padded to the 20 nt query
                # length, where "|" = match and " " = mismatch/unaligned.
                mmloc = []
                mmstr = list(pair.match)
                if pair.query_start > 1:
                    mmstr = list(" " * (pair.query_start - 1)) + mmstr
                if pair.query_end < 20:
                    mmstr = mmstr + list(" " * (20 - pair.query_end))
                mmstr = "".join(mmstr)
                # Test for PAM adjacency (N[AG]G, case-insensitive): only
                # PAM-adjacent hits can be cut and contribute to the penalty.
                if len(pam) == 3:
                    if (pam[1] == "G" or pam[1] == "A" or pam[1] == "g" or pam[1] == "a") \
                            and (pam[2] == "G" or pam[2] == "g"):
                        # Partial match (17-19 identities): apply Zhang lab scoring.
                        if (pair.positives > 16 and pair.positives < 20):
                            # Record mismatch positions, numbered 20 (5') down to 1 (3').
                            pos = 20
                            for linestring in mmstr:
                                if linestring != "|":
                                    mmloc.append(pos)
                                pos = pos - 1
                            # Zhang lab off-target score: positional weight
                            # product (t1) * mismatch-spacing term (t2) *
                            # mismatch-count term (t3).
                            mmscore = [21 - x for x in mmloc]
                            t1 = 1
                            for mismatchlocation in mmscore:
                                t1 = t1 * (1.0 - float(M[mismatchlocation - 1]))
                            if len(mmscore) > 1:
                                d = (float(max(mmscore)) - float(min(mmscore))) / float((len(mmscore) - 1))
                            else:
                                d = 19
                            t2 = 1 / ((((19.0 - d) / 19.0) * 4) + 1)
                            t3 = float(1) / float(pow(len(mmloc), 2))
                            scorelist.append({"match_score": float(t1 * t2 * t3 * 100), "scaffold": scaffold.hit_def,
                                              "hit_location": pair.sbjct_start, "hit_sequence": pair.sbjct,
                                              "pam": str(pam.seq), "match_bars": mmstr})
                        # Perfect matches fall outside the Zhang formula; the FIRST
                        # one is assumed to be the intended site (not penalized),
                        # subsequent ones get a flat 50.
                        if pair.positives >= 20:  # changed from == 20; might be worth keeping in mind for bugs
                            if firstmatch != 0:
                                scorelist.append({"match_score": float(50), "scaffold": scaffold.hit_def,
                                                  "hit_location": pair.sbjct_start, "hit_sequence": pair.sbjct,
                                                  "pam": str(pam.seq), "match_bars": mmstr})
                            firstmatch = 1
                            notfound = 0
                    else:
                        # PAM present but not N[AG]G: hit cannot be cut, contributes 0.
                        scorelist.append({"match_score": float(0), "scaffold": scaffold.hit_def,
                                          "hit_location": pair.sbjct_start, "hit_sequence": pair.sbjct,
                                          "pam": str(pam.seq), "match_bars": mmstr})
        if return_blast == 1:
            # Only keep hits that actually contribute to the penalty.
            guides[guideindex].annotations["blastdata"] = [s for s in scorelist if s["match_score"] > 0]
        finalscore = int(10000.000 / (100.000 + float(sum(hit["match_score"] for hit in scorelist))))
        guides[guideindex].annotations["score"] = finalscore
    result_handle.close()  # fix: close the XML handle before deleting it
    os.remove(filename)  # want to keep BLAST results? comment out these two lines.
    os.remove(filename + ".blast")
    return guides
In [1]:
def al_scoreguide_density(guide, genome, genomedict, hits=50):
    '''
    To produce a score for a given guide in a given BLAST database.
    Variant of al_scoreguide with a looser identity threshold (>12 positives)
    and a more permissive evalue, intended for hit-density estimation.
    Only evaluates a max of 500 hit sequences (i.e. really bad guides will not necessarily have a super accurate score.)
    This version only searches the plus strand of the BLAST db, meant to be used with PAM BLAST DBs generated such that
    all potential guide hits are on the plus strand.
    Arguments: guide = a SeqRecord object containing a proposed guide sequence
    genome = a BLAST database set up on this machine.
    genomedict = a dict made from a FASTA file identical to the one used to make the BLAST DB. Dict keys should be
    BLAST db hit_def.
    hits = unused here; kept for interface compatibility.
    Returns: Tuple containing (finalscore, the guide, a list of the individual hits, their locations and their subscores)
    '''
    name = multiprocessing.current_process().name
    # Position-dependent mismatch penalty weights (Zhang lab / Hsu et al.), 5'->3'.
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0 ,0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828, 0.615, 0.804, 0.685, 0.583]
    import random
    # NOTE: random.jumpahead exists only in Python 2 (removed in Python 3).
    random.jumpahead(1)
    #filesuffix = str(int(random.random()*100000000))
    # Temp files are named after the guide id so parallel workers don't collide.
    filesuffix = str(guide.id)
    filename = str("currseq" + filesuffix + ".tmp")
    Bio.SeqIO.write(guide, filename, "fasta")
    blastn_cline = NcbiblastnCommandline(query=filename, db=genome, \
        task = "blastn-short",outfmt=5, out=filename + ".blast", max_target_seqs=100, num_threads = 7, evalue = 100)
    #timeit.timeit(blastn_cline, number =1)
    blastn_cline()
    result_handle = open(filename + ".blast")
    hits = NCBIXML.parse(result_handle)
    firstmatch = 0
    notfound = 1
    # One BLAST record per query; a single guide was written, so this loop
    # effectively runs once.
    for item in hits:
        scorelist = []
        pamlist = []
        for scaffold in item.alignments:
            for pair in scaffold.hsps:
                # Distance from the aligned region's end to the guide's 3' end.
                hit_threeprime_offset = len(guide) - pair.query_end
                start = pair.sbjct_start
                end = pair.sbjct_end
                if end > start:
                    # Plus-strand hit: PAM lies immediately 3' of the hit.
                    pamstart = end + hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart:pamstart+3]
                elif start > end:
                    # Minus-strand hit: PAM lies upstream; reverse-complement it.
                    pamstart = end - hit_threeprime_offset
                    pam = genomedict[scaffold.hit_def][pamstart-4:pamstart-1].reverse_complement()
                if len(pam) == 3:
                    t1 = 0
                    t2 = 0
                    t3=0
                    # PAM adjacency test (N[AG]G, case-insensitive).
                    if (pam[1] == "G" or pam[1] == "A" or pam[1] == "g" or pam[1] == "a") \
                    and (pam[2] == "G" or pam[2] == "g"):
                        # Partial match (13-19 identities): apply Zhang lab scoring.
                        if (pair.positives >12 and pair.positives < 20):
                            mmloc = []
                            mmstr = list(pair.match)
                            if pair.query_start > 1:
                                mmstr = list(" " * (pair.query_start - 1)) + mmstr
                            if pair.query_end < 20:
                                mmstr = mmstr + list(" " * (20 - pair.query_end))
                            # NOTE(review): result of join is discarded, so mmstr
                            # stays a list; iteration below still sees each char.
                            "".join(mmstr)
                            # Record mismatch positions, numbered 20 (5') down to 1 (3').
                            pos = 20
                            for linestring in mmstr:
                                if linestring != "|":
                                    mmloc.append(pos)
                                pos = pos - 1
                            # Zhang lab off-target score terms t1/t2/t3.
                            mmscore = [21 -x for x in mmloc]
                            t1 = 1
                            for mismatchlocation in mmscore:
                                t1 = t1 * (1.0 - float(M[mismatchlocation - 1]))
                            if len(mmscore) > 1:
                                d = (float(max(mmscore)) - float(min(mmscore))) / float((len(mmscore) - 1))
                            else:
                                d = 19
                            t2 = 1 / ( (((19.0 - d)/19.0) * 4) + 1)
                            t3 = float(1)/ float(pow(len(mmloc), 2))
                            scorelist.append([float(t1 * t2 * t3 * 100), ((t1, t2, t3), scaffold.hit_def, pair.sbjct_start, pair.sbjct)])
                            #pamlist.append(pam)
                        # Perfect matches get a flat 100 in this density variant.
                        if pair.positives >= 20: #changed from == 20; miight be worth keeping in mind for bugs
                            scorelist.append([float(100), ((t1, t2, t3), scaffold.hit_def, pair.sbjct_start, pair.sbjct)])
                            notfound = 0
                    else:
                        # PAM present but not N[AG]G: hit cannot be cut, contributes 0.
                        scorelist.append([float(0), ((t1, t2, t3), scaffold.hit_def, pair.sbjct_start, pair.sbjct)])
                        #pamlist.append(pam)
    os.remove(filename) # want to keep BLAST results? comment out these two lines.
    os.remove(filename + ".blast")
    finalscore = int(10000.000 / (100.000 + float(sum(score for score, details in scorelist))))
    return (finalscore, guide, scorelist)
In [ ]: