Compares two sequences and identifies similar features.
Useful in the case of identifying elements of a shuffled sequence.
Requires an original annotated og_reference_sequence.ape file and a new_reference_sequence.ape file located in the notebook folder.
(Should it grab annotated sequences and use those blocks to try to align to new ref?)
In [ ]:
#load necessary modules
from Bio import SeqIO #reading and writing genbank files
from Bio import pairwise2 #alignments
#generation and output of .ape file
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
from Bio import SeqFeature
from Bio.SeqFeature import FeatureLocation
#dict structure
from collections import defaultdict
In [ ]:
#load original reference sequence
og_reference_sequence = list(SeqIO.parse(open("og_reference_sequence.ape","r"), "genbank"))[0]
print "Name %s, %i features" % (og_reference_sequence.name, len(og_reference_sequence.features))
#load new reference sequence
new_reference_sequence = list(SeqIO.parse(open("new_reference_sequence.ape","r"), "genbank"))[0]
print "Name %s, %i features" % (new_reference_sequence.name, len(new_reference_sequence.features))
local alignment and variables describing the match and gap penalties http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html
alignments = pairwise2.align.localms(og_refseq.seq, "CCA",2, -1, -.5, -.1)
Identical characters are given 2 points, 1 point is deducted for each non-identical character 0.5 points are deducted when opening a gap and 0.1 points are deducted when extending it
In [ ]:
#Main scanning loop
scan_stop = len(new_reference_sequence.seq)
slice1 = 0
slice2 = 5
annotation_hits = defaultdict(list) #append format ['feature_name'] = [] <- (start_pos, end_pos) ...
while True:
if slice2 > scan_stop:
slice2 = scan_stop
if slice1 >= scan_stop:
break
alignments = pairwise2.align.localms(og_reference_sequence.seq, new_reference_sequence.seq[slice1:slice2] ,2, -1, -.5, -.1)
#better selection
if len(alignments) >= 1: #unique, add to list
fragment_start = alignments[0][3]
fragment_end = alignments[0][4]
for feature in og_reference_sequence.features:
if fragment_start >= feature.location.start and fragment_start <= feature.location.end and fragment_end >= feature.location.start and fragment_end <= feature.location.end:
#annotation hit
annotation_hits[feature.qualifiers['label'][0]].append((fragment_start, fragment_end))
slice1+=5
slice2+=5
else:
slice1+=1
slice2+=1
In [ ]:
print annotation_hits
In [ ]:
#function for eliminating overlapping tuples
def merge(times):
saved = list(times[0])
for st, en in sorted([sorted(t) for t in times]):
if st <= saved[1]+1:
saved[1] = max(saved[1], en)
else:
yield tuple(saved)
saved[0] = st
saved[1] = en
yield tuple(saved)
In [ ]:
#apply merge tuples to annotation hits dictionary
for annotation in annotation_hits:
annotation_hits[annotation] = list(merge(annotation_hits[annotation]))
In [ ]:
#add annotation features
for annotation in annotation_hits:
for ref_feature in og_reference_sequence.features:
if ref_feature.qualifiers['label'][0] == annotation:
og_qualifiers = ref_feature.qualifiers
break
for location in annotation_hits[annotation]:
feature_location = FeatureLocation(SeqFeature.ExactPosition(location[0]),SeqFeature.ExactPosition(location[1]))
feature = SeqFeature.SeqFeature(location=feature_location, type=ref_feature.type)
for qual in og_qualifiers:
feature.qualifiers[qual] = og_qualifiers[qual]
new_reference_sequence.features.append(feature)
In [ ]:
#output new ape file
SeqIO.write(new_reference_sequence, 'output.ape', 'genbank')
#open file
%system open output.ape
In [ ]:
from Bio.Emboss.Applications import WaterCommandline
water = WaterCommandline(gapopen=10, gapextend=0.5)
water.asequence = 'asis:' + str(og_reference_sequence.seq)
water.bsequence = 'asis:' + str(new_reference_sequence.seq)
water.stdout = True
water()
In [ ]: