In [ ]:
from Bio import SeqIO, AlignIO
from Bio.Align.Applications import ClustalwCommandline
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Graphics import GenomeDiagram
from reportlab.lib import colors
from reportlab.lib.units import cm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
#Make a variable point to where ClustalW2 is on your computer
clustalw_exe = r"C:\Program Files (x86)\ClustalW2\clustalw2.exe"
#Make sure you are in the same directory as the sequence FASTA files
In [ ]:
#Read in the SCN5A refrence sequence
SCN5A_ref = SeqIO.read('SCN5A.fasta','fasta')
SCN5A_ref.name = 'SCN5A_ref'
#Read in the SCN5A sequence with 11 known LQTS3 mutations added in
SCN5A_mut = SeqIO.read('SCN5A_mutants.fasta','fasta')
SCN5A_mut.name = 'SCN5A_mut'
In [ ]:
#Put both reference and mutant sequences into a list
SCN5A_all = [SCN5A_ref,SCN5A_mut]
#Write the sequences to a single FASTA file, so ClustalW can align them
SeqIO.write(SCN5A_all,'SCN5A_all.fasta','fasta');
In [ ]:
#THIS TOOK FOREVER TO RUN
'''
#Align the ref and mutant sequences
cline_Clustal = ClustalwCommandline(clustalw_exe,infile='SCN5A_all.fasta')
stdout, sdterr = cline_Clustal()
'''
In [ ]:
#Read in the alignment file
SCN5A_aln = AlignIO.read('SCN5A_all.aln','clustal')
In [ ]:
def get_diff_locations(ref,compare):
"""
Takes 2 Biopython Seq objects of equal size (aligned).
Returns a list of locations at which base in compare sequence != base in ref sequence.
"""
if len(ref) != len(compare):
raise ValueError('Seqs are not the same length!')
diff_locations = []
for i in range(len(ref)):
if ref[i] != compare[i]:
diff_locations.append(i)
return diff_locations
In [ ]:
#Find differences between ref and mutant sequences
SCN5A_diffs = get_diff_locations(SCN5A_aln[0].seq,SCN5A_aln[1].seq)
In [ ]:
#Set up track plotting diagram
tracks = GenomeDiagram.Diagram('SCN5A Mutations')
features = tracks.new_track(1,greytrack=False)
feature_set = features.new_set()
#Background green to show reference seq
feature_set.add_feature(SeqFeature(FeatureLocation(0,(len(SCN5A_aln[0].seq)-1))),
name = 'SCN5A Mutations Associated w/ LQTS3',
label=True, label_angle=0);
#Add blue to show mutations
for i in range(len(SCN5A_diffs)):
feature = SeqFeature(FeatureLocation(SCN5A_diffs[i],SCN5A_diffs[i]))
feature_set.add_feature(feature, color=colors.blue)
In [ ]:
#Write the diagram to a png, then read it back in using matplotlib
tracks.draw(format='linear', pagesize=(15*cm,10*cm), fragments=4,
start=0, end=(len(SCN5A_aln[0].seq)-1))
tracks.write('SCN5A_tracks.png', 'png', dpi=600)
tracks_im = mpimg.imread('SCN5A_tracks.png');
#Plot tracks
fig_tracks = plt.figure(figsize=(15,10),dpi=600)
ax_tracks = fig_tracks.add_axes([0.025,0.025,0.95,0.95],frameon=False)
plt.axis('off')
plt.imshow(tracks_im);
This diagram shows the SCN5A gene (sequence here), broken up into 4 fragments proceeding from 5' at the upper left to 3' at the bottom right. There are 11 mutations indicated in blue, and they all appear to be toward the 3' end of the gene. In fact, the several mutations closest to the 3' end are tightly grouped within the same short region. These 11 mutations are known to cause LQTS3, as indicated by OMIM. The mutations are: