In [1]:
from Bio import AlignIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Graphics import GenomeDiagram
from reportlab.lib import colors
from reportlab.lib.units import cm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.legend_handler import HandlerLine2D
import numpy as np

%matplotlib inline

In [2]:
#Import the aligned sequences
all_aln = AlignIO.read('all_seq.aln','clustal')
all_aln.sort()
print all_aln


SingleLetterAlphabet() alignment with 8 rows and 13379 columns
------------AATATGGAAAGAATAAAAGAACTACGAAATCT...--- h1935
--------------TATGGAAAGAATAAAAGAGCTAAGGAGTCT...--- h1978
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...--- h2009
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...TAC h2014
TCAAATATATTCAATATGGAGAGAATAAAAGAACTAAGGGATCT...--- s1935
TCAAATATATTCAATATGGAGAGAATAAAGGAACTAAGAAATCT...--- s1978
---------------ATGGAGAGAATAAAAGAACTAAGAGATCT...--- s2009
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...--- s2014

In [3]:
h_ids = [all_aln[i].id for i in range(0,4)]
s_ids = [all_aln[i].id for i in range(4,8)]
all_ids = [all_aln[i].id for i in range(len(all_aln))]

In [4]:
#Create Genome Diagram for human
h_tracks = GenomeDiagram.Diagram('Human H1N1 Tracks Plot')

h_features = {}
h_feature_set = {}

#Generate tracks and feature sets for each year
track_count = 1
for i in h_ids: 
    h_features['%s' %i] = h_tracks.new_track(track_count, greytrack=False)
    h_feature_set['%s'%i] = h_features['%s' %i].new_set()
    track_count += 1

#Repeat for swine
s_tracks = GenomeDiagram.Diagram('Swine H1N1 Tracks Plot')

s_features = {}
s_feature_set = {}

track_count = 1
for i in s_ids:
    s_features['%s' %i] = s_tracks.new_track(track_count, greytrack=False)
    s_feature_set['%s'%i] = s_features['%s' %i].new_set()
    track_count += 1

In [5]:
def get_diff_locations(ref,compare):
    """
    Takes 2 Biopython Seq objects of equal size (aligned).
    Returns a list of locations at which base in compare sequence != base in ref sequence.
    """
    if len(ref) != len(compare):
        raise ValueError('Seqs are not the same length!')
    
    diff_locations = [0]
    
    for i in range(len(ref)):
        if ref[i] != compare[i]:
            diff_locations.append(i)
    
    del diff_locations[0]
    
    return diff_locations

In [6]:
years = [1935,1978,2009,2014]

diffs = {}

for i in range(len(years)):
    for ii in range(len(years)):
        diffs['h_%i_%i'% (years[i],years[ii])] = get_diff_locations(all_aln[i].seq,all_aln[ii].seq)
        diffs['s_%i_%i'% (years[i],years[ii])] = get_diff_locations(all_aln[i+4].seq,all_aln[ii+4].seq)
        diffs['h_%i_s_%i'% (years[i],years[ii])] = get_diff_locations(all_aln[i].seq,all_aln[ii+4].seq)

In [7]:
#Baseline solid green tracks for reference sequences (1935)
h_feature_set['h1935'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human: 1935 (reference)', label=True, label_angle=0);

s_feature_set['s1935'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Swine: 1935 (reference)', label=True, label_angle=0);

In [8]:
#Background green to show similarities
h_feature_set['h1978'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human: 1978 --- %i differences vs. 1935'%len(diffs['h_1935_1978']), 
                                   label=True, label_angle=0);
#Add blue to show each difference
for i in range(len(diffs['h_1935_1978'])):
    feature = SeqFeature(FeatureLocation(diffs['h_1935_1978'][i],diffs['h_1935_1978'][i]))
    h_feature_set['h1978'].add_feature(feature, color=colors.blue)

#Repeat for swine
s_feature_set['s1978'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Swine: 1978 --- %i differences vs. 1935'%len(diffs['s_1935_1978']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['s_1935_1978'])):
    feature = SeqFeature(FeatureLocation(diffs['s_1935_1978'][i],diffs['s_1935_1978'][i]))
    s_feature_set['s1978'].add_feature(feature, color=colors.blue)

In [9]:
#Repeat above cell for 2009
h_feature_set['h2009'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human: 2009 --- %i differences vs. 1935'%len(diffs['h_1935_2009']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['h_1935_2009'])):
    feature = SeqFeature(FeatureLocation(diffs['h_1935_2009'][i],diffs['h_1935_2009'][i]))
    h_feature_set['h2009'].add_feature(feature, color=colors.blue)


s_feature_set['s2009'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Swine: 2009 --- %i differences vs. 1935'%len(diffs['s_1935_2009']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['s_1935_2009'])):
    feature = SeqFeature(FeatureLocation(diffs['s_1935_2009'][i],diffs['s_1935_2009'][i]))
    s_feature_set['s2009'].add_feature(feature, color=colors.blue)

In [10]:
#Repeat for 2014
h_feature_set['h2014'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human: 2014 --- %i differences vs. 1935'%len(diffs['h_1935_2014']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['h_1935_2014'])):
    feature = SeqFeature(FeatureLocation(diffs['h_1935_2014'][i],diffs['h_1935_2014'][i]))
    h_feature_set['h2014'].add_feature(feature, color=colors.blue)


s_feature_set['s2014'].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Swine: 2014 --- %i differences vs. 1935'%len(diffs['s_1935_2014']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['s_1935_2014'])):
    feature = SeqFeature(FeatureLocation(diffs['s_1935_2014'][i],diffs['s_1935_2014'][i]))
    s_feature_set['s2014'].add_feature(feature, color=colors.blue)

In [11]:
#Write each diagram to a png, then read it back in using Matplotlib
h_tracks.draw(format='linear', pagesize=(14*cm,7*cm), fragments=1,
         start=0, end=13378)
h_tracks.write('h_tracks.png', 'png', dpi=600)
h_tracks_im = mpimg.imread('h_tracks.png');

s_tracks.draw(format='linear', pagesize=(14*cm,7*cm), fragments=1,
         start=0, end=13378)
s_tracks.write('s_tracks.png', 'png', dpi=600)
s_tracks_im = mpimg.imread('s_tracks.png');

In [12]:
#Plot each set of tracks
fig_h = plt.figure(figsize=(14,7),dpi=600)
ax_h = fig_h.add_axes([0.025,0.025,0.95,0.95],frameon=False)
plt.axis('off')
plt.imshow(h_tracks_im);

fig_s = plt.figure(figsize=(14,7),dpi=600)
ax_s = fig_s.add_axes([0.025,0.025,0.95,0.95],frameon=False)
plt.axis('off')
plt.imshow(s_tracks_im);



In [13]:
#Create Genome Diagram to compare human and swine in eaach year
hs_tracks = GenomeDiagram.Diagram('Human vs. Swine H1N1 Tracks Plot')

hs_features1935 = hs_tracks.new_track(1, greytrack=False)
hs_feature_set1935 = hs_features1935.new_set()

hs_features1978 = hs_tracks.new_track(2, greytrack=False)
hs_feature_set1978 = hs_features1978.new_set()

hs_features2009 = hs_tracks.new_track(3, greytrack=False)
hs_feature_set2009 = hs_features2009.new_set()

hs_features2014 = hs_tracks.new_track(4, greytrack=False)
hs_feature_set2014 = hs_features2014.new_set()

In [14]:
#Background green to show similarities
hs_feature_set1935.add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human vs. Swine: 1935 --- %i differences'%len(diffs['h_1935_s_1935']), 
                                   label=True, label_angle=0);
#Add blue to show differences
for i in range(len(diffs['h_1935_s_1935'])):
    feature = SeqFeature(FeatureLocation(diffs['h_1935_s_1935'][i],diffs['h_1935_s_1935'][i]))
    hs_feature_set1935.add_feature(feature, color=colors.blue)

#Repeat for each year
hs_feature_set1978.add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human vs. Swine: 1978 --- %i differences'%len(diffs['h_1978_s_1978']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['h_1978_s_1978'])):
    feature = SeqFeature(FeatureLocation(diffs['h_1978_s_1978'][i],diffs['h_1978_s_1978'][i]))
    hs_feature_set1978.add_feature(feature, color=colors.blue)


hs_feature_set2009.add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human vs. Swine: 2009 --- %i differences'%len(diffs['h_2009_s_2009']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['h_2009_s_2009'])):
    feature = SeqFeature(FeatureLocation(diffs['h_2009_s_2009'][i],diffs['h_2009_s_2009'][i]))
    hs_feature_set2009.add_feature(feature, color=colors.blue)


hs_feature_set2014.add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human vs. Swine: 2014 --- %i differences'%len(diffs['h_2014_s_2014']), 
                                   label=True, label_angle=0);
for i in range(len(diffs['h_2014_s_2014'])):
    feature = SeqFeature(FeatureLocation(diffs['h_2014_s_2014'][i],diffs['h_2014_s_2014'][i]))
    hs_feature_set2014.add_feature(feature, color=colors.blue)

In [15]:
#Write diagram to a png, then read it back in using Matplotlib
hs_tracks.draw(format='linear', pagesize=(14*cm,7*cm), fragments=1,
         start=0, end=13378)
hs_tracks.write('hs_tracks.png', 'png', dpi=600)
hs_tracks_im = mpimg.imread('hs_tracks.png');

In [16]:
#Plot diagram
fig_hs = plt.figure(figsize=(14,7),dpi=600)
ax_hs = fig_hs.add_axes([0.025,0.025,0.95,0.95],frameon=False)
plt.axis('off')
plt.imshow(hs_tracks_im);



In [17]:
#Plot differences vs. 1935 over time

years = [1935,1978,2009,2014]

#Lists of human and siwne diff lengths vs. 1935
hdl = [len(diffs['h_1935_%i'%years[i]]) for i in range(len(years))]
sdl = [len(diffs['s_1935_%i'%years[i]]) for i in range(len(years))]

fig_diffs_over_time = plt.figure(figsize=(6,4), dpi=600);
ax_diffs_over_time = fig_diffs_over_time.add_axes([0.025,0.025,0.95,0.95])

hline = plt.plot(years,hdl, 'bo-', label='Human Viruses')
sline = plt.plot(years,sdl, 'g^-', label='Swine Viruses')

ax_diffs_over_time.set_xlabel('Year')
ax_diffs_over_time.set_ylabel('Differences vs. 1935')

plt.legend(loc='best');



In [18]:
#Plot differences vs. 1935 as a percentage of total genome length over time

years = [1935,1978,2009,2014]

#Lists of human and siwne diff lengths vs. 1935 as a percentage of total genome length
hdl_percent = [i*100./13378. for i in hdl]
sdl_percent = [i*100./13378. for i in sdl]

fig_diffs_over_time2 = plt.figure(figsize=(6,4), dpi=600);
ax_diffs_over_time2 = fig_diffs_over_time2.add_axes([0.025,0.025,0.95,0.95])

hline2 = plt.plot(years,hdl_percent, 'bo-', label='Human Viruses')
sline2 = plt.plot(years,sdl_percent, 'g^-', label='Swine Viruses')

ax_diffs_over_time2.set_xlabel('Year')
ax_diffs_over_time2.set_ylabel('% Change Since 1935')

plt.legend(loc='best');



In [21]:
#Comparison of human 1978 and 2009
h_1978_2009_track = GenomeDiagram.Diagram('Human 1978 vs. 2009 H1N1 Track Plot')

h_1978_2009_features = h_1978_2009_track.new_track(1, greytrack=False)
h_1978_2009_feature_set = h_1978_2009_features.new_set()

#Background green to show similarities
h_1978_2009_feature_set.add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                   name = 'Human: 1978 vs. 2009 --- %i differences'%len(diffs['h_1978_2009']), 
                                   label=True, label_angle=0);
#Add blue to show differences
for i in range(len(diffs['h_1978_2009'])):
    feature = SeqFeature(FeatureLocation(diffs['h_1978_2009'][i],diffs['h_1978_2009'][i]))
    h_1978_2009_feature_set.add_feature(feature, color=colors.blue)

h_1978_2009_track.draw(format='linear', pagesize=(14*cm,3*cm), fragments=1,
         start=0, end=13378)
h_1978_2009_track.write('h_1978_2009_track.png', 'png', dpi=600)
h_1978_2009_track_im = mpimg.imread('h_1978_2009_track.png');

fig_h_1978_2009 = plt.figure(figsize=(14,7),dpi=600)
ax_h_1978_2009 = fig_h_1978_2009.add_axes([0.025,0.025,0.95,0.95],frameon=False)
plt.axis('off')
plt.imshow(h_1978_2009_track_im);