READ ME:

This could be a way to quantify the differences between genomes in an alignment. It's a work in progress and is not very well commented yet.


In [1]:
import Bio
from Bio import AlignIO
from Bio import Phylo
import inspect
import os

#specify dir where alignment files are located
aligned_dir = "Biopython Alignment"

In [2]:
#optional step
alignment = AlignIO.read(os.path.join(aligned_dir,'all_seq.aln'),'clustal')
print alignment


SingleLetterAlphabet() alignment with 5 rows and 13387 columns
TCAATTATATTCAATATGGAAAGAATAAAAGAACTAAGGAATCT...--- 1945
TCAATTATATTCAATATGGAAAGAATAAAAGAACTAAGGAATCT...AAA 1943
--------------TATGGAAAGAATAAAAGAGCTAAGGAGTCT...--- 1978
------------AATATGGAAAGAATAAAAGAACTACGAAATCT...--- 1935
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...A-- 2009

Phylo Operations

From biopython phylo: The distance values show the number of substitutions as a proportion of the length of the alignment (excluding gaps).


In [3]:
#http://biopython.org/wiki/Phylo_cookbook
#http://biopython.org/wiki/Phylo
tree = Phylo.read(os.path.join(aligned_dir,'all_seq.dnd'),"newick")

In [8]:
Phylo.draw_ascii(tree)
print tree


        _________________________________________________________________ 2009
    ___|
  _|   |_________ 1935
 | |
 | |__________ 1978
_|
 |___ 1945
 |
 |___ 1943

Tree(rooted=False, weight=1.0)
    Clade()
        Clade(branch_length=0.0046)
            Clade(branch_length=0.00835)
                Clade(branch_length=0.14128, name='2009')
                Clade(branch_length=0.02236, name='1935')
            Clade(branch_length=0.02491, name='1978')
        Clade(branch_length=0.00976, name='1945')
        Clade(branch_length=0.00912, name='1943')

In [5]:
#get tree elements
elements = list(tree.find_elements())
named_elems = [i for i in elements if i.name != None]
named_elems


Out[5]:
[Clade(branch_length=0.14128, name='2009'),
 Clade(branch_length=0.02236, name='1935'),
 Clade(branch_length=0.02491, name='1978'),
 Clade(branch_length=0.00976, name='1945'),
 Clade(branch_length=0.00912, name='1943')]

In [9]:
branch_lens = []
for i in named_elems:
    branch_lens.append(i.branch_length)
    print i.branch_length,
    print i.branch_length*alignment.get_alignment_length() #I guess we don't really need to do this. We could just compare i.branch_length


0.14128 1891.31536
0.02236 299.33332
0.02491 333.47017
0.00976 130.65712
0.00912 122.08944

In [14]:
print sum(branch_lens)
print tree.total_branch_length() #if you run 'print tree' you see that several of the clades do not have names,which is why sum(branch_lens) != tree.total_branch_length().


0.20743
0.22038