Working with ete2

Data munging

We assume that trees are provided in the newick-tree format. Conveniently, in Python there is a package for handling newick trees.

Basically we just use ete2 to store the tree structure for iterating and visualization. The newick trees produced by phylobayes are not directly following any standard type of newick tree that ete2 understands so if we load directly there is a loss of information. Specifically, the transitions at internal nodes will be ignored.

To fix this there is a function called fix_tree that reads the tree in to get the basic structure then it iterates over the newick formatted string to both confirm the structure and repopulate the missing informtion.

Before we can fix the tree we must standardize it first. For example we need to make the following types of substitutions

_Q:0.0367044:Q) --> _Q:0.0367044)
_Q:0.044464:Q:0.0512411:K) --> _Q:0.044464)

In [1]:
import os
from ete2 import Tree,TreeStyle
from ete2 import faces, AttrFace
from vertebratesLib import *

def standardize_tree(nwTree):
    pattern1 = "[\w|:]+.+?[,|\)|\(]"
    parsedTree = nwTree

    for r in re.finditer(pattern1,nwTree):
        matched = nwTree[r.start(0):r.end(0)]
        parsed = re.sub(":[A-Z]","",matched)
        multiples = re.findall(":\d\.[\de-]+",parsed)
        transitions = "".join(multiples)
    
        if len(multiples) > 1:
            parsed = parsed.replace(transitions,multiples[0])
        parsedTree = parsedTree.replace(matched,parsed)

    return re.sub("\)_[A-Z]:",")",parsedTree)[:-3]+";"

## standardize a phylobayes tree
pbtree = "(Acipenser__R:0.137306:R:0.167816:K,((Takifugu_r_K:0.623751:K,Danio_reri_K:0.428613:K)_K:0.319631:K,((Lepisosteu_K:0.00243856:K,Lepisost00_K:0.00389519:K)_K:0.24244:K,Amia_calva_K:0.252829:K)_K:0.0341578:K)_K:0.156204:K,(Polypterus_K:0.538725:K,((((Neotrygon__K:0.199854:K,(Raja_clava_K:0.0276807:K,Leucoraja__K:0.0338708:K)_K:0.142449:K)_K:0.123911:K,((Ginglymost_K:0.0614048:K,Chiloscyll_K:0.063792:K)_K:0.0894881:K,(Scyliorhin_K:0.126886:K,Carcharodo_K:0.0816871:K)_K:0.0261212:K)_K:0.100872:K)_K:0.138297:K,Callorhinc_K:0.406368:K)_K:0.596451:K,(Latimeria__Q:0.435962:Q,((Neoceratod_E:0.139027:E,((Protopteru_E:0.0118172:E,Protopte00_E:0.00600774:E)_E:0.125265:E,Lepidosire_E:0.129048:E)_E:0.283337:E)_E:0.185699:E:0.230877:Q,(((Rhinatrema_K:0.168308:K,((Typhlonect_K:0.00595736:K,Typhlone00_K:0.0172491:K)_K:0.154939:K,Geotrypete_K:0.173365:K)_K:0.0837134:K)_K:0.277616:K,(((Discogloss_K:0.211532:K,Bombina_ma_K:0.247467:K)_K:0.0796761:K,((Pipa_pipa__K:0.246234:K,(Xenopus_tr_K:0.236198:K,Hymenochir_K:0.300396:K)_K:0.0241566:K)_K:0.103826:K,(Megophrys__K:0.422485:K,((Rana_chens_K:0.096737:K,(Pelophylax_K:0.0493814:K,Pelophyl00_K:0.0377842:K)_K:0.0159202:K)_K:0.212804:K,(Cyclorana__K:0.0911112:K,(Espadarana_R:0.119242:R:0.018515:K,Atelopus_z_S:0.0687643:S:0.0657081:K)_K:0.0180531:K)_K:0.146541:K)_K:0.133463:K)_K:0.0325331:K)_K:0.0320521:K)_K:0.45317:K,((Hynobius_c_K:0.13165:K,Andrias_da_K:0.116493:K)_K:0.0732691:K,(Siren_lace_K:0.293752:K,(Proteus_an_K:0.252888:K,((Salamandra_K:0.0876578:K,(Pleurodele_K:0.0771019:K,(Notophthal_K:0.0586901:K,(Cynops_pyr_K:0.0403819:K,Calotriton_K:0.048577:K)_K:0.0333907:K)_K:0.0108502:K)_K:0.0327197:K)_K:0.131402:K,Ambystoma__K:0.199927:K)_K:0.0356739:K)_K:0.0444576:K)_K:0.0182886:K)_K:0.293596:K)_K:0.0448927:K)_K:0.00925933:K:0.0960703:Q,((Ornithorhy_E:0.228367:E:0.0919702:Q,((Monodelphi_Q:0.116685:Q,(Sarcophilu_K:0.0956754:K,Macropus_e_K:0.0967417:K)_K:0.018078:K:0.0103883:Q)_Q:0.173422:Q,((Loxodonta__Q:0.134972:Q,Dasypus_no_Q:0.160758:Q)_Q:0.00642451:Q,((Mus_muscul_Q:0.227068:Q,Homo_sapie_Q:0.100987:Q)_Q:0.0152245:Q,(Felis_catu_Q:0.057015:Q,Canis_fami_Q:0.0722375:Q)_Q:0.0483643:Q)_Q:0.0151335:Q)_Q:0.235403:Q)_Q:0.0967933:Q)_Q:0.289468:Q,((Sphenodon__Q:0.263159:Q,((Tarentola__Q:0.138711:Q,Eublephari_Q:0.126597:Q)_Q:0.134156:Q,((Scincella__Q:0.129828:Q,((Saproscinc_E:0.0436589:E,Lamprophol_E:0.039153:E)_E:0.0132249:E,Carlia_rub_E:0.0421727:E)_E:0.0298716:E:0.0919685:Q)_Q:0.144222:Q,((Tupinambis_Q:0.263447:Q,Podarcis_s_Q:0.166879:Q)_Q:0.0318745:Q,(((((Thamnophis_Q:0.0743196:Q,(Pantheroph_Q:0.0216722:Q,Opheodrys__Q:0.0297485:Q)_Q:0.0184298:Q)_Q:0.0120419:Q,(Ophiophagu_Q:0.0287086:Q,Micrurus_f_Q:0.0346685:Q)_Q:0.0394398:Q)_Q:0.0306892:Q,(Echis_colo_Q:0.0582188:Q,(Sistrurus__Q:0.0146283:Q,Crotalus_a_Q:0.0105047:Q)_Q:0.0452256:Q)_Q:0.0417915:Q)_Q:0.0890153:Q,(Python_reg_Q:0.0643131:Q,Boa_constr_Q:0.0698609:Q)_Q:0.0173203:Q)_Q:0.244803:Q,(Elgaria_mu_Q:0.162227:Q,((Pogona_vit_Q:0.142077:Q,Chamaeleo__Q:0.204871:Q)_Q:0.0904862:Q,((Iguana_igu_Q:0.0899908:Q,Basiliscus_Q:0.0856177:Q)_Q:0.00740167:Q,(Sceloporus_Q:0.118961:Q,Anolis_car_Q:0.182233:Q)_Q:0.00884581:Q)_Q:0.0816258:Q)_Q:0.0306025:Q)_Q:0.00670356:Q)_Q:0.0222824:Q)_Q:0.0403438:Q)_Q:0.025957:Q)_Q:0.231231:Q)_Q:0.0674595:Q,(((Phrynops_h_Q:0.0852888:Q,Pelusios_c_Q:0.111423:Q)_Q:0.0256123:Q,(Pelodiscus_Q:0.119918:Q,(Sternother_Q:0.0824974:Q,(((Trachemys__Q:0.0144476:Q,Emys_orbic_Q:0.0187597:Q)_Q:0.0159572:Q,(Chinemys_r_Q:0.021249:Q,Chelonoidi_Q:0.0343593:Q)_Q:0.0145747:Q)_Q:0.00699482:Q,Caretta_ca_Q:0.0473377:Q)_Q:0.00586399:Q)_Q:0.0272828:Q)_Q:0.0283806:Q)_Q:0.0828567:Q,(((Struthio_c_Q:0.0456057:Q,Dromaius_n_Q:0.0442109:Q)_Q:0.040561:Q,(Taeniopygi_Q:0.2042:Q,((Meleagris__Q:0.0381837:Q,Gallus_gal_Q:0.0307035:Q)_Q:0.106136:Q,Anas_platy_Q:0.0913882:Q)_Q:0.0298145:Q)_Q:0.0373143:Q)_Q:0.189056:Q,(Crocodylus_Q:0.0349965:Q,(Caiman_cro_Q:0.0451652:Q,Alligator__Q:0.0189561:Q)_Q:0.0222454:Q)_Q:0.204443:Q)_Q:0.0457489:Q)_Q:0.0369577:Q)_Q:0.126106:Q)_Q:0.238162:Q)_Q:0.306579:Q)_Q:0.0367044:Q)_Q:0.044464:Q:0.0512411:K)_K:0.322251:K)_K:0.0235979:K)_K;"
nwtree = standardize_tree(pbtree)
#print(pbtree)
#print(nwree)

## fix phylobayes tree (standardize, add internal nodes and extract extra info)
fixedTree,treeSummary = fix_tree(pbtree)
for key in ['aa','pairs','transitions','dist','parent']:
    print("%s - %s"%(key, treeSummary["Danio_reri_K"][key]))

print("...parent N4- %s"%treeSummary["N4"]['pairs'])

## plot the tree
def my_layout(node):
    if node.is_leaf():
        name_face = AttrFace("name",fsize=25)
        faces.add_face_to_node(name_face, node, column=0, position="branch-right")
    else:
        name_face = AttrFace("name", fsize=20, fgcolor="red")
        faces.add_face_to_node(name_face, node, column=0, position="branch-right")
        
ts = TreeStyle()
ts.show_leaf_name = False
ts.show_branch_length = True
ts.show_branch_support = True
ts.scale =  180
ts.layout_fn = my_layout
out = fixedTree.render("./figures/simple-tree.png", units="mm", tree_style=ts,dpi=400)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-1-d6e857214566> in <module>()
     29 fixedTree,treeSummary = fix_tree(pbtree)
     30 for key in ['aa','pairs','transitions','dist','parent']:
---> 31     print("%s - %s"%(key, treeSummary["Danio_reri_K"][key]))
     32 
     33 print("...parent N4- %s"%treeSummary["N4"]['pairs'])

KeyError: 'Danio_reri_K'

In [10]:
from IPython.display import Image
Image(filename='./figures/simple-tree.png')


Out[10]:

In [ ]:

A note on unrooted trees

From the documentation...

When a tree is loaded from external sources, a pointer to the top-most node is returned. This is called the tree root, and it will exist even if the tree is conceptually considered as unrooted. This is, the root node can be considered as the master node, since it represents the whole tree structure. Unrooted trees can be identified as trees in which master root node has more than two children

Phylogenetic Trees

We want to link the alignment with the tree. This is the example from the docs.


In [18]:
from ete2 import PhyloTree, TreeStyle

alg = """
 >Dme_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEAL--YYASQTDDIKDRREEAH
 >Dme_002
 MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
 >Cfa_001
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Mms_001
 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Hsa_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_002
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_002
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Hsa_002
 MAEAPDETIQQFM-LTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Mmu_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_001
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_003
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
"""

def get_example_tree():

    # Performs a tree reconciliation analysis 
    gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
    genetree = PhyloTree(gene_tree_nw)
    sptree = PhyloTree(species_tree_nw)
    recon_tree, events = genetree.reconcile(sptree)
    recon_tree.link_to_alignment(alg)
    return recon_tree, TreeStyle()

# Visualize the reconciled tree
t, ts = get_example_tree()
out = t.render("./figures/phylotree-example.png", w=750)


Warnning: [5] terminal nodes could not be found in the alignment.

In [19]:
Image(filename='./figures/phylotree-example.png')


Out[19]:

In [ ]: