A transition count matrix is a matrix that contains all the vocabulary (rows) and all of the positions (columns) and each of the elements is simply a count of the number of times that a transition occurs. This matrix is can be created from an alignment matrix or it may be found by counting all of the transitions in a given phylogenetic tree.
We assume that trees are provided in the newick-tree format. Conveniently, in Python there is a package for handling newick trees.
Basically we just use ete2 to store the tree structure for iterating and visualization. The newick trees produced by phylobayes are not directly following any standard type of newick tree that ete2 understands so if we load directly there is a loss of information. Specifically, the transitions at internal nodes will be ignored.
To fix this there is a function called fix_tree that reads the tree in to get the basic structure then it iterates over the newick formatted string to both confirm the structure and repopulate the missing informtion.
Before we can fix the tree we must standardize it first. For example we need to make the following types of substitutions
_Q:0.0367044:Q) --> _Q:0.0367044)
_Q:0.044464:Q:0.0512411:K) --> _Q:0.044464)
In [2]:
import os
from ete2 import Tree,TreeStyle
from ete2 import faces, AttrFace
from vertebratesLib import *
def standardize_tree(nwTree):
pattern1 = "[\w|:]+.+?[,|\)|\(]"
parsedTree = nwTree
for r in re.finditer(pattern1,nwTree):
matched = nwTree[r.start(0):r.end(0)]
parsed = re.sub(":[A-Z]","",matched)
multiples = re.findall(":\d\.[\de-]+",parsed)
trans = "".join(multiples)
if len(multiples) > 1:
parsed = parsed.replace(trans,multiples[0])
parsedTree = parsedTree.replace(matched,parsed)
return re.sub("\)_[A-Z]:",")",parsedTree)[:-3]+";"
## standardize a phylobayes tree
pbtree = "(Acipenser__R:0.137306:R:0.167816:K,((Takifugu_r_K:0.623751:K,Danio_reri_K:0.428613:K)_K:0.319631:K,((Lepisosteu_K:0.00243856:K,Lepisost00_K:0.00389519:K)_K:0.24244:K,Amia_calva_K:0.252829:K)_K:0.0341578:K)_K:0.156204:K,(Polypterus_K:0.538725:K,((((Neotrygon__K:0.199854:K,(Raja_clava_K:0.0276807:K,Leucoraja__K:0.0338708:K)_K:0.142449:K)_K:0.123911:K,((Ginglymost_K:0.0614048:K,Chiloscyll_K:0.063792:K)_K:0.0894881:K,(Scyliorhin_K:0.126886:K,Carcharodo_K:0.0816871:K)_K:0.0261212:K)_K:0.100872:K)_K:0.138297:K,Callorhinc_K:0.406368:K)_K:0.596451:K,(Latimeria__Q:0.435962:Q,((Neoceratod_E:0.139027:E,((Protopteru_E:0.0118172:E,Protopte00_E:0.00600774:E)_E:0.125265:E,Lepidosire_E:0.129048:E)_E:0.283337:E)_E:0.185699:E:0.230877:Q,(((Rhinatrema_K:0.168308:K,((Typhlonect_K:0.00595736:K,Typhlone00_K:0.0172491:K)_K:0.154939:K,Geotrypete_K:0.173365:K)_K:0.0837134:K)_K:0.277616:K,(((Discogloss_K:0.211532:K,Bombina_ma_K:0.247467:K)_K:0.0796761:K,((Pipa_pipa__K:0.246234:K,(Xenopus_tr_K:0.236198:K,Hymenochir_K:0.300396:K)_K:0.0241566:K)_K:0.103826:K,(Megophrys__K:0.422485:K,((Rana_chens_K:0.096737:K,(Pelophylax_K:0.0493814:K,Pelophyl00_K:0.0377842:K)_K:0.0159202:K)_K:0.212804:K,(Cyclorana__K:0.0911112:K,(Espadarana_R:0.119242:R:0.018515:K,Atelopus_z_S:0.0687643:S:0.0657081:K)_K:0.0180531:K)_K:0.146541:K)_K:0.133463:K)_K:0.0325331:K)_K:0.0320521:K)_K:0.45317:K,((Hynobius_c_K:0.13165:K,Andrias_da_K:0.116493:K)_K:0.0732691:K,(Siren_lace_K:0.293752:K,(Proteus_an_K:0.252888:K,((Salamandra_K:0.0876578:K,(Pleurodele_K:0.0771019:K,(Notophthal_K:0.0586901:K,(Cynops_pyr_K:0.0403819:K,Calotriton_K:0.048577:K)_K:0.0333907:K)_K:0.0108502:K)_K:0.0327197:K)_K:0.131402:K,Ambystoma__K:0.199927:K)_K:0.0356739:K)_K:0.0444576:K)_K:0.0182886:K)_K:0.293596:K)_K:0.0448927:K)_K:0.00925933:K:0.0960703:Q,((Ornithorhy_E:0.228367:E:0.0919702:Q,((Monodelphi_Q:0.116685:Q,(Sarcophilu_K:0.0956754:K,Macropus_e_K:0.0967417:K)_K:0.018078:K:0.0103883:Q)_Q:0.173422:Q,((Loxodonta__Q:0.134972:Q,Dasypus_no_Q:0.160758:Q)_Q:0.00642451:Q,((Mus_muscul_Q:0.227068:Q,Homo_sapie_Q:0.100987:Q)_Q:0.0152245:Q,(Felis_catu_Q:0.057015:Q,Canis_fami_Q:0.0722375:Q)_Q:0.0483643:Q)_Q:0.0151335:Q)_Q:0.235403:Q)_Q:0.0967933:Q)_Q:0.289468:Q,((Sphenodon__Q:0.263159:Q,((Tarentola__Q:0.138711:Q,Eublephari_Q:0.126597:Q)_Q:0.134156:Q,((Scincella__Q:0.129828:Q,((Saproscinc_E:0.0436589:E,Lamprophol_E:0.039153:E)_E:0.0132249:E,Carlia_rub_E:0.0421727:E)_E:0.0298716:E:0.0919685:Q)_Q:0.144222:Q,((Tupinambis_Q:0.263447:Q,Podarcis_s_Q:0.166879:Q)_Q:0.0318745:Q,(((((Thamnophis_Q:0.0743196:Q,(Pantheroph_Q:0.0216722:Q,Opheodrys__Q:0.0297485:Q)_Q:0.0184298:Q)_Q:0.0120419:Q,(Ophiophagu_Q:0.0287086:Q,Micrurus_f_Q:0.0346685:Q)_Q:0.0394398:Q)_Q:0.0306892:Q,(Echis_colo_Q:0.0582188:Q,(Sistrurus__Q:0.0146283:Q,Crotalus_a_Q:0.0105047:Q)_Q:0.0452256:Q)_Q:0.0417915:Q)_Q:0.0890153:Q,(Python_reg_Q:0.0643131:Q,Boa_constr_Q:0.0698609:Q)_Q:0.0173203:Q)_Q:0.244803:Q,(Elgaria_mu_Q:0.162227:Q,((Pogona_vit_Q:0.142077:Q,Chamaeleo__Q:0.204871:Q)_Q:0.0904862:Q,((Iguana_igu_Q:0.0899908:Q,Basiliscus_Q:0.0856177:Q)_Q:0.00740167:Q,(Sceloporus_Q:0.118961:Q,Anolis_car_Q:0.182233:Q)_Q:0.00884581:Q)_Q:0.0816258:Q)_Q:0.0306025:Q)_Q:0.00670356:Q)_Q:0.0222824:Q)_Q:0.0403438:Q)_Q:0.025957:Q)_Q:0.231231:Q)_Q:0.0674595:Q,(((Phrynops_h_Q:0.0852888:Q,Pelusios_c_Q:0.111423:Q)_Q:0.0256123:Q,(Pelodiscus_Q:0.119918:Q,(Sternother_Q:0.0824974:Q,(((Trachemys__Q:0.0144476:Q,Emys_orbic_Q:0.0187597:Q)_Q:0.0159572:Q,(Chinemys_r_Q:0.021249:Q,Chelonoidi_Q:0.0343593:Q)_Q:0.0145747:Q)_Q:0.00699482:Q,Caretta_ca_Q:0.0473377:Q)_Q:0.00586399:Q)_Q:0.0272828:Q)_Q:0.0283806:Q)_Q:0.0828567:Q,(((Struthio_c_Q:0.0456057:Q,Dromaius_n_Q:0.0442109:Q)_Q:0.040561:Q,(Taeniopygi_Q:0.2042:Q,((Meleagris__Q:0.0381837:Q,Gallus_gal_Q:0.0307035:Q)_Q:0.106136:Q,Anas_platy_Q:0.0913882:Q)_Q:0.0298145:Q)_Q:0.0373143:Q)_Q:0.189056:Q,(Crocodylus_Q:0.0349965:Q,(Caiman_cro_Q:0.0451652:Q,Alligator__Q:0.0189561:Q)_Q:0.0222454:Q)_Q:0.204443:Q)_Q:0.0457489:Q)_Q:0.0369577:Q)_Q:0.126106:Q)_Q:0.238162:Q)_Q:0.306579:Q)_Q:0.0367044:Q)_Q:0.044464:Q:0.0512411:K)_K:0.322251:K)_K:0.0235979:K)_K;"
nwtree = standardize_tree(pbtree)
#print(pbtree)
#print(nwree)
## fix phylobayes tree (standardize, add internal nodes and extract extra info)
fixedTree,treeSummary = fix_tree(pbtree)
for key in ['aa','pairs','transitions','dist','parent']:
print("%s - %s"%(key, treeSummary["Danio_reri"][key]))
print("...parent N4- %s"%treeSummary["N4"]['pairs'])
## plot the tree
def my_layout(node):
if node.is_leaf():
name_face = AttrFace("name",fsize=25)
faces.add_face_to_node(name_face, node, column=0, position="branch-right")
else:
name_face = AttrFace("name", fsize=20, fgcolor="red")
faces.add_face_to_node(name_face, node, column=0, position="branch-right")
ts = TreeStyle()
ts.show_leaf_name = False
ts.show_branch_length = True
ts.show_branch_support = True
ts.scale = 180
ts.layout_fn = my_layout
out = fixedTree.render("./figures/simple-tree.png", units="mm", tree_style=ts,dpi=400)
In [3]:
from IPython.display import Image
Image(filename='./figures/simple-tree.png')
Out[3]:
In [4]:
dataDir = None
for ddir in [os.path.join("..","data","herve-vertebrates"),\
os.path.join("/","media","ganda","mojo","phylogenetic-models","herve-vertebrates")]:
if os.path.isdir(ddir):
dataDir = ddir
split = "SPLIT1"
position = "25000"
treeList = get_trees(split,position,dataDir)
countMatrix = np.zeros((len(treeList),len(TRANSITIONS)),)
t = 0
for t,pbTree in enumerate(treeList):
fixedTree,treeSummary = fix_tree(pbTree)
tlist = []
for item in treeSummary.itervalues():
tlist.extend(item['pairs'])
counts = transitions_to_counts(tlist)
countMatrix[t,:] = counts
## make example plots
figTitle = "%s-%s"%(split,position)
figName1 = os.path.join("figures","example-profile-heatmap-1.png")
profile_heatmap_plot(countMatrix,figName1,figTitle=figTitle,cmap=plt.cm.bone)
Image(filename=figName1)
Out[4]:
In [5]:
figName2 = os.path.join("figures","example-profile-bplot-1.png")
profile_box_plot(countMatrix,figName2,figTitle=figTitle)
Image(filename=figName2)
Out[5]:
In [ ]:
In [6]:
## load data from data munge
outputDir = os.path.join("..","data","hv-compressed")
outputFile = os.path.join(outputDir,"%s.npz"%(split))
npz = np.load(outputFile)
summaryTree = npz['tree']
summarySpecies = {}
for key,item in npz.iteritems():
if key == 'tree':
continue
summarySpecies[key] = item
splitPositions = get_positions(split,dataDir)
splitIndex = np.where(splitPositions==position)[0]
nonZero = np.where(summaryTree[splitIndex,:] != 0)[1]
print nonZero
for nz in nonZero:
print("%s - %s"%(TRANSITIONS[nz],summaryTree[splitIndex,nz]))
There is a very larege reduction in data and this is what is being saved.
In [7]:
print("Full Tree - %s x %s "%summaryTree.shape)
for key,item in summarySpecies.iteritems():
print("%s - %s x %s"%(key,item.shape[0],item.shape[1]))
In [8]:
print("approx total number of matrices saved: %s"%(30000 * 10 * len(npz.keys())))
We go from about 2.2 GB per split to 2.2 MB per file.
In [ ]: