In [1]:
import ete2
import skbio
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
In [2]:
# it'd be nice if this expandvars call happened interally
from os.path import expandvars
et = ete2.Tree(expandvars('$HOME/data/gg_13_8_otus/trees/61_otus.tree'))
st = skbio.TreeNode.read(expandvars('$HOME/data/gg_13_8_otus/trees/61_otus.tree'))
In [3]:
et
Out[3]:
In [4]:
st
Out[4]:
In [5]:
print et
In [6]:
et.describe()
In [7]:
print st
In [8]:
%%timeit
# load the tree on each loop to avoid caching of results
ete2.Tree(expandvars('$HOME/data/gg_13_8_otus/trees/85_otus.tree')).traverse("postorder")
In [9]:
%%timeit
# load the tree on each loop to avoid caching of results
skbio.TreeNode.read(expandvars('$HOME/data/gg_13_8_otus/trees/85_otus.tree')).postorder()
Phylogenetic Diversity (PD) is the amount of branch length in a phylogenetic tree that is observed in a given biological community. It is a phylogenetic estimator of community richness. I describe it in some more detail in IAB).
In [10]:
from random import shuffle
tip_names = [t.name for t in st.tips()]
shuffle(tip_names)
observed_tips = tip_names[:10]
In [11]:
%timeit et.get_leaves_by_name('801940')[0]
In [12]:
%timeit st.find('801940')
In [13]:
%timeit et.get_leaves_by_name('801940')[0].get_ancestors()
In [14]:
%timeit st.find('801940').ancestors()
In [15]:
def et_pd(tree, observed_tips):
observed_nodes = set()
for tip_name in observed_tips:
observed_nodes |= set(tree.get_leaves_by_name(tip_name)[0].get_ancestors())
pd = sum([o.dist for o in observed_nodes])
ds = [(o.dist, o.name) for o in observed_nodes]
ds.sort()
print len(ds)
return ds, pd
ds, pd = et_pd(et, observed_tips)
print pd
_ = plt.hist([e[0] for e in ds])
In [16]:
def sk_pd(tree, observed_tips):
observed_nodes = set()
for tip_name in observed_tips:
observed_nodes |= set(tree.find(tip_name).ancestors())
pd = sum([o.length for o in observed_nodes if o.length != None])
ds = [(o.length, o.name) for o in observed_nodes if o.length != None]
print len(ds)
return ds, pd
ds, pd = sk_pd(st, observed_tips)
print pd
_ = plt.hist([e[0] for e in ds])
In [17]:
# let's just parse the floats from the newick tree now and see what we get...
import re
s = open(expandvars("$HOME/data/gg_13_8_otus/trees/61_otus.tree")).read()
print s
In [18]:
lengths = map(float,[e[1:] for e in re.findall(':\d\.\d+', s)])
lengths.sort()
print lengths
print len(lengths), np.min(lengths), np.median(lengths), np.max(lengths)
_ = plt.hist(lengths)
In [19]:
# and compare that with the skbio lengths
lengths = [n.length for n in st.postorder() if n.length is not None]
lengths.sort()
print lengths
print len(lengths), np.min(lengths), np.median(lengths), np.max(lengths)
_ = plt.hist(lengths)
In [20]:
# and the ete2 lengths
lengths = [n.dist for n in et.traverse("postorder")]
lengths.sort()
print lengths
print len(lengths), np.min(lengths), np.median(lengths), np.max(lengths)
_ = plt.hist(lengths)
This is bad... it looks like ete2 is not recognizing the node names, but silently converting them to 1.0. This should raise an exception as it can give very misleading results (as is the case here).
Loading with format=1 gets us closer, but there is one label that is still being converted to 1.0. This is still very bad, but there also may be an issue with that node in the tree... Is the name of the node really supposed to be 0.081?
In [21]:
et1 = ete2.Tree(expandvars('$HOME/data/gg_13_8_otus/trees/61_otus.tree'), format=1)
In [22]:
ds, pd = et_pd(et1, observed_tips)
print pd
_ = plt.hist([e[0] for e in ds])