In [1]:
from ete3 import Tree
import string
import scipy.stats as stats
import numpy as np
tl = Tree()
# We create a random tree topology
numTips = 20
candidateNames = list(string.ascii_lowercase)
tipNames = candidateNames[0:20]
tl.populate(numTips, names_library=tipNames)
print (tl)
#Alternatively we could read a tree from a file into a string "line", and then use:
# tl = Tree( line )
In [2]:
def reNumberNodes (tl):
nodeId = 0
for n in tl.traverse():
n.add_features(ND=nodeId)
if n.name=="":
n.name = str(nodeId)
nodeId = nodeId + 1
return
reNumberNodes(tl)
#Writing in NHX format
tl.write(features=['ND'])
print(tl.get_ascii(show_internal=True))
In [3]:
# We could use some dynamic programming to be able to generate paths that yield n transitions exactly.
# INstead we randomly generate transitions on the tree until we get the desired number.
# We have two states: ancestral (0) and convergent (1).
# We count the numbers of transitions
def randomTransitions(numTransitions, tree):
numberOfNodes = len(tree.get_tree_root().get_descendants()) + 1
rate = float(numTransitions)/float(numberOfNodes)
ancestralTransition=dict()
totalNumberOfTransitions = 0
nodesWithTransitions = list()
for node in tree.traverse("levelorder"):
if node.is_root() :
ancestralTransition[node] = False
elif ( ancestralTransition[node.up] == True):
ancestralTransition[node] = True
else :
sisterHasAlreadyTransitioned=False
if ancestralTransition.__contains__(node.get_sisters()[0]): #Here we assume binary trees!
sisterHasAlreadyTransitioned=True
#randomly draw whether we do a transition or not
transitionBool = stats.bernoulli.rvs(rate, size=1)[0] == 1
if (transitionBool and not sisterHasAlreadyTransitioned):
ancestralTransition[node] = True
nodesWithTransitions.append(node)
totalNumberOfTransitions = totalNumberOfTransitions + 1
else:
ancestralTransition[node] = False
return nodesWithTransitions, totalNumberOfTransitions, ancestralTransition
def placeNTransitionsInTree(numTransitions, tree):
observedNumTransitions = 2*numTransitions
nodesWithTransitions = list()
numTries = 0
convergentModel = dict()
while observedNumTransitions != numTransitions and numTries < 100:
observedNumTransitions = 0
nodesWithTransitions, observedNumTransitions, convergentModel = randomTransitions(numTransitions, tree)
print ("Observed Number of Transitions: "+ str(observedNumTransitions ) + " compared to "+ str(numTransitions) + " wanted")
numTries = numTries + 1
if numTries < 100:
for n in nodesWithTransitions:
print(n.get_ascii())
else:
print("It seems like it is too difficult to place "+ str(numTransitions) + " events in this tree.")
return convergentModel
In [4]:
convergentModel = dict()
convergentModel = placeNTransitionsInTree(5, tl)
Now we want to output the tree and the command line for bppseqgen and bppml.
In [5]:
# convergentModel is the ouput from thefunction that places transitions
# C1 and C2 are two profile numbers
# Nch is the number of characters
def getBppSeqGenCommandFromNodesWithTransitions(convergentModel, C1, C2, Nch):
#First, get the nodes with the convergent model and the nodes with the ancestral model.
nodesWithConvergentModel = list()
nodesWithAncestralModel = list()
for k,v in convergentModel.items():
if v == True:
nodesWithConvergentModel.append(k.ND)
if v == False:
nodesWithAncestralModel.append(k.ND)
n1="\""+ str(nodesWithAncestralModel[0])
for n in nodesWithAncestralModel[1:len(nodesWithAncestralModel)-1]:
n1 += "," + str(n)
n1 += "\""
n2="\""+ str(nodesWithConvergentModel[0])
for n in nodesWithConvergentModel[1:len(nodesWithConvergentModel)-1]:
n2 += "," + str(n)
n2 += "\""
#Dummy values:
command="bppseqgen param=CATseq.bpp mod1Nodes=%s mod2Nodes=%s Nch=%d Ns1=%d Ns2=%d Ne1=%d Ne2=%d"%(n1,n2,Nch,C1,C2,C1,C2)
return (command)
getBppSeqGenCommandFromNodesWithTransitions(convergentModel, 1, 2, 1000)
Out[5]:
In [6]:
# we load a small tree, for testing purpose
t = Tree('((((H,K)D,(F,I)G)B,E)A,((L,(N,Q)O)J,(P,S)M)C);', format=1)
print(t.write(format=1))
print(t.get_ascii(show_internal=True))
#for node in t.traverse("levelorder"):
# Do some analysis on node
#print (node.name)
In [7]:
def getCherries(leaves, tree):
length = len(leaves)
dist=list()
leaveslist = list(leaves)
for i in range(length-1):
for j in range(i+1,length):
di = tree.get_distance(leaveslist[i].name, leaveslist[j].name, topology_only=True)
dist.append([di, leaveslist[i].name, leaveslist[j].name])
#print(dist)
cherries = list()
for d in dist:
if (d[0] == 1.0):
cherries.append(d)
return (cherries)
def placeTransitionOnBranchIfSisterHasNotTransitioned (node, ancestralTransition, nodesWithTransitions):
if node.is_leaf():
if ancestralTransition.__contains__(node.get_sisters()[0]) and ancestralTransition[node.get_sisters()[0]]==True:
print("Problem: sister of node "+node.name+" has transitioned!")
return
ancestralTransition[ node ] = True
nodesWithTransitions.append(node)
print ("Adding transition on leaf node "+node.name)
return
descen = node.get_descendants()
SisterHasTransitioned = True
converg = 0
while (not SisterHasTransitioned):
converg = np.random.randint( 0, high=len(descen) )
if ancestralTransition.__contains__(descen[converg].get_sisters()[0] and ancestralTransition[descen[converg].get_sisters()[0]]==True):
pass
else:
SisterHasTransitioned = False
ancestralTransition[ descen[converg] ] = True
nodesWithTransitions.append(descen[converg])
print ("Adding transition on node "+node.name)
return
def randomTransitionsWithHypergeometricDistribution(numTransitions, tree):
numberOfNodes = len(tree.get_tree_root().get_descendants()) + 1
rate = float(numTransitions)/float(numberOfNodes)
node2leaves = tree.get_cached_content()
#Computing the maximum number of transitions possible
numLeaves = len(node2leaves[tree.get_tree_root()])
cherries = getCherries(node2leaves[tree.get_tree_root()], tree)
#print("cherries: "+str(cherries))
maxNumTrans = numLeaves - len(cherries)
print("Maximum number of transitions: "+str(maxNumTrans))
if maxNumTrans < numTransitions:
print ("Sorry, we cannot fit "+str(numTransitions)+ " in this tree, which can only accommodate "+str(maxNumTrans)+" transitions.")
# Now, we want to annotate all nodes with the number of available underlying branches.
listOfCherryPartners = list()
for c in cherries:
listOfCherryPartners.append( c[1] )
#print("listOfCherryPartners: "+str(listOfCherryPartners))
for node in tree.traverse("levelorder"):
numLea = len(node2leaves[node])
numCherries = 0
for n in node2leaves[node]:
if n.name in listOfCherryPartners:
numCherries = numCherries +1
node.add_feature("numberOfAvailableBranches", numLea - numCherries)
#print(node.name + " : "+ str(node.numberOfAvailableBranches))
ancestralTransition=dict()
for node in tree.traverse("levelorder"):
ancestralTransition[node] = False
nodesWithTransitions = list()
#Now we traverse the tree from the root, and at each node choose
#how many transitions we place in the right and left subtrees
tree.get_tree_root().add_feature("underlyingNumTransitions", numTransitions)
for node in tree.traverse("preorder"):
if (not node.is_leaf()):
rightChild = node.children[0]
leftChild = node.children[1]
if node.underlyingNumTransitions <= 1:
rightChild.add_feature("underlyingNumTransitions",
1)
leftChild.add_feature("underlyingNumTransitions",
1)
else:
numRight = np.random.hypergeometric(rightChild.numberOfAvailableBranches,
leftChild.numberOfAvailableBranches,
node.underlyingNumTransitions)
numLeft = node.underlyingNumTransitions - numRight
rightChild.add_feature("underlyingNumTransitions",
numRight)
leftChild.add_feature("underlyingNumTransitions",
numLeft)
if (numRight == 1):
#We randomly place the transition in one of the branches of the right subtree
placeTransitionOnBranchIfSisterHasNotTransitioned (rightChild,
ancestralTransition,
nodesWithTransitions)
if (numLeft == 1):
#We randomly place the transition in one of the branches of the left subtree
placeTransitionOnBranchIfSisterHasNotTransitioned (leftChild,
ancestralTransition,
nodesWithTransitions)
print ("\t\tTotal number of transitions placed in the tree: "+str(len(nodesWithTransitions)))
return nodesWithTransitions, ancestralTransition
In [8]:
for i in range (10):
n,a = randomTransitionsWithHypergeometricDistribution(5, t)
In [9]:
def placeTransitionsAndGetBppSeqGenCommand(numTransitions, tree, C1, C2, Nch):
reNumberNodes(tree)
nodesWithTransitions,convergentModel = randomTransitionsWithHypergeometricDistribution(numTransitions, tree)
return( getBppSeqGenCommandFromNodesWithTransitions(convergentModel, C1, C2, Nch) )
# We try the function on the large tree simulated at the beginning
placeTransitionsAndGetBppSeqGenCommand(10, tl, 1, 2, 1000)
Out[9]:
In [ ]: