In [4]:
import random
from math import ceil
from math import factorial
from math import sqrt
random.seed()
#partitions lst
#so that for a in partition[i] and b in partition[j]
#i < j ==> a < b
#gensizes is a function that takes a list
#and returns a list of partition sizes
def partition(lst,gensizes):
#create partition sizes
part_sizes = gensizes(lst)
#create and fill partitions array
items_left = lst
partitions = []
for i in part_sizes:
partitions.append(items_left[0:i])
items_left = items_left[i:]
return partitions
#randPart randomly partitions the list lst into
#at least 2 partitions
def randPart(lst):
return partition(lst,genRandPartSizes)
#genRandPartSizes returns a random list part_sizes, [p1,p2,...,pn]
#so that len(partsizes) > 1 and
#p1 + p2 + ...+ pn = len(lst)
def genRandPartSizes(lst):
#random.seed(1)
#generate partition sizes
n = len(lst)
part_sizes = []
while n > 1:
ps = random.randint(1,n-1)
part_sizes.append(ps)
n -=ps
#append the size of the last partition if non-zero
if n > 0:
part_sizes.append(n)
return part_sizes
def randBinPart(lst):
n = random.randint(1,len(lst)-1)
return partition(lst,lambda x: [n,len(lst)-n])
In [109]:
#functions to perform tree rotations or NNI moves
def rotateLeft(subtree):
return [[subtree[0],subtree[1][0]],subtree[1][1]]
def rotateRight(subtree):
return [subtree[0][0],[subtree[0][1],subtree[1]]]
#return rotated subtree if it can be rotated
#otherwise returns []
def randRotate(subtree):
actions = []
if isinstance(subtree[0],list):
actions.append(rotateRight)
if isinstance(subtree[1],list):
actions.append(rotateLeft)
length = len(actions)
if length != 0:
return actions[random.randint(0,length -1)](subtree)
else:
return []
__rotation_performed__ = False
def rotateNode(subtree,targetNode,i):
global __rotation_performed__
#base cases
#leaf
if not isinstance(subtree,list):
return (subtree,i-1)
#target node reached
if targetNode == i:
rotated = randRotate(subtree)
if rotated != []:
__rotation_performed__ = True
return (rotated,i+1)
else:
return (subtree,i)
#rotation performed
if targetNode < i:
return (subtree,i)
#recursive calls
left,j = rotateNode(subtree[0],targetNode,i+1)
right,k = rotateNode(subtree[1],targetNode,j+1)
return ([left,right],k)
#perform a random NNI move
def randNNI(tree,total_nodes):
global __rotation_performed__
__rotation_performed__ = False
while True:
targetNode = random.randint(1,total_nodes)
rotated,i = rotateNode(tree,targetNode,1)
if __rotation_performed__:
return rotated
In [6]:
#generates a random tree containing the leaves in the "leaves" list
#Base case: for input [leaf] ==> leaf whch is the only posible tree
#Induction: ranPart yields partitions smaller than n+1
#it will produce the tree [genRandTree(p1),...,genRandTree(pn)]
#which works by IHOP
#Halting: calling randPart will yield partitions of smaller size
#eventually reachin the base case
def genRandTree(leaves):
if len(leaves) < 2:
return leaves[0]
subtree=[]
for part in randPart(leaves):
subtree.append(genRandTree(part))
return subtree
In [7]:
#generates a binary tree
def genRandBinTreeWithPartitions(leaves):
if len(leaves) < 2:
return leaves[0]
subtree=[]
for part in randBinPart(leaves):
subtree.append(genRandBinTreeWithPartitions(part))
return subtree
#generates a binary tree
def genRandBinTree(leaves):
while len(leaves) > 1:
#pick two random elements
a = random.randint(0,len(leaves)-1)
sibling1 = leaves[a]
del leaves[a]
b = random.randint(0,len(leaves)-1)
sibling2 = leaves[b]
del leaves[b]
leaves.append([sibling1,sibling2])
return leaves[0]
In [8]:
#generates a random binary tree in Newick format
def randNewickBinSubTree(leaves):
if len(leaves) < 2:
return str(leaves[0]) + ":1"
subtree="("
for part in randBinPart(leaves):
subtree += randNewickBinSubTree(part) + ","
return subtree[0:-1] + "):1"
def randNewickBinTree(leaves):
if len(leaves) < 2:
return "(" + str(leaves[0]) + ":1" + ")"
tree ="("
for part in randBinPart(leaves):
tree += randNewickBinSubTree(part) + ","
return tree[0:-1] + ")"
In [9]:
#convert tree to Newick format
def toNewickSubTree(tree):
#case leaf reached
if not isinstance(tree,list):
return str(tree) + ":1"
ntree = "("
for subtree in tree:
ntree += toNewickSubTree(subtree) + ","
return ntree[0:-1] + "):1"
def toNewickTree(tree):
ntree = "("
for subtree in tree:
ntree += toNewickSubTree(subtree) + ","
return ntree[0:-1] + ")"
def newickToNestedLst(newick_tree):
return eval(newick_tree.replace(":1","").replace("(","[").replace(")","]"))
In [10]:
#compare two trees using variation of AHU algorithm
#for every node or leaf at every level
#the sorted nodes must be the same
def Tree_Signature(tree):
if not isinstance(tree,list):
return str(tree)
lst_subtrees = []
for subtree in tree:
lst_subtrees.append(Tree_Signature(subtree))
sorted_subtrees = sorted(lst_subtrees)
return "[" + str(sorted_subtrees) + "]"
#this tree signature is acomplete invariant among equal trees
#threfore tree1 = tree2 iff Tree_Signature(tree1) = Tree_Signature(tree2)
def treeEq(tree1,tree2):
return Tree_Signature(tree1) == Tree_Signature(tree2)
#compte catalan number
def catalan(n):
return factorial(2*n) / (factorial(n+1)*factorial(n))
In [33]:
#write trees to text file
#with open('randtrees','w') as file:
# for i in range(0,10):
# file.write(randNewickBinTree(range(0,10)) + "\n")
In [25]:
#TODO
#1 - generate random trees by performing random rotations
#2 - read GTP output file
#5 - run GTP on the tree sequence file and and get file with
# tree distance and the geodesic
In [11]:
#create count nodes
def countNodes(tree):
if not isinstance(tree,list):
return 0
return 1 + countNodes(tree[0]) + countNodes(tree[1])
In [12]:
#perform random SPR move
#PRE: tree is a nested list representation of a tree
#POST: returns a new tree with a random SPR move
# the resulting tree will have same number of nodes as the
# initial tree
__prunedbranch__ = []
def randSPR(tree,total_nodes):
global __prunedbranch__
targetNode = random.randint(1,total_nodes)
pruned,n = __prune__(tree,targetNode,0)
if not isinstance(pruned,list):#if the pruned tree is a leaf return the original tree
return (tree,0)
targetNode = random.randint(1,total_nodes - countNodes(__prunedbranch__) - 1)
return __regraft__(pruned,targetNode,0)
#POST: return a pair of the form (pruned tree, number of visited nodes) and the
# variable __prunedbranch__ contains the subtree removed from tree
#Base Cases:
#two kinds of base cases
# 1 cases where no visited node is counted
# like (leaf) or (number of visited nodes== targetNode)
#in this case it returns the tree unmodified and the
#number of visited nodes is not incremented
# 2 occurs when targetNode is visited
# saves and removes the pruned branch from the node
#other branch is returned so the tree remains complete
#and we count this node as visited
#the base cases work as we would want
#Assume __prune__ works for subtree with 1..k nodes
#for a tree having k+1 nodes
#__prune__(tree[0],targetNode,visited_nodes + 1)
#will return the properly pruned left subtree and the correct number of visited nodes
#so __prune__(tree[1],targetNode,visited_nodes1)
#will return the properly pruned right subtree and the correct total of visited nodes
#then [subtree1,subtree2] will be a properly pruned tree and
#visuted_nodes2 will be the total number of visited nodes
def __prune__(tree,targetNode,visited_nodes):
global __prunedbranch__
#basecases
#tree is a leaf
if not isinstance(tree,list):
return (tree,visited_nodes + 0)#no nodes where visited no pruning
#we don't want to visit any nodes after target node has been reached
if targetNode == visited_nodes:
return (tree,visited_nodes + 0)
#(not a leaf) or (target not reached) ==> node visited
#therefore we count current node as visited and write visited_nodes+1
#node to be pruned
if targetNode == visited_nodes+1:
#save the pruned branch
whichBranch = random.randint(0,1)
__prunedbranch__ = tree[whichBranch]
#return pruned subtree
return (tree[not whichBranch],visited_nodes+1)
#recursive step
subtree1, visited_nodes1 = __prune__(tree[0],targetNode,visited_nodes + 1)
subtree2, visited_nodes2 = __prune__(tree[1],targetNode,visited_nodes1)
#visited_nodes2 is the total
return ([subtree1,subtree2],visited_nodes2)
#__regraft__ uses the same logic as __prune__ except it performs a different
#operation on the target node
def __regraft__(tree,targetNode,visited_nodes):
global __prunedbranch__
#basecases
#tree is a leaf
if not isinstance(tree,list):
return (tree,visited_nodes + 0)#no nodes where visited no pruning
#we don't want to visit any nodes after target node has been reached
if targetNode == visited_nodes:
return (tree,visited_nodes + 0)
#(not a leaf) or (target not reached) ==> node visited
#therefore we count current node as visited and write visited_nodes+1
#attach to edge after targetNode
if targetNode == visited_nodes+1:
if random.randint(0,1) == 0:
return ([[tree[0],__prunedbranch__],tree[1]],visited_nodes+1)
else:
return ([tree[0],[tree[1],__prunedbranch__]],visited_nodes+1)
#recursive step
subtree1, visited_nodes1 = __regraft__(tree[0],targetNode,visited_nodes + 1)
subtree2, visited_nodes2 = __regraft__(tree[1],targetNode,visited_nodes1)
#visited_nodes2 is the total
return ([subtree1,subtree2],visited_nodes2)
In [10]:
#write SPR sequence to file
rand_tree = genRandBinTree(range(100))
total_nodes = countNodes(rand_tree)
with open('sprseq','w') as treefile:
treefile.write(toNewickTree(rand_tree) + "\n")
current_tree = rand_tree
for i in range(1000):
current_tree = randSPR(current_tree,total_nodes)[0]
treefile.write(toNewickTree(current_tree) + "\n")
In [11]:
#run GTP on the file and read the results as csv
import os
#assumes GTP file is in current working directory
os.system("java -jar gtp.jar -r 1 -o seqdist.csv sprseq")
Out[11]:
In [77]:
import csv
import numpy as np
import matplotlib.pyplot as plt
datapoints = []
#seqdist.txt will be of the form
#src_tree<tab>target_tree<tab>geo_distance
with open("seqdist.csv", 'r') as seqfile:
seq_reader = csv.reader(seqfile,delimiter='\t')
for row in seq_reader:
datapoints.append([int(row[1]),float(row[2])])
#convert to numpy array
datapoints = np.array(datapoints)
plt.scatter(datapoints[:,0],datapoints[:,1])
plt.show()
In [82]:
plt.hist(datapoints[:,1],bins=10)
plt.show()
In [48]:
#check that random SPR sequence works
#count the number of different tree signatures
rand_tree = genRandBinTree([1,2,3,4,5,6])
total_nodes = countNodes(rand_tree)
tree_dict = {}
tree_dict[Tree_Signature(rand_tree)]=1
current_tree = rand_tree
for i in range(10000):
current_tree = randSPR(current_tree,total_nodes)[0]
#add to dictionary
signature = Tree_Signature(current_tree)
tree_dict[signature] = tree_dict.get(signature,0) + 1
print("total_nodes =",total_nodes,"no. distinct trees = ",len(tree_dict.keys()))
In [121]:
#write NNI sequence to file
rand_tree = genRandBinTree(list(range(100)))
total_nodes = countNodes(rand_tree)
with open('nniseq','w') as treefile:
treefile.write(toNewickTree(rand_tree) + "\n")
current_tree = rand_tree
for i in range(1000):
current_tree = randNNI(current_tree,total_nodes)
treefile.write(toNewickTree(current_tree) + "\n")
In [122]:
#run GTP on the file and read the results as csv
import os
#assumes GTP file is in current working directory
os.system("java -jar gtp.jar -r 1 -o nniseqdist.csv nniseq")
Out[122]:
In [123]:
import csv
import numpy as np
import matplotlib.pyplot as plt
datapoints = []
#seqdist.txt will be of the form
#src_tree<tab>target_tree<tab>geo_distance
with open("nniseqdist.csv", 'r') as seqfile:
seq_reader = csv.reader(seqfile,delimiter='\t')
for row in seq_reader:
datapoints.append([int(row[1]),float(row[2])])
#convert to numpy array
datapoints = np.array(datapoints)
plt.scatter(datapoints[:,0],datapoints[:,1])
plt.show()
In [124]:
plt.hist(datapoints[:,1],bins=10)
plt.show()
In [12]:
Out[12]:
In [25]:
#
import csv
#get number of nodes from file
with open("sprseq","r") as seq:
newick_tree = seq.readline()
tree = newickToNestedLst(newick_tree)
number_nodes = countNodes(tree)
max_dist = 2*sqrt(number_nodes)
count_steps=0
with open("seqdist.csv", 'r') as distfile:
dist_reader = csv.reader(distfile,delimiter='\t')
for row in dist_reader:
#row 2 is the geo distance
if float(row[2]) > (max_dist * 0.9):
break
count_steps+=1
In [27]:
number_nodes
Out[27]:
In [18]:
eval(newick_tree.replace(":1","").replace("(","[").replace(")","]"))
Out[18]:
In [ ]: