In [6]:
import numpy as np
import ete3 as ete
In [146]:
%load_ext rpy2.ipython
In [1]:
def getarray(locifile, tree, split1=0, split2=0):
""" parse the loci list and return a presence/absence matrix ordered by
the tips on the tree"""
## parse the loci file
loci = open(locifile).read().split("\n//")[:-1]
## order (ladderize) the tree
tree.ladderize()
## get tip names
names = tree.get_leaf_names()
if split1:
snames = [i.split("_", 1)[0] for i in names]
else:
snames = names
## make empty matrix
lxs = np.zeros((len(snames), len(loci)))
## fill the matrix
for loc in xrange(len(loci)):
for seq in loci[loc].split("\n"):
if ">" in seq:
if split2:
lxs[snames.index(seq.split()[0][1:].rsplit("_", 1)[0]), loc] += 1
else:
lxs[snames.index(seq.split()[0][1:]), loc] += 1
return lxs
In [2]:
def countmatrix(lxsabove, lxsbelow, max=0):
""" fill a matrix with pairwise data sharing
between each pair of samples. You could put
in two different 'share' matrices to have
different results above and below the diagonal.
Can enter a max value to limit fill along diagonal.
"""
share = np.zeros((lxsabove.shape[0],
lxsbelow.shape[0]))
## fill above
names = range(lxsabove.shape[0])
for row in lxsabove:
for samp1,samp2 in itertools.combinations(names,2):
shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
share[samp1,samp2] = shared
## fill below
for row in lxsbelow:
for samp2,samp1 in itertools.combinations(names,2):
shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
share[samp1,samp2] = shared
## fill diagonal
if not max:
for row in range(len(names)):
share[row,row] = lxsabove[row,].sum()
else:
for row in range(len(names)):
share[row,row] = max
return share
In [3]:
def count_inf4(tree, matrix, node):
""" count the number of loci with data spanning
a given node in the tree """
## get children of selected node
a, b = node.get_children()
## get tip descendents of a and b
tips_a = set(a.get_leaf_names())
tips_b = set(b.get_leaf_names())
## get every other tip (outgroups)
upone = node.up
if upone.is_root():
ch = upone.children
sis = [i for i in ch if i != node][0]
if sis.children:
tips_c = sis.children[0].get_leaf_names()
tips_d = sis.children[1].get_leaf_names()
else:
return 0
else:
upone = set(node.up.get_leaf_names())
tips_c = upone - tips_a - tips_b
tips_all = set(tree.get_leaf_names())
tips_d = tips_all - tips_a - tips_b - tips_c
## get indices in matrix for leaf tips
names = tree.get_leaf_names()
index_a = [names.index(i) for i in tips_a]
index_b = [names.index(i) for i in tips_b]
index_c = [names.index(i) for i in tips_c]
index_d = [names.index(i) for i in tips_d]
## how man loci are "informative"
inf = 0
for col in matrix.T:
hits_a = sum([col[i] for i in index_a])
hits_b = sum([col[i] for i in index_b])
hits_c = sum([col[i] for i in index_c])
hits_d = sum([col[i] for i in index_d])
if all([hits_a, hits_b, hits_c, hits_d]):
inf += 1
return inf
In [ ]:
def count_hier(tree, node):
## get children of selected node
a, b = node.get_children()
## get tip descendents of a and b
tips_a = set(a.get_leaf_names())
tips_b = set(b.get_leaf_names())
## get every other tip (outgroups)
upone = node.up
if upone.is_root():
ch = upone.children
sis = [i for i in ch if i != node][0]
if sis.children:
tips_c = sis.children[0].get_leaf_names()
tips_d = sis.children[1].get_leaf_names()
else:
return 0
else:
upone = set(node.up.get_leaf_names())
tips_c = upone - tips_a - tips_b
tips_all = set(tree.get_leaf_names())
tips_d = tips_all - tips_a - tips_b - tips_c
## get indices in matrix for leaf tips
names = tree.get_leaf_names()
index_a = [names.index(i) for i in tips_a]
index_b = [names.index(i) for i in tips_b]
index_c = [names.index(i) for i in tips_c]
index_d = [names.index(i) for i in tips_d]
## avg. tips per quartet edge
meanhier = np.median([len(i) for i in [index_a, index_b, index_c, index_d]])
minhier = min([len(i) for i in [index_a, index_b, index_c, index_d]])
return medianhier, minhier
In [167]:
def count_nest(tree, node):
## get children of selected node
leaves_down = tree.get_leaf_names()
## get all other tips
leaves_up = list(set(tree.get_leaf_names()) - set(leaves_down))
## get distances to leaves below (+1 because we aren't counting closest node)
n1 = [node.get_distance(tree.get_leaves_by_name(name)[0], topology_only=True)+1 for name in leaves_down]
n2 = [node.get_distance(tree.get_leaves_by_name(name)[0], topology_only=True) for name in leaves_up]
return np.mean(n1+n2), np.min(n1+n2)
In [182]:
## Viburnum
loci_vib = "empirical_1/fullrun/outfiles/empirical_1_full_m4.loci"
tree_vib = ete.Tree("empirical_1/fullrun/RAxML_bestTree.empirical_1_fullrun")
lxs_vib = getarray(loci_vib, tree_vib)
lxs_vib
Out[182]:
In [38]:
## Quercus
loci_que = "empirical_3/outfiles/empirical_3_m4.loci"
tree_que = ete.Tree("empirical_3/RAxML_bestTree.empirical_3")
lxs_que = getarray(loci_que, tree_que)
lxs_que
Out[38]:
In [180]:
## Orestias
loci_ore = "empirical_4/outfiles/empirical_4.loci"
tree_ore = ete.Tree("empirical_4/RAxML_bestTree.empirical_4")
lxs_ore = getarray(loci_ore, tree_ore)
lxs_ore
Out[180]:
In [183]:
def get_data(tree, lxs):
data = []
for node in tree.traverse():
if not (node.is_leaf() or node.is_root() or node.up.is_root()):
## get number of quartet loci at this split
a = count_inf4(tree, lxs, node)
## get closest leaf at this split
bmean, bmin = count_nest(tree, node)
## get hierarch. (min N descendants) at split
cmedian, cmin = count_hier(tree, node)
## save
data.append([a, bmean, bmin, cmedian, cmin])
#print "{:<6} {:<.3f} {:<.1f} {:<.3f} {:<.1f}".format(a, bmean, bmin, cmedian, cmin)
data = np.array(data)
return data
In [195]:
## get Vib data
data_vib = get_data(tree_vib, lxs_vib)
## get Quercus data
data_que = get_data(tree_que, lxs_que)
## get Orestias data
data_ore = get_data(tree_ore, lxs_ore)
In [196]:
dat = np.array(data)
%R -i data_vib
%R -i data_que
%R -i data_ore
In [216]:
%%R -w 300 -h 300
#pdf("hierplots.pdf", width=4, height=12, pointsize=16)
#par(mfrow=c(3, 1))
boxplot(data_vib[,1] ~ data_vib[,5], col='gray', main="Viburnum", cex=2)
print(cor.test(data_vib[,5], data_vib[,1], method='s'))
boxplot(data_que[,1] ~ data_que[,5], col='gray', main="Quercus")
print(cor.test(data_que[,5], data_que[,1], method='s'))
boxplot(data_ore[,1] ~ data_ore[,5], col='gray', main="Orestias")
cor.test(data_ore[,5], data_ore[,1], method='s')
#dev.off()
In [ ]: