In [6]:
import numpy as np
import ete3 as ete

In [146]:
%load_ext rpy2.ipython

In [1]:
def getarray(locifile, tree, split1=0, split2=0):
    """ parse the loci list and return a presence/absence matrix ordered by 
        the tips on the tree"""
    ## parse the loci file
    loci = open(locifile).read().split("\n//")[:-1]
    ## order (ladderize) the tree
    tree.ladderize()
    
    ## get tip names
    names = tree.get_leaf_names()
    if split1:
        snames = [i.split("_", 1)[0] for i in names]
    else:
        snames = names
    
    ## make empty matrix
    lxs = np.zeros((len(snames), len(loci)))
    
    ## fill the matrix
    for loc in xrange(len(loci)):
        for seq in loci[loc].split("\n"):
            if ">" in seq:
                if split2:
                    lxs[snames.index(seq.split()[0][1:].rsplit("_", 1)[0]), loc] += 1
                else:
                    lxs[snames.index(seq.split()[0][1:]), loc] += 1

    return lxs

In [2]:
def countmatrix(lxsabove, lxsbelow, max=0):
    """ fill a matrix with pairwise data sharing
        between each pair of samples. You could put
        in two different 'share' matrices to have
        different results above and below the diagonal.
        Can enter a max value to limit fill along diagonal.
        """
    share = np.zeros((lxsabove.shape[0], 
                      lxsbelow.shape[0]))
    ## fill above
    names = range(lxsabove.shape[0])
    for row in lxsabove:
        for samp1,samp2 in itertools.combinations(names,2):
            shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
            share[samp1,samp2] = shared
    ## fill below
    for row in lxsbelow:
        for samp2,samp1 in itertools.combinations(names,2):
            shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
            share[samp1,samp2] = shared
    ## fill diagonal
    if not max:
        for row in range(len(names)):
            share[row,row] = lxsabove[row,].sum()
    else:
        for row in range(len(names)):
            share[row,row] = max
    return share

In [3]:
def count_inf4(tree, matrix, node):
    """ count the number of loci with data spanning 
    a given node in the tree """
    
    ## get children of selected node
    a, b = node.get_children()
    
    ## get tip descendents of a and b
    tips_a = set(a.get_leaf_names())
    tips_b = set(b.get_leaf_names())
    
    ## get every other tip (outgroups)
    upone = node.up
    if upone.is_root():
        ch = upone.children
        sis = [i for i in ch if i != node][0]
        if sis.children:
            tips_c = sis.children[0].get_leaf_names()
            tips_d = sis.children[1].get_leaf_names()
        else:
            return 0
    else:
        upone = set(node.up.get_leaf_names())
        tips_c = upone - tips_a - tips_b
        tips_all = set(tree.get_leaf_names())
        tips_d = tips_all - tips_a - tips_b - tips_c
    
    ## get indices in matrix for leaf tips
    names = tree.get_leaf_names()
    index_a = [names.index(i) for i in tips_a]
    index_b = [names.index(i) for i in tips_b]
    index_c = [names.index(i) for i in tips_c]
    index_d = [names.index(i) for i in tips_d]
   
    ## how man loci are "informative"
    inf = 0
    for col in matrix.T:
        hits_a = sum([col[i] for i in index_a])
        hits_b = sum([col[i] for i in index_b])
        hits_c = sum([col[i] for i in index_c])
        hits_d = sum([col[i] for i in index_d])

        if all([hits_a, hits_b, hits_c, hits_d]):
                inf += 1 
    return inf

In [ ]:
def count_hier(tree, node):
    ## get children of selected node
    a, b = node.get_children()
    
    ## get tip descendents of a and b
    tips_a = set(a.get_leaf_names())
    tips_b = set(b.get_leaf_names())
    
    ## get every other tip (outgroups)
    upone = node.up
    if upone.is_root():
        ch = upone.children
        sis = [i for i in ch if i != node][0]
        if sis.children:
            tips_c = sis.children[0].get_leaf_names()
            tips_d = sis.children[1].get_leaf_names()
        else:
            return 0
    else:
        upone = set(node.up.get_leaf_names())
        tips_c = upone - tips_a - tips_b
        tips_all = set(tree.get_leaf_names())
        tips_d = tips_all - tips_a - tips_b - tips_c
    
    ## get indices in matrix for leaf tips
    names = tree.get_leaf_names()
    index_a = [names.index(i) for i in tips_a]
    index_b = [names.index(i) for i in tips_b]
    index_c = [names.index(i) for i in tips_c]
    index_d = [names.index(i) for i in tips_d]
   
    ## avg. tips per quartet edge
    meanhier = np.median([len(i) for i in [index_a, index_b, index_c, index_d]])
    minhier = min([len(i) for i in [index_a, index_b, index_c, index_d]])
    
    return medianhier, minhier

In [167]:
def count_nest(tree, node):
    ## get children of selected node
    leaves_down = tree.get_leaf_names()
    ## get all other tips
    leaves_up = list(set(tree.get_leaf_names()) - set(leaves_down))

    ## get distances to leaves below (+1 because we aren't counting closest node)
    n1 = [node.get_distance(tree.get_leaves_by_name(name)[0], topology_only=True)+1 for name in leaves_down]
    n2 = [node.get_distance(tree.get_leaves_by_name(name)[0], topology_only=True) for name in leaves_up]
   
    return np.mean(n1+n2), np.min(n1+n2)

Calculation


In [182]:
## Viburnum
loci_vib = "empirical_1/fullrun/outfiles/empirical_1_full_m4.loci"
tree_vib = ete.Tree("empirical_1/fullrun/RAxML_bestTree.empirical_1_fullrun")
lxs_vib = getarray(loci_vib, tree_vib)
lxs_vib


Out[182]:
array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [38]:
## Quercus
loci_que = "empirical_3/outfiles/empirical_3_m4.loci"
tree_que = ete.Tree("empirical_3/RAxML_bestTree.empirical_3")
lxs_que = getarray(loci_que, tree_que)
lxs_que


Out[38]:
array([[ 1.,  1.,  0., ...,  1.,  1.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  0.,  0., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  0.,  1.,  1.],
       [ 1.,  0.,  1., ...,  1.,  1.,  1.]])

In [180]:
## Orestias
loci_ore = "empirical_4/outfiles/empirical_4.loci"
tree_ore = ete.Tree("empirical_4/RAxML_bestTree.empirical_4")
lxs_ore = getarray(loci_ore, tree_ore)
lxs_ore


Out[180]:
array([[ 0.,  0.,  1., ...,  1.,  1.,  0.],
       [ 1.,  0.,  1., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.]])

In [183]:
def get_data(tree, lxs):
    data = []
    for node in tree.traverse():
        if not (node.is_leaf() or node.is_root() or node.up.is_root()):
            ## get number of quartet loci at this split
            a = count_inf4(tree, lxs, node)
            ## get closest leaf at this split
            bmean, bmin = count_nest(tree, node)
            ## get hierarch. (min N descendants) at split
            cmedian, cmin = count_hier(tree, node)
            ## save
            data.append([a, bmean, bmin, cmedian, cmin])
            #print "{:<6} {:<.3f} {:<.1f} {:<.3f} {:<.1f}".format(a, bmean, bmin, cmedian, cmin)
    data = np.array(data)
    return data

In [195]:
## get Vib data
data_vib = get_data(tree_vib, lxs_vib)

## get Quercus data
data_que = get_data(tree_que, lxs_que)

## get Orestias data
data_ore = get_data(tree_ore, lxs_ore)

In [196]:
dat = np.array(data)
%R -i data_vib
%R -i data_que
%R -i data_ore

In [216]:
%%R -w 300 -h 300

#pdf("hierplots.pdf", width=4, height=12, pointsize=16)
#par(mfrow=c(3, 1))

boxplot(data_vib[,1] ~ data_vib[,5], col='gray', main="Viburnum", cex=2)
print(cor.test(data_vib[,5], data_vib[,1], method='s'))

boxplot(data_que[,1] ~ data_que[,5], col='gray', main="Quercus")
print(cor.test(data_que[,5], data_que[,1], method='s'))

boxplot(data_ore[,1] ~ data_ore[,5], col='gray', main="Orestias")
cor.test(data_ore[,5], data_ore[,1], method='s')

#dev.off()


	Spearman's rank correlation rho

data:  data_vib[, 5] and data_vib[, 1]
S = 14952, p-value = 6.206e-08
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6234835 


	Spearman's rank correlation rho

data:  data_que[, 5] and data_que[, 1]
S = 1573.7, p-value = 4.976e-06
alternative hypothesis: true rho is not equal to 0
sample estimates:
     rho 
0.711559 


	Spearman's rank correlation rho

data:  data_ore[, 5] and data_ore[, 1]
S = 5324.3, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.8520608 


In [ ]: