Spectral signal of gene trees when there is missing data

  • I need to work on the normalization

In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import sys, subprocess, time, dendropy, os
import numpy as np
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing

bindir="/home/leo/local/bin/"
localdir="/tmp/"

In [5]:
# distances are dups, loss, dcoal, spr, rf, hdist
def distance_signal (gtree, stree): 
    os.chdir(localdir)
    gtree.write(path="gene.tre", schema="nexus")
    stree.write(path="species.tre", schema="nexus")
    proc_run = subprocess.run(bindir + "gf_distsignal_genetree_sptree gene.tre species.tre", shell=True, 
                              stdout=subprocess.PIPE, universal_newlines=True)

    x = [[float(i) for i in line.split()] for line in proc_run.stdout.splitlines()]
    return np.array(x)

In [6]:
def add_trees (tree1, tree2, first):
    otre = tree1
    for i in range(first, len(tree2)):
        otre.append(tree2[i])
    return otre

def add_noise (tree, n_noise):
    os.chdir(localdir)
    tree.write(path="tmp2.tre", schema="nexus")
    proc_run = subprocess.run([bindir + "bmc2_addTreeNoise", "tmp2.tre", "0.9", "0.1", str(n_noise)],
                              stdout=subprocess.PIPE, universal_newlines=True)
    return dendropy.TreeList.get(path="noise.tre", schema="nexus")

In [7]:
def generate_trees (n_trees, n_spr, max_error, n_noise, n_leaves):
    os.chdir(localdir)
    proc_run = subprocess.run([bindir + "gf_generate_spr_trees", str(n_leaves), str(n_trees-1), str(n_spr)],
                              stdout=subprocess.PIPE, universal_newlines=True)
    with open("tmp.tre", "w") as trefile:
        print(proc_run.stdout, file=trefile)
    proc_run = subprocess.run([bindir + "bmc2_addTreeNoise", "tmp.tre", "0.9", str(max_error), str(n_noise)],
                              stdout=subprocess.PIPE, universal_newlines=True)
    return dendropy.TreeList.get(path="tmp.tre", schema="nexus"), dendropy.TreeList.get(path="noise.tre", schema="nexus")

In [8]:
def create_missing_leaves(tree, n_missing):
    tx_miss = np.random.permutation(tree.taxon_namespace)[:n_missing]
    for t in tree:
        t.prune_taxa(tx_miss)
    for tx in tx_miss:
        tree.taxon_namespace.remove_taxon(tx)

In [27]:
g_sample_size = 5
n_loci = 9  # actually one more than the number selected here since we have the first locus, initialized differently
st_skip = 5 # if st_skip == g_sample_size then it will not have the "true" gene trees
n_taxa_pruned = 8
n_leaves = 16

gt = [None] * (n_loci+1)
gt[0], st = generate_trees (g_sample_size,1,0.1, 10, n_leaves)
st = st[st_skip:]
create_missing_leaves(gt[0], n_taxa_pruned)
glabels = np.repeat(0,g_sample_size)
slabels = np.repeat(0.2,len(st))

for i in range(1,n_loci+1):
    gt[i],y = generate_trees (g_sample_size,1,0.1, 10, n_leaves)
    create_missing_leaves(gt[i], n_taxa_pruned)
    st = add_trees(st, y, st_skip)
    glabels = np.append(glabels,np.repeat(i, g_sample_size))
    slabels = np.append(slabels,np.repeat(i + 0.2, len(y) - st_skip))

In [28]:
signal = distance_signal(st, st)
signal /= n_leaves
for i in range(0,n_loci+1):
    s_i = distance_signal(gt[i], st)
    s_i /= (n_leaves - n_taxa_pruned)
    signal = np.append(signal, s_i, 0)
print (signal, signal.shape,len(slabels), len(glabels))


[[ 0.      0.      0.     ...,  0.625   1.625   4.    ]
 [ 0.5625  3.125   2.     ...,  0.75    1.625   3.625 ]
 [ 0.5625  3.0625  1.9375 ...,  0.625   1.625   3.75  ]
 ..., 
 [ 0.5     2.25    1.25   ...,  0.5     0.5     0.5   ]
 [ 0.5     2.25    1.25   ...,  0.5     0.5     0.5   ]
 [ 0.5     2.25    1.25   ...,  0.25    0.25    0.    ]] (533, 2898) 483 50

In [29]:
transf=manifold.MDS(n_components=2).fit_transform(signal)

In [30]:
sps = len(slabels)
sl2 = slabels/10.2; gl2 = glabels/10.2
fig, axes = plt.subplots(1) ; fig.set_size_inches(12, 6)
axes.scatter(transf[:sps,1], transf[:sps,0], c=sl2, edgecolor="none", alpha=.5, s=40)
axes.scatter(transf[sps:,1], transf[sps:,0], c=gl2, edgecolor="black", alpha=.5, s=80)


Out[30]:
<matplotlib.collections.PathCollection at 0x7f93771735c0>

In [31]:
transf2=manifold.MDS(n_components=2).fit_transform(signal[sps:]) ## only gene trees
fig, axes = plt.subplots(1) ; fig.set_size_inches(12, 6)
axes.scatter(transf2[:,1], transf2[:,0], c=glabels, edgecolor="black", alpha=.5, s=100)


Out[31]:
<matplotlib.collections.PathCollection at 0x7f93770c8470>

In [60]:
names = [set([taxon.label for taxon in gt[i].taxon_namespace]) for i in range(n_loci+1)]
sizes = [len(names[i].intersection(names[j])) for i in range(n_loci+1) for j in range(i)]
fig, axes = plt.subplots(1) ; fig.set_size_inches(6, 6)
axes.hist(sizes, np.linspace(1, 9, 30))
axes.set_title("number of leaves in common")


Out[60]:
<matplotlib.text.Text at 0x7f937629ae10>

In [ ]: