In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import sys, subprocess, time, dendropy, os, copy
import numpy as np
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import treesignal
In [2]:
## quick-and-dirty function: user must make sure that gene_chain has enough species
def generate_gene_families(n_species = 32, n_samples=5, n_families=5, n_missing=1, n_spr=1):
#split chain: sptrees --- gt1 --- sptrees --- gt2 --- ... --- sptrees --- gt_n --- sptrees #
# sptrees blocks are twice as long as genetree blocks: 2 X n_s X (n_f +1)
sp_factor = 3
n_trees = n_samples * n_families + sp_factor * n_samples * (n_families+1)
# generate a SPR chain with "complete" trees
gf = treesignal.TreeSignal(n_species = n_species, chain_size = n_trees, n_spr = n_spr)
feat_all = gf()
signal = feat_all/feat_all.mean(0);
# creation of new sptree set, excluding gene trees
full_sptrees = gf.sp_trees[:sp_factor*n_samples]
for i in range(n_families):
start = (sp_factor+1)*n_samples*(i+1)
for j in range(start,start+sp_factor*n_samples):
full_sptrees.append(gf.sp_trees[j])
# creation of gene trees, each family will have distinct missing data
all_tx_miss = [] # list with all pruned species
gt_missing = [] # genes with a few missing taxa
gt_all_pruned = [] # genes pruned s.t. all have same taxa (i.e. smaller than gt_missing)
for i in range(n_families): # at least 4 leaves in common
tx_miss = np.random.permutation(gf.sp_trees.taxon_namespace[4:])[:n_missing]
tx_miss = [x.label for x in tx_miss]
all_tx_miss.append(tx_miss) # is a list of lists
start = sp_factor*n_samples*(i+1)
for t in gf.sp_trees[start:start+n_samples]:
t2 = t.extract_tree_without_taxa_labels(labels=tx_miss)
gt_missing.append(dendropy.Tree.get(data=t2.as_string(schema="newick"),schema="newick"))
gt_all_pruned.append(copy.deepcopy(t))
# gt_all_pruned now will have only taxa in common
all_tx_miss = list(set([x0 for x in all_tx_miss for x0 in x]))
for t in gt_all_pruned:
t.prune_taxa_with_labels(all_tx_miss)
gf.update_spstring_from_trees(full_sptrees)
cols = np.repeat(range(n_families),n_samples)
return gt_missing, gf, treesignal.TreeSignal(sp_trees = gt_all_pruned), cols, signal
gt, gf1, gf2, clabels, signal_original = generate_gene_families(n_species = 32, n_samples=5, n_families = 6, n_missing=8, n_spr=1)
print (len(gf1.sp_trees), len(gf2.sp_trees), len(gt))
print (len(gf1.sp_trees.taxon_namespace), len(gf2.sp_trees.taxon_namespace))
In [3]:
# Reference trees and classical distmatrix distance
feat_miss = gf2()
feat_full = []
for t in gt:
feat_full.append (gf1(t))
feat_full = np.array(feat_full)
print (feat_miss.shape, feat_full.shape)
# random trees
gf_random = treesignal.TreeSignal(n_species=len(gf1.sp_trees.taxon_namespace), chain_size=len(gf1.sp_trees)-1, n_spr=10)
feat_rand = []
for t in gt:
feat_rand.append (gf_random(t))
feat_rand = np.array(feat_rand)
print (feat_rand.shape)
In [12]:
signal_full = feat_full/feat_full.mean(0);
signal_miss = feat_miss/feat_miss.mean(0);
signal_rand = feat_rand/feat_rand.mean(0);
fig, axes = plt.subplots(2,2) ; fig.set_size_inches(16, 10)
fig.subplots_adjust(top=.99, bottom=.01, left=.02, right=.98, wspace=.1, hspace=.2)
transf=manifold.MDS(n_components=2).fit_transform(signal_full)
#kmod1 = cluster.AgglomerativeClustering(n_clusters=6, affinity='cosine', linkage='average').fit(signal_full).labels_
jit = 0.01 * transf.max() * np.random.normal(size=feat_miss.shape[0]) # avoid complete overlap of points
axes[0,0].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=clabels,edgecolor="black", cmap="jet", alpha=.4, s=700)
#axes[0,0].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=kmod1, edgecolor="none", cmap="gnuplot", alpha=.5, s=30)
axes[0,0].set_title("MDS using features to reference trees (w/ jittering)", fontsize=18)
transf=manifold.MDS(n_components=2).fit_transform(signal_rand)
#kmod1 = cluster.AgglomerativeClustering(n_clusters=6, affinity='cosine', linkage='average').fit(signal_rand).labels_
jit = 0.001 * transf.max() * np.random.normal(size=feat_rand.shape[0]) # avoid complete overlap of points
axes[1,0].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=clabels,edgecolor="black", cmap="jet", alpha=.4, s=700)
#axes[1,0].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=kmod1, edgecolor="none", cmap="gnuplot", alpha=.5, s=30)
axes[1,0].set_title("MDS using features to random trees (w/ jittering)", fontsize=18)
dist1 = signal_miss[:,3::7] + signal_miss[:,4::7] # 3rd element is SPR (4 = spr residue, 5 = RF, 6 = Hdist)
dist1 = (dist1 + dist1.T)/2 # matrix must be symmetric
transf=manifold.MDS(n_components=2, dissimilarity="precomputed").fit_transform(dist1)
#kmod1 = cluster.AgglomerativeClustering(n_clusters=6, affinity='precomputed', linkage='average').fit(dist1).labels_
jit = 0.02 * transf.max() * np.random.normal(size=feat_miss.shape[0]) # avoid complete overlap of points
axes[0,1].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=clabels, edgecolor="black", cmap="jet", alpha=.4, s=700)
#axes[0,1].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=kmod1, edgecolor="none", cmap="gnuplot", alpha=.5, s=30)
axes[0,1].set_title("MDS using SPR distance matrix (w/ jittering)", fontsize=18)
dist2 = signal_miss[:,5::7] # 3rd element is SPR (4 = spr residue, 5 = RF, 6 = Hdist)
dist2 = (dist2 + dist2.T)/2 # matrix must be symmetric
transf=manifold.MDS(n_components=2, dissimilarity="precomputed").fit_transform(dist2)
#kmod1 = cluster.AgglomerativeClustering(n_clusters=6, affinity='precomputed', linkage='average').fit(dist2).labels_
jit = 0.02 * transf.max() * np.random.normal(size=feat_miss.shape[0]) # avoid complete overlap of points
axes[1,1].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=clabels, edgecolor="black", cmap="jet", alpha=.4, s=700)
#axes[1,1].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=kmod1, edgecolor="none", cmap="gnuplot", alpha=.5, s=30)
axes[1,1].set_title("MDS using Robinson-Foulds distance matrix (w/ jittering)", fontsize=18)
Out[12]:
In [33]:
fig, axes = plt.subplots(2,1) ; fig.set_size_inches(16, 7)
fig.subplots_adjust(top=.99, bottom=.01, left=.03, right=.97, wspace=.1, hspace=.2)
transf=decomposition.PCA(n_components=2).fit_transform(signal_full.T) ## order FEATURES
feat_order = np.argsort(transf[:,0])
im = axes[0].imshow(signal_full[:,feat_order], aspect='auto', cmap="Spectral_r", interpolation="nearest")
axes[0].set_title("Feature matrix using reference trees", fontsize=20)
im = axes[1].imshow(signal_rand[:,feat_order], aspect='auto', cmap="Spectral_r", interpolation="nearest")
axes[1].set_title("Feature matrix using random trees", fontsize=20)
fig, axes = plt.subplots(1,2) ; fig.set_size_inches(14, 6)
fig.subplots_adjust(top=.99, bottom=.01, left=.03, right=.97, wspace=.1, hspace=.1)
axes[0].imshow(dist1, aspect='auto', cmap="hot", interpolation="nearest")
axes[0].set_title("SPR distance matrix", fontsize=20)
axes[1].imshow(dist2, aspect='auto', cmap="hot", interpolation="nearest")
axes[1].set_title("Robinson-Foulds distance matrix", fontsize=20)
Out[33]:
In [ ]:
fig, axes = plt.subplots(1) ; fig.set_size_inches(10, 6)
fig.subplots_adjust(top=.99, bottom=.01, left=.02, right=.98, wspace=.1, hspace=.1)
transf=manifold.MDS(n_components=2).fit_transform(signal_original)
axes.scatter(transf[:,1], transf[:,0], c=list(range(transf.shape[0])),edgecolor="black", alpha=.4, s=50)
axes.set_title("MDS of all original trees used in simulation")
In [14]:
n_t = len(gt)
treecl_rf = np.zeros((n_t,n_t)) ## square distance matrix
treecl_spr = np.zeros((n_t,n_t)) ## square distance matrix
for i in range(n_t):
for j in range (i):
l_i = [x.label for x in gt[i].taxon_namespace]
l_j = [x.label for x in gt[j].taxon_namespace]
common =list(set(l_i) & set(l_j))
n_c = len(common)
t_i = copy.deepcopy(gt[i])
t_i.retain_taxa_with_labels(common)
t_j = copy.deepcopy(gt[j])
t_j.retain_taxa_with_labels(common)
#print ("\n", l_i, "\n", l_j, "\n", common, "common")
spc = treesignal.lowlevel_calculate_spectrum_from_tree_strings(
t_i.as_string(schema="newick"),
t_j.as_string(schema="newick"))
treecl_rf[i,j] = treecl_rf[j,i] = spc[5]/(n_c-3)
treecl_spr[i,j] = treecl_spr[j,i] = (spc[3]+spc[4])/(2*n_c-3)
In [15]:
fig, axes = plt.subplots(1,2) ; fig.set_size_inches(16, 5)
fig.subplots_adjust(top=.99, bottom=.01, left=.02, right=.98, wspace=.1, hspace=.1)
transf=manifold.MDS(n_components=2, dissimilarity="precomputed").fit_transform(treecl_spr)
jit = 0.02 * transf.max() * np.random.normal(size=feat_miss.shape[0]) # avoid complete overlap of points
axes[0].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=clabels, edgecolor="black", cmap="jet", alpha=.4, s=700)
axes[0].set_title("MDS using normalized SPR distance matrix (w/ jittering)", fontsize=16)
transf=manifold.MDS(n_components=2, dissimilarity="precomputed").fit_transform(treecl_rf)
jit = 0.02 * transf.max() * np.random.normal(size=feat_miss.shape[0]) # avoid complete overlap of points
axes[1].scatter(transf[:,1]+jit, transf[:,0]+jit[::-1], c=clabels, edgecolor="black", cmap="jet", alpha=.4, s=700)
axes[1].set_title("MDS using normalized Robinson-Foulds distance matrix (w/ jittering)", fontsize=16)
fig, axes = plt.subplots(1,2) ; fig.set_size_inches(14, 6)
fig.subplots_adjust(top=.99, bottom=.01, left=.03, right=.97, wspace=.1, hspace=.1)
axes[0].imshow(treecl_spr, aspect='auto', cmap="hot", interpolation="nearest")
axes[0].set_title("normalized SPR distance matrix", fontsize=18)
axes[1].imshow(treecl_rf, aspect='auto', cmap="hot", interpolation="nearest")
axes[1].set_title("normalized Robinson-Foulds distance matrix", fontsize=18)
Out[15]:
In [30]:
transf=decomposition.PCA(n_components=2).fit_transform(signal_rand.T) ## order FEATURES
feat_order = np.argsort(transf[:,0])
#feat_order = np.argsort(signal_rand[:5].sum(0))
signal_order = signal_rand[:,feat_order]
print (feat_order.shape)
fig, axes = plt.subplots(1) ; fig.set_size_inches(16, 3.5)
fig.subplots_adjust(top=.99, bottom=.01, left=.03, right=.97, wspace=.1, hspace=.2)
im = axes.imshow(signal_order, aspect='auto', cmap="Spectral_r", interpolation="nearest")
axes.set_title("Feature matrix using random trees", fontsize=20)
Out[30]: