anhima.tree - Trees


In [1]:
import numpy as np
np.random.seed(42)
import scipy.stats
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import itertools
# import logging
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)
# debug = logger.debug
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.tree
%aimport anhima.sim
%aimport anhima.gt
%aimport anhima.dist

In [2]:
# simulate genotypes with no relatedness
n_variants = 1000
n_samples = 100
samples = [''.join(s) for s in itertools.product('ABCDEFGHIJ', repeat=2)]
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = 0
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
gn = anhima.gt.as_n_alt(genotypes)

In [3]:
# simulate three sub-populations with relatedness
genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=1000)
genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=1000)
genotypes_subpop2a = anhima.sim.simulate_relatedness(genotypes_subpop2[:, n_samples//4:], relatedness=.5, n_iter=500)
genotypes_subpop2b = anhima.sim.simulate_relatedness(genotypes_subpop2[:, :n_samples//4], relatedness=.5, n_iter=500)
genotypes_related = np.concatenate([genotypes_subpop1, genotypes_subpop2a, genotypes_subpop2b], axis=1)
gnr = anhima.gt.as_n_alt(genotypes_related)

In [4]:
# calculate pairwise distance
d, s = anhima.dist.pairwise_distance(gn, metric='euclidean')
dr, sr = anhima.dist.pairwise_distance(gnr, metric='euclidean')

In [5]:
# build a neighbour joining tree
t = anhima.tree.nj(s, labels=samples)
tr = anhima.tree.nj(sr, labels=samples)

In [6]:
anhima.tree.plot_phylo(t, plot_kwargs={'type': 'unrooted', 
                                       'tip.color': 'black',
                                       'edge.color': 'gray',
                                       'lab4ut': 'axial'});



In [7]:
# first 50 samples are in pop1, second 25 are in pop2a, third 25 are in pop 2b
populations = (['pop1'] * 50) + (['pop2a']*25) + (['pop2b']*25) 
# choose a color for each sub-population
population_colors = {
    'pop1': 'red', 
    'pop2a': 'blue',
    'pop2b': 'green',
}
# assign a color to each sample based on population membership
sample_colors = [population_colors[p] for p in populations]

In [8]:
# assign a color to each tree edge based on population majority
edge_colors = anhima.tree.color_edges_by_group_majority(tr, 
                                                        labels=samples, 
                                                        groups=populations, 
                                                        colors=population_colors)

# now plot the tree
anhima.tree.plot_phylo(tr, plot_kwargs={'type': 'unrooted', 
                                        'tip.color': sample_colors,
                                        'edge.color': edge_colors,
                                        'lab4ut': 'axial'});


Comparison of distance metrics


In [9]:
metrics = 'euclidean', 'cityblock'

fig = plt.figure(figsize=(len(metrics)*5, 5))

for i, metric in enumerate(metrics):
    ax = fig.add_subplot(1, len(metrics), i+1)
    ax.set_title(metric)
    
    d, s = anhima.dist.pairwise_distance(gnr, metric=metric)
    t = anhima.tree.nj(s, labels=samples)
    edge_colors = anhima.tree.color_edges_by_group_majority(t, 
                                                            labels=samples, 
                                                            groups=populations, 
                                                            colors=population_colors)
    anhima.tree.plot_phylo(t, 
                           plot_kwargs={'type': 'unrooted', 
                                        'tip.color': sample_colors,
                                        'edge.color': edge_colors,
                                        'lab4ut': 'axial'},
                           width=5, height=5, units='in', res=plt.rcParams['savefig.dpi'],
                           add_scale_bar={},
                           ax=ax)
    
fig.subplots_adjust(0, 0, 1, 1, hspace=0, wspace=0)


Reading and writing trees


In [10]:
d, s = anhima.dist.pairwise_distance(gnr, metric=metric)
t = anhima.tree.nj(s, labels=samples)

In [11]:
# write to string
anhima.tree.write_tree(t)


Out[11]:
'(((BA:111.6477157,EH:99.35228431):10.42058249,EC:113.5794175):13.29073334,((((CE:68.03757196,EI:74.96242804):30.63386418,BC:96.36613582):8.990157278,(BI:56.35263158,CH:57.64736842):54.25984272):12.37453391,(AF:66.7742513,DG:70.2257487):43.25046609):7.490516663,(((((((AA:99.13324653,DI:93.86675347):16.48313802,(CB:65.04765625,EB:67.95234375):49.51686198):8.523166233,((DF:79.40153952,EE:78.59846048):32.51942274,(AG:73.88515625,BG:78.11484375):32.98057726):17.10183377):3.946893311,((CC:74.75218987,DB:74.24781013):29.94176025,((BD:38.25510204,EF:34.74489796):30.47274389,CI:65.52725611):37.80823975):18.99060669):2.113254547,((((AI:61.78645833,CG:74.21354167):25.45729519,AH:91.54270481):44.75330529,CJ:117.2466947):3.734179688,(((BE:66.35432943,EJ:69.64567057):20.77012055,DE:82.22987945):31.00595093,((AJ:63.79125434,EG:58.20874566):24.45673198,CD:82.54326802):32.36904907):11.32832031):6.246120453):2.00596873,(((((BF:98.25020345,DC:113.7497965):4.521684126,AE:110.4783159):13.57556152,BH:130.4244385):7.720659528,(((((((GG:33.26111779,HB:36.73888221):22.50350839,FF:51.49649161):3.518669577,(FD:31.25510333,GF:32.74489667):21.73133042):1.671435547,((((((GB:18.66489362,HE:19.33510638):18.86659564,HD:31.13340436):15.47457627,GH:46.52542373):2.232142857,GA:53.01785714):6.220667614,GD:57.90433239):2.990123821,(((FB:29.13878676,GC:29.86121324):16.03255208,HA:36.96744792):7.048588268,((GE:27.2186334,GI:28.7813666):11.16124488,FJ:40.83875512):9.951411732):9.642688679):2.531689453):3.719935099,FG:60.84842428):3.952421148,(((((FC:20.27419355,FH:17.72580645):13.1373291,HC:31.8626709):14.07886584,FI:46.92113416):9.712402344,FA:47.28759766):2.853595344,(FE:41.55146329,GJ:39.44853671):23.77140466):6.921114008):64.75165856,((((((((HF:31.62209302,JF:31.37790698):11.89082278,(IC:18.98369565,ID:20.01630435):23.60917722):5.585,(IG:24.35714286,JB:22.64285714):26.165):2.920162671,(((HH:24.91176471,JH:25.08823529):7.549382716,JC:36.45061728):6.186698718,(HJ:28.56818182,IH:30.43181818):18.06330128):6.111087329):3.902126736,((JE:37,JJ:38):9.788961039,(IJ:35.52298851,JI:34.47701149):14.21103896):9.394748264):2.72931338,(((HI:28.11235955,II:28.88764045):8.210843373,IB:37.78915663):12.7660473,HG:45.7339527):9.01287412):3.536132812,(JD:28.69642857,JG:27.30357143):25.51464844):3.629288383,(((IE:24.51111111,JA:28.48888889):16.4640625,IF:40.5359375):6.420230263,IA:48.57976974):16.52500849):74.90666663):112.5577218):1.790445964,((((BJ:65.30346112,DD:62.69653888):40.83464844,BB:101.1653516):7.476295108,((DA:64.51941216,DH:65.48058784):19.02415248,EA:88.97584752):25.89870489):16.39668274,((AB:58.42580344,AC:67.57419656):21.04301453,CF:84.95698547):45.47831726):3.298360189):4.166501363):3.118453979,(((CA:68.2628866,DJ:60.7371134):28.1230957,ED:93.8769043):40.54652913,AD:111.4534709):7.271705627):1.079856873);'

In [12]:
# write to file
anhima.tree.write_tree(t, '/tmp/tree.newick')

In [13]:
# read from file
t2 = anhima.tree.read_tree('/tmp/tree.newick')

In [14]:
anhima.tree.plot_phylo(t2, plot_kwargs={'type': 'unrooted'});



In [15]:
# N.B., labels will not be preserved in original order
list(t2[2])[:10]


Out[15]:
['BA', 'EH', 'EC', 'CE', 'EI', 'BC', 'BI', 'CH', 'AF', 'DG']