ipyrad.analysis simulations
This notebook demonstrates how to use the baba
module in ipyrad
to test for admixture and introgression. The code is written in Python and is best implemented in a Jupyter-notebook like this one. Analyses can be split up among many computing cores. Finally, there are some simple plotting methods to accompany tests which I show below.
InĀ [1]:
## imports
import numpy as np
import ipyrad as ip
import ipyparallel as ipp
from ipyrad.analysis import baba
## print versions
print "ipyrad v.{}".format(ip.__version__)
print "ipyparallel v.{}".format(ipp.__version__)
print "numpy v.{}".format(np.__version__)
InĀ [2]:
## start 4 engines by running the commented line below in a separate bash terminal.
##
## ipcluster start --n=4
InĀ [3]:
## connect to client and print connection summary
ipyclient = ipp.Client()
print ip.cluster_info(client=ipyclient)
The function baba.Tree()
takes a newick
string as an argument, or, if no value is passed, it constructs the default 12 taxon tree shown below. The Tree
Class object holds a representation of the tree, which can be used for plotting, as well as for describing a demographic model for simulating data, as we'll show below.
InĀ [4]:
tre.draw(width=400, height=200);
Out[4]:
InĀ [38]:
## tree2dict
import ete3
newick = "./analysis_tetrad/pedtest1.full.tre"
## load the newick string as a Tree object
tre = ete3.Tree(newick)
## set przewalskiis as the outgroup
prz = [i for i in tre.get_leaves() if "prz" in i.name]
out = tre.get_common_ancestor(prz)
## set new outgroup and store back as a newick string
tre.set_outgroup(out)
newick = tre.write()
def test_constraint(node, cdict, tip, exact):
names = set(node.get_leaf_names())
const = set(cdict[tip])
if const:
if exact:
if len(names.intersection(const)) == len(const):
return 1
else:
return 0
else:
if len(names.intersection(const)) == len(names):
return 1
else:
return 0
return 1
InĀ [39]:
## constraints
#if not constraint_dict:
cdict = {"p1":[], "p2":[], "p3":[], "p4":[]}
cdict.update({"p4":['a']})
cdict
Out[39]:
InĀ [45]:
def tree2tests(newick, constraint_dict=None, constraint_exact=True):
"""
Returns dict of all possible four-taxon splits in a tree. Assumes
the user has entered a rooted tree. Skips polytomies.
"""
## make tree
tree = ete3.Tree(newick)
## constraints
#if not constraint_dict:
cdict = {"p1":[], "p2":[], "p3":[], "p4":[]}
cdict.update(constraint_dict)
print cdict
## traverse root to tips. Treat the left as outgroup, then the right.
tests = []
## topnode must have children
for topnode in tree.traverse("levelorder"):
## test onode as either child
for oparent in topnode.children:
## test outgroup as all descendants on one child branch
for onode in oparent.traverse():
## put constraints on onode
if test_constraint(onode, cdict, "p4", constraint_exact):
## p123 parent is sister to oparent
p123parent = oparent.get_sisters()[0]
## test p3 as all descendants of p123parent
for p3parent in p123parent.children:
## p12 parent is sister to p3parent
p12parent = p3parent.get_sisters()[0]
for p3node in p3parent.traverse():
if test_constraint(p3node, cdict, "p3", constraint_exact):
if p12parent.children:
p1parent, p2parent = p12parent.children
for p2node in p2parent.traverse():
if test_constraint(p2node, cdict, "p2", constraint_exact):
for p1node in p1parent.traverse():
if test_constraint(p1node, cdict, "p1", constraint_exact):
test = {}
test['p4'] = onode.get_leaf_names()
test['p3'] = p3node.get_leaf_names()
test['p2'] = p2node.get_leaf_names()
test['p1'] = p1node.get_leaf_names()
tests.append(test)
return tests
InĀ [46]:
tests = tree2tests(newick,
constraint_dict={"p4":['32082_przewalskii', '33588_przewalskii'],
"p3":['30686_cyathophylla']},
constraint_exact=True)
len(tests)
Out[46]:
InĀ [43]:
tests
Out[43]:
InĀ [5]:
## pass a newick string to the Tree class object
tre = baba.Tree(newick="((a,b),c);")
tre.draw(width=200, height=200);
InĀ [6]:
## store newick tree and a migration event list [source, sink, start, stop, rate]
## if not edges are entered then it is converted to cladogram
newick = "(((a,b),Cow), (d,e));"
events = [['e', 'Cow', 0, 1, 1e-6],
['3', '1', 1, 2, 1e-6]]
## initiate Tree object with newick and admix args
tre = baba.Tree(newick=newick,
admix=events)
## show the tree
tre.draw(width=250, height=250, yaxis=True);
InĀ [7]:
## a way of finding names xpos from verts
tre.verts[tre.verts[:, 1] == 0]
tre.tree.search_nodes(name='b')[0].idx
tre.verts[6, 0]
print [tre.verts[tre.tree.search_nodes(name=name)[0].idx, 0]
for name in tre.tree.get_leaf_names()]
print tre.tree.get_leaf_names()
InĀ [8]:
## returns a Sim data object
sims = tre.simulate(nreps=10000, Ns=50000, gen=20)
InĀ [9]:
## what is in the sims object? 4 attributes:
for key, val in sims.__dict__.items():
print key, val
InĀ [10]:
baba._msp_to_arr(sims, test)
InĀ [31]:
def _msp_to_arr(Sim, test):
## the fixed tree dictionary
#fix = {j: [i, i+1] for j, i in zip(list("abcdefghijkl"), range(0, 24, 2))}
fix = {j: [i, i+1] for j, i in zip(Sim.names, range(0, len(Sim.names)*2, 2))}
## fill taxdict by test
keys = ['p1', 'p2', 'p3', 'p4']
arr = np.zeros((Sim.nreps, 4, 100))
## unless it's a 5-taxon test
if len(test) == 5:
arr = np.zeros((100000, 6, 100))
keys += ['p5']
## create array sampler for taxa
taxs = [test[key] for key in keys]
idxs = [list(itertools.chain(*[fix[j] for j in i])) for i in taxs]
print idxs
InĀ [32]:
from ipyrad.analysis.baba import *
_msp_to_arr(sims, test)
InĀ [39]:
def print_variant_matrix(tree):
shape = tree.get_num_mutations(), tree.get_sample_size()
arr = np.empty(shape, dtype="u1")
for variant in tree.variants():
arr[variant.index] = variant.genotypes
print(arr)
InĀ [45]:
## in order of ladderized tip names (2 copies per tip)
print_variant_matrix(sims.sims.next())
InĀ [46]:
sims
Out[46]:
InĀ [12]:
## simulate data on tree
#sims = tre.simulate(nreps=10000, Ns=50000, gen=20)
## pass sim object to baba
test = {
'p4': ['e', 'd'],
'p3': ['Cow'],
'p2': ['b'],
'p1': ['a'],
}
#baba.batch(sims, test, ipyclient=ipyclient)
InĀ [Ā ]: