In [1]:
from ipyparallel import Client
# connect to the running ipcluster and list the engine IDs
cl = Client()
cl.ids
Out[1]:
In [37]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy # dadi refers to numpy (not np)
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
I am going to use unfolded spectra estimated with ANGSD and fold them in dadi.
In [3]:
%%px --local
# import 1D spectrum of ery on all engines:
fs_ery = dadi.Spectrum.from_file('ERY.unfolded.sfs').fold()
# import 1D spectrum of par on all engines:
fs_par = dadi.Spectrum.from_file('PAR.unfolded.sfs').fold()
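As a quick sanity check, the `folded` attribute, the `sample_sizes` and the number of segregating sites `S()` of the dadi Spectrum objects can be inspected (a minimal sketch):
In [ ]:
%%px --local
# sanity check: confirm that both spectra are folded and inspect basic summaries
for name, fs in zip(('ery', 'par'), (fs_ery, fs_par)):
    print name, "folded:", fs.folded, "n:", fs.sample_sizes, "segregating sites:", fs.S()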
In [15]:
%psource dadi.Demographics1D.growth
In [23]:
%%px --local
def expGrowth_bottleneck(params, ns, pts):
    """
    Exponential growth followed by an instantaneous size change.

    params = (nuB,TB,nuF,TF)
    ns = (n1,)

    nuB: Ratio of population size after exponential growth to ancient
         population size
    nuF: Ratio of contemporary to ancient population size
    TB: Time during which exponential growth happened
        (in units of 2*Na generations)
    TF: Time in the past at which the instantaneous size change happened,
        i.e. the length of the final epoch at size nuF (after exp. growth)
    n1: Number of samples in resulting Spectrum
    pts: Number of grid points to use in integration.
    """
    nuB,TB,nuF,TF = params

    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)

    nu_func = lambda t: numpy.exp(numpy.log(nuB) * t/TB)
    phi = dadi.Integration.one_pop(phi, xx, TB, nu_func)
    phi = dadi.Integration.one_pop(phi, xx, TF, nuF)

    fs = dadi.Spectrum.from_phi(phi, ns, (xx,))
    return fs
This model specifies exponential growth/decline toward $\nu_B$ over a time $T_B$, after which the population undergoes an instantaneous size change to the contemporary size $\nu_F$ (both sizes are ratios with $N_{ref}$).
In [24]:
%%px --local
# create reference to the function that specifies the model
func = expGrowth_bottleneck
# create extrapolating version of the model function
func_ex = dadi.Numerics.make_extrap_log_func(func)
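Before any optimisation, the extrapolating model function can be evaluated directly. The following sketch computes the expected spectrum at neutral parameter values and its multinomial log-likelihood against the folded ERY data (`ns` and `pts_l` are defined here locally with the same values used on the engines below):
In [ ]:
# evaluate the model at neutral parameter values and compute its log-likelihood
ns = fs_ery.sample_sizes
pts_l = [40, 50, 60] # grid sizes for extrapolation
model = func_ex([1, 1, 1, 1], ns, pts_l).fold() # fold the model to match the folded data
print dadi.Inference.ll_multinom(model, fs_ery)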
In [33]:
%%px
# set up global variables on engines required for run_dadi function call
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_ery # use ERY spectrum
perturb = True
fold = 2 # perturb randomly by up to a factor of 2**2 = 4
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_expGrowth_bottleneck/ERY_perturb" # set file name stub for opt. result files
# set lower and upper bounds to nu and T
upper_bound = [1e4, 5, 1e4, 5]
lower_bound = [1e-4, 0, 1e-4, 0]
ns = fs_ery.sample_sizes # both populations have the same sample size
fs_ery.pop_ids = ['ery']
fs_par.pop_ids = ['par']
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
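Note that with `fold=2`, `dadi.Misc.perturb_params` multiplies each parameter by up to $2^{fold} = 4$ in either direction. This can be checked directly (a minimal sketch):
In [ ]:
# each parameter is perturbed by up to a factor of 2**fold = 4
print dadi.Misc.perturb_params([1, 1, 1, 1], fold=2,
                               lower_bound=[1e-4, 0, 1e-4, 0], upper_bound=[1e4, 5, 1e4, 5])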
In [26]:
# create a load-balanced view on the engines
lbview = cl.load_balanced_view()
In [27]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
    """
    p_init: initial parameter values to run optimisation from
    """
    if perturb:
        p_init = dadi.Misc.perturb_params(p_init, fold=fold,
                                          upper_bound=upper_bound, lower_bound=lower_bound)
        # note: upper_bound and lower_bound variables are expected to be in the namespace of each engine
    # run optimisation of parameters
    popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
                         lower_bound=lower_bound, upper_bound=upper_bound, \
                         verbose=verbose, maxiter=maxiter, full_output=full_output)
    # pickle to file
    import dill
    name = outname[:] # make copy of file name stub!
    for p in p_init:
        name += "_%.4f" % (p)
    with open(name + ".dill", "w") as fh:
        dill.dump((p_init, popt), fh)
    return p_init, popt
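For reference, run_dadi encodes the (perturbed) starting values in the name of the .dill file it writes. A hypothetical example of the file name produced for a made-up set of starting values:
In [ ]:
# hypothetical illustration of the output file name generated by run_dadi
name = "OUT_expGrowth_bottleneck/ERY_perturb"
for p in [1.2345, 0.5, 2.0, 0.1]:
    name += "_%.4f" % (p)
print name + ".dill" # OUT_expGrowth_bottleneck/ERY_perturb_1.2345_0.5000_2.0000_0.1000.dill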
In [28]:
from itertools import repeat
In [38]:
# perturb neutral parameter values
p0 = [1, 1, 1, 1]
ar_ery_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
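The map call returns immediately with an AsyncMapResult object. Its standard ipyparallel attributes `ready()`, `progress` and `elapsed` can be used to monitor the runs while they execute:
In [ ]:
# monitor the asynchronous optimisation runs
print ar_ery_1.ready(), ar_ery_1.progress, "of 10 tasks finished, %.0f s elapsed" % ar_ery_1.elapsed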
In [40]:
def get_flag_count(out, NM=True):
    """
    out: list of tuples, each containing p_init and popt + additional info, including warnflags
    as produced by run_dadi (above)
    """
    from collections import defaultdict

    if NM: # if out is from a Nelder-Mead optimisation
        i = 4 # the warnflag is reported at index position 4 in the output array
    else: # out is from a BFGS optimisation
        i = 6

    warnflag = defaultdict(int)

    for res in out:
        if res[1][i] == 1: # res[1] is popt, the full output tuple of the optimiser
            warnflag[1] += 1
        elif res[1][i] == 2:
            warnflag[2] += 1
        elif res[1][i] == 0:
            warnflag[0] += 1
        else:
            warnflag[999] += 1
    if NM:
        print "success", warnflag[0]
        print "Maximum number of function evaluations made.", warnflag[1]
        print "Maximum number of iterations reached.", warnflag[2]
        print "unknown flag", warnflag[999]
    else:
        print "success", warnflag[0]
        print "Maximum number of iterations exceeded.", warnflag[1]
        print "Gradient and/or function calls not changing.", warnflag[2]
        print "unknown flag", warnflag[999]
In [41]:
get_flag_count(ar_ery_1, NM=True)
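The index positions used in get_flag_count follow the full_output return tuple of scipy.optimize.fmin, which dadi's optimize_log_fmin wraps: (xopt, fopt, iter, funcalls, warnflag). A sketch unpacking the first result:
In [ ]:
# popt follows scipy.optimize.fmin's full_output tuple: (xopt, fopt, iter, funcalls, warnflag)
p_init, popt = ar_ery_1[0]
print "perturbed start values:", p_init
print "optimal parameters:", popt[0]
print "-logL:", popt[1]
print "warnflag:", popt[4]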
In [42]:
def flatten(array):
    """
    Returns a list of the flattened elements of every inner list (or tuple).
    ****RECURSIVE****
    """
    import numpy

    res = []
    for el in array:
        if isinstance(el, (list, tuple, numpy.ndarray)):
            res.extend(flatten(el))
            continue
        res.append(el)
    return res
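A quick usage example on a nested structure like the (p_init, popt) tuples returned by run_dadi:
In [ ]:
print flatten( ([1, 2], (3, [4, 5]), 6) )
# [1, 2, 3, 4, 5, 6]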
In [51]:
success = [flatten(out)[:9] for out in ar_ery_1 if out[1][4] == 0]
In [52]:
import pandas as pd
In [53]:
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[53]:
In [54]:
print success[0][4:8]
In [55]:
# perturb previous optimal parameter combination
p0 = success[0][4:8]
ar_ery_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [56]:
get_flag_count(ar_ery_1, NM=True)
In [57]:
success = [flatten(out)[:9] for out in ar_ery_1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[57]:
The optimised parameter values do not deviate very much from the initial parameter values.
In [58]:
# specify the initial parameter values, they will be randomly perturbed by up to a factor of 4
p0 = [100, 0.01, 0.1, 1] # quick exp. growth, then long period of small size
ar_ery_2 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [60]:
success = [flatten(out)[:9] for out in ar_ery_2 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[60]:
In [61]:
# specify the initial parameter values, they will be randomly perturbed by up to a factor of 4
p0 = [0.01, 0.01, 10, 1] # quick exp. decline, then long period of increased size
ar_ery_3 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [62]:
success = [flatten(out)[:9] for out in ar_ery_3 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[62]:
In all successful optimisations above, $\nu_F$, the ratio of contemporary population size to ancient population size, converges to a value below 1/3.
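To put the best-fit parameters into context, the optimal scaling factor $\theta$ for a parameter combination can be computed with dadi.Inference.optimal_sfs_scaling (a sketch; `ns` and `pts_l` are defined here locally with the values used on the engines):
In [ ]:
# compute the optimally scaled theta for the first successful parameter combination
ns = fs_ery.sample_sizes
pts_l = [40, 50, 60]
model = func_ex(success[0][4:8], ns, pts_l).fold() # fold the model to match the folded data
print "theta:", dadi.Inference.optimal_sfs_scaling(model, fs_ery)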
In [63]:
# perturb previous optimal parameter combination
p0 = success[0][4:8]
ar_ery_4 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [64]:
get_flag_count(ar_ery_4, NM=True)
In [65]:
success = [flatten(out)[:9] for out in ar_ery_4 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[65]:
In [73]:
# save optimisation results to file
import dill
optout = []
for ar_ery in (ar_ery_1, ar_ery_2, ar_ery_3, ar_ery_4):
    optout.extend(list(ar_ery.get()))
dill.dump(optout, open("OUT_expGrowth_bottleneck/ERY_perturb_ar_ery.dill", "w"))
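The pickled results can be read back in a later session (a minimal sketch):
In [ ]:
# reload the saved optimisation results
optout = dill.load(open("OUT_expGrowth_bottleneck/ERY_perturb_ar_ery.dill"))
print len(optout) # should be 40 (4 batches of 10 runs)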
In [74]:
%%px
# set up global variables on engines required for run_dadi function call
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_par # use PAR spectrum
perturb = True
fold = 2 # perturb randomly by up to a factor of 2**2 = 4
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_expGrowth_bottleneck/PAR_perturb" # set file name stub for opt. result files
In [75]:
p0 = [1, 1, 1, 1]
ar_par_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)
In [76]:
%ll OUT_expGrowth_bottleneck/PAR*
In [77]:
# collect the results that run_dadi pickled to file
ar_par_1 = []
import glob
for filename in glob.glob("OUT_expGrowth_bottleneck/PAR*"):
    ar_par_1.append(dill.load(open(filename)))
In [78]:
get_flag_count(ar_par_1, NM=True)
In [79]:
success = [flatten(out)[:9] for out in ar_par_1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[79]:
In [82]:
success[1][4:8]
Out[82]:
In [83]:
p0 = success[1][4:8]
ar_par_2 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)
In [84]:
get_flag_count(ar_par_2, NM=True)
In [85]:
success = [flatten(out)[:9] for out in ar_par_2 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[85]:
Again, note that the inferred optimal parameter values deviate only slightly from the initial values and that all optimal parameter combinations have the same likelihood.
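The quality of the fit can be inspected visually with dadi.Plotting.plot_1d_comp_multinom, which plots the optimally scaled model spectrum against the data together with the residuals (a sketch, assuming inline matplotlib plotting is enabled):
In [ ]:
# compare the best-fit model spectrum with the PAR data
best = df.sort_values(by='-logL').iloc[0, 4:8] # best parameter combination from the table above
model = func_ex(list(best), fs_par.sample_sizes, [40, 50, 60]).fold()
dadi.Plotting.plot_1d_comp_multinom(model, fs_par)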
In [86]:
p0 = [0.01, 0.01, 10, 1] # quick exp. decline, then long period of increased size
ar_par_3 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)
In [87]:
get_flag_count(ar_par_3, NM=True)
The initial parameter values specified above (and then perturbed) are apparently too far from an optimum for any of the optimisations to converge.
In [94]:
optout = []
for ar_par in (ar_par_1, list(ar_par_2.get())):
    optout.extend(ar_par)
dill.dump(optout, open("OUT_expGrowth_bottleneck/PAR_perturb_ar_par.dill", "w"))
In [ ]: