In [2]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[2]:
In [3]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy as np
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [4]:
%%px --local
# import 1D spectrum of ery on all engines:
fs_ery = dadi.Spectrum.from_file('ERY_modified.sfs')
# import 1D spectrum of par on all engines:
fs_par = dadi.Spectrum.from_file('PAR_modified.sfs')
In [5]:
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = [12, 10]
pylab.rcParams['font.size'] = 14
In [5]:
pylab.plot(fs_ery, 'ro--', label='ery', markersize=12)
pylab.plot(fs_par, 'g>--', label='par', markersize=12)
pylab.grid()
pylab.xlabel('minor allele count')
pylab.ylabel('SNP count')
pylab.legend()
pylab.title("1D spectra - Ludovic's correction applied")
Out[5]:
In [6]:
def run_dadi(p_init): # must take a single argument so that it can be called with map
    """
    p_init: initial parameter values to run optimisation from
    """
    if perturb:
        p_init = dadi.Misc.perturb_params(p_init, fold=fold,
                                          upper_bound=upper_bound, lower_bound=lower_bound)
        # note: upper_bound and lower_bound are expected to be in the namespace of each engine
    # run optimisation of parameters
    popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l,
                         lower_bound=lower_bound, upper_bound=upper_bound,
                         verbose=verbose, maxiter=maxiter, full_output=full_output,
                         fixed_params=fixed_params)
    # save initial and optimised parameter values to file with dill
    # (a more flexible drop-in replacement for pickle)
    import dill
    name = outname[:] # make a copy of the file name stub!
    for p in p_init:
        name += "_%.4f" % (p)
    with open(name + ".dill", "w") as fh:
        dill.dump((p_init, popt), fh)
    return p_init, popt
In [7]:
from glob import glob
import dill
from utility_functions import *
import pandas as pd
In [8]:
lbview = cl.load_balanced_view()
In [9]:
from itertools import repeat
In [10]:
dadi.Demographics1D.growth?
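In dadi, the growth model takes params = (nu, T): exponential growth (or decline) beginning $T \times 2N_{ref}$ generations ago, ending at a contemporary size of $\nu$ times the ancestral population size.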
In [7]:
%%px --local
func_ex = dadi.Numerics.make_extrap_log_func(dadi.Demographics1D.growth)
In [8]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 4]
lower_bound = [1e-4, 0]
In [9]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_ery
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/expgrowth" # set file name stub for opt. result files
fixed_params = None
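As a quick sanity check (a sketch, not part of the original run), dictionary-style access on the DirectView retrieves a variable from every engine's namespace, so one can verify that the setup cell above actually reached the engines:

cl[:]['outname']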
In [26]:
# set starting values for perturbation
p0 = [1, 1]
#ar_ery = lbview.map(run_dadi, repeat(p0, 10))
In [27]:
ar_ery.get()
Out[27]:
In [28]:
# set starting values for perturbation
p0 = [0.1, 0.1]
#ar_ery = lbview.map(run_dadi, repeat(p0, 10))
In [31]:
# set starting values for perturbation
p0 = [10, 0.1]
#ar_ery1 = lbview.map(run_dadi, repeat(p0, 10))
In [32]:
ar_ery = []
for filename in glob("OUT_1D_models/expgrowth*dill"):
ar_ery.append(dill.load(open(filename)))
In [33]:
get_flag_count(ar_ery, NM=True)
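get_flag_count comes from the local utility_functions module, which is not reproduced in this notebook. A minimal sketch of what it is assumed to do, tallying the warnflag from scipy's full optimiser output (0 = converged, 1 = maximum iterations reached, 2 = maximum function evaluations reached):

from collections import defaultdict

def get_flag_count_sketch(ar, NM=True):
    # each element of `ar` is a (p_init, popt) tuple as returned by run_dadi;
    # with full_output=True and Nelder-Mead, popt is
    # (xopt, fopt, iter, funcalls, warnflag)
    idx = 4 if NM else 6  # warnflag position differs for fmin vs. fmin_bfgs output
    counts = defaultdict(int)
    for p_init, popt in ar:
        counts[popt[idx]] += 1
    for flag in sorted(counts):
        print "warnflag %d: %d runs" % (flag, counts[flag])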
In [15]:
import pandas as pd
In [36]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[36]:
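flatten also comes from utility_functions. A hypothetical sketch of its assumed behaviour: recursively flatten the nested (p_init, (xopt, fopt, iter, funcalls, warnflag)) tuple, so that the first l = 2*len(p0)+1 elements are the initial parameter values, the optimal parameter values and the negative log-likelihood, i.e. exactly one row of the table above.

def flatten_sketch(obj):
    # recursively flatten nested tuples/lists/arrays into a flat list
    flat = []
    for el in obj:
        if hasattr(el, '__iter__'):
            flat.extend(flatten_sketch(el))
        else:
            flat.append(el)
    return flat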
In [ ]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 6] # increasing upper bound of time parameter
lower_bound = [1e-4, 0]
In [13]:
# set starting values for perturbation
p0 = [2, 1]
#ar_ery1 = lbview.map(run_dadi, repeat(p0, 10))
In [11]:
ar_ery = []
for filename in glob("OUT_1D_models/expgrowth*dill"):
ar_ery.append(dill.load(open(filename)))
In [16]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[16]:
In [19]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 8] # increasing upper bound of time parameter
lower_bound = [1e-4, 0]
In [20]:
# set starting values for perturbation
p0 = [2, 1]
ar_ery1 = lbview.map(run_dadi, repeat(p0, 10))
In [21]:
ar_ery = []
for filename in glob("OUT_1D_models/expgrowth*dill"):
ar_ery.append(dill.load(open(filename)))
In [23]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(15)
Out[23]:
The time parameter is hitting the upper boundary that I set. The exponential growth model cannot be fit to the ery spectrum with Ludovic's correction applied.
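One way to make the boundary problem explicit (a sketch, not part of the original run; it assumes the df from the cell above and the locally set upper_bound):

T_upper = upper_bound[1]
at_bound = df[df['T_opt'] > 0.99 * T_upper]
print "%d of %d optimisation runs ended within 1%% of the upper bound on T" % (len(at_bound), len(df))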
In [24]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_par
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/PAR_expgrowth" # set file name stub for opt. result files
fixed_params = None
In [25]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 4]
lower_bound = [1e-4, 0]
In [26]:
# set starting values for perturbation
p0 = [1, 1]
ar_par = lbview.map(run_dadi, repeat(p0, 10))
In [27]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_expgrowth*dill"):
ar_par.append(dill.load(open(filename)))
In [28]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(15)
Out[28]:
In [29]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 8]
lower_bound = [1e-4, 0]
In [30]:
# set starting values for perturbation
p0 = [1, 1]
ar_par = lbview.map(run_dadi, repeat(p0, 10))
In [31]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_expgrowth*dill"):
ar_par.append(dill.load(open(filename)))
In [32]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(15)
Out[32]:
The time parameter is hitting the upper boundary that I set. The exponential growth model therefore cannot be fit to the PAR spectrum with Ludovic's correction applied.
In [33]:
dadi.Demographics1D.two_epoch?
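The two_epoch model also takes params = (nu, T): an instantaneous change to $\nu$ times the ancestral population size, $T \times 2N_{ref}$ generations in the past.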
In [34]:
%%px --local
func_ex = dadi.Numerics.make_extrap_log_func(dadi.Demographics1D.two_epoch)
In [35]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 4]
lower_bound = [1e-4, 0]
In [36]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_ery
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/ERY_twoEpoch" # set file name stub for opt. result files
fixed_params = None
In [37]:
# set starting values for perturbation
p0 = [1, 1]
ar_ery = lbview.map(run_dadi, repeat(p0, 10))
In [38]:
ar_ery = []
for filename in glob("OUT_1D_models/ERY_twoEpoch*dill"):
ar_ery.append(dill.load(open(filename)))
In [39]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[39]:
In [40]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 8]
lower_bound = [1e-4, 0]
In [41]:
# set starting values for perturbation
p0 = [1, 1]
ar_ery = lbview.map(run_dadi, repeat(p0, 10))
In [42]:
ar_ery = []
for filename in glob("OUT_1D_models/ERY_twoEpoch*dill"):
ar_ery.append(dill.load(open(filename)))
In [43]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[43]:
Still hitting the upper boundary on the time parameter. The two-epoch model cannot be fit to the ERY spectrum with Ludovic's correction applied.
In [44]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_par
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/PAR_twoEpoch" # set file name stub for opt. result files
fixed_params = None
In [45]:
p0 = [1, 1]
ar_par = lbview.map(run_dadi, repeat(p0, 10))
In [50]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_twoEpoch*dill"):
ar_par.append(dill.load(open(filename)))
In [51]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nu1_0', 'T_0', 'nu1_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[51]:
All runs are hitting the upper boundary on the time parameter. The two-epoch model cannot be fit to the PAR spectrum with Ludovic's correction applied.
In [52]:
dadi.Demographics1D.bottlegrowth?
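The bottlegrowth model takes params = (nuB, nuF, T): an instantaneous change to $\nu_B$ times the ancestral size, followed by exponential change towards a contemporary size of $\nu_F$, with the instantaneous change occurring $T \times 2N_{ref}$ generations ago.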
In [17]:
%%px --local
func_ex = dadi.Numerics.make_extrap_log_func(dadi.Demographics1D.bottlegrowth)
In [8]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 1e4, 8]
lower_bound = [1e-4, 1e-4, 0]
In [18]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_ery
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/ERY_bottlegrowth" # set file name stub for opt. result files
fixed_params = None
In [12]:
# set starting values for perturbation
p0 = [1, 1, 1]
#ar_ery = lbview.map(run_dadi, repeat(p0, 10))
In [10]:
ar_ery = []
for filename in glob("OUT_1D_models/ERY_bottlegrowth*dill"):
ar_ery.append(dill.load(open(filename)))
In [13]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'T_0', 'nuB_opt', 'nuF_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[13]:
In [10]:
# set starting values for perturbation
p0 = [55, 1.3, 1.5]
#ar_ery = lbview.map(run_dadi, repeat(p0, 10))
In [47]:
ar_ery = []
for filename in glob("OUT_1D_models/ERY_bottlegrowth*dill"):
ar_ery.append(dill.load(open(filename)))
In [48]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'T_0', 'nuB_opt', 'nuF_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[48]:
This looks like convergence.
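A quick numerical check of what convergence means here (a sketch, not part of the original run): the best optimisation runs should agree in negative log-likelihood to within a small tolerance.

best = df.sort_values(by='-logL', ascending=True)['-logL'].head(5)
print "spread of the 5 best -logL values: %.4g" % (best.max() - best.min())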
In [49]:
popt = np.array( df.sort_values(by='-logL', ascending=True).iloc[0, 3:6] )
popt
Out[49]:
In [50]:
# calculate best-fit model spectrum
model_spectrum = func_ex(popt, ns, pts_l)
In [51]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, fs_ery)
In [52]:
mu = 3e-9
L = fs_ery.data.sum()
print "The optimal value of theta per site for the ancestral population is {0:.4f}.".format(theta/L)
In [53]:
Nref = theta/L/mu/4
Nref
Out[53]:
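This conversion rests on dadi's parameter scaling: with a per-site mutation rate $\mu$ and $L$ sites, the population-scaled mutation rate is $\theta = 4 N_{ref} \mu L$, and all times are expressed in units of $2 N_{ref}$ generations, so that

$$N_{ref} = \frac{\theta}{4 \mu L}, \qquad t_{\text{generations}} = T \times 2 N_{ref}$$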
In [54]:
print "At time {0:,} generations ago, the ERY population size instantaneously increased by almost 55-fold (to {1:,}).".format(int(popt[2]*2*Nref), int(popt[0]*Nref))
I think such a high effective population size is not realistic.
In [17]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_par
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/PAR_bottlegrowth" # set file name stub for opt. result files
fixed_params = None
In [19]:
%%px --local
# set lower and upper bounds to nu1 and T
upper_bound = [1e4, 1e4, 6]
lower_bound = [1e-4, 1e-4, 0]
In [23]:
# set starting values for perturbation
p0 = [1, 1, 1]
#ar_par = lbview.map(run_dadi, repeat(p0, 10))
In [55]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_bottlegrowth*dill"):
ar_par.append(dill.load(open(filename)))
In [56]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'T_0', 'nuB_opt', 'nuF_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[56]:
In [29]:
cl[:]['maxiter'] = 100
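Note that dictionary-style assignment on the DirectView only pushes maxiter to the engines' namespaces; unlike %%px --local, it leaves the local variable untouched.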
In [30]:
# set starting values for perturbation
p0 = [100, 2, 1.2]
ar_par = lbview.map(run_dadi, repeat(p0, 10))
In [33]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_bottlegrowth*dill"):
ar_par.append(dill.load(open(filename)))
In [34]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'T_0', 'nuB_opt', 'nuF_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[34]:
This looks like convergence.
In [57]:
popt = np.array( df.sort_values(by='-logL', ascending=True).iloc[0, 3:6] )
popt
Out[57]:
In [58]:
# calculate best-fit model spectrum
model_spectrum = func_ex(popt, ns, pts_l)
In [59]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, fs_par)
In [60]:
mu = 3e-9
L = fs_par.data.sum()
print "The optimal value of theta per site for the ancestral population is {0:.4f}.".format(theta/L)
In [61]:
Nref = theta/L/mu/4
Nref
Out[61]:
In [62]:
print "At time {0:,} generations ago, the PAR population size instantaneously increased by almost 124-fold (to {1:,}).".format(int(popt[2]*2*Nref), int(popt[0]*Nref))
An effective population size of 36 million is obviously too high. I therefore cannot regard this model fit as successful.
In [66]:
dadi.Demographics1D.three_epoch?
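The three_epoch model takes params = (nuB, nuF, TB, TF): the population stays at $\nu_B$ times the ancestral size for a period of $T_B \times 2N_{ref}$ generations, then at $\nu_F$ for the final $T_F \times 2N_{ref}$ generations.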
In [94]:
%%px --local
func_ex = dadi.Numerics.make_extrap_log_func(dadi.Demographics1D.three_epoch)
In [95]:
%%px --local
# set lower and upper bounds to nuB, nuF, TB and TF
upper_bound = [1e4, 1e4, 6, 6]
lower_bound = [1e-4, 1e-4, 0, 0]
In [38]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_ery
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/ERY_threeEpoch" # set file name stub for opt. result files
fixed_params = None
In [72]:
# set starting values for perturbation
p0 = [10, 1, 1, 1]
#ar_ery = lbview.map(run_dadi, repeat(p0, 20))
In [73]:
ar_ery = []
for filename in glob("OUT_1D_models/ERY_threeEpoch*dill"):
ar_ery.append(dill.load(open(filename)))
In [74]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_ery]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'TB_0', 'TF_0', 'nuB_opt', 'nuF_opt', 'TB_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[74]:
Reasonable convergence, although divergent parameter combinations reach essentially the same likelihood. The optimal parameter values from optimisation runs 16 and 10 (adjacent in the table) show that quite different demographic scenarios can have almost identical likelihood. This is not unusual.
In [78]:
popt = np.array( df.sort_values(by='-logL', ascending=True).iloc[0, 4:8] )
popt
Out[78]:
In [80]:
# calculate best-fit model spectrum
model_spectrum = func_ex(popt, ns, pts_l)
In [81]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, fs_ery)
In [82]:
mu = 3e-9
L = fs_ery.data.sum()
print "The optimal value of theta per site for the ancestral population is {0:.4f}.".format(theta/L)
In [92]:
Nref = theta/L/mu/4
Nref
Out[92]:
In [93]:
print "At time {0:,} generations ago, the ERY population size instantaneously increased by almost 10-fold (to {1:,}).".format(int((popt[2]+popt[3])*2*Nref), int(popt[0]*Nref)),
print "It then kept this population constant for {0:,} generations.".format(int(popt[2]*2*Nref)),
print "At time {0:,} generations in the past, the ERY population then decreased to 1.3 fold of the ancient population size or {1:,}.".format(int(popt[3]*2*Nref), int(popt[1]*Nref))
In [100]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = fs_ery.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_par
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "MODIFIED_SPECTRA/OUT_1D_models/PAR_threeEpoch" # set file name stub for opt. result files
fixed_params = None
In [10]:
# set starting values for perturbation
p0 = [100, 2, 1, 1]
#ar_par = lbview.map(run_dadi, repeat(p0, 20))
In [11]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_threeEpoch*dill"):
ar_par.append(dill.load(open(filename)))
In [12]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'TB_0', 'TF_0', 'nuB_opt', 'nuF_opt', 'TB_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[12]:
There is no convergence. The three-epoch model could not be fit to the PAR spectrum with Ludovic's correction applied.
In [104]:
%%px --local
fold = 1
maxiter = 300
In [105]:
# set starting values for perturbation
p0 = [20, 1e-2, 0.8, 1e-3]
ar_par = lbview.map(run_dadi, repeat(p0, 10))
In [107]:
ar_par = []
for filename in glob("OUT_1D_models/PAR_threeEpoch*dill"):
ar_par.append(dill.load(open(filename)))
In [109]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_par]
df = pd.DataFrame(data=returned, \
columns=['nuB_0', 'nuF_0', 'TB_0', 'TF_0', 'nuB_opt', 'nuF_opt', 'TB_opt', 'TF_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(20)
Out[109]:
The estimates of $\nu_F$ and $T_F$ are very close to the lower parameter boundary. Very different demographic scenarios have almost identical likelihood. Reliable parameter estimation therefore does not seem possible, but an ancient population size increase that is then held constant for a long time is inferred consistently.
In [ ]: