In [1]:
from ipyparallel import Client

cl = Client()

cl.ids


Out[1]:
[0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [37]:
%%px --local

# run whole cell on all engines as well as in the local IPython session

import numpy # dadi refers to numpy by its full name, not np

import sys

sys.path.insert(0, '/home/claudius/Downloads/dadi')

import dadi

I am going to use the unfolded spectra estimated with ANGSD and fold them in dadi.
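As a toy illustration of what folding does (a sketch, not part of the analysis; dadi was imported in the cell above): folding sums the entries for derived allele counts $i$ and $n-i$ and masks the now redundant upper half of the spectrum.

fs_toy = dadi.Spectrum([0, 10, 4, 2, 1, 0]) # hypothetical unfolded spectrum for n = 5 sampled alleles
print fs_toy.fold() # entries i and n-i are summed; the upper half is masked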


In [3]:
%%px --local

# import 1D spectrum of ery on all engines:
fs_ery = dadi.Spectrum.from_file('ERY.unfolded.sfs').fold()

# import 1D spectrum of par on all engines:
fs_par = dadi.Spectrum.from_file('PAR.unfolded.sfs').fold()

Define the exponential growth, then bottleneck, model function.


In [15]:
%psource dadi.Demographics1D.growth

In [23]:
%%px --local

def expGrowth_bottleneck(params, ns, pts):
    """
    Exponential growth followed by an instantaneous size change.

    params = (nuB,TB,nuF,TF)
    ns = (n1,)

    nuB: Ratio of population size after exponential growth to ancient
         population size
    nuF: Ratio of contemporary to ancient population size
    TB: Time during which exponential growth happened
       (in units of 2*Na generations) 
    TF: Time in the past at which the instantaneous size change happened (after exp. growth)
    n1: Number of samples in resulting Spectrum
    pts: Number of grid points to use in integration.
    """
    nuB,TB,nuF,TF = params

    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)

    nu_func = lambda t: numpy.exp(numpy.log(nuB) * t/TB) # exponential change from 1 toward nuB over TB
    phi = dadi.Integration.one_pop(phi, xx, TB, nu_func)
    
    phi = dadi.Integration.one_pop(phi, xx, TF, nuF)

    fs = dadi.Spectrum.from_phi(phi, ns, (xx,))
    return fs

This model specifies exponential growth or decline toward $\nu_B$ over a period $T_B$, after which the population undergoes an instantaneous size change to the contemporary size $\nu_F$ (a ratio with $N_{ref}$), at which it remains for time $T_F$.
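To make the parameterisation concrete, here is a sketch of the relative population size $\nu(t)$ that the model integrates over, with hypothetical parameter values and $t$ running forward from the start of the growth phase:

nuB, TB, nuF, TF = 2.0, 0.5, 0.5, 0.3 # hypothetical values for illustration

def nu_traj(t):
    """Relative population size at time t (in units of 2*Na generations)."""
    if t < TB:
        return numpy.exp(numpy.log(nuB) * t/TB) # exponential change from 1 toward nuB
    else:
        return nuF # constant at nuF until the present (t = TB + TF)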


In [24]:
%%px --local

# create link to function that specifies the model
func = expGrowth_bottleneck

# create extrapolating version of the model function
func_ex = dadi.Numerics.make_extrap_log_func(func)

In [33]:
%%px

# set up global variables on engines required for run_dadi function call

dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_ery # use ERY spectrum
perturb = True
fold = 2 # perturb initial values randomly by up to a factor of 4 (i.e. 2**fold)
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_expGrowth_bottleneck/ERY_perturb" # set file name stub for opt. result files

# set lower and upper bounds to nu and T
upper_bound = [1e4, 5, 1e4, 5]
lower_bound = [1e-4, 0, 1e-4, 0]

ns = fs_ery.sample_sizes # both populations have the same sample size

fs_ery.pop_ids = ['ery']
fs_par.pop_ids = ['par']

# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
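As a point of reference, a single model evaluation under these settings would look like this (a sketch, assuming the variables above are also defined locally; ll_multinom is dadi's multinomial log-likelihood, and the -logL values reported below are its negative):

model = func_ex([1, 1, 1, 1], ns, pts_l) # expected SFS under neutral parameter values
ll = dadi.Inference.ll_multinom(model, fs_ery)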

In [26]:
lbview = cl.load_balanced_view()

In [27]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
    """
    p_init: initial parameter values to run optimisation from
    """
    if perturb:
        p_init = dadi.Misc.perturb_params(p_init, fold=fold,
                                          upper_bound=upper_bound, lower_bound=lower_bound)
        # note: fold, upper_bound and lower_bound must exist in the namespace of each engine
    # run optimisation of parameters
    popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
                                   lower_bound=lower_bound, upper_bound=upper_bound, \
                                   verbose=verbose, maxiter=maxiter, full_output=full_output)
    # pickle to file
    import dill
    name = outname[:] # make copy of file name stub!
    for p in p_init:
        name += "_%.4f" % (p)
    with open(name + ".dill", "w") as fh:
        dill.dump((p_init, popt), fh)
    
    return p_init, popt
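Because run_dadi takes a single argument, it can be mapped over a sequence of starting values; everything else it needs (sfs, func_ex, pts_l, the bounds) must already sit in the namespace of each engine. A serial equivalent of the parallel map below would be (a sketch):

p_init, popt = run_dadi([1, 1, 1, 1]) # one optimisation run from (perturbed) neutral starting values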

In [28]:
from itertools import repeat

In [38]:
# neutral initial parameter values; run_dadi will perturb them
p0 = [1, 1, 1, 1]

ar_ery_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)

In [40]:
def get_flag_count(out, NM=True):
    """
    out: list of tuples, each containing p_init and popt + additional info, including warnflags,
    as produced by run_dadi
    """
    from collections import defaultdict
    
    if NM: # if ar from Nelder-Mead
        i = 4 # the warnflag is reported at index position 4 in the output array
    else: # ar from BFGS optimisation
        i = 6
    
    warnflag = defaultdict(int)

    for res in out:
        if res[1][i] == 1: # res[1] is the optimiser output tuple (res[0] is p_init)
            warnflag[1] +=1
        elif res[1][i] == 2:
            warnflag[2] += 1
        elif res[1][i] == 0:
            warnflag[0] += 1
        else:
            warnflag[999] +=1
    if NM:
        print "success", warnflag[0]
        print "Maximum number of function evaluations made.", warnflag[1]
        print "Maximum number of iterations reached.", warnflag[2]
        print "unknown flag", warnflag[999]
    else:
        print "success", warnflag[0]
        print "Maximum number of iterations exceeded.", warnflag[1]
        print "Gradient and/or function calls not changing.", warnflag[2]
        print "unknown flag", warnflag[999]

In [41]:
get_flag_count(ar_ery_1, NM=True)


success 1
Maximum number of function evaluations made. 0
Maximum number of iterations reached. 9
unknown flag 0

In [42]:
def flatten(array):
    """
        Returns a list of flattened elements of every inner lists (or tuples)
        ****RECURSIVE****
    """
    import numpy
    res = []
    for el in array:
        if isinstance(el, (list, tuple, numpy.ndarray)):
            res.extend(flatten(el))
            continue
        res.append(el)
    return res
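For example, a quick check of the helper:

print flatten([1, (2, 3), [4, [5, 6]]]) # -> [1, 2, 3, 4, 5, 6]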

In [51]:
success = [flatten(out)[:9] for out in ar_ery_1 if out[1][4] == 0]

In [52]:
import pandas as pd

In [53]:
df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[53]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
0 1.263052 0.865659 0.340853 0.347439 1.384605 0.803309 0.289043 1.267706 2167.753201

In [54]:
print success[0][4:8]


[1.3846053356142007, 0.8033093178294235, 0.28904322070927396, 1.2677064459192211]

In [55]:
# perturb previous optimal parameter combination
p0 = success[0][4:8]

ar_ery_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)

In [56]:
get_flag_count(ar_ery_1, NM=True)


success 4
Maximum number of function evaluations made. 0
Maximum number of iterations reached. 6
unknown flag 0

In [57]:
success = [flatten(out)[:9] for out in ar_ery_1 if out[1][4] == 0]

df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[57]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
3 1.382766 0.241601 0.077028 0.342455 1.386752 0.220245 0.063039 0.384148 2167.753189
0 1.059698 0.929504 0.091952 0.788598 1.063068 0.927393 0.149945 0.767699 2167.753192
2 0.990318 0.428491 0.242318 0.779877 0.990106 0.434801 0.161338 0.807679 2167.753193
1 0.563572 2.015404 0.102770 3.277319 0.565273 2.002220 0.089449 3.131931 2172.825525

The optimised parameter values do not deviate very much from the initial parameter values.


In [58]:
# specify the initial parameter values; they will be randomly perturbed by up to a factor of 4
p0 = [100, 0.01, 0.1, 1] # quick exp. growth, then long period of small size

ar_ery_2 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)

In [60]:
success = [flatten(out)[:9] for out in ar_ery_2 if out[1][4] == 0]

df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[60]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
2 288.897957 0.006640 0.293071 0.649314 452.463434 0.003161 0.165810 0.825642 2167.753193
1 33.888463 0.009623 0.053050 0.887303 49.583040 0.007919 0.176998 0.868265 2167.753193
3 78.930914 0.004743 0.076432 1.054879 194.383486 0.002071 0.232022 1.057746 2167.753197
0 286.169004 0.009809 0.041980 1.157481 589.244354 0.003207 0.267345 1.168653 2167.753200

In [61]:
# specify the initial parameter values; they will be randomly perturbed by up to a factor of 4
p0 = [0.01, 0.01, 10, 1] # quick exp. decline, then long period of increased size

ar_ery_3 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)

In [62]:
success = [flatten(out)[:9] for out in ar_ery_3 if out[1][4] == 0]

df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[62]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
0 0.003391 0.014814 14.217875 1.071744 0.01598 0.000869 0.260179 1.127289 2167.753199

In all successful optimisations above, $\nu_F$, the ratio of contemporary population size to ancient population size, converges to a value below 1/3.


In [63]:
# perturb previous optimal parameter combination
p0 = success[0][4:8]

ar_ery_4 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)

In [64]:
get_flag_count(ar_ery_4, NM=True)


success 2
Maximum number of function evaluations made. 0
Maximum number of iterations reached. 8
unknown flag 0

In [65]:
success = [flatten(out)[:9] for out in ar_ery_4 if out[1][4] == 0]

df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[65]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
0 0.007491 0.000671 0.073788 0.316113 0.005604 0.000627 0.059733 0.35875 2167.753189
1 0.012323 0.001271 0.084668 3.136494 0.012072 0.001157 0.085324 3.24476 2172.825525

In [73]:
# save optimisation results to file

import dill

optout = []

for ar_ery in (ar_ery_1, ar_ery_2, ar_ery_3, ar_ery_4):
    optout.extend(list(ar_ery.get()))

dill.dump(optout, open("OUT_expGrowth_bottleneck/ERY_perturb_ar_ery.dill", "w"))

parallelus


In [74]:
%%px

# set up global variables on engines required for run_dadi function call

dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = fs_par # use PAR spectrum
perturb = True
fold = 2 # perturb initial values randomly by up to a factor of 4 (i.e. 2**fold)
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_expGrowth_bottleneck/PAR_perturb" # set file name stub for opt. result files

In [75]:
p0 = [1, 1, 1, 1]

ar_par_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)

In [76]:
%ll OUT_expGrowth_bottleneck/PAR*


-rw-rw-r-- 1 claudius 314 Apr 24 11:55 OUT_expGrowth_bottleneck/PAR_perturb_0.3903_0.9819_1.3026_0.2697.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:41 OUT_expGrowth_bottleneck/PAR_perturb_0.4756_0.7191_0.4371_0.5403.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:38 OUT_expGrowth_bottleneck/PAR_perturb_0.6027_2.3375_0.5953_1.6260.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:37 OUT_expGrowth_bottleneck/PAR_perturb_0.8309_0.4067_0.3150_2.8543.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:37 OUT_expGrowth_bottleneck/PAR_perturb_1.0791_0.3373_0.5508_0.9630.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:37 OUT_expGrowth_bottleneck/PAR_perturb_1.2331_0.2817_1.9411_1.4572.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:37 OUT_expGrowth_bottleneck/PAR_perturb_1.7138_0.7279_0.7608_0.3195.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:45 OUT_expGrowth_bottleneck/PAR_perturb_2.1220_0.3901_0.5765_1.1659.dill
-rw-rw-r-- 1 claudius 314 Apr 24 11:38 OUT_expGrowth_bottleneck/PAR_perturb_3.7093_0.2654_0.2896_0.8007.dill

In [77]:
ar_par_1 = []

import glob

for filename in glob.glob("OUT_expGrowth_bottleneck/PAR*"):
    ar_par_1.append(dill.load(open(filename)))

In [78]:
get_flag_count(ar_par_1, NM=True)


success 2
Maximum number of function evaluations made. 0
Maximum number of iterations reached. 7
unknown flag 0

In [79]:
success = [flatten(out)[:9] for out in ar_par_1 if out[1][4] == 0]

df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[79]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
1 3.709310 0.265380 0.289571 0.800686 0.614397 0.052903 0.018110 1.197595 6929.444306
0 0.830906 0.406732 0.314983 2.854260 0.844383 0.541158 0.151917 4.945685 6929.444306

In [82]:
success[1][4:8]


Out[82]:
[0.61439714659107336,
 0.052903193195409676,
 0.018109892933906826,
 1.197594561232507]

In [83]:
p0 = success[1][4:8]

ar_par_2 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)

In [84]:
get_flag_count(ar_par_2, NM=True)


success 10
Maximum number of function evaluations made. 0
Maximum number of iterations reached. 0
unknown flag 0

In [85]:
success = [flatten(out)[:9] for out in ar_par_2 if out[1][4] == 0]

df = pd.DataFrame(data=success, \
                  columns=['nuB_0','TB_0', 'nuF_0', 'TF_0', 'nuB_opt', 'TB_opt', 'nuF_opt', 'TF_opt', '-logL'])

df.sort_values(by='-logL', ascending=True)


Out[85]:
nuB_0 TB_0 nuF_0 TF_0 nuB_opt TB_opt nuF_opt TF_opt -logL
3 0.472403 0.070881 0.011387 1.288252 0.469248 0.072113 0.008440 1.280387 6929.444306
6 1.768930 0.036260 0.042468 0.370434 1.776229 0.035834 0.007350 0.503654 6929.444306
0 0.218420 0.156013 0.020085 0.364053 0.229155 0.156113 0.006889 0.410683 6929.444306
4 1.358809 0.154517 0.008247 2.280554 1.385853 0.144384 0.011778 2.240702 6929.444306
7 0.732430 0.016699 0.033366 0.677204 0.728324 0.018477 0.016094 0.702511 6929.444306
1 0.218059 0.181408 0.026684 1.834435 0.213655 0.184490 0.022174 1.819735 6929.444306
2 0.459281 0.034909 0.015712 0.441546 0.479841 0.037059 0.010306 0.448867 6929.444306
5 1.197897 0.020329 0.023638 2.330702 1.199745 0.018330 0.018651 2.246834 6929.444306
9 1.903343 0.019700 0.034356 0.722134 1.901370 0.018326 0.017947 0.745342 6929.444306
8 1.316753 0.098304 0.017726 0.607624 1.316912 0.097550 0.014601 0.608083 6929.444306

Again, note that the inferred optimal parameter values only deviate slightly from the initial values and that all optimal parameter combinations have the same likelihood.
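The likelihood tie can also be checked programmatically (a quick sanity-check sketch on the data frame above):

print df['-logL'].round(4).unique() # a single value up to rounding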


In [86]:
p0 = [0.01, 0.01, 10, 1] # quick exp. decline, then long period of increased size

ar_par_3 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)

In [87]:
get_flag_count(ar_par_3, NM=True)


success 0
Maximum number of function evaluations made. 0
Maximum number of iterations reached. 10
unknown flag 0

The initial parameter values specified above (and then perturbed) are too far away from an optimum.


In [94]:
optout = []

# ar_par_1 was reloaded from the .dill files above; ar_par_2 is still an AsyncMapResult
# (ar_par_3 contained no successful optimisations and is not saved)
for ar_par in (ar_par_1, list(ar_par_2.get())):
    optout.extend(ar_par)
    
dill.dump(optout, open("OUT_expGrowth_bottleneck/PAR_perturb_ar_ery.dill", "w"))

In [ ]: