In [1]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[1]:
In [2]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy as np
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [3]:
%ll dadiExercises/
In [9]:
%less dadiExercises/EryPar.unfolded.2dsfs.dadi_format
In [3]:
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
In [3]:
%page sfs2d_unfolded
In [4]:
sfs2d_unfolded.sample_sizes
Out[4]:
In [5]:
# add population labels
sfs2d_unfolded.pop_ids = ["ery", "par"]
For the estimation of the 2D SFS, realSFS has only taken sites that had data from at least 9 individuals in each population (see assembly.sh, lines 1430 onwards).
In [6]:
# print total number of sites from which the spectrum was estimated
# i. e. also invariant sites
sfs2d_unfolded.data.sum()
Out[6]:
In [7]:
# print number of segregating sites in the SFS
sfs2d_unfolded.S()
Out[7]:
The 2D spectrum contains counts from 60k sites that are variable in par or ery or both.
In [8]:
# percent variable sites
sfs2d_unfolded.S()/sfs2d_unfolded.data.sum()*100
Out[8]:
In [6]:
import pylab
%matplotlib inline
pylab.rcParams['font.size'] = 14.0
pylab.rcParams['figure.figsize'] = [10.0, 8.0]
In [7]:
dadi.Plotting.plot_single_2d_sfs(sfs2d_unfolded, vmin=1, cmap='jet')
Out[7]:
Cells with counts below 1 are masked and appear white in the upper plot.
In [22]:
%psource dadi.Plotting.plot_single_2d_sfs
In [13]:
# What would the SFS look like if the samples were taken from a panmictic population?
dadi.Plotting.plot_single_2d_sfs(sfs2d_unfolded.scramble_pop_ids(), vmin=1, cmap='jet')
Out[13]:
In [59]:
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
In [8]:
# get genome-wide Weir & Cockerham's Fst from 2D SFS
sfs2d_unfolded.Fst()
Out[8]:
I would like to get Fst estimates for subsets of sites with a certain minor allele frequency in ERY or in PAR (see Bhatia2013).
In [43]:
import numpy as np
# initialise an empty 2D array
np.zeros((37, 37))
Out[43]:
In [64]:
# initialise an empty 2D array
x = np.zeros(sfs2d_unfolded.sample_sizes+1)
x
Out[64]:
In [9]:
# test Fst calculation for MAF class
# extract sites that correspond to MAF in ERY
x = np.zeros(sfs2d_unfolded.sample_sizes+1)
n = len(x[1])-1
i = 1
x[i] = sfs2d_unfolded.data[i].copy()
x[(n-i)] = sfs2d_unfolded.data[(n-i)].copy()
# create new Spectrum object
x_one = dadi.Spectrum(data=x, mask=sfs2d_unfolded.mask, data_folded=False, pop_ids=['ery', 'par'])
# plot the spectrum of subset sites
dadi.Plotting.plot_single_2d_sfs(x_one, vmin=1, cmap='jet')
# calculate Fst for subset sites
x_one.Fst()
Out[9]:
In [10]:
Fst_by_MAF_ery = np.zeros((18))
Fst_by_MAF_ery
Out[10]:
In [11]:
# get Fst by MAF in ERY from 1..18
n = len(x[1])-1
for i in range(len(Fst_by_MAF_ery)):
    j = i+1
    x = np.zeros(sfs2d_unfolded.sample_sizes+1)
    x[j] = sfs2d_unfolded.data[j].copy()
    x[(n-j)] = sfs2d_unfolded.data[(n-j)].copy()
    # create new Spectrum object
    y = dadi.Spectrum(data=x, mask=sfs2d_unfolded.mask, data_folded=False, pop_ids=['ery', 'par'])
    Fst_by_MAF_ery[i] = y.Fst()
In [12]:
# Fst by MAF in ERY
pylab.plot(range(1, 19), Fst_by_MAF_ery)
pylab.plot(range(1, 19), [sfs2d_unfolded.Fst()] * 18)
pylab.ylim([0, 0.4])
Out[12]:
In [13]:
# get Fst by MAF in PAR
Fst_by_MAF_par = np.zeros((18))
n = len(x[1])-1
for i in range(len(Fst_by_MAF_par)):
    j = i+1
    x = np.zeros(sfs2d_unfolded.sample_sizes+1)
    x[j] = sfs2d_unfolded.data.transpose()[j].copy() # note the transpose
    x[(n-j)] = sfs2d_unfolded.data.transpose()[(n-j)].copy() # transpose here as well
    # create new Spectrum object
    y = dadi.Spectrum(data=x, mask=sfs2d_unfolded.mask, data_folded=False, pop_ids=['ery', 'par'])
    Fst_by_MAF_par[i] = y.Fst()
In [123]:
pylab.rcParams['font.size'] = 16
pylab.plot(range(1, 19), Fst_by_MAF_ery, "o-", label="ERY")
pylab.plot(range(1, 19), Fst_by_MAF_par, "s-", label="PAR")
pylab.plot(range(1, 19), [sfs2d_unfolded.Fst()] * 18, label="all sites")
pylab.ylim([0, 0.5])
pylab.xlim([0,19])
pylab.xlabel('minor allele frequency')
pylab.ylabel(r'$F_{ST}$')
pylab.grid()
pylab.legend()
pylab.savefig("Fst_by_MAF.png")
This does not show a clear trend for a reduction in Fst with lower MAF.
Now I would like to get Fst by MAF for 2D spectra simulated with recent bottleneck or recent expansion in ERY or PAR.
In [ ]:
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
ns = sfs2d_unfolded.sample_sizes
In [21]:
def test_models(params, ns, pts):
    nu1_1,nu2_1,nu1_2,nu2_2,T1,T2 = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    # period of constant population size (T1)
    phi = dadi.Integration.two_pops(phi, xx, T1, nu1_1, nu2_1, m12=0, m21=0)
    # period of exponential population size change (T2)
    nu1_func = lambda t: nu1_1 * (nu1_2/nu1_1)**(t/T2)
    nu2_func = lambda t: nu2_1 * (nu2_2/nu2_1)**(t/T2)
    phi = dadi.Integration.two_pops(phi, xx, T2, nu1_func, nu2_func, m12=0, m21=0)
    fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
    return fs
In [22]:
# create extrapolating version of the model function
func_ex = dadi.Numerics.make_extrap_log_func(test_models)
In [84]:
# calculate model spectrum
nu1_1 = 0.6
nu2_1 = 1.2
nu1_2 = nu1_1/10 # ERY final population size ratio
nu2_2 = 1.2
T1 = 0.29 - 0.015
T2 = 0.015 # time of beginning of bottleneck
model_spectrum = func_ex((nu1_1,nu2_1,nu1_2,nu2_2,T1,T2), ns, pts_l)
In [19]:
0.3/20
Out[19]:
In [85]:
# scale the model spectrum to the observed spectrum
ery_bottl_mod = dadi.Inference.optimally_scaled_sfs(model_spectrum, sfs2d_unfolded)
In [86]:
dadi.Plotting.plot_single_2d_sfs(ery_bottl_mod, vmin=1, cmap='jet', pop_ids=['ery', 'par'])
Out[86]:
In [87]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d_unfolded)
mu = 3e-9
L = sfs2d_unfolded.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
#print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ERY and PAR (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
print "The ancestral population of ERY and PAR split apart {0:,} generations ago.".format(int((T1+T2)*2*N_ref)),
print "Since the split until {0:,} generations ago the ERY population had a constant size of {1:,} and the PAR population of {2:,}.".format(int(T2*2*N_ref), int(nu1_1*N_ref), int(nu2_1*N_ref)),
print "{0:,} generations ago, the ERY population began to decreased exponentially to a current population size of {1:,}.".format(int(T2*2*N_ref), int(nu1_2*N_ref))
In [88]:
# calculate model spectrum
nu1_1 = 0.6
nu2_1 = 1.2
nu1_2 = 0.6
nu2_2 = nu2_1/10 # PAR final population size ratio
T1 = 0.29 - 0.015
T2 = 0.015 # time of beginning of bottleneck
model_spectrum = func_ex((nu1_1,nu2_1,nu1_2,nu2_2,T1,T2), ns, pts_l)
In [89]:
# scale model
par_bottl_mod = dadi.Inference.optimally_scaled_sfs(model_spectrum, sfs2d_unfolded)
# plot model
dadi.Plotting.plot_single_2d_sfs(par_bottl_mod, vmin=1, cmap='jet', pop_ids=['ery', 'par'])
Out[89]:
In [90]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d_unfolded)
mu = 3e-9
L = sfs2d_unfolded.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
#print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ERY and PAR (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
print "The ancestral population of ERY and PAR split apart {0:,} generations ago.".format(int((T1+T2)*2*N_ref)),
print "Since the split until {0:,} generations ago the ERY population had a constant size of {1:,} and the PAR population of {2:,}.".format(int(T2*2*N_ref), int(nu1_1*N_ref), int(nu2_1*N_ref)),
print "{0:,} generations ago, the PAR population began to decreased exponentially to a current population size of {1:,}.".format(int(T2*2*N_ref), int(nu2_2*N_ref))
In [91]:
# calculate model spectrum
nu1_1 = 0.6
nu2_1 = 1.2
nu1_2 = nu1_1*10.0 # ERY final population size ratio
nu2_2 = 1.2
T1 = 0.29 - 0.015
T2 = 0.015 # time of beginning of expansion
model_spectrum = func_ex((nu1_1,nu2_1,nu1_2,nu2_2,T1,T2), ns, pts_l)
In [92]:
# scale model
ery_exp_mod = dadi.Inference.optimally_scaled_sfs(model_spectrum, sfs2d_unfolded)
# plot model
dadi.Plotting.plot_single_2d_sfs(ery_exp_mod, vmin=1, cmap='jet', pop_ids=['ery', 'par'])
Out[92]:
In [93]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d_unfolded)
mu = 3e-9
L = sfs2d_unfolded.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
# print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
print "The ancestral population of ERY and PAR split apart {0:,} generations ago.".format(int((T1+T2)*2*N_ref)),
print "Since the split until {0:,} generations ago the ERY population had a constant size of {1:,} and the PAR population of {2:,}.".format(int(T2*2*N_ref), int(nu1_1*N_ref), int(nu2_1*N_ref)),
print "{0:,} generations ago, the ERY population began to increase exponentially to a current population size of {1:,}.".format(int(T2*2*N_ref), int(nu1_2*N_ref))
In [94]:
# calculate model spectrum
nu1_1 = 0.6
nu2_1 = 1.2
nu1_2 = 0.6
nu2_2 = nu2_1*10.0 # PAR final population size ratio
T1 = 0.29 - 0.015
T2 = 0.015 # time of beginning of expansion
model_spectrum = func_ex((nu1_1,nu2_1,nu1_2,nu2_2,T1,T2), ns, pts_l)
In [95]:
# scale model
par_exp_mod = dadi.Inference.optimally_scaled_sfs(model_spectrum, sfs2d_unfolded)
# plot model
dadi.Plotting.plot_single_2d_sfs(par_exp_mod, vmin=1, cmap='jet', pop_ids=['ery', 'par'])
Out[95]:
In [96]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d_unfolded)
mu = 3e-9
L = sfs2d_unfolded.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
# print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
print "The ancestral population of ERY and PAR split apart {0:,} generations ago.".format(int((T1+T2)*2*N_ref))
print "Since the split until {0:,} generations ago the ERY population had a size of {1:,} and the PAR population of {2:,}.".format(int(T2*2*N_ref), int(nu1_1*N_ref), int(nu2_1*N_ref))
print "{0:,} generations ago, the PAR population began to increase exponentially to a current population size of {1:,}.".format(int(T2*2*N_ref), int(nu2_2*N_ref))
In [97]:
def get_Fst_by_Maf(model_spectrum):
    # get Fst by MAF in ERY
    Fst_by_MAF_ery = np.zeros((18))
    n = model_spectrum.sample_sizes[0]
    for i in range(len(Fst_by_MAF_ery)):
        j = i+1
        x = np.zeros(model_spectrum.sample_sizes+1)
        x[j] = model_spectrum.data[j].copy()
        x[(n-j)] = model_spectrum.data[(n-j)].copy()
        # create new Spectrum object
        y = dadi.Spectrum(data=x, mask=sfs2d_unfolded.mask, data_folded=False, pop_ids=['ery', 'par'])
        Fst_by_MAF_ery[i] = y.Fst()
    # get Fst by MAF in PAR
    Fst_by_MAF_par = np.zeros((18))
    n = model_spectrum.sample_sizes[1]
    for i in range(len(Fst_by_MAF_par)):
        j = i+1
        x = np.zeros(model_spectrum.sample_sizes+1)
        x[j] = model_spectrum.data.transpose()[j].copy() # note the transpose
        x[(n-j)] = model_spectrum.data.transpose()[(n-j)].copy() # transpose here as well
        # create new Spectrum object
        y = dadi.Spectrum(data=x, mask=sfs2d_unfolded.mask, data_folded=False, pop_ids=['ery', 'par'])
        Fst_by_MAF_par[i] = y.Fst()
    return (Fst_by_MAF_ery, Fst_by_MAF_par)
In [124]:
models = [ery_bottl_mod, par_bottl_mod, ery_exp_mod, par_exp_mod]
titles = ['Bottleneck in ERY', 'Bottleneck in PAR', 'Expansion in ERY', 'Expansion in PAR']
pylab.rcParams['figure.subplot.hspace'] = 0.3
pylab.rcParams['figure.subplot.wspace'] = 0.2
pylab.rcParams['font.size'] = 16
pylab.figure(figsize=[14, 14])
for i in range(4):
    ery, par = get_Fst_by_Maf(models[i])
    pylab.subplot(2,2,i+1)
    pylab.plot(range(1, 19), ery, "o-", label="ERY")
    pylab.plot(range(1, 19), par, "s-", label="PAR")
    pylab.plot(range(1, 19), [models[i].Fst()] * 18, label="all sites")
    pylab.ylim([0, 0.5])
    pylab.xlim([0,19])
    pylab.xlabel('minor allele frequency')
    pylab.ylabel(r'$F_{ST}$')
    pylab.grid()
    pylab.legend()
    pylab.title(titles[i])
pylab.savefig("Fst_by_MAF_4Models.png")
This clearly shows the effects of strong bottlenecks or population expansions on the distribution of Fst by MAF.
In [26]:
# get Fst from resampled spectrum
sfs2d_unfolded.scramble_pop_ids().Fst()
# note: the scrambling is deterministic, i. e. it always yields the same result
Out[26]:
In [50]:
%psource sfs2d_unfolded.Fst
I have permuted population labels of individual BAM files and calculated a 2D SFS for each of 100 permutations (see assembly.sh, line 1502 onwards).
In [5]:
from glob import glob
# read in 100 2D SFS with permuted population labels
permut_sfs2d = [dadi.Spectrum.from_file(sfs) for sfs in glob("../ANGSD/FST/Bootstrap/*dadi")]
In [6]:
# calculate Weir & Cockerham's Fst for the permuted SFS's
permut_Fst = [sfs.Fst() for sfs in permut_sfs2d]
In [18]:
# plot a histogram of the permuted Fst's
pylab.hist(permut_Fst, bins=30, range=[0.0, 0.1])
pylab.xlabel(r'$F_{ST}$')
pylab.ylabel('permutation resample')
pylab.title('empirical null distribution of ' + r'$F_{ST}$')
pylab.text(0.06, 27, '100 permutations of population label')
pylab.savefig("Fst_dadi_perm_null_dist.png")
In [13]:
pylab.text?
In [49]:
# get the median Fst from permuted SFS's
pylab.median(permut_Fst)
Out[49]:
I would have expected the median Fst from the permuted SFS's to be zero. Instead, there seems to be a positive bias in Fst of about 0.037. I have found a bias of very similar magnitude with Bhatia's Fst as estimated by realSFS.
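Given this empirical null distribution, the observed Fst can be compared against it directly. A minimal sketch (assuming sfs2d_unfolded and permut_Fst from the cells above are still in the namespace):
In [ ]:
# sketch: empirical one-sided p-value of the observed Fst against the permutation null
obs_Fst = sfs2d_unfolded.Fst()
p_emp = sum(1 for f in permut_Fst if f >= obs_Fst) / float(len(permut_Fst))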
In [8]:
sfs2d_folded = sfs2d_unfolded.fold()
In [13]:
# plot the folded GLOBAL minor allele frequency spectrum
dadi.Plotting.plot_single_2d_sfs(sfs2d_folded, vmin=1, cmap='jet')
Out[13]:
In [9]:
# push sfs to remote engines
cl[:].push(dict(sfs2d_folded=sfs2d_folded))
Out[9]:
In [16]:
sfs2d_folded.marginalize?
In [14]:
sfs2d_folded.pop_ids
Out[14]:
In [15]:
# get marginal spectra
par_sfs_marg = sfs2d_unfolded.marginalize([0])
ery_sfs_marg = sfs2d_unfolded.marginalize([1])
In [16]:
pylab.plot(par_sfs_marg.fold(), 'go-', label='PAR')
pylab.plot(ery_sfs_marg.fold(), 'rs-', label='ERY')
pylab.xlabel('minor allele frequency')
pylab.ylabel('SNP count')
pylab.title('marginal spectra')
pylab.grid()
pylab.legend()
Out[16]:
These are the 1D spectra for ERY and PAR from sites where both populations had reads from at least 9 individuals.
In [17]:
print "The number of sites in each marginal spectrum is: {0:,}".format(int(np.sum(sfs2d_unfolded.data)))
In [18]:
# number of segregating sites in the PAR spectrum
par_sfs_marg.S()
Out[18]:
In [19]:
# number of segregating sites in the ERY spectrum
ery_sfs_marg.S()
Out[19]:
The PAR spectrum contains 10k more SNP's than the ERY spectrum.
In [20]:
par_sfs_marg.pi()
Out[20]:
In [21]:
ery_sfs_marg.pi()
Out[21]:
In [22]:
# calculate pi_site for ERY
ery_sfs_marg.pi()/sfs2d_unfolded.data.sum()
Out[22]:
In [23]:
# calculate pi_site for PAR
par_sfs_marg.pi()/sfs2d_unfolded.data.sum()
Out[23]:
These two values of $\pi_{site}$ are practically the same as their equivalents calculated from 1D unfolded spectra estimated from all sites with reads in at least 9 individuals in the focal population (i. e. including non-overlapping sites):
In [1]:
import numpy as np
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [24]:
# import 1D unfolded spectrum for each pop
ery_unfolded = dadi.Spectrum.from_file('ERY.unfolded.sfs')
par_unfolded = dadi.Spectrum.from_file('PAR.unfolded.sfs')
In [25]:
print "Size of ERY 1D spectrum: {0:,}.".format(int(np.sum(ery_unfolded.data)))
print "Size of PAR 1D spectrum: {0:,}.".format(int(np.sum(par_unfolded.data)))
In [27]:
print "{0:.6f}".format(ery_unfolded.pi()/ery_unfolded.data.sum())
In [28]:
print "{0:.6f}".format(par_unfolded.pi()/par_unfolded.data.sum())
The fact that the total sequence length for the ERY 1D spectrum is 1,638,467, while the total sequence length for the PAR spectrum is only 1,214,938, does not lead to significantly different estimates of $\pi_{site}$. The overlapping sites between the two population samples show practically the same $\pi_{site}$ as when calculated from all sites of each population. The higher genetic diversity in PAR as compared to ERY is therefore not due to a relative enrichment in PAR (due to minimum coverage filtering) of sites less affected by allele dropout.
In [10]:
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
ns = sfs2d_folded.sample_sizes
In [ ]:
%pinfo dadi.Demographics2D.split_mig
The 1D model fitting has indicated that both erythropus and parallelus underwent population size reduction, maybe starting a long time ago (>400,000 generations).
In [17]:
%psource dadi.Demographics2D.split_mig
In [18]:
%psource dadi.Integration.two_pops
In [12]:
def split_nomig(params, ns, pts):
    """
    params = (nu1,nu2,T)
    ns = (n1,n2)
    Split into two populations of specified size, no migration.
    nu1: Size ratio of population 1 after split (with respect to ancestral population size Na)
    nu2: Size ratio of population 2 after split (with respect to ancestral population size Na)
    T: Time in the past of split (in units of 2*Na generations)
    n1,n2: Sample sizes of resulting Spectrum
    pts: Number of grid points to use in integration.
    """
    nu1,nu2,T = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    phi = dadi.Integration.two_pops(phi, xx, T, nu1, nu2, m12=0, m21=0)
    fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
    return fs
In [13]:
cl[:].push(dict(split_nomig=split_nomig))
Out[13]:
In [18]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
    """
    p_init: initial parameter values to run optimisation from
    """
    if perturb == True:
        p_init = dadi.Misc.perturb_params(p_init, fold=fold,
                                          upper_bound=upper_bound, lower_bound=lower_bound)
        # note: the upper_bound and lower_bound variables are expected to be in the namespace of each engine
    # run optimisation of parameters
    popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
                         lower_bound=lower_bound, upper_bound=upper_bound, \
                         verbose=verbose, maxiter=maxiter, full_output=full_output)
    # pickle to file
    import dill
    name = outname[:] # make copy of file name stub!
    for p in p_init:
        name += "_%.4f" % (p)
    with open(name + ".dill", "w") as fh:
        dill.dump((p_init, popt), fh)
    return p_init, popt
In [15]:
%%px --local
func = split_nomig
# create extrapolating version of the model function
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [16]:
# push folded spectrum to all engines
cl[:].push(dict(sfs2d_folded=sfs2d_folded))
Out[16]:
In [17]:
%%px --local
# set up global variables on engines required for run_dadi function call
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d_folded
perturb = True
fold = 2 # perturb randomly by up to a factor of 2^`fold`
maxiter = 10 # run a maximum of 10 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_nomig" # set file name stub for opt. result files
Ryan Gutenkunst writes on the dadi forum:
The parameter bounds are primarily about avoiding really slow fits. Calculation with high migration rates and times or small population sizes is much slower, and the optimizer can explore extreme values before settling down. So typically we set migrations rates to be bounded [0, 20ish], times to be [0, 5ish], population sizes to be [1e-3, 1e6ish].
In [18]:
%%px --local
# set lower and upper bounds to nu1, nu2 and T
upper_bound = [1e4, 1e4, 5]
lower_bound = [1e-4, 1e-4, 0]
ns = sfs2d_folded.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
Run locally:
In [26]:
# perturb neutral parameter values
p0 = [0.5, 0.5, 0.5] # split into equal proportions at >200,000 generations ago
#run_dadi(p0)
It works.
In [29]:
lbview = cl.load_balanced_view()
In [30]:
from itertools import repeat
In [31]:
# perturb neutral parameter values
p0 = [0.5, 0.5, 0.5] # split into equal proportions at >200,000 generations ago
#ar_split_nomig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [27]:
def get_flag_count(out, NM=True):
    """
    out: list of tuples, each containing p_init and popt + additional info, including warnflags
    as produced by run_dadi.py
    """
    from collections import defaultdict
    if NM: # if ar from Nelder-Mead
        i = 4 # the warnflag is reported at index position 4 in the output array
    else: # ar from BFGS optimisation
        i = 6
    warnflag = defaultdict(int)
    for res in out:
        if res[1][i] == 1: # notice the change in indexing
            warnflag[1] += 1
        elif res[1][i] == 2:
            warnflag[2] += 1
        elif res[1][i] == 0:
            warnflag[0] += 1
        else:
            warnflag[999] += 1
    if NM:
        print "success", warnflag[0]
        print "Maximum number of function evaluations made.", warnflag[1]
        print "Maximum number of iterations reached.", warnflag[2]
        print "unknown flag", warnflag[999]
    else:
        print "success", warnflag[0]
        print "Maximum number of iterations exceeded.", warnflag[1]
        print "Gradient and/or function calls not changing.", warnflag[2]
        print "unknown flag", warnflag[999]
In [34]:
get_flag_count(ar_split_nomig_1, NM=True)
In [35]:
# perturb neutral parameter values
p0 = [0.5, 0.5, 0.1] # split into equal proportions at >200,000 generations ago
#ar_split_nomig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [36]:
get_flag_count(ar_split_nomig_1, NM=True)
In [37]:
%%px
maxiter = 100
In [38]:
# perturb neutral parameter values
p0 = [0.5, 0.5, 0.1] # split into equal proportions at >200,000 generations ago
#ar_split_nomig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [48]:
ar_split_nomig_1.wall_time
Out[48]:
In [39]:
get_flag_count(ar_split_nomig_1, NM=True)
In [19]:
def flatten(array):
    """
    Returns a list of flattened elements of every inner list (or tuple)
    ****RECURSIVE****
    """
    import numpy
    res = []
    for el in array:
        if isinstance(el, (list, tuple, numpy.ndarray)):
            res.extend(flatten(el))
            continue
        res.append(el)
    return list(res)
In [20]:
import pandas as pd
In [43]:
success = [flatten(out)[:7] for out in ar_split_nomig_1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'nu1_opt', 'nu2_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[43]:
In [49]:
%%px
fold = 3 # increase perturbation factor
In [50]:
# perturb previous optimal parameter values
p0 = [0.637644, 1.229881, 0.29] # use previous optimal parameter values
#ar_split_nomig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [51]:
get_flag_count(ar_split_nomig_1, NM=True)
In [52]:
success = [flatten(out)[:7] for out in ar_split_nomig_1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'nu1_opt', 'nu2_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[52]:
The optimal parameter values look very robust: very different starting values converge to almost identical optimal parameter values.
In section 6, I noticed that final parameter combinations from "unsuccessful" optimisations can have a higher likelihood than parameter combinations from successful optimisations. I am therefore going to load the final parameter combinations from all "unsuccessful" optimisations from file, in order to check whether some have a higher likelihood than the ones seen so far.
In [12]:
from glob import glob
import dill
In [22]:
ar_split_nomig = []
for filename in glob("OUT_2D_models/split_nomig*dill"):
    ar_split_nomig.append(dill.load(open(filename)))
In [23]:
# get final parameter values from "unsuccessful" optimisations
returned = [flatten(out)[:7] for out in ar_split_nomig if out[1][4] != 0]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T_0', 'nu1_opt', 'nu2_opt', 'T_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[23]:
There are no better parameter combinations among the results from "unsuccessful" optimisations.
In [55]:
# calculate best-fit model spectrum
model_spectrum = func_ex((0.636945, 1.229606, 0.290465), ns, pts_l)
In [56]:
# log likelihood of the observed 2D spectrum given the model
ll_model = dadi.Inference.ll_multinom(model_spectrum, sfs2d_folded)
In [57]:
ll_model
Out[57]:
The negative log likelihood of the data given the model is also returned by the full output of the optimisation function (as used above).
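For reference, a sketch of how that full-output tuple is indexed in this notebook (inferred from the get_flag_count and flatten calls above):
In [ ]:
# popt as returned by dadi.Inference.optimize_log_fmin with full_output=True:
# popt[0] -> optimal parameter values (xopt)
# popt[1] -> final negative log likelihood (fopt)
# popt[4] -> warnflag (0 = success)
# hence flatten((p_init, popt))[:7] yields p_init, xopt and -logL as one table row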
In [58]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d_folded)
In [65]:
print "The optimal value of theta for the ancestral population is {0:4d}.".format(int(theta))
In [66]:
mu = 3e-9
L = sfs2d_folded.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
In [72]:
sfs2d_folded.pop_ids
Out[72]:
I am assuming that $\nu_1$ refers to ery and $\nu_2$ refers to par. The split_nomig model with its optimal parameter values suggests the following:
In [77]:
print "The ancestral population of ery and par split apart {2:,} generations ago. Since then until present the ery population had a size of {0:,} and the par population of {1:,}.".format(int(0.636945*N_ref), int(1.229606*N_ref)
, int(0.290465*2*N_ref))
Note that the fitting of 1D models to the spectra of each population had indicated a population size reduction for both populations, and more so for parallelus, which seems to contradict what this 2D model says about the two populations.
In [79]:
import pylab
pylab.rcParams['figure.figsize'] = [14.0, 12.0]
pylab.rcParams['font.size'] = 14.0
dadi.Plotting.plot_2d_comp_multinom(model_spectrum, sfs2d_folded, vmin=1)
A good computer screen is required to see the heatmap of the residuals in full detail.
The residuals show that the best-fit model has a deficit of fixed divergent SNP's as well as of low frequency shared SNP's close to the diagonal.
Maybe the fit can be improved by introducing gene flow.
In [156]:
?dadi.Demographics2D.split_mig
In [30]:
%%px --local
func = dadi.Demographics2D.split_mig
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [33]:
%%px --local
# set up global variables on engines required for run_dadi function call
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d_folded
perturb = True
fold = 2 # perturb randomly by up to a factor of 2^`fold`
maxiter = 10 # run a maximum of 10 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_mig" # set file name stub for opt. result files
In [26]:
%%px --local
# set lower and upper bounds to nu1, nu2, T, m
upper_bound = [1e4, 1e4, 2, 10]
lower_bound = [1e-4, 1e-4, 0, 0]
In [34]:
# perturb these parameter values
# nu1, nu2, T, m
p0 = [0.5, 0.5, 0.1, 0.1]
# split into equal proportions at >200,000 generations ago with 1 migrant individual per 10 generations
#ar_split_mig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [85]:
get_flag_count(ar_split_mig_1, NM=True)
In [86]:
%%px
fold = 3 # perturb randomly by up to a factor of 2^`fold`
maxiter = 100 # run a maximum of 100 iterations
In [87]:
# perturb these parameter values
p0 = [0.5, 0.5, 0.1, 0.1]
# split into equal proportions at >200,000 generations ago with one migrant individual per 10 generations
#ar_split_mig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [88]:
get_flag_count(ar_split_mig_1, NM=True)
In [91]:
# perturb these parameter values
p0 = [0.5, 0.5, 0.1, 0.01]
# split into equal proportions at >200,000 generations ago with one migrant individual per 100 generations
#ar_split_mig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [92]:
get_flag_count(ar_split_mig_1, NM=True)
In [93]:
# optimal parameters for nomig model: (0.636945, 1.229606, 0.290465)
# perturb these parameter values
p0 = [0.636945, 1.229606, 0.290465, 0.01]
# split into equal proportions at >200,000 generations ago with one migrant individual per 100 generations
ar_split_mig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [94]:
get_flag_count(ar_split_mig_1, NM=True)
In [95]:
import numpy as np
In [97]:
m0 = np.logspace(-3, np.log10(5), base=10, num=20)
m0
Out[97]:
In [98]:
from itertools import product
In [109]:
p0 = [0.636945, 1.229606, 0.290465]
# do parameter sweep for migration rate
#ar_split_mig_1 = lbview.map(run_dadi, [p0 + [m] for m in m0], block=False, order=False)
In [112]:
ar_split_mig_1.progress
Out[112]:
In [113]:
get_flag_count(ar_split_mig_1, NM=True)
In [116]:
p0 = [0.636945, 1.229606, 0.290465, 1e-6]
#ar_split_mig_1 = lbview.map(run_dadi, [p0], block=False, order=False)
In [117]:
get_flag_count(ar_split_mig_1, NM=True)
In [118]:
ar_split_mig_1.get()
Out[118]:
In [119]:
p0 = [0.636945, 1.229606, 0.290465, 1e-8]
#ar_split_mig_1 = lbview.map(run_dadi, [p0], block=False, order=False)
In [120]:
ar_split_mig_1.get()
Out[120]:
In [122]:
%%px
maxiter = 500
In [123]:
# nu1, nu2, T, m:
p0 = [0.5, 0.5, 0.1, 1e-4]
ar_split_mig_1 = lbview.map(run_dadi, repeat(p0, 20), block=False, order=False)
In [124]:
get_flag_count(ar_split_mig_1, NM=True)
Apparently, convergence of optimisation requires many more than 100 iterations.
In [125]:
success = [flatten(out)[:9] for out in ar_split_mig_1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'm_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[125]:
The first three parameter combinations are much more likely than the remaining ones.
In [126]:
# extract optimal parameter combination
df.sort_values(by='-logL', ascending=True).iloc[0, 4:8]
Out[126]:
In [128]:
p0 = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 4:8])
#ar_split_mig_2 = lbview.map(run_dadi, repeat(p0, 20), block=False, order=False)
In [129]:
ar_split_mig_2.progress
Out[129]:
In [132]:
ar_split_mig_2.elapsed/60
Out[132]:
In [16]:
import numpy as np
import pandas as pd
from utility_functions import *
In [3]:
import dill, glob
ar_split_mig_2 = []
for filename in glob.glob("OUT_2D_models/split_mig*dill"):
    ar_split_mig_2.append(dill.load(open(filename)))
In [4]:
success = [flatten(out)[:9] for out in ar_split_mig_2 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'm_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[4]:
Parameter combinations with a negative log likelihood of 20387 are almost identical to those of the split_nomig model above. Adding some gene flow improves the likelihood of the model.
In section 6, I noticed that final parameter combinations from "unsuccessful" optimisations can have a higher likelihood than parameter combinations from successful optimisations. I am therefore going to load the final parameter combinations from all "unsuccessful" optimisations from file, in order to check whether some have a higher likelihood than the ones seen so far.
In [36]:
import dill
from glob import glob
In [42]:
ar_split_mig = []
for filename in glob("OUT_2D_models/split_mig_[0-9]*dill"):
    ar_split_mig.append(dill.load(open(filename)))
In [43]:
# get "unsuccessfull" optimisations
returned = [flatten(out)[:9] for out in ar_split_mig if out[1][4] != 0]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T_0', 'm_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[43]:
There are no better parameter combinations than the ones already seen.
In [5]:
df.sort_values(by='-logL', ascending=True).iloc[0, 4:8]
Out[5]:
In [44]:
popt = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 4:8])
In [47]:
# calculate best-fit model spectrum
model_spectrum = func_ex(popt, ns, pts_l)
In [48]:
# log likelihood of the observed 2D spectrum given the model
ll_model = dadi.Inference.ll_multinom(model_spectrum, sfs2d_folded)
In [49]:
ll_model
Out[49]:
In [50]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d_folded)
theta
Out[50]:
In [51]:
mu = 3e-9
L = sfs2d_folded.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
In [145]:
dadi.Plotting.plot_2d_comp_multinom(model_spectrum, sfs2d_folded, vmin=1)
This has reduced the residuals for low frequency shared polymorphisms, but the model still cannot fit divergently fixed SNP's well.
The dadi manual says about the migration rate parameter (on page 11):
The migration parameter m12 specifies the rate of migration from pop 2 into pop 1. It is equal to the fraction of individuals each generation in pop 1 that are new migrants from pop 2, times the 2Nref.
So the fraction of new immigrant alleles each generation from $pop_2$ in $pop_1$ is: $$ p_{m12} = \frac{m_{12}}{2N_{ref}} $$
Since dadi's migration rate is scaled by $2N_{ref}$, the migration rate parameter as reported by dadi has units of immigrant individuals per generation for a diploid locus. However, whenever pop 1 does not have the same size as the ancestral population ($N_{ref}$), dadi's $m_{12}$ needs to be rescaled by multiplication with $\nu_1$ to get the number of new immigrant individuals per generation ($M_{12}$):
$$ \begin{align} M_{12} &= p_{m12} \times 2N_{ref}\nu_1 \\[5pt] M_{12} &= \nu_1 m_{12} \end{align} $$
See the "confirm units of dadi" thread on the dadi forum.
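A minimal sketch of this rescaling with hypothetical values (not taken from any of the fits above):
In [ ]:
# per-generation quantities implied by dadi's scaled migration rate
m12, nu1, N_ref_example = 0.25, 0.6, 500000.0 # hypothetical values
p_m12 = m12 / (2.0 * N_ref_example) # fraction of pop 1 that are new immigrants each generation
M12 = nu1 * m12 # new immigrant individuals per generation into pop 1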
The way I calculate $N_{ref}$ from theta here, I get $N_{ref}$ in units of diploid individuals:
For haploid data, you can replace all factors of 2Nref with Nref. So it would be Nref = (theta)/(2mu*L). The same adjustment would be needed for converting times and rates of gene flow.
Ryan Gutenkunst on dadi forum post Calculating Nref for haploid data.
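A one-cell sketch of the two scalings, with theta, mu and L as defined in the cells above:
In [ ]:
N_ref_diploid = theta/(4*mu*L) # diploid data, as used throughout this notebook
N_ref_haploid = theta/(2*mu*L) # haploid data, per the quote above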
What this model fit says in plain English:
In [54]:
print "The ancestral population split apart {0:,} generation ago.".format(int(popt[2]*2*N_ref)),
print "Since then, ery and par had a constant population size of {0:,} and {1:,}, respectively.".format(int(popt[0]*N_ref), int(popt[1]*N_ref)),
print "Since the split, a fraction of {0:.2e} of the population size each generation were made up of new immigrant individuals from the other subspecies.".format(popt[3]/2/N_ref),
print "Put another way:",
print "Since the split ery received a constant number of {0:.2f} new immigrant alleles per generation, while par received a constant number of {1:.2f} per generation.".format(popt[3]*popt[0], popt[3]*popt[1])
Is the model with migration significantly better than the model without migration?
From the dadi manual:
The full likelihood ratio test statistic is equal to $D = 2(ll_c - ll_s)$ where $ll_c$ and $ll_s$ are the likelihoods of the complex and simple model, respectively.
$D$ should be $\chi^2$ distributed with degrees of freedom equal to the difference in the number of parameters between the simple and the complex model.
Not all SNP's in the spectrum will be unlinked. The likelihoods calculated are therefore composite likelihoods. Dadi can calculate an adjustment factor for $D$ that corrects for linkage. However, this requires bootstrapped data sets, which I haven't created yet. In the following calculation I treat all SNP's as independent data points.
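For reference, a hedged sketch of how such an adjustment could look once bootstrapped spectra exist; all_boot and the file pattern are hypothetical, and index 3 marks m, the parameter fixed at 0 in the nested model:
In [ ]:
# hedged sketch, not run: adjust D for linkage using bootstrapped spectra
#all_boot = [dadi.Spectrum.from_file(f) for f in glob("bootstraps/*.dadi")] # hypothetical files
#adj = dadi.Godambe.LRT_adjust(func_ex, pts_l, all_boot, popt, sfs2d_folded,
#                              nested_indices=[3], multinom=True)
#D_adj = adj * D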
In [56]:
ll_c = dadi.Inference.ll_multinom(model_spectrum, sfs2d_folded)
ll_c
Out[56]:
In [59]:
func = split_nomig
# create extrapolating version of the model function
func_ex = dadi.Numerics.make_extrap_log_func(func)
# calculate best-fit model spectrum with optimal parameter values
model_spectrum = func_ex((0.636945, 1.229606, 0.290465), ns, pts_l)
ll_s = dadi.Inference.ll_multinom(model_spectrum, sfs2d_folded)
ll_s
Out[59]:
In [60]:
D = 2*(ll_c - ll_s)
D
Out[60]:
From the dadi example script YRI_CEU.py:
Because this is a test of a parameter on the boundary of the parameter space (m cannot be less than zero), our null distribution is an even proportion of chi^2 distributions with 0 and 1 d.o.f. To evaluate the p-value, we use the percent point function for a weighted sum of chi^2 dists.
See Self2007 for more complex cases.
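Since a $\chi^2$ distribution with 0 d.o.f. is a point mass at zero, the p-value for $D > 0$ can be cross-checked with scipy (a sketch, assuming scipy is available):
In [ ]:
# for the 50:50 mixture of chi^2 with 0 and 1 d.o.f.:
# P(D > d) = 0.5 * P(chi^2_1 > d) for d > 0
from scipy import stats
p_check = 0.5 * stats.chi2.sf(D, 1)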
In [61]:
# calculate p-value
p = dadi.Godambe.sum_chi2_ppf(D, weights = (0.5, 0.5))
p
Out[61]:
Gene flow significantly improves the fit to the observed spectrum. I doubt that there could be enough linkage in the data to compromise this result.
Phylogeographic studies as well as paleoclimatic data strongly suggest that the two subspecies, erythropus and parallelus, diverged in isolation during at least the last ice age and only recently (~10,000 YBP) came into secondary contact. Clines of several morphological as well as molecular markers have been shown to be very wide, which indicates substantial gene flow across the hybrid zone into adjacent pure populations. The two population samples come from such "pure" populations in close proximity to the hybrid zone. They may therefore show evidence of recent gene flow between the two subspecies.
In [1]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[1]:
In [2]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [41]:
%%px --local
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
sfs2d = sfs2d_unfolded.fold()
In [42]:
sfs2d.sample_sizes
Out[42]:
In [43]:
%%px --local
sfs2d.pop_ids = ['ery', 'par']
I am now defining a model function that is a mixture of the split_nomig and split_mig models. It specifies a split, divergence in isolation for a time $T_i$, then secondary contact of the two diverged populations since $T_c \times 2N_{ref}$ generations in the past.
In [6]:
def secondary_contact(params, ns, pts):
    """
    params = (nu1,nu2,Ti,Tc,m)
    ns = (n1,n2)
    Split into two populations of specified size,
    then diverge in isolation,
    finally come into secondary contact.
    nu1: Size of population 1 after split.
    nu2: Size of population 2 after split.
    Ti: Time of divergence in isolation (in units of 2*Na generations)
    Tc: Time in the past when secondary contact began (and continues till present)
    m: symmetrical migration rate (in units of 2*Na per generation)
    n1,n2: Sample sizes of resulting Spectrum
    pts: Number of grid points to use in integration.
    """
    nu1,nu2,Ti,Tc,m = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi) # split
    # divergence in isolation
    phi = dadi.Integration.two_pops(phi, xx, Ti, nu1, nu2, m12=0, m21=0)
    # secondary contact
    phi = dadi.Integration.two_pops(phi, xx, Tc, nu1, nu2, m12=m, m21=m)
    fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
    return fs
In [7]:
# push model function into namespace of remote engines
cl[:].push(dict(secondary_contact=secondary_contact))
Out[7]:
In [18]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
    """
    p_init: initial parameter values to run optimisation from
    """
    if perturb == True:
        p_init = dadi.Misc.perturb_params(p_init, fold=fold,
                                          upper_bound=upper_bound, lower_bound=lower_bound)
    # run optimisation of parameters
    popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
                         lower_bound=lower_bound, upper_bound=upper_bound, \
                         verbose=verbose, maxiter=maxiter, full_output=full_output)
    # pickle to file
    import dill
    name = outname[:] # make copy of file name stub!
    for p in p_init:
        name += "_%.4f" % (p)
    with open(name + ".dill", "w") as fh:
        dill.dump((p_init, popt), fh)
    return p_init, popt
In [8]:
%%px --local
func = secondary_contact
# create extrapolating version of the model function
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [20]:
%%px --local
# set up global variables on engines required for run_dadi function call
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly by up to a factor of 2^`fold`
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/sec_contact" # set file name stub for opt. result files
In [9]:
%%px --local
# set lower and upper bounds to nu1, nu2, Ti, Tc and m
upper_bound = [1e4, 1e4, 5, 5, 10]
lower_bound = [1e-4, 1e-4, 0, 0, 0]
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
In [13]:
# perturb these parameter values for starting value generation
p0 = [1, 1.5, 0.9, 0.1, 1]
# nu_1, nu_2, Ti, Tc, m
# run locally
run_dadi(p0)
Out[13]:
The code works in principle. However, the optimisation above did not converge (flag==2).
In [16]:
lbview = cl.load_balanced_view()
In [17]:
from itertools import repeat
In [18]:
# perturb these parameter values
p0 = [ 0.6746985 , 1.38442775, 0.36257617, 0.02475325, 0.65765189]
# although the single optimisation above was not successful (flag==2),
# I am using its returned values as starting values for perturbation
#ar_sec_contact_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [10]:
from utility_functions import *
In [20]:
get_flag_count(ar_sec_contact_1, NM=True)
In [22]:
%%px
# increase the maximum number of iterations
maxiter = 300
In [23]:
# perturb these parameter values
p0 = [ 0.6746985 , 1.38442775, 0.36257617, 0.02475325, 0.65765189]
# although the single optimisation above was not successful (flag==2),
# I am using its returned values as starting values for perturbation
#ar_sec_contact_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [33]:
# get total running time in minutes
ar_sec_contact_1.wall_time/60
Out[33]:
In [24]:
get_flag_count(ar_sec_contact_1, NM=True)
5 runs succeeded, 5 hit the maxiter limit.
In [11]:
import pandas as pd
In [36]:
success = [flatten(out)[:11] for out in ar_sec_contact_1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'Ti_0', 'Tc_0', 'm_0', 'nu1_opt', 'nu2_opt', 'Ti_opt', 'Tc_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[36]:
The second parameter combination from the bottom is the closest to a recent-secondary-contact model, but it has slightly lower likelihood than models that specify a much longer time of secondary contact ($T_c$).
In [14]:
import numpy as np
In [ ]:
# extract best parameter combination
pset = np.array(df.sort_values(by='-logL', ascending=True).iloc[0,5:10])
In [32]:
# perturb from best parameter combination
p0 = pset
#ar_sec_contact_2 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [34]:
get_flag_count(ar_sec_contact_2, NM=True)
Only 3 runs succeeded.
In [37]:
# add output from successful runs to table
success.extend([flatten(out)[:11] for out in ar_sec_contact_2 if out[1][4] == 0])
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'Ti_0', 'Tc_0', 'm_0', 'nu1_opt', 'nu2_opt', 'Ti_opt', 'Tc_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[37]:
The most likely parameter combination has the same likelihood as the best fit for the split_mig model (i. e. the model with $T_i = 0$). The best-fit parameters for this secondary contact model suggest only a short time of complete isolation followed by a long time of secondary contact, which is qualitatively equivalent to a model without complete isolation. A $T_i$ greater than 0 does not improve the likelihood, so I think it is not significant.
In section 6, I noticed that final parameter combinations from "unsuccessful" optimisations can have a higher likelihood than parameter combinations from successful optimisations. I am therefore going to load the final parameter combinations from all "unsuccessful" optimisations from file, in order to check whether some have a higher likelihood than the ones seen so far.
In [ ]:
from glob import glob
import dill
In [32]:
ar_sec_contact = []
for filename in glob("OUT_2D_models/sec_contact*dill"):
    ar_sec_contact.append(dill.load(open(filename)))
In [33]:
# get parameters from "unsuccessful" runs into a table
returned = ([flatten(out)[:11] for out in ar_sec_contact if out[1][4] != 0])
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Ti_0', 'Tc_0', 'm_0', 'nu1_opt', 'nu2_opt', 'Ti_opt', 'Tc_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[33]:
There are no better parameter combinations than those seen so far. But notice that this table shows that a $T_i$ that is practically 0 (top row) does not lower the likelihood.
Let's search in the parameter space that is closer to an isolation-with-recent-secondary-contact model.
In [40]:
p0 = [0.624726, 1.206711, 0.352286, 0.071660, 0.586254]
#ar_sec_contact_3 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [41]:
get_flag_count(ar_sec_contact_3, NM=True)
In [42]:
# add output from successful runs to table
success.extend([flatten(out)[:11] for out in ar_sec_contact_3 if out[1][4] == 0])
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'Ti_0', 'Tc_0', 'm_0', 'nu1_opt', 'nu2_opt', 'Ti_opt', 'Tc_opt', 'm_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[42]:
The best-fit parameters clearly do not describe a long time of divergence in isolation followed by recent high gene flow. In contrast, they generally indicate a long period of secondary contact with a migration rate that is essentially the same as the one estimated for the simple divergence with (uninterrupted) migration model above.
The secondary contact model with best-fit parameters has the same likelihood as the simple isolation with constant migration model above. This indicates that the time of isolation $T_i$ is not significantly different from 0.
In [12]:
# reload data from files into data frame
from glob import glob
import dill
from utility_functions import *
import pandas as pd
import numpy as np
ar_sec_contact = []
for filename in glob('OUT_2D_models/sec_contact*dill'):
    ar_sec_contact.append(dill.load(open(filename)))
success = [flatten(out)[:11] for out in ar_sec_contact if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'Ti_0', 'Tc_0', 'm_0', 'nu1_opt', 'nu2_opt', 'Ti_opt', 'Tc_opt', 'm_opt', '-logL'])
In [13]:
popt = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 5:10])
popt
Out[13]:
In [14]:
# determine theta
# get the unscaled optimal secondary contact model spectrum
model_spectrum = func_ex(popt, ns, pts_l)
# get theta
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum.fold(), sfs2d)
theta
Out[14]:
Note that this is a much lower theta than the one implied by the simple isolation model above (section 1). This is to be expected (see section 1.1 of the supplementary material of Gutenkunst2009).
In [58]:
import pylab
%matplotlib inline
pylab.rcParams['figure.figsize'] = [12, 10]
pylab.rcParams['font.size'] = 14
dadi.Plotting.plot_2d_comp_multinom(model_spectrum, sfs2d, vmin=1)
The best-fit SC model spectrum looks very similar to the best-fit IM model (see previous section).
In [31]:
mu = 3e-9
L = sfs2d.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population Nref of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
The best-fit parameters of this model say the following:
In [32]:
print "After the split of the ancestral population {0:,} generations ago, the daughter populations increased in size (instantaneously).".format(int((popt[2]+popt[3])*2*N_ref)),
print "ERY increased to about the same size as the ancestral population (Nref), while PAR increased to {0:.2f} times of Nref.".format(popt[1]),
print "After the split ERY and PAR diverged in isolation for {0:,} generations.".format(int(popt[2]*2*N_ref)),
print "The two populations came into secondary contact {0:,} generations ago.".format(int(popt[3]*2*N_ref)),
print "ERY and PAR stayed in continuous secondary contact until the present and",
print "during that time exchanged one individual from par to ery every {0:.1f} generations".format(1.0/(popt[4]/2*popt[0])),
print "and one individual from ery into par every {0:.1f} generations.".format(1.0/(popt[4]/2*popt[1]))
Parameter combinations that specify a recent secondary contact model (see rows with index 0, 2 and 9 in the table above) have lower likelihood than parameter combinations that are almost identical to an IM model with constant migration.
How does a recent secondary contact (RSC) model spectrum compare with the data?
In [15]:
df.sort_values(by='Tc_opt', ascending=True)
Out[15]:
In [16]:
# extract parameter combination that specifies a secondary contact model
p_sec_contact = np.array(df.sort_values(by='Tc_opt', ascending=True).iloc[0,5:10])
p_sec_contact
Out[16]:
This RSC parameter combination says the following:
In [88]:
print "After the split of the ancestral population {0:,} generations ago, the daughter populations increased in size (instantaneously).".format(int((p_sec_contact[2]+p_sec_contact[3])*2*N_ref)),
print "ERY decreased to about {0:.2f} the size of the ancestral population (Nref), while PAR increased to {1:.2f} times of Nref.".format(p_sec_contact[0],p_sec_contact[1]),
print "After the split ERY and PAR diverged in isolation for {0:,} generations.".format(int(p_sec_contact[2]*2*N_ref)),
print "The two populations came into secondary contact {0:,} generations ago.".format(int(p_sec_contact[3]*2*N_ref)),
print "ERY and PAR stayed in continuous secondary contact until present and",
print "during that time contained a proportion {0:.2e} new immigrant alleles each generations.".format(p_sec_contact[4]/2/N_ref)
In [17]:
# get the optimally scaled secondary contact model spectrum
SC_model_spectrum = dadi.Inference.optimally_scaled_sfs(func_ex(p_sec_contact, ns, pts_l), sfs2d)
In [18]:
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = [12, 10]
pylab.rcParams['font.size'] = 14
In [98]:
%psource dadi.Plotting.plot_2d_comp_Poisson
In [23]:
# compare SC model with data
dadi.Plotting.plot_2d_comp_multinom(model=SC_model_spectrum, data=sfs2d, vmin=1)
The recent secondary contact model (RSC) predicts mostly very low frequency introgressed polymorphisms (MAF class 1), but shows a deficit (with respect to the data) of low frequency variants further along the diagonal.
First, I need to get the best fit IM model spectrum again:
In [19]:
import dill, glob
ar_split_mig_2 = []
for filename in glob.glob("OUT_2D_models/split_mig*dill"):
    ar_split_mig_2.append(dill.load(open(filename)))
In [20]:
success = [flatten(out)[:9] for out in ar_split_mig_2 if out[1][4] == 0]
df_IM = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'm_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm_opt', '-logL'])
In [21]:
# get optimal parameter combination for IM model
IM_popt = np.array(df_IM.sort_values(by='-logL', ascending=True).iloc[0, 4:8])
IM_popt
Out[21]:
In [22]:
# get IM model function (built-in)
IM = dadi.Demographics2D.split_mig
IM_func_ex = dadi.Numerics.make_extrap_log_func(IM)
The best fit IM model spectrum can be extracted with the following code:
In [23]:
IM_model_spectrum = dadi.Inference.optimally_scaled_sfs(IM_func_ex(IM_popt, ns, pts_l).fold(), sfs2d)
In [4]:
%psource dadi.Plotting.plot_2d_comp_multinom
In [24]:
# I have edited the functions `dadi.Plotting.plot_2d_comp_multinom` and `dadi.Plotting.plot_2d_comp_Poisson`
# in `Plotting.py` in the dadi repo to add the argument `title`
dadi.Plotting.plot_2d_comp_multinom(model=IM_model_spectrum, data=SC_model_spectrum.fold(), vmin=1, title=['RSC', 'IM'])
Compared to the IM model, the RSC model predicts many fewer low frequency shared polymorphisms as well as highly diverged polymorphisms (red in the residual plot). On the other hand, recent secondary contact predicts more high frequency shared polymorphisms than isolation with continuous migration (blue in the residual plot).
In [25]:
ll_s = dadi.Inference.ll_multinom(IM_model_spectrum, sfs2d)
ll_s
Out[25]:
In [26]:
ll_c = dadi.Inference.ll_multinom(SC_model_spectrum.fold(), sfs2d)
ll_c
Out[26]:
The simpler IM model has higher likelihood.
In [27]:
D = 2*(ll_c - ll_s)
D
Out[27]:
In [28]:
# calculate p-value from the weighted sum of chi^2 distributions with 0 and 1 degrees of freedom
p = dadi.Godambe.sum_chi2_ppf(D, weights=(0.5, 0.5))
p
Out[28]:
This is like testing whether $T_i$ is significantly different from 0 (it is 0 in the IM model).
It looks like the simpler IM model is significantly better than the more complex RSC model, but I don't know how to test that formally. That is, I would like to know how confident I can be that the true $T_i$ is 0. In a Bayesian framework I could check whether the 95% CI of the posterior distribution for $T_i$ falls into an (arbitrarily sized) region of practical equivalence (ROPE) with 0. An ABC analysis might be necessary to answer this question statistically. I think what can be said with very high confidence is that $T_i$ cannot be as large as $0.9 \times 2N_{ref}$ generations as in this parameterisation, since that lowers the probability of the data by 1,136 log-likelihood units compared to a $T_i$ of 0.
In [33]:
0.92263183 * 2*N_ref
Out[33]:
In [34]:
ll_s - ll_c
Out[34]:
I doubt that it could be as simple as:
In [73]:
dadi.Godambe.sum_chi2_ppf(2*(ll_s - ll_c))
Out[73]:
In [1]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[1]:
In [2]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy as np
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [3]:
%%px --local
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
In [4]:
%%px --local
# add population labels
sfs2d_unfolded.pop_ids = ["ery", "par"]
sfs2d = sfs2d_unfolded.fold()
In [41]:
%psource dadi.Demographics2D.split_mig
Define an IM model that allows for unequal migration rates between the two populations.
In [56]:
def split_asym_mig(params, ns, pts):
    """
    params = (nu1,nu2,T,m1,m2)
    ns = (n1,n2)
    Split into two populations of specified size, with potentially asymmetric migration.
    nu1: Size of population 1 after split.
    nu2: Size of population 2 after split.
    T: Time in the past of split (in units of 2*Na generations)
    m1: Migration rate from ery into par (in units of 2*Na ind per generation)
    m2: Migration rate from par into ery (in units of 2*Na ind per generation)
    n1,n2: Sample sizes of resulting Spectrum
    pts: Number of grid points to use in integration.
    """
    nu1,nu2,T,m1,m2 = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    # split
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    # divergence with potentially asymmetric migration
    phi = dadi.Integration.two_pops(phi, xx, T, nu1, nu2, m12=m2, m21=m1)
    fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
    return fs
In [43]:
cl[:].push(dict(split_asym_mig=split_asym_mig))
Out[43]:
In [44]:
%%px --local
func = split_asym_mig
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [16]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly by up to a factor of 2^`fold`
maxiter = 300 # run a maximum of 300 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_asym_mig" # set file name stub for opt. result files
In [61]:
dadi.Inference.optimize_log_fmin?
In [17]:
from utility_functions import *
BEGIN ignore
Ignore this section. Unsuccessfully trying to use `run_dadi` as defined in the `utility_functions` module.
From the Python Tutorial:

"The global namespace for a module is created when the module definition is read in"

and

"The statements executed by the top-level invocation of the interpreter, either read from a script file or interactively, are considered part of a module called `__main__`, so they have their own global namespace."

So, my module `utility_functions` has its own global namespace, different from the global namespace of this IPython session. I have therefore added the line `from __main__ import *` before the definition of the function `run_dadi` in the module `utility_functions`, so that it can see the global namespace of this IPython session.
In [11]:
# perturb these parameter values
# nu1, nu2, T, m1, m2
p0 = [1.0, 1.766, 0.922, 0.25, 0.25]
# run one optimisation locally
run_dadi(p0)
Out[11]:
It works locally.
In [48]:
from itertools import repeat
In [49]:
lbview = cl.load_balanced_view()
In [50]:
%%px --local
# set lower and upper bounds to nu1, nu2, T, m1, m2
upper_bound = [1e4, 1e4, 4, 10, 10] # note, I have increased the upper bound for T
lower_bound = [1e-4, 1e-4, 0, 0, 0]
In [21]:
# using optimal parameters from simple divergence with migration model (split_mig)
p0 = [1.0, 1.766, 0.922, 0.25, 0.25]
ar_split_asym_mig_1 = lbview.map(run_dadi, repeat(p0, 1), block=False, order=True)
In [22]:
ar_split_asym_mig_1.get()
END ignore.
In [60]:
dadi.Misc.perturb_params?
In [53]:
%%px --local
# set lower and upper bounds to nu1, nu2, T, m1, m2
upper_bound = [1e4, 1e4, 4, 10, 10] # note, I have increased the upper bound for T
lower_bound = [1e-4, 1e-4, 0, 0, 0]
In [54]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
"""
p_init: initial parameter values to run optimisation from
"""
if perturb == True:
p_init = dadi.Misc.perturb_params(p_init, fold=fold,
upper_bound=upper_bound, lower_bound=lower_bound)
# note upper_bound and lower_bound variables are expected to be in the namespace of each engine
# run optimisation of parameters
popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
lower_bound=lower_bound, upper_bound=upper_bound, \
verbose=verbose, maxiter=maxiter, full_output=full_output)
# pickle to file
import dill
name = outname[:] # make copy of file name stub!
for p in p_init:
name += "_%.4f" % (p)
with open(name + ".dill", "w") as fh:
dill.dump((p_init, popt), fh)
return p_init, popt
In [58]:
from itertools import repeat
In [56]:
lbview = cl.load_balanced_view()
In [57]:
# using optimal parameters from simple divergence with migration model (split_mig)
p0 = [1.0, 1.766, 0.922, 0.25, 0.25]
In [15]:
#ar_split_asym_mig_1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [28]:
get_flag_count(ar_split_asym_mig_1, NM=True)
In [10]:
import dill
from glob import glob
In [11]:
ar_split_asym_mig = []
for filename in glob("OUT_2D_models/split_asym_mig*dill"):
ar_split_asym_mig.append(dill.load(open(filename)))
In [12]:
import pandas as pd
In [15]:
l = 2*len(p0)+1
success = [flatten(out)[:l] for out in ar_split_asym_mig if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[15]:
These 4 optimisations consistently indicate a much smaller gene flow from erythropus into parallelus than in the other direction. This seems plausible when considering the observed expansion of the range of parallelus towards the south of the Pyrenees. Shapes of neutral clines across the hybrid zone should also be less steep on the erythropus side than on the parallelus side.
In [35]:
%%px --local
maxiter = 500
In [36]:
cl[1]['fold']
Out[36]:
In [37]:
upper_bound
Out[37]:
I will need to increase the upper bound on the divergence time $T$. Otherwise, perturbation can result in starting values above the upper bound.
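To make this concrete (a small sketch of the range arithmetic; with `fold = 2`, `dadi.Misc.perturb_params` multiplies each parameter by up to 2**2 = 4 in either direction):

fold = 2
T = 1.228988   # previous optimal divergence time
print T / 2**fold, T * 2**fold   # -> 0.307247 4.915952

Draws near the top of that range exceed the old upper bound of 4 and would be forced back onto the bound, so some optimisations would start right at the boundary.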
In [38]:
%%px --local
upper_bound = [10000.0, 10000.0, 6, 10, 10]
In [39]:
# using previous optimal parameters as starting values to perturb
p0 = [0.915567, 2.223840, 1.228988, 0.084270, 0.497170]
#ar_split_asym_mig_2 = lbview.map(run_dadi, repeat(p0, 20), block=False, order=True)
In [15]:
ar_split_asym_mig = []
for filename in glob("OUT_2D_models/split_asym_mig*dill"):
ar_split_asym_mig.append(dill.load(open(filename)))
l = 2*len(p0)+1
success = [flatten(out)[:l] for out in ar_split_asym_mig if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[15]:
There is good convergence of parameter estimates.
In section 6, I noticed that final parameter combinations from "unsuccessful" optimisations can have higher likelihood than parameter combinations from successful optimisations. I am therefore going to load the final parameter combinations from all "unsuccessful" optimisations from file, in order to check whether some have higher likelihood than the ones seen so far.
In [35]:
from glob import glob
import dill
In [37]:
ar_split_asym_mig = []
for filename in glob("OUT_2D_models/split_asym_mig_[1234567890]*dill"):
ar_split_asym_mig.append(dill.load(open(filename)))
l = 2*5+1
# get parameters from "unsuccessful" optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig if out[1][4] != 0]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[37]:
No better parameter combinations than seen before can be found here.
In [16]:
import numpy as np
In [17]:
popt = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 5:10])
popt
Out[17]:
In [18]:
func = split_asym_mig
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [22]:
# calculate best-fit model spectrum
model_spectrum = func_ex(popt, ns, pts_l)
model_spectrum_asym_mig_scaled = dadi.Inference.optimally_scaled_sfs(model_spectrum, sfs2d)
In [23]:
theta = dadi.Inference.optimal_sfs_scaling(model_spectrum, sfs2d)
In [24]:
print "The optimal value of theta for the ancestral population is {0:4d}.".format(int(theta))
In [25]:
mu = 3e-9
L = sfs2d.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
In [29]:
print "The ancestral population split apart {0:,} generations ago.".format(int(popt[2]*2*N_ref)),
print "Immediately after the split the ERY population changed to a size of {0:.2f} times Nref and the PAR population to {1:.2f} times Nref.".format(popt[0], popt[1]),
print "Since the split of the ancestral population, PAR received 1 individual from ERY every {0:.2f} generations,".format(1.0/(popt[3]/2*popt[1])),
print "while ERY received 1 PAR individual every {0:.2f} generations.".format(1.0/(popt[4]/2*popt[0])),
print "Put another way: The par population contained a constant proportion of {0:.2e} of new immigrant alleles each generation".format(popt[3]/2/N_ref),
print "and the ery population contained a constant proportion of {0:.2e} of new immigrant alleles each generation.".format(popt[4]/2/N_ref)
This model therefore indicates a much stronger introgression from parallelus into erythropus than in the opposite direction. Is the asymmetrical gene flow significant?
In [30]:
ll_c = dadi.Inference.ll_multinom(model_spectrum, sfs2d)
ll_c
Out[30]:
In [31]:
# get maximum log likelihood of best fit parameter combination for the model
# with symmetrical migration rates
func = dadi.Demographics2D.split_mig
func_ex = dadi.Numerics.make_extrap_log_func(func)
# best-fit paramter combination for the split_mig model (see section 2 above)
popt_split_mig = [0.994185, 1.766127, 0.922632, 0.250688]
model_spectrum_sym_mig = func_ex(popt_split_mig, ns, pts_l)
model_spectrum_sym_mig_scaled = dadi.Inference.optimally_scaled_sfs(model_spectrum_sym_mig, sfs2d)
ll_s = dadi.Inference.ll_multinom(model_spectrum_sym_mig, sfs2d)
ll_s
Out[31]:
In [32]:
D = 2*(ll_c - ll_s)
D
Out[32]:
In [77]:
# calculate p-value for Chi-square dist. with 1 degree of freedom
p = dadi.Godambe.sum_chi2_ppf(D)
p
Out[77]:
It looks as though allowing for asymmetrical gene flow significantly improves the fit to the data. Linkage is unlikely to push the p-value above a significance threshold of 1e-2, for instance.
In [63]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 300 # run a maximum of 300 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_asym_mig_nom1" # set file name stub for opt. result files
# fix m1 to zero, allow all other parameters to be optimised:
fixed_params = [None, None, None, 0.0, None] # specify None to optimise all params
In [70]:
dadi.Inference.optimize_log_fmin?
In [62]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
"""
p_init: initial parameter values to run optimisation from
"""
if perturb == True:
p_init = dadi.Misc.perturb_params(p_init, fold=fold,
upper_bound=upper_bound, lower_bound=lower_bound)
# note upper_bound and lower_bound variables are expected to be in the namespace of each engine
# run optimisation of parameters
popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
lower_bound=lower_bound, upper_bound=upper_bound, \
verbose=verbose, maxiter=maxiter, full_output=full_output, \
fixed_params=fixed_params)
# pickle to file
import dill
name = outname[:] # make copy of file name stub!
for p in p_init:
name += "_%.4f" % (p)
with open(name + ".dill", "w") as fh:
dill.dump((p_init, popt), fh)
return p_init, popt
In [69]:
# using optimal parameters as starting values to perturb
# m1 (0.084) will be ignored
p0 = popt
p0
Out[69]:
In [66]:
#ar_split_asym_mig_nom1 = lbview.map(run_dadi, repeat(p0, 10), block=False, order=True)
In [67]:
ar_split_asym_mig_nom1 = []
for filename in glob("OUT_2D_models/split_asym_mig_nom1*dill"):
ar_split_asym_mig_nom1.append(dill.load(open(filename)))
In [71]:
l = 2*len(p0)+1
success = [flatten(out)[:l] for out in ar_split_asym_mig_nom1 if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[71]:
This looks like convergence.
In [72]:
# extract optimal parameter values from data frame
popt_nom1 = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 5:10])
popt_nom1
Out[72]:
In [76]:
# calculate best-fit model spectrum
model_spectrum_nom1 = func_ex(popt_nom1, ns, pts_l)
model_spectrum_nom1_scaled = dadi.Inference.optimally_scaled_sfs(model_spectrum_nom1, sfs2d)
In [78]:
# get log likelihood of best-fit spectrum of IM model without ery->par migration
ll_s = dadi.Inference.ll_multinom(model_spectrum_nom1, sfs2d)
ll_s
Out[78]:
In [77]:
# get log likelihood for model that also optimised m1 (from above)
ll_c = dadi.Inference.ll_multinom(model_spectrum, sfs2d)
ll_c
Out[77]:
In [79]:
D = 2*(ll_c - ll_s)
D
Out[79]:
In [81]:
# calculate p-value
# the weights specify an equal mixture of a chi^2 dist. with 0 d.o.f. (a point
# mass at 0) and a chi^2 dist. with 1 d.o.f., since m1 = 0 lies on the boundary
# of the parameter space
p = dadi.Godambe.sum_chi2_ppf(D, weights=(0.5, 0.5))
p
Out[81]:
The gene flow from ery into par is highly significant. It is unlikely that this result can be compromised by linkage between the SNPs.
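If bootstrapped data sets were available, this could be checked explicitly with dadi's Godambe machinery (a sketch, not run here; `all_boot` is a hypothetical list of bootstrapped 2D spectra, `popt_nom1` the best-fit parameters of the model with m1 fixed to 0, and index 3 marks that fixed parameter):

def linkage_adjusted_pval(all_boot, popt_nom1, D):
    # scale D by the Godambe adjustment factor for linked data, then apply
    # the same boundary-adjusted mixture distribution as above
    adj = dadi.Godambe.LRT_adjust(func_ex, pts_l, all_boot, popt_nom1,
                                  sfs2d, nested_indices=[3], multinom=True)
    return dadi.Godambe.sum_chi2_ppf(adj * D, weights=(0.5, 0.5))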
In [33]:
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = [14, 12]
pylab.rcParams['font.size'] = 14
dadi.Plotting.plot_2d_comp_multinom(model_spectrum, sfs2d, vmin=1)
Plot residuals between IM models with symmetrical and asymmetrical migration.
In [36]:
dadi.Plotting.plot_2d_comp_multinom(model=model_spectrum_asym_mig_scaled.fold(), \
data=model_spectrum_sym_mig_scaled.fold(), vmin=1,\
title=['sym mig', 'asym mig'])
The model spectrum for asymmetric gene flow has an excess (brown) of low to medium MAF SNPs in PAR that occur at low frequency in ERY.
In [40]:
popt_split_mig = popt_split_mig + [popt_split_mig[-1]]
popt_split_mig
Out[40]:
In [44]:
pd.DataFrame(data=np.array([popt_split_mig, popt]), index=['sym_mig', 'asym_mig'], columns=['nu1', 'nu2', 'T', 'm1', 'm2'])
Out[44]:
The excess of SNPs in MAF category 1 that did not introgress from PAR into ERY could be due to the greater population size increase inferred for PAR in the asymmetric gene flow model, although this model also inferred a greater time since the population size increase, which should lead to greater equilibration of the initial rise in low-frequency variants.
At the end of the last Ice Age, about 10 Ky ago, both ERY and PAR expanded from refugial areas across Iberia and across northern, central and western Europe, respectively. This expansion likely happened by long-distance migration, where only very few individuals colonised a new habitat. This should lead to a series of founder events that successively reduced genetic diversity the further away from the refugium.
In [35]:
from glob import glob
import dill
from utility_functions import *
import pandas as pd
# turn on floating point division by default, old behaviour via '//'
from __future__ import division
In [6]:
lbview = cl.load_balanced_view()
from itertools import repeat
In [7]:
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = [12, 10]
pylab.rcParams['font.size'] = 14
In [36]:
def split_asym_mig_2epoch(params, ns, pts):
"""
params = (nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2)
ns = (n1,n2)
Split into two populations of specified size, with potentially asymmetric migration.
The split coincides with a stepwise size change in the daughter populations. Then,
have a second stepwise size change at some point in time after the split. This is
enforced to happen at the same time for both populations. Migration is assumed to
be the same during both epochs.
nu1_1: pop size ratio of pop 1 after split (with respect to Na)
nu2_1: pop size ratio of pop 2 after split (with respect to Na)
T1: Time from split to second size change (in units of 2*Na generations)
nu1_2: pop size ratio of pop 1 after second size change (with respect to Na)
nu2_2: pop size ratio of pop 2 after second size change (with respect to Na)
T2: time in past of second size change (in units of 2*Na generations)
m1: Migration rate from ery into par (in units of 2*Na ind per generation)
m2: Migration rate from par into ery (in units of 2*Na ind per generation)
n1,n2: Sample sizes of resulting Spectrum
pts: Number of grid points to use in integration.
"""
nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2 = params
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
# split
phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
# divergence with potentially asymmetric migration for time T1
phi = dadi.Integration.two_pops(phi, xx, T1, nu1_1, nu2_1, m12=m2, m21=m1)
# divergence with potentially asymmetric migration and different pop size for time T2
phi = dadi.Integration.two_pops(phi, xx, T2, nu1_2, nu2_2, m12=m2, m21=m1)
fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
return fs
In [37]:
cl[:].push(dict(split_asym_mig_2epoch=split_asym_mig_2epoch))
Out[37]:
In [38]:
%%px --local
func = split_asym_mig_2epoch
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [39]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_asym_mig_2epoch" # set file name stub for opt. result files
fixed_params = None
In [12]:
%%px --local
# set lower and upper bounds to nu1, nu2, T, m1, m2
upper_bound = [1e4, 1e4, 6, 1e4, 1e4, 6, 10, 10]
lower_bound = [1e-4, 1e-4, 0, 1e-4, 1e-4, 0, 0, 0]
In [40]:
p0 = [1.0, 3.0, 2.0, 1.5, 3.8, 0.343805, 0.076152, 0.347306]
In [27]:
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [29]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[29]:
In [30]:
popt = df.sort_values(by='-logL', ascending=True).iloc[0,8:16]
popt
Out[30]:
In [36]:
p0 = np.array(popt)
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [15]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[15]:
In [25]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,8:16])
popt
Out[25]:
In [24]:
p0 = [0.94708150816833536, 3.8315744841549426, 1.2017020166706207,0.71982281273823534, 0.25322772844223501, 0.011374413880093007,0.0080052470233440004, 0.5823162256176253]
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [26]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[26]:
In [27]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,8:16])
popt
Out[27]:
In [28]:
p0 = [1.7148538096681663, 4.9162601290093262, 2.4096584681561115,0.13455833748374804, 0.16065741923794288, 0.0049387150509544253,0.027701205471474446, 0.36461853899050334]
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [29]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[29]:
In [30]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,8:16])
popt
Out[30]:
In [31]:
p0 = [1.0456418566167558, 3.6959057980884156, 1.2213291569132876,0.12722919734283214, 0.082311967206571773, 0.0033710890551920097,0.063492045724766522, 0.54652237829166095]
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [33]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[33]:
In [34]:
%%px --local
pts_l = [50, 60, 70]
dadi_opt_func = dadi.Inference.optimize_log # uses BFGS algorithm
fold = 1 # perturb randomly up to `fold` times 2-fold
maxiter = 100 # run a maximum of 100 iterations
In [35]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,8:16])
popt
Out[35]:
In [36]:
p0 = [0.90877952357019698, 3.5342771794685697, 1.1440736311780284,0.082373742973961023, 0.033507304302622123, 0.0014247576674479255,0.078269443701444424, 0.5629736811087368]
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [37]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[37]:
In [38]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,8:16])
popt
Out[38]:
In [14]:
p0 = [1.0179130901393454, 3.6975318699708484, 1.2695784485894963,0.00040204669127636192, 0.00020950770841736489,8.6260651610300379e-06, 0.06719544663984639, 0.54962618122149143]
#ar_split_asym_mig_2epoch = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [44]:
ar_split_asym_mig_2epoch = []
for filename in glob("OUT_2D_models/split_asym_mig_2epoch_[1234567890]*dill"):
ar_split_asym_mig_2epoch.append(dill.load(open(filename)))
l = 2*len(p0)+1
# get final parameter combinations from all optimisations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_2epoch]
#nu1_1,nu2_1,T1,nu1_2,nu2_2,T2,m1,m2
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'T1_0', 'nu1_2_0', 'nu2_2_0', 'T2_0', 'm1_0', 'm2_0', 'nu1_opt', 'nu2_opt', 'T1_opt', 'nu1_2_opt', 'nu2_2_opt', 'T2_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[44]:
It is apparently difficult to achieve convergence. However, all best-fit models indicate a very recent and very drastic population size reduction for ERY and PAR. Allowing for a second stepwise size change by adding three more parameters ($\nu_{ery_2}$, $\nu_{par_2}$ and $T_2$) improves the likelihood by 1,479 logL units compared to the asymmetric migration model above (best logL -18,105) and is therefore clearly significant (although it was optimised with a coarser grid).
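For the record, the corresponding likelihood-ratio computation (a sketch using the 1,479 logL units quoted above; the nesting is not strictly regular, but with a D this large the conclusion is unaffected):

from scipy.stats import chi2
D = 2 * 1479           # 2 * (ll_2epoch - ll_asym_mig)
p = chi2.sf(D, df=3)   # three added parameters; p is numerically indistinguishable from 0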
In [45]:
popt = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 8:16])
popt
Out[45]:
In [46]:
# calculate best-fit model spectrum
model = func_ex(popt, ns, pts_l)
ll_model = dadi.Inference.ll_multinom(model, sfs2d)
ll_model
Out[46]:
In [47]:
theta = dadi.Inference.optimal_sfs_scaling(model, sfs2d)
L = sfs2d.data.sum()
print "The optimal value of theta per site for the ancestral population is {0:.5f}.".format(theta/L)
In [48]:
mu = 3e-9
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ERY and PAR (in number of diploid individuals) implied by this theta is:\n {0:,}.".format(int(N_ref))
In [49]:
ery_1, par_1, T1, ery_2, par_2, T2, m1, m2 = popt
In [57]:
print "The ancestral population split apart {0:,} generations ago.".format(int((T1+T2)*2*N_ref)),
print "Immediately after the split the ERY population changed to a size of {0:,} and the PAR population to {1:,}.".format(int(ery_1*N_ref), int(par_1*N_ref)),
print "Since the split of the ancestral population, PAR received 1 individual from ERY every {0:.2f} generations,".format(1.0/(m1*par_1/2)),
print "while ERY received 1 PAR individual every {0:.2f} generations.".format(1.0/(m2*ery_1/2)),
print "Put another way: The PAR population contained a constant proportion of {0:.2e} of new immigrant alleles each generation".format(m1/2/N_ref),
print "and the ERY population contained a constant proportion of {0:.2e} of new immigrant alleles each generation.".format(m2/2/N_ref)
print "ERY and PAR underwent a second stepwise simultaneous population size change {0:,} generations in the past.".format(int(T2*2*N_ref)),
print "ERY changed to a size of {0:,} individuals and PAR to a size of {1:,} individuals.".format(int(ery_2*N_ref), int(par_2*N_ref))
The 2nd population size change seems unreasonably recent and drastic. Fitting this model to the corrected spectrum produced very similar results, so if this is due to an artifact in the data, it could not be removed by Ludovic's correction. This is worrying, since the improvement in fit is too large to be ignored (still 325 logL units with the corrected spectrum), but the inferred time and magnitude of the 2nd population size change can hardly be realistic.
In [54]:
dadi.Plotting.plot_2d_comp_multinom(data=sfs2d, model=model, vmin=1)
I would like to compare the best-fit spectra of the asymmetric migration model and this recent bottleneck model.
In [55]:
recent_bottleneck = dadi.Inference.optimally_scaled_sfs(model, sfs2d)
In [57]:
func_ex = dadi.Numerics.make_extrap_log_func(split_asym_mig)
popt_asym_mig = [ 0.92091098, 2.2386172 , 1.24468148, 0.08414741, 0.49491818]
model_asym_mig = func_ex(popt_asym_mig, ns, pts_l)
model_asym_mig = dadi.Inference.optimally_scaled_sfs(model_asym_mig, sfs2d)
In [58]:
dadi.Plotting.plot_2d_comp_multinom(data=recent_bottleneck.fold() , model=model_asym_mig.fold(), \
vmin=1, title=['recent bottleneck', 'asym mig'], pop_ids=['ery', 'par'])
The predicted spectra under the two models differ mainly in the SNP count classes [0, 1] and [1, 0]. This is also true when the two models are fit to the corrected spectrum.
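Those two classes can also be read off the scaled spectra directly (a quick sketch):

for idx in [(0, 1), (1, 0)]:
    print idx, recent_bottleneck.fold()[idx], model_asym_mig.fold()[idx]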
I would now like to add a gradual population size change after the split of the ancestral population. So far, the split coincided with a stepwise population size change.
In [1]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[1]:
In [3]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [4]:
%%px --local
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
In [5]:
%%px --local
# add population labels
sfs2d_unfolded.pop_ids = ["ery", "par"]
sfs2d = sfs2d_unfolded.fold()
In [7]:
%psource dadi.Demographics2D.IM
Dadi's built-in `IM` model function contains what I would like to model now. To avoid confusion of migration rate parameters, I am nevertheless going to specify a slightly modified version of this model that uses the same naming convention and order of migration rates as used above.
In [6]:
def IM(params, ns, pts):
"""
ns = (n1,n2)
params = (s,nu1,nu2,T,m1,m2)
Isolation-with-migration model with exponential pop growth or decline.
s: Size of pop 1 after split. (Pop 2 has size 1-s.)
nu1: Final size of pop 1.
nu2: Final size of pop 2.
T: Time in the past of split (in units of 2*Na generations)
m1: Migration rate from ery into par (in units of 2*Na ind per generation)
m2: Migration rate from par into ery (in units of 2*Na ind per generation)
n1,n2: Sample sizes of resulting Spectrum
pts: Number of grid points to use in integration.
"""
s,nu1,nu2,T,m1,m2 = params
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
# exponential size change: pop 1 from s to nu1 and pop 2 from 1-s to nu2 over time T
nu1_func = lambda t: s * (nu1/s)**(t/T)
nu2_func = lambda t: (1-s) * (nu2/(1-s))**(t/T)
phi = dadi.Integration.two_pops(phi, xx, T, nu1_func, nu2_func,
m12=m2, m21=m1)
fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
return fs
In [7]:
%who
In [8]:
cl[:].push(dict(IM=IM))
Out[8]:
In [9]:
%%px --local
func = IM
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [10]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 300 # run a maximum of 300 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/IM" # set file name stub for opt. result files
fixed_params = None # do not fix any parameters, optimise all
In [11]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
"""
p_init: initial parameter values to run optimisation from
"""
if perturb == True:
p_init = dadi.Misc.perturb_params(p_init, fold=fold,
upper_bound=upper_bound, lower_bound=lower_bound)
# note upper_bound and lower_bound variables are expected to be in the namespace of each engine
# run optimisation of parameters
popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
lower_bound=lower_bound, upper_bound=upper_bound, \
verbose=verbose, maxiter=maxiter, full_output=full_output, \
fixed_params=fixed_params)
# pickle to file
import dill
name = outname[:] # make copy of file name stub!
for p in p_init:
name += "_%.4f" % (p)
with open(name + ".dill", "w") as fh:
dill.dump((p_init, popt), fh)
return p_init, popt
In [12]:
lbview = cl.load_balanced_view()
In [13]:
from itertools import repeat
In [14]:
%%px --local
# set lower and upper bounds to s, nu1, nu2, T, m1, m2
upper_bound = [1, 1e4, 1e4, 4, 10, 10] # note, I have increased the upper bound for T
lower_bound = [0, 1e-4, 1e-4, 0, 0, 0]
In [15]:
# using optimal parameter values from `split_asym_mig` model as starting values to perturb
p0 = [0.5, 0.92091098, 2.2386172 , 1.24468148, 0.08414741, 0.49491818]
In [20]:
#ar_IM = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)
In [16]:
from utility_functions import *
In [24]:
ar_IM.done()
Out[24]:
In [25]:
ar_IM.elapsed/60
Out[25]:
It took almost 55 minutes to run these 10 optimisations!
In [27]:
get_flag_count(ar_IM, NM=True)
None of these optimisations were successful!
Let's have a look at the final parameters anyway.
In [17]:
from glob import glob
import dill
In [29]:
ar_IM_1 = []
for filename in glob("OUT_2D_models/IM*dill"):
ar_IM_1.append(dill.load((open(filename))))
In [18]:
import pandas as pd
In [31]:
l = 2*len(p0)+1
returned = [flatten(out)[:l] for out in ar_IM_1]
df = pd.DataFrame(data=returned, columns=['s_0' ,'nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 's_opt' ,'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[31]:
Note that the "opt" parameter values are just the final parameter values at the point where the optimisation was aborted. The parameter combinations with the highest likelihood have an extremely long divergence time T. Also, the best-fit parameter combination of the `split_asym_mig` model above had a log-likelihood of -18104, i. e. MUCH higher than the best parameter combination returned here.
In [32]:
cl[0]['fold']
Out[32]:
In [33]:
cl[0]['maxiter']
Out[33]:
In [34]:
%%px
fold = 1
maxiter = 500
In [35]:
#ar_IM = lbview.map(run_dadi, repeat(p0, 10), block=False, order=False)
In [36]:
ar_IM.done()
Out[36]:
In [37]:
ar_IM.elapsed/60
Out[37]:
In [38]:
get_flag_count(ar_IM, NM=True)
6 optimisations were successful.
In [39]:
ar_IM_2 = []
for filename in glob("OUT_2D_models/IM*dill"):
ar_IM_2.append(dill.load((open(filename))))
In [41]:
l = 2*len(p0)+1
success = [flatten(out)[:l] for out in ar_IM_2 if out[1][4] == 0]
df = pd.DataFrame(data=success, columns=['s_0' ,'nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 's_opt' ,'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[41]:
The optimisations have not converged on a set of optimal parameter values.
In [42]:
cl[:]['fold']
Out[42]:
In [43]:
p0
Out[43]:
In [44]:
%%px
fold = 1.5
In [45]:
ar_IM = lbview.map(run_dadi, repeat(p0, 20))
In [20]:
ar_IM_3 = []
for filename in glob("OUT_2D_models/IM*dill"):
ar_IM_3.append(dill.load((open(filename))))
In [21]:
l = 2*len(p0)+1
success = [flatten(out)[:l] for out in ar_IM_3 if out[1][4] == 0]
df = pd.DataFrame(data=success, columns=['s_0' ,'nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 's_opt' ,'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[21]:
This model is hard to fit to the data. The best-fit parameter combinations contain an extremely long divergence time T of around 3, i. e. $6 \times N_{ref}$ generations. No parameter combination with higher likelihood could be found than in the previous runs, and they are all much lower in likelihood than for the `split_asym_mig` model (-18104), which imposed a stepwise size change coincident with the population split and otherwise constant population sizes. I do not understand why this more flexible model is so much harder to fit to the data.
In section 6, I noticed that final parameter combinations from "unsuccessful" optimisations can have higher likelihood than parameter combinations from successful optimisations. I am therefore going to load the final parameter combinations from all "unsuccessful" optimisations from file, in order to check whether some have higher likelihood than the ones seen so far.
In [39]:
from glob import glob
import dill
In [41]:
ar_IM = []
for filename in glob("OUT_2D_models/IM*dill"):
ar_IM.append(dill.load((open(filename))))
In [42]:
import pandas as pd
In [45]:
l = 2*6+1
# get all parameters returned from "unsuccessful" optimisations into a table
returned = [flatten(out)[:l] for out in ar_IM if out[1][4] != 0]
df = pd.DataFrame(data=returned, columns=['s_0' ,'nu1_0','nu2_0', 'T_0', 'm1_0', 'm2_0', 's_opt' ,'nu1_opt', 'nu2_opt', 'T_opt', 'm1_opt', 'm2_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head()
Out[45]:
No better parameter combinations can be found here.
In [23]:
import numpy as np
In [34]:
popt = np.array(df.sort_values(by='-logL', ascending=True).iloc[0, 6:12])
popt
Out[34]:
In [35]:
%psource IM
In [36]:
func_ex = dadi.Numerics.make_extrap_log_func(IM)
model_spectrum = func_ex(popt, ns, pts_l)
In [37]:
dadi.Plotting.plot_2d_comp_multinom(model_spectrum, sfs2d, vmin=1)
I would like to see the difference in the expected spectra from the `split_asym_mig` and this `IM` model.
In [42]:
# optimally scale the best-fit IM model spectrum,
# by multiplication with optimal theta
model_spectrum = dadi.Inference.optimally_scaled_sfs(model_spectrum, sfs2d)
In [43]:
# make extraplating function from `split_asym_mig` model function
func_ex = dadi.Numerics.make_extrap_log_func(split_asym_mig)
# these were the optimal parameters for the `split_asym_mig` model
popt_asym_mig = [0.92091098, 2.2386172 , 1.24468148, 0.08414741, 0.49491818]
# get (unscaled) expected SFS for optimal `split_asym_mig` model
model_spectrum_asym_mig = func_ex(popt_asym_mig, ns, pts_l)
# optimally scale spectrum
model_spectrum_asym_mig = dadi.Inference.optimally_scaled_sfs(model_spectrum_asym_mig, sfs2d)
In [48]:
dadi.Plotting.plot_2d_comp_multinom(data=model_spectrum_asym_mig.fold(), model=model_spectrum.fold(), \
vmin=1, title=['asym_mig', 'IM'], pop_ids=['ery', 'par'])
Looking at the residual plot, the `IM` model has a deficit (blue) of low frequency private alleles in par and an excess (red) of high frequency private alleles in ery.
So far, a model with an ancient split and low but constant gene flow (more from par into ery than in the other direction) has fit the data best. I would like to know whether I can achieve an improvement if I let gene flow cease completely at some point in time.
In [37]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[37]:
In [38]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [39]:
%%px --local
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
In [40]:
%%px --local
# add population labels
sfs2d_unfolded.pop_ids = ["ery", "par"]
sfs2d = sfs2d_unfolded.fold()
In [41]:
import numpy as np
In [42]:
def split_asym_mig_iso(params, ns, pts):
"""
params = (nu1,nu2,Tc,m1,m2,Ti)
ns = (n1,n2)
Split into two populations of specified size, with potentially asymmetric migration
for a time Tc followed by complete isolation until present.
nu1: Size of population 1 after split.
nu2: Size of population 2 after split.
Tc: Time of gene flow after split (in units of 2*Na generations)
m1: Migration rate from ery into par (in units of 2*Na ind per generation)
m2: Migration rate from par into ery (in units of 2*Na ind per generation)
Ti: Time of isolation after cessation of gene flow
The split lies (Tc+Ti) * 2*Na generations in the past.
n1,n2: Sample sizes of resulting Spectrum
pts: Number of grid points to use in integration.
"""
nu1,nu2,Tc,m1,m2,Ti = params
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
# split
phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
# divergence with potentially asymmetric migration
phi = dadi.Integration.two_pops(phi, xx, Tc, nu1, nu2, m12=m2, m21=m1)
# divergence without gene flow
phi = dadi.Integration.two_pops(phi, xx, Ti, nu1, nu2, m12=0, m21=0)
fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
return fs
This is a piecewise model. I could make gene flow fade out gradually, but that would require four migration parameters that specify the initial and final levels of gene flow for each direction (see the sketch below). Also, it would make the model less comparable to the `split_asym_mig` model, which is also piecewise.
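For the record, a sketch of that fade-out alternative (not fitted here; dadi's `Integration.two_pops` accepts functions of time for the migration rates, analogous to the size functions in the `IM` model above):

def split_fading_mig(params, ns, pts):
    """
    params = (nu1,nu2,T,m1_0,m1_T,m2_0,m2_T)
    Sketch: divergence with gene flow that interpolates exponentially between
    initial (m1_0, m2_0) and present-day (m1_T, m2_T) rates, i. e. the four
    migration parameters mentioned above.
    """
    nu1,nu2,T,m1_0,m1_T,m2_0,m2_T = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    # exponential interpolation of each migration rate over the divergence time
    m1_func = lambda t: m1_0 * (m1_T/m1_0)**(t/T)
    m2_func = lambda t: m2_0 * (m2_T/m2_0)**(t/T)
    phi = dadi.Integration.two_pops(phi, xx, T, nu1, nu2, m12=m2_func, m21=m1_func)
    return dadi.Spectrum.from_phi(phi, ns, (xx,xx))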
In [43]:
cl[:].push(dict(split_asym_mig_iso=split_asym_mig_iso))
Out[43]:
In [44]:
%%px --local
func = split_asym_mig_iso
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [45]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 300 # run a maximum of 300 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_asym_mig_iso" # set file name stub for opt. result files
fixed_params = None
In [46]:
%%px --local
# set lower and upper bounds to nu1, nu2, Tc, m1, m2 and Ti
upper_bound = [1e4, 1e4, 4, 10, 10, 4] # note, I have increased the upper bound for T
lower_bound = [1e-4, 1e-4, 0, 0, 0, 0]
In [47]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
"""
p_init: initial parameter values to run optimisation from
"""
if perturb == True:
p_init = dadi.Misc.perturb_params(p_init, fold=fold,
upper_bound=upper_bound, lower_bound=lower_bound)
# note upper_bound and lower_bound variables are expected to be in the namespace of each engine
# run optimisation of parameters
popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
lower_bound=lower_bound, upper_bound=upper_bound, \
verbose=verbose, maxiter=maxiter, full_output=full_output, \
fixed_params=fixed_params)
# pickle to file
import dill
name = outname[:] # make copy of file name stub!
for p in p_init:
name += "_%.4f" % (p)
with open(name + ".dill", "w") as fh:
dill.dump((p_init, popt), fh)
return p_init, popt
In [48]:
lbview = cl.load_balanced_view()
In [49]:
from itertools import repeat
In [21]:
# these were the optimal parameters for the `split_asym_mig` model
popt_asym_mig = [0.92091098, 2.2386172 , 1.24468148, 0.08414741, 0.49491818]
p0 = [0.92091098, 2.2386172 , 1.24468148/2, 0.08414741, 0.49491818, 1.24468148/2]
In [17]:
#ar_split_asym_mig_iso = lbview.map(run_dadi, repeat(p0, 10))
In [18]:
from utility_functions import *
In [19]:
get_flag_count(ar_split_asym_mig_iso, NM=True)
In [19]:
from glob import glob
import dill
ar_split_asym_mig_iso = []
for filename in glob("OUT_2D_models/split_asym_mig_iso*dill"):
ar_split_asym_mig_iso.append(dill.load(open(filename)))
In [20]:
import pandas as pd
In [22]:
l = 2*len(p0)+1
# show only successful optimisations
success = [flatten(out)[:l] for out in ar_split_asym_mig_iso if out[1][4] == 0]
df = pd.DataFrame(data=success, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[22]:
In [23]:
l = 2*len(p0)+1
# show parameter combinations from unsuccessful optimisations:
returned = [flatten(out)[:l] for out in ar_split_asym_mig_iso if out[1][4] != 0]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', '-logL'])
df.sort_values(by='-logL', ascending=True)
Out[23]:
This is strange. "Unsuccessful" optimisations (those aborted due to `Maximum number of iterations reached`) returned parameter combinations that are much better than those from the two successful optimisations!
I wonder how many good parameter combinations I have missed by discarding "unsuccessful" optimisations!
Fortunately, I have saved the results of all optimisations to disk. So I can check each of my model optimisations for previously missed parameter combinations that are better than the ones I have found so far.
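A small helper for that re-checking (a sketch; it assumes the `(p_init, popt)` pickle layout and the `full_output` tuple `(xopt, fopt, iter, funcalls, warnflag)` used by `run_dadi` throughout this notebook):

import dill
from glob import glob

def best_saved_run(stub):
    # load every saved optimisation for a model and return the run with the
    # lowest -logL (fopt), regardless of the optimiser's warnflag
    runs = [dill.load(open(f)) for f in glob(stub + "*dill")]
    return min(runs, key=lambda out: out[1][1])

For example, `best_saved_run("OUT_2D_models/split_asym_mig_iso")` would return the best run saved for the current model.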
The log likelihood of the `split_asym_mig` model with best-fit parameters was: -18104.966266885222.
In [29]:
D = 2*(-17596.977545 - -18104.966266885222)
D
Out[29]:
This ancient migration model fits the data significantly better than the `split_asym_mig` model. That also means that the recent period of complete isolation is significant. Assuming that I have found the best-fit parameters for that model, this means that a model that allows for a very ancient split (>1 million generations ago) and that specifies a recent cessation of gene flow fits the data better than a model with a less ancient split and constant migration until the present.
Let's get a few more optimisation results.
In [24]:
p0 = np.array(df.sort_values(by='-logL', ascending=True).iloc[0,6:12])
p0
Out[24]:
In [25]:
ar_split_asym_mig_iso = lbview.map(run_dadi, repeat(p0, 10))
In [26]:
ar_split_asym_mig_iso.elapsed/60
Out[26]:
In [27]:
ar_split_asym_mig_iso = []
for filename in glob("OUT_2D_models/split_asym_mig_iso*dill"):
ar_split_asym_mig_iso.append(dill.load(open(filename)))
In [28]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_iso]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[28]:
There seems to be decent convergence. The long time inferred for the split ($T_c+T_i$) is worrying. I think it would be good to start a few more optimisations around the best parameter combination.
In [29]:
%%px
fold = 1
In [30]:
p0 = np.array(df.sort_values(by='-logL', ascending=True).iloc[0,6:12])
p0
Out[30]:
In [31]:
#ar_split_asym_mig_iso = lbview.map(run_dadi, repeat(p0, 10))
In [55]:
% ll OUT_2D_models/split_asym_mig_iso*dill
In [56]:
ar_split_asym_mig_iso = []
for filename in glob("OUT_2D_models/split_asym_mig_iso*dill"):
ar_split_asym_mig_iso.append(dill.load(open(filename)))
In [57]:
l = 2*6+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_iso]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[57]:
This looks like decent convergence. I would like to refine the search by using a finer grid.
In [50]:
%%px --local
# set lower and upper bounds to nu1, nu2, Tc, m1, m2 and Ti
upper_bound = [1e4, 1e4, 6, 10, 10, 4] # note, I have increased the upper bound for T
lower_bound = [1e-4, 1e-4, 0, 0, 0, 0]
In [51]:
%%px
fold = 1
pts_l = [50, 60, 70]
maxiter = 300
In [58]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,6:12])
popt
Out[58]:
In [59]:
p0 = popt
#ar_split_asym_mig_iso = lbview.map(run_dadi, repeat(p0, 10))
In [60]:
ar_split_asym_mig_iso = []
for filename in glob("OUT_2D_models/split_asym_mig_iso*dill"):
ar_split_asym_mig_iso.append(dill.load(open(filename)))
In [61]:
l = 2*6+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_iso]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[61]:
$T_C$ is close to the upper bound of 6 that I have set. Although it is hard to believe such high values of a time parameter, I am going to increase the bound and run a few more optimisations. If the next optimisations again get close to the now increased boundary, I will declare the optimisation failed.
In [67]:
%%px --local
# set lower and upper bounds to nu1, nu2, Tc, m1, m2 and Ti
upper_bound = [1e4, 1e4, 8, 10, 10, 4] # note, I have increased the upper bound for T
In [65]:
%%px
fold = 1
pts_l = [50, 60, 70]
maxiter = 100
In [66]:
popt = np.array(df.sort_values(by='-logL', ascending=True).head(10).iloc[0,6:12])
popt
Out[66]:
In [68]:
p0 = popt
#ar_split_asym_mig_iso = lbview.map(run_dadi, repeat(p0, 10), block=False)
In [71]:
ar_split_asym_mig_iso.done()
Out[71]:
In [72]:
ar_split_asym_mig_iso = []
for filename in glob("OUT_2D_models/split_asym_mig_iso*dill"):
ar_split_asym_mig_iso.append(dill.load(open(filename)))
In [73]:
l = 2*6+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_split_asym_mig_iso]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[73]:
It is difficult to achieve convergence. Parameter combinations with quite different time of primary contact ($T_C$) have very similar likelihood.
In [76]:
popt_mig_iso = np.array(df.sort_values(by='-logL', ascending=True).iloc[0,6:12])
popt_mig_iso
Out[76]:
In [77]:
model_mig_iso = func_ex(popt_mig_iso, ns, pts_l)
In [78]:
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = [14, 12]
pylab.rcParams['font.size'] = 14
In [79]:
dadi.Plotting.plot_2d_comp_multinom(model=model_mig_iso, data=sfs2d, vmin=1)
In [80]:
theta = dadi.Inference.optimal_sfs_scaling(model_mig_iso, sfs2d)
theta
Out[80]:
In [81]:
mu = 3e-9
L = sfs2d.data.sum() # this sums over all entries in the spectrum, including masked ones, i. e. also contains invariable sites
print "The total sequence length for the 2D spectrum is {0:,}.".format(int(L))
N_ref = theta/L/mu/4
print "The effective size of the ancestral population of ery and par (in number of diploid individuals) implied by this theta is: {0:,}.".format(int(N_ref))
I think those long inferred divergence times lead to a reduction in the estimated theta.
In [82]:
popt = popt_mig_iso
print "The ancestral population split apart {0:,} generations ago.".format(int((popt[2]+popt[5])*2*N_ref)),
print "Immediately after the split the ERY population changed to a size of {0:,} and the PAR population to {1:,}.".format(int(popt[0]*N_ref), int(popt[1]*N_ref)),
print "Since the split of the ancestral population, PAR received 1 individual from ERY every {0:.2f} generations,".format(1.0/(popt[3]/2*popt[1])),
print "while ERY received 1 PAR individual every {0:.2f} generations.".format(1.0/(popt[4]/2*popt[0])),
print "Put another way: The par population contained a constant proportion of {0:.2e} of new immigrant alleles each generation".format(popt[3]/2/N_ref),
print "and the ery population contained a constant proportion of {0:.2e} of new immigrant alleles each generation.".format(popt[4]/2/N_ref),
print "ERY and PAR remained in contact for {0:,} generations.".format(int(popt[2]*2*N_ref)),
print "{0:,} generations ago gene flow between ERY and PAR had ceased.".format(int(popt[5]*2*N_ref))
See 2D_model_synthesis for correct parameter translations.
This is another indication that a recent secondary contact model is not compatible with the data. If the better fit of this ancient migration model (compared to constant migration) is not just due to fitting noise or biases in the data, then it would also mean that there is no detectable gene flow since the last ice age, despite the fact that the two populations are just a few kilometers apart and the clines of several markers are very wide across this transect of the hybrid zone.
In [57]:
model_mig_iso_scaled = dadi.Inference.optimally_scaled_sfs(model_mig_iso, sfs2d)
In [59]:
func_ex = dadi.Numerics.make_extrap_log_func(split_asym_mig)
In [60]:
popt_asym_mig = [0.92091098, 2.2386172 , 1.24468148, 0.08414741, 0.49491818]
In [61]:
model_asym_mig = func_ex(popt_asym_mig, ns, pts_l)
In [62]:
model_asym_mig_scaled = dadi.Inference.optimally_scaled_sfs(model_asym_mig, sfs2d)
In [64]:
dadi.Plotting.plot_2d_comp_multinom(data=model_mig_iso_scaled.fold() , model=model_asym_mig_scaled.fold(), vmin=1, title=['ancient mig', 'asym mig'], pop_ids=['ery', 'par'])
Apparently the constant asymmetric migration model predicts many more variants at frequency 1 in erythropus than the ancient asymmetric migration model.
I would like to see whether the fit to the data can be further improved if I add to the previous model a third epoch with a recent restart of gene flow.
In [1]:
from ipyparallel import Client
cl = Client()
cl.ids
Out[1]:
In [2]:
%%px --local
# run whole cell on all engines as well as in the local IPython session
import numpy
import sys
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [3]:
%%px --local
# import 2D unfolded spectrum
sfs2d_unfolded = dadi.Spectrum.from_file('dadiExercises/EryPar.unfolded.2dsfs.dadi_format')
In [4]:
%%px --local
# add population labels
sfs2d_unfolded.pop_ids = ["ery", "par"]
sfs2d = sfs2d_unfolded.fold()
In [5]:
def split_mig_iso_mig(params, ns, pts):
"""
params = (nu1,nu2,Tc,m1,m2,Ti,Tsc)
ns = (n1,n2)
Split into two populations of specified size, with potentially asymmetric migration
for a time Tc followed by a period of complete isolation Ti which is followed by a restart of
migration until present. Migration rates in the two epochs are assumed to be equal (and constant).
nu1: Size of population 1 after split.
nu2: Size of population 2 after split.
Tc: Time of gene flow after split (in units of 2*Na generations)
m1: Migration rate from ery into par (in units of 2*Na ind per generation)
m2: Migration rate from par into ery (in units of 2*Na ind per generation)
Ti: Time of isolation after cessation of gene flow
Tsc: Time of gene flow after period of isolation until present
The split lies (Tc+Ti+Tsc) * 2*Na generations in the past.
n1,n2: Sample sizes of resulting Spectrum
pts: Number of grid points to use in integration.
"""
nu1,nu2,Tc,m1,m2,Ti,Tsc = params
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
# split
phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
# divergence with potentially asymmetric migration
phi = dadi.Integration.two_pops(phi, xx, Tc, nu1, nu2, m12=m2, m21=m1)
# divergence without gene flow
phi = dadi.Integration.two_pops(phi, xx, Ti, nu1, nu2, m12=0, m21=0)
# divergence with potentially asymmetric migration
phi = dadi.Integration.two_pops(phi, xx, Tsc, nu1, nu2, m12=m2, m21=m1)
fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
return fs
This model just adds another epoch to the ancient migration model in the previous section. The period without gene flow is now followed by another period where gene flow recommences and lasts until the present. Note that migration rates are assumed to be the same in both epochs of gene flow.
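If one wanted to relax that assumption (a variant sketch, not fitted here), the secondary-contact epoch would simply take its own rates, `m1_sc` and `m2_sc`, as two additional parameters:

def split_mig_iso_mig2(params, ns, pts):
    """
    params = (nu1,nu2,Tc,m1,m2,Ti,Tsc,m1_sc,m2_sc)
    Like split_mig_iso_mig, but with independent migration rates
    (m1_sc, m2_sc) during the secondary-contact epoch.
    """
    nu1,nu2,Tc,m1,m2,Ti,Tsc,m1_sc,m2_sc = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    phi = dadi.Integration.two_pops(phi, xx, Tc, nu1, nu2, m12=m2, m21=m1)
    phi = dadi.Integration.two_pops(phi, xx, Ti, nu1, nu2, m12=0, m21=0)
    # secondary contact with its own, independently optimised rates
    phi = dadi.Integration.two_pops(phi, xx, Tsc, nu1, nu2, m12=m2_sc, m21=m1_sc)
    return dadi.Spectrum.from_phi(phi, ns, (xx,xx))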
In [6]:
cl[:].push(dict(split_mig_iso_mig=split_mig_iso_mig))
Out[6]:
In [7]:
%%px --local
func = split_mig_iso_mig
func_ex = dadi.Numerics.make_extrap_log_func(func)
In [8]:
%%px --local
# set up global variables on engines required for run_dadi function call
ns = sfs2d.sample_sizes # both populations have the same sample size
# setting the smallest grid size slightly larger than the largest population sample size (36)
pts_l = [40, 50, 60]
dadi_opt_func = dadi.Inference.optimize_log_fmin # uses Nelder-Mead algorithm
sfs = sfs2d
perturb = True
fold = 2 # perturb randomly up to `fold` times 2-fold
maxiter = 300 # run a maximum of 300 iterations
verbose = 0
full_output = True # need to have full output to get the warnflags (see below)
outname = "OUT_2D_models/split_mig_iso_mig" # set file name stub for opt. result files
fixed_params = None
In [10]:
%%px --local
# set lower and upper bounds to nu1, nu2, Tc, m1, m2, Ti, Tsc
upper_bound = [1e4, 1e4, 4, 10, 10, 4, 4] # note, I have increased the upper bound for T
lower_bound = [1e-4, 1e-4, 0, 0, 0, 0, 0]
In [11]:
def run_dadi(p_init): # for the function to be called with map, it needs to have one input variable
"""
p_init: initial parameter values to run optimisation from
"""
if perturb == True:
p_init = dadi.Misc.perturb_params(p_init, fold=fold,
upper_bound=upper_bound, lower_bound=lower_bound)
# note upper_bound and lower_bound variables are expected to be in the namespace of each engine
# run optimisation of parameters
popt = dadi_opt_func(p0=p_init, data=sfs, model_func=func_ex, pts=pts_l, \
lower_bound=lower_bound, upper_bound=upper_bound, \
verbose=verbose, maxiter=maxiter, full_output=full_output, \
fixed_params=fixed_params)
# pickle to file
import dill
name = outname[:] # make copy of file name stub!
for p in p_init:
name += "_%.4f" % (p)
with open(name + ".dill", "w") as fh:
dill.dump((p_init, popt), fh)
return p_init, popt
In [12]:
lbview = cl.load_balanced_view()
In [13]:
from itertools import repeat
In [14]:
# using the best fit parameters from the ancient migration model and adding an initial Tsc of 0.1
p0 = [1.798721, 4.397479, 3.879859, 0.061955, 0.379276, 0.107981, 0.1]
In [15]:
#ar_mig_iso_mig = lbview.map(run_dadi, repeat(p0, 10))
In [17]:
ar_mig_iso_mig.elapsed/60
Out[17]:
In [15]:
from glob import glob
import dill
from utility_functions import *
In [16]:
% ll OUT_2D_models/split_mig_iso_mig*dill
In [17]:
ar_mig_iso_mig = []
for filename in glob("OUT_2D_models/split_mig_iso_mig*dill"):
ar_mig_iso_mig.append(dill.load(open(filename)))
In [18]:
import pandas as pd
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_mig_iso_mig]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'Tsc_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', 'Tsc_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[18]:
I think the perturbation of Tc did not work very well. Let's generate a few more optimisations.
In [19]:
%%px --local
# set lower and upper bounds to nu1, nu2, Tc, m1, m2, Ti, Tsc
upper_bound = [1e4, 1e4, 8, 10, 10, 4, 4] # note, I have increased the upper bound for T
lower_bound = [1e-4, 1e-4, 0, 0, 0, 0, 0]
In [20]:
# best-fit parameters from the ancient migration model, but with Tc halved and an initial Tsc of 0.1
p0 = [1.798721, 4.397479, 3.879859/2, 0.061955, 0.379276, 0.107981, 0.1]
In [21]:
#ar_mig_iso_mig = lbview.map(run_dadi, repeat(p0, 10))
In [22]:
ar_mig_iso_mig.done()
Out[22]:
In [23]:
ar_mig_iso_mig = []
for filename in glob("OUT_2D_models/split_mig_iso_mig*dill"):
ar_mig_iso_mig.append(dill.load(open(filename)))
In [24]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_mig_iso_mig]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'Tsc_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', 'Tsc_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[24]:
No good convergence has been achieved yet. I need to extend the search close to the best parameter combination so far.
In [27]:
import numpy as np
In [28]:
popt = np.array( df.sort_values(by='-logL', ascending=True).head(10).iloc[0, 7:14] )
popt
Out[28]:
In [30]:
%%px
maxiter = 300
pts_l = [50, 60, 70]
fold = 1
In [31]:
p0 = popt
#ar_mig_iso_mig = lbview.map(run_dadi, repeat(p0, 10))
In [32]:
ar_mig_iso_mig = []
for filename in glob("OUT_2D_models/split_mig_iso_mig*dill"):
ar_mig_iso_mig.append(dill.load(open(filename)))
In [33]:
l = 2*len(p0)+1
# show all parameter combinations
returned = [flatten(out)[:l] for out in ar_mig_iso_mig]
df = pd.DataFrame(data=returned, \
columns=['nu1_0','nu2_0', 'Tc_0', 'm1_0', 'm2_0', 'Ti_0', 'Tsc_0', 'nu1_opt', 'nu2_opt', 'Tc_opt', 'm1_opt', 'm2_opt', 'Ti_opt', 'Tsc_opt', '-logL'])
df.sort_values(by='-logL', ascending=True).head(10)
Out[33]:
The best parameter combination from the ancient migration model had a -logL of 17582.88. So the addition of $T_{SC}$ to the model has not improved the fit significantly.
A time of secondary contact cannot be inferred from this spectrum.