In [19]:
# load dadi module
import sys
import numpy as np
sys.path.insert(0, '/home/claudius/Downloads/dadi')
import dadi
In [2]:
%less PAR.unfolded.sfs
In [2]:
# load 1D spectra from file
fs_ery = dadi.Spectrum.from_file("ERY.unfolded.sfs")
fs_par = dadi.Spectrum.from_file("PAR.unfolded.sfs")
See lines 1429-1435 and 1661-1664 in assembly.sh for the command lines that created these 1D unfolded spectra. Only sites with data from at least 9 individuals were included in the analysis.
In [23]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [14, 12]
plt.rcParams['font.size'] = 14
In [4]:
plt.grid()
plt.plot(fs_ery.fold(), 'ro-', label='ery', linewidth=2.5, markersize=12)
plt.plot(fs_par.fold(), 'g^-', label='par', linewidth=2.5, markersize=12)
plt.xlabel("minor allele frequency")
plt.ylabel("SNP count")
plt.legend()
Out[4]:
The site frequency spectrum should be roughly proportional to $1/p$, $p$ being the minor allele frequency, for any demographic history or selection regime. In particular the first and second frequency classes ($p=1$ and $p=2$) are deviating from this.
In [3]:
# fold spectra
fs_ery = fs_ery.fold()
fs_par = fs_par.fold()
In [6]:
# make the extrapolating version of the standard neutral model function
func_ex = dadi.Numerics.make_extrap_log_func(dadi.Demographics1D.snm)
# setting the smallest grid size slightly larger than the largest population sample size
pts_l = [40, 50, 60]
ns = fs_ery.sample_sizes
# calculate unfolded AFS under standard neutral model (up to a scaling factor theta)
snm = func_ex(0, ns, pts_l)
snm = snm.fold()
# scaled snm spectrum for ERY
snm_ery = dadi.Inference.optimally_scaled_sfs(snm, fs_ery)
# scaled snm spectrum for PAR
snm_par = dadi.Inference.optimally_scaled_sfs(snm, fs_par)
In [7]:
dadi.Plotting.plot_1d_comp_multinom(snm_ery[:19], fs_ery[:19], residual='linear')
In [8]:
dadi.Plotting.plot_1d_comp_multinom(snm_par[:19], fs_par[:19], residual='linear')
The lower plot (green line) is for the scaled Poisson residuals.
$$ residuals = (model - data)/\sqrt{model} $$The model is the expected counts in each frequency class. If these counts are Poisson distributed, then their variance is equal to their expectation. The differences between model and data are therefore scaled by the expected standard deviation of the model counts.
Ludovic has suggested optimising for a fraction $p$ of all SNP's that lie on the X chromosome. SNP's on the X chromosome are counted twice in the SFS since all sequenced individuals were male and C. parallelus males have no Y chromosome homologous to the X. This fraction $p$ should therefore be subtracted from even frequency classes and added to the respective frequency class that contains SNP's that are 1/2 as frequent, e. g. from class 2 --> 1 or from 8 --> 4. One could optimise for the minimum deviation from a neutral spectrum.
In [4]:
# number of segregating sites in ERY spectrum
fs_ery.S()
Out[4]:
In [5]:
# total number of sites in ERY spectrum
fs_ery.data.sum()
Out[5]:
In [6]:
# number of segregating sites in PAR spectrum
fs_par.S()
Out[6]:
In [7]:
# total number of sites in PAR spectrum
fs_par.data.sum()
Out[7]:
PAR has about the same number of SNP's as ERY, but these are distributed over a sequence length about 1/4 shorter than in ERY.
Since the ERY and PAR spectra have almost the same number of SNP's, the scaled standard neutral spectra are also almost the same.
Deviation from these standard neutral model spectra shall be minimised.
Now I need to write a cost function: it should take p (the fraction of SNP's on the X chromosome), modify the observed SFS and return the sum of squared deviations between the modified SFS and the standard neutral model SFS.
In [8]:
# turn on floating point division by default, old behaviour via '//'
from __future__ import division
In [9]:
p = 1/2
p
Out[9]:
In [10]:
# the spectrum object is a masked numpy array
fs_par.mask
Out[10]:
In [11]:
# make copy of SFS, we don't want to modify the original in place
sfs = fs_par.copy()
In [12]:
sfs[1] = 0
sfs
Out[12]:
In [13]:
fs_par
Out[13]:
In [14]:
sfs = fs_par.copy()
In [15]:
# test
for i in range(len(sfs)):
if not i%2:
print i, i//2
In [16]:
# modifies sfs
for i in range(len(sfs)): # from 0..36 in this case
if not sfs.mask[i]: # if count category is not masked (i. e index 1..18)
if not i%2: # if count class is a multiple of 2
#print i//2
deduct = sfs[i] * p # get the proportion p of MAF counts in this class
sfs[i] -= deduct # remove this count from the frequency class
j = i//2 # get the frequency class that corresponds to the haploid genotype
sfs[j] += deduct # add the deducted count to this frequency class
In [17]:
sfs
Out[17]:
In [20]:
sfs.data.sum()
Out[20]:
In [21]:
fs_par.data.sum()
Out[21]:
In [24]:
plt.plot(sfs, 'bo-', markersize=12, label="modified")
plt.plot(fs_par, 'g^-', markersize=12, label='raw')
plt.legend()
Out[24]:
The modification with an arbitrary p of 0.5 seems to work.
In [22]:
# define function that modifies a SFS with a given p
def modify(p, sfs):
"""
modifies SFS with given p
subtracts proportion p from even count classes and adds to corresponding
haploid count class, e. g. from 2 onto 1 or from 16 onto 8
returns modified sfs
"""
fs = sfs.copy() # it is necessary to make sure that a copy of the observed SFS is
# made here for each function evaluation
# modify observed SFS with given p:
for i in range(len(fs)):
if not fs.mask[i]:
if not i%2:
deduct = fs[i] * p
fs[i] -= deduct
j = i//2
fs[j] += deduct
return fs
In [4]:
import numpy as np
In [24]:
# define cost function used for optimisation
def f(p, snm, sfs):
"""
p: proportion of all SNP's on the X chromosome [float, 0<p<1]
snm: standard neutral model spectrum (optimally scaled)
sfs: observed SFS
"""
# modify sfs
fs = modify(p, sfs)
# return sum of squared deviations of modified SFS with snm spectrum:
return np.sum( (fs - snm)**2 ) # note, due to the mask it is necessary to use the numpy sum function
In [25]:
fs_par
Out[25]:
In [26]:
# test cost function
f(0.5, snm_par, fs_par)
Out[26]:
In [27]:
# get cost when modifiying snm spectrum with p of 0
# the cost should be 0
f(0.0, snm_par, snm_par)
Out[27]:
In [28]:
# get cost when modifying snm with p of 0.001
# the cost should be slightly greater than 0
f(0.001, snm_par, snm_par.copy())
Out[28]:
The cost function seems to be correct.
In [29]:
x = np.linspace(0, 1, 100)
y = map(lambda x: f(x, snm_par, fs_par), x)
In [30]:
plt.plot(x, y)
plt.grid()
plt.xlabel("p")
plt.ylabel("cost")
plt.title("cost function for PAR spectrum")
Out[30]:
In [31]:
x = np.linspace(0, 1, 100)
y = map(lambda x: f(x, snm_par, snm_par), x)
plt.plot(x, y)
plt.grid()
plt.xlabel("p")
plt.ylabel("cost")
plt.title("cost function for SNM spectrum")
Out[31]:
As expected, the optimal value of p when trying to modify the snm
spectrum to better fit the snm
spectrum is 0.
In [32]:
from scipy.optimize import minimize_scalar
In [38]:
minimize_scalar?
In [33]:
# test optimisation
# the function to optimise (cost function) 'f' gets the optimally scaled standard neutral SFS for PAR
# and the standard neutral SFS for PAR to modify
# I want to see an optimal value for p of zero as shown in the plot of the cost function above.
res_test = minimize_scalar(f, method = 'bounded', bounds = (-1.0, 1.0), args = (snm_par, snm_par))
In [34]:
res_test
Out[34]:
This makes sense: the optimal value is practically zero.
In [35]:
# check that original SFS has not been modified
fs_par
Out[35]:
In [36]:
# run optimisation
# the function to optimise (cost function) 'f' gets the optimally scaled standard neutral SFS for PAR
# and the observed SFS of PAR. The optimal value should be somewhat below 0.4 as shown in the
# plot of the cost function above.
res_par = minimize_scalar(f, method = 'bounded', bounds = (-1.0, 1.0), args = (snm_par, fs_par))
In [37]:
res_par
Out[37]:
This makes sense. The optimal value of $p$ is 0.36, which would mean that 36% of all SNP's lie on the X chromosome or are affected by "calling" of heterozgotes as homozygotes due to PCR drift or binomial allele sampling error.
In [38]:
# get modified SFS with optimal p
fs_par_m = modify(res_par.x, fs_par)
fs_par_m
Out[38]:
In [40]:
plt.plot(fs_par_m, 'bo-', markersize=12, label="modified")
plt.plot(fs_par, 'g^-', markersize=12, label='raw')
plt.title("PAR - comparison of optimally modified with observed SFS")
plt.legend()
Out[40]:
This modification looks good to me. It doesn't seem to change the general right-skew of the observed SFS, i. e. excess of low frequency variants and stays fairly close to the observed SFS for the frequency classes greater 2.
In [43]:
plt.plot(snm_par, 'bo-', markersize=12, label="SNM")
plt.plot(fs_par_m, 'g^-', markersize=12, label='modified')
plt.plot(fs_par, 'r>--', markersize=12, label='observed spectrum')
plt.title("PAR - comparison of observed, SNM and modified spectra")
plt.legend()
Out[43]:
The modification does not seem to introduce an obvious bias towards the standard neutral model spectrum (SNM).
In [81]:
dadi.Spectrum.tofile?
In [45]:
# run optimisation
# the function to optimise (objective function) 'f' gets the optimally scaled standard neutral SFS for ERY
# and a copy of the observed SFS of ERY
res_ery = minimize_scalar(f, method = 'bounded', bounds = (-1.0, 1.0), args = (snm_ery, fs_ery))
In [46]:
res_ery
Out[46]:
The optimal value of $p$ is close to the value estimated for PAR, which is what I would expect.
In [47]:
# get modified SFS with optimal p
fs_ery_m = modify(res_ery.x, fs_ery)
In [48]:
plt.plot(snm_ery, 'bo-', markersize=12, label="SNM")
plt.plot(fs_ery_m, 'g^-', markersize=12, label="modified")
plt.plot(fs_ery, 'r>--', markersize=12, label='raw')
plt.title("ERY - comparison of optimally modified with observed SFS")
plt.legend()
Out[48]:
The modification does not seem to introduce a bias toward an SNM spectrum.
This seems to be a reasonable modification.
In [3]:
sfs2d = dadi.Spectrum.from_file("dadiExercises/EryPar.unfolded.2dsfs.dadi_format")
In [4]:
sfs2d.pop_ids = ['ery', 'par']
In [39]:
%matplotlib inline
import pylab
import numpy as np
plt.rcParams['figure.figsize'] = [12, 10]
plt.rcParams['font.size'] = 14
In [11]:
# plot unfoled spectrum
dadi.Plotting.plot_single_2d_sfs(sfs2d, vmin=1, cmap=pylab.cm.jet)
Out[11]:
In [12]:
sfs2d = sfs2d.fold()
In [13]:
# plot folded spectrum
dadi.Plotting.plot_single_2d_sfs(sfs2d, vmin=1, cmap=pylab.cm.jet)
Out[13]:
In [14]:
sfs2d
Out[14]:
In [15]:
sfs2d.mask
Out[15]:
I now would like to apply the same modification on the 2D spectrum.
In [16]:
if not 3%2:
print 3//2
In [17]:
for i in range(len(sfs2d)):
for j in range(len(sfs2d[i])):
if not sfs2d.mask[i][j]:
if not i%2 or not j%2: # if PAR or ERY index are even
print i, j,
print "-->",
if not i%2: # if PAR index is even
k = i//2 # get half of index
else:
k = i
if not j%2: # if ERY index is even
l = j//2 # get half of index
else:
l = j
print k, l
This shows from which cell to which cell the transferral of SNP count takes place. A proportion $p$ of the SNP count in a cell would be deducted and added to the cell that corresponds to half the allele count in PAR, ERY or both. That means if the allele count (equal to index) in the source cell for one population (ERY or PAR) is an odd number, this index will not be changed to find the target cell, only the index of the other population, which must be even to do a transferral at all.
In [18]:
x = sfs2d.data.copy()
x
Out[18]:
In [19]:
x[0][0] = 1
In [20]:
x
Out[20]:
In [21]:
# check that original sfs has not changed
sfs2d.data
Out[21]:
This seems to work without Warning.
In [22]:
sfs = sfs2d.data.copy()
In [23]:
p = 0.5
In [24]:
for i in range(len(sfs)):
for j in range(len(sfs[i])):
if not sfs2d.mask[i][j]:
if not i%2 or not j%2:
deduct = sfs[i][j] * p
sfs[i][j] -= deduct
if not i%2:
k = i//2
else:
k = i
if not j%2:
l = j//2
else:
l = j
sfs[k][l] += deduct
In [25]:
# recreate dadi Spectrum object
sfs2d_m = dadi.Spectrum(data=sfs, mask=sfs2d.mask, data_folded=True, pop_ids=['ery', 'par'])
In [26]:
dadi.Plotting.plot_single_2d_sfs(sfs2d_m, vmin=1, cmap=pylab.cm.jet)
Out[26]:
This does look pretty good to me.
In [27]:
dadi.Plotting.plot_2d_comp_multinom(data=sfs2d, model=sfs2d_m, vmin=1, title=['original', 'modified'])
I think the modification does a good job.
In [28]:
def modify2D(p, sfs):
"""
modifies an sfs with given p
returns modified sfs
leaves original sfs unchanged
"""
# make copy of 2D array for modification
fs = sfs.data.copy()
for i in range(len(fs)):
for j in range(len(fs[i])):
if not sfs.mask[i][j]:
if not i%2 or not j%2:
deduct = fs[i][j] * p
fs[i][j] -= deduct
if not i%2:
k = i//2
else:
k = i
if not j%2:
l = j//2
else:
l = j
fs[k][l] += deduct
# recreate a dadi Spectrum object
fs_m = dadi.Spectrum(data=fs, mask=sfs.mask, data_folded=True, pop_ids=['ery', 'par'])
return fs_m
In [29]:
# define cost function used for optimisation
def f_2D(p, snm, sfs):
"""
p: proportion of all SNP's on the X chromosome [float, 0<p<1]
snm: standard neutral model spectrum (optimally scaled)
sfs: observed SFS
"""
# modify sfs
fs = modify2D(p, sfs)
# return sum of squared deviations of modified SFS with snm spectrum:
return np.sum( (fs - snm)**2 ) # note, due to the mask it is necessary to use the numpy sum function
In [30]:
dadi.Demographics2D.snm?
In [31]:
# make the extrapolating version of the demographic model function
func_ex = dadi.Numerics.make_extrap_log_func(dadi.Demographics2D.snm)
# setting the smallest grid size slightly larger than the largest population sample size
pts_l = [40, 50, 60]
ns = sfs2d.sample_sizes
# calculate unfolded AFS under standard neutral model (up to a scaling factor theta)
snm = func_ex(0, ns, pts_l)
snm = snm.fold()
# scaled snm spectrum
snm = dadi.Inference.optimally_scaled_sfs(snm, sfs2d)
In [32]:
dadi.Plotting.plot_single_2d_sfs(snm, vmin=1, cmap=pylab.cm.jet)
Out[32]:
I don't think that this would be a good model spectrum for fitting. We know that ERY and PAR are highly diverged and certainly not a single panmictic population.
I think the most parsimonious model would be a simple divergence in isolation model.
In [33]:
def split_nomig(params, ns, pts):
"""
params = (nu1,nu2,T)
ns = (n1,n2)
Split into two populations of specifed size, no migration.
nu1: Size of population 1 after split.
nu2: Size of population 2 after split.
T: Time in the past of split (in units of 2*Na generations)
n1,n2: Sample sizes of resulting Spectrum
pts: Number of grid points to use in integration.
"""
nu1,nu2,T = params
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
phi = dadi.Integration.two_pops(phi, xx, T, nu1, nu2, m12=0, m21=0)
fs = dadi.Spectrum.from_phi(phi, ns, (xx,xx))
return fs
This model takes three parameters. I am going to assume that $\nu_1=\nu_2=1$ and that $T=1$.
In [34]:
# make the extrapolating version of the demographic model function
func_ex = dadi.Numerics.make_extrap_log_func(split_nomig)
In [35]:
# calculate unfolded AFS under divergence in isolation model (dim) (up to a scaling factor theta)
params = (1, 1, 1)
ns = sfs2d.sample_sizes
pts_l = [40, 50, 60]
dim = func_ex(params, ns, pts_l)
dim = dim.fold()
# optimally scale divergence in isolation model spectrum
dim = dadi.Inference.optimally_scaled_sfs(dim, sfs2d)
In [36]:
dadi.Plotting.plot_single_2d_sfs(dim, vmin=1, cmap=pylab.cm.jet)
Out[36]:
This shall be the model to optimise the fit to.
In [37]:
from scipy.optimize import minimize_scalar
In [40]:
# test optimisation
# the function to optimise (cost function) 'f_2D' gets the optimally scaled divergence in isolation model
# to fit to itself.
# I want to see an optimal value for p of zero.
res_test = minimize_scalar(f_2D, method = 'bounded', bounds = (-1.0, 1.0), args = (dim, dim))
In [41]:
res_test
Out[41]:
That is as expected.
In [42]:
res = minimize_scalar(f_2D, method='bounded', bounds=(-1.0, 1.0), args=(dim, sfs2d))
In [43]:
res
Out[43]:
This is remarkably close to the value of p inferred for the 1D spectra and therefore a good sign.
In [44]:
sfs2d_m = modify2D(res.x, sfs2d)
In [45]:
dadi.Plotting.plot_single_2d_sfs(sfs2d_m, vmin=1, cmap=pylab.cm.jet)
Out[45]:
This looks reasonable.
In [46]:
dadi.Plotting.plot_2d_comp_multinom(data=sfs2d, model=sfs2d_m, vmin=1, title=['observed', 'modified'])
This looks good to me.
I need to know whether I introduce a bias by choosing a certain null model for fitting. I therefore want to do two more fittings. One using a null model with short and one with long divergence time.
In [47]:
# calculate unfolded AFS under divergence in isolation model (dim) (up to a scaling factor theta)
params = (1, 1, 2) # note the doubling of divergence time
ns = sfs2d.sample_sizes
pts_l = [40, 50, 60]
dim = func_ex(params, ns, pts_l)
dim = dim.fold()
# optimally scale divergence in isolation model spectrum
dim = dadi.Inference.optimally_scaled_sfs(dim, sfs2d)
In [48]:
# plot null model
dadi.Plotting.plot_single_2d_sfs(dim, vmin=1, cmap=pylab.cm.jet)
Out[48]:
In [49]:
res_a = minimize_scalar(f_2D, method='bounded', bounds=(-1.0, 1.0), args=(dim, sfs2d))
In [50]:
res_a
Out[50]:
p has decreased by 5 percentage points.
In [51]:
# calculate unfolded AFS under divergence in isolation model (dim) (up to a scaling factor theta)
params = (1, 1, 0.5) # note the halfing of divergence time
ns = sfs2d.sample_sizes
pts_l = [40, 50, 60]
dim = func_ex(params, ns, pts_l)
dim = dim.fold()
# optimally scale divergence in isolation model spectrum
dim = dadi.Inference.optimally_scaled_sfs(dim, sfs2d)
In [52]:
# plot null model
dadi.Plotting.plot_single_2d_sfs(dim, vmin=1, cmap=pylab.cm.jet)
Out[52]:
In [53]:
res_b = minimize_scalar(f_2D, method='bounded', bounds=(-1.0, 1.0), args=(dim, sfs2d))
In [54]:
res_b
Out[54]:
p has increased by 4 percent.
How different are the corresponding corrected spectra?
In [55]:
# get corrected spectra
sfs2d_m_a = modify2D(res_a.x, sfs2d) # p=0.30
sfs2d_m_b = modify2D(res_b.x, sfs2d) # p=0.39
In [56]:
# plot corrected spectra and residuals
dadi.Plotting.plot_2d_comp_multinom(data=sfs2d_m_a, model=sfs2d_m_b, vmin=1, title=['T=2', 'T=0.5'])
The difference looks small, but I need to use the differently corrected spectra for model fitting to see whether the degree of correction has a strong influence on parameter inference.
In [57]:
ery_marg = sfs2d.marginalize([1])
par_marg = sfs2d.marginalize([0])
ery_p030_marg = sfs2d_m_a.marginalize([1])
par_p030_marg = sfs2d_m_a.marginalize([0])
ery_p039_marg = sfs2d_m_b.marginalize([1])
par_p039_marg = sfs2d_m_b.marginalize([0])
In [64]:
pylab.rcParams['figure.figsize'] = [18, 10]
In [67]:
pylab.subplot(121)
pylab.grid()
pylab.plot(ery_marg, 'ro--', label='ery uncorrected', linewidth=1.0, markersize=4)
pylab.plot(par_marg, 'g^--', label='par uncorrected', linewidth=1.0, markersize=4)
pylab.plot(ery_p030_marg, 'ro-', label='ery, p=0.30', linewidth=2.5, markersize=12)
pylab.plot(par_p030_marg, 'g^-', label='par, p=0.30', linewidth=2.5, markersize=12)
pylab.xlabel("minor allele frequency")
pylab.ylabel("SNP count")
pylab.ylim(0, 14000)
pylab.legend()
pylab.subplot(122)
pylab.grid()
pylab.plot(ery_marg, 'ro--', label='ery uncorrected', linewidth=1.0, markersize=4)
pylab.plot(par_marg, 'g^--', label='par uncorrected', linewidth=1.0, markersize=4)
pylab.plot(ery_p039_marg, 'ro-', label='ery, p=0.39', linewidth=2.5, markersize=12)
pylab.plot(par_p039_marg, 'g^-', label='par, p=0.39', linewidth=2.5, markersize=12)
pylab.xlabel("minor allele frequency")
pylab.ylabel("SNP count")
pylab.ylim(0, 14000)
pylab.legend()
Out[67]:
In [ ]:
In [84]:
# save PAR modified spectrum to file
dadi.Spectrum.tofile(fs_par_m, fid="PAR_modified.sfs", comment_lines=['Ludovics correction applied'])
In [25]:
# load saved spectrum from file
fs = dadi.Spectrum.from_file("PAR_modified.sfs")
In [26]:
fs
Out[26]:
In [27]:
plt.plot(fs, 'g^-', label='par', linewidth=2.5, markersize=12)
Out[27]:
In [28]:
fs.data.sum()
Out[28]:
In [93]:
# save modified ERY spectrum to file
dadi.Spectrum.tofile(fs_ery_m, fid="ERY_modified.sfs", comment_lines=['Ludovics correction applied'])
In [29]:
# load saved spectrum from file
fs = dadi.Spectrum.from_file("ERY_modified.sfs")
fs.data.sum()
Out[29]:
In [94]:
# save modified 2D spectrum to file
dadi.Spectrum.tofile(sfs2d_m, fid="EryPar_modified.2dsfs", comment_lines=['Ludovics correction applied'])
In [98]:
# load saved 2D spectrum from file
fs2d = dadi.Spectrum.from_file("EryPar_modified.2dsfs")
In [99]:
dadi.Plotting.plot_single_2d_sfs(fs2d, vmin=1, cmap=pylab.cm.jet)
Out[99]:
In [49]:
# save modified 2D spectrum to file
dadi.Spectrum.tofile(sfs2d_m, fid="EryPar_modified.2dsfs", comment_lines=['Ludovics correction applied, p=0.35'])
In [50]:
# save modified 2D spectrum to file
dadi.Spectrum.tofile(sfs2d_m_a, fid="EryPar_modified_a.2dsfs", comment_lines=['Ludovics correction applied, p=0.30'])
In [51]:
# save modified 2D spectrum to file
dadi.Spectrum.tofile(sfs2d_m_b, fid="EryPar_modified_b.2dsfs", comment_lines=['Ludovics correction applied, p=0.39'])
In [ ]: