anhima.af
- Allele frequencies
In [1]:
from __future__ import division, print_function
import numpy as np
np.random.seed(42)
import scipy.stats
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# # dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt
%aimport anhima.af
In [2]:
# simulate genotypes
n_variants = 1000
n_samples = 1000
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = .1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
In [3]:
anhima.af.count_variant(genotypes)
Out[3]:
In [4]:
anhima.af.count_non_variant(genotypes)
Out[4]:
In [5]:
n_seg = anhima.af.count_segregating(genotypes)
n_seg
Out[5]:
In [6]:
n_non_seg = anhima.af.count_non_segregating(genotypes)
n_non_seg
Out[6]:
In [7]:
assert n_seg + n_non_seg == n_variants
In [8]:
n_non_seg_ref = anhima.af.count_non_segregating(genotypes, allele=0)
n_non_seg_ref
Out[8]:
In [9]:
n_non_seg_alt = anhima.af.count_non_segregating(genotypes, allele=1)
n_non_seg_alt
Out[9]:
In [10]:
assert n_non_seg == n_non_seg_ref + n_non_seg_alt
In [11]:
anhima.af.count_singletons(genotypes)
Out[11]:
In [12]:
anhima.af.count_doubletons(genotypes)
Out[12]:
In [13]:
an, ac, af = anhima.af.allele_frequencies(genotypes)
In [14]:
# plot actual distribution of alt allele frequencies
plt.hist(af[:, 1], bins=20);
In [15]:
# simulate genotypes
n_variants = 2000
n_samples = 200
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = 0.1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
# simulate two sub-populations with relatedness
genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=7000, copy=False)
genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=7000, copy=False)
# check relatedness
gn = anhima.gt.as_n_alt(genotypes)
d, ds = anhima.dist.pairwise_distance(gn)
anhima.dist.plot_pairwise_distance(ds);
In [16]:
# calculate allele frequencies, leaving out 5 samples from each subpopulation
_, _, af_subpop1 = anhima.af.allele_frequency(genotypes_subpop1[:, :(n_samples//2)-5])
_, _, af_subpop2 = anhima.af.allele_frequency(genotypes_subpop2[:, :(n_samples//2)-5])
# filter variants to use only those with power to differentiate between populations
flt = np.abs(af_subpop1 - af_subpop2) > .5
genotypes_subpop1_flt = genotypes_subpop1[flt]
genotypes_subpop2_flt = genotypes_subpop2[flt]
# plot allele frequencies to visualise choice
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(af_subpop1, af_subpop2, color='#cccccc', marker='o', linestyle=' ', mew=1, mfc='none');
af_subpop1_flt = af_subpop1[flt]
af_subpop2_flt = af_subpop2[flt]
ax.plot(af_subpop1_flt, af_subpop2_flt, 'bo', mew=0)
ax.set_xlabel('allele frequency (subpop1)')
ax.set_ylabel('allele frequency (subpop2)');
# plot allele frequencies in each subpopulation
fig, ax = plt.subplots(figsize=(14, 3))
x = np.arange(af_subpop1_flt.size) + .5
ax.plot(x, af_subpop1_flt, 'bo-', label='subpop1')
ax.plot(x, af_subpop2_flt, 'ro-', label='subpop2')
ax.set_yticks([0, 1])
ax.set_xticks([])
ax.set_xlim(0, af_subpop1_flt.size)
ax.set_ylabel('allele frequency')
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1));
In [17]:
def investigate_ancestry(genotypes, af_subpop1, af_subpop2, filter_size=0, min_confidence=1):
# predict ancestry
ancestry, confidence = anhima.af.maximum_likelihood_ancestry(genotypes,
af_subpop1,
af_subpop2,
filter_size=filter_size)
# plot the ancestry predictions
fig, ax = plt.subplots(figsize=(14, 2))
anhima.gt.plot_discrete_calldata(ancestry, colors='wbyr', states=(-1, 0, 1, 2), ax=ax)
ax.set_title('ancestry')
# plot the confidence
fig, ax = plt.subplots(figsize=(14, 2))
anhima.gt.plot_continuous_calldata(confidence, ax=ax, pcolormesh_kwargs=dict(cmap='Greys', vmin=0, vmax=20))
ax.set_title('ancestry confidence')
# plot distribution of confidence scores
fig, ax = plt.subplots()
ax.hist(confidence.flatten(), bins=20)
ax.set_title('confidence')
# plot ancestry filtered by confidence
fig, ax = plt.subplots(figsize=(14, 2))
x = ancestry.copy()
x[confidence < min_confidence] = -1
anhima.gt.plot_discrete_calldata(x, colors='wbyr', states=(-1, 0, 1, 2), ax=ax)
ax.set_title('ancestry, filtered by confidence')
In [18]:
# investigate ancestry in 5 samples from subpop1 we set aside earlier
investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1)
In [19]:
# combine information from neighbouring variants
investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)
In [20]:
# investigate ancestry in 5 samples from subpop2 we set aside earlier
investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1)
In [21]:
# combine information from neighbouring variants
investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)
In [22]: