anhima.af - Allele frequencies


In [1]:
from __future__ import division, print_function
import numpy as np
np.random.seed(42)
import scipy.stats
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# # dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt
%aimport anhima.af

In [2]:
# simulate genotypes 
n_variants = 1000
n_samples = 1000
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = .1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)

In [3]:
anhima.af.count_variant(genotypes)


Out[3]:
968

In [4]:
anhima.af.count_non_variant(genotypes)


Out[4]:
32

In [5]:
n_seg = anhima.af.count_segregating(genotypes)
n_seg


Out[5]:
964

In [6]:
n_non_seg = anhima.af.count_non_segregating(genotypes)
n_non_seg


Out[6]:
36

In [7]:
assert n_seg + n_non_seg == n_variants

In [8]:
n_non_seg_ref = anhima.af.count_non_segregating(genotypes, allele=0)
n_non_seg_ref


Out[8]:
32

In [9]:
n_non_seg_alt = anhima.af.count_non_segregating(genotypes, allele=1)
n_non_seg_alt


Out[9]:
4

In [10]:
assert n_non_seg == n_non_seg_ref + n_non_seg_alt

In [11]:
anhima.af.count_singletons(genotypes)


Out[11]:
16

In [12]:
anhima.af.count_doubletons(genotypes)


Out[12]:
21

In [13]:
an, ac, af = anhima.af.allele_frequencies(genotypes)

In [14]:
# plot actual distribution of alt allele frequencies
plt.hist(af[:, 1], bins=20);


Maximum likelihood ancestry


In [15]:
# simulate genotypes 
n_variants = 2000
n_samples = 200
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = 0.1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
# simulate two sub-populations with relatedness
genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=7000, copy=False)
genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=7000, copy=False)

# check relatedness
gn = anhima.gt.as_n_alt(genotypes)
d, ds = anhima.dist.pairwise_distance(gn)
anhima.dist.plot_pairwise_distance(ds);



In [16]:
# calculate allele frequencies, leaving out 5 samples from each subpopulation
_, _, af_subpop1 = anhima.af.allele_frequency(genotypes_subpop1[:, :(n_samples//2)-5])
_, _, af_subpop2 = anhima.af.allele_frequency(genotypes_subpop2[:, :(n_samples//2)-5])

# filter variants to use only those with power to differentiate between populations
flt = np.abs(af_subpop1 - af_subpop2) > .5
genotypes_subpop1_flt = genotypes_subpop1[flt]
genotypes_subpop2_flt = genotypes_subpop2[flt]

# plot allele frequencies to visualise choice
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(af_subpop1, af_subpop2, color='#cccccc', marker='o', linestyle=' ', mew=1, mfc='none');
af_subpop1_flt = af_subpop1[flt]
af_subpop2_flt = af_subpop2[flt]
ax.plot(af_subpop1_flt, af_subpop2_flt, 'bo', mew=0)
ax.set_xlabel('allele frequency (subpop1)')
ax.set_ylabel('allele frequency (subpop2)');

# plot allele frequencies in each subpopulation
fig, ax = plt.subplots(figsize=(14, 3))
x = np.arange(af_subpop1_flt.size) + .5
ax.plot(x, af_subpop1_flt, 'bo-', label='subpop1')
ax.plot(x, af_subpop2_flt, 'ro-', label='subpop2')
ax.set_yticks([0, 1])
ax.set_xticks([])
ax.set_xlim(0, af_subpop1_flt.size)
ax.set_ylabel('allele frequency')
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1));



In [17]:
def investigate_ancestry(genotypes, af_subpop1, af_subpop2, filter_size=0, min_confidence=1):
    
    # predict ancestry
    ancestry, confidence = anhima.af.maximum_likelihood_ancestry(genotypes, 
                                                                 af_subpop1, 
                                                                 af_subpop2, 
                                                                 filter_size=filter_size)
    
    # plot the ancestry predictions
    fig, ax = plt.subplots(figsize=(14, 2))
    anhima.gt.plot_discrete_calldata(ancestry, colors='wbyr', states=(-1, 0, 1, 2), ax=ax)
    ax.set_title('ancestry')
    
    # plot the confidence
    fig, ax = plt.subplots(figsize=(14, 2))
    anhima.gt.plot_continuous_calldata(confidence, ax=ax, pcolormesh_kwargs=dict(cmap='Greys', vmin=0, vmax=20))
    ax.set_title('ancestry confidence')

    # plot distribution of confidence scores
    fig, ax = plt.subplots()
    ax.hist(confidence.flatten(), bins=20)
    ax.set_title('confidence')

    # plot ancestry filtered by confidence
    fig, ax = plt.subplots(figsize=(14, 2))
    x = ancestry.copy()
    x[confidence < min_confidence] = -1
    anhima.gt.plot_discrete_calldata(x, colors='wbyr', states=(-1, 0, 1, 2), ax=ax)
    ax.set_title('ancestry, filtered by confidence')

In [18]:
# investigate ancestry in 5 samples from subpop1 we set aside earlier
investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1)



In [19]:
# combine information from neighbouring variants
investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)



In [20]:
# investigate ancestry in 5 samples from subpop2 we set aside earlier
investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1)



In [21]:
# combine information from neighbouring variants
investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)



In [22]: