`anhima.af` - Allele frequencies



In [1]:

    
from __future__ import division, print_function
import numpy as np
np.random.seed(42)
import scipy.stats
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# # dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt
%aimport anhima.af



In [2]:

    
# simulate genotypes 
n_variants = 1000
n_samples = 1000
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = .1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)



In [3]:

    
anhima.af.count_variant(genotypes)









    Out[3]:





968



In [4]:

    
anhima.af.count_non_variant(genotypes)









    Out[4]:





32



In [5]:

    
n_seg = anhima.af.count_segregating(genotypes)
n_seg









    Out[5]:





964



In [6]:

    
n_non_seg = anhima.af.count_non_segregating(genotypes)
n_non_seg









    Out[6]:





36



In [7]:

    
assert n_seg + n_non_seg == n_variants



In [8]:

    
n_non_seg_ref = anhima.af.count_non_segregating(genotypes, allele=0)
n_non_seg_ref









    Out[8]:





32



In [9]:

    
n_non_seg_alt = anhima.af.count_non_segregating(genotypes, allele=1)
n_non_seg_alt









    Out[9]:





4



In [10]:

    
assert n_non_seg == n_non_seg_ref + n_non_seg_alt



In [11]:

    
anhima.af.count_singletons(genotypes)









    Out[11]:





16



In [12]:

    
anhima.af.count_doubletons(genotypes)









    Out[12]:





21



In [13]:

    
an, ac, af = anhima.af.allele_frequencies(genotypes)



In [14]:

    
# plot actual distribution of alt allele frequencies
plt.hist(af[:, 1], bins=20);

Maximum likelihood ancestry



In [15]:

    
# simulate genotypes 
n_variants = 2000
n_samples = 200
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = 0.1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
# simulate two sub-populations with relatedness
genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=7000, copy=False)
genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=7000, copy=False)

# check relatedness
gn = anhima.gt.as_n_alt(genotypes)
d, ds = anhima.dist.pairwise_distance(gn)
anhima.dist.plot_pairwise_distance(ds);



In [16]:

    
# calculate allele frequencies, leaving out 5 samples from each subpopulation
_, _, af_subpop1 = anhima.af.allele_frequency(genotypes_subpop1[:, :(n_samples//2)-5])
_, _, af_subpop2 = anhima.af.allele_frequency(genotypes_subpop2[:, :(n_samples//2)-5])

# filter variants to use only those with power to differentiate between populations
flt = np.abs(af_subpop1 - af_subpop2) > .5
genotypes_subpop1_flt = genotypes_subpop1[flt]
genotypes_subpop2_flt = genotypes_subpop2[flt]

# plot allele frequencies to visualise choice
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(af_subpop1, af_subpop2, color='#cccccc', marker='o', linestyle=' ', mew=1, mfc='none');
af_subpop1_flt = af_subpop1[flt]
af_subpop2_flt = af_subpop2[flt]
ax.plot(af_subpop1_flt, af_subpop2_flt, 'bo', mew=0)
ax.set_xlabel('allele frequency (subpop1)')
ax.set_ylabel('allele frequency (subpop2)');

# plot allele frequencies in each subpopulation
fig, ax = plt.subplots(figsize=(14, 3))
x = np.arange(af_subpop1_flt.size) + .5
ax.plot(x, af_subpop1_flt, 'bo-', label='subpop1')
ax.plot(x, af_subpop2_flt, 'ro-', label='subpop2')
ax.set_yticks([0, 1])
ax.set_xticks([])
ax.set_xlim(0, af_subpop1_flt.size)
ax.set_ylabel('allele frequency')
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1));



In [17]:

    
def investigate_ancestry(genotypes, af_subpop1, af_subpop2, filter_size=0, min_confidence=1):
    
    # predict ancestry
    ancestry, confidence = anhima.af.maximum_likelihood_ancestry(genotypes, 
                                                                 af_subpop1, 
                                                                 af_subpop2, 
                                                                 filter_size=filter_size)
    
    # plot the ancestry predictions
    fig, ax = plt.subplots(figsize=(14, 2))
    anhima.gt.plot_discrete_calldata(ancestry, colors='wbyr', states=(-1, 0, 1, 2), ax=ax)
    ax.set_title('ancestry')
    
    # plot the confidence
    fig, ax = plt.subplots(figsize=(14, 2))
    anhima.gt.plot_continuous_calldata(confidence, ax=ax, pcolormesh_kwargs=dict(cmap='Greys', vmin=0, vmax=20))
    ax.set_title('ancestry confidence')

    # plot distribution of confidence scores
    fig, ax = plt.subplots()
    ax.hist(confidence.flatten(), bins=20)
    ax.set_title('confidence')

    # plot ancestry filtered by confidence
    fig, ax = plt.subplots(figsize=(14, 2))
    x = ancestry.copy()
    x[confidence < min_confidence] = -1
    anhima.gt.plot_discrete_calldata(x, colors='wbyr', states=(-1, 0, 1, 2), ax=ax)
    ax.set_title('ancestry, filtered by confidence')



In [18]:

    
# investigate ancestry in 5 samples from subpop1 we set aside earlier
investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1)



In [19]:

    
# combine information from neighbouring variants
investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)



In [20]:

    
# investigate ancestry in 5 samples from subpop2 we set aside earlier
investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1)



In [21]:

    
# combine information from neighbouring variants
investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)



In [22]:

anhima.af - Allele frequencies

Maximum likelihood ancestry

`anhima.af` - Allele frequencies