anhima.dist - Genetic distance calculations


In [1]:
import numpy as np
np.random.seed(1)
import scipy.stats
import random
random.seed(1)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt
%aimport anhima.dist

In [2]:
# simulate genotypes with no relatedness
n_variants = 1000
n_samples = 100
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = 0
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
gn = anhima.gt.as_n_alt(genotypes)

In [3]:
# simulate three sub-populations with relatedness
genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=1000)
genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=1000)
genotypes_subpop2a = anhima.sim.simulate_relatedness(genotypes_subpop2[:, n_samples//4:], relatedness=.5, n_iter=500)
genotypes_subpop2b = anhima.sim.simulate_relatedness(genotypes_subpop2[:, :n_samples//4], relatedness=.5, n_iter=500)
genotypes_related = np.concatenate([genotypes_subpop1, genotypes_subpop2a, genotypes_subpop2b], axis=1)
gnr = anhima.gt.as_n_alt(genotypes_related)

Euclidean distance


In [4]:
# calculate pairwise distance (Euclidean is default metric)
d, ds = anhima.dist.pairwise_distance(gn)
dr, dsr = anhima.dist.pairwise_distance(gnr)

In [5]:
plt.hist(d, bins=30);



In [6]:
plt.hist(dr, bins=30);



In [7]:
anhima.dist.plot_pairwise_distance(ds);



In [8]:
anhima.dist.plot_pairwise_distance(dsr);


Other distance metrics


In [9]:
d, s = anhima.dist.pairwise_distance(gnr, metric='euclidean')
plt.hist(d, bins=30)
plt.xlim(min(d), max(d));



In [10]:
d, s = anhima.dist.pairwise_distance(gnr, metric='sqeuclidean')
plt.hist(d, bins=30)
plt.xlim(min(d), max(d));



In [11]:
d, s = anhima.dist.pairwise_distance(gnr, metric='cityblock')
plt.hist(d, bins=30)
plt.xlim(min(d), max(d));