anhima.ld
- Linkage disequilibrium
In [1]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%load_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.ld
%aimport anhima.loc
Simulate genotypes with some degree of linkage disequilibrium.
In [2]:
n_variants = 1000
n_samples = 100
correlation = .97
g = anhima.sim.simulate_genotypes_with_ld(n_variants, n_samples, correlation)
g
Out[2]:
Simulate the genomic physical positions of the variants.
In [3]:
pos = np.random.randint(low=0, high=n_variants*100, size=n_variants)
pos.sort()
Calculate an approximate value of LD between all pairs of variants.
In [4]:
r_squared = anhima.ld.pairwise_genotype_ld(g)
r_squared
Out[4]:
In [5]:
anhima.ld.plot_pairwise_ld(r_squared);
In [6]:
# zoom in to first 100 variants
anhima.ld.plot_pairwise_ld(r_squared[:100, :100]);
In [7]:
cor, sep, dist = anhima.ld.pairwise_ld_decay(r_squared, pos)
cor, sep, dist
Out[7]:
In [8]:
plt.hist(cor[sep == 1]);
In [9]:
plt.hist(cor[(dist > 0) & (dist < 200)]);
In [10]:
np.sqrt(np.mean(cor[sep == 1]))
Out[10]:
In [11]:
anhima.ld.plot_ld_decay_by_separation(cor, sep);
In [12]:
anhima.ld.plot_ld_decay_by_distance(cor, dist, bins=np.arange(0, 10000, 200));
In [13]:
anhima.ld.plot_windowed_ld(g, pos, window_size=10000);
In [14]:
anhima.ld.plot_windowed_ld(g, pos, window_size=1000);
In [15]:
included = anhima.ld.ld_prune_pairwise(g)
Indices of the variants that have been retained.
In [16]:
np.nonzero(included)[0]
Out[16]:
In [17]:
g_pruned = np.compress(included, g, axis=0)
g_pruned
Out[17]:
In [18]:
r_squared_pruned = anhima.ld.pairwise_genotype_ld(g_pruned)
anhima.ld.plot_pairwise_ld(r_squared_pruned);
In [19]:
pos_pruned = pos[included]
cor_pruned, sep_pruned, dist_pruned = anhima.ld.pairwise_ld_decay(r_squared_pruned, pos_pruned)
In [20]:
anhima.ld.plot_ld_decay_by_separation(cor_pruned, sep_pruned);
In [21]:
anhima.ld.plot_ld_decay_by_distance(cor_pruned, dist_pruned, bins=np.arange(0, 10000, 400));
In [22]:
n_variants_big = 100000
g_big = anhima.sim.simulate_genotypes_with_ld(n_variants_big, n_samples, correlation=.97)
In [23]:
pos_big = np.random.randint(low=0, high=n_variants_big*100, size=n_variants_big)
pos_big.sort()
In [24]:
included_big = anhima.ld.ld_prune_pairwise(g_big)
In [25]:
np.count_nonzero(included_big)
Out[25]:
In [26]:
g_big_pruned = np.compress(included_big, g_big, axis=0)
In [27]:
r_squared_big_pruned = anhima.ld.pairwise_genotype_ld(g_big_pruned)
In [28]:
anhima.ld.plot_pairwise_ld(r_squared_big_pruned[:100, :100]);
In [29]:
cor_big, sep_big, dist_big = anhima.ld.windowed_ld_decay(g_big, pos_big, window_size=1000, step=10)
In [30]:
anhima.ld.plot_ld_decay_by_separation(cor_big, sep_big, max_separation=100);
In [31]:
anhima.ld.plot_ld_decay_by_distance(cor_big, dist_big, bins=np.arange(0, 10000, 200));
In [32]:
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=100000);
In [33]:
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=10000);
In [34]: