anhima.ld - Linkage disequilibrium


In [1]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%load_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.ld
%aimport anhima.loc

Simulate genotype data

Simulate genotypes with some degree of linkage disequilibrium.


In [2]:
n_variants = 1000
n_samples = 100
correlation = .97
g = anhima.sim.simulate_genotypes_with_ld(n_variants, n_samples, correlation)
g


Out[2]:
array([[2, 0, 2, ..., 2, 0, 0],
       [0, 2, 0, ..., 0, 2, 2],
       [0, 2, 0, ..., 0, 2, 2],
       ..., 
       [0, 2, 1, ..., 0, 1, 2],
       [0, 2, 1, ..., 0, 1, 2],
       [0, 2, 1, ..., 0, 1, 2]], dtype=int8)

Simulate the genomic physical positions of the variants.


In [3]:
pos = np.random.randint(low=0, high=n_variants*100, size=n_variants)
pos.sort()

Pairwise LD

Calculate an approximate value of LD between all pairs of variants.


In [4]:
r_squared = anhima.ld.pairwise_genotype_ld(g)
r_squared


Out[4]:
array([[  1.00000000e+00,   9.07748779e-01,   8.18691153e-01, ...,
          1.49118959e-04,   3.89208802e-04,   3.57366211e-04],
       [  9.07748779e-01,   1.00000000e+00,   9.06034190e-01, ...,
          1.64228306e-03,   5.33172702e-03,   4.96930122e-03],
       [  8.18691153e-01,   9.06034190e-01,   1.00000000e+00, ...,
          4.79109120e-03,   1.04433734e-02,   6.92715218e-03],
       ..., 
       [  1.49118959e-04,   1.64228306e-03,   4.79109120e-03, ...,
          1.00000000e+00,   9.18831940e-01,   8.75647397e-01],
       [  3.89208802e-04,   5.33172702e-03,   1.04433734e-02, ...,
          9.18831940e-01,   1.00000000e+00,   9.52542895e-01],
       [  3.57366211e-04,   4.96930122e-03,   6.92715218e-03, ...,
          8.75647397e-01,   9.52542895e-01,   1.00000000e+00]])

In [5]:
anhima.ld.plot_pairwise_ld(r_squared);



In [6]:
# zoom in to first 100 variants
anhima.ld.plot_pairwise_ld(r_squared[:100, :100]);


LD decay


In [7]:
cor, sep, dist = anhima.ld.pairwise_ld_decay(r_squared, pos)
cor, sep, dist


Out[7]:
(array([ 0.90774878,  0.81869115,  0.79309105, ...,  0.91883194,
         0.8756474 ,  0.95254289]),
 array([1, 2, 3, ..., 1, 2, 1]),
 array([ 59, 245, 383, ..., 295, 437, 142]))

In [8]:
plt.hist(cor[sep == 1]);



In [9]:
plt.hist(cor[(dist > 0) & (dist < 200)]);



In [10]:
np.sqrt(np.mean(cor[sep == 1]))


Out[10]:
0.97025821817633195

In [11]:
anhima.ld.plot_ld_decay_by_separation(cor, sep);



In [12]:
anhima.ld.plot_ld_decay_by_distance(cor, dist, bins=np.arange(0, 10000, 200));


LD over the genome


In [13]:
anhima.ld.plot_windowed_ld(g, pos, window_size=10000);



In [14]:
anhima.ld.plot_windowed_ld(g, pos, window_size=1000);


LD-based variant pruning


In [15]:
included = anhima.ld.ld_prune_pairwise(g)

Indices of the variants that have been retained.


In [16]:
np.nonzero(included)[0]


Out[16]:
array([  0,  37,  80, 102, 125, 146, 172, 194, 219, 249, 283, 306, 333,
       353, 372, 397, 426, 446, 475, 500, 525, 555, 580, 609, 634, 661,
       697, 723, 754, 796, 815, 847, 872, 896, 913, 943, 987])

In [17]:
g_pruned = np.compress(included, g, axis=0)
g_pruned


Out[17]:
array([[2, 0, 2, ..., 2, 0, 0],
       [0, 0, 0, ..., 1, 2, 0],
       [0, 1, 2, ..., 1, 1, 1],
       ..., 
       [2, 1, 0, ..., 0, 2, 2],
       [2, 0, 1, ..., 1, 0, 1],
       [0, 0, 0, ..., 2, 1, 0]], dtype=int8)

In [18]:
r_squared_pruned = anhima.ld.pairwise_genotype_ld(g_pruned)
anhima.ld.plot_pairwise_ld(r_squared_pruned);



In [19]:
pos_pruned = pos[included]
cor_pruned, sep_pruned, dist_pruned = anhima.ld.pairwise_ld_decay(r_squared_pruned, pos_pruned)

In [20]:
anhima.ld.plot_ld_decay_by_separation(cor_pruned, sep_pruned);



In [21]:
anhima.ld.plot_ld_decay_by_distance(cor_pruned, dist_pruned, bins=np.arange(0, 10000, 400));


Bigger datasets


In [22]:
n_variants_big = 100000
g_big = anhima.sim.simulate_genotypes_with_ld(n_variants_big, n_samples, correlation=.97)

In [23]:
pos_big = np.random.randint(low=0, high=n_variants_big*100, size=n_variants_big)
pos_big.sort()

In [24]:
included_big = anhima.ld.ld_prune_pairwise(g_big)

In [25]:
np.count_nonzero(included_big)


Out[25]:
3706

In [26]:
g_big_pruned = np.compress(included_big, g_big, axis=0)

In [27]:
r_squared_big_pruned = anhima.ld.pairwise_genotype_ld(g_big_pruned)

In [28]:
anhima.ld.plot_pairwise_ld(r_squared_big_pruned[:100, :100]);



In [29]:
cor_big, sep_big, dist_big = anhima.ld.windowed_ld_decay(g_big, pos_big, window_size=1000, step=10)

In [30]:
anhima.ld.plot_ld_decay_by_separation(cor_big, sep_big, max_separation=100);



In [31]:
anhima.ld.plot_ld_decay_by_distance(cor_big, dist_big, bins=np.arange(0, 10000, 200));



In [32]:
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=100000);



In [33]:
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=10000);



In [34]: