anhima.ld - Linkage disequilibrium
In [1]:
    
import numpy as np
np.random.seed(42)
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%load_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.ld
%aimport anhima.loc
    
Simulate genotypes with some degree of linkage disequilibrium.
In [2]:
    
n_variants = 1000
n_samples = 100
correlation = .97
g = anhima.sim.simulate_genotypes_with_ld(n_variants, n_samples, correlation)
g
    
    Out[2]:
Simulate the genomic physical positions of the variants.
In [3]:
    
pos = np.random.randint(low=0, high=n_variants*100, size=n_variants)
pos.sort()
    
Calculate an approximate value of LD between all pairs of variants.
In [4]:
    
r_squared = anhima.ld.pairwise_genotype_ld(g)
r_squared
    
    Out[4]:
In [5]:
    
anhima.ld.plot_pairwise_ld(r_squared);
    
    
In [6]:
    
# zoom in to first 100 variants
anhima.ld.plot_pairwise_ld(r_squared[:100, :100]);
    
    
In [7]:
    
cor, sep, dist = anhima.ld.pairwise_ld_decay(r_squared, pos)
cor, sep, dist
    
    Out[7]:
In [8]:
    
plt.hist(cor[sep == 1]);
    
    
In [9]:
    
plt.hist(cor[(dist > 0) & (dist < 200)]);
    
    
In [10]:
    
np.sqrt(np.mean(cor[sep == 1]))
    
    Out[10]:
In [11]:
    
anhima.ld.plot_ld_decay_by_separation(cor, sep);
    
    
In [12]:
    
anhima.ld.plot_ld_decay_by_distance(cor, dist, bins=np.arange(0, 10000, 200));
    
    
In [13]:
    
anhima.ld.plot_windowed_ld(g, pos, window_size=10000);
    
    
In [14]:
    
anhima.ld.plot_windowed_ld(g, pos, window_size=1000);
    
    
In [15]:
    
included = anhima.ld.ld_prune_pairwise(g)
    
Indices of the variants that have been retained.
In [16]:
    
np.nonzero(included)[0]
    
    Out[16]:
In [17]:
    
g_pruned = np.compress(included, g, axis=0)
g_pruned
    
    Out[17]:
In [18]:
    
r_squared_pruned = anhima.ld.pairwise_genotype_ld(g_pruned)
anhima.ld.plot_pairwise_ld(r_squared_pruned);
    
    
In [19]:
    
pos_pruned = pos[included]
cor_pruned, sep_pruned, dist_pruned = anhima.ld.pairwise_ld_decay(r_squared_pruned, pos_pruned)
    
In [20]:
    
anhima.ld.plot_ld_decay_by_separation(cor_pruned, sep_pruned);
    
    
In [21]:
    
anhima.ld.plot_ld_decay_by_distance(cor_pruned, dist_pruned, bins=np.arange(0, 10000, 400));
    
    
In [22]:
    
n_variants_big = 100000
g_big = anhima.sim.simulate_genotypes_with_ld(n_variants_big, n_samples, correlation=.97)
    
In [23]:
    
pos_big = np.random.randint(low=0, high=n_variants_big*100, size=n_variants_big)
pos_big.sort()
    
In [24]:
    
included_big = anhima.ld.ld_prune_pairwise(g_big)
    
In [25]:
    
np.count_nonzero(included_big)
    
    Out[25]:
In [26]:
    
g_big_pruned = np.compress(included_big, g_big, axis=0)
    
In [27]:
    
r_squared_big_pruned = anhima.ld.pairwise_genotype_ld(g_big_pruned)
    
In [28]:
    
anhima.ld.plot_pairwise_ld(r_squared_big_pruned[:100, :100]);
    
    
In [29]:
    
cor_big, sep_big, dist_big = anhima.ld.windowed_ld_decay(g_big, pos_big, window_size=1000, step=10)
    
In [30]:
    
anhima.ld.plot_ld_decay_by_separation(cor_big, sep_big, max_separation=100);
    
    
In [31]:
    
anhima.ld.plot_ld_decay_by_distance(cor_big, dist_big, bins=np.arange(0, 10000, 200));
    
    
In [32]:
    
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=100000);
    
    
In [33]:
    
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=10000);
    
    
In [34]: