In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.stats
%aimport allel.plot
%aimport allel.opt.stats
In [2]:
gn = np.array([[0, 0, 1, 0, 2, 0, 1, 1, 0],
[1, 0, 0, 1, 1, 2, 0, 0, 2]], dtype='i1')
In [3]:
np.corrcoef(gn)
Out[3]:
In [4]:
gn_sq = gn ** 2
In [5]:
allel.opt.stats.gn_corrcoef_int8(gn[0], gn[1], gn_sq[0], gn_sq[1])
Out[5]:
In [6]:
gnb = np.random.randint(0, 3, size=(2, 10000)).astype('i1')
In [7]:
np.corrcoef(gnb)
Out[7]:
In [8]:
gnb_sq = gnb ** 2
In [9]:
allel.opt.stats.gn_corrcoef_int8(gnb[0], gnb[1], gnb_sq[0], gnb_sq[1])
Out[9]:
In [10]:
%timeit np.corrcoef(gnb)
In [11]:
%timeit allel.opt.stats.gn_corrcoef_int8(gnb[0], gnb[1], gnb_sq[0], gnb_sq[1])
In [12]:
gnb2 = np.random.randint(0, 3, size=(1000, 1000)).astype('i1')
In [13]:
r2a = np.corrcoef(gnb2)
r2a
Out[13]:
In [14]:
r2b = allel.stats.rogers_huff_r(gnb2)
from scipy.spatial.distance import squareform
r2b_sq = squareform(r2b)
r2b_sq
Out[14]:
In [15]:
%timeit np.corrcoef(gnb2)
%memit np.corrcoef(gnb2)
In [16]:
%timeit allel.stats.rogers_huff_r(gnb2)
%memit allel.stats.rogers_huff_r(gnb2)
In [17]:
import cProfile
In [18]:
cProfile.run('allel.stats.rogers_huff_r(gnb2)', sort='time')
In [19]:
import random
import numpy as np
def simulate_genotypes_with_ld(n_variants, n_samples, correlation=0.2):
"""A very simple function to simulate a set of genotypes, where
variants are in some degree of linkage disequilibrium with their
neighbours.
Parameters
----------
n_variants : int
The number of variants to simulate data for.
n_samples : int
The number of individuals to simulate data for.
correlation : float, optional
The fraction of samples to copy genotypes between neighbouring
variants.
Returns
-------
gn : ndarray, int8
A 2-dimensional array of shape (n_variants, n_samples) where each
element is a genotype call coded as a single integer counting the
number of non-reference alleles.
"""
# initialise an array of random genotypes
gn = np.random.randint(size=(n_variants, n_samples), low=0, high=3)
gn = gn.astype('i1')
# determine the number of samples to copy genotypes for
n_copy = int(correlation * n_samples)
# introduce linkage disequilibrium by copying genotypes from one sample to
# the next
for i in range(1, n_variants):
# randomly pick the samples to copy from
sample_indices = random.sample(range(n_samples), n_copy)
# view genotypes from the previous variant for the selected samples
c = gn[i-1, sample_indices]
# randomly choose whether to invert the correlation
inv = random.randint(0, 1)
if inv:
c = 2-c
# copy across genotypes
gn[i, sample_indices] = c
return gn
In [20]:
gnl = simulate_genotypes_with_ld(1000, 1000, .97)
gnl.shape
Out[20]:
In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
In [22]:
x = allel.stats.rogers_huff_r(gnl) ** 2
In [23]:
allel.plot.pairwise_ld(x);
In [24]:
plt.hist(x);
In [25]:
loc = allel.stats.locate_unlinked(gnl, size=100, step=10, threshold=.4)
In [26]:
np.count_nonzero(loc)
Out[26]:
In [27]:
np.nonzero(loc)[0]
Out[27]:
In [28]:
gnu = gnl[loc]
y = allel.stats.rogers_huff_r(gnu) ** 2
plt.hist(y);
In [29]:
x = allel.stats.rogers_huff_r(gnu) ** 2
allel.plot.pairwise_ld(x);
In [30]:
%timeit allel.stats.locate_unlinked(gnl, size=100, step=10, threshold=.3)
%memit allel.stats.locate_unlinked(gnl, size=100, step=10, threshold=.3)
In [31]:
import cProfile
cProfile.run('allel.stats.locate_unlinked(gnl, size=100, step=10, threshold=.3)', sort='time')
In [ ]: