In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
def binarysize(n):
return humanize.naturalsize(n, binary=True)
import matplotlib.pyplot as plt
%matplotlib inline
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel
%aimport allel.model
%aimport allel.bcolz
%aimport allel.stats
%aimport allel.plot
allel.__version__
Out[1]:
In [2]:
# setup an array of genotype calls
shape = n_variants, n_samples, ploidy = 5000000, 100, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 1
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 2
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 2
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 3
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 3
In [3]:
g = allel.bcolz.GenotypeCArray(data, copy=False)
print(binarysize(g.nbytes))
In [4]:
ac = g.count_alleles()[:]
ac.shape
Out[4]:
In [5]:
ac
Out[5]:
In [6]:
%timeit mpd = allel.stats.mean_pairwise_diversity(ac)
In [7]:
import cProfile
cProfile.run('allel.stats.mean_pairwise_diversity(ac)', sort='time')
In [8]:
pos = np.unique(np.random.randint(1, 100000000, size=n_variants*2))[:n_variants]
pos.shape, pos.min(), pos.max(), pos.dtype
Out[8]:
In [11]:
pi, windows, n_bases, counts = allel.stats.windowed_diversity(pos, ac, size=20000, start=1)
In [12]:
plt.hist(pi, bins=20);
In [9]:
%timeit pi, windows, n_bases, counts = allel.stats.windowed_diversity(pos, ac, size=20000, start=1)
In [16]:
cProfile.run('allel.stats.windowed_diversity(pos, ac, size=20000, start=1)', sort='time')
In [ ]: