In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
import matplotlib.pyplot as plt
%matplotlib inline
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel
%aimport allel.model
%aimport allel.bcolz
%aimport allel.stats
%aimport allel.plot
allel.__version__
Out[1]:
In [2]:
def binarysize(n):
return humanize.naturalsize(n, binary=True)
In [3]:
# setup an array of genotype calls
shape = n_variants, n_samples, ploidy = 500000, 100, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 1
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1
In [4]:
g = allel.model.GenotypeArray(data, copy=False)
print(binarysize(g.nbytes))
In [5]:
gc = allel.bcolz.GenotypeCArray(data)
print(binarysize(gc.cbytes))
In [6]:
gn = g.to_n_alt()
print(binarysize(gn.nbytes))
In [7]:
gnc = gc.to_n_alt()
print(binarysize(gnc.cbytes))
In [8]:
d1 = allel.stats.pairwise_distance(gn, metric='cityblock')
In [9]:
plt.hist(d1);
In [10]:
allel.plot.pairwise_distance(d1);
In [11]:
d2 = allel.stats.pairwise_distance(gnc, metric='cityblock')
In [12]:
np.array_equal(d1, d2)
Out[12]:
In [13]:
%timeit allel.stats.pairwise_distance(gn, metric='cityblock')
In [14]:
%timeit allel.stats.pairwise_distance(gnc, metric='cityblock')
In [15]:
%memit allel.stats.pairwise_distance(gn, metric='cityblock')
In [16]:
%memit allel.stats.pairwise_distance(gnc, metric='cityblock')
In [13]:
import scipy.spatial
In [14]:
d1 = scipy.spatial.distance.pdist(gn[:100].T, 'cityblock')
In [15]:
d2 = allel.stats.pdist(gn[:100], 'cityblock')
In [16]:
d1
Out[16]:
In [17]:
d2
Out[17]:
In [18]:
np.array_equal(d1, d2)
Out[18]:
In [19]:
%timeit scipy.spatial.distance.pdist(gn[:1000].T, 'cityblock')
In [20]:
%timeit scipy.spatial.distance.pdist(gn[:1000].T, scipy.spatial.distance.cityblock)
In [21]:
%timeit allel.stats.pdist(gn[:1000], scipy.spatial.distance.cityblock)
In [22]:
gac = g[:1000].to_allele_counts()
gac.shape
Out[22]:
In [23]:
pos = np.unique(np.random.randint(0, 1000000, size=10000))[:1000]
pos.shape, pos.min(), pos.max()
Out[23]:
In [24]:
is_accessible = np.random.randint(0, 2, size=pos.max()).astype('b1')
is_accessible.shape, np.count_nonzero(is_accessible)
Out[24]:
In [54]:
dxy = lambda ac1, ac2: allel.stats.sequence_divergence(pos, ac1, ac2, is_accessible=is_accessible)
In [55]:
dxy(gac[:, 0], gac[:, 1])
Out[55]:
In [56]:
d3 = allel.stats.pairwise_distance(gac, dxy)
In [57]:
d3
Out[57]:
In [29]:
allel.plot.pairwise_distance(d3);
In [58]:
%timeit allel.stats.pairwise_distance(gac, dxy)
In [59]:
d4 = allel.stats.pairwise_dxy(pos, gac, is_accessible=is_accessible)
d4
Out[59]:
In [62]:
np.array_equal(d3, d4)
Out[62]:
In [60]:
%timeit allel.stats.pairwise_dxy(pos, gac)
In [61]:
%timeit allel.stats.pairwise_dxy(pos, gac, is_accessible=is_accessible)
In [64]:
%memit allel.stats.pairwise_dxy(pos, gac)
In [63]:
%memit allel.stats.pairwise_dxy(pos, gac, is_accessible=is_accessible)
In [ ]: