In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
import sh
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.bcolz
In [2]:
def binarysize(n):
return humanize.naturalsize(n, binary=True)
In [3]:
# setup an array of genotype calls
shape = n_variants, n_samples, ploidy = 50000, 1000, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
data[:, :, 1].reshape((-1))[idx] = 1
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1
In [4]:
g = allel.model.GenotypeArray(data, copy=False)
print(binarysize(g.nbytes))
In [5]:
gc = allel.bcolz.GenotypeCArray(data)
print(binarysize(gc.cbytes), gc.chunklen)
In [6]:
import tempfile
rootdir = tempfile.mkdtemp()
rootdir
Out[6]:
In [7]:
gcp = allel.bcolz.GenotypeCArray(data, rootdir=rootdir, mode='w')
gcp.carr.flush()
print(binarysize(gc.cbytes), gc.chunklen)
In [8]:
mask = np.random.randint(0, 2, size=(n_variants, n_samples)).astype('b1')
gm = g.copy()
gm.mask = mask
In [9]:
gcm = allel.bcolz.GenotypeCArray(data)
gcm.mask = mask
In [10]:
m = g.to_sparse(format='csr')
m
Out[10]:
In [11]:
print(m.data.dtype, binarysize(m.data.nbytes), binarysize(m.indices.nbytes), binarysize(m.indptr.nbytes))
In [12]:
g.count_called(), gc.count_called(), gcm.count_called(), gm.count_called()
Out[12]:
In [13]:
g.count_hom_ref(), gc.count_hom_ref(), gm.count_hom_ref(), gcm.count_hom_ref()
Out[13]:
In [14]:
g.count_het(), gc.count_het(), gm.count_het(), gcm.count_het()
Out[14]:
In [15]:
g.count_hom_alt(), gc.count_hom_alt(), gm.count_hom_alt(), gcm.count_hom_alt()
Out[15]:
In [16]:
sh.sudo.drop_cache()
%timeit n = g.count_called()
%timeit n = gc.count_called()
%timeit n = gcp.count_called()
%timeit n = gm.count_called()
%timeit n = gcm.count_called()
sh.sudo.drop_cache()
%memit n = g.count_called()
%memit n = gc.count_called()
%memit n = gcp.count_called()
%memit n = gm.count_called()
%memit n = gcm.count_called()
In [17]:
%timeit n = g.count_het()
%timeit n = gc.count_het()
%timeit n = gcp.count_het()
%memit n = g.count_het()
%memit n = gc.count_het()
%memit n = gcp.count_het()
In [18]:
%timeit n = g.count_call((0, 1))
%timeit n = gc.count_call((0, 1))
%timeit n = gcp.count_call((0, 1))
%memit n = g.count_call((0, 1))
%memit n = gc.count_call((0, 1))
%memit n = gcp.count_call((0, 1))
In [19]:
%timeit gn = g.to_n_alt()
%timeit gn = gc.to_n_alt()
%timeit gn = gcp.to_n_alt()
%memit gn = g.to_n_alt()
%memit gn = gc.to_n_alt()
%memit gn = gcp.to_n_alt()
In [20]:
gc.to_n_alt()
Out[20]:
In [21]:
%timeit x = g.to_packed()
%timeit x = gc.to_packed()
%timeit x = gcp.to_packed()
%memit x = g.to_packed()
%memit x = gc.to_packed()
%memit x = gcp.to_packed()
In [22]:
gc.to_packed()
Out[22]:
In [23]:
%timeit x = g.to_allele_counts()
%timeit x = gc.to_allele_counts()
%timeit x = gcp.to_allele_counts()
%memit x = g.to_allele_counts()
%memit x = gc.to_allele_counts()
%memit x = gcp.to_allele_counts()
In [24]:
mapping = np.array([[1, 0]] * g.shape[0])
mapping
Out[24]:
In [25]:
g.map_alleles(mapping)
Out[25]:
In [26]:
g2 = g.copy()
g3 = g.astype('i2')
%timeit x = g2.map_alleles(mapping, copy=False)
%timeit x = g3.map_alleles(mapping, copy=False)
%timeit x = gc.map_alleles(mapping)
%timeit x = gcp.map_alleles(mapping)
%memit x = g2.map_alleles(mapping, copy=False)
%memit x = g3.map_alleles(mapping, copy=False)
%memit x = gc.map_alleles(mapping)
%memit x = gcp.map_alleles(mapping)
In [27]:
np.array_equal(g.map_alleles(mapping), g.astype('i2').map_alleles(mapping))
Out[27]:
In [28]:
h = g.to_haplotypes()
h
Out[28]:
In [29]:
hc = gc.to_haplotypes()
hc
Out[29]:
In [30]:
%timeit x = g.max()
%timeit x = gc.max()
%memit x = g.max()
%memit x = gc.max()
In [31]:
%timeit x = h.max()
%timeit x = hc.max()
%memit x = h.max()
%memit x = hc.max()
In [32]:
x = g.count_alleles(max_allele=2)[:, 0]
y = gm.count_alleles(max_allele=2)[:, 0]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(x, y, 'ko')
Out[32]:
In [33]:
sh.sudo.drop_cache()
%timeit x = g.count_alleles(max_allele=2)
%timeit x = gc.count_alleles(max_allele=2)
%timeit x = gcp.count_alleles(max_allele=2)
%timeit x = gm.count_alleles(max_allele=2)
%timeit x = gcm.count_alleles(max_allele=2)
sh.sudo.drop_cache()
%memit x = g.count_alleles(max_allele=2)
%memit x = gc.count_alleles(max_allele=2)
%memit x = gcp.count_alleles(max_allele=2)
%memit x = gm.count_alleles(max_allele=2)
%memit x = gcm.count_alleles(max_allele=2)
In [34]:
%timeit x = h.count_alleles(max_allele=2)
%timeit x = hc.count_alleles(max_allele=2)
%memit x = h.count_alleles(max_allele=2)
%memit x = hc.count_alleles(max_allele=2)
In [35]:
gsub = np.random.choice(list(range(g.shape[1])), size=(g.shape[1]//2), replace=False)
len(gsub)
Out[35]:
In [36]:
np.array_equal(g.count_alleles(max_allele=2, subpop=gsub),
g.take(gsub, axis=1).count_alleles(max_allele=2))
Out[36]:
In [37]:
%timeit x = g.count_alleles(max_allele=2, subpop=gsub)
%timeit x = g.take(gsub, axis=1).count_alleles(max_allele=2)
%timeit x = gc.count_alleles(max_allele=2, subpop=gsub)
%timeit x = gc.take(gsub, axis=1).count_alleles(max_allele=2)
%memit x = g.count_alleles(max_allele=2, subpop=gsub)
%memit x = g.take(gsub, axis=1).count_alleles(max_allele=2)
%memit x = gc.count_alleles(max_allele=2, subpop=gsub)
%memit x = gc.take(gsub, axis=1).count_alleles(max_allele=2)
In [38]:
sh.sudo.drop_cache()
%timeit x = g.count_alleles(max_allele=2, subpop=gsub)
%timeit x = gc.count_alleles(max_allele=2, subpop=gsub)
%timeit x = gcp.count_alleles(max_allele=2, subpop=gsub)
%timeit x = gm.count_alleles(max_allele=2, subpop=gsub)
%timeit x = gcm.count_alleles(max_allele=2, subpop=gsub)
sh.sudo.drop_cache()
%memit x = g.count_alleles(max_allele=2, subpop=gsub)
%memit x = gc.count_alleles(max_allele=2, subpop=gsub)
%memit x = gcp.count_alleles(max_allele=2, subpop=gsub)
%memit x = gm.count_alleles(max_allele=2, subpop=gsub)
%memit x = gcm.count_alleles(max_allele=2, subpop=gsub)
In [39]:
hsub = [ploidy*i + n for i in gsub for n in range(ploidy)]
len(hsub)
Out[39]:
In [40]:
np.array_equal(h.count_alleles(max_allele=2, subpop=hsub),
h.take(hsub, axis=1).count_alleles(max_allele=2))
Out[40]:
In [41]:
%timeit x = h.count_alleles(max_allele=2, subpop=hsub)
%timeit x = h.take(hsub, axis=1).count_alleles(max_allele=2)
%timeit x = hc.count_alleles(max_allele=2, subpop=hsub)
%timeit x = hc.take(hsub, axis=1).count_alleles(max_allele=2)
%memit x = h.count_alleles(max_allele=2, subpop=hsub)
%memit x = h.take(hsub, axis=1).count_alleles(max_allele=2)
%memit x = hc.count_alleles(max_allele=2, subpop=hsub)
%memit x = hc.take(hsub, axis=1).count_alleles(max_allele=2)
In [42]:
gsub1 = np.random.choice(list(range(g.shape[1])), size=(g.shape[1]//4), replace=False)
gsub2 = np.random.choice(list(range(g.shape[1])), size=(g.shape[1]//4), replace=False)
subpops = {'sub1': gsub1, 'sub2': gsub2}
In [43]:
%timeit gc.count_alleles_subpops(max_allele=2, subpops=subpops)
%timeit {name: gc.count_alleles(max_allele=2, subpop=subpop) for name, subpop in subpops.items()}
%memit gc.count_alleles_subpops(max_allele=2, subpops=subpops)
%memit {name: gc.count_alleles(max_allele=2, subpop=subpop) for name, subpop in subpops.items()}
In [44]:
x1 = gc.count_alleles_subpops(max_allele=2, subpops=subpops)
x1
Out[44]:
In [45]:
x1['sub1']
Out[45]:
In [46]:
x1['sub2']
Out[46]:
In [47]:
x2 = {name: gc.count_alleles(max_allele=2, subpop=subpop) for name, subpop in subpops.items()}
x2
Out[47]:
In [48]:
np.array_equal(x1['sub2'], x2['sub2'])
Out[48]:
In [49]:
variants = np.random.randint(0, 2, size=n_variants).astype(bool)
np.count_nonzero(variants)
Out[49]:
In [50]:
samples = np.random.randint(0, 2, size=n_samples).astype(bool)
np.count_nonzero(samples)
Out[50]:
In [51]:
gc.compress(variants, axis=0)
Out[51]:
In [52]:
%timeit x = g.compress(variants, axis=0)
%timeit x = gc.compress(variants, axis=0)
%memit x = g.compress(variants, axis=0)
%memit x = gc.compress(variants, axis=0)
In [53]:
indices = np.nonzero(variants)[0]
%timeit x = g.take(indices, axis=0)
%timeit x = gc.take(indices, axis=0)
%memit x = g.take(indices, axis=0)
%memit x = gc.take(indices, axis=0)
In [54]:
%timeit x = g.compress(samples, axis=1)
%timeit x = gc.compress(samples, axis=1)
%memit x = g.compress(samples, axis=1)
%memit x = gc.compress(samples, axis=1)
In [55]:
indices = np.nonzero(samples)[0]
%timeit x = g.take(indices, axis=1)
%timeit x = gc.take(indices, axis=1)
%memit x = g.take(indices, axis=1)
%memit x = gc.take(indices, axis=1)
In [56]:
%timeit x = g.subset(variants, samples)
%timeit x = gc.subset(variants, samples)
%memit x = g.subset(variants, samples)
%memit x = gc.subset(variants, samples)
In [57]:
gc.subset(variants, samples)
Out[57]:
In [59]:
gms = gm.subset(variants, samples)
gms.shape, gms.mask.shape
Out[59]:
In [60]:
gcms = gcm.subset(variants, samples)
gcms.shape, gcms.mask.shape
Out[60]:
In [ ]: