In [1]:
import sys
import numpy as np
import bcolz
import numexpr
import humanize
import h5py
import tempfile
def binarysize(n):
return humanize.naturalsize(n, binary=True)
%reload_ext memory_profiler
sys.path.insert(0, '../..')
%reload_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.bcolz
%aimport allel.io
In [2]:
pos = np.unique(np.random.randint(0, 100000000, size=10000000))
pos.size, binarysize(pos.nbytes)
Out[2]:
In [3]:
x = np.random.randint(0, 100, size=pos.size)
binarysize(x.nbytes)
Out[3]:
In [4]:
y = np.random.random(size=pos.size)
binarysize(y.nbytes)
Out[4]:
In [5]:
z = np.random.choice([b'a', b'b', b'c'], size=pos.size)
binarysize(z.nbytes)
Out[5]:
In [6]:
query = "(x > 50) & (y < .5) & (z == b'a')"
In [7]:
a = np.rec.fromarrays([pos, x, y, z], names=['pos', 'x', 'y', 'z'])
vt = allel.model.VariantTable(a, index='pos')
vt
Out[7]:
In [8]:
vt.x
Out[8]:
In [9]:
vt[['pos', 'x']]
Out[9]:
In [10]:
vt.display(10)
In [11]:
vtc = allel.bcolz.VariantCTable(a, index='pos')
vtc
Out[11]:
In [16]:
vtc.ctbl
Out[16]:
In [12]:
vtc.x
Out[12]:
In [13]:
vtc.display(10)
In [14]:
rootdir = tempfile.mkdtemp()
vtcp = allel.bcolz.VariantCTable(a, index='pos', rootdir=rootdir, mode='w')
vtcp
Out[14]:
In [15]:
vtcp2 = allel.bcolz.VariantCTable.open(rootdir)
vtcp2
Out[15]:
In [17]:
vt.query(query).display(10)
In [18]:
vtc.query(query).display(10)
In [18]:
# baselines with plain numpy arrays
%timeit eval(query)
%memit eval(query)
%timeit numexpr.evaluate(query)
%memit numexpr.evaluate(query)
In [19]:
vm = 'python'
%timeit vt.eval(query, vm=vm)
%memit vt.eval(query, vm=vm)
%timeit vtc.eval(query, vm=vm)
%memit vtc.eval(query, vm=vm)
%timeit vtcp.eval(query, vm=vm)
%memit vtcp.eval(query, vm=vm)
In [20]:
vm = 'numexpr'
%timeit vt.eval(query, vm=vm)
%memit vt.eval(query, vm=vm)
%timeit vtc.eval(query, vm=vm)
%memit vtc.eval(query, vm=vm)
%timeit vtcp.eval(query, vm=vm)
%memit vtcp.eval(query, vm=vm)
In [21]:
fn1 = tempfile.mktemp()
fn1
Out[21]:
In [22]:
with h5py.File(fn1, mode='w') as h5f:
h5g = h5f.create_group('table')
h5g.create_dataset('pos', data=pos, chunks=(1000,))
h5g.create_dataset('x', data=x, chunks=(1000,))
h5g.create_dataset('y', data=y, chunks=(1000,))
h5g.create_dataset('z', data=z, chunks=(1000,))
In [23]:
vtc2 = allel.bcolz.VariantCTable.from_hdf5_group(fn1, 'table', names=['pos', 'x', 'y', 'z'], cparams=bcolz.cparams(cname='lz4', clevel=4))
vtc2
Out[23]:
In [24]:
vtc2.pos
Out[24]:
In [25]:
%timeit vtc2.eval(query)
In [26]:
%memit vtc2.eval(query)
In [27]:
h5g = h5py.File(fn1, mode='r')['table']
%timeit numexpr.evaluate(query, local_dict=h5g)
%memit numexpr.evaluate(query, local_dict=h5g)
In [21]:
size = 100000
pos = np.unique(np.random.randint(0, 100000000, size=size))
chrom = np.array([b'chr1'] * pos.size)
ref = np.random.choice([b'A', b'C', b'T', b'G'], size=pos.size)
x = np.random.randint(0, 100, size=pos.size)
y = np.random.random(size=pos.size)
z = np.random.choice([b'a', b'b', b'c'], size=pos.size)
flag = np.random.randint(0, 2, size=pos.size).astype(bool)
columns = [chrom, pos, ref, x, y, z, flag]
names = ['CHROM', 'POS', 'REF', 'DP', 'MQ', 'ZZ', 'FLG']
a = np.rec.fromarrays(columns, names=names)
vt = allel.model.VariantTable(a)
vt
Out[21]:
In [22]:
fn = tempfile.mktemp()
%time vt.to_vcf(fn)
In [23]:
vct = allel.bcolz.VariantCTable(columns, names=names)
vct
Out[23]:
In [28]:
%time vct.to_vcf(fn)
In [25]:
!head -n20 {fn}
In [26]:
!tail -n20 {fn}
In [ ]: