Setup


In [1]:
import sys
import numpy as np
import bcolz
import numexpr
import humanize
import h5py
import tempfile
def binarysize(n):
    return humanize.naturalsize(n, binary=True)

%reload_ext memory_profiler
sys.path.insert(0, '../..')
%reload_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.bcolz
%aimport allel.io

In [2]:
pos = np.unique(np.random.randint(0, 100000000, size=10000000))
pos.size, binarysize(pos.nbytes)


Out[2]:
(9516386, '72.6 MiB')

In [3]:
x = np.random.randint(0, 100, size=pos.size)
binarysize(x.nbytes)


Out[3]:
'72.6 MiB'

In [4]:
y = np.random.random(size=pos.size)
binarysize(y.nbytes)


Out[4]:
'72.6 MiB'

In [5]:
z = np.random.choice([b'a', b'b', b'c'], size=pos.size)
binarysize(z.nbytes)


Out[5]:
'9.1 MiB'

In [6]:
query = "(x > 50) & (y < .5) & (z == b'a')"

In [7]:
a = np.rec.fromarrays([pos, x, y, z], names=['pos', 'x', 'y', 'z'])
vt = allel.model.VariantTable(a, index='pos')
vt


Out[7]:
VariantTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
5 17 0.941178328631 b'c'
39 78 0.954006026102 b'c'
58 93 0.369881354724 b'b'
64 63 0.636430366687 b'a'
84 41 0.311720778772 b'b'

...


In [8]:
vt.x


Out[8]:
array([17, 78, 93, ..., 11, 13,  5])

In [9]:
vt[['pos', 'x']]


Out[9]:
VariantTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8')])
pos x
5 17
39 78
58 93
64 63
84 41

...


In [10]:
vt.display(10)


VariantTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
5 17 0.941178328631 b'c'
39 78 0.954006026102 b'c'
58 93 0.369881354724 b'b'
64 63 0.636430366687 b'a'
84 41 0.311720778772 b'b'
101 56 0.702506983122 b'c'
121 3 0.512401263151 b'c'
134 20 0.549689464029 b'b'
138 83 0.978512936125 b'c'
188 79 0.176854785614 b'a'

...


In [11]:
vtc = allel.bcolz.VariantCTable(a, index='pos')
vtc


Out[11]:
VariantCTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
5 17 0.941178328631 b'c'
39 78 0.954006026102 b'c'
58 93 0.369881354724 b'b'
64 63 0.636430366687 b'a'
84 41 0.311720778772 b'b'

...


In [16]:
vtc.ctbl


Out[16]:
ctable((9516386,), [('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
  nbytes: 226.89 MB; cbytes: 91.50 MB; ratio: 2.48
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[(5, 17, 0.9411783286306781, b'c') (39, 78, 0.9540060261020432, b'c')
 (58, 93, 0.3698813547240103, b'b') ...,
 (99999971, 11, 0.9627122072169108, b'b')
 (99999973, 13, 0.9434966815056626, b'a')
 (99999978, 5, 0.17309121393683735, b'c')]

In [12]:
vtc.x


Out[12]:
carray((9516386,), int64)
  nbytes: 72.60 MB; cbytes: 10.08 MB; ratio: 7.20
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[17 78 93 ..., 11 13  5]

In [13]:
vtc.display(10)


VariantCTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
5 17 0.941178328631 b'c'
39 78 0.954006026102 b'c'
58 93 0.369881354724 b'b'
64 63 0.636430366687 b'a'
84 41 0.311720778772 b'b'
101 56 0.702506983122 b'c'
121 3 0.512401263151 b'c'
134 20 0.549689464029 b'b'
138 83 0.978512936125 b'c'
188 79 0.176854785614 b'a'

...


In [14]:
rootdir = tempfile.mkdtemp()
vtcp = allel.bcolz.VariantCTable(a, index='pos', rootdir=rootdir, mode='w')
vtcp


Out[14]:
VariantCTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
5 17 0.941178328631 b'c'
39 78 0.954006026102 b'c'
58 93 0.369881354724 b'b'
64 63 0.636430366687 b'a'
84 41 0.311720778772 b'b'

...


In [15]:
vtcp2 = allel.bcolz.VariantCTable.open(rootdir)
vtcp2


Out[15]:
VariantCTable(9516386, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
5 17 0.941178328631 b'c'
39 78 0.954006026102 b'c'
58 93 0.369881354724 b'b'
64 63 0.636430366687 b'a'
84 41 0.311720778772 b'b'

...


In [17]:
vt.query(query).display(10)


VariantTable(776406, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
188 79 0.176854785614 b'a'
213 90 0.367455877413 b'a'
289 82 0.171532726593 b'a'
400 83 0.113906927424 b'a'
700 51 0.473933866061 b'a'
796 66 0.406920706967 b'a'
810 74 0.303002118461 b'a'
825 95 0.235118323645 b'a'
840 88 0.347419256774 b'a'
873 63 0.153837759783 b'a'

...


In [18]:
vtc.query(query).display(10)


VariantCTable(776406, dtype=[('pos', '<i8'), ('x', '<i8'), ('y', '<f8'), ('z', 'S1')])
pos x y z
188 79 0.176854785614 b'a'
213 90 0.367455877413 b'a'
289 82 0.171532726593 b'a'
400 83 0.113906927424 b'a'
700 51 0.473933866061 b'a'
796 66 0.406920706967 b'a'
810 74 0.303002118461 b'a'
825 95 0.235118323645 b'a'
840 88 0.347419256774 b'a'
873 63 0.153837759783 b'a'

...

Query profiling


In [18]:
# baselines with plain numpy arrays
%timeit eval(query)
%memit eval(query)
%timeit numexpr.evaluate(query)
%memit numexpr.evaluate(query)


10 loops, best of 3: 154 ms per loop
peak memory: 1170.03 MiB, increment: 0.11 MiB
10 loops, best of 3: 48.5 ms per loop
peak memory: 1170.08 MiB, increment: 0.05 MiB

In [19]:
vm = 'python'
%timeit vt.eval(query, vm=vm)
%memit vt.eval(query, vm=vm)
%timeit vtc.eval(query, vm=vm)
%memit vtc.eval(query, vm=vm)
%timeit vtcp.eval(query, vm=vm)
%memit vtcp.eval(query, vm=vm)


1 loops, best of 3: 252 ms per loop
peak memory: 1170.20 MiB, increment: 0.00 MiB
1 loops, best of 3: 304 ms per loop
peak memory: 1170.20 MiB, increment: 0.00 MiB
1 loops, best of 3: 343 ms per loop
peak memory: 1170.48 MiB, increment: 0.28 MiB

In [20]:
vm = 'numexpr'
%timeit vt.eval(query, vm=vm)
%memit vt.eval(query, vm=vm)
%timeit vtc.eval(query, vm=vm)
%memit vtc.eval(query, vm=vm)
%timeit vtcp.eval(query, vm=vm)
%memit vtcp.eval(query, vm=vm)


10 loops, best of 3: 61 ms per loop
peak memory: 1170.48 MiB, increment: 0.00 MiB
1 loops, best of 3: 168 ms per loop
peak memory: 1170.63 MiB, increment: 0.00 MiB
1 loops, best of 3: 227 ms per loop
peak memory: 1170.63 MiB, increment: 0.00 MiB

In [21]:
fn1 = tempfile.mktemp()
fn1


Out[21]:
'/tmp/tmpsoyqr5mx'

In [22]:
with h5py.File(fn1, mode='w') as h5f:
    h5g = h5f.create_group('table')
    h5g.create_dataset('pos', data=pos, chunks=(1000,))
    h5g.create_dataset('x', data=x, chunks=(1000,))
    h5g.create_dataset('y', data=y, chunks=(1000,))
    h5g.create_dataset('z', data=z, chunks=(1000,))

In [23]:
vtc2 = allel.bcolz.VariantCTable.from_hdf5_group(fn1, 'table', names=['pos', 'x', 'y', 'z'], cparams=bcolz.cparams(cname='lz4', clevel=4))
vtc2


Out[23]:
pos x y z
0 9 94 0.048534 b'c'
1 10 87 0.622883 b'b'
2 25 29 0.625948 b'b'
3 42 34 0.838398 b'a'
4 67 64 0.094999 b'c'

In [24]:
vtc2.pos


Out[24]:
carray((9515340,), int64)
  nbytes: 72.60 MB; cbytes: 11.84 MB; ratio: 6.13
  cparams := cparams(clevel=4, shuffle=True, cname='lz4')
[       9       10       25 ..., 99999966 99999973 99999995]

In [25]:
%timeit vtc2.eval(query)


1 loops, best of 3: 192 ms per loop

In [26]:
%memit vtc2.eval(query)


peak memory: 1306.82 MiB, increment: 0.76 MiB

In [27]:
h5g = h5py.File(fn1, mode='r')['table']
%timeit numexpr.evaluate(query, local_dict=h5g)
%memit numexpr.evaluate(query, local_dict=h5g)


1 loops, best of 3: 400 ms per loop
peak memory: 1463.86 MiB, increment: 145.38 MiB

VCF


In [21]:
size = 100000
pos = np.unique(np.random.randint(0, 100000000, size=size))
chrom = np.array([b'chr1'] * pos.size)
ref = np.random.choice([b'A', b'C', b'T', b'G'], size=pos.size)
x = np.random.randint(0, 100, size=pos.size)
y = np.random.random(size=pos.size)
z = np.random.choice([b'a', b'b', b'c'], size=pos.size)
flag = np.random.randint(0, 2, size=pos.size).astype(bool)
columns = [chrom, pos, ref, x, y, z, flag]
names = ['CHROM', 'POS', 'REF', 'DP', 'MQ', 'ZZ', 'FLG']
a = np.rec.fromarrays(columns, names=names)
vt = allel.model.VariantTable(a)
vt


Out[21]:
CHROM POS REF DP MQ ZZ FLG
0 b'chr1' 961 b'A' 4 0.922264 b'a' False
1 b'chr1' 1812 b'A' 70 0.970685 b'c' False
2 b'chr1' 1937 b'G' 71 0.925726 b'b' False
3 b'chr1' 2324 b'A' 71 0.190789 b'b' True
4 b'chr1' 2383 b'T' 50 0.214158 b'b' True

In [22]:
fn = tempfile.mktemp()
%time vt.to_vcf(fn)


CPU times: user 3.08 s, sys: 16 ms, total: 3.09 s
Wall time: 3.09 s

In [23]:
vct = allel.bcolz.VariantCTable(columns, names=names)
vct


Out[23]:
CHROM POS REF DP MQ ZZ FLG
0 b'chr1' 961 b'A' 4 0.922264 b'a' False
1 b'chr1' 1812 b'A' 70 0.970685 b'c' False
2 b'chr1' 1937 b'G' 71 0.925726 b'b' False
3 b'chr1' 2324 b'A' 71 0.190789 b'b' True
4 b'chr1' 2383 b'T' 50 0.214158 b'b' True

In [28]:
%time vct.to_vcf(fn)


CPU times: user 3.09 s, sys: 12 ms, total: 3.1 s
Wall time: 3.09 s

In [25]:
!head -n20 {fn}


##fileformat=VCFv4.1
##fileDate=20150225
##source=scikit-allel-0.6.0.dev2
##INFO=<ID=DP,Number=1,Type=Integer,Description="">
##INFO=<ID=FLG,Number=0,Type=Flag,Description="">
##INFO=<ID=MQ,Number=1,Type=Float,Description="">
##INFO=<ID=ZZ,Number=1,Type=String,Description="">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO













In [26]:
!tail -n20 {fn}






















In [ ]: