In [1]:
import h5py
import numpy as np
import bcolz
import zarr
import sys
# Put the directory two levels up at the front of the import path before
# importing allel -- presumably so a local source checkout is picked up
# instead of an installed copy (TODO confirm).
sys.path.insert(0, '../..')
import allel
# Show which allel version was actually imported.
allel.__version__


Out[1]:
'0.20.3'

In [2]:
# data from http://www.malariagen.net/data/ag1000g-phase1-ar3
# Open the callset HDF5 file read-only.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the file exists at exactly this location.
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.h5',
                    mode='r')

Chunked arrays


In [3]:
# Wrap the '3L/calldata/genotype' HDF5 dataset in a chunked genotype array.
# The trailing expression displays its summary and first rows.
genotype = allel.GenotypeChunkedArray(callset['3L/calldata/genotype'])
genotype


Out[3]:
GenotypeChunkedArray((9643193, 765, 2), int8, chunks=(6553, 10, 2))
nbytes: 13.7G; cbytes: 548.0M; cratio: 25.7;
compression: gzip; compression_opts: 3;
data: h5py._hl.dataset.Dataset
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [4]:
# Print the plain-text repr explicitly (summary lines only, no data rows).
print(repr(genotype))


GenotypeChunkedArray((9643193, 765, 2), int8, chunks=(6553, 10, 2))
  nbytes: 13.7G; cbytes: 548.0M; cratio: 25.7;
  compression: gzip; compression_opts: 3;
  data: h5py._hl.dataset.Dataset

In [5]:
# Time max() over the whole HDF5-backed array.
%time genotype.max()


CPU times: user 38.7 s, sys: 124 ms, total: 38.8 s
Wall time: 38.8 s
Out[5]:
3

In [4]:
# Request 8 threads for zarr's Blosc compressor.
# NOTE(review): the displayed return value looks like the previous thread
# count rather than the new one -- confirm against the zarr docs.
zarr.blosc.set_nthreads(8)


Out[4]:
4

In [5]:
# Request 8 threads for bcolz as well.
bcolz.set_nthreads(8)


Out[5]:
8

In [5]:
# Copy the first 1,000,000 rows without naming a storage, to see which
# backend and chunk shape the default produces. The copy is only displayed,
# not kept.
genotype.copy(stop=1000000)


Out[5]:
GenotypeChunkedArray((1000000, 765, 2), int8, chunks=(1370, 765, 2))
nbytes: 1.4G; cbytes: 85.0M; cratio: 17.2;
compression: blosc; compression_opts: cparams(clevel=5, shuffle=1, cname='lz4', quantize=0);
data: bcolz.carray_ext.carray
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [7]:
# Copy the first 1,000,000 rows into 'bcolzmem' storage, time max() on the
# copy, then display it.
genotype_bcolzmem = genotype.copy(stop=1000000, storage='bcolzmem')
%time genotype_bcolzmem.max()
genotype_bcolzmem


CPU times: user 7.4 s, sys: 144 ms, total: 7.54 s
Wall time: 2.63 s
Out[7]:
GenotypeChunkedArray((1000000, 765, 2), int8, chunks=(1370, 765, 2)) nbytes: 1.4G; cbytes: 85.0M; cratio: 17.2; compression: blosc; compression_opts: cparams(clevel=5, shuffle=1, cname='lz4', quantize=0); data: bcolz.carray_ext.carray
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [9]:
genotype_zarrmem = genotype.copy(stop=1000000, storage='zarrmem', chunks=genotype_bcolzmem.chunks,
                                 compression_opts=dict(cname='lz4', clevel=5, shuffle=1))
%time genotype_zarrmem.max()
genotype_zarrmem


CPU times: user 7.49 s, sys: 64 ms, total: 7.56 s
Wall time: 2.73 s
Out[9]:
GenotypeChunkedArray((1000000, 765, 2), int8, chunks=(1370, 765, 2)) nbytes: 1.4G; cbytes: 83.1M; cratio: 17.6; compression: blosc; compression_opts: {'clevel': 5, 'shuffle': 1, 'cname': 'lz4'}; data: zarr.core.Array
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [19]:
# Same slice in 'bcolztmp' storage (presumably a bcolz array backed by a
# temporary directory rather than memory -- TODO confirm); time max() and
# display.
genotype_bcolztmp = genotype.copy(stop=1000000, storage='bcolztmp')
%time genotype_bcolztmp.max()
genotype_bcolztmp


CPU times: user 6.24 s, sys: 72 ms, total: 6.31 s
Wall time: 2.47 s
Out[19]:
GenotypeChunkedArray((1000000, 765, 2), int8, chunks=(1370, 765, 2), nbytes=1.4G, cbytes=85.0M, cratio=17.2, compression=blosc, compression_opts=cparams(clevel=5, shuffle=1, cname='lz4', quantize=0), data=bcolz.carray_ext.carray)
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [20]:
# Same slice in 'bcolzmem_zlib1' storage; the displayed repr reports Blosc
# with cname='zlib' at clevel=1. Time max() and display.
genotype_bcolzmem_zlib1 = genotype.copy(stop=1000000, storage='bcolzmem_zlib1')
%time genotype_bcolzmem_zlib1.max()
genotype_bcolzmem_zlib1


CPU times: user 9.48 s, sys: 164 ms, total: 9.65 s
Wall time: 3.09 s
Out[20]:
GenotypeChunkedArray((1000000, 765, 2), int8, chunks=(1370, 765, 2), nbytes=1.4G, cbytes=54.7M, cratio=26.7, compression=blosc, compression_opts=cparams(clevel=1, shuffle=1, cname='zlib', quantize=0), data=bcolz.carray_ext.carray)
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [21]:
# Same slice in 'bcolztmp_zlib1' storage (presumably the temp-directory
# variant of the zlib level-1 bcolz storage -- TODO confirm); time max().
genotype_bcolztmp_zlib1 = genotype.copy(stop=1000000, storage='bcolztmp_zlib1')
%time genotype_bcolztmp_zlib1.max()


CPU times: user 9.88 s, sys: 240 ms, total: 10.1 s
Wall time: 3.16 s
Out[21]:
3

In [22]:
# Same slice in 'hdf5mem_zlib1' storage; the displayed repr reports an h5py
# dataset with gzip compression at level 1. Time max() and display.
genotype_hdf5mem_zlib1 = genotype.copy(stop=1000000, storage='hdf5mem_zlib1')
%time genotype_hdf5mem_zlib1.max()
genotype_hdf5mem_zlib1


CPU times: user 4.12 s, sys: 4 ms, total: 4.12 s
Wall time: 4.12 s
Out[22]:
GenotypeChunkedArray((1000000, 765, 2), int8, chunks=(685, 765, 2), nbytes=1.4G, cbytes=52.4M, cratio=27.8, compression=gzip, compression_opts=1, data=h5py._hl.dataset.Dataset)
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [23]:
# Same slice in 'hdf5tmp_zlib1' storage (presumably HDF5 in a temporary file
# -- TODO confirm); time max().
genotype_hdf5tmp_zlib1 = genotype.copy(stop=1000000, storage='hdf5tmp_zlib1')
%time genotype_hdf5tmp_zlib1.max()


CPU times: user 4.05 s, sys: 36 ms, total: 4.09 s
Wall time: 4.08 s
Out[23]:
3

Tune default HDF5 chunk size


In [11]:
# Copy the full array (no stop) into 'bcolzmem' just to inspect the chunk
# length chosen for it. The copy itself is discarded.
genotype.copy(storage='bcolzmem').chunklen


Out[11]:
2741

In [12]:
# Chunk length of the 1,000,000-row bcolz copy, for comparison.
genotype_bcolzmem.chunklen


Out[12]:
1370

In [13]:
# Chunk shape chosen by default for the HDF5 in-memory copy.
genotype_hdf5mem_zlib1.chunks


Out[13]:
(685, 765, 2)

In [24]:
for chunksize in 2**15, 2**16, 2**17, 2**18, 2**19, 2**20, 2**21, 2**22, 2**23, 2**24, 2**25:
    chunklen = chunksize // (genotype.shape[1] * genotype.shape[2])
    chunks = (chunklen,) + genotype.shape[1:]
    print(chunksize, chunklen)
    genotype_hdf5mem_zlib1_bigchunks = genotype.copy(stop=1000000, storage='hdf5mem_zlib1', chunks=chunks)
    %time genotype_hdf5mem_zlib1_bigchunks.max()


32768 21
CPU times: user 9.11 s, sys: 0 ns, total: 9.11 s
Wall time: 9.09 s
65536 42
CPU times: user 5.96 s, sys: 0 ns, total: 5.96 s
Wall time: 5.95 s
131072 85
CPU times: user 4.39 s, sys: 0 ns, total: 4.39 s
Wall time: 4.39 s
262144 171
CPU times: user 3.61 s, sys: 0 ns, total: 3.61 s
Wall time: 3.6 s
524288 342
CPU times: user 3.19 s, sys: 0 ns, total: 3.19 s
Wall time: 3.18 s
1048576 685
CPU times: user 3.78 s, sys: 0 ns, total: 3.78 s
Wall time: 3.77 s
2097152 1370
CPU times: user 2.98 s, sys: 0 ns, total: 2.98 s
Wall time: 2.97 s
4194304 2741
CPU times: user 2.89 s, sys: 0 ns, total: 2.89 s
Wall time: 2.89 s
8388608 5482
CPU times: user 2.89 s, sys: 0 ns, total: 2.89 s
Wall time: 2.89 s
16777216 10965
CPU times: user 2.98 s, sys: 0 ns, total: 2.98 s
Wall time: 2.98 s
33554432 21931
CPU times: user 2.99 s, sys: 140 ms, total: 3.13 s
Wall time: 3.13 s

In [26]:
for chunksize in 2**15, 2**16, 2**17, 2**18, 2**19, 2**20, 2**21, 2**22, 2**23, 2**24, 2**25:
    chunklen = chunksize // (genotype.shape[1] * genotype.shape[2])
    print(chunksize, chunklen)
    genotype_bcolzmem_bigchunks = genotype.copy(stop=1000000, storage='bcolzmem', chunklen=chunklen)
    %time genotype_bcolzmem_bigchunks.max()


32768 21
CPU times: user 4.81 s, sys: 0 ns, total: 4.81 s
Wall time: 4.81 s
65536 42
CPU times: user 3.33 s, sys: 4 ms, total: 3.33 s
Wall time: 3.32 s
131072 85
CPU times: user 7.37 s, sys: 724 ms, total: 8.1 s
Wall time: 4.05 s
262144 171
CPU times: user 7.29 s, sys: 432 ms, total: 7.72 s
Wall time: 3.2 s
524288 342
CPU times: user 7.7 s, sys: 276 ms, total: 7.97 s
Wall time: 3.15 s
1048576 685
CPU times: user 7.26 s, sys: 92 ms, total: 7.35 s
Wall time: 2.58 s
2097152 1370
CPU times: user 6.6 s, sys: 80 ms, total: 6.68 s
Wall time: 2.32 s
4194304 2741
CPU times: user 6.16 s, sys: 24 ms, total: 6.18 s
Wall time: 2.21 s
8388608 5482
CPU times: user 5.48 s, sys: 4 ms, total: 5.48 s
Wall time: 1.99 s
16777216 10965
CPU times: user 5.12 s, sys: 8 ms, total: 5.13 s
Wall time: 1.8 s
33554432 21931
CPU times: user 4.53 s, sys: 480 ms, total: 5.01 s
Wall time: 1.55 s

Exercise methods


In [27]:
# Time is_called() on the bcolz copy. No storage is named for the result;
# in this run it comes back as a zarr array (see output).
%time genotype_bcolzmem.is_called()


CPU times: user 12.1 s, sys: 160 ms, total: 12.3 s
Wall time: 3.04 s
Out[27]:
zarr.core.Array((1000000, 765), bool, chunks=(1370, 765), order=C)
  compression: blosc; compression_opts: {'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}
  nbytes: 729.6M; nbytes_stored: 3.6M; ratio: 204.7; initialized: 730/730
  store: builtins.dict

In [28]:
# Time is_called() on the HDF5 copy, asking for the result to be stored as
# 'hdf5mem_zlib1' as well.
%time genotype_hdf5mem_zlib1.is_called(storage='hdf5mem_zlib1')


CPU times: user 10.6 s, sys: 180 ms, total: 10.8 s
Wall time: 6.98 s
Out[28]:
<HDF5 dataset "data": shape (1000000, 765), type "|b1">

In [29]:
# Time count_het() on the bcolz copy; it returns a single integer.
%time genotype_bcolzmem.count_het()


CPU times: user 14.1 s, sys: 176 ms, total: 14.3 s
Wall time: 3.45 s
Out[29]:
28029171

In [30]:
# Same count on the HDF5 copy; the value should match the bcolz result.
%time genotype_hdf5mem_zlib1.count_het()


CPU times: user 11.1 s, sys: 164 ms, total: 11.3 s
Wall time: 5.44 s
Out[30]:
28029171

In [31]:
# Time count_alleles() on the bcolz (lz4) copy, with default result storage.
%time genotype_bcolzmem.count_alleles()


CPU times: user 17.1 s, sys: 292 ms, total: 17.4 s
Wall time: 7.52 s
Out[31]:
AlleleCountsChunkedArray((1000000, 4), int32, chunks=(65536, 4), nbytes=15.3M, cbytes=4.3M, cratio=3.6, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)
0 1 2 3
0 1527 3 0 0
1 1529 1 0 0
2 1528 2 0 0
3 1528 2 0 0
4 1526 4 0 0

...


In [32]:
# Same computation reading from the zlib level-1 bcolz copy, to compare
# timings across input compressors.
%time genotype_bcolzmem_zlib1.count_alleles()


CPU times: user 23.8 s, sys: 580 ms, total: 24.4 s
Wall time: 8.81 s
Out[32]:
AlleleCountsChunkedArray((1000000, 4), int32, chunks=(65536, 4), nbytes=15.3M, cbytes=4.3M, cratio=3.6, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)
0 1 2 3
0 1527 3 0 0
1 1529 1 0 0
2 1528 2 0 0
3 1528 2 0 0
4 1526 4 0 0

...


In [33]:
# Same computation reading from, and writing the result to, 'hdf5mem_zlib1'.
%time genotype_hdf5mem_zlib1.count_alleles(storage='hdf5mem_zlib1')


CPU times: user 7.89 s, sys: 4 ms, total: 7.89 s
Wall time: 7.88 s
Out[33]:
AlleleCountsChunkedArray((1000000, 4), int32, chunks=(65536, 4), nbytes=15.3M, cbytes=2.4M, cratio=6.3, compression=gzip, compression_opts=1, data=h5py._hl.dataset.Dataset)
0 1 2 3
0 1527 3 0 0
1 1529 1 0 0
2 1528 2 0 0
3 1528 2 0 0
4 1526 4 0 0

...


In [34]:
# Allele counts for two subpopulations, each given as a list of indices
# (0-99 and 100-199; presumably indices along the second axis -- TODO confirm).
# With max_allele=3 each population gets four count columns (see output).
acs = genotype_bcolzmem.count_alleles_subpops(
    subpops={
        'pop1': list(range(0, 100)),
        'pop2': list(range(100, 200)),
    },
    max_allele=3,
)
acs


Out[34]:
AlleleCountsChunkedTable(1000000, nbytes=30.5M, cbytes=4.3M, cratio=7.1, data=allel.chunked.storage_zarr.ZarrTable)
pop1 pop2
[200 0 0 0] [200 0 0 0]
[199 1 0 0] [200 0 0 0]
[200 0 0 0] [200 0 0 0]
[198 2 0 0] [200 0 0 0]
[196 4 0 0] [200 0 0 0]

...


In [35]:
# Selecting one population from the table yields an allele-counts chunked array.
acs['pop1']


Out[35]:
AlleleCountsChunkedArray((1000000, 4), int32, chunks=(65536, 4), nbytes=15.3M, cbytes=2.2M, cratio=6.9, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)
0 1 2 3
0 200 0 0 0
1 199 1 0 0
2 200 0 0 0
3 198 2 0 0
4 196 4 0 0

...


In [36]:
# Same two-subpopulation allele counts, computed from the HDF5 copy and
# stored as 'hdf5mem_zlib1'. This rebinds `acs`, replacing the earlier table.
acs = genotype_hdf5mem_zlib1.count_alleles_subpops(
    subpops={
        'pop1': list(range(0, 100)),
        'pop2': list(range(100, 200)),
    },
    max_allele=3,
    storage='hdf5mem_zlib1',
)
acs


Out[36]:
AlleleCountsChunkedTable(1000000, nbytes=30.5M, cbytes=3.0M, cratio=10.2, data=h5py._hl.files.File)
pop1 pop2
[200 0 0 0] [200 0 0 0]
[199 1 0 0] [200 0 0 0]
[200 0 0 0] [200 0 0 0]
[198 2 0 0] [200 0 0 0]
[196 4 0 0] [200 0 0 0]

...


In [37]:
# Select one population from the current `acs` table; what it refers to
# depends on which cell last assigned `acs`.
acs['pop1']


Out[37]:
AlleleCountsChunkedArray((1000000, 4), int32, chunks=(65536, 4), nbytes=15.3M, cbytes=1.5M, cratio=10.2, compression=gzip, compression_opts=1, data=h5py._hl.dataset.Dataset)
0 1 2 3
0 200 0 0 0
1 199 1 0 0
2 200 0 0 0
3 198 2 0 0
4 196 4 0 0

...


In [38]:
# Time to_haplotypes() on the bcolz copy; the result is 2-D, with the last
# two genotype axes (765 x 2) flattened into 1530 columns (see output).
%time genotype_bcolzmem.to_haplotypes()


CPU times: user 10.3 s, sys: 264 ms, total: 10.5 s
Wall time: 3.43 s
Out[38]:
HaplotypeChunkedArray((1000000, 1530), int8, chunks=(685, 1530), nbytes=1.4G, cbytes=78.5M, cratio=18.6, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)
0 1 2 3 4 ... 1525 1526 1527 1528 1529
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
3 0 0 0 0 0 ... 0 0 0 0 0
4 0 0 0 0 0 ... 0 0 0 0 0

...


In [39]:
# Same conversion with both input and result in 'bcolzmem_zlib1' storage.
%time genotype_bcolzmem_zlib1.to_haplotypes(storage='bcolzmem_zlib1')


CPU times: user 19.9 s, sys: 1.14 s, total: 21 s
Wall time: 4.06 s
Out[39]:
HaplotypeChunkedArray((1000000, 1530), int8, chunks=(1370, 1530), nbytes=1.4G, cbytes=54.7M, cratio=26.7, compression=blosc, compression_opts=cparams(clevel=1, shuffle=1, cname='zlib', quantize=0), data=bcolz.carray_ext.carray)
0 1 2 3 4 ... 1525 1526 1527 1528 1529
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
3 0 0 0 0 0 ... 0 0 0 0 0
4 0 0 0 0 0 ... 0 0 0 0 0

...


In [40]:
# Same conversion with both input and result in 'hdf5mem_zlib1' storage.
%time genotype_hdf5mem_zlib1.to_haplotypes(storage='hdf5mem_zlib1')


CPU times: user 7.9 s, sys: 64 ms, total: 7.96 s
Wall time: 7.96 s
Out[40]:
HaplotypeChunkedArray((1000000, 1530), int8, chunks=(685, 1530), nbytes=1.4G, cbytes=52.4M, cratio=27.8, compression=gzip, compression_opts=1, data=h5py._hl.dataset.Dataset)
0 1 2 3 4 ... 1525 1526 1527 1528 1529
0 0 0 0 0 0 ... 0 0 0 0 0
1 0 0 0 0 0 ... 0 0 0 0 0
2 0 0 0 0 0 ... 0 0 0 0 0
3 0 0 0 0 0 ... 0 0 0 0 0
4 0 0 0 0 0 ... 0 0 0 0 0

...


In [41]:
# Keep the allele counts in `ac` so the following cells can call methods on them.
ac = genotype_bcolzmem.count_alleles()
ac


Out[41]:
AlleleCountsChunkedArray((1000000, 4), int32, chunks=(65536, 4), nbytes=15.3M, cbytes=4.3M, cratio=3.6, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)
0 1 2 3
0 1527 3 0 0
1 1529 1 0 0
2 1528 2 0 0
3 1528 2 0 0
4 1526 4 0 0

...


In [42]:
# Returns a 1-D boolean chunked array with one entry per row of `ac`.
ac.is_segregating()


Out[42]:
ChunkedArray((1000000,), bool, chunks=(1048576,), nbytes=976.6K, cbytes=6.3K, cratio=154.7, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)

In [43]:
# Returns a 1-D int8 chunked array with one entry per row of `ac`.
ac.max_allele()


Out[43]:
ChunkedArray((1000000,), int8, chunks=(1048576,), nbytes=976.6K, cbytes=229.8K, cratio=4.2, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)

Copy into persistent HDF5 file


In [44]:
# Create (or overwrite) test.h5 in the working directory for the persistent copies below.
# NOTE(review): no h5f.close() is visible in this part of the notebook --
# confirm the file is closed once the examples are done.
h5f = h5py.File('test.h5', mode='w')

In [45]:
# Get the '/calldata' group, creating it if it does not exist yet.
h5g = h5f.require_group('calldata')
h5g


Out[45]:
<HDF5 group "/calldata" (0 members)>

In [46]:
# Remove any existing 'genotype' dataset so the copy below can be re-run.
# On a fresh run the group is empty (0 members above) and this is a no-op.
if 'genotype' in h5g:
    del h5g['genotype']

In [47]:
# Copy the first 100,000 rows into the persistent file as dataset 'genotype'
# under `h5g`, gzip-compressed at level 1.
# Note: `gc` here is the genotype copy, not the standard-library gc module.
gc = genotype.copy(
    stop=100000,
    storage='hdf5',
    group=h5g,
    name='genotype',
    compression='gzip',
    compression_opts=1,
)
gc


Out[47]:
GenotypeChunkedArray((100000, 765, 2), int8, chunks=(685, 765, 2), nbytes=145.9M, cbytes=5.0M, cratio=29.0, compression=gzip, compression_opts=1, data=h5py._hl.dataset.Dataset)
0 1 2 3 4 ... 760 761 762 763 764
0 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
1 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
2 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
3 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0
4 0/0 0/0 0/0 0/0 0/0 ... 0/0 0/0 0/0 0/0 0/0

...


In [48]:
# Check that the dataset is now present in the file's group.
h5g['genotype']


Out[48]:
<HDF5 dataset "genotype": shape (100000, 765, 2), type "|i1">

In [49]:
# Compute allele counts from the persistent copy and write the result into
# the same group as dataset 'allele_counts', gzip-compressed at level 1.
# This rebinds `ac`, replacing the counts computed earlier.
ac = gc.count_alleles(
    storage='hdf5',
    group=h5g,
    name='allele_counts',
    compression='gzip',
    compression_opts=1,
)
ac


Out[49]:
AlleleCountsChunkedArray((100000, 4), int32, chunks=(65536, 4), nbytes=1.5M, cbytes=229.0K, cratio=6.8, compression=gzip, compression_opts=1, data=h5py._hl.dataset.Dataset)
0 1 2 3
0 1527 3 0 0
1 1529 1 0 0
2 1528 2 0 0
3 1528 2 0 0
4 1526 4 0 0

...


In [50]:
# Check that the allele counts dataset was written to the file's group.
h5g['allele_counts']


Out[50]:
<HDF5 dataset "allele_counts": shape (100000, 4), type "<i4">

Chunked tables


In [51]:
# Wrap the '3L/variants' HDF5 group in a chunked variant table; the trailing
# expression displays the column names and first rows.
vt = allel.VariantChunkedTable(callset['3L/variants'])
vt


Out[51]:
VariantChunkedTable(9643193, nbytes=3.7G, cbytes=424.9M, cratio=8.8, data=h5py._hl.group.Group)
ABHet ABHom AC AF ALT AN ANN Accessible BaseCounts BaseQRankSum CHROM Coverage CoverageMQ0 DP DS Dels FILTER_FS FILTER_HRun FILTER_HighCoverage FILTER_HighMQ0 FILTER_LowCoverage FILTER_LowMQ FILTER_LowQual FILTER_NoCoverage FILTER_PASS FILTER_QD FILTER_ReadPosRankSum FILTER_RefN FILTER_RepeatDUST FS HRun HW HaplotypeScore HighCoverage HighMQ0 InbreedingCoeff LOF LowCoverage LowMQ LowPairing MLEAC MLEAF MQ MQ0 MQRankSum NDA NMD NoCoverage OND POS QD QUAL REF RPA RU ReadPosRankSum RefMasked RefN RepeatDUST RepeatMasker RepeatTRF STR VariantType is_snp num_alleles svlen
0.50195 0.99805 [3 0 0] [ 0.00196075 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 21 25251 11 77] -1.8486 b'3L' 25889 41 23344 False 0.0 False False False False False False False False True False False False False 0.88184 1 0.0 1.0332 3 0 -0.0025005 b'' 18 1 368 [0 0 0] [ 0. 0. 0.] 54.844 42 0.30811 1 b'' 0 0.002327 9790 14.047 1447.12 b'C' 0 b'' 0.22998 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [1 0 0] [ 0.00065374 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 39 9 25274 18] -3.5195 b'3L' 25916 14 23321 False 0.0 False False False False False False False False True False False False False 1.4609 0 0.0 1.0449 5 0 -0.0012999 b'' 15 1 370 [0 0 0] [ 0. 0. 0.] 54.844 15 3.2109 2 b'' 0 0.0017834 9798 13.781 551.05 b'G' 0 b'' -0.13501 True False False True False False b'SNP' True 2 0
0.52783 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 45 25873 13 16] -3.0566 b'3L' 26223 8 23877 False 0.0 False False False False False False False False True False False False False 1.3818 1 0.0 1.0645 7 0 -0.0017004 b'' 23 0 300 [0 0 0] [ 0. 0. 0.] 55.094 8 2.498 1 b'' 0 0.0018845 9812 13.273 729.69 b'C' 0 b'' -0.077026 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 43 21 9 26227] 0.18298 b'3L' 26516 8 24217 False 0.0 False False False False False False False False True False False False False 0.0 2 0.0 0.94775 7 0 -0.0017004 b'' 19 0 279 [0 0 0] [ 0. 0. 0.] 55.156 8 3.2871 2 b'' 0 0.0017052 9818 14.352 1147.7 b'T' 0 b'' 0.68213 True False False True False False b'SNP' True 2 0
0.48804 0.99805 [4 0 0] [ 0.00261307 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 14 26351 2 129] -3.1367 b'3L' 26461 9 24344 False 0.0 False False False False False False False False True False False False False 1.5303 2 0.0 0.98145 7 0 -0.0040016 b'' 20 0 230 [0 0 0] [ 0. 0. 0.] 55.406 9 2.6504 1 b'' 0 0.0018349 9829 15.25 2851.81 b'C' 0 b'' 2.0234 True False False True False False b'SNP' True 2 0

...


In [52]:
# Indexing with a single column name returns that column as a chunked array.
vt['CHROM']


Out[52]:
ChunkedArray((9643193,), |S12, chunks=(10922,), nbytes=110.4M, cbytes=578.6K, cratio=195.3, compression=gzip, compression_opts=3, data=h5py._hl.dataset.Dataset)

In [53]:
# Indexing with a list of column names returns a table with just those columns.
vt[['CHROM', 'POS']]


Out[53]:
VariantChunkedTable(9643193, nbytes=147.1M, cbytes=13.4M, cratio=11.0, data=builtins.list)
CHROM POS
b'3L' 9790
b'3L' 9798
b'3L' 9812
b'3L' 9818
b'3L' 9829

...


In [54]:
# Boolean expression over the table's AN and MQ columns, reused by the cells below.
query1 = '(AN == 1530) & (MQ > 40)'

In [55]:
# Benchmark evaluating query1 on the HDF5-backed table with the numexpr VM.
%timeit vt.eval(query1, vm='numexpr')


1 loop, best of 3: 471 ms per loop

In [56]:
# Same benchmark with the plain Python VM, for comparison.
%timeit vt.eval(query1, vm='python')


1 loop, best of 3: 506 ms per loop

In [57]:
# Evaluate query1 once and show the result's shape together with the number
# of rows for which it is true.
cond = vt.eval(query1, vm='numexpr')
cond.shape, cond.count_nonzero()


Out[57]:
((9643193,), 9249396)

In [58]:
# can only be evaluated with python vm
# (presumably because of the AC[:, 1] indexing -- TODO confirm)
query2 = '(AN == 1530) & (MQ > 40) & (AC[:, 1] > 5)'

In [59]:
# Time evaluating query2 over the full table with the Python VM.
%time vt.eval(query2, vm='python')


CPU times: user 2.26 s, sys: 92 ms, total: 2.36 s
Wall time: 1.3 s
Out[59]:
ChunkedArray((9643193,), bool, chunks=(1048576,), nbytes=9.2M, cbytes=1.9M, cratio=4.9, compression=blosc, compression_opts={'clevel': 5, 'shuffle': 1, 'cname': 'blosclz'}, data=zarr.core.Array)

In [60]:
# Copy the first 1,000,000 rows of the table into 'bcolzmem' storage and
# benchmark query1 evaluation on that copy.
vt_bcolzmem = vt.copy(storage='bcolzmem', stop=1000000)
%timeit vt_bcolzmem.eval(query1, vm='numexpr')


100 loops, best of 3: 14.7 ms per loop

In [61]:
# Time query(), which returns a new table holding only the rows matching query1.
%time vt_bcolzmem.query(query1)


CPU times: user 2.44 s, sys: 276 ms, total: 2.72 s
Wall time: 881 ms
Out[61]:
VariantChunkedTable(950097, nbytes=368.8M, cbytes=63.7M, cratio=5.8, data=allel.chunked.storage_zarr.ZarrTable)
ABHet ABHom AC AF ALT AN ANN Accessible BaseCounts BaseQRankSum CHROM Coverage CoverageMQ0 DP DS Dels FILTER_FS FILTER_HRun FILTER_HighCoverage FILTER_HighMQ0 FILTER_LowCoverage FILTER_LowMQ FILTER_LowQual FILTER_NoCoverage FILTER_PASS FILTER_QD FILTER_ReadPosRankSum FILTER_RefN FILTER_RepeatDUST FS HRun HW HaplotypeScore HighCoverage HighMQ0 InbreedingCoeff LOF LowCoverage LowMQ LowPairing MLEAC MLEAF MQ MQ0 MQRankSum NDA NMD NoCoverage OND POS QD QUAL REF RPA RU ReadPosRankSum RefMasked RefN RepeatDUST RepeatMasker RepeatTRF STR VariantType is_snp num_alleles svlen
0.50195 0.99805 [3 0 0] [ 0.00196075 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 21 25251 11 77] -1.8486 b'3L' 25889 41 23344 False 0.0 False False False False False False False False True False False False False 0.88184 1 0.0 1.0332 3 0 -0.0025005 b'' 18 1 368 [0 0 0] [ 0. 0. 0.] 54.844 42 0.30811 1 b'' 0 0.002327 9790 14.047 1447.12 b'C' 0 b'' 0.22998 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [1 0 0] [ 0.00065374 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 39 9 25274 18] -3.5195 b'3L' 25916 14 23321 False 0.0 False False False False False False False False True False False False False 1.4609 0 0.0 1.0449 5 0 -0.0012999 b'' 15 1 370 [0 0 0] [ 0. 0. 0.] 54.844 15 3.2109 2 b'' 0 0.0017834 9798 13.781 551.05 b'G' 0 b'' -0.13501 True False False True False False b'SNP' True 2 0
0.52783 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 45 25873 13 16] -3.0566 b'3L' 26223 8 23877 False 0.0 False False False False False False False False True False False False False 1.3818 1 0.0 1.0645 7 0 -0.0017004 b'' 23 0 300 [0 0 0] [ 0. 0. 0.] 55.094 8 2.498 1 b'' 0 0.0018845 9812 13.273 729.69 b'C' 0 b'' -0.077026 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 43 21 9 26227] 0.18298 b'3L' 26516 8 24217 False 0.0 False False False False False False False False True False False False False 0.0 2 0.0 0.94775 7 0 -0.0017004 b'' 19 0 279 [0 0 0] [ 0. 0. 0.] 55.156 8 3.2871 2 b'' 0 0.0017052 9818 14.352 1147.7 b'T' 0 b'' 0.68213 True False False True False False b'SNP' True 2 0
0.48804 0.99805 [4 0 0] [ 0.00261307 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 14 26351 2 129] -3.1367 b'3L' 26461 9 24344 False 0.0 False False False False False False False False True False False False False 1.5303 2 0.0 0.98145 7 0 -0.0040016 b'' 20 0 230 [0 0 0] [ 0. 0. 0.] 55.406 9 2.6504 1 b'' 0 0.0018349 9829 15.25 2851.81 b'C' 0 b'' 2.0234 True False False True False False b'SNP' True 2 0

...


In [62]:
# Copy the first 1,000,000 rows of the table into 'hdf5mem_zlib1' storage and
# benchmark query1 evaluation on that copy.
vt_hdf5mem_zlib1 = vt.copy(storage='hdf5mem_zlib1', stop=1000000)
%timeit vt_hdf5mem_zlib1.eval(query1, vm='numexpr')


100 loops, best of 3: 20.9 ms per loop

In [63]:
# Time query() on the HDF5 copy; no storage is named for the result table.
%time vt_hdf5mem_zlib1.query(query1)


CPU times: user 1.94 s, sys: 148 ms, total: 2.09 s
Wall time: 1.32 s
Out[63]:
VariantChunkedTable(950097, nbytes=368.8M, cbytes=63.7M, cratio=5.8, data=allel.chunked.storage_zarr.ZarrTable)
ABHet ABHom AC AF ALT AN ANN Accessible BaseCounts BaseQRankSum CHROM Coverage CoverageMQ0 DP DS Dels FILTER_FS FILTER_HRun FILTER_HighCoverage FILTER_HighMQ0 FILTER_LowCoverage FILTER_LowMQ FILTER_LowQual FILTER_NoCoverage FILTER_PASS FILTER_QD FILTER_ReadPosRankSum FILTER_RefN FILTER_RepeatDUST FS HRun HW HaplotypeScore HighCoverage HighMQ0 InbreedingCoeff LOF LowCoverage LowMQ LowPairing MLEAC MLEAF MQ MQ0 MQRankSum NDA NMD NoCoverage OND POS QD QUAL REF RPA RU ReadPosRankSum RefMasked RefN RepeatDUST RepeatMasker RepeatTRF STR VariantType is_snp num_alleles svlen
0.50195 0.99805 [3 0 0] [ 0.00196075 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 21 25251 11 77] -1.8486 b'3L' 25889 41 23344 False 0.0 False False False False False False False False True False False False False 0.88184 1 0.0 1.0332 3 0 -0.0025005 b'' 18 1 368 [0 0 0] [ 0. 0. 0.] 54.844 42 0.30811 1 b'' 0 0.002327 9790 14.047 1447.12 b'C' 0 b'' 0.22998 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [1 0 0] [ 0.00065374 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 39 9 25274 18] -3.5195 b'3L' 25916 14 23321 False 0.0 False False False False False False False False True False False False False 1.4609 0 0.0 1.0449 5 0 -0.0012999 b'' 15 1 370 [0 0 0] [ 0. 0. 0.] 54.844 15 3.2109 2 b'' 0 0.0017834 9798 13.781 551.05 b'G' 0 b'' -0.13501 True False False True False False b'SNP' True 2 0
0.52783 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 45 25873 13 16] -3.0566 b'3L' 26223 8 23877 False 0.0 False False False False False False False False True False False False False 1.3818 1 0.0 1.0645 7 0 -0.0017004 b'' 23 0 300 [0 0 0] [ 0. 0. 0.] 55.094 8 2.498 1 b'' 0 0.0018845 9812 13.273 729.69 b'C' 0 b'' -0.077026 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 43 21 9 26227] 0.18298 b'3L' 26516 8 24217 False 0.0 False False False False False False False False True False False False False 0.0 2 0.0 0.94775 7 0 -0.0017004 b'' 19 0 279 [0 0 0] [ 0. 0. 0.] 55.156 8 3.2871 2 b'' 0 0.0017052 9818 14.352 1147.7 b'T' 0 b'' 0.68213 True False False True False False b'SNP' True 2 0
0.48804 0.99805 [4 0 0] [ 0.00261307 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 14 26351 2 129] -3.1367 b'3L' 26461 9 24344 False 0.0 False False False False False False False False True False False False False 1.5303 2 0.0 0.98145 7 0 -0.0040016 b'' 20 0 230 [0 0 0] [ 0. 0. 0.] 55.406 9 2.6504 1 b'' 0 0.0018349 9829 15.25 2851.81 b'C' 0 b'' 2.0234 True False False True False False b'SNP' True 2 0

...


In [64]:
# Copy the first 1,000,000 rows of the table into 'zarrmem' storage and
# benchmark query1 evaluation on that copy.
vt_zarrmem = vt.copy(storage='zarrmem', stop=1000000)
%timeit vt_zarrmem.eval(query1, vm='numexpr')


100 loops, best of 3: 6.45 ms per loop

In [65]:
# Display the zarr-backed table copy (summary line, column names, first rows).
vt_zarrmem


Out[65]:
VariantChunkedTable(1000000, nbytes=388.1M, cbytes=67.1M, cratio=5.8, data=allel.chunked.storage_zarr.ZarrTable)
ABHet ABHom AC AF ALT AN ANN Accessible BaseCounts BaseQRankSum CHROM Coverage CoverageMQ0 DP DS Dels FILTER_FS FILTER_HRun FILTER_HighCoverage FILTER_HighMQ0 FILTER_LowCoverage FILTER_LowMQ FILTER_LowQual FILTER_NoCoverage FILTER_PASS FILTER_QD FILTER_ReadPosRankSum FILTER_RefN FILTER_RepeatDUST FS HRun HW HaplotypeScore HighCoverage HighMQ0 InbreedingCoeff LOF LowCoverage LowMQ LowPairing MLEAC MLEAF MQ MQ0 MQRankSum NDA NMD NoCoverage OND POS QD QUAL REF RPA RU ReadPosRankSum RefMasked RefN RepeatDUST RepeatMasker RepeatTRF STR VariantType is_snp num_alleles svlen
0.50195 0.99805 [3 0 0] [ 0.00196075 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 21 25251 11 77] -1.8486 b'3L' 25889 41 23344 False 0.0 False False False False False False False False True False False False False 0.88184 1 0.0 1.0332 3 0 -0.0025005 b'' 18 1 368 [0 0 0] [ 0. 0. 0.] 54.844 42 0.30811 1 b'' 0 0.002327 9790 14.047 1447.12 b'C' 0 b'' 0.22998 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [1 0 0] [ 0.00065374 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 39 9 25274 18] -3.5195 b'3L' 25916 14 23321 False 0.0 False False False False False False False False True False False False False 1.4609 0 0.0 1.0449 5 0 -0.0012999 b'' 15 1 370 [0 0 0] [ 0. 0. 0.] 54.844 15 3.2109 2 b'' 0 0.0017834 9798 13.781 551.05 b'G' 0 b'' -0.13501 True False False True False False b'SNP' True 2 0
0.52783 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 45 25873 13 16] -3.0566 b'3L' 26223 8 23877 False 0.0 False False False False False False False False True False False False False 1.3818 1 0.0 1.0645 7 0 -0.0017004 b'' 23 0 300 [0 0 0] [ 0. 0. 0.] 55.094 8 2.498 1 b'' 0 0.0018845 9812 13.273 729.69 b'C' 0 b'' -0.077026 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 43 21 9 26227] 0.18298 b'3L' 26516 8 24217 False 0.0 False False False False False False False False True False False False False 0.0 2 0.0 0.94775 7 0 -0.0017004 b'' 19 0 279 [0 0 0] [ 0. 0. 0.] 55.156 8 3.2871 2 b'' 0 0.0017052 9818 14.352 1147.7 b'T' 0 b'' 0.68213 True False False True False False b'SNP' True 2 0
0.48804 0.99805 [4 0 0] [ 0.00261307 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 14 26351 2 129] -3.1367 b'3L' 26461 9 24344 False 0.0 False False False False False False False False True False False False False 1.5303 2 0.0 0.98145 7 0 -0.0040016 b'' 20 0 230 [0 0 0] [ 0. 0. 0.] 55.406 9 2.6504 1 b'' 0 0.0018349 9829 15.25 2851.81 b'C' 0 b'' 2.0234 True False False True False False b'SNP' True 2 0

...


In [68]:
# Slice out the first 10 rows of vt_zarrmem (created in an earlier cell) and
# let the notebook render the result as this cell's output.
vt_zarrmem[:10]


Out[68]:
VariantTable((10,), dtype=[('ABHet', '<f2'), ('ABHom', '<f2'), ('AC', '<u2', (3,)), ('AF', '<f2', (3,)), ('ALT', 'S1', (3,)), ('AN', '<u2'), ('ANN', [('Allele', 'S12'), ('Annotation', 'S34'), ('Annotation_Impact', 'S8'), ('Gene_Name', 'S14'), ('Gene_ID', 'S14'), ('Feature_Type', 'S20'), ('Feature_ID', 'S14'), ('Transcript_BioType', 'S20'), ('Rank', 'i1'), ('HGVS_c', 'S12'), ('HGVS_p', 'S14'), ('cDNA_pos', '<i4'), ('cDNA_length', '<i4'), ('CDS_pos', '<i4'), ('CDS_length', '<i4'), ('AA_pos', '<i4'), ('AA_length', '<i4'), ('Distance', '<i4')]), ('Accessible', '?'), ('BaseCounts', '<i4', (4,)), ('BaseQRankSum', '<f2'), ('CHROM', 'S12'), ('Coverage', '<i4'), ('CoverageMQ0', '<i4'), ('DP', '<i4'), ('DS', '?'), ('Dels', '<f2'), ('FILTER_FS', '?'), ('FILTER_HRun', '?'), ('FILTER_HighCoverage', '?'), ('FILTER_HighMQ0', '?'), ('FILTER_LowCoverage', '?'), ('FILTER_LowMQ', '?'), ('FILTER_LowQual', '?'), ('FILTER_NoCoverage', '?'), ('FILTER_PASS', '?'), ('FILTER_QD', '?'), ('FILTER_ReadPosRankSum', '?'), ('FILTER_RefN', '?'), ('FILTER_RepeatDUST', '?'), ('FS', '<f2'), ('HRun', 'u1'), ('HW', '<f4'), ('HaplotypeScore', '<f2'), ('HighCoverage', '<i4'), ('HighMQ0', '<i4'), ('InbreedingCoeff', '<f2'), ('LOF', 'S12'), ('LowCoverage', '<i4'), ('LowMQ', '<i4'), ('LowPairing', '<i4'), ('MLEAC', '<u2', (3,)), ('MLEAF', '<f2', (3,)), ('MQ', '<f2'), ('MQ0', '<i4'), ('MQRankSum', '<f2'), ('NDA', '<i4'), ('NMD', 'S12'), ('NoCoverage', '<i4'), ('OND', '<f2'), ('POS', '<i4'), ('QD', '<f2'), ('QUAL', '<f4'), ('REF', 'S1'), ('RPA', '<u2'), ('RU', 'S12'), ('ReadPosRankSum', '<f2'), ('RefMasked', '?'), ('RefN', '?'), ('RepeatDUST', '?'), ('RepeatMasker', '?'), ('RepeatTRF', '?'), ('STR', '?'), ('VariantType', 'S12'), ('is_snp', '?'), ('num_alleles', 'u1'), ('svlen', '<i4')])
ABHet ABHom AC AF ALT AN ANN Accessible BaseCounts BaseQRankSum CHROM Coverage CoverageMQ0 DP DS Dels FILTER_FS FILTER_HRun FILTER_HighCoverage FILTER_HighMQ0 FILTER_LowCoverage FILTER_LowMQ FILTER_LowQual FILTER_NoCoverage FILTER_PASS FILTER_QD FILTER_ReadPosRankSum FILTER_RefN FILTER_RepeatDUST FS HRun HW HaplotypeScore HighCoverage HighMQ0 InbreedingCoeff LOF LowCoverage LowMQ LowPairing MLEAC MLEAF MQ MQ0 MQRankSum NDA NMD NoCoverage OND POS QD QUAL REF RPA RU ReadPosRankSum RefMasked RefN RepeatDUST RepeatMasker RepeatTRF STR VariantType is_snp num_alleles svlen
0.50195 0.99805 [3 0 0] [ 0.00196075 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 21 25251 11 77] -1.8486 b'3L' 25889 41 23344 False 0.0 False False False False False False False False True False False False False 0.88184 1 0.0 1.0332 3 0 -0.0025005 b'' 18 1 368 [0 0 0] [ 0. 0. 0.] 54.844 42 0.30811 1 b'' 0 0.002327 9790 14.047 1447.12 b'C' 0 b'' 0.22998 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [1 0 0] [ 0.00065374 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 39 9 25274 18] -3.5195 b'3L' 25916 14 23321 False 0.0 False False False False False False False False True False False False False 1.4609 0 0.0 1.0449 5 0 -0.0012999 b'' 15 1 370 [0 0 0] [ 0. 0. 0.] 54.844 15 3.2109 2 b'' 0 0.0017834 9798 13.781 551.05 b'G' 0 b'' -0.13501 True False False True False False b'SNP' True 2 0
0.52783 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 45 25873 13 16] -3.0566 b'3L' 26223 8 23877 False 0.0 False False False False False False False False True False False False False 1.3818 1 0.0 1.0645 7 0 -0.0017004 b'' 23 0 300 [0 0 0] [ 0. 0. 0.] 55.094 8 2.498 1 b'' 0 0.0018845 9812 13.273 729.69 b'C' 0 b'' -0.077026 True False False True False False b'SNP' True 2 0
0.5249 0.99805 [2 0 0] [ 0.00130653 0. 0. ] [b'A' b'' b''] 1530 (b'A', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 43 21 9 26227] 0.18298 b'3L' 26516 8 24217 False 0.0 False False False False False False False False True False False False False 0.0 2 0.0 0.94775 7 0 -0.0017004 b'' 19 0 279 [0 0 0] [ 0. 0. 0.] 55.156 8 3.2871 2 b'' 0 0.0017052 9818 14.352 1147.7 b'T' 0 b'' 0.68213 True False False True False False b'SNP' True 2 0
0.48804 0.99805 [4 0 0] [ 0.00261307 0. 0. ] [b'T' b'' b''] 1530 (b'T', b'intergenic_region', b'MODIFIER', b'AGAP010310', b'AGAP010310', b'intergenic_region', b'AGAP010310', b'.', -1, b'.', b'.', -1, -1, -1, -1, -1, -1, -1) True [ 14 26351 2 129] -3.1367 b'3L' 26461 9 24344 False 0.0 False False False False False False False False True False False False False 1.5303 2 0.0 0.98145 7 0 -0.0040016 b'' 20 0 230 [0 0 0] [ 0. 0. 0.] 55.406 9 2.6504 1 b'' 0 0.0018349 9829 15.25 2851.81 b'C' 0 b'' 2.0234 True False False True False False b'SNP' True 2 0

...

Copy into existing HDF5 file


In [69]:
# Obtain the 'variants' group on h5f (opened in an earlier cell).
# require_group returns the existing group or creates it when absent, so this
# cell can be re-run without failing on an already-present group.
h5g = h5f.require_group('variants')
# Display the group handle so its current member count is visible.
h5g


Out[69]:
<HDF5 group "/variants" (0 members)>

In [70]:
# Remove every existing member of the group before copying data into it.
# The member names are snapshotted with list() first: deleting entries while
# iterating the group itself mutates the collection being traversed, which
# can skip members or fail part-way through on a non-empty group.
for k in list(h5g):
    del h5g[k]

In [71]:
# Columns to carry over into the HDF5 group.
copy_columns = ['CHROM', 'POS', 'AN', 'AC']
# Copy the first 100000 rows of those columns into h5g using the
# 'hdf5_zlib1' storage option.
vtc = vt[copy_columns].copy(
    stop=100000,
    storage='hdf5_zlib1',
    group=h5g,
)
# Show the resulting table as the cell output.
vtc


Out[71]:
VariantChunkedTable(100000, nbytes=2.3M, cbytes=300.8K, cratio=7.8, data=h5py._hl.group.Group)
CHROM POS AN AC
b'3L' 9790 1530 [3 0 0]
b'3L' 9798 1530 [1 0 0]
b'3L' 9812 1530 [2 0 0]
b'3L' 9818 1530 [2 0 0]
b'3L' 9829 1530 [4 0 0]

...


In [72]:
# Display the group again to check its member count after the copy above.
h5g


Out[72]:
<HDF5 group "/variants" (4 members)>

In [ ]: