In [1]:
import sys
sys.path.insert(0, '..')
import functools
import timeit
import zarr
print('zarr', zarr.__version__)
from zarr import blosc
print('blosc', blosc.version())
import numpy as np
import h5py
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.h5',
mode='r')
genotype = callset['3R/calldata/genotype']
genotype
Out[2]:
In [3]:
n_variants = 500000
In [4]:
genotype_sample = genotype[1000000:1000000+n_variants, ...]
genotype_sample.shape
Out[4]:
In [5]:
nbytes = genotype_sample.nbytes
nbytes
Out[5]:
In [6]:
# 1M chunks of first dimension
chunks = (int(2**20 / (genotype_sample.shape[1] * genotype_sample.shape[2])),
genotype_sample.shape[1],
genotype_sample.shape[2])
chunks
Out[6]:
In [7]:
blosc.get_nthreads()
Out[7]:
In [8]:
zarr.array(genotype_sample, chunks=chunks, compression_opts=dict(cname='lz4', clevel=1, shuffle=2))
Out[8]:
In [9]:
zarr.array(genotype_sample, chunks=chunks, compression_opts=dict(cname='zstd', clevel=1, shuffle=2))
Out[9]:
In [38]:
compression_configs = (
(None, None),
('zlib', 1),
('bz2', 1),
('lzma', dict(preset=1)),
('blosc', dict(cname='snappy', clevel=0, shuffle=0)),
('blosc', dict(cname='snappy', clevel=0, shuffle=2)),
('blosc', dict(cname='snappy', clevel=9, shuffle=0)),
('blosc', dict(cname='snappy', clevel=9, shuffle=2)),
('blosc', dict(cname='blosclz', clevel=1, shuffle=0)),
('blosc', dict(cname='blosclz', clevel=1, shuffle=2)),
('blosc', dict(cname='blosclz', clevel=5, shuffle=0)),
('blosc', dict(cname='blosclz', clevel=5, shuffle=2)),
('blosc', dict(cname='blosclz', clevel=9, shuffle=0)),
('blosc', dict(cname='blosclz', clevel=9, shuffle=2)),
('blosc', dict(cname='lz4', clevel=1, shuffle=0)),
('blosc', dict(cname='lz4', clevel=1, shuffle=2)),
('blosc', dict(cname='lz4', clevel=5, shuffle=0)),
('blosc', dict(cname='lz4', clevel=5, shuffle=2)),
('blosc', dict(cname='lz4', clevel=9, shuffle=0)),
('blosc', dict(cname='lz4', clevel=9, shuffle=2)),
('blosc', dict(cname='lz4hc', clevel=1, shuffle=0)),
('blosc', dict(cname='lz4hc', clevel=1, shuffle=2)),
('blosc', dict(cname='lz4hc', clevel=3, shuffle=0)),
('blosc', dict(cname='lz4hc', clevel=3, shuffle=2)),
('blosc', dict(cname='zstd', clevel=1, shuffle=0)),
('blosc', dict(cname='zstd', clevel=1, shuffle=2)),
('blosc', dict(cname='zstd', clevel=3, shuffle=0)),
('blosc', dict(cname='zstd', clevel=3, shuffle=2)),
('blosc', dict(cname='zstd', clevel=5, shuffle=0)),
('blosc', dict(cname='zstd', clevel=5, shuffle=2)),
('blosc', dict(cname='zlib', clevel=1, shuffle=0)),
('blosc', dict(cname='zlib', clevel=1, shuffle=2)),
('blosc', dict(cname='zlib', clevel=3, shuffle=0)),
('blosc', dict(cname='zlib', clevel=3, shuffle=2)),
('blosc', dict(cname='zlib', clevel=5, shuffle=0)),
('blosc', dict(cname='zlib', clevel=5, shuffle=2)),
)
In [39]:
def log(*msg):
print(*msg, file=sys.stdout)
sys.stdout.flush()
In [40]:
@functools.lru_cache(maxsize=None)
def compression_ratios():
x = list()
for compression, compression_opts in compression_configs:
z = zarr.array(genotype_sample, chunks=chunks, compression=compression,
compression_opts=compression_opts)
ratio = z.nbytes / z.nbytes_stored
x.append(ratio)
log(compression, compression_opts, ratio)
return x
In [43]:
ratios = compression_ratios()
labels = ['%s - %s' % (c, o)
for c, o in compression_configs]
fig = plt.figure(figsize=(12, len(compression_configs)*.3))
fig.suptitle('Compression ratio', fontsize=14, y=1.01)
ax = fig.add_subplot(1, 1, 1)
y = [i for i, (c, o) in enumerate(compression_configs) if c == 'blosc' and o['shuffle'] == 2]
x = [ratios[i] for i in y]
ax.barh(bottom=np.array(y)+.2, width=np.array(x), height=.6, label='bit shuffle', color='b')
y = [i for i, (c, o) in enumerate(compression_configs) if c != 'blosc' or o['shuffle'] == 0]
x = [ratios[i] for i in y]
ax.barh(bottom=np.array(y)+.2, width=np.array(x), height=.6, label='no shuffle', color='g')
ax.set_yticks(np.arange(len(labels))+.5)
ax.set_yticklabels(labels, rotation=0)
ax.set_xlim(0, max(ratios)+3)
ax.set_ylim(0, len(ratios))
ax.set_xlabel('compression ratio')
ax.grid(axis='x')
ax.legend(loc='upper right')
fig.tight_layout();
In [56]:
@functools.lru_cache(maxsize=None)
def compression_decompression_times(repeat=3, number=1):
c = list()
d = list()
for compression, compression_opts in compression_configs:
def compress():
zarr.array(genotype_sample, chunks=chunks, compression=compression,
compression_opts=compression_opts)
t = timeit.Timer(stmt=compress, globals=locals())
compress_times = t.repeat(repeat=repeat, number=number)
c.append(compress_times)
z = zarr.array(genotype_sample, chunks=chunks, compression=compression,
compression_opts=compression_opts)
def decompress():
z[:]
t = timeit.Timer(stmt=decompress, globals=locals())
decompress_times = t.repeat(repeat=repeat, number=number)
d.append(decompress_times)
log(compression, compression_opts, compress_times, decompress_times)
return c, d
In [59]:
co_t, de_t = compression_decompression_times()
fig = plt.figure(figsize=(12, len(compression_configs)*.3))
fig.suptitle('Compression speed', fontsize=14, y=1.01)
ax = fig.add_subplot(1, 1, 1)
y = [i for i, (c, o) in enumerate(compression_configs) if c == 'blosc' and o['shuffle'] == 2]
x = (nbytes / 1000000) / np.array([co_t[i] for i in y])
ax.barh(bottom=np.array(y)+.2, width=x.max(axis=1), height=.6, label='bit shuffle', color='b')
y = [i for i, (c, o) in enumerate(compression_configs) if c != 'blosc' or o['shuffle'] == 0]
x = (nbytes / 1000000) / np.array([co_t[i] for i in y])
ax.barh(bottom=np.array(y)+.2, width=x.max(axis=1), height=.6, label='no shuffle', color='g')
ax.set_yticks(np.arange(len(labels))+.5)
ax.set_yticklabels(labels, rotation=0)
xlim = (0, np.max((nbytes / 1000000) / np.array(co_t)) + 100)
ax.set_xlim(*xlim)
ax.set_ylim(0, len(co_t))
ax.set_xlabel('speed (Mb/s)')
ax.grid(axis='x')
ax.legend(loc='upper right')
fig.tight_layout();
In [60]:
co_t, de_t = compression_decompression_times()
fig = plt.figure(figsize=(12, len(compression_configs)*.3))
fig.suptitle('Decompression speed', fontsize=14, y=1.01)
ax = fig.add_subplot(1, 1, 1)
y = [i for i, (c, o) in enumerate(compression_configs) if c == 'blosc' and o['shuffle'] == 2]
x = (nbytes / 1000000) / np.array([de_t[i] for i in y])
ax.barh(bottom=np.array(y)+.2, width=x.max(axis=1), height=.6, label='bit shuffle', color='b')
y = [i for i, (c, o) in enumerate(compression_configs) if c != 'blosc' or o['shuffle'] == 0]
x = (nbytes / 1000000) / np.array([de_t[i] for i in y])
ax.barh(bottom=np.array(y)+.2, width=x.max(axis=1), height=.6, label='no shuffle', color='g')
ax.set_yticks(np.arange(len(labels))+.5)
ax.set_yticklabels(labels, rotation=0)
xlim = (0, np.max((nbytes / 1000000) / np.array(de_t)) + 100)
ax.set_xlim(*xlim)
ax.set_ylim(0, len(de_t))
ax.set_xlabel('speed (Mb/s)')
ax.grid(axis='x')
ax.legend(loc='upper right')
fig.tight_layout();
In [61]:
import cpuinfo
In [63]:
cpuinfo.main()