In [1]:
# Put the parent directory at the front of the module search path before
# importing zarr -- presumably so a local development checkout is picked up
# instead of an installed release (TODO confirm).
import sys
sys.path.insert(0, '..')
import zarr
# Display the version string of the zarr package that was imported.
zarr.__version__


Out[1]:
'2.0.2.dev0+dirty'

In [2]:
# Open the zip archive in read mode as a zarr store.
store = zarr.ZipStore(
    '/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes.zip',
    mode='r',
)
# Wrap the store in a group and look up the genotype array by its path.
grp = zarr.Group(store)
z = grp['3L/calldata/genotype']
# Bare name as the last expression so the notebook displays the array summary.
z


Out[2]:
Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)
  nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380
  compressor: Blosc(cname='zstd', clevel=1, shuffle=2)
  store: ZipStore

In [5]:
# Profile reading the first 10 entries along the first axis of the zip-backed
# array, with the report sorted by cumulative time.
import cProfile
cProfile.run('z[:10]', sort='cumtime')


         1832 function calls in 0.024 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.024    0.024 {built-in method builtins.exec}
        1    0.000    0.000    0.024    0.024 <string>:1(<module>)
        1    0.000    0.000    0.024    0.024 core.py:292(__getitem__)
       20    0.000    0.000    0.023    0.001 core.py:539(_chunk_getitem)
       20    0.000    0.000    0.020    0.001 core.py:679(_decode_chunk)
       20    0.000    0.000    0.020    0.001 codecs.py:355(decode)
       20    0.020    0.001    0.020    0.001 {zarr.blosc.decompress}
       20    0.000    0.000    0.002    0.000 storage.py:766(__getitem__)
       20    0.000    0.000    0.001    0.000 zipfile.py:1235(open)
       20    0.000    0.000    0.001    0.000 zipfile.py:821(read)
       20    0.000    0.000    0.001    0.000 zipfile.py:901(_read1)
       80    0.000    0.000    0.001    0.000 zipfile.py:660(read)
       20    0.000    0.000    0.000    0.000 zipfile.py:854(_update_crc)
       40    0.000    0.000    0.000    0.000 {built-in method zlib.crc32}
       80    0.000    0.000    0.000    0.000 {method 'read' of '_io.BufferedReader' objects}
       20    0.000    0.000    0.000    0.000 zipfile.py:937(_read2)
       80    0.000    0.000    0.000    0.000 core.py:390(<genexpr>)
       20    0.000    0.000    0.000    0.000 zipfile.py:953(close)
       20    0.000    0.000    0.000    0.000 {method 'reshape' of 'numpy.ndarray' objects}
       20    0.000    0.000    0.000    0.000 util.py:106(is_total_slice)
       20    0.000    0.000    0.000    0.000 zipfile.py:708(__init__)
       20    0.000    0.000    0.000    0.000 {method 'decode' of 'bytes' objects}
       20    0.000    0.000    0.000    0.000 core.py:676(_chunk_key)
       80    0.000    0.000    0.000    0.000 {method 'seek' of '_io.BufferedReader' objects}
       20    0.000    0.000    0.000    0.000 {built-in method numpy.core.multiarray.frombuffer}
       80    0.000    0.000    0.000    0.000 core.py:398(<genexpr>)
       20    0.000    0.000    0.000    0.000 {method 'join' of 'str' objects}
       20    0.000    0.000    0.000    0.000 core.py:386(<listcomp>)
       20    0.000    0.000    0.000    0.000 {built-in method builtins.all}
       40    0.000    0.000    0.000    0.000 util.py:121(<genexpr>)
      231    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
       20    0.000    0.000    0.000    0.000 cp437.py:14(decode)
       80    0.000    0.000    0.000    0.000 {method 'tell' of '_io.BufferedReader' objects}
       20    0.000    0.000    0.000    0.000 zipfile.py:667(close)
       20    0.000    0.000    0.000    0.000 {built-in method _struct.unpack}
      140    0.000    0.000    0.000    0.000 {built-in method builtins.max}
       20    0.000    0.000    0.000    0.000 {function ZipExtFile.close at 0x7f8cd5ca2048}
       20    0.000    0.000    0.000    0.000 zipfile.py:1194(getinfo)
      140    0.000    0.000    0.000    0.000 {built-in method builtins.min}
       20    0.000    0.000    0.000    0.000 threading.py:1224(current_thread)
       20    0.000    0.000    0.000    0.000 zipfile.py:654(__init__)
        1    0.000    0.000    0.000    0.000 util.py:195(get_chunk_range)
       20    0.000    0.000    0.000    0.000 {built-in method _codecs.charmap_decode}
        1    0.000    0.000    0.000    0.000 util.py:166(normalize_array_selection)
        1    0.000    0.000    0.000    0.000 util.py:198(<listcomp>)
       20    0.000    0.000    0.000    0.000 zipfile.py:1715(_fpclose)
       20    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
       63    0.000    0.000    0.000    0.000 {built-in method builtins.len}
        1    0.000    0.000    0.000    0.000 {built-in method numpy.core.multiarray.empty}
        2    0.000    0.000    0.000    0.000 util.py:182(<genexpr>)
       20    0.000    0.000    0.000    0.000 {built-in method builtins.hasattr}
       20    0.000    0.000    0.000    0.000 {built-in method _thread.get_ident}
        1    0.000    0.000    0.000    0.000 util.py:130(normalize_axis_selection)
       20    0.000    0.000    0.000    0.000 zipfile.py:636(_get_decompressor)
       20    0.000    0.000    0.000    0.000 threading.py:1298(main_thread)
        4    0.000    0.000    0.000    0.000 core.py:373(<genexpr>)
        3    0.000    0.000    0.000    0.000 util.py:187(<genexpr>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}



In [6]:
import dask
import dask.array as da
# Display the version string of the dask package that was imported.
dask.__version__


Out[6]:
'0.11.0'

In [7]:
# Wrap the zip-backed zarr array in a dask array, reusing the zarr array's own
# chunk shape for the dask chunks.
d = da.from_array(z, chunks=z.chunks)
d


Out[7]:
dask.array<array-f..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>

In [8]:
# Time a sum over axis 1 of the dask array built on the zip-backed zarr array.
%time d.sum(axis=1).compute()


CPU times: user 3min 35s, sys: 4.36 s, total: 3min 40s
Wall time: 29.5 s
Out[8]:
array([[3, 0],
       [1, 0],
       [2, 0],
       ..., 
       [2, 8],
       [8, 8],
       [0, 1]])

In [9]:
# Comparison case: open the same data through a directory store (the path is
# the zip archive's path without the ".zip" suffix).
store_dir = zarr.DirectoryStore(
    '/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes'
)
# Same group / array lookup as for the zip-backed store.
grp_dir = zarr.Group(store_dir)
z_dir = grp_dir['3L/calldata/genotype']
# Bare name as the last expression so the notebook displays the array summary.
z_dir


Out[9]:
Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)
  nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380
  compressor: Blosc(cname='zstd', clevel=1, shuffle=2)
  store: DirectoryStore

In [10]:
# Wrap the directory-backed zarr array in a dask array, reusing the zarr
# array's own chunk shape for the dask chunks.
d_dir = da.from_array(z_dir, chunks=z_dir.chunks)
d_dir


Out[10]:
dask.array<array-7..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>

In [11]:
# Time the same axis-1 sum on the dask array built on the directory-backed
# zarr array, for comparison with the zip-backed timing.
%time d_dir.sum(axis=1).compute()


CPU times: user 3min 39s, sys: 4.91 s, total: 3min 44s
Wall time: 31.1 s
Out[11]:
array([[3, 0],
       [1, 0],
       [2, 0],
       ..., 
       [2, 8],
       [8, 8],
       [0, 1]])

In [ ]: