In [1]:
import numpy as np
import bcolz
import h5py
import os
# Work from the directory two levels up (the scikit-allel checkout in the
# recorded run). The target is resolved to an absolute path once and remembered,
# so re-running this cell does not climb a further two directories each time.
if 'REPO_ROOT' not in globals():
    REPO_ROOT = os.path.abspath('../..')
os.chdir(REPO_ROOT)
# Imported only after the chdir above -- presumably so that the in-tree
# development copy of allel is picked up rather than an installed release
# (TODO confirm). The version string is displayed as the cell output.
import allel
allel.__version__


Out[1]:
'1.0.0.dev0'

In [2]:
# Twenty million consecutive integers: the input array reused by every
# storage example in the cells that follow.
n_items = 20 * 1000 * 1000
a = np.arange(n_items)
a


Out[2]:
array([       0,        1,        2, ..., 19999997, 19999998, 19999999])

In [3]:
# Store the array through the bcolz temporary-storage backend, compressed with
# zlib at level 1 and shuffle enabled.
zlib_cparams = bcolz.cparams(cname='zlib', clevel=1, shuffle=True)
c = allel.chunked.bcolztmp_storage.array(a, cparams=zlib_cparams)
# Flush before inspecting the array and measuring its directory with du.
c.flush()
c


Out[3]:
carray((20000000,), int64)
  nbytes := 152.59 MB; cbytes := 2.41 MB; ratio: 63.28
  cparams := cparams(clevel=1, shuffle=True, cname='zlib', quantize=0)
  chunklen := 131072; chunksize: 1048576; blocksize: 131072
  rootdir := '/tmp/scikit_allel_5y9lqsv4.bcolz'
  mode    := 'w'
[       0        1        2 ..., 19999997 19999998 19999999]

In [4]:
# Disk usage of the carray's root directory, to compare against the cbytes
# figure reported in the carray repr.
!du -hs {c.rootdir}


1.9M	/tmp/scikit_allel_5y9lqsv4.bcolz

In [5]:
# Wrap the bcolz carray and display the wrapper's summary repr
# (shape, dtype, chunks, sizes, compression).
allel.chunked.ChunkedArrayWrapper(c)


Out[5]:
<ChunkedArrayWrapper shape=(20000000,) dtype=int64 chunks=(131072,)
   nbytes=152.6M cbytes=2.4M cratio=63.3
   compression=blosc compression_opts=cparams(clevel=1, shuffle=True, cname='zlib', quantize=0)
   values=bcolz.carray_ext.carray>

In [6]:
# Same data through the HDF5 temporary-storage backend: gzip level 1 with
# shuffle, using the bcolz carray's chunk length as the HDF5 chunk shape.
h = allel.chunked.hdf5tmp_storage.array(
    a,
    compression='gzip',
    compression_opts=1,
    shuffle=True,
    chunks=(c.chunklen,),
)
# Flush the owning file before inspecting the dataset and measuring it with du.
h.file.flush()
h


Out[6]:
<HDF5 dataset "data": shape (20000000,), type "<i8">

In [7]:
# Disk usage of the HDF5 file that backs the dataset.
!du -h {h.file.filename}


932K	/tmp/scikit_allel_mo8d55nc.h5

In [8]:
# Wrap the HDF5 dataset and display the wrapper's summary repr, for comparison
# with the bcolz-backed array.
allel.chunked.ChunkedArrayWrapper(h)


Out[8]:
<ChunkedArrayWrapper shape=(20000000,) dtype=int64 chunks=(131072,)
   nbytes=152.6M cbytes=920.4K cratio=169.8
   compression=gzip compression_opts=1
   values=h5py._hl.dataset.Dataset>

In [9]:
# Copy the array into the storage registered under the name 'zarrmem'
# (the recorded output shows a zarr Array backed by a DictStore).
z = allel.chunked.copy(a, storage='zarrmem')
z


Out[9]:
Array((20000000,), int64, chunks=(39063,), order=C)
  nbytes: 152.6M; nbytes_stored: 2.3M; ratio: 66.1; initialized: 512/512
  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
  store: DictStore

In [10]:
# Copy the array into the storage registered under the name 'zarrtmp'
# (the recorded output shows a zarr Array backed by a TempStore).
# Note that this rebinds z, replacing the array from the previous cell.
z = allel.chunked.copy(a, storage='zarrtmp')
z


Out[10]:
Array((20000000,), int64, chunks=(39063,), order=C)
  nbytes: 152.6M; nbytes_stored: 2.3M; ratio: 66.1; initialized: 512/512
  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
  store: TempStore

In [11]:
# Filesystem path of the store behind the current zarr array.
z.store.path


Out[11]:
'/tmp/scikit_allel_y0qqf9ak.zarr'

In [12]:
# Copy the array into zarr storage at an explicit store location, given as a
# relative path; overwrite=True is passed so the cell can be re-run.
z = allel.chunked.copy(
    a,
    storage='zarr',
    store='array.zarr',
    overwrite=True,
)
z


Out[12]:
Array((20000000,), int64, chunks=(39063,), order=C)
  nbytes: 152.6M; nbytes_stored: 2.3M; ratio: 66.1; initialized: 512/512
  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
  store: DirectoryStore

In [13]:
# Path of the store behind the current zarr array; in the recorded run the
# relative 'array.zarr' resolved to an absolute path under the working directory.
z.store.path


Out[13]:
'/home/aliman/src/github/cggh/scikit-allel/array.zarr'

In [14]:
# Build a two-column table through the HDF5 temporary-storage backend, using
# the same array for both columns, with gzip level 1 and shuffle.
g = allel.chunked.hdf5tmp_storage.table(
    [a, a],
    compression='gzip',
    compression_opts=1,
    shuffle=True,
)
g


Out[14]:
<HDF5 file "scikit_allel_7rxbjikn.h5" (mode r+)>

In [15]:
# Wrap the HDF5-backed table and display the wrapper's summary and data preview.
allel.chunked.ChunkedTableWrapper(g)


Out[15]:
<ChunkedTableWrapper shape=(20000000,) dtype=[('f0', '<i8'), ('f1', '<i8')] nbytes=305.2M cbytes=1.8M cratio=169.8 values=h5py._hl.files.File>
f0f1
000
111
222
......
199999971999999719999997
199999981999999819999998
199999991999999919999999

In [16]:
# Copy the HDF5-backed table into bcolz storage; the result is only displayed,
# not bound to a name.
allel.chunked.copy_table(g, storage='bcolz')


Out[16]:
ctable((20000000,), [('f0', '<i8'), ('f1', '<i8')])
  nbytes: 305.18 MB; cbytes: 7.19 MB; ratio: 42.44
  cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
[(0, 0) (1, 1) (2, 2) ..., (19999997, 19999997) (19999998, 19999998)
 (19999999, 19999999)]

In [21]:
# Copy the HDF5-backed table into zarr storage with no explicit store given.
# z is rebound here from a zarr array to a table object.
z = allel.chunked.copy_table(g, storage='zarr')
z


Out[21]:
<allel.chunked.storage_zarr.ZarrTable at 0x7fac3c36ac18>

In [22]:
# Show the zarr group held by the table (its member arrays and store type).
z.grp


Out[22]:
Group(/, 2)
  arrays: 2; f0, f1
  store: DictStore

In [23]:
# Copy the table into zarr storage at an explicit store location, given as a
# relative path; overwrite=True is passed so the cell can be re-run.
z = allel.chunked.copy_table(
    g,
    storage='zarr',
    store='test.zarr',
    overwrite=True,
)
z


Out[23]:
<allel.chunked.storage_zarr.ZarrTable at 0x7fac3c379be0>

In [24]:
# Show the zarr group held by the table written to 'test.zarr'.
z.grp


Out[24]:
Group(/, 2)
  arrays: 2; f0, f1
  store: DirectoryStore

In [25]:
# List the top level of the store directory written by the previous copy_table call.
!ls -lh test.zarr/


total 24K
drwxrwxr-x 2 aliman aliman 12K Oct 13 10:54 f0
drwxrwxr-x 2 aliman aliman 12K Oct 13 10:54 f1

In [26]:
!ls -lh test.zarr/f0 | head -n10


total 4.0M
-rw------- 1 aliman aliman 4.6K Oct 13 10:54 0
-rw------- 1 aliman aliman 4.6K Oct 13 10:54 1
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 10
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 100
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 101
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 102
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 103
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 104
-rw------- 1 aliman aliman 4.7K Oct 13 10:54 105
ls: write error: Broken pipe

In [27]:
# Copy the table into the storage registered under the name 'zarrtmp' and show
# the resulting group (the recorded output reports a TempStore).
z = allel.chunked.copy_table(g, storage='zarrtmp')
z.grp


Out[27]:
Group(/, 2)
  arrays: 2; f0, f1
  store: TempStore

In [28]:
# Filesystem path of the store behind the table's f0 column.
z['f0'].store.path


Out[28]:
'/tmp/scikit_allel_gi4s2x_e.zarr'

In [30]:
# Wrap the zarr-backed table and display the wrapper's summary and data preview.
allel.chunked.ChunkedTableWrapper(z)


Out[30]:
<ChunkedTableWrapper shape=(20000000,) dtype=[('f0', '<i8'), ('f1', '<i8')] nbytes=305.2M cbytes=4.6M cratio=66.1 values=allel.chunked.storage_zarr.ZarrTable>
f0f1
000
111
222
......
199999971999999719999997
199999981999999819999998
199999991999999919999999

In [ ]: