In [1]:
# Imports and environment setup for a dask vs. serial chunked-array
# benchmark. Library versions are printed so the timing results recorded
# below can be tied to specific dask/h5py/bcolz/scikit-allel releases.
import numpy as np
import dask
print('dask', dask.__version__)
import dask.array as da
import h5py
print('h5py', h5py.__version__)
import bcolz
print('bcolz', bcolz.__version__)
import sys
# Put the in-development scikit-allel checkout (two directories up) ahead
# of any installed version on the import path.
sys.path.insert(0, '../..')
# %autoreload 1 reloads ONLY the modules registered with %aimport below on
# each cell execution, so live edits to the allel source are picked up
# without reloading unrelated modules.
%reload_ext autoreload
%autoreload 1
%aimport allel.model.dask
%aimport allel.model.chunked
%aimport allel.model.ndarray
%aimport allel.model
%aimport allel
print('scikit-allel', allel.__version__)
In [2]:
# Open the Ag1000G phase 1 (AR3) variant callset read-only.
# NOTE(review): hardcoded absolute path — this cell only runs on a host
# with this data mounted; consider a configurable DATA_DIR.
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.h5', mode='r')
callset
Out[2]:
In [3]:
# Wrap the on-disk genotype dataset for chromosome arm 3L in a chunked
# array; no data is loaded into memory at this point.
genotype = allel.model.chunked.GenotypeChunkedArray(callset['3L/calldata/genotype'])
genotype
Out[3]:
In [4]:
# copy the first 2 million rows into memory to use as benchmarking data
# Storage is an in-memory HDF5 store with zlib level-1 compression,
# chunked as (5000 variants, 100 samples, 2) per the chunks argument.
# NOTE(review): the name `gc` shadows the stdlib `gc` module if it were
# ever imported in this session.
gc = genotype.copy(storage='hdf5mem_zlib1', stop=2000000, chunks=(5000, 100, 2))
gc
Out[4]:
In [5]:
# how long does it take to read the data?
# max() forces a full scan (and decompression) of the copy, giving a
# baseline raw read cost to compare the compute timings against.
%time gc.max()
Out[5]:
In [6]:
# compute count_alleles block-wise in series for comparison with dask
# This is the single-threaded reference timing for the parallel runs below.
%time gc.count_alleles(max_allele=3)
Out[6]:
In [7]:
# Wrap the in-memory HDF5-backed copy as a dask-backed genotype array.
gd = allel.model.dask.GenotypeDaskArray.from_array(gc)
gd
Out[7]:
In [8]:
# Lazy: this only builds the dask task graph — no computation happens
# until .compute() is called in the cells below.
ac = gd.count_alleles(max_allele=3)
ac
Out[8]:
In [9]:
# Execute the same task graph with an increasing number of workers to
# measure parallel speedup against the serial timing above. Result is
# bound to _ because only the wall time is of interest here.
%time _ = ac.compute(num_workers=1)
In [10]:
%time _ = ac.compute(num_workers=2)
In [11]:
%time _ = ac.compute(num_workers=3)
In [12]:
%time _ = ac.compute(num_workers=6)
In [13]:
# Repeat the benchmark with an in-memory bcolz carray as the storage
# backend (default bcolz chunking) instead of in-memory HDF5, rebinding
# gc to the new copy.
gc = genotype.copy(storage='bcolzmem', stop=2000000)
gc
Out[13]:
In [14]:
# Baseline raw read/decompression cost for the bcolz-backed copy.
%time gc.max()
Out[14]:
In [15]:
# compute count_alleles block-wise in series for comparison
%time _ = gc.count_alleles(max_allele=3)
In [16]:
# Wrap the bcolz-backed copy as a dask-backed genotype array, rebinding gd.
gd = allel.model.dask.GenotypeDaskArray.from_array(gc)
gd
Out[16]:
In [17]:
# Rebuild the (lazy) allele-count task graph over the bcolz-backed data.
ac = gd.count_alleles(max_allele=3)
ac
Out[17]:
In [18]:
# Worker-scaling runs for the bcolz backend, mirroring the HDF5 runs
# above so the two backends can be compared at each worker count.
%time _ = ac.compute(num_workers=1)
In [19]:
%time _ = ac.compute(num_workers=2)
In [20]:
%time _ = ac.compute(num_workers=3)
In [21]:
%time _ = ac.compute(num_workers=6)
In [ ]: