In [1]:
# Explicit imports instead of `from blaze import *`: the star import hides
# where names such as `count`, `by`, `sum`, `Data`, and `compute` come from,
# and `sum` in particular shadows the builtin (blaze's version accepts axis=).
from blaze import Symbol, count, by, sum, Data, compute
from blaze.expr.split import split
# NumPy is used later for test data; the star import previously left it
# unclear whether `np` was ever bound, so import it explicitly here.
import numpy as np

# A billion-element symbolic array, to be processed in million-element chunks.
x = Symbol('x', '1000000000 * int')
chunk = Symbol('chunk', '1000000 * int')
# Decompose a sum into a per-chunk expression and an aggregate expression.
split(x, x.sum(), chunk=chunk)
Out[1]:
We reason about the shapes of all of the pieces so that intermediate results can be preallocated.
In [2]:
# Unpack the (leaf, expression) pairs for the two stages of the computation.
# NOTE(review): this rebinds `chunk` to the leaf returned by split — presumably
# the same Symbol that was passed in, but confirm against blaze's split docs.
(chunk, chunk_expr), (aggregate, aggregate_expr) = split(x, x.sum(), chunk=chunk)
In [3]:
# Datashape of the expression applied to each chunk.
chunk_expr.dshape
Out[3]:
In [4]:
# Datashape of the intermediate that the aggregate stage consumes.
aggregate.dshape
Out[4]:
In [5]:
# Split a count the same way: per-chunk work plus an aggregate step.
split(x, count(x), chunk=chunk)
Out[5]:
In [6]:
# Split a standard deviation — a reduction that cannot be computed
# chunk-by-chunk directly, so the chunk stage must carry intermediate
# statistics (see the printed output for what split chooses).
split(x, x.std(), chunk=chunk)
Out[6]:
In [7]:
# A tabular symbol: variable-length collection of {name, amount} records.
t = Symbol('t', 'var * {name: string, amount: int}')
In [8]:
# Split a grouped mean: per-group means cannot be combined directly,
# so split must decide what each chunk contributes.
split(t, by(t.name, avg=t.amount.mean()))
Out[8]:
We are smart enough to cut the expression at the right place, deciding which operations should occur on each chunk, and which must occur after the aggregation
In [9]:
# A filtered view of t: only rows with amount > 1000.
t2 = t[t.amount > 1000]
In [10]:
# split is asked to cut an expression that mixes a filter, a grouped sum,
# and a post-aggregation division — see the output for where the cut lands.
split(t, by(t2.name, total=t2.amount.sum() / 10))
Out[10]:
In [11]:
# NOTE(review): `x` and `chunk` are rebound here from the earlier 1-D int
# symbols to 2-D float64 symbols — name reuse that only works top-to-bottom.
x = Symbol('x', '1000000 * 2000000 * float64')
chunk = Symbol('chunk', '1000 * 1000 * float64')
In [12]:
# `sum` here is blaze's sum (the builtin takes no axis keyword),
# reducing along axis 0 of the elementwise-doubled array.
split(x, sum(2*x, axis=0), chunk=chunk)
Out[12]:
The datashapes of the various stages of computation
In [13]:
# Unpack the per-chunk and aggregate stages of the axis-0 sum.
(chunk, chunk_expr), (agg, agg_expr) = split(x, sum(2*x, axis=0), chunk=chunk)
In [14]:
# Datashapes for each of the four pieces: chunk input, per-chunk result,
# aggregate input, and aggregate result.
chunk.dshape, chunk_expr.dshape, agg.dshape, agg_expr.dshape
Out[14]:
In [15]:
!rm foo.hdf5
In [16]:
import h5py

# Pass the mode explicitly: 'a' (read/write, create if missing) is what the
# historical default resolved to, but relying on the default is ambiguous
# and deprecated in later h5py releases.
f = h5py.File('foo.hdf5', 'a')
In [17]:
# Ensure np is bound: the notebook never explicitly imported NumPy, relying
# on whatever `from blaze import *` happened to export.
import numpy as np

# 20x24 array of consecutive float32 values, used as ground truth below.
# NOTE(review): rebinds `x` from the earlier blaze Symbol to a real array.
x = np.arange(20*24, dtype='f4').reshape((20, 24))
In [18]:
# Create an HDF5 dataset matching x's shape/dtype, stored in 4x6 chunks
# (so chunked computation below has real chunk boundaries to work with),
# then copy x into it.
d = f.create_dataset('/x', shape=x.shape, dtype=x.dtype,
fillvalue=0.0, chunks=(4, 6))
d[:] = x
In [19]:
# Display the dataset's repr.
d
Out[19]:
Consider the following expression in NumPy
In [20]:
# The reference result, computed directly in NumPy.
(x + 1).sum(axis=0)
Out[20]:
We can't do the same computation directly on h5py datasets
In [21]:
# d is a raw h5py dataset: NumPy-style arithmetic on it fails.  The
# exception IS the demonstration, so catch and display it rather than
# leaving an uncaught error that breaks Restart & Run All.
try:
    (d + 1).sum(axis=0)  # d is an h5py dataset
except Exception as e:
    print(type(e).__name__ + ':', e)
But if we wrap it in Blaze then we can do this computation via chunking
In [22]:
# Wrap the h5py dataset in Blaze's interactive Data object.
b = Data(d)
In [23]:
# The same expression now works: Blaze drives the computation chunk-by-chunk.
compute((b + 1).sum(axis=0))  # b is an h5py dataset wrapped by Blaze
Out[23]:
In [24]:
# Release the HDF5 file handle before deleting the file.
f.close()
In [25]:
!rm foo.hdf5