Notebook showing how Blaze can compute a matrix-matrix multiplication out of core, on persistent storage — i.e. the matrices may be larger than the available memory.


In [ ]:
import shutil, os, os.path
from time import time
import blaze as blz
from blaze.algo.linalg import dot

In [ ]:
# Array dimensions: a.shape = (DIM1, DIM2); b.shape = (DIM2, DIM3);
# the product out = a . b has shape (DIM1, DIM3).
DIM1 = 1000
DIM2 = 1000
DIM3 = 2000

In [ ]:
# Remove data directories left over from a previous run.
# ignore_errors=True makes the call a no-op when the directory is absent,
# avoiding the exists()/rmtree() check-then-act race of the original.
for d in ('a_dir', 'b_dir', 'out_dir'):
    shutil.rmtree(d, ignore_errors=True)

In [ ]:
# Create array 'a' in directory 'a_dir'
# (a persistent blaze array of ones with dshape '(DIM1, DIM2), float64';
# %time is an IPython magic that reports how long the statement took)
%time a = blz.ones(blz.dshape('%d, %d, float64' % (DIM1, DIM2)), params=blz.params(storage='a_dir'))

In [ ]:
# Create array 'b' in directory 'b_dir'
# (same as 'a' above but with shape (DIM2, DIM3) so the product a . b is defined)
%time b = blz.ones(blz.dshape('%d, %d, float64' % (DIM2, DIM3)), params=blz.params(storage='b_dir'))

In [ ]:
# Do the dot product and put the result in 'out' (directory 'out_dir')
# outname names the persistent storage directory for the result array;
# presumably dot() works on-disk so operands need not fit in RAM -- that is
# the point of this notebook (see the description at the top of the file).
%time out = dot(a, b, outname='out_dir')

In [ ]:
# Show the output
print "out:", `out`

In [ ]:
# Function that shows sizes on disk
def dsize(d):
    """Return the total size in bytes of the entries directly inside directory *d*."""
    # Generator expression instead of a throwaway list; PEP 8 prefers
    # 'def' over binding a lambda to a name.
    return sum(os.path.getsize(os.path.join(d, f)) for f in os.listdir(d))

In [ ]:
# Bytes actually stored on disk for each operand and for the result
[dsize(d) for d in ('a_dir/data', 'b_dir/data', 'out_dir/data')]

In [ ]:
# Function that shows equivalent sizes for NumPy
def nsize(arr):
    """Return the in-memory size in bytes of array *arr* (element count * bytes per element)."""
    # PEP 8 prefers 'def' over binding a lambda to a name.
    return arr.size * arr.itemsize

In [ ]:
# In-memory (uncompressed NumPy) sizes of the operands and the result;
# slicing with [:] materializes each blaze array as a NumPy array.
[nsize(arr) for arr in (a[:], b[:], out[:])]

In [ ]:
# Compression ratios (in-memory size / on-disk size):
# float() forces true division -- under Python 2, int / int truncates,
# which would floor the ratios to whole numbers.
[float(nsize(arr)) / dsize(d) for d, arr in (('a_dir/data', a[:]), ('b_dir/data', b[:]), ('out_dir/data', out[:]))]

In [ ]: