In [40]:
import os
import time
import h5py
import numpy as np

In [41]:
# Shape of the benchmark dataset: the array written to disk is (n, k, l).
n = 1000000
k = 50
l = 128

In [42]:
def create(filename, n_chunks=50):
    """Create a contiguous int16 dataset '/test' of shape (n, k, l).

    The dataset is filled with random integers in [-32000, 32000). Data is
    generated and written slab by slab along the first axis so that the full
    array never has to be held in memory at once.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file to create. It is opened in "w" mode.
    n_chunks : int, optional
        Number of slabs the rows are split into for writing (default 50).

    Raises
    ------
    ValueError
        If `n_chunks` is smaller than 1.
    """
    # Local import: only needed for the progress output below.
    import sys

    if n_chunks < 1:
        raise ValueError("n_chunks must be >= 1")

    with h5py.File(filename, "w") as f:
        a = f.create_dataset('/test', dtype=np.int16,
                             shape=(n, k, l))
        # Ceiling division: when n is not a multiple of n_chunks the final
        # slab is shorter instead of the trailing rows being left unwritten.
        step = max(1, -(-n // n_chunks))
        for i, start in enumerate(range(0, n, step)):
            stop = min(start + step, n)
            # Progress counter on a single line; written through sys.stdout so
            # it works the same on Python 2 and Python 3.
            sys.stdout.write("%d " % i)
            sys.stdout.flush()
            a[start:stop, ...] = np.random.randint(size=(stop - start, k, l),
                                                   low=-32000,
                                                   high=32000)

Create the file if necessary.


In [43]:
# The file name encodes the number of rows; the (slow) creation step is
# skipped when a file with that name is already present.
filename = 'test_%d.h5' % (n,)
if not os.path.exists(filename):
    create(filename)

This function copies the selected rows from the on-disk dataset into a preallocated in-memory array, one row at a time.


In [44]:
def read(a, out, indices=None):
    """Copy selected rows of `a` into `out`, one row at a time.

    Parameters
    ----------
    a : array-like
        Source supporting slice indexing along its first axis (for example an
        h5py dataset or a NumPy array).
    out : array-like
        Destination supporting slice assignment; row ``j`` of `out` receives
        row ``indices[j]`` of `a`.
    indices : sequence of int, optional
        Row numbers to copy. When omitted, the module-level `ind` is used,
        which keeps existing ``read(a, out)`` calls working.

    Returns
    -------
    out
        The same object that was passed in, after being filled.
    """
    if indices is None:
        # Backward-compatible fallback to the notebook-level selection.
        indices = ind
    for j, i in enumerate(indices):
        # One-row slices are used on both sides so the shapes always match.
        out[j:j+1, ...] = a[i:i+1, ...]
    return out

Choose rows randomly. Duplicate indices are removed, so fewer than `size` rows may end up being selected.


In [45]:
# Number of random draws; the final selection can be smaller because
# np.unique drops repeated indices (and returns them sorted).
size = 100
ind = np.unique(np.random.randint(low=0, high=n, size=size))

Load those rows into memory from the HDF5 file and measure the read bandwidth.


In [46]:
with h5py.File(filename, "r") as f:
    a = f['/test']
    out = np.empty((len(ind),k,l), dtype=a.dtype)
    # Wall-clock timing: time.clock() was removed in Python 3.8 and on Unix
    # reported CPU time, which does not include time spent waiting on disk I/O.
    t0 = time.time()
    read(a, out)
    t1 = time.time()
d = t1-t0
# Use the number of bytes actually read: `ind` may hold fewer than `size`
# rows after duplicates were removed, and out.nbytes follows the dataset's
# dtype instead of assuming 2 bytes per value.
# A zero elapsed time (coarse timer) is reported as infinite bandwidth
# rather than raising ZeroDivisionError.
bandwidth = out.nbytes/(1024*1024.)/d if d > 0 else float('inf')
print("Elapsed: {0:.2f} s".format(d))
print("Bandwidth: {0:.1f} MB/s".format(bandwidth))


Elapsed: 1.07 s
Bandwidth: 1.1 MB/s