In [118]:
import os
import timeit
import tables as tb
import h5py
import numpy as np

In [119]:
n, k = 5000000, 100

In [120]:
def create_contiguous(filename):
    # Create a contiguous (non-chunked) float32 array of shape (n, k)
    # and fill it in 10 slabs of random values to limit memory usage.
    with tb.openFile(filename, "w") as f:
        a = f.createArray('/', 'test', atom=tb.Float32Atom(),
                          shape=(n,k))
        n_ = n//10
        for i in range(10):
            print i,
            a[i*n_:(i+1)*n_,...] = np.random.rand(n_, k)

In [121]:
filename_contiguous = 'features_contiguous.h5'
if not os.path.exists(filename_contiguous):
    create_contiguous(filename_contiguous)
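
The contiguous dataset holds n*k = 5,000,000 x 100 float32 values, i.e. roughly 1.9 GiB on disk. As a quick sanity check (a sketch, not part of the original session), the file size can be compared against that figure:

In [ ]:
# Sketch: a contiguous float32 array of shape (n, k) should take about
# n*k*4 bytes on disk, plus a small HDF5 overhead.
os.path.getsize(filename_contiguous) / float(1024**3)  # size in GiB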

In [122]:
ind = np.random.randint(size=1000, low=0, high=n)
ind = np.unique(ind)  # drop duplicates; np.unique also sorts the indices

In [123]:
def read(a, out):
    # Read the selected rows one by one into the preallocated buffer.
    for j, i in enumerate(ind):
        out[j:j+1,...] = a[i:i+1,...]
    return out

In [124]:
with tb.openFile(filename_contiguous, "r") as f:
    a = f.root.test
    out = np.empty((len(ind),k), dtype=a.dtype)
    %timeit -r1 -n1 read(a, out)


1 loops, best of 1: 85 ms per loop

In [125]:
with h5py.File(filename_contiguous, "r") as f:
    a = f['/test']
    out = np.empty((len(ind),k), dtype=a.dtype)
    %timeit -r1 -n1 read(a, out)


1 loops, best of 1: 157 ms per loop
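
Since ind is sorted and free of duplicates (thanks to np.unique), h5py can also read all selected rows in a single fancy-indexing call instead of looping row by row. A minimal sketch, not part of the original benchmark:

In [ ]:
# Sketch: one-call read of the selected rows via h5py fancy indexing.
# h5py requires the index list to be strictly increasing.
with h5py.File(filename_contiguous, "r") as f:
    out_fancy = f['/test'][ind, :]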

Loading multiple features


In [126]:
N = 1000

In [127]:
def get_cluster_sizes_to_load(cluster_sizes):
    """Decide how many points to load from each cluster so that roughly N
    points are loaded in total: half of the budget is spread uniformly over
    the clusters and half proportionally to their sizes, with each cluster
    capped at its actual size."""
    cluster_sizes = np.array(cluster_sizes)
    k = len(cluster_sizes)
    s = float(cluster_sizes.sum())
    # Half uniform, half proportional to the cluster sizes.
    sizes = np.round(N/2.*(1./k + cluster_sizes/s)).astype(np.int32)
    # A cluster cannot contribute more points than it contains.
    sizes = np.minimum(sizes, cluster_sizes)
    # Hand the remaining budget to clusters that still have points left,
    # starting with those with the smallest leftover.
    rem = N - sizes.sum()
    l = np.argsort(cluster_sizes - sizes)
    l = l[cluster_sizes-sizes>0]
    for i in l:
        crem = cluster_sizes[i] - sizes[i]
        if crem > 0:
            sizes[i] += min(crem, rem)
            rem = N - sizes.sum()
        else:
            break
    return sizes

In [128]:
sizes = [
    [10, 10, 10],
    [10, 20, 30],
    [100, 100, 100],
    [10, 100, 10000],
    [1, 1, 1000],
    [100, 1000, 10000],
    [100, 100000, 100000],
    [10000, 100000, 100000],
    [100000, 100000, 100000],
    [1000000, 1000000, 1000000],
    [10, 20, 30, 40, 50, 60, 70, 100000],
    [100, 200, 300, 400, 500, 600, 700, 10000000],
]
for s in sizes:
    cl = get_cluster_sizes_to_load(s)
    print s, ':', cl, sum(cl)


[10, 10, 10] : [10 10 10] 30
[10, 20, 30] : [10 20 30] 60
[100, 100, 100] : [100 100 100] 300
[10, 100, 10000] : [ 10 100 890] 1000
[1, 1, 1000] : [  1   1 998] 1000
[100, 1000, 10000] : [100 283 617] 1000
[100, 100000, 100000] : [100 483 417] 1000
[10000, 100000, 100000] : [190 405 405] 1000
[100000, 100000, 100000] : [334 333 333] 1000
[1000000, 1000000, 1000000] : [334 333 333] 1000
[10, 20, 30, 40, 50, 60, 70, 100000] : [ 10  20  30  40  50  60  70 720] 1000
[100, 200, 300, 400, 500, 600, 700, 10000000] : [ 60  63  63  63  63  63  63 562] 1000
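
For these test cases the allocation behaves as intended: no cluster is asked for more points than it contains, and whenever the clusters hold at least N points in total, exactly N points are selected. A quick check (a sketch, not in the original session):

In [ ]:
# Sketch: verify both invariants on the test cases above.
for s in sizes:
    cl = get_cluster_sizes_to_load(s)
    assert np.all(cl <= np.array(s))
    assert cl.sum() == min(N, sum(s))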