In [118]:
    
import os
import timeit
import tables as tb
import h5py
import numpy as np
    
In [119]:
    
n, k = 5000000, 100
    
In [120]:
    
def create_contiguous(filename):
    # Create an HDF5 file holding a contiguous (non-chunked) (n, k) float32 array,
    # filled in 10 blocks to keep memory usage bounded.
    with tb.open_file(filename, "w") as f:
        a = f.create_array('/', 'test', atom=tb.Float32Atom(),
                           shape=(n, k))
        n_ = n // 10
        for i in range(10):
            print(i, end=' ')
            a[i*n_:(i+1)*n_, ...] = np.random.rand(n_, k)
    
In [121]:
    
filename_contiguous = 'features_contiguous.h5'
if not os.path.exists(filename_contiguous):
    create_contiguous(filename_contiguous)
    
In [122]:
    
ind = np.random.randint(size=1000, low=0, high=n)
ind = np.unique(ind)
    
In [123]:
    
def read(a, out):
    # Copy the rows of `a` at the (sorted) indices in `ind`, one row at a time,
    # into the preallocated `out` array.
    for j, i in enumerate(ind):
        out[j:j+1,...] = a[i:i+1,...]
    return out
    
In [124]:
    
with tb.open_file(filename_contiguous, "r") as f:
    a = f.root.test
    out = np.empty((len(ind),k), dtype=a.dtype)
    %timeit -r1 -n1 read(a, out)
    
    
In [125]:
    
with h5py.File(filename_contiguous, "r") as f:
    a = f['/test']
    out = np.empty((len(ind),k), dtype=a.dtype)
    %timeit -r1 -n1 read(a, out)
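
As an aside, h5py also supports fancy indexing along the first axis with a sorted,
duplicate-free index array (which is exactly what np.unique returns), so the same
rows can be fetched in a single call. A minimal sketch, reusing ind and
filename_contiguous from above; out_fancy is just an illustrative name:

# Sketch only: fetch all selected rows with one fancy-indexing call instead of
# looping row by row (h5py requires the index array to be sorted and without
# duplicates, which np.unique guarantees).
with h5py.File(filename_contiguous, "r") as f:
    out_fancy = f['/test'][ind, :]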
    
    
In [126]:
    
N = 1000
    
In [127]:
    
def get_cluster_sizes_to_load(cluster_sizes):
    # Decide how many items to load from each cluster so that roughly N items
    # are loaded in total: half of the budget is spread uniformly across the
    # clusters, half proportionally to their sizes, and no cluster contributes
    # more items than it contains.
    cluster_sizes = np.array(cluster_sizes)
    k = len(cluster_sizes)
    s = float(cluster_sizes.sum())
    # First pass: uniform + proportional allocation, capped at the cluster sizes.
    sizes = np.round(N/2.*(1./k + cluster_sizes/s)).astype(np.int32)
    sizes = np.minimum(sizes, cluster_sizes)
    # Second pass: hand out the remaining budget to the clusters that still
    # have unallocated items, smallest leftover first.
    rem = N - sizes.sum()
    l = np.argsort(cluster_sizes - sizes)  #[::-1]
    l = l[(cluster_sizes - sizes)[l] > 0]
    for i in l:
        if rem <= 0:
            break
        crem = cluster_sizes[i] - sizes[i]
        sizes[i] += min(crem, rem)
        rem = N - sizes.sum()
    return sizes
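
For example, with N = 1000 and cluster_sizes = [10, 20, 30], the first pass asks for
round(500*(1/3 + 10/60)) = 250, round(500*(1/3 + 20/60)) = 333 and
round(500*(1/3 + 30/60)) = 417 items, which get capped at the actual cluster sizes,
giving [10, 20, 30]; all three clusters are then exhausted, so the second pass has
nothing left to distribute.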
    
In [128]:
    
sizes = [
    [10, 10, 10],
    [10, 20, 30],
    [100, 100, 100],
    [10, 100, 10000],
    [1, 1, 1000],
    [100, 1000, 10000],
    [100, 100000, 100000],
    [10000, 100000, 100000],
    [100000, 100000, 100000],
    [1000000, 1000000, 1000000],
    [10, 20, 30, 40, 50, 60, 70, 100000],
    [100, 200, 300, 400, 500, 600, 700, 10000000],
]
for s in sizes:
    cl = get_cluster_sizes_to_load(s)
    print(s, ':', cl, sum(cl))