In [118]:
import os
import timeit
import tables as tb
import h5py
import numpy as np
In [119]:
n, k = 5000000, 100
In [120]:
def create_contiguous(filename):
    # Create a contiguous (non-chunked) float32 array of shape (n, k),
    # about 2 GB, and fill it in ten slabs to keep memory usage bounded.
    with tb.open_file(filename, "w") as f:
        a = f.create_array('/', 'test', atom=tb.Float32Atom(),
                           shape=(n, k))
        n_ = n // 10
        for i in range(10):
            print(i, end=' ')
            a[i*n_:(i+1)*n_, ...] = np.random.rand(n_, k)
In [121]:
filename_contiguous = 'features_contiguous.h5'
if not os.path.exists(filename_contiguous):
create_contiguous(filename_contiguous)
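For reference, the same contiguous dataset could be written with h5py; this is a sketch (the create_contiguous_h5py helper is hypothetical and not used below). h5py stores datasets contiguously unless a chunks argument is passed.

def create_contiguous_h5py(filename):
    # Hypothetical h5py equivalent: without `chunks`, the dataset is
    # stored contiguously, like the PyTables Array above.
    with h5py.File(filename, "w") as f:
        a = f.create_dataset('test', shape=(n, k), dtype=np.float32)
        n_ = n // 10
        for i in range(10):
            a[i*n_:(i+1)*n_, :] = np.random.rand(n_, k)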
In [122]:
ind = np.random.randint(size=1000, low=0, high=n)
ind = np.unique(ind)  # sort and deduplicate the row indices
In [123]:
def read(a, out):
    # Read the selected rows one at a time into a preallocated buffer.
    for j, i in enumerate(ind):
        out[j:j+1, ...] = a[i:i+1, ...]
    return out
In [124]:
with tb.open_file(filename_contiguous, "r") as f:
    a = f.root.test
    out = np.empty((len(ind), k), dtype=a.dtype)
    %timeit -r1 -n1 read(a, out)
In [125]:
with h5py.File(filename_contiguous, "r") as f:
    a = f['/test']
    out = np.empty((len(ind), k), dtype=a.dtype)
    %timeit -r1 -n1 read(a, out)
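As an aside, h5py also supports fancy indexing, which fetches all selected rows in a single call, provided the indices are increasing and free of duplicates, which np.unique already guarantees here. A minimal sketch for comparison:

with h5py.File(filename_contiguous, "r") as f:
    a = f['/test']
    %timeit -r1 -n1 a[ind, :]  # single fancy-indexed read of all rows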
In [126]:
N = 1000
In [127]:
def get_cluster_sizes_to_load(cluster_sizes):
    cluster_sizes = np.array(cluster_sizes)
    k = len(cluster_sizes)
    s = float(cluster_sizes.sum())
    # Nominal allocation: half of N uniformly, half proportionally to
    # cluster size, capped at each cluster's size.
    sizes = np.round(N/2. * (1./k + cluster_sizes/s)).astype(np.int32)
    sizes = np.minimum(sizes, cluster_sizes)
    # Hand the leftover budget to clusters that still have capacity,
    # smallest deficit first.
    rem = N - sizes.sum()
    order = np.argsort(cluster_sizes - sizes)
    order = order[(cluster_sizes - sizes)[order] > 0]
    for i in order:
        if rem == 0:
            break
        crem = cluster_sizes[i] - sizes[i]
        sizes[i] += min(crem, rem)
        rem = N - sizes.sum()
    return sizes
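The rule: with a budget of N points over k clusters of sizes c_i (S is their sum), cluster i nominally receives round(N/2 * (1/k + c_i/S)) points, i.e. half of the budget spread uniformly and half proportionally to cluster size, capped at c_i; any leftover budget then goes to clusters with remaining capacity. A quick sanity check (a sketch; the test values are arbitrary):

for s in ([5, 5, 5], [100000, 100000, 100000], [10, 1000, 100000]):
    alloc = get_cluster_sizes_to_load(s)
    assert np.all(alloc <= np.array(s))   # never exceed a cluster's size
    assert alloc.sum() == min(N, sum(s))  # spend the budget when possible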
In [128]:
sizes = [
[10, 10, 10],
[10, 20, 30],
[100, 100, 100],
[10, 100, 10000],
[1, 1, 1000],
[100, 1000, 10000],
[100, 100000, 100000],
[10000, 100000, 100000],
[100000, 100000, 100000],
[1000000, 1000000, 1000000],
[10, 20, 30, 40, 50, 60, 70, 100000],
[100, 200, 300, 400, 500, 600, 700, 10000000],
]
for s in sizes:
    cl = get_cluster_sizes_to_load(s)
    print(s, ':', cl, sum(cl))