In [1]:
import sys
import os
from tempfile import NamedTemporaryFile
from gc import collect
import netCDF4
import h5py
import numpy as np
import pylab as pl
sys.path.append('../util')
from meters import ThroughputMeter, clear_host_cache
from ncgen import *
from grids import *
In [2]:
def write_netcdf_file(timescale, time_major=True, grid=canada_5k):
print("Creating a time-{} NetCDF file with {}x{} grid and {} time steps".format('major' if time_major else 'minor', grid['lon']['count'], grid['lat']['count'],len(timescale)))
with NamedTemporaryFile(suffix='.nc', delete=False, dir='/app/tmp') as f:
nc = make_nc(f.name, grid=grid, timescale=timescale, timemajor=time_major)
nc.close()
print("File size: {:.2f}Mb".format(os.path.getsize(f.name)/1024/1024))
return f
In [3]:
def netcdf_read_test(f, time_major):
print("Reading out with python-netCDF4 module...")
# Open the file just created
nc = netCDF4.Dataset(f.name, 'r')
if time_major:
with ThroughputMeter() as t:
a = nc.variables['var_0'][0,:,:]
else:
with ThroughputMeter() as t:
a = nc.variables['var_0'][:,:,0]
res = ('python-netCDF4', time_major, len(timescale), t.megabytes_per_second(a))
# python-netCDF4 seems to leak file descriptors
# We have to take a lot of steps to make sure that the files get closed and that
# the space gets reclaimed by the OS
nc.close
return res
In [4]:
def hdf5_read_test(f, time_major):
print("Reading out with h5py module...")
results = []
# Open the file just created
nc = h5py.File(f.name, 'r')
if time_major:
with ThroughputMeter() as t:
a = nc['var_0'][0,:,:]
else:
with ThroughputMeter() as t:
a = nc['var_0'][:,:,0]
res = ('h5py', time_major, len(timescale), t.megabytes_per_second(a))
# python-netCDF4 seems to leak file descriptors
# We have to take a lot of steps to make sure that the files get closed and that
# the space gets reclaimed by the OS
nc.close
return res
In [5]:
time_major = True
grids = [world_250k, world_125k, canada_5k, bc_400m]
ts = [timescales['seasonal'], timescales['annual'], timescales['monthly']] # Daily takes hours and hours to run #, timescales['daily']]
In [6]:
results = []
for grid in grids:
for timescale in ts:
testfile = write_netcdf_file(timescale, time_major=time_major, grid=grid)
for read_with_h5py in [True, False]:
!sync
clear_host_cache()
if read_with_h5py:
results.append(hdf5_read_test(testfile, time_major))
else:
results.append(netcdf_read_test(testfile, time_major))
print("Removing {}".format(testfile.name))
os.remove(testfile.name)
testfile.close()
del testfile
collect()
In [7]:
results
Out[7]:
In [11]:
h5py_results = [r[3] for r in results[0::2]]
netcdf4_results = [r[3] for r in results[1::2]]
In [12]:
h5py_results
Out[12]:
In [13]:
netcdf4_results
Out[13]:
In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
f, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1)
f.set_size_inches(12, 12)
ax1.plot(h5py_results[0:3], label="hdf5")
ax1.plot(netcdf4_results[0:3], label="netcdf4")
ax1.legend()
ax1.set_title("world_250k")
ax1.set_xticks(range(len(ts)))
ax1.set_xticklabels([len(x) for x in ts])
ax2.plot(h5py_results[3:6], label="hdf5")
ax2.plot(netcdf4_results[3:6], label="netcdf4")
ax2.set_title("world_125k")
ax2.set_xticks(range(len(ts)))
ax2.set_xticklabels([len(x) for x in ts])
ax3.plot(h5py_results[6:9], label="hdf5")
ax3.plot(netcdf4_results[6:9], label="netcdf4")
ax3.set_title("canada_5k")
ax3.set_xticks(range(len(ts)))
ax3.set_xticklabels([len(x) for x in ts])
ax4.plot(h5py_results[9:12], label="hdf5")
ax4.plot(netcdf4_results[9:12], label="netcdf4")
ax4.set_title("bc_400m")
ax4.set_xticks(range(len(ts)))
ax4.set_xticklabels([len(x) for x in ts])
Out[18]:
In [ ]: