In [1]:
from os.path import getsize, realpath
from glob import glob
from time import time
from math import log, ceil

from IPython.parallel import Client
import globalmean
import netCDF4

c = Client()
dv = c[:]
dv.block = True
print len(dv.targets), dv.targets

def dp (n):
    return '%.2f' % n


4 [0, 1, 2, 3]

In [2]:
# get datasets available
model = '[MI]*'
exp = '*'
freq = '*'
realm = 'atmos'
table = '*'
ensemble = '*'
ver = 'latest'
name = 'tas'

details = ('path', 'nfiles', 'du', 'var_sz')
#details = ('path',)

data_path = '%s/%s/%s/%s/%s/%s/%s/%s/'
files = ('/opt/data/' + data_path + '*.nc') % (model, exp, freq, realm, table, ensemble, ver, name)
runs = {(): glob(files)}
for i in xrange(3, 11):
    new_runs = {}
    for k, fs in runs.iteritems():
        fs = [(f.split('/')[i], f) for f in fs]
        new_fs = {}
        for cat, f in fs:
            new_runs.setdefault(k + (cat,), []).append(f)
    runs = new_runs
for k, fs in runs.iteritems():
    if 'path' in details:
        print k,
    if 'nfiles' in details:
        print len(fs),
    if 'du' in details:
        print sum(getsize(realpath(f)) / (1024. ** 3) for f in fs),
    if 'var_sz' in details:
        with netCDF4.MFDataset(fs) as d:
            print d.variables[fs[0].split('/')[10]].shape,
    print ''


('MPI-ESM-LR', 'rcp85', 'day', 'atmos', 'day', 'r3i1p1', 'latest', 'tas') 10 2.38343709707 (34698, 96, 192) 
('MPI-ESM-LR', 'rcp45', 'mon', 'atmos', 'Amon', 'r2i1p1', 'latest', 'tas') 1 0.0783169642091 (1140, 96, 192) 
('MPI-ESM-LR', 'rcp45', 'day', 'atmos', 'day', 'r1i1p1', 'latest', 'tas') 30 7.40115585923 (107746, 96, 192) 
('IPSL-CM5A-MR', 'rcp45', 'day', 'atmos', 'day', 'r1i1p1', 'latest', 'tas') 2 2.66076381505 (34675, 143, 144) 
('IPSL-CM5A-MR', 'rcp85', '3hr', 'atmos', '3hr', 'r1i1p1', 'latest', 'tas') 4 16.8014368117 (219000, 143, 144) 
('IPSL-CM5A-MR', 'rcp85', 'mon', 'atmos', 'Amon', 'r1i1p1', 'latest', 'tas') 1 0.0874905623496 (1140, 143, 144) 
('MPI-ESM-LR', 'rcp45', 'day', 'atmos', 'day', 'r3i1p1', 'latest', 'tas') 10 2.38343709707 (34698, 96, 192) 
('MPI-ESM-LR', 'rcp85', 'day', 'atmos', 'day', 'r2i1p1', 'latest', 'tas') 10 2.38343709707 (34698, 96, 192) 
('MPI-ESM-LR', 'rcp85', 'day', 'atmos', 'day', 'r1i1p1', 'latest', 'tas') 30 7.40115585923 (107746, 96, 192) 
('MPI-ESM-LR', 'rcp85', 'mon', 'atmos', 'Amon', 'r3i1p1', 'latest', 'tas') 1 0.0783169642091 (1140, 96, 192) 
('MPI-ESM-LR', 'rcp45', 'day', 'atmos', 'day', 'r2i1p1', 'latest', 'tas') 10 2.38343709707 (34698, 96, 192) 
('MPI-ESM-LR', 'rcp45', 'mon', 'atmos', 'Amon', 'r1i1p1', 'latest', 'tas') 2 0.243179425597 (3540, 96, 192) 
('IPSL-CM5A-MR', 'rcp45', '3hr', 'atmos', '3hr', 'r1i1p1', 'latest', 'tas') 4 16.8014368117 (219000, 143, 144) 
('IPSL-CM5A-MR', 'rcp45', 'mon', 'atmos', 'Amon', 'r1i1p1', 'latest', 'tas') 1 0.0874905623496 (1140, 143, 144) 
('IPSL-CM5A-MR', 'rcp85', 'day', 'atmos', 'day', 'r1i1p1', 'latest', 'tas') 2 2.66076381505 (34675, 143, 144) 
('MPI-ESM-LR', 'rcp45', 'mon', 'atmos', 'Amon', 'r3i1p1', 'latest', 'tas') 1 0.0783169642091 (1140, 96, 192) 
('MPI-ESM-LR', 'rcp85', 'mon', 'atmos', 'Amon', 'r2i1p1', 'latest', 'tas') 1 0.0783169642091 (1140, 96, 192) 
('MPI-ESM-LR', 'rcp85', 'mon', 'atmos', 'Amon', 'r1i1p1', 'latest', 'tas') 2 0.243179425597 (3540, 96, 192) 

In [3]:
# get files
#match = [x.strip() for x in '''
#MPI-ESM-LR/rcp45/day/atmos/day/r1i1p1/latest/wap/
#'''.split('/') if x.strip()]
#name = match[-1]
match = ('MPI-ESM-LR', 'rcp45', 'day', 'atmos', 'day', 'r1i1p1', 'latest', name)
ks = [k for k in runs if all([x is None or x == y for x, y in zip(match, k)])]
files = sorted(__builtin__.sum((runs[k] for k in ks), []))
print len(files)
for k in ks:
    print k


30
('MPI-ESM-LR', 'rcp45', 'day', 'atmos', 'day', 'r1i1p1', 'latest', 'tas')

In [4]:
:# proper benchmark
start = 0
end = None
sz = sum(getsize(realpath(f)) / (1024. ** 3) for f in files)
with netCDF4.MFDataset(files) as d:
    shape = d.variables[name].shape
path = data_path % tuple('*' if x is None else x for x in match)

print path
print dp(sz)
print shape
%timeit globalmean.run(files, name, start, end, False)
%timeit globalmean.run(files, name, start, end)


MPI-ESM-LR/rcp45/day/atmos/day/r1i1p1/latest/wap/
50.79
(49306, 15, 96, 192)
1 loops, best of 3: 272 s per loop
1 loops, best of 3: 81.6 s per loop

In [4]:
# get var details
with netCDF4.Dataset(files[0]) as d:
    print d.variables[name]


<type 'netCDF4.Variable'>
float32 tas(u'time', u'lat', u'lon')
    standard_name: air_temperature
    long_name: Near-Surface Air Temperature
    units: K
    cell_methods: time: mean
    cell_measures: area: areacella
    history: 2011-07-17T14:42:07Z altered by CMOR: Treated scalar dimension: 'height'.
    coordinates: height
    missing_value: 1e+20
    _FillValue: 1e+20
    associated_files: baseURL: http://cmip-pcmdi.llnl.gov/CMIP5/dataLocation gridspecFile: gridspec_atmos_fx_MPI-ESM-LR_rcp45_r0i0p0.nc areacella: areacella_fx_MPI-ESM-LR_rcp45_r0i0p0.nc
unlimited dimensions = (u'time',)
current size = (1461, 96, 192)


In [37]:
# quick benchmark
for i in xrange(1):
    start, end = 0, None
    n = globalmean.time_bounds(files)[2]
    if end is None:
        end = n
    print start, 'to', end, '(%s total, have %s):' % (end - start, n)
    t0 = time()
    serial_times, serial_var = globalmean.run(files, name, start, end, False)
    rtn = globalmean.run(files, name, start, end, False)
    t1 = time()
    print '    serial:', dp(t1 - t0)
    print rtn
    parallel_times, parallel_var = globalmean.run(files, name, start, end)
    t2 = time()
    print '    parallel:', dp(t2 - t1)
    print '    ratio:', dp(float(t1 - t0) / (t2 - t1))
    print '    equal:', (serial_times == parallel_times).all(), (serial_var == parallel_var).all()
    times = parallel_times
    var = parallel_var

In [31]:
# plot
sum = __builtin__.sum
n = len(times)
scale_factor = 5
with netCDF4.MFDataset(files) as d:
    time_v = d.variables['time']
    units = time_v.units
    cal = time_v.calendar
for n in [5 ** i for i in xrange(2, int(ceil(log(n) / log(scale_factor))))] + [n]:
    figure()
    plot(times[:n], var[:n])
    ts = netCDF4.num2date((times[0], times[n - 1]), units, cal)
    plt.title('%s/%s/%s to %s/%s/%s' % tuple(sum(((t.day, t.month, t.year) for t in ts), ())))