In [1]:
# a little script for open multiple netCDF files using xray
import numpy as np
import pandas as pd
import xray as xr
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
#!wget http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/A20110012011008.L3b_8D_CHL.nc
import xray
da1 = xray.open_dataset('A20110012011008.L3b_8D_CHL.nc', group='level-3_binned_data')

In [3]:
xr.open_dataset?

In [4]:
da1.variables


Out[4]:
Frozen(OrderedDict([('BinList', <xarray.Variable (binListDim: 7654961)>
[7654961 values with dtype={'names':['bin_num','nobs','nscenes','weights','time_rec'], 'formats':['<u4','<i2','<i2','<f4','<f4'], 'offsets':[0,4,6,8,12], 'itemsize':16, 'aligned':True}]), ('chlor_a', <xarray.Variable (binDataDim: 7654961)>
[7654961 values with dtype={'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True}]), ('chl_ocx', <xarray.Variable (binDataDim: 7654961)>
[7654961 values with dtype={'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True}]), ('BinIndex', <xarray.Variable (binIndexDim: 4320)>
array([(1, 0, 0, 3), (4, 0, 0, 9), (13, 0, 0, 16), ...,
       (23761649, 0, 0, 16), (23761665, 0, 0, 9), (23761674, 0, 0, 3)], 
      dtype={'names':['start_num','begin','extent','max'], 'formats':['<u4','<u4','<u4','<u4'], 'offsets':[0,4,8,12], 'itemsize':16, 'aligned':True})), ('binDataDim', <xarray.Coordinate 'binDataDim' (binDataDim: 7654961)>
[7654961 values with dtype=int64]), ('binIndexDim', <xarray.Coordinate 'binIndexDim' (binIndexDim: 4320)>
array([   0,    1,    2, ..., 4317, 4318, 4319])), ('binListDim', <xarray.Coordinate 'binListDim' (binListDim: 7654961)>
[7654961 values with dtype=int64])]))

In [6]:
da1.attrs


Out[6]:
OrderedDict()

In [7]:
da1.var


Out[7]:
<bound method ImplementsDatasetReduce._reduce_method.<locals>.wrapped_func of <xarray.Dataset>
Dimensions:      (binDataDim: 7654961, binIndexDim: 4320, binListDim: 7654961)
Coordinates:
  * binDataDim   (binDataDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * binIndexDim  (binIndexDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ...
  * binListDim   (binListDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
Data variables:
    BinList      (binListDim) {'names':['bin_num','nobs','nscenes','weights','time_rec'], 'formats':['<u4','<i2','<i2','<f4','<f4'], 'offsets':[0,4,6,8,12], 'itemsize':16, 'aligned':True} (238575, 6, 1, 2.4494898319244385, 0.0) ...
    chlor_a      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.8002867698669434, 0.2616187632083893) ...
    chl_ocx      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.8008906245231628, 0.26200050115585327) ...
    BinIndex     (binIndexDim) {'names':['start_num','begin','extent','max'], 'formats':['<u4','<u4','<u4','<u4'], 'offsets':[0,4,8,12], 'itemsize':16, 'aligned':True} (1, 0, 0, 3) ...>

In [8]:
print(da1)


<xarray.Dataset>
Dimensions:      (binDataDim: 7654961, binIndexDim: 4320, binListDim: 7654961)
Coordinates:
  * binDataDim   (binDataDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * binIndexDim  (binIndexDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ...
  * binListDim   (binListDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
Data variables:
    BinList      (binListDim) {'names':['bin_num','nobs','nscenes','weights','time_rec'], 'formats':['<u4','<i2','<i2','<f4','<f4'], 'offsets':[0,4,6,8,12], 'itemsize':16, 'aligned':True} (238575, 6, 1, 2.4494898319244385, 0.0) ...
    chlor_a      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.8002867698669434, 0.2616187632083893) ...
    chl_ocx      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.8008906245231628, 0.26200050115585327) ...
    BinIndex     (binIndexDim) {'names':['start_num','begin','extent','max'], 'formats':['<u4','<u4','<u4','<u4'], 'offsets':[0,4,8,12], 'itemsize':16, 'aligned':True} (1, 0, 0, 3) ...

In [9]:
da1.BinIndex[-5:]


Out[9]:
<xarray.DataArray 'BinIndex' (binIndexDim: 5)>
array([(23761599, 0, 0, 28), (23761627, 0, 0, 22), (23761649, 0, 0, 16),
       (23761665, 0, 0, 9), (23761674, 0, 0, 3)], 
      dtype={'names':['start_num','begin','extent','max'], 'formats':['<u4','<u4','<u4','<u4'], 'offsets':[0,4,8,12], 'itemsize':16, 'aligned':True})
Coordinates:
  * binIndexDim  (binIndexDim) int64 4315 4316 4317 4318 4319

In [10]:
da1.BinList[-5:]


Out[10]:
<xarray.DataArray 'BinList' (binListDim: 5)>
array([(20652298, 7, 1, 2.6457512378692627, 0.0),
       (20652299, 8, 1, 2.8284270763397217, 0.0),
       (20652300, 10, 1, 3.1622776985168457, 0.0),
       (20652301, 7, 1, 2.6457512378692627, 0.0),
       (20652302, 3, 1, 1.7320507764816284, 0.0)], 
      dtype={'names':['bin_num','nobs','nscenes','weights','time_rec'], 'formats':['<u4','<i2','<i2','<f4','<f4'], 'offsets':[0,4,6,8,12], 'itemsize':16, 'aligned':True})
Coordinates:
  * binListDim  (binListDim) int64 7654956 7654957 7654958 7654959 7654960

In [32]:
7654961/ (20652302 - 240292)   == 0.3750224010276303
#[Note]
# There are lots of missing data has been excluded from the dataset
# so there is going to be a lot of missing bin numbers!

############################################
# roughly is the percent of ":percent_data_bins = 32.21558f "
# this info : "percent_data_bins = 32.21558f"
# can be seen by da1 = xr.open_dataset('A20110012011008.L3b_8D_CHL.nc')
#               da1.variabel

# or can be seen by directly using the ncdump command from netCDF4

###########################################
# the  binListDim= 7654961   # number of total bins(has data) contained in this netcdf file
# this can be seen: 
# da1 = xr.open_dataset('A20110012011008.L3b_8D_CHL.nc', group='level-3_binned_data')
# da1.var  # 
# 

##########################################
# us the following:
# da1.BinList[5:]
# da1.BinList[-5:]
# one can see that the bin numbers at least run from 240,292 to 20,652,302
7654961/ (20652302 - 240292) == 0.3750224010276303

#########################################
# use the web: Integerized Sinusoidal Binning Scheme for Level 3 Data
# for resolution of 4.64 km, there are 4320 latitudinal rows, this agrees with 
# binIndexDim: 4320,
# From the table on the same doc, there are totally 23,761,676 bins
# 
# From da1 we know:
# binDataDim (or binListDim) is 7654961

#######
# so the theretical non-empty data ration is: 7654961/23761676 = 0.3221557688102472
# matches: percent_data_bins = 32.21558f


Out[32]:
0.3750224010276303

In [ ]:


In [14]:
############## let's try to use python module netCDF4

In [ ]:


In [9]:
#from netCDF4 import Dataset

#da1_netcdf = Dataset('A20110012011008.L3b_8D_CHL.nc')
#da1_netcdf.variables
#da1_netcdf.ncattrs()

In [11]:
pd.__version__


Out[11]:
'0.17.1'

In [ ]:
# Try the level 2 data
# http://oceandata.sci.gsfc.nasa.gov/cgi/getfile/T2011001053500.L2_LAC_SST.nc
#l2_xray = xr.open_dataset('T2011001053500.L2_LAC_SST.nc')
#l2_xray.variables
#l2_xray.var

In [ ]:
### Now let's explore xarray a little bit

In [26]:
### Now let's try to open two files in xarray

In [16]:
# Let's try smi mapped data
da1_smi = xr.open_dataset('A20110012011008.L3m_8D_CHL_chlor_a_4km.nc')

In [15]:
da1_smi.var


Out[15]:
<bound method ImplementsDatasetReduce._reduce_method.<locals>.wrapped_func of <xray.Dataset>
Dimensions:        (eightbitcolor: 256, lat: 4320, lon: 8640, rgb: 3)
Coordinates:
  * lat            (lat) float64 89.98 89.94 89.9 89.85 89.81 89.77 89.73 ...
  * lon            (lon) float64 -180.0 -179.9 -179.9 -179.9 -179.8 -179.8 ...
  * eightbitcolor  (eightbitcolor) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...
  * rgb            (rgb) int64 0 1 2
Data variables:
    chlor_a        (lat, lon) float64 nan nan nan nan nan nan nan nan nan ...
    palette        (rgb, eightbitcolor) uint8 147 0 108 144 0 111 141 0 114 ...
Attributes:
    product_name: A20110012011008.L3m_8D_CHL_chlor_a_4km.nc
    instrument: MODIS
    title: MODIS Level-3 Standard Mapped Image
    project: Ocean Biology Processing Group (NASA/GSFC/OBPG)
    platform: Aqua
    temporal_range: 8-day
    processing_version: 2014.0
    date_created: 2015-06-26T02:48:06.000Z
    history: smigen par=A20110012011008.L3m_8D_CHL_chlor_a_4km.nc.param
    l2_flag_names: ATMFAIL,LAND,HILT,HISATZEN,STRAYLIGHT,CLDICE,COCCOLITH,LOWLW,CHLWARN,CHLFAIL,NAVWARN,MAXAERITER,ATMWARN,HISOLZEN,NAVFAIL,FILTER,HIGLINT
    time_coverage_start: 2010-12-31T22:25:08.000Z
    time_coverage_end: 2011-01-09T02:45:07.000Z
    start_orbit_number: 46073
    end_orbit_number: 46190
    map_projection: Equidistant Cylindrical
    latitude_units: degrees_north
    longitude_units: degrees_east
    northernmost_latitude: 90.0
    southernmost_latitude: -90.0
    westernmost_longitude: -180.0
    easternmost_longitude: 180.0
    geospatial_lat_max: 90.0
    geospatial_lat_min: -90.0
    geospatial_lon_max: 180.0
    geospatial_lon_min: -180.0
    grid_mapping_name: latitude_longitude
    latitude_step: 0.0416667
    longitude_step: 0.0416667
    sw_point_latitude: -89.9792
    sw_point_longitude: -179.979
    geospatial_lon_resolution: 4.6
    geospatial_lat_resolution: 4.6
    geospatial_lat_units: km
    geospatial_lon_units: km
    spatialResolution: 4.60 km
    data_bins: 9257413
    number_of_lines: 4320
    number_of_columns: 8640
    measure: Mean
    data_minimum: 0.006505
    data_maximum: 99.9425
    suggested_image_scaling_minimum: 0.01
    suggested_image_scaling_maximum: 20.0
    suggested_image_scaling_type: LOG
    suggested_image_scaling_applied: No
    _lastModified: 2015-06-26T02:48:06.000Z
    Conventions: CF-1.6
    institution: NASA Goddard Space Flight Center, Ocean Ecology Laboratory, Ocean Biology Processing Group
    standard_name_vocabulary: NetCDF Climate and Forecast (CF) Metadata Convention
    Metadata_Conventions: Unidata Dataset Discovery v1.0
    naming_authority: gov.nasa.gsfc.sci.oceandata
    id: A20110012011008.L3b_8D_CHL.nc/L3/A20110012011008.L3b_8D_CHL.nc
    license: http://science.nasa.gov/earth-science/earth-science-data/data-information-policy/
    creator_name: NASA/GSFC/OBPG
    publisher_name: NASA/GSFC/OBPG
    creator_email: data@oceancolor.gsfc.nasa.gov
    publisher_email: data@oceancolor.gsfc.nasa.gov
    creator_url: http://oceandata.sci.gsfc.nasa.gov
    publisher_url: http://oceandata.sci.gsfc.nasa.gov
    processing_level: L3 Mapped
    cdm_data_type: grid
    identifier_product_doi_authority: http://dx.doi.org
    identifier_product_doi: 10.5067/AQUA/MODIS_OC.2014.0
    keywords: Oceans > Ocean Chemistry > Chlorophyll; Oceans > Ocean Optics > Ocean Color
    keywords_vocabulary: NASA Global Change Master Directory (GCMD) Science Keywords>

In [11]:


In [17]:
### Let's try to open multiple files in xarray (dask is required)
# http://xarray.pydata.org/en/stable/io.html

xr.open_mfdataset('A*.nc', concat_dim=['binDataDim', 'binIndexDim', 'binListDim'], group='level-3_binned_data')
# search code in: https://github.com/pydata/xarray/

# alright need to send a question to the xarray developer site


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-17-9a35e60a8f03> in <module>()
      2 # http://xarray.pydata.org/en/stable/io.html
      3 
----> 4 xr.open_mfdataset('A*.nc', concat_dim=['binDataDim', 'binIndexDim', 'binListDim'], group='level-3_binned_data')

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/backends/api.py in open_mfdataset(paths, chunks, concat_dim, preprocess, engine, lock, **kwargs)
    303         datasets = [preprocess(ds) for ds in datasets]
    304 
--> 305     combined = auto_combine(datasets, concat_dim=concat_dim)
    306     combined._file_obj = _MultiFileCloser(file_objs)
    307     return combined

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/combine.py in auto_combine(datasets, concat_dim)
    374     grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
    375                                 datasets).values()
--> 376     concatenated = [_auto_concat(ds, dim=concat_dim) for ds in grouped]
    377     merged = reduce(lambda ds, other: ds.merge(other), concatenated)
    378     return merged

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/combine.py in <listcomp>(.0)
    374     grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
    375                                 datasets).values()
--> 376     concatenated = [_auto_concat(ds, dim=concat_dim) for ds in grouped]
    377     merged = reduce(lambda ds, other: ds.merge(other), concatenated)
    378     return merged

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/combine.py in _auto_concat(datasets, dim)
    325                                  'explicitly')
    326             dim, = concat_dims
--> 327         return concat(datasets, dim=dim)
    328 
    329 

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
    112         raise TypeError('can only concatenate xarray Dataset and DataArray '
    113                         'objects')
--> 114     return f(objs, dim, data_vars, coords, compat, positions)
    115 
    116 

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
    266     for k in concat_over:
    267         vars = ensure_common_dims([ds.variables[k] for ds in datasets])
--> 268         combined = Variable.concat(vars, dim, positions)
    269         insert_result_variable(k, combined)
    270 

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/variable.py in concat(cls, variables, dim, positions, shortcut)
    915         # can't do this lazily: we need to loop through variables at least
    916         # twice
--> 917         variables = list(variables)
    918         first_var = variables[0]
    919 

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/combine.py in ensure_common_dims(vars)
    260                 common_shape = tuple(non_concat_dims.get(d, dim_len)
    261                                      for d in common_dims)
--> 262                 var = var.expand_dims(common_dims, common_shape)
    263             yield var
    264 

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/variable.py in expand_dims(self, dims, shape)
    713             dims_map = dict(zip(dims, shape))
    714             tmp_shape = [dims_map[d] for d in expanded_dims]
--> 715             expanded_data = ops.broadcast_to(self.data, tmp_shape)
    716         else:
    717             expanded_data = self.data[

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/ops.py in f(*args, **kwargs)
     60             else:
     61                 module = eager_module
---> 62             return getattr(module, name)(*args, **kwargs)
     63     else:
     64         def f(data, *args, **kwargs):

/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/dask/array/core.py in broadcast_to(x, shape)
   2417                            if old != 1):
   2418         raise ValueError('cannot broadcast shape %s to shape %s'
-> 2419                          % (x.shape, shape))
   2420 
   2421     name = 'broadcast_to-' + tokenize(x, shape)

ValueError: cannot broadcast shape (7654961,) to shape (1, 7680932)

In [4]:
xr.open_dataset('A20110012011008.L3b_8D_CHL.nc', group='level-3_binned_data')


Out[4]:
<xarray.Dataset>
Dimensions:      (binDataDim: 7654961, binIndexDim: 4320, binListDim: 7654961)
Coordinates:
  * binDataDim   (binDataDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * binIndexDim  (binIndexDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ...
  * binListDim   (binListDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
Data variables:
    BinList      (binListDim) {'names':['bin_num','nobs','nscenes','weights','time_rec'], 'formats':['<u4','<i2','<i2','<f4','<f4'], 'offsets':[0,4,6,8,12], 'itemsize':16, 'aligned':True} (238575, 6, 1, 2.4494898319244385, 0.0) ...
    chlor_a      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.8002867698669434, 0.2616187632083893) ...
    chl_ocx      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.8008906245231628, 0.26200050115585327) ...
    BinIndex     (binIndexDim) {'names':['start_num','begin','extent','max'], 'formats':['<u4','<u4','<u4','<u4'], 'offsets':[0,4,8,12], 'itemsize':16, 'aligned':True} (1, 0, 0, 3) ...

In [7]:
test=xr.open_dataset('A20110092011016.L3b_8D_CHL.nc', group='level-3_binned_data')
test


Out[7]:
<xarray.Dataset>
Dimensions:      (binDataDim: 7680932, binIndexDim: 4320, binListDim: 7680932)
Coordinates:
  * binDataDim   (binDataDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * binIndexDim  (binIndexDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ...
  * binListDim   (binListDim) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
Data variables:
    BinList      (binListDim) {'names':['bin_num','nobs','nscenes','weights','time_rec'], 'formats':['<u4','<i2','<i2','<f4','<f4'], 'offsets':[0,4,6,8,12], 'itemsize':16, 'aligned':True} (235149, 4, 2, 2.732050895690918, 0.0) ...
    chlor_a      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.9141194224357605, 0.31699174642562866) ...
    chl_ocx      (binDataDim) {'names':['sum','sum_squared'], 'formats':['<f4','<f4'], 'offsets':[0,4], 'itemsize':8, 'aligned':True} (0.9146864414215088, 0.31730812788009644) ...
    BinIndex     (binIndexDim) {'names':['start_num','begin','extent','max'], 'formats':['<u4','<u4','<u4','<u4'], 'offsets':[0,4,8,12], 'itemsize':16, 'aligned':True} (1, 0, 0, 3) ...

In [10]:
list(test.dims.keys())


Out[10]:
['binDataDim', 'binIndexDim', 'binListDim']

In [16]:
xr.open_mfdataset?

In [ ]: