In [28]:
%load_ext load_style
%load_style talk.css
from IPython.display import YouTubeVideo, Image
xray has been developed by scientists / engineers working at the Climate Corporation
It is an open source project and Python package that aims to bring
the labeled data power of pandas to the
physical sciences, by providing N-dimensional variants of the core
pandas data structures, Series and
DataFrame: the xray DataArray and Dataset.
the goal is to provide a pandas-like and pandas-compatible toolkit for
analytics on multi-dimensional arrays, rather than the tabular data for
which pandas excels. The approach adopts the Common Data
Model
for self-describing scientific data in widespread use in the Earth
sciences (e.g., netCDF
and OPeNDAP): xray.Dataset is an in-memory
representation of a netCDF file.
The main advantages of using xray versus netCDF4 are:
What is missing IMHO:
+ Multiple File Datasets (see netCDF4 MFDataset)
UPDATE ! (Thanks Stephan Hoyer!) you can open multiple files Datasets using `glob` and the `concat` method:
import glob
import xray
files = sorted(glob.glob('/Users/nicolasf/data/NCEP1/PAC/hgt.????.nc'))
ds = xray.concat([xray.open_dataset(f) for f in files], 'time')
You can now use the sel method to select slices along e.g. the time dimension:
ds.sel(time=slice('1980','2000'))
There's too much to see in the context of this talk, to know more about all the cool xray features, watch:
PyData talk by Stefan Hoyer: https://www.youtube.com/watch?v=T5CZyNwBa9c
In [29]:
YouTubeVideo('T5CZyNwBa9c', width=500, height=400, start=0)
Out[29]:
In [30]:
%matplotlib inline
from matplotlib import pyplot as plt
In [31]:
import os
import numpy as np
import pandas as pd
from mpl_toolkits.basemap import Basemap as bm
In [32]:
import xray; print(xray.__version__)
In [33]:
dset = xray.open_dataset(os.path.join(os.environ['HOME'], 'data/SST/ER_SST/V4',\
'ersst.realtime.nc'))
dset is a xray.Dataset, It is a dict-like container of labeled arrays (DataArray objects) with aligned dimensions. It is designed as an in-memory representation of the data model from the netCDF file format.
In [34]:
Image('http://xray.readthedocs.org/en/stable/_images/dataset-diagram.png', width=900)
Out[34]:
In [35]:
dset
Out[35]:
In [36]:
dset.dims.keys()
Out[36]:
In [37]:
dset.dims
Out[37]:
In [38]:
dset.attrs.keys()
Out[38]:
In [39]:
dset.keys()
Out[39]:
In [40]:
lat = dset['lat']
lon = dset['lon']
lons, lats = np.meshgrid(lon, lat)
In [41]:
dset.sel(time=('1998-1-15'), zlev=0)
Out[41]:
In [42]:
dset.sel(time=('1998-1-15'), zlev=0, lat=slice(-40,40))
Out[42]:
In [43]:
m = bm(projection='cyl',llcrnrlat=-90,urcrnrlat=90,\
llcrnrlon=0,urcrnrlon=360,\
lat_ts=0,resolution='c')
In [44]:
def plot_field(X, lat, lon, vmin, vmax, step, cmap=plt.get_cmap('jet'), ax=False, title=False, grid=False):
if not ax:
f, ax = plt.subplots(figsize=(10, (X.shape[0] / float(X.shape[1])) * 10))
m.ax = ax
im = m.contourf(lons, lats, X, np.arange(vmin, vmax+step, step), latlon=True, cmap=cmap, extend='both', ax=ax)
m.drawcoastlines()
if grid:
m.drawmeridians(np.arange(0, 360, 60), labels=[0,0,0,1])
m.drawparallels([-40, 0, 40], labels=[1,0,0,0])
m.colorbar(im)
if title:
ax.set_title(title)
In [45]:
plot_field(dset.sel(time=('1998-1-15'), zlev=0)['sst'], lats, lons, 0, 30, 1, grid=True)
In [46]:
plot_field(dset.sel(time=('1998-1-15'), zlev=0)['anom'], lats, lons, -4, 4, 0.1, \
cmap=plt.get_cmap('RdBu_r'), grid=True)
In [47]:
Image(filename='images/split-apply-combine.png', width=500)
Out[47]:
In [48]:
sst = dset[['sst']]
In [49]:
clim = sst.groupby('time.month').mean('time')
In [50]:
clim[['sst']]
Out[50]:
In [51]:
from calendar import month_name
In [52]:
f, axes = plt.subplots(nrows=4,ncols=3, figsize=(14,10))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten()
for i, month in enumerate(xrange(1,13)):
ax = axes[i]
plot_field(clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=month_name[month])
f.savefig('./images/clim_sst.png')
In [53]:
seas_clim = sst.groupby('time.season').mean('time')
In [54]:
seas_clim
Out[54]:
In [55]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(seas_clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=seas)
f.savefig('./images/seas_clim_sst.png')
In [56]:
def get_dpm(time):
"""
return a array of days per month corresponding to the months provided in `months`
"""
import calendar as cal
month_length = np.zeros(len(time), dtype=np.float)
for i, (month, year) in enumerate(zip(time.month, time.year)):
month_length[i] = cal.monthrange(year, month)[1]
return month_length
In [57]:
def season_mean(ds, calendar='standard'):
# Make a DataArray of season/year groups
year_season = xray.DataArray(ds.time.to_index().to_period(freq='Q-NOV').to_timestamp(how='E'),
coords=[ds.time], name='year_season')
# Make a DataArray with the number of days in each month, size = len(time)
month_length = xray.DataArray(get_dpm(ds.time.to_index()),
coords=[ds.time], name='month_length')
# Calculate the weights by grouping by 'time.season'
weights = month_length.groupby('time.season') / month_length.groupby('time.season').sum()
# Test that the sum of the weights for each season is 1.0
np.testing.assert_allclose(weights.groupby('time.season').sum().values, np.ones(4))
# Calculate the weighted average
return (ds * weights).groupby('time.season').sum(dim='time')
In [58]:
sst_seas = season_mean(sst)
In [59]:
sst_seas
Out[59]:
In [60]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(seas_clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=seas)
f.savefig('./images/seas_clim_sst.png')
In [61]:
diff_seas = seas_clim - sst_seas
In [62]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(diff_seas['sst'][i,0,:,:].values, lats, lons, -0.1, 0.1, 0.01, ax=ax, title=seas)
In [107]:
res_quarter = dset.resample('Q-FEB', dim='time', how='mean')
In [108]:
res_quarter_clim = res_quarter.groupby('time.month').mean('time')
In [109]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(seas_clim['sst'][i,0,:,:].values - res_quarter_clim['sst'][i,0,:,:].values, \
lats, lons, -10, 10, 1, ax=ax, title=seas)
something not right here, probably because a Quarter is not what I think it is ...
In [25]:
def demean(x):
return x - x.sel(time=slice('1981-1-15','2010-12-15')).mean('time')
In [28]:
sst_anoms = dset.groupby('time.month').apply(demean)
# or (will return a xray.DataArray)
# sst_anoms = dset['sst'].groupby('time.month').apply(demean)
In [ ]:
sst_anoms
In [30]:
plot_field(sst_anoms.sel(time=('1998-1-15'), zlev=0)['sst'], lats, lons, -4, 4, 0.1, \
cmap=plt.get_cmap('RdBu_r'), grid=True)
In [31]:
sst_anoms.to_netcdf('../data/ersst_anoms.nc')
In [ ]:
!ncdump -h ../data/ersst_anoms.nc
In [33]:
lon = np.linspace(0, 357.5, 144, endpoint=True)
lat = np.linspace(-90,90, 73, endpoint=True)
lons, lats = np.meshgrid(lon,lat)
lev = np.array([1000,925,850])
time = pd.date_range(start='2015-1-1',end='2015-1-3')
In [34]:
arr = np.random.randn(3,3,73,144)
The dictionnary keys are the variables contained in the Dataset.
The Dictionnary values are tuples, with first the (or the list of) dimension(s) over which the array varies, then the array itself
In [35]:
d = {}
d['time'] = ('time',time)
d['latitudes'] = ('latitudes',lat)
d['longitudes'] = ('longitudes', lon)
d['level'] = ('level', lev)
d['dummy'] = (['time','level','latitudes','longitudes'], arr)
In [36]:
dset = xray.Dataset(d)
In [37]:
dset
Out[37]:
In [38]:
dset.sel(time='2015-1-2', level=1000)
Out[38]:
In [39]:
lons, lats = np.meshgrid(dset['longitudes'], dset['latitudes'])
In [40]:
plot_field(dset.sel(time='2015-1-2', level=1000)['dummy'], \
lats, lons, -4, 4, 0.1, grid=True)
In [42]:
dset.to_netcdf('../outputs/dset_from_dict.nc')
In [43]:
!ncdump -h ../outputs/dset_from_dict.nc
In [ ]:
import string
df = pd.DataFrame(np.random.randn(365,5), \
index=pd.date_range(start='2014-1-1', periods=365), \
columns=list(string.ascii_letters[:5]))
In [ ]:
df.head()
In [ ]:
df_ds = xray.Dataset.from_dataframe(df)
In [ ]:
df_ds
In [ ]:
group = df_ds.groupby('index.month').mean('index')
In [ ]:
group
In [ ]:
group_df = group.to_dataframe()
In [ ]:
group_df.reindex_axis(list(string.ascii_letters[:5]), axis=1).head()
In [ ]:
df.groupby(df.index.month).mean().head()
In [32]:
url = 'http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/interp_OLR/olr.day.mean.nc'
In [33]:
olr_dset = xray.open_dataset(url)
In [34]:
olr_dset
Out[34]:
In [35]:
olr_sub = olr_dset.sel(time='1998-1-1',lat=slice(30,-30), lon=slice(170, 300))
In [36]:
olr_sub
Out[36]:
In [37]:
m = bm(projection='merc',llcrnrlat=-30,urcrnrlat=30,\
llcrnrlon=170,urcrnrlon=300,\
lat_ts=0,resolution='c')
In [38]:
lons, lats = np.meshgrid(olr_sub['lon'], olr_sub['lat'])
In [39]:
plot_field(olr_sub['olr'].values, lats, lons, 80, 300, 10, grid=True)