In [1]:
import xarray as xr
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
from dask.diagnostics import ProgressBar
import seaborn as sns
from matplotlib.colors import LogNorm
We already downloaded a subsetted MODIS-Aqua chlorophyll-a dataset for the Arabian Sea.
We can read all the netCDF files into one xarray Dataset using the open_mfdataset
function. Note that this does not load the data into memory yet; that only happens when we try to access the values.
In [2]:
ds_8day = xr.open_mfdataset('./data_collector_modisa_chla9km/ModisA_Arabian_Sea_chlor_a_9km_*_8D.nc')
ds_daily = xr.open_mfdataset('./data_collector_modisa_chla9km/ModisA_Arabian_Sea_chlor_a_9km_*_D.nc')
both_datasets = [ds_8day, ds_daily]
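As a quick sanity check that the open really is lazy, we can inspect the type of the array backing chlor_a. This is just an illustrative sketch: open_mfdataset wraps each variable in a chunked dask array until the data are loaded.
print(type(ds_8day.chlor_a.data))  # dask.array.core.Array until we call .load()
print(ds_8day.chlor_a.chunks)      # chunk layout, typically one chunk per input file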
How much data is contained here? Let's get the answer in MB.
In [3]:
print([(ds.nbytes / 1e6) for ds in both_datasets])
The 8-day dataset is ~534 MB, while the daily dataset is ~4.2 GB. Both fit comfortably in RAM, so let's load them into memory.
In [4]:
[ds.load() for ds in both_datasets]
Out[4]:
In [5]:
def fix_bad_data(ds):
    # For some reason the cloud / land mask is inverted on some granules.
    # This is obvious because chlorophyll values can never be negative.
    bad_data = ds.chlor_a.groupby('time').min() < 0
    # loop through the offending time steps and mask out the negative values
    for n in np.nonzero(bad_data.values)[0]:
        data = ds.chlor_a[n].values
        ds.chlor_a.values[n] = np.ma.masked_less(data, 0).filled(np.nan)
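For reference, the same cleanup could be done without the explicit loop. A minimal sketch (not the code we run here) using DataArray.where, whose one-argument form fills NaN wherever the condition is False:
# vectorized alternative to fix_bad_data (sketch, commented out):
# for ds in both_datasets:
#     ds['chlor_a'] = ds.chlor_a.where(ds.chlor_a >= 0)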
In [15]:
[fix_bad_data(ds) for ds in both_datasets]
Out[15]:
In [16]:
ds_8day.chlor_a>0
Out[16]:
In [17]:
(ds_8day.chlor_a>0).sum(dim='time').plot()
Out[17]:
In [18]:
# build a mask for the ocean (True wherever chlorophyll was ever observed)
ocean_mask = (ds_8day.chlor_a > 0).sum(dim='time') > 0
#ocean_mask = (ds_daily.chlor_a > 0).sum(dim='time') > 0
num_ocean_points = ocean_mask.sum().values  # count the total number of ocean points
ocean_mask.plot()
plt.title('%g total ocean points' % num_ocean_points)
Out[18]:
In [19]:
#ds_8day
In [20]:
#ds_daily
In [21]:
plt.figure(figsize=(8,6))
ds_daily.chlor_a.sel(time='2002-11-18',method='nearest').plot(norm=LogNorm())
#ds_daily.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[21]:
In [22]:
#list(ds_daily.groupby('time')) # take a look at what's inside
Now we count up the number of valid points in each snapshot and divide by the total number of ocean points.
In [23]:
# For reference, the daily dataset has dimensions:
#   <xarray.Dataset>
#   Dimensions: (eightbitcolor: 256, lat: 144, lon: 276, rgb: 3, time: 4748)
ds_daily.groupby('time').count()  # count of valid (non-NaN) points at each time
Out[23]:
In [24]:
ds_daily.chlor_a.groupby('time').count()/float(num_ocean_points)
Out[24]:
In [25]:
count_8day, count_daily = [ds.chlor_a.groupby('time').count() / float(num_ocean_points)
                           for ds in (ds_8day, ds_daily)]
In [26]:
#count_8day = ds_8day.chl_ocx.groupby('time').count()/float(num_ocean_points)
#count_daily = ds_daily.chl_ocx.groupby('time').count()/float(num_ocean_points)
#count_8day, count_daily = [ds.chl_ocx.groupby('time').count()/float(num_ocean_points)
#                           for ds in ds_8day, ds_daily]  # tuple without parentheses does not work in Python 3
In [27]:
plt.figure(figsize=(12,4))
count_8day.plot(color='k')
count_daily.plot(color='r')
plt.legend(['8 day','daily'])
Out[27]:
In [28]:
count_8day_clim, count_daily_clim = [count.groupby('time.month').mean()  # monthly climatology
                                     for count in (count_8day, count_daily)]
In [30]:
# monthly climatology of the fraction of valid ocean points
plt.figure(figsize=(12,4))
count_8day_clim.plot(color='k')
count_daily_clim.plot(color='r')
plt.legend(['8 day', 'daily'])
Out[30]:
In [31]:
target_date = '2003-02-15'
plt.figure(figsize=(8,6))
ds_8day.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[31]:
In [32]:
plt.figure(figsize=(8,6))
ds_daily.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[32]:
In [33]:
ds_daily.chlor_a[0].sel_points(lon=[65, 70], lat=[16, 18], method='nearest')  # the [0] already selects the time
#ds_daily.chl_ocx[0].sel_points(time=times, lon=lons, lat=lats, method='nearest')
Out[33]:
In [34]:
#ds_daily.chlor_a.sel_points?
In [35]:
ds_15day = ds_daily.resample('15D', dim='time')
ds_15day
Out[35]:
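Note that resample('15D', dim='time') is the older xarray signature. In recent xarray (>= 0.12, an assumption about versions) the equivalent call is keyword-based and requires an explicit reduction:
# modern equivalent of the resample above (sketch, commented out):
# ds_15day = ds_daily.resample(time='15D').mean()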
In [48]:
plt.figure(figsize=(8,6))
ds_15day.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[48]:
In [47]:
# check the minimum longitude and latitude
print(ds_15day.lon.min(), '\n', ds_15day.lat.min())
In [38]:
# In the following we work with data from the GDP (Global Drifter Program) surface floats.
from buyodata import buoydata
import os
In [39]:
# a list of files
fnamesAll = ['./gdp_float/buoydata_1_5000.dat',
             './gdp_float/buoydata_5001_10000.dat',
             './gdp_float/buoydata_10001_15000.dat',
             './gdp_float/buoydata_15001_jun16.dat']
In [40]:
# read the files and concatenate them into one DataFrame
dfAll = pd.concat([buoydata.read_buoy_data(f) for f in fnamesAll])  # takes around 4-5 minutes
#mask = df.time>='2002-07-04'  # we only have chlor_a data after this date
dfvvAll = dfAll[dfAll.time >= '2002-07-04']
sum(dfvvAll.time < '2002-07-04')  # recheck: no records before the cutoff should remain
Out[40]:
In [41]:
# wrap the longitudes so that they are all > 0
print('before processing, the minimum longitude is %4.3f and maximum is %4.3f' % (dfvvAll.lon.min(), dfvvAll.lon.max()))
mask = dfvvAll.lon < 0
dfvvAll.loc[mask, 'lon'] = dfvvAll.loc[mask].lon + 360
print('after processing, the minimum longitude is %4.3f and maximum is %4.3f' % (dfvvAll.lon.min(), dfvvAll.lon.max()))
dfvvAll.describe()
Out[41]:
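The same wrap can be written branch-free with the modulo operator, since % returns a non-negative result for a positive divisor; a one-line sketch equivalent to the mask above:
# branch-free alternative (sketch, commented out):
# dfvvAll['lon'] = dfvvAll.lon % 360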
In [42]:
# select only the Arabian Sea region
arabian_sea = (dfvvAll.lon > 45) & (dfvvAll.lon < 75) & (dfvvAll.lat > 5) & (dfvvAll.lat < 28)
# arabian_sea = {'lon': slice(45, 75), 'lat': slice(5, 28)}  # the same box, for later use with xarray
floatsAll = dfvvAll.loc[arabian_sea]  # apply the mask directly
print('dfvvAll.shape is %s, floatsAll.shape is %s' % (dfvvAll.shape, floatsAll.shape))
In [43]:
# avoid running this cell repeatedly -- the scatter plots are slow
# visualize the floats globally
fig, ax = plt.subplots(figsize=(12,10))
dfvvAll.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)
# visualize the floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsAll.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)
Out[43]:
In [44]:
# A pandas DataFrame cannot do this resampling directly:
# we are really indexing on ['time', 'id'], and pandas.DataFrame.resample raises
# "TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex,
#  but got an instance of 'MultiIndex'"
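For reference, a pandas-only workaround (a sketch, not what we do here) is to resample each float id separately via groupby, at the cost of reducing every column with the same statistic:
# hypothetical pandas-only route, per-id resample then recombine:
# resampled = (floatsAll.set_index('time')
#                       .groupby('id')
#                       .resample('15D')
#                       .mean())
Instead we route through xarray below, whose resample handles the extra 'id' dimension naturally.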
In [45]:
# convert the surface drifter data from a pandas DataFrame to an xarray Dataset
floatsDSAll = xr.Dataset.from_dataframe(floatsAll.set_index(['time','id']))  # set time & id as the index; use reset_index to revert
floatsDSAll
Out[45]:
In [46]:
# resample the xarray Dataset onto a 15-day frequency
floatsDSAll_15D = floatsDSAll.resample('15D', dim='time')
floatsDSAll_15D
Out[46]:
In [49]:
# convert it back to a pandas DataFrame for plotting
floatsDFAll_15D = floatsDSAll_15D.to_dataframe()
floatsDFAll_15D = floatsDFAll_15D.reset_index()
floatsDFAll_15D
# visualize the subsampled floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAll_15D.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)
Out[49]:
In [50]:
# look up the chlorophyll value for each data entry
floatsDFAll_15Dtimeorder = floatsDFAll_15D.sort_values(['time','id'], ascending=True)
floatsDFAll_15Dtimeorder  # check that it is time ordered
# should we drop NaNs to speed this up?
Out[50]:
In [51]:
floatsDFAll_15Dtimeorder.lon.dropna().shape  # the longitude column has many valid values: (2195,)
Out[51]:
In [52]:
# a little test of the loop API for the DataFrame
# see df.itertuples? -- it is faster and preserves the dtypes
'''
chl_ocx = []
for row in floats_timeorder.itertuples():
    #print(row)
    #print('row.time = %s, row.id=%d, row.lon=%4.3f, row.lat=%4.3f' % (row.time, row.id, row.lon, row.lat))
    tmp = ds_2day.chl_ocx.sel_points(time=[row.time], lon=[row.lon], lat=[row.lat], method='nearest')  # nearest-neighbor lookup
    chl_ocx.append(tmp)
floats_timeorder['chl_ocx'] = pd.Series(chl_ocx, index=floats_timeorder.index)
chl_ocx[0].to_series
'''
Out[52]:
In [53]:
# this one line avoids the loop above
# the nearest-neighbor selection over every point is slow -- it takes about an hour
tmpAll = ds_15day.chlor_a.sel_points(time=list(floatsDFAll_15Dtimeorder.time),
                                     lon=list(floatsDFAll_15Dtimeorder.lon),
                                     lat=list(floatsDFAll_15Dtimeorder.lat), method='nearest')
print('the count of NaN values in tmpAll is', tmpAll.to_series().isnull().sum())
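sel_points was removed in later xarray releases. For reference, the modern equivalent (a sketch, assuming xarray >= 0.10) is vectorized indexing with DataArray indexers that share a common dimension:
# modern vectorized-indexing equivalent of the sel_points call above (sketch):
# pts = ds_15day.chlor_a.sel(
#     time=xr.DataArray(list(floatsDFAll_15Dtimeorder.time), dims='points'),
#     lon=xr.DataArray(list(floatsDFAll_15Dtimeorder.lon), dims='points'),
#     lat=xr.DataArray(list(floatsDFAll_15Dtimeorder.lat), dims='points'),
#     method='nearest')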
In [54]:
#print(tmpAll.dropna().shape)
tmpAll.to_series().dropna().shape # (1296,) good values
Out[54]:
In [56]:
# use .to_series() to convert from an xarray DataArray to a pandas Series
floatsDFAll_15Dtimeorder['chlor_a'] = pd.Series(np.array(tmpAll.to_series()), index=floatsDFAll_15Dtimeorder.index)
print("after editing the dataframe the NaN count in 'chlor_a' is", floatsDFAll_15Dtimeorder.chlor_a.isnull().sum())  # should match the count above
# take a look at the data
floatsDFAll_15Dtimeorder
# visualize the floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAll_15Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chlor_a', cmap='RdBu_r', edgecolor='none', ax=ax)
def scale(x):
    return np.log10(x)
#print(floatsAll_timeorder['chlor_a'].apply(scale))
floatsDFAll_15Dtimeorder['chlor_a_log10'] = floatsDFAll_15Dtimeorder['chlor_a'].apply(scale)
floatsDFAll_15Dtimeorder
#print("after the transformation the NaN count in 'chlor_a_log10' is", floatsAll_timeorder.chlor_a_log10.isnull().sum())
# visualize again on a log10 scale
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAll_15Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chlor_a_log10', cmap='RdBu_r', edgecolor='none', ax=ax)
#floatsDFAll_15Dtimeorder.chlor_a.dropna().shape  # (1296,)
floatsDFAll_15Dtimeorder.chlor_a_log10.dropna().shape  # (1296,)
Out[56]:
In [57]:
# take the time difference of chlor_a; this has to be done in xarray,
# so convert the DataFrame into an xarray Dataset again and take the difference
floatsDFAll_15Dtimeorder
Out[57]:
In [58]:
# unstack() would give a 2-D DataFrame
# reset_index() moves all the index levels back to columns
In [61]:
# put the data into a Dataset, ready to take the diff
tmp = xr.Dataset.from_dataframe(floatsDFAll_15Dtimeorder.set_index(['time','id']))  # set time & id as the index; use reset_index to revert
# take the diff of chlor_a along time
chlor_a_rate = tmp.diff(dim='time', n=1).chlor_a.to_series().reset_index()
# give the column a proper name
chlor_a_rate.rename(columns={'chlor_a': 'chl_rate'}, inplace=True)
chlor_a_rate
# merge the two DataFrames {floatsDFAll_15Dtimeorder, chlor_a_rate} on the {time, id} keys with a left join
floatsDFAllRate_15Dtimeorder = pd.merge(floatsDFAll_15Dtimeorder, chlor_a_rate, on=['time','id'], how='left')
floatsDFAllRate_15Dtimeorder
# check that the merge preserved the values
print('check the sum of chl_rate before the merge', chlor_a_rate.chl_rate.sum())
print('check the sum of chl_rate after the merge', floatsDFAllRate_15Dtimeorder.chl_rate.sum())
# visualize the chlorophyll rate; it is *better* to visualize at this scale
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAllRate_15Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.8, vmax=0.8, edgecolor='none', ax=ax)
# visualize the chlorophyll rate on the log scale
floatsDFAllRate_15Dtimeorder['chl_rate_log10'] = floatsDFAllRate_15Dtimeorder['chl_rate'].apply(scale)
floatsDFAllRate_15Dtimeorder
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAllRate_15Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chl_rate_log10', cmap='RdBu_r', edgecolor='none', ax=ax)
floatsDFAllRate_15Dtimeorder.chl_rate.dropna().shape  # (853,) data points
#floatsDFAllRate_15Dtimeorder.chl_rate_log10.dropna().shape  # (345,) data points... note: chl_rate can be negative, so do not take log10
Out[61]:
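Keep in mind that chl_rate is the raw difference between consecutive 15-day samples, i.e. a change per 15 days. To express it as an approximate per-day rate (a trivial sketch; the column name is ours), divide by the interval:
# optional normalization to a per-day rate (sketch, commented out):
# floatsDFAllRate_15Dtimeorder['chl_rate_per_day'] = floatsDFAllRate_15Dtimeorder.chl_rate / 15.0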
In [62]:
pd.to_datetime(floatsDFAllRate_15Dtimeorder.time)
type(pd.to_datetime(floatsDFAllRate_15Dtimeorder.time))
ts = pd.Series(0, index=pd.to_datetime(floatsDFAllRate_15Dtimeorder.time))  # create a dummy time series for masking purposes
# extract the month
month = ts.index.month
# month.shape  # a check on the shape of month
selector = ((11 == month) | (12 == month) | (1 == month) | (2 == month) | (3 == month))
selector
print('shape of the selector', selector.shape)
print('all the data count in [11-01, 03-31] is', floatsDFAllRate_15Dtimeorder[selector].chl_rate.dropna().shape)  # total (500,)
print('all the data count is', floatsDFAllRate_15Dtimeorder.chl_rate.dropna().shape)  # total (853,)
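A more compact way to build the same winter mask (a sketch, assuming a pandas version with the .dt accessor) is isin on the month:
# equivalent month mask (sketch, commented out):
# months = pd.to_datetime(floatsDFAllRate_15Dtimeorder.time).dt.month
# selector = months.isin([11, 12, 1, 2, 3]).values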
In [63]:
# histogram of the non-standardized data
axfloat = floatsDFAllRate_15Dtimeorder[selector].chl_rate.dropna().hist(bins=100, range=[-0.3, 0.3])
axfloat.set_title('15-Day chl_rate')
Out[63]:
In [64]:
# standardized series
ts = floatsDFAllRate_15Dtimeorder[selector].chl_rate.dropna()
ts_standardized = (ts - ts.mean()) / ts.std()
axts = ts_standardized.hist(bins=100, range=[-0.3, 0.3])
axts.set_title('15-Day standardized chl_rate')
Out[64]:
In [65]:
# all the data, one panel per year
fig, axes = plt.subplots(nrows=8, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i, ax in zip(range(2002, 2017), axes.flat):
    tmpyear = floatsDFAllRate_15Dtimeorder[(floatsDFAllRate_15Dtimeorder.time > str(i)) & (floatsDFAllRate_15Dtimeorder.time < str(i+1))]  # year i only
    #fig, ax = plt.subplots(figsize=(12,10))
    print(tmpyear.chl_rate.dropna().shape)  # the yearly counts sum to 1061
    tmpyear.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.6, vmax=0.6, edgecolor='none', ax=ax)
    ax.set_title('year %g' % i)
# remove the unused extra panel
ax = plt.subplot(8, 2, 16)
fig.delaxes(ax)
In [66]:
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i, ax in zip(range(2002, 2016), axes.flat):
    # select only the winter months, Nov 1 to Mar 31, of year i
    tmpyear = floatsDFAllRate_15Dtimeorder[(floatsDFAllRate_15Dtimeorder.time >= (str(i) + '-11-01')) & (floatsDFAllRate_15Dtimeorder.time <= (str(i+1) + '-03-31'))]
    #fig, ax = plt.subplots(figsize=(12,10))
    print(tmpyear.chl_rate.dropna().shape)  # the yearly counts sum to 692
    tmpyear.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.6, vmax=0.6, edgecolor='none', ax=ax)
    ax.set_title('year %g' % i)
In [67]:
# write the data to disk as a CSV or HDF file to save experiment time later
df_list = []
for i in range(2002, 2017):
    # select only the winter months, Nov 1 to Mar 31, of year i
    tmpyear = floatsDFAllRate_15Dtimeorder[(floatsDFAllRate_15Dtimeorder.time >= (str(i) + '-11-01')) & (floatsDFAllRate_15Dtimeorder.time <= (str(i+1) + '-03-31'))]
    df_list.append(tmpyear)
df_tmp = pd.concat(df_list)
print('all the data count in [11-01, 03-31] is ', df_tmp.chl_rate.dropna().shape)  # again, the total is (500,)
df_chl_out_15D_modisa = df_tmp[~df_tmp.chl_rate.isnull()]  # keep only the non-NaN values
#list(df_chl_out_15D_modisa.groupby(['id']))  # shows the continuity of the Lagrangian differences for each float id
# output to a CSV or HDF file
df_chl_out_15D_modisa.head()
Out[67]:
In [68]:
df_chl_out_15D_modisa.index.name = 'index'  # give the index an explicit name
# write a CSV with the explicit index
df_chl_out_15D_modisa.to_csv('df_chl_out_15D_modisa.csv', sep=',', index_label='index')
# load the CSV back to verify the output
test = pd.read_csv('df_chl_out_15D_modisa.csv', index_col='index')
test.head()
Out[68]: