In [2]:
import xarray as xr
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
from dask.diagnostics import ProgressBar
import seaborn as sns
from matplotlib.colors import LogNorm
In [3]:
ds_8day = xr.open_mfdataset('./data_collector_modisa_chla9km/ModisA_Arabian_Sea_chlor_a_9km_*_8D.nc')
ds_daily = xr.open_mfdataset('./data_collector_modisa_chla9km/ModisA_Arabian_Sea_chlor_a_9km_*_D.nc')
both_datasets = [ds_8day, ds_daily]
In [4]:
# how much data is contained here? let's get the answer in MB
print([(ds.nbytes / 1e6) for ds in both_datasets])
In [5]:
# load all the data into memory
[ds.load() for ds in both_datasets]
Out[5]:
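ProgressBar was imported above but never used here; a small sketch, equivalent to the cell above, of wrapping the load step so the dask reads report progress:

# optional: watch the lazy dask reads complete while loading into memory
with ProgressBar():
    for ds in both_datasets:
        ds.load()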
In [16]:
# fix bad data
def fix_bad_data(ds):
    # for some reason, the cloud / land mask is backwards on some data;
    # this is obvious because there are chlorophyll values less than zero
    bad_data = ds.chlor_a.groupby('time').min() < 0
    # loop through the affected time steps and mask out the negative values
    for n in np.nonzero(bad_data.values)[0]:
        data = ds.chlor_a[n].values
        ds.chlor_a.values[n] = np.ma.masked_less(data, 0).filled(np.nan)
In [17]:
[fix_bad_data(ds) for ds in both_datasets]
Out[17]:
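A quick sanity check (not in the original) that the mask fix removed every negative value:

# after fix_bad_data, the global minimum of chlor_a should be non-negative
for ds in both_datasets:
    assert float(ds.chlor_a.min()) >= 0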
In [19]:
ds_8day.chlor_a>0 # mask
Out[19]:
In [20]:
# count the number of ocean data points
(ds_8day.chlor_a>0).sum(dim='time').plot()
Out[20]:
In [21]:
# build a mask for the ocean (True) vs. land (False)
ocean_mask = (ds_8day.chlor_a>0).sum(dim='time')>0
#ocean_mask = (ds_daily.chl_ocx>0).sum(dim='time')>0
num_ocean_points = ocean_mask.sum().values # total number of ocean (non-land) grid points
ocean_mask.plot()
plt.title('%g total ocean points' % num_ocean_points)
Out[21]:
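The mask can be reused downstream; a small sketch (chl_ocean_only is an illustrative name, not in the original) of blanking out land with xarray's .where before averaging:

# land points become NaN, so they drop out of subsequent reductions
chl_ocean_only = ds_8day.chlor_a.where(ocean_mask)
chl_ocean_only.mean(dim=('lat', 'lon')).plot()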
In [22]:
#ds_8day
In [23]:
#ds_daily
In [24]:
ds_daily.chlor_a.sel(time='2002-11-18',method='nearest').plot(norm=LogNorm())
#ds_daily.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[24]:
In [25]:
#list(ds_daily.groupby('time')) # take a look at what's inside
In [26]:
'''
<xarray.Dataset>
Dimensions: (eightbitcolor: 256, lat: 144, lon: 276, rgb: 3, time: 4748)
'''
ds_daily.groupby('time').count() # count of valid (non-NaN) points at each time step of the original data
Out[26]:
In [27]:
ds_daily.chlor_a.groupby('time').count()/float(num_ocean_points) # fraction of ocean points with valid data at each time step
Out[27]:
In [28]:
count_8day, count_daily = [ds.chlor_a.groupby('time').count()/float(num_ocean_points)
for ds in (ds_8day, ds_daily)]
In [29]:
plt.figure(figsize=(12,4))
count_8day.plot(color='k')
count_daily.plot(color='r')
plt.legend(['8 day','daily'])
Out[29]:
In [30]:
# Seasonal Climatology
count_8day_clim, count_daily_clim = [count.groupby('time.month').mean() # monthly climatology
                                     for count in (count_8day, count_daily)]
In [31]:
# climatological mean of the monthly data-coverage counts
plt.figure(figsize=(12,4))
count_8day_clim.plot(color='k')
count_daily_clim.plot(color='r')
plt.legend(['8 day', 'daily'])
Out[31]:
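To see how representative these climatological means are, the month-to-month spread can be plotted the same way; a sketch (count_8day_std is an illustrative name, not in the original):

# standard deviation of coverage within each calendar month
count_8day_std = count_8day.groupby('time.month').std()
plt.figure(figsize=(12,4))
count_8day_std.plot(color='k')
plt.title('spread of 8-day coverage by month')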
In [32]:
# Maps of individual days
target_date = '2003-02-15'
plt.figure(figsize=(8,6))
ds_8day.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[32]:
In [33]:
plt.figure(figsize=(8,6))
ds_daily.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[33]:
In [34]:
ds_daily.chlor_a[0].sel_points(lon=[65, 70], lat=[16, 18], method='nearest') # the time is selected!
#ds_daily.chlor_a[0].sel_points(time= times, lon=lons, lat=times, method='nearest')
Out[34]:
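Note that sel_points was deprecated in xarray 0.10 and removed in later releases; the modern equivalent is vectorized (pointwise) indexing, where coordinate DataArrays sharing a common dimension are paired element-wise. A sketch, assuming a newer xarray:

# pointwise selection: lons[i] is matched with lats[i] along the 'points' dim
lons = xr.DataArray([65, 70], dims='points')
lats = xr.DataArray([16, 18], dims='points')
ds_daily.chlor_a[0].sel(lon=lons, lat=lats, method='nearest')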
In [35]:
#ds_daily.chlor_a.sel_points?
In [36]:
#ds_9day = ds_daily.resample('9D', dim='time')
ds_8day # just use what we have... this is from the OceanColor website
Out[36]:
In [38]:
plt.figure(figsize=(8,6))
ds_8day.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())
Out[38]:
In [39]:
# check the minimum longitude and latitude
print(ds_8day.lon.min(), '\n', ds_8day.lat.min())
In [40]:
# in the following we deal with the data from the GDP (Global Drifter Program) floats
from buyodata import buoydata
import os
In [41]:
# a list of files
fnamesAll = ['./gdp_float/buoydata_1_5000.dat',
             './gdp_float/buoydata_5001_10000.dat',
             './gdp_float/buoydata_10001_15000.dat',
             './gdp_float/buoydata_15001_jun16.dat']
In [42]:
# read them and concatenate them into one DataFrame
dfAll = pd.concat([buoydata.read_buoy_data(f) for f in fnamesAll]) # around 4~5 minutes
# we only have chlor_a data after 2002-07-04, so drop everything before that date
dfvvAll = dfAll[dfAll.time >= '2002-07-04']
sum(dfvvAll.time < '2002-07-04') # recheck: the count of records before the cutoff should be 0
Out[42]:
In [43]:
# process the data so that the longitudes are all > 0
print('before processing, the minimum longitude is %.3f and maximum is %.3f' % (dfvvAll.lon.min(), dfvvAll.lon.max()))
mask = dfvvAll.lon < 0
dfvvAll.loc[mask, 'lon'] = dfvvAll.loc[mask, 'lon'] + 360 # .loc avoids pandas' chained-assignment warning
print('after processing, the minimum longitude is %.3f and maximum is %.3f' % (dfvvAll.lon.min(), dfvvAll.lon.max()))
dfvvAll.describe()
Out[43]:
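The wrap used above can be captured in a small helper for reuse; a sketch (wrap_lon_360 is an illustrative name, not in the original):

# map longitudes from [-180, 180) to [0, 360) without mutating the frame
def wrap_lon_360(lon):
    return np.where(lon < 0, lon + 360, lon)
# usage: dfvvAll['lon'] = wrap_lon_360(dfvvAll['lon'].values)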
In [44]:
# select only the Arabian Sea region
arabian_sea = (dfvvAll.lon > 45) & (dfvvAll.lon < 75) & (dfvvAll.lat > 5) & (dfvvAll.lat < 28)
# arabian_sea = {'lon': slice(45,75), 'lat': slice(5,28)} # later use this longitude and latitude
floatsAll = dfvvAll.loc[arabian_sea] # apply the boolean mask directly
print('dfvvAll.shape is %s, floatsAll.shape is %s' % (dfvvAll.shape, floatsAll.shape))
# visualize the floats over the global region
fig, ax = plt.subplots(figsize=(12,10))
dfvvAll.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)
# visualize the floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsAll.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)
Out[44]:
In [45]:
# convert the surface drifter data from a pandas DataFrame to an xarray Dataset
floatsDSAll = xr.Dataset.from_dataframe(floatsAll.set_index(['time','id'])) # set (time, id) as the index; use reset_index to revert this operation
floatsDSAll
Out[45]:
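For reference, the round trip back to a flat DataFrame (the reset_index mentioned above) looks like this; a sketch:

# Dataset -> MultiIndexed DataFrame -> flat columns again
floatsDSAll.to_dataframe().reset_index().head()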
In [46]:
# resample the xarray Dataset onto an eight-day frequency
floatsDSAll_8D = floatsDSAll.resample('8D', dim='time')
floatsDSAll_8D
Out[46]:
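The resample(freq, dim=...) signature above belongs to older xarray; recent versions use a groupby-like API instead. A sketch, assuming a newer xarray (floatsDSAll_8D_new is an illustrative name):

# modern equivalent: bin into 8-day windows, then reduce each bin with its mean
floatsDSAll_8D_new = floatsDSAll.resample(time='8D').mean()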
In [47]:
# convert it back to a pandas DataFrame for plotting
floatsDFAll_8D = floatsDSAll_8D.to_dataframe()
floatsDFAll_8D
floatsDFAll_8D = floatsDFAll_8D.reset_index()
floatsDFAll_8D
# visualize the 8-day-resampled floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAll_8D.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)
Out[47]:
In [48]:
# get the chlorophyll value for each data entry
floatsDFAll_8Dtimeorder = floatsDFAll_8D.sort_values(['time','id'], ascending=True)
floatsDFAll_8Dtimeorder # check whether it is time-ordered
# should we drop NaNs to speed up?
Out[48]:
In [49]:
floatsDFAll_8Dtimeorder.lon.dropna().shape # the longitude data has lots of values (3855,)
Out[49]:
In [50]:
mask = floatsDFAll_8Dtimeorder.lon.isnull() | floatsDFAll_8Dtimeorder.lat.isnull() | floatsDFAll_8Dtimeorder.time.isnull()
mask
floatsDFAll_8Dtimeorder[~mask].shape # the {long, lat, time} data has lots of values (3855,)
Out[50]:
In [51]:
tmpAll = ds_8day.chlor_a.sel_points(time=list(floatsDFAll_8Dtimeorder.time),lon=list(floatsDFAll_8Dtimeorder.lon), lat=list(floatsDFAll_8Dtimeorder.lat), method='nearest')
print('the count of NaN values in tmpAll is', tmpAll.to_series().isnull().sum())
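On newer xarray, where sel_points is unavailable, the same satellite-to-float match-up can be written with vectorized indexing; a sketch (valid, pts, and chl_at_floats are illustrative names). Note the original deliberately keeps NaN rows so the result stays aligned with the DataFrame index, whereas this sketch drops them first, since method='nearest' cannot look up NaN coordinates:

# pointwise match-up of float positions against the nearest satellite pixel
valid = floatsDFAll_8Dtimeorder.dropna(subset=['time', 'lon', 'lat'])
pts = dict(time=xr.DataArray(pd.to_datetime(valid.time.values), dims='points'),
           lon=xr.DataArray(valid.lon.values, dims='points'),
           lat=xr.DataArray(valid.lat.values, dims='points'))
chl_at_floats = ds_8day.chlor_a.sel(**pts, method='nearest')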
In [52]:
#print(tmpAll.dropna().shape)
tmpAll.to_series().dropna().shape # (1741,) good values
Out[52]:
In [53]:
# tmpAll.to_series() converts the xarray DataArray to a pandas Series
floatsDFAll_8Dtimeorder['chlor_a'] = pd.Series(np.array(tmpAll.to_series()), index=floatsDFAll_8Dtimeorder.index)
print("after editing the dataframe, the NaN count in 'chlor_a' is", floatsDFAll_8Dtimeorder.chlor_a.isnull().sum()) # should match the count above
# take a look at the data
floatsDFAll_8Dtimeorder
# visualize the floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAll_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chlor_a', cmap='RdBu_r', edgecolor='none', ax=ax)
Out[53]:
In [54]:
def scale(x):
    logged = np.log10(x)
    return logged
In [59]:
#print(floatsAll_timeorder['chlor_a'].apply(scale))
floatsDFAll_8Dtimeorder['chlor_a_log10'] = floatsDFAll_8Dtimeorder['chlor_a'].apply(scale)
floatsDFAll_8Dtimeorder
#print("after the transformation the nan values in 'chlor_a_log10' is", floatsAll_timeorder.chlor_a_log10.isnull().sum() )
# visualize the floats in the Arabian Sea region
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAll_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chlor_a_log10', cmap='RdBu_r', edgecolor='none', ax=ax)
floatsDFAll_8Dtimeorder.chlor_a.dropna().shape # (1741,)
#floatsDFAll_8Dtimeorder.chlor_a_log10.dropna().shape # (1741,)
Out[59]:
In [56]:
# take the diff of chlor_a, and this has to be done in xarray
# convert the DataFrame into an xarray Dataset again,
# then take the difference
floatsDFAll_8Dtimeorder
Out[56]:
In [57]:
# unstack() will provide a 2d dataframe
# reset_index() will reset all the index as columns
In [60]:
# prepare the data as an xarray Dataset before taking the diff
tmp = xr.Dataset.from_dataframe(floatsDFAll_8Dtimeorder.set_index(['time','id'])) # set (time, id) as the index; use reset_index to revert this operation
# take the diff of chlor_a along time
chlor_a_rate = tmp.diff(dim='time', n=1).chlor_a.to_series().reset_index()
# rename the column to a proper name
chlor_a_rate.rename(columns={'chlor_a': 'chl_rate'}, inplace=True)
chlor_a_rate
# merge the two dataframes {floatsDFAll_8Dtimeorder, chlor_a_rate} into one based on the index {id, time}, using a left join
floatsDFAllRate_8Dtimeorder = pd.merge(floatsDFAll_8Dtimeorder, chlor_a_rate, on=['time','id'], how='left')
floatsDFAllRate_8Dtimeorder
# check
print('check the sum of chl_rate before the merge', chlor_a_rate.chl_rate.sum())
print('check the sum of chl_rate after the merge', floatsDFAllRate_8Dtimeorder.chl_rate.sum())
# visualize the chlorophyll rate, it is *better* to visualize at this scale
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAllRate_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.8, vmax=0.8, edgecolor='none', ax=ax)
# visualize the chlorophyll rate on the log scale
floatsDFAllRate_8Dtimeorder['chl_rate_log10'] = floatsDFAllRate_8Dtimeorder['chl_rate'].apply(scale)
floatsDFAllRate_8Dtimeorder
fig, ax = plt.subplots(figsize=(12,10))
floatsDFAllRate_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chl_rate_log10', cmap='RdBu_r', edgecolor='none', ax=ax)
#floatsDFAllRate_8Dtimeorder.chl_rate.dropna().shape # (1050,) data points
floatsDFAllRate_8Dtimeorder.chl_rate_log10.dropna().shape # (427,) data points; note that chl_rate can be negative, so taking log10 discards those values
Out[60]:
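Since the resampled series sits on a uniform 8-day grid, the step difference above can be turned into a per-day rate; a sketch (chl_rate_per_day is an illustrative name, assuming uniform 8-day spacing):

# convert the 8-day Lagrangian difference into a per-day rate of change
chl_rate_per_day = tmp.diff(dim='time', n=1).chlor_a / 8.0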
In [61]:
tmp # can check the dimension, (id: 299, time: 845)
Out[61]:
In [62]:
mask2 = floatsDFAllRate_8Dtimeorder.lon.isnull() | floatsDFAllRate_8Dtimeorder.lat.isnull() | floatsDFAllRate_8Dtimeorder.time.isnull() | floatsDFAllRate_8Dtimeorder.chl_rate.isnull()
mask2
floatsDFAllRate_8Dtimeorder[~mask2].shape # the {long, lat, time} data has lots of values (1050, 15)
Out[62]:
In [63]:
pd.to_datetime(floatsDFAllRate_8Dtimeorder.time)
type(pd.to_datetime(floatsDFAllRate_8Dtimeorder.time))
ts = pd.Series(0, index=pd.to_datetime(floatsDFAllRate_8Dtimeorder.time)) # create a target time series for masking purposes
# take the month out
month = ts.index.month
# month.shape # a check on the shape of the month.
selector = ((11==month) | (12==month) | (1==month) | (2==month) | (3==month) )
selector
print('shape of the selector', selector.shape)
print('all the data count in [11-01, 03-31] is', floatsDFAllRate_8Dtimeorder[selector].chl_rate.dropna().shape) # total (683,)
print('all the data count is', floatsDFAllRate_8Dtimeorder.chl_rate.dropna().shape ) # total (1050,)
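The chained | comparisons above work, but pandas' isin is more compact; a sketch of the equivalent selector (selector_alt is an illustrative name):

# same Nov-Mar (winter monsoon) selector via Index.isin
selector_alt = ts.index.month.isin([11, 12, 1, 2, 3])
assert (selector_alt == selector).all()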
In [64]:
# histogram for non-standardized data
axfloat = floatsDFAllRate_8Dtimeorder[selector].chl_rate.dropna().hist(bins=100,range=[-0.3,0.3])
axfloat.set_title('8-Day chl_rate')
Out[64]:
In [65]:
# standardized series
ts = floatsDFAllRate_8Dtimeorder[selector].chl_rate.dropna()
ts_standardized = (ts - ts.mean())/ts.std()
axts = ts_standardized.hist(bins=100,range=[-0.3,0.3])
axts.set_title('8-Day standardized chl_rate')
Out[65]:
In [66]:
# all the data
fig, axes = plt.subplots(nrows=8, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i, ax in zip(range(2002, 2017), axes.flat):
    # select year i
    tmpyear = floatsDFAllRate_8Dtimeorder[ (floatsDFAllRate_8Dtimeorder.time > str(i)) & (floatsDFAllRate_8Dtimeorder.time < str(i+1)) ]
    #fig, ax = plt.subplots(figsize=(12,10))
    print(tmpyear.chl_rate.dropna().shape) # total is 1050
    tmpyear.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.6, vmax=0.6, edgecolor='none', ax=ax)
    ax.set_title('year %g' % i)
# remove the extra (empty) subplot
ax = plt.subplot(8, 2, 16)
fig.delaxes(ax)
In [67]:
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i, ax in zip(range(2002, 2016), axes.flat):
    # select only Nov 1 through Mar 31 of each year
    tmpyear = floatsDFAllRate_8Dtimeorder[ (floatsDFAllRate_8Dtimeorder.time >= (str(i) + '-11-01')) & (floatsDFAllRate_8Dtimeorder.time <= (str(i+1) + '-03-31')) ]
    #fig, ax = plt.subplots(figsize=(12,10))
    print(tmpyear.chl_rate.dropna().shape) # the total is 683
    tmpyear.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.6, vmax=0.6, edgecolor='none', ax=ax)
    ax.set_title('year %g' % i)
In [68]:
# let's output the data as a csv or hdf file to disk, to save time in later experiments
df_list = []
for i in range(2002, 2017):
    # select only Nov 1 through Mar 31 of each year
    tmpyear = floatsDFAllRate_8Dtimeorder[ (floatsDFAllRate_8Dtimeorder.time >= (str(i) + '-11-01')) & (floatsDFAllRate_8Dtimeorder.time <= (str(i+1) + '-03-31')) ]
    df_list.append(tmpyear)
df_tmp = pd.concat(df_list)
print('all the data count in [11-01, 03-31] is ', df_tmp.chl_rate.dropna().shape) # again, the total is (683,)
df_chl_out_8D_modisa = df_tmp[~df_tmp.chl_rate.isnull()] # only keep the non-NaN values
#list(df_chl_out_8D_modisa.groupby(['id'])) # shows the continuity pattern of the Lagrangian difference for each float id
# output to a csv or hdf file
df_chl_out_8D_modisa.head()
Out[68]:
In [69]:
df_chl_out_8D_modisa.index.name = 'index' # give the index an explicit name
# write a CSV with the specified index label
df_chl_out_8D_modisa.to_csv('df_chl_out_8DOC_modisa.csv', sep=',', index_label='index')
# load CSV output
test = pd.read_csv('df_chl_out_8DOC_modisa.csv', index_col='index')
test.head()
Out[69]: