8Day data from the OceanColor repository

no resampling is done here


In [2]:
import xarray as xr
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
from dask.diagnostics import ProgressBar
import seaborn as sns
from matplotlib.colors import LogNorm


/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

load oceancolor dataset


In [3]:
ds_8day = xr.open_mfdataset('./data_collector_modisa_chla9km/ModisA_Arabian_Sea_chlor_a_9km_*_8D.nc')
ds_daily = xr.open_mfdataset('./data_collector_modisa_chla9km/ModisA_Arabian_Sea_chlor_a_9km_*_D.nc')
both_datasets = [ds_8day, ds_daily]

In [4]:
# how much data is contained here? let's get the answer in MB
print([(ds.nbytes / 1e6) for ds in both_datasets])


[534.295504, 4241.4716]

In [5]:
# load all the data in the memory
[ds.load() for ds in both_datasets]


Out[5]:
[<xarray.Dataset>
 Dimensions:        (eightbitcolor: 256, lat: 276, lon: 360, rgb: 3, time: 667)
 Coordinates:
   * lat            (lat) float64 27.96 27.87 27.79 27.71 27.62 27.54 27.46 ...
   * lon            (lon) float64 45.04 45.13 45.21 45.29 45.38 45.46 45.54 ...
   * rgb            (rgb) int64 0 1 2
   * eightbitcolor  (eightbitcolor) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...
   * time           (time) datetime64[ns] 2002-07-04 2002-07-12 2002-07-20 ...
 Data variables:
     palette        (time, rgb, eightbitcolor) float64 -109.0 0.0 108.0 ...
     chlor_a        (time, lat, lon) float64 nan nan nan nan nan nan nan nan ...,
 <xarray.Dataset>
 Dimensions:        (eightbitcolor: 256, lat: 276, lon: 360, rgb: 3, time: 5295)
 Coordinates:
   * lat            (lat) float64 27.96 27.87 27.79 27.71 27.62 27.54 27.46 ...
   * lon            (lon) float64 45.04 45.13 45.21 45.29 45.38 45.46 45.54 ...
   * rgb            (rgb) int64 0 1 2
   * eightbitcolor  (eightbitcolor) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...
   * time           (time) datetime64[ns] 2002-07-04 2002-07-05 2002-07-06 ...
 Data variables:
     palette        (time, rgb, eightbitcolor) float64 -109.0 0.0 108.0 ...
     chlor_a        (time, lat, lon) float64 nan nan nan nan nan nan nan nan ...]

In [16]:
# fix bad data
def fix_bad_data(ds):
    # for some reason, the cloud / land mask is backwards on some data
    # this is obvious because there are chlorophyl values less than zero
    bad_data = ds.chlor_a.groupby('time').min() < 0
    # loop through and fix
    for n in np.nonzero(bad_data.values)[0]:
        data = ds.chlor_a[n].values 
        ds.chlor_a.values[n] = np.ma.masked_less(data, 0).filled(np.nan)

In [17]:
[fix_bad_data(ds) for ds in both_datasets]


/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/variable.py:1046: RuntimeWarning: invalid value encountered in less
  if not reflexive
Out[17]:
[None, None]

In [19]:
ds_8day.chlor_a>0  # mask


/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/variable.py:1046: RuntimeWarning: invalid value encountered in greater
  if not reflexive
Out[19]:
<xarray.DataArray 'chlor_a' (time: 667, lat: 276, lon: 360)>
array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ...,  True, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False,  True],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False,  True,  True]],

       ..., 
       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]], dtype=bool)
Coordinates:
  * lat      (lat) float64 27.96 27.87 27.79 27.71 27.62 27.54 27.46 27.37 ...
  * lon      (lon) float64 45.04 45.13 45.21 45.29 45.38 45.46 45.54 45.63 ...
  * time     (time) datetime64[ns] 2002-07-04 2002-07-12 2002-07-20 ...

In [20]:
# count the number of ocean data points
(ds_8day.chlor_a>0).sum(dim='time').plot()


/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/variable.py:1046: RuntimeWarning: invalid value encountered in greater
  if not reflexive
Out[20]:
<matplotlib.collections.QuadMesh at 0x1197897b8>

In [21]:
#  find a mask for the land
ocean_mask = (ds_8day.chlor_a>0).sum(dim='time')>0
#ocean_mask = (ds_daily.chl_ocx>0).sum(dim='time')>0
num_ocean_points = ocean_mask.sum().values  # compute the total nonzeros regions(data point)
ocean_mask.plot()
plt.title('%g total ocean points' % num_ocean_points)


/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/xarray/core/variable.py:1046: RuntimeWarning: invalid value encountered in greater
  if not reflexive
Out[21]:
<matplotlib.text.Text at 0x13d450400>

In [22]:
#ds_8day

In [23]:
#ds_daily

In [24]:
ds_daily.chlor_a.sel(time='2002-11-18',method='nearest').plot(norm=LogNorm())
#ds_daily.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())


Out[24]:
<matplotlib.collections.QuadMesh at 0x128b63be0>
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/matplotlib/colors.py:1022: RuntimeWarning: invalid value encountered in less_equal
  mask |= resdat <= 0

In [25]:
#list(ds_daily.groupby('time')) # take a look at what's inside

In [26]:
'''
<xarray.Dataset>
Dimensions:        (eightbitcolor: 256, lat: 144, lon: 276, rgb: 3, time: 4748)
'''
ds_daily.groupby('time').count() # information from original data


Out[26]:
<xarray.Dataset>
Dimensions:  (time: 5295)
Coordinates:
  * time     (time) datetime64[ns] 2002-07-04 2002-07-05 2002-07-06 ...
Data variables:
    palette  (time) int64 768 768 768 768 768 768 768 768 768 768 768 768 ...
    chlor_a  (time) int64 658 1170 1532 2798 2632 1100 1321 636 2711 1163 ...

In [27]:
ds_daily.chlor_a.groupby('time').count()/float(num_ocean_points)


Out[27]:
<xarray.DataArray 'chlor_a' (time: 5295)>
array([ 0.01053255,  0.01872809,  0.02452259, ...,  0.        ,
        0.        ,  0.        ])
Coordinates:
  * time     (time) datetime64[ns] 2002-07-04 2002-07-05 2002-07-06 ...

In [28]:
count_8day,count_daily = [ds.chlor_a.groupby('time').count()/float(num_ocean_points)
                            for ds in (ds_8day, ds_daily)]

In [29]:
plt.figure(figsize=(12,4))
count_8day.plot(color='k')
count_daily.plot(color='r')

plt.legend(['8 day','daily'])


Out[29]:
<matplotlib.legend.Legend at 0x104820860>

In [30]:
# Seasonal Climatology
count_8day_clim, coundt_daily_clim = [count.groupby('time.month').mean()  # monthly data
                                      for count in (count_8day, count_daily)]

In [31]:
# mean value of the monthly data on the count of nonzeros
plt.figure(figsize=(12,4))
count_8day_clim.plot(color='k')
coundt_daily_clim.plot(color='r')
plt.legend(['8 day', 'daily'])


Out[31]:
<matplotlib.legend.Legend at 0x129ad9198>

In [32]:
# Maps of individual days
target_date = '2003-02-15'
plt.figure(figsize=(8,6))
ds_8day.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())


Out[32]:
<matplotlib.collections.QuadMesh at 0x1296826a0>
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/matplotlib/colors.py:1022: RuntimeWarning: invalid value encountered in less_equal
  mask |= resdat <= 0

In [33]:
plt.figure(figsize=(8,6))
ds_daily.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())


Out[33]:
<matplotlib.collections.QuadMesh at 0x11a875358>
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/matplotlib/colors.py:1022: RuntimeWarning: invalid value encountered in less_equal
  mask |= resdat <= 0

In [34]:
ds_daily.chlor_a[0].sel_points(lon=[65, 70], lat=[16, 18], method='nearest')   # the time is selected!
#ds_daily.chlor_a[0].sel_points(time= times, lon=lons, lat=times, method='nearest')


Out[34]:
<xarray.DataArray 'chlor_a' (points: 2)>
array([ nan,  nan])
Coordinates:
    time     datetime64[ns] 2002-07-04
    lat      (points) float64 16.04 18.04
    lon      (points) float64 65.04 70.04
  * points   (points) int64 0 1

In [35]:
#ds_daily.chlor_a.sel_points?

In [36]:
#ds_9day = ds_daily.resample('9D', dim='time')
ds_8day    # just use what we have... this is from OceanColor website


Out[36]:
<xarray.Dataset>
Dimensions:        (eightbitcolor: 256, lat: 276, lon: 360, rgb: 3, time: 667)
Coordinates:
  * lat            (lat) float64 27.96 27.87 27.79 27.71 27.62 27.54 27.46 ...
  * lon            (lon) float64 45.04 45.13 45.21 45.29 45.38 45.46 45.54 ...
  * rgb            (rgb) int64 0 1 2
  * eightbitcolor  (eightbitcolor) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...
  * time           (time) datetime64[ns] 2002-07-04 2002-07-12 2002-07-20 ...
Data variables:
    palette        (time, rgb, eightbitcolor) float64 -109.0 0.0 108.0 ...
    chlor_a        (time, lat, lon) float64 nan nan nan nan nan nan nan nan ...

In [38]:
plt.figure(figsize=(8,6))
ds_8day.chlor_a.sel(time=target_date, method='nearest').plot(norm=LogNorm())


Out[38]:
<matplotlib.collections.QuadMesh at 0x11b12c780>
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/matplotlib/colors.py:1022: RuntimeWarning: invalid value encountered in less_equal
  mask |= resdat <= 0

In [39]:
# check the range for the longitude
print(ds_8day.lon.min(),'\n' ,ds_8day.lat.min())


<xarray.DataArray 'lon' ()>
array(45.04166793823242) 
 <xarray.DataArray 'lat' ()>
array(5.041661739349365)

++++++++++++++++++++++++++++++++++++++++++++++

All GDP Floats

Load the float data

Map a (time, lon, lat) to a value on the cholorphlly value


In [40]:
# in the following we deal with the data from the gdp float
from buyodata  import buoydata
import os

In [41]:
# a list of files
fnamesAll = ['./gdp_float/buoydata_1_5000.dat','./gdp_float/buoydata_5001_10000.dat','./gdp_float/buoydata_10001_15000.dat','./gdp_float/buoydata_15001_jun16.dat']

In [42]:
# read them and cancatenate them into one DataFrame
dfAll = pd.concat([buoydata.read_buoy_data(f) for f in fnamesAll])  # around 4~5 minutes

#mask = df.time>='2002-07-04' # we only have data after this data for chlor_a
dfvvAll = dfAll[dfAll.time>='2002-07-04']

sum(dfvvAll.time<'2002-07-04') # recheck whether the time is


Out[42]:
0

In [43]:
# process the data so that the longitude are all >0
print('before processing, the minimum longitude is%f4.3 and maximum is %f4.3' % (dfvvAll.lon.min(), dfvvAll.lon.max()))
mask = dfvvAll.lon<0
dfvvAll.lon[mask] = dfvvAll.loc[mask].lon + 360
print('after processing, the minimum longitude is %f4.3 and maximum is %f4.3' % (dfvvAll.lon.min(),dfvvAll.lon.max()) )

dfvvAll.describe()


before processing, the minimum longitude is0.0000004.3 and maximum is 360.0000004.3
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/pandas/core/generic.py:4695: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2881: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
after processing, the minimum longitude is 0.0000004.3 and maximum is 360.0000004.3
Out[43]:
id lat lon temp ve vn spd var_lat var_lon var_tmp
count 2.147732e+07 2.131997e+07 2.131997e+07 1.986179e+07 2.129142e+07 2.129142e+07 2.129142e+07 2.147732e+07 2.147732e+07 2.147732e+07
mean 1.765662e+06 -2.263128e+00 2.124412e+02 1.986121e+01 2.454172e-01 4.708192e-01 2.613427e+01 7.326258e+00 7.326555e+00 7.522298e+01
std 9.452835e+06 3.401115e+01 9.746941e+01 8.339498e+00 2.525050e+01 2.052160e+01 1.939087e+01 8.527853e+01 8.527851e+01 2.637454e+02
min 2.578000e+03 -7.764700e+01 0.000000e+00 -1.685000e+01 -2.916220e+02 -2.601400e+02 0.000000e+00 5.268300e-07 -3.941600e-02 1.001300e-03
25% 4.897500e+04 -3.186000e+01 1.490720e+02 1.437300e+01 -1.411400e+01 -1.044700e+01 1.290300e+01 4.366500e-06 7.512600e-06 1.435700e-03
50% 7.141300e+04 -4.920000e+00 2.153940e+02 2.214400e+01 -5.560000e-01 1.970000e-01 2.176700e+01 8.833600e-06 1.495800e-05 1.691700e-03
75% 1.094330e+05 2.756000e+01 3.064370e+02 2.688900e+01 1.356100e+01 1.109300e+01 3.405900e+01 1.833300e-05 3.627900e-05 2.294200e-03
max 6.399288e+07 8.989900e+01 3.600000e+02 4.595000e+01 4.417070e+02 2.783220e+02 4.421750e+02 1.000000e+03 1.000000e+03 1.000000e+03

In [44]:
# Select only the arabian sea region
arabian_sea = (dfvvAll.lon > 45) & (dfvvAll.lon< 75) & (dfvvAll.lat> 5) & (dfvvAll.lat <28)
# arabian_sea = {'lon': slice(45,75), 'lat': slice(5,28)} # later use this longitude and latitude
floatsAll = dfvvAll.loc[arabian_sea]   # directly use mask
print('dfvvAll.shape is %s, floatsAll.shape is %s' % (dfvvAll.shape, floatsAll.shape) )

# visualize the float around global region
fig, ax  = plt.subplots(figsize=(12,10))
dfvvAll.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)

# visualize the float around the arabian sea region
fig, ax  = plt.subplots(figsize=(12,10))
floatsAll.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)


dfvvAll.shape is (21477317, 11), floatsAll.shape is (111894, 11)
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x239ba2630>

In [45]:
# dump the surface floater data from pandas.dataframe to xarray.dataset
floatsDSAll = xr.Dataset.from_dataframe(floatsAll.set_index(['time','id']) ) # set time & id as the index); use reset_index to revert this operation
floatsDSAll


Out[45]:
<xarray.Dataset>
Dimensions:  (id: 259, time: 17499)
Coordinates:
  * time     (time) datetime64[ns] 2002-07-04 2002-07-04T06:00:00 ...
  * id       (id) int64 7574 10206 10208 11089 15703 15707 27069 27139 28842 ...
Data variables:
    lat      (time, id) float64 nan 16.3 14.03 16.4 14.04 nan 20.11 nan ...
    lon      (time, id) float64 nan 66.23 69.48 64.58 69.51 nan 68.55 nan ...
    temp     (time, id) float64 nan nan nan 28.0 28.53 nan 28.93 nan 27.81 ...
    ve       (time, id) float64 nan 8.68 5.978 6.286 4.844 nan 32.9 nan ...
    vn       (time, id) float64 nan -13.18 -18.05 -7.791 -17.47 nan 15.81 ...
    spd      (time, id) float64 nan 15.78 19.02 10.01 18.13 nan 36.51 nan ...
    var_lat  (time, id) float64 nan 0.0002661 5.01e-05 5.018e-05 5.024e-05 ...
    var_lon  (time, id) float64 nan 0.0006854 8.851e-05 9.018e-05 8.968e-05 ...
    var_tmp  (time, id) float64 nan 1e+03 1e+03 0.003733 0.0667 nan 0.001683 ...

In [46]:
# resample on the xarray.dataset onto two-day frequency
floatsDSAll_8D =floatsDSAll.resample('8D', dim='time')
floatsDSAll_8D


Out[46]:
<xarray.Dataset>
Dimensions:  (id: 259, time: 639)
Coordinates:
  * id       (id) int64 7574 10206 10208 11089 15703 15707 27069 27139 28842 ...
  * time     (time) datetime64[ns] 2002-07-04 2002-07-12 2002-07-20 ...
Data variables:
    spd      (time, id) float64 nan 8.832 18.7 19.48 17.6 nan 25.74 nan ...
    vn       (time, id) float64 nan -0.2949 -9.82 -12.91 -8.593 nan -1.964 ...
    var_lon  (time, id) float64 nan 0.005415 0.0001179 0.0001259 9.874e-05 ...
    lon      (time, id) float64 nan 66.51 69.86 64.99 69.87 nan 69.35 nan ...
    lat      (time, id) float64 nan 16.21 13.62 16.14 13.65 nan 20.09 nan ...
    var_lat  (time, id) float64 nan 0.001424 6.2e-05 6.574e-05 5.391e-05 nan ...
    ve       (time, id) float64 nan 7.335 13.25 12.26 12.13 nan 24.29 nan ...
    var_tmp  (time, id) float64 nan 1e+03 1e+03 0.00364 0.08777 nan 0.001711 ...
    temp     (time, id) float64 nan nan nan 27.81 28.57 nan 28.99 nan 27.65 ...

In [47]:
# transfer it back to pandas.dataframe for plotting
floatsDFAll_8D = floatsDSAll_8D.to_dataframe()
floatsDFAll_8D
floatsDFAll_8D = floatsDFAll_8D.reset_index()
floatsDFAll_8D
# visualize the 2D-floats around arabian region
fig, ax  = plt.subplots(figsize=(12,10))
floatsDFAll_8D.plot(kind='scatter', x='lon', y='lat', c='temp', cmap='RdBu_r', edgecolor='none', ax=ax)


Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x11afe0240>

In [48]:
# get the value for the chllorophy for each data entry
floatsDFAll_8Dtimeorder = floatsDFAll_8D.sort_values(['time','id'],ascending=True)
floatsDFAll_8Dtimeorder # check whether it is time ordered!!
# should we drop nan to speed up??


Out[48]:
id time spd vn var_lon lon lat var_lat ve var_tmp temp
0 7574 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
639 10206 2002-07-04 8.832125 -0.294906 0.005415 66.510656 16.208687 0.001424 7.334875 1000.000000 NaN
1278 10208 2002-07-04 18.702906 -9.820156 0.000118 69.858594 13.617187 0.000062 13.248719 1000.000000 NaN
1917 11089 2002-07-04 19.484250 -12.911969 0.000126 64.993937 16.140125 0.000066 12.260094 0.003640 27.807781
2556 15703 2002-07-04 17.604156 -8.592531 0.000099 69.867031 13.648188 0.000054 12.131219 0.087771 28.569812
3195 15707 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3834 27069 2002-07-04 25.743625 -1.963906 0.000099 69.350187 20.090281 0.000054 24.285875 0.001711 28.985781
4473 27139 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
5112 28842 2002-07-04 17.876969 -7.667469 0.000218 60.820937 18.663125 0.000103 3.765094 0.003330 27.649500
5751 34159 2002-07-04 35.638313 14.802688 0.000116 59.602656 12.808719 0.000061 31.401250 1000.000000 NaN
6390 34173 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
7029 34210 2002-07-04 25.260719 -15.458906 0.000125 56.754250 6.092000 0.000064 -2.609125 0.003695 26.541750
7668 34211 2002-07-04 27.956094 -13.744125 0.000102 68.470625 8.265656 0.000055 22.912125 0.003516 28.372250
8307 34212 2002-07-04 46.824937 12.766750 0.000102 65.751156 6.659437 0.000055 41.911531 0.003590 28.576844
8946 34223 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
9585 34310 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
10224 34311 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
10863 34312 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
11502 34314 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
12141 34315 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
12780 34374 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
13419 34708 2002-07-04 33.377594 2.054469 0.000110 60.558000 10.218219 0.000058 33.013969 0.001789 27.259781
14058 34709 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN
14697 34710 2002-07-04 46.721688 -4.710250 0.000100 50.311875 13.146719 0.000052 5.607969 0.001858 31.146688
15336 34714 2002-07-04 38.004406 3.964156 0.000113 64.554750 13.736031 0.000060 36.962156 0.001808 27.743656
15975 34716 2002-07-04 34.903438 7.112344 0.000107 66.159969 7.701938 0.000058 32.958500 0.001758 28.789000
16614 34718 2002-07-04 38.508094 -31.845094 0.000105 72.890563 15.600562 0.000056 20.738031 0.001709 29.088344
17253 34719 2002-07-04 27.892406 -20.870063 0.000109 71.331469 17.318281 0.000058 14.599125 0.001661 28.957969
17892 34720 2002-07-04 26.035375 -21.496063 0.000113 69.435531 14.194000 0.000061 11.059094 0.001797 28.665031
18531 34721 2002-07-04 13.515531 -10.533031 0.000115 65.534344 16.971531 0.000061 6.204906 0.001749 27.911625
... ... ... ... ... ... ... ... ... ... ... ...
146969 3098682 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
147608 60073460 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
148247 60074440 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
148886 60077450 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
149525 60150420 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
150164 60454500 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
150803 60656200 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
151442 60657200 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
152081 60658190 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
152720 60659110 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
153359 60659120 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
153998 60659190 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
154637 60659200 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
155276 60940960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
155915 60940970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
156554 60941960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
157193 60941970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
157832 60942960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
158471 60942970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
159110 60943960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
159749 60943970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
160388 60944960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
161027 60944970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
161666 60945970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
162305 60946960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
162944 60947960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
163583 60947970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
164222 60948960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
164861 60950430 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN
165500 62321420 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN

165501 rows × 11 columns


In [49]:
floatsDFAll_8Dtimeorder.lon.dropna().shape  # the longitude data has lots of values (3855,)


Out[49]:
(3855,)

In [50]:
mask = floatsDFAll_8Dtimeorder.lon.isnull() | floatsDFAll_8Dtimeorder.lat.isnull() | floatsDFAll_8Dtimeorder.time.isnull()
mask
floatsDFAll_8Dtimeorder[~mask].shape # the {long, lat, time} data has lots of values (3855,)


Out[50]:
(3855, 11)

In [51]:
tmpAll = ds_8day.chlor_a.sel_points(time=list(floatsDFAll_8Dtimeorder.time),lon=list(floatsDFAll_8Dtimeorder.lon), lat=list(floatsDFAll_8Dtimeorder.lat), method='nearest')
print('the count of nan vaues in tmpAll is',tmpAll.to_series().isnull().sum())


/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/pandas/indexes/base.py:2352: RuntimeWarning: invalid value encountered in less
  indexer = np.where(op(left_distances, right_distances) |
/Users/vyan2000/local/miniconda3/envs/condapython3/lib/python3.5/site-packages/pandas/indexes/base.py:2352: RuntimeWarning: invalid value encountered in less_equal
  indexer = np.where(op(left_distances, right_distances) |
the count of nan vaues in tmpAll is 163760

In [52]:
#print(tmpAll.dropna().shape)
tmpAll.to_series().dropna().shape  # (1741,) good values


Out[52]:
(1741,)

In [53]:
# tmp.to_series() to transfer it from xarray dataset to series
floatsDFAll_8Dtimeorder['chlor_a'] = pd.Series(np.array(tmpAll.to_series()), index=floatsDFAll_8Dtimeorder.index)
print("after editing the dataframe the nan values in 'chlor_a' is",floatsDFAll_8Dtimeorder.chlor_a.isnull().sum() )  # they should be the same values as above

# take a look at the data
floatsDFAll_8Dtimeorder

# visualize the float around the arabian sea region
fig, ax  = plt.subplots(figsize=(12,10))
floatsDFAll_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chlor_a', cmap='RdBu_r', edgecolor='none', ax=ax)


after editing the dataframe the nan values in 'chlor_a' is 163760
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x11afe48d0>

In [54]:
def scale(x):
    logged = np.log10(x)
    return logged

In [59]:
#print(floatsAll_timeorder['chlor_a'].apply(scale))
floatsDFAll_8Dtimeorder['chlor_a_log10'] = floatsDFAll_8Dtimeorder['chlor_a'].apply(scale)
floatsDFAll_8Dtimeorder
#print("after the transformation the nan values in 'chlor_a_log10' is", floatsAll_timeorder.chlor_a_log10.isnull().sum() )

# visualize the float around the arabian sea region
fig, ax  = plt.subplots(figsize=(12,10))
floatsDFAll_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chlor_a_log10', cmap='RdBu_r', edgecolor='none', ax=ax)
floatsDFAll_8Dtimeorder.chlor_a.dropna().shape  # (1741,)
#floatsDFAll_8Dtimeorder.chlor_a_log10.dropna().shape  # (1741,)


Out[59]:
(1741,)

In [56]:
# take the diff of the chl_ocx, and this has to be done in xarray
# transfer the dataframe into xarry dataset again
# take the difference
floatsDFAll_8Dtimeorder


Out[56]:
id time spd vn var_lon lon lat var_lat ve var_tmp temp chlor_a chlor_a_log10
0 7574 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
639 10206 2002-07-04 8.832125 -0.294906 0.005415 66.510656 16.208687 0.001424 7.334875 1000.000000 NaN NaN NaN
1278 10208 2002-07-04 18.702906 -9.820156 0.000118 69.858594 13.617187 0.000062 13.248719 1000.000000 NaN NaN NaN
1917 11089 2002-07-04 19.484250 -12.911969 0.000126 64.993937 16.140125 0.000066 12.260094 0.003640 27.807781 NaN NaN
2556 15703 2002-07-04 17.604156 -8.592531 0.000099 69.867031 13.648188 0.000054 12.131219 0.087771 28.569812 NaN NaN
3195 15707 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3834 27069 2002-07-04 25.743625 -1.963906 0.000099 69.350187 20.090281 0.000054 24.285875 0.001711 28.985781 NaN NaN
4473 27139 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5112 28842 2002-07-04 17.876969 -7.667469 0.000218 60.820937 18.663125 0.000103 3.765094 0.003330 27.649500 NaN NaN
5751 34159 2002-07-04 35.638313 14.802688 0.000116 59.602656 12.808719 0.000061 31.401250 1000.000000 NaN NaN NaN
6390 34173 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7029 34210 2002-07-04 25.260719 -15.458906 0.000125 56.754250 6.092000 0.000064 -2.609125 0.003695 26.541750 NaN NaN
7668 34211 2002-07-04 27.956094 -13.744125 0.000102 68.470625 8.265656 0.000055 22.912125 0.003516 28.372250 0.104210 -0.982091
8307 34212 2002-07-04 46.824937 12.766750 0.000102 65.751156 6.659437 0.000055 41.911531 0.003590 28.576844 NaN NaN
8946 34223 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9585 34310 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
10224 34311 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
10863 34312 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11502 34314 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12141 34315 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12780 34374 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13419 34708 2002-07-04 33.377594 2.054469 0.000110 60.558000 10.218219 0.000058 33.013969 0.001789 27.259781 NaN NaN
14058 34709 2002-07-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14697 34710 2002-07-04 46.721688 -4.710250 0.000100 50.311875 13.146719 0.000052 5.607969 0.001858 31.146688 NaN NaN
15336 34714 2002-07-04 38.004406 3.964156 0.000113 64.554750 13.736031 0.000060 36.962156 0.001808 27.743656 NaN NaN
15975 34716 2002-07-04 34.903438 7.112344 0.000107 66.159969 7.701938 0.000058 32.958500 0.001758 28.789000 0.119733 -0.921786
16614 34718 2002-07-04 38.508094 -31.845094 0.000105 72.890563 15.600562 0.000056 20.738031 0.001709 29.088344 NaN NaN
17253 34719 2002-07-04 27.892406 -20.870063 0.000109 71.331469 17.318281 0.000058 14.599125 0.001661 28.957969 NaN NaN
17892 34720 2002-07-04 26.035375 -21.496063 0.000113 69.435531 14.194000 0.000061 11.059094 0.001797 28.665031 NaN NaN
18531 34721 2002-07-04 13.515531 -10.533031 0.000115 65.534344 16.971531 0.000061 6.204906 0.001749 27.911625 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
146969 3098682 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
147608 60073460 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
148247 60074440 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
148886 60077450 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
149525 60150420 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
150164 60454500 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
150803 60656200 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
151442 60657200 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
152081 60658190 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
152720 60659110 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
153359 60659120 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
153998 60659190 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
154637 60659200 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
155276 60940960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
155915 60940970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
156554 60941960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
157193 60941970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
157832 60942960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
158471 60942970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
159110 60943960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
159749 60943970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
160388 60944960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
161027 60944970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
161666 60945970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
162305 60946960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
162944 60947960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
163583 60947970 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
164222 60948960 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
164861 60950430 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
165500 62321420 2016-06-24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

165501 rows × 13 columns


In [57]:
# unstack() will provide a 2d dataframe
# reset_index() will reset all the index as columns

In [60]:
# prepare the data in dataset and about to take the diff
tmp = xr.Dataset.from_dataframe(floatsDFAll_8Dtimeorder.set_index(['time','id']) ) # set time & id as the index); use reset_index to revert this operation
# take the diff on the chl_ocx
chlor_a_rate = tmp.diff(dim='time',n=1).chlor_a.to_series().reset_index()
# make the column to a proper name
chlor_a_rate.rename(columns={'chlor_a':'chl_rate'}, inplace='True')
chlor_a_rate


# merge the two dataframes {floatsDFAll_2Dtimeorder; chl_ocx_rate} into one dataframe based on the index {id, time} and use the left method
floatsDFAllRate_8Dtimeorder=pd.merge(floatsDFAll_8Dtimeorder,chlor_a_rate, on=['time','id'], how = 'left')
floatsDFAllRate_8Dtimeorder

# check 
print('check the sum of the chl_ocx before the merge', chlor_a_rate.chl_rate.sum())
print('check the sum of the chl_ocx after the merge',floatsDFAllRate_8Dtimeorder.chl_rate.sum())


# visualize the chlorophyll rate, it is *better* to visualize at this scale
fig, ax  = plt.subplots(figsize=(12,10))
floatsDFAllRate_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r',  vmin=-0.8, vmax=0.8, edgecolor='none', ax=ax)

# visualize the chlorophyll rate on the log scale
floatsDFAllRate_8Dtimeorder['chl_rate_log10'] = floatsDFAllRate_8Dtimeorder['chl_rate'].apply(scale)
floatsDFAllRate_8Dtimeorder
fig, ax  = plt.subplots(figsize=(12,10))
floatsDFAllRate_8Dtimeorder.plot(kind='scatter', x='lon', y='lat', c='chl_rate_log10', cmap='RdBu_r', edgecolor='none', ax=ax)
#floatsDFAllRate_8Dtimeorder.chl_rate.dropna().shape   #  (1050,) data points
floatsDFAllRate_8Dtimeorder.chl_rate_log10.dropna().shape   # (427,)  data points..... notice, chl_rate can be negative, so do not take log10


check the sum of the chl_ocx before the merge -69.5044389218092
check the sum of the chl_ocx after the merge -69.5044389218092
Out[60]:
(427,)

In [61]:
tmp # can check the dimension, (id: 299, time: 845)


Out[61]:
<xarray.Dataset>
Dimensions:        (id: 259, time: 639)
Coordinates:
  * time           (time) datetime64[ns] 2002-07-04 2002-07-12 2002-07-20 ...
  * id             (id) int64 7574 10206 10208 11089 15703 15707 27069 27139 ...
Data variables:
    spd            (time, id) float64 nan 8.832 18.7 19.48 17.6 nan 25.74 ...
    vn             (time, id) float64 nan -0.2949 -9.82 -12.91 -8.593 nan ...
    var_lon        (time, id) float64 nan 0.005415 0.0001179 0.0001259 ...
    lon            (time, id) float64 nan 66.51 69.86 64.99 69.87 nan 69.35 ...
    lat            (time, id) float64 nan 16.21 13.62 16.14 13.65 nan 20.09 ...
    var_lat        (time, id) float64 nan 0.001424 6.2e-05 6.574e-05 ...
    ve             (time, id) float64 nan 7.335 13.25 12.26 12.13 nan 24.29 ...
    var_tmp        (time, id) float64 nan 1e+03 1e+03 0.00364 0.08777 nan ...
    temp           (time, id) float64 nan nan nan 27.81 28.57 nan 28.99 nan ...
    chlor_a        (time, id) float64 nan nan nan nan nan nan nan nan nan ...
    chlor_a_log10  (time, id) float64 nan nan nan nan nan nan nan nan nan ...

In [62]:
mask2 = floatsDFAllRate_8Dtimeorder.lon.isnull() | floatsDFAllRate_8Dtimeorder.lat.isnull() | floatsDFAllRate_8Dtimeorder.time.isnull() | floatsDFAllRate_8Dtimeorder.chl_rate.isnull()
mask2
floatsDFAllRate_8Dtimeorder[~mask2].shape # the {long, lat, time} data has lots of values (1050, 15)


Out[62]:
(1050, 15)

In [63]:
pd.to_datetime(floatsDFAllRate_8Dtimeorder.time)
type(pd.to_datetime(floatsDFAllRate_8Dtimeorder.time))
ts = pd.Series(0, index=pd.to_datetime(floatsDFAllRate_8Dtimeorder.time) ) # creat a target time series for masking purpose

# take the month out
month = ts.index.month 
# month.shape # a check on the shape of the month.
selector = ((11==month) | (12==month) | (1==month) | (2==month) | (3==month) )  
selector
print('shape of the selector', selector.shape)

print('all the data count in [11-01, 03-31]  is', floatsDFAllRate_8Dtimeorder[selector].chl_rate.dropna().shape) # total (683,)
print('all the data count is', floatsDFAllRate_8Dtimeorder.chl_rate.dropna().shape )   # total (1050,)


shape of the selector (165501,)
all the data count in [11-01, 03-31]  is (683,)
all the data count is (1050,)

In [64]:
# histogram for non standarized data
axfloat = floatsDFAllRate_8Dtimeorder[selector].chl_rate.dropna().hist(bins=100,range=[-0.3,0.3])
axfloat.set_title('8-Day chl_rate')


Out[64]:
<matplotlib.text.Text at 0x1704b1e10>

In [65]:
# standarized series
ts = floatsDFAllRate_8Dtimeorder[selector].chl_rate.dropna()
ts_standardized = (ts - ts.mean())/ts.std()
axts = ts_standardized.hist(bins=100,range=[-0.3,0.3])
axts.set_title('8-Day standardized chl_rate')


Out[65]:
<matplotlib.text.Text at 0x457d2ad30>

In [66]:
# all the data
fig, axes = plt.subplots(nrows=8, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)

for i, ax in zip(range(2002,2017), axes.flat) :
    tmpyear = floatsDFAllRate_8Dtimeorder[ (floatsDFAllRate_8Dtimeorder.time > str(i))  & (floatsDFAllRate_8Dtimeorder.time < str(i+1)) ] # if year i
    #fig, ax  = plt.subplots(figsize=(12,10))
    print(tmpyear.chl_rate.dropna().shape)   # total is 1050
    tmpyear.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r',vmin=-0.6, vmax=0.6, edgecolor='none', ax=ax)
    ax.set_title('year %g' % i)     
    
# remove the extra figure
ax = plt.subplot(8,2,16)
fig.delaxes(ax)


(45,)
(48,)
(6,)
(43,)
(98,)
(98,)
(139,)
(42,)
(64,)
(21,)
(37,)
(30,)
(213,)
(111,)
(55,)

In [67]:
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)

for i, ax in zip(range(2002,2016), axes.flat) :
    tmpyear = floatsDFAllRate_8Dtimeorder[ (floatsDFAllRate_8Dtimeorder.time >= (str(i)+ '-11-01') )  & (floatsDFAllRate_8Dtimeorder.time <= (str(i+1)+'-03-31') ) ] # if year i
    # select only particular month, Nov 1 to March 31
    #fig, ax  = plt.subplots(figsize=(12,10))
    print(tmpyear.chl_rate.dropna().shape)  # the total is 683
    tmpyear.plot(kind='scatter', x='lon', y='lat', c='chl_rate', cmap='RdBu_r', vmin=-0.6, vmax=0.6, edgecolor='none', ax=ax)
    ax.set_title('year %g' % i)


(56,)
(0,)
(11,)
(73,)
(44,)
(108,)
(29,)
(51,)
(1,)
(38,)
(0,)
(120,)
(97,)
(55,)

In [68]:
# let's output the data as a csv or hdf file to disk to save the experiment time

df_list = []
for i in range(2002,2017) :
    tmpyear = floatsDFAllRate_8Dtimeorder[ (floatsDFAllRate_8Dtimeorder.time >= (str(i)+ '-11-01') )  & (floatsDFAllRate_8Dtimeorder.time <= (str(i+1)+'-03-31') ) ] # if year i
    # select only particular month, Nov 1 to March 31
    df_list.append(tmpyear)
    
df_tmp = pd.concat(df_list)
print('all the data count in [11-01, 03-31]  is ', df_tmp.chl_rate.dropna().shape) # again, the total is  (683,)
df_chl_out_8D_modisa = df_tmp[~df_tmp.chl_rate.isnull()] # only keep the non-nan values
#list(df_chl_out_XD.groupby(['id']))   # can see the continuity pattern of the Lagarangian difference for each float id

# output to a csv or hdf file
df_chl_out_8D_modisa.head()


all the data count in [11-01, 03-31]  is  (683,)
Out[68]:
id time spd vn var_lon lon lat var_lat ve var_tmp temp chlor_a chlor_a_log10 chl_rate chl_rate_log10
3886 10206 2002-11-01 11.188906 6.509875 0.000996 67.351188 10.873656 0.000352 -6.823625 1000.000000 NaN 0.132783 -0.876858 -0.017698 NaN
3888 11089 2002-11-01 13.679406 4.337844 0.000106 65.099156 14.269219 0.000057 -11.122000 0.003679 28.969813 0.150789 -0.821630 0.025481 -1.593784
3908 34710 2002-11-01 12.432687 11.684344 0.000123 63.145031 17.038563 0.000064 0.757312 0.001698 28.970219 0.388257 -0.410881 0.064084 -1.193250
4145 10206 2002-11-09 3.428062 1.562844 0.003551 67.108219 11.155719 0.000984 -0.786375 1000.000000 NaN 0.135089 -0.869380 0.002306 -2.637141
4147 11089 2002-11-09 19.677781 -6.951906 0.000126 64.193281 14.220969 0.000065 -17.539250 0.003868 28.742188 0.201879 -0.694909 0.051090 -1.291664

In [69]:
df_chl_out_8D_modisa.index.name = 'index'  # make it specific for the index name

# CSV CSV CSV CSV with specfic index
df_chl_out_8D_modisa.to_csv('df_chl_out_8DOC_modisa.csv', sep=',', index_label = 'index')

# load CSV output
test = pd.read_csv('df_chl_out_8DOC_modisa.csv', index_col='index')
test.head()


Out[69]:
id time spd vn var_lon lon lat var_lat ve var_tmp temp chlor_a chlor_a_log10 chl_rate chl_rate_log10
index
3886 10206 2002-11-01 11.188906 6.509875 0.000996 67.351188 10.873656 0.000352 -6.823625 1000.000000 NaN 0.132783 -0.876858 -0.017698 NaN
3888 11089 2002-11-01 13.679406 4.337844 0.000106 65.099156 14.269219 0.000057 -11.122000 0.003679 28.969813 0.150789 -0.821630 0.025481 -1.593784
3908 34710 2002-11-01 12.432687 11.684344 0.000123 63.145031 17.038563 0.000064 0.757312 0.001698 28.970219 0.388257 -0.410881 0.064084 -1.193250
4145 10206 2002-11-09 3.428062 1.562844 0.003551 67.108219 11.155719 0.000984 -0.786375 1000.000000 NaN 0.135089 -0.869380 0.002306 -2.637141
4147 11089 2002-11-09 19.677781 -6.951906 0.000126 64.193281 14.220969 0.000065 -17.539250 0.003868 28.742188 0.201879 -0.694909 0.051090 -1.291664