In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import xarray as xr
In [4]:
# Preview the raw data: bzcat decompresses the bzip2 archive to stdout and
# head keeps only the first ten lines (IPython `!` shell escape, not Python).
!bzcat "./dist2land_data/dist2coast.txt.bz2" | head -10
In [5]:
# Read the compressed, tab-separated table into a DataFrame. The file has no
# header row, so the three columns are named explicitly here.
# NOTE(review): the original note gives the grid resolution as 0.01 degree,
# i.e. roughly 1 km (0.01 degree of latitude is about 1.1 km) -- confirm
# against the data source.
dist_db = pd.read_csv(
    "./dist2land_data/dist2coast.txt.bz2",
    sep='\t',
    header=None,
    names=['lon', 'lat', 'dist'],
)
In [6]:
# Work on an independent copy so that later in-place edits never touch the
# freshly loaded `dist_db`. Plain assignment (`dist_db_copy = dist_db`) would
# only bind a second name to the very same DataFrame object.
# Note: this duplicates the table in memory.
dist_db_copy = dist_db.copy()

# Kept as the last expression of the cell so the preview is actually rendered.
dist_db.head()
In [7]:
# Boolean Series that is True for every row with a negative longitude.
mask = dist_db_copy['lon'].lt(0)
# Evaluated last so the cell displays the Series.
mask
Out[7]:
In [8]:
# Shift negative longitudes by +360 so that every longitude is non-negative.
#
# A single .loc[rows, column] assignment writes into dist_db_copy itself. The
# chained form (`frame.lon[rows] = ...`) assigns through an intermediate object,
# which pandas warns about and which is not guaranteed to update the frame.
#
# The row selector is recomputed from the current values, so running this cell
# a second time leaves the already shifted rows untouched.
negative_lon = dist_db_copy['lon'] < 0
dist_db_copy.loc[negative_lon, 'lon'] += 360

# '%4.3f' prints three decimals; '%f4.3' printed six decimals followed by a
# literal "4.3".
print('after processing, the minimum longitude is %4.3f and maximum is %4.3f'
      % (dist_db_copy['lon'].min(), dist_db_copy['lon'].max()))
In [9]:
# Reduce the dataset to the Arabian Sea box: longitude strictly between 45 and
# 75, latitude strictly between 5 and 28 (box edges are excluded).
grid_lon = dist_db_copy['lon']
grid_lat = dist_db_copy['lat']
inside_lon_band = (grid_lon > 45) & (grid_lon < 75)
inside_lat_band = (grid_lat > 5) & (grid_lat < 28)
arabian_sea = inside_lon_band & inside_lat_band

dist_db_arabian = dist_db_copy[arabian_sea]

# Report the table shape before and after the regional cut.
shapes = (dist_db_copy.shape, dist_db_arabian.shape)
print('dist_db_copy.shape is %s, dist_db_arabian.shape is %s' % shapes)
In [12]:
# Scatter-plot the Arabian Sea subset, colouring every grid point by its
# 'dist' value.
# NOTE(review): the original note calls this an unsigned distance (same sign
# on land and at sea) -- confirm against the data source.
fig, ax = plt.subplots(figsize=(12, 8))
scatter_options = dict(
    kind='scatter',
    x='lon',
    y='lat',
    c='dist',
    cmap='RdBu_r',
    edgecolor='none',
)
dist_db_arabian.plot(ax=ax, title='distance to the nearest coast', **scatter_options)
Out[12]:
In [11]:
# Convert the regional DataFrame into an xarray Dataset so that values can be
# selected by coordinate. 'lon' and 'lat' are moved into the index first
# (reset_index would undo that step), leaving 'dist' as the data column.
grid_indexed = dist_db_arabian.set_index(['lon', 'lat'])
dist_DS = xr.Dataset.from_dataframe(grid_indexed)
# Evaluated last so the cell shows the Dataset summary.
dist_DS
Out[11]:
In [13]:
# ---------------------------------------------------------------------------
# Load the floats data from CSV.
# The 'lon' and 'lat' columns of this table are used further down to look up
# the distance for every row.
# NOTE(review): the original author reported compatibility issues with the
# CSV format here, without further detail.
# ---------------------------------------------------------------------------
floats_csv = './data_collector_modisa_chla9km/df_chl_out_2D_modisa.csv'
df_chl_out_2D = pd.read_csv(floats_csv, index_col='index')
# Evaluated last so the cell displays the table.
df_chl_out_2D
Out[13]:
In [ ]:
# Disabled alternative loader: the statements below sit inside a bare string
# literal, so none of them run as code when the cell is executed.
'''
# load the 2D data, based on the floats data and the lagrangian rate of change on chl_ocx
import pandas as pd
test = pd.read_hdf('df_chl_out_2D.h5')
test
# a check
list(test.groupby(['id']))
'''
In [14]:
# check the lat and lon
# df_chl_out_2D.lon
# df_chl_out_2D.lat
In [15]:
# Look up the gridded distance at every float position (nearest grid point).
#
# `DataArray.sel_points` was deprecated and later removed from xarray. The
# supported equivalent is vectorized indexing: passing DataArray indexers that
# share one dimension ('points') to `.sel` selects one value per (lon, lat)
# pair instead of the outer product of all longitudes and latitudes.
float_lon = xr.DataArray(df_chl_out_2D['lon'].values, dims='points')
float_lat = xr.DataArray(df_chl_out_2D['lat'].values, dims='points')
tmp_dist = dist_DS.dist.sel(lon=float_lon, lat=float_lat, method='nearest')

# Count positions for which no distance value was found.
print('the count of nan vaues in tmpAll is',tmp_dist.to_series().isnull().sum())
# Evaluated last so the cell displays the looked-up values as a pandas Series.
tmp_dist.to_series()
Out[15]:
In [16]:
# Attach the looked-up distances to the floats table. The values are assigned
# positionally (as a plain array), so row i of the table receives the i-th
# looked-up distance regardless of the table's index labels.
df_chl_out_2D['dist'] = tmp_dist.values

# The NaN count of the new column should equal the NaN count of the lookup.
# The label names the column that is actually counted ('dist'); it previously
# said 'chl_ocx'.
print("after editing the dataframe the nan values in 'dist' is", df_chl_out_2D['dist'].isnull().sum())

# Visualize the floats around the Arabian Sea region, coloured by distance.
fig, ax = plt.subplots(figsize=(12, 10))
df_chl_out_2D.plot(kind='scatter', x='lon', y='lat', c='dist', cmap='RdBu_r', edgecolor='none', ax=ax, title='distance to the nearest coast')
Out[16]:
In [17]:
# Write the augmented floats table to CSV, storing the index under the column
# label 'index', then read the file back as a check of the round trip.
# NOTE(review): per the original note, the "_3" suffix stands for three
# features {temp, chlor_a, dist} stored next to lat and lon -- confirm against
# the actual columns.
output_csv = 'df_chl_out_2D_modisa_3.csv'
df_chl_out_2D.to_csv(output_csv, sep=',', index_label='index')

# Reload what was just written.
test = pd.read_csv(output_csv, index_col='index')
# Evaluated last so the cell shows the first rows of the reloaded table.
test.head()
Out[17]:
In [18]:
# Summary
# 1. Transform the longitudes so that they are all positive.
# 2. Transform the distance DataFrame into an xarray Dataset.
# 3. Carry out the interpolation (nearest-point lookup) on the Dataset and
#    transform the result back into a DataFrame.
# TODO: consider writing the output of the 2D interpolation to a binary file
#       to save time.
#
In [19]:
# Turn a day offset counted from 1 January 2002 into a calendar date.
from datetime import datetime, timedelta

reference_day = datetime(2002, 1, 1)
offset = timedelta(days=184)
# Evaluated last so the cell displays the resulting datetime.
reference_day + offset
Out[19]: