This notebook looks up the distance to the nearest land (coast) for each float observation.


In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import xarray as xr

In [4]:
# take a look at the first ten lines of the file
# (each row is tab-separated: lon, lat, dist)
# NOTE: head stops reading after 10 lines, so bzcat ends with a "Broken pipe"
# message; the ten rows printed before that message are still valid.
!bzcat "./dist2land_data/dist2coast.txt.bz2" | head -10


-179.98	89.98	712.935
-179.94	89.98	712.934
-179.9	89.98	712.933
-179.86	89.98	712.932
-179.82	89.98	712.932
-179.78	89.98	712.931
-179.74	89.98	712.93
-179.7	89.98	712.929
-179.66	89.98	712.928
-179.62	89.98	712.927

bzcat: I/O or other error, bailing out.  Possible reason follows.
bzcat: Broken pipe
	Input file = ./dist2land_data/dist2coast.txt.bz2, output file = (stdout)

In [5]:
# Load the global distance-to-coast table (tab-separated, no header row).
# The grid spacing is 0.04 degree, not 0.01 degree: consecutive longitudes in
# the sample rows printed above are -179.98, -179.94, -179.9, ...
dist_db = pd.read_csv(
    "./dist2land_data/dist2coast.txt.bz2",
    sep='\t',
    header=None,
    names=['lon', 'lat', 'dist'],
)

In [6]:
# Make a real copy for safety.
# Plain assignment (dist_db_copy = dist_db) would only bind a second name to the
# same DataFrame, so in-place edits made through dist_db_copy would also change
# dist_db. .copy() duplicates the data (this costs extra memory for a table of
# this size, but keeps the raw values intact).
dist_db_copy = dist_db.copy()

# first rows of the raw table; kept as the last expression so the cell displays it
dist_db.head()

In [7]:
# Boolean Series, True for every row whose longitude is below zero
mask = dist_db_copy['lon'].lt(0)
mask


Out[7]:
0            True
1            True
2            True
3            True
4            True
5            True
6            True
7            True
8            True
9            True
10           True
11           True
12           True
13           True
14           True
15           True
16           True
17           True
18           True
19           True
20           True
21           True
22           True
23           True
24           True
25           True
26           True
27           True
28           True
29           True
            ...  
40499970    False
40499971    False
40499972    False
40499973    False
40499974    False
40499975    False
40499976    False
40499977    False
40499978    False
40499979    False
40499980    False
40499981    False
40499982    False
40499983    False
40499984    False
40499985    False
40499986    False
40499987    False
40499988    False
40499989    False
40499990    False
40499991    False
40499992    False
40499993    False
40499994    False
40499995    False
40499996    False
40499997    False
40499998    False
40499999    False
Name: lon, dtype: bool

In [8]:
# Shift the negative longitudes by 360 so that every longitude is positive.
# - The rows are selected from the current values, so running this cell a
#   second time finds no negative longitude and changes nothing.
# - The write goes through one .loc call; chained indexing of the form
#   df.lon[rows] = ... is not guaranteed by pandas to modify the frame itself.
negative_lon = dist_db_copy['lon'] < 0
dist_db_copy.loc[negative_lon, 'lon'] += 360

# '%4.3f' prints a float with three decimals ('%f4.3' would print the float
# followed by the literal text '4.3')
print('after processing, the minimum longitude is %4.3f and maximum is %4.3f' % (dist_db_copy.lon.min(), dist_db_copy.lon.max()))


after processing, the minimum longitude is 0.0200004.3 and maximum is 359.9800004.3

In [9]:
# reduce dataset
# Keep only the grid points inside the Arabian Sea box:
# 45 < lon < 75 and 5 < lat < 28 (all four bounds are exclusive)
lon_in_box = (dist_db_copy.lon > 45) & (dist_db_copy.lon < 75)
lat_in_box = (dist_db_copy.lat > 5) & (dist_db_copy.lat < 28)
arabian_sea = lon_in_box & lat_in_box
dist_db_arabian = dist_db_copy[arabian_sea]

# compare the size of the full table with the size of the regional subset
shapes = (dist_db_copy.shape, dist_db_arabian.shape)
print('dist_db_copy.shape is %s, dist_db_arabian.shape is %s' % shapes)


dist_db_copy.shape is (40500000, 3), dist_db_arabian.shape is (431250, 3)

In [12]:
# Scatter map of the unsigned distance (points on land and at sea alike)
# for the Arabian Sea subset; the colour of each point is its 'dist' value.
fig, ax = plt.subplots(figsize=(12, 8))
scatter_style = dict(c='dist', cmap='RdBu_r', edgecolor='none')
dist_db_arabian.plot(
    kind='scatter',
    x='lon',
    y='lat',
    ax=ax,
    title='distance to the nearest coast',
    **scatter_style
)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a742a90>

In [11]:
# Convert the regional table into an xarray Dataset so that points can be
# looked up by coordinate with dataset.sel.
# set_index makes lon and lat the index levels, which become the Dataset
# dimensions; reset_index would revert this operation.
dist_by_lonlat = dist_db_arabian.set_index(['lon', 'lat'])
dist_DS = xr.Dataset.from_dataframe(dist_by_lonlat)
dist_DS


Out[11]:
<xarray.Dataset>
Dimensions:  (lat: 575, lon: 750)
Coordinates:
  * lon      (lon) float64 45.02 45.06 45.1 45.14 45.18 45.22 45.26 45.3 ...
  * lat      (lat) float64 5.02 5.06 5.1 5.14 5.18 5.22 5.26 5.3 5.34 5.38 ...
Data variables:
    dist     (lon, lat) float64 280.4 283.3 286.3 289.3 292.3 295.3 298.3 ...

In [13]:
# load the floats data
# ********************
# *** CSV files ***
# ********************
# The lon and lat columns of this table are the positions for which the
# distance to the coast is looked up further down.

# load CSV output; the 'index' column of the file becomes the row index
# somehow the CSV format has some compatibility issues here
df_chl_out_2D = pd.read_csv(
    './data_collector_modisa_chla9km/df_chl_out_2D_modisa.csv',
    index_col='index',
)
df_chl_out_2D


Out[13]:
id time lon ve var_lon var_tmp vn spd var_lat lat temp chlor_a chlor_a_log10 chl_rate chl_rate_log10
index
15828 34721 2002-11-03 67.570000 3.303000 0.000135 0.001823 2.865500 8.211375 0.000069 12.647250 29.435500 0.123307 -0.909012 -0.010569 NaN
16081 34710 2002-11-05 63.160750 0.491500 0.000088 0.001596 10.219250 10.749875 0.000050 17.127000 28.991250 0.445077 -0.351565 0.033500 -1.474955
16320 11089 2002-11-07 64.835375 -15.401500 0.000105 0.003391 2.263500 16.392000 0.000056 14.366875 28.939625 0.182671 -0.738330 0.024246 -1.615360
16322 15707 2002-11-07 67.399125 -19.144750 0.000118 1000.000000 -21.695125 29.499000 0.000063 13.739500 NaN 0.161665 -0.791384 -0.004139 NaN
16336 34315 2002-11-07 57.375375 -49.749875 0.000085 0.004589 4.788500 52.729375 0.000048 5.613000 29.647125 0.115356 -0.937960 -0.009359 NaN
16340 34710 2002-11-07 63.170375 -0.728375 0.000148 0.001614 10.725375 11.110875 0.000075 17.297375 28.900250 0.376970 -0.423693 -0.068107 NaN
16579 11089 2002-11-09 64.586375 -14.547875 0.000189 0.004779 -3.024000 15.786125 0.000090 14.356250 29.047125 0.194167 -0.711825 0.011496 -1.939453
16581 15707 2002-11-09 67.213250 -5.045875 0.000183 1000.000000 -23.739500 24.623250 0.000089 13.350625 NaN 0.164932 -0.782695 0.003267 -2.485850
16605 34721 2002-11-09 67.965125 13.566000 0.000215 0.002024 1.321875 15.994875 0.000099 12.587500 29.499125 0.143538 -0.843033 -0.006717 NaN
16836 10206 2002-11-11 67.132250 0.870375 0.001112 1000.000000 0.028125 1.862250 0.000387 11.153750 NaN 0.125101 -0.902739 -0.010332 NaN
16838 11089 2002-11-11 64.346000 -17.929000 0.000106 0.003450 -3.980500 18.692500 0.000056 14.287500 28.949875 0.192742 -0.715024 -0.001425 NaN
16864 34721 2002-11-11 68.167500 10.140250 0.000095 0.001749 12.864500 17.991375 0.000052 12.704750 29.393250 0.116437 -0.933909 -0.027101 NaN
17097 11089 2002-11-13 64.063625 -17.144125 0.000109 0.003639 -8.458625 20.047500 0.000058 14.201750 28.580125 0.310467 -0.507985 0.117725 -0.929131
17376 34710 2002-11-15 62.952000 3.871500 0.000097 0.001616 42.243000 42.566500 0.000054 18.600125 27.833875 0.588544 -0.230221 0.096043 -1.017532
17631 34315 2002-11-17 56.618250 27.657625 0.000099 0.003358 6.597500 28.794500 0.000053 8.295375 28.890875 0.135714 -0.867375 -0.023010 NaN
17890 34315 2002-11-19 57.227875 40.891500 0.000082 0.003607 -16.705250 45.736625 0.000045 8.237625 28.946000 0.134862 -0.870110 -0.000852 NaN
18412 34710 2002-11-23 61.505875 -28.221000 0.000107 0.001671 50.589125 58.013875 0.000058 21.247250 27.339625 0.461292 -0.336024 -0.053822 NaN
18671 34710 2002-11-25 61.228750 3.868750 0.000083 0.001570 36.908250 40.428250 0.000045 21.894000 27.418125 0.472869 -0.325259 0.011577 -1.936404
18930 34710 2002-11-27 61.789250 56.797500 0.000107 0.001730 59.141000 82.187625 0.000056 22.656125 27.353875 0.490433 -0.309420 0.017564 -1.755377
19185 34315 2002-11-29 57.628250 -22.780000 0.000093 0.004921 -22.870625 32.527125 0.000050 6.674250 29.091000 0.144218 -0.840981 0.001275 -2.894491
19189 34710 2002-11-29 62.757750 53.614125 0.000108 0.001707 -7.459750 60.761625 0.000057 23.191000 27.267125 0.538974 -0.268432 0.048541 -1.313891
19430 15707 2002-12-01 64.433875 -11.834500 0.000121 1000.000000 -16.979625 21.206625 0.000064 11.775250 NaN 0.155115 -0.809346 -0.010187 NaN
19444 34315 2002-12-01 57.136625 -43.623250 0.000097 0.004656 -13.519000 47.304750 0.000053 6.325875 29.152500 0.132107 -0.879074 -0.012111 NaN
19448 34710 2002-12-01 63.540375 28.900750 0.000095 0.001725 -69.216625 78.193125 0.000052 22.506000 27.212750 2.014595 0.304188 1.475621 0.168975
19454 34721 2002-12-01 68.144750 0.107500 0.000105 0.001705 8.952375 10.363500 0.000057 14.616000 28.928125 0.127701 -0.893806 -0.026448 NaN
19703 34315 2002-12-03 56.320500 -60.573375 0.000100 0.003538 5.745375 61.158375 0.000053 6.289875 28.744000 0.141334 -0.849753 0.009227 -2.034940
19707 34710 2002-12-03 63.536375 -16.860500 0.000196 0.001618 -72.597500 74.810750 0.000095 21.308000 27.071000 3.837143 0.584008 1.822548 0.260679
19713 34721 2002-12-03 68.175500 0.083000 0.000107 0.001703 9.842750 12.343000 0.000058 14.761125 28.846000 0.129488 -0.887770 0.001787 -2.747874
21002 34710 2002-12-13 64.403125 8.392625 0.000089 0.001617 11.196375 15.064125 0.000050 21.922750 26.346375 0.666574 -0.176152 0.180225 -0.744185
21261 34710 2002-12-15 64.540375 6.939500 0.000188 0.001751 3.002125 8.841375 0.000092 22.029625 26.316750 0.586663 -0.231611 -0.079911 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
639142 114917 2016-01-06 73.034500 3.679125 0.000030 0.001725 -11.626750 16.511875 0.000067 12.796250 29.599375 0.152041 -0.818039 0.002645 -2.577643
640179 114945 2016-01-14 64.360375 -10.025125 0.000045 0.001819 -2.312875 10.748375 0.000103 11.534625 28.285875 0.165873 -0.780226 0.001062 -2.973958
640437 114917 2016-01-16 72.883375 13.589000 0.000032 0.001995 18.156000 24.358875 0.000072 13.019500 29.498375 0.145469 -0.837230 -0.010957 NaN
641474 114945 2016-01-24 63.422125 -15.508375 0.000052 0.001857 -2.947125 16.809250 0.000119 11.429875 27.825375 0.230810 -0.636745 0.027577 -1.559458
641507 147127 2016-01-24 63.999375 -7.378750 0.000029 0.001707 -11.448125 14.115250 0.000062 17.388750 26.014875 0.723136 -0.140780 0.370582 -0.431116
642251 114945 2016-01-30 62.758750 -21.994625 0.000081 0.001775 -1.689000 22.978750 0.000176 11.332000 27.811250 0.249393 -0.603116 -0.007247 NaN
642284 147127 2016-01-30 63.762875 -11.290750 0.000033 0.001761 -21.071375 24.077750 0.000075 16.785125 25.873000 0.498726 -0.302138 -0.072032 NaN
643061 147127 2016-02-05 63.346875 7.951375 0.000030 0.001599 -13.005125 18.579625 0.000068 16.240750 25.755250 0.557763 -0.253550 -0.001579 NaN
643320 147127 2016-02-07 63.507125 3.390500 0.000018 0.002040 5.950500 12.145125 0.000041 16.145375 25.923750 1.839371 0.264669 1.281608 0.107755
643579 147127 2016-02-09 63.413000 -14.578000 0.000025 0.001646 -3.235750 16.112750 0.000057 16.239500 25.853875 2.783544 0.444598 0.944173 -0.024948
643838 147127 2016-02-11 63.122250 -18.242875 0.000015 0.001719 -4.686250 19.574125 0.000035 16.148625 25.919750 0.628723 -0.201541 -2.154821 NaN
644064 114945 2016-02-13 61.092375 -17.913500 0.000190 0.001932 10.498250 21.127750 0.000369 11.686500 27.474500 0.323277 -0.490425 0.003536 -2.451548
644097 147127 2016-02-13 62.951250 -9.086625 0.000021 0.001674 0.620625 13.320375 0.000050 16.119125 25.953375 0.690027 -0.161134 0.061304 -1.212510
644107 60150420 2016-02-13 61.325500 17.384375 0.000005 0.001684 23.459000 29.365250 0.000003 8.555625 27.533875 0.181103 -0.742074 0.002238 -2.650210
644323 114945 2016-02-15 60.796625 -17.572750 0.000057 0.001992 4.481250 18.944375 0.000130 11.797125 27.450125 0.304521 -0.516382 -0.018756 NaN
644366 60150420 2016-02-15 61.501125 3.067375 0.000005 0.001684 19.457750 20.727875 0.000003 8.905125 27.459125 0.191219 -0.718470 0.010115 -1.995016
646169 147127 2016-02-29 62.393375 -2.211750 0.000021 0.001621 -3.316000 4.783375 0.000047 16.148375 26.163125 1.065263 0.027457 0.489301 -0.310424
646428 147127 2016-03-02 62.357000 -1.091125 0.000018 0.001607 -1.968500 4.452125 0.000040 16.115000 26.305625 0.748080 -0.126052 -0.317182 NaN
646654 114945 2016-03-04 58.558000 -11.817250 0.000271 0.002018 2.511250 12.682625 0.000499 12.121625 27.794250 0.197348 -0.704766 0.012846 -1.891234
646687 147127 2016-03-04 62.347500 -0.529750 0.000019 0.001620 -5.424875 8.089875 0.000046 16.075375 26.686250 0.581469 -0.235474 -0.166612 NaN
646908 114873 2016-03-06 55.233625 -19.139750 0.000059 0.001751 -7.309125 20.571625 0.000130 8.470125 27.989000 0.153596 -0.813619 -0.004269 NaN
646946 147127 2016-03-06 62.302000 -9.565750 0.000014 0.001622 -17.889250 21.602500 0.000031 15.891250 26.725375 0.461541 -0.335790 -0.119928 NaN
647190 127429 2016-03-08 73.433000 -38.342250 0.000025 0.001743 2.449500 39.909500 0.000058 5.192875 29.816375 0.114650 -0.940624 -0.003402 NaN
647205 147127 2016-03-08 62.019875 -22.442000 0.000022 0.001673 -4.468125 26.933625 0.000051 15.642250 26.796250 0.393819 -0.404703 -0.067722 NaN
647464 147127 2016-03-10 61.795250 0.052125 0.000021 0.001705 12.220000 13.752125 0.000048 15.758375 27.590750 0.393192 -0.405396 -0.000627 NaN
647474 60150420 2016-03-10 61.905750 -26.666750 0.000005 0.001684 -0.501250 26.782875 0.000003 10.351625 28.316750 0.125900 -0.899975 -0.018524 NaN
647685 114873 2016-03-12 54.483500 -11.473250 0.000021 0.001851 -8.404250 14.929250 0.000048 8.216875 28.428125 0.128125 -0.892368 -0.009478 NaN
647944 114873 2016-03-14 54.361125 -7.151000 0.000019 0.001713 -5.623625 9.931875 0.000044 8.080250 28.609750 0.125732 -0.900553 -0.002392 NaN
649287 60150420 2016-03-24 58.873125 -38.982125 0.000005 0.001684 -11.351875 40.748625 0.000003 9.988250 28.701250 0.159817 -0.796378 -0.100210 NaN
649536 147127 2016-03-26 61.967375 -0.919500 0.000018 0.001664 -15.201000 16.286000 0.000043 14.534125 28.357250 0.297591 -0.526380 0.019124 -1.718413

805 rows × 15 columns


In [ ]:
# Disabled cell: everything below sits inside a triple-quoted string, so none of
# it runs as code. It keeps an alternative loader (HDF5 file instead of CSV)
# and a groupby check for reference.
'''
# load the 2D data, based on the floats data and the lagrangian rate of change on chl_ocx
import pandas as pd
test = pd.read_hdf('df_chl_out_2D.h5')
test

# a check
list(test.groupby(['id']))
'''

In [14]:
# check the lat and lon
# df_chl_out_2D.lon
# df_chl_out_2D.lat

In [15]:
# For every float observation, pick the distance stored at the nearest grid
# point of dist_DS. This is a pointwise selection: the i-th lon is paired with
# the i-th lat, giving one value per observation along a 'points' dimension.
float_lon = list(df_chl_out_2D.lon)
float_lat = list(df_chl_out_2D.lat)
if hasattr(dist_DS.dist, 'sel_points'):
    # older xarray releases: dedicated pointwise selector
    tmp_dist = dist_DS.dist.sel_points(lon=float_lon, lat=float_lat, method='nearest')
else:
    # sel_points is not available in current xarray releases; indexers that
    # share a common 'points' dimension give the same pointwise selection
    # through the regular .sel method
    tmp_dist = dist_DS.dist.sel(
        lon=xr.DataArray(float_lon, dims='points'),
        lat=xr.DataArray(float_lat, dims='points'),
        method='nearest',
    )

# sanity check: number of positions for which no distance was found
print('the count of nan vaues in tmpAll is',tmp_dist.to_series().isnull().sum())
tmp_dist.to_series()


the count of nan vaues in tmpAll is 0
Out[15]:
points
0      514.4420
1      579.2250
2      855.6750
3      572.6910
4      824.3420
5      568.6320
6      878.5510
7      572.6400
8      470.1010
9      540.8280
10     866.4410
11     452.2190
12     848.3430
13     467.1530
14     520.5600
15     560.4820
16     204.7940
17     149.9980
18     201.5180
19     732.5160
20     215.2650
21     839.5940
22     741.0380
23     291.2670
24     550.8700
25     698.6280
26     399.2320
27     558.2770
28     356.4060
29     347.1890
         ...   
775    124.2240
776    842.6260
777    147.4100
778    947.0290
779    632.1170
780    902.8360
781    654.2140
782    655.4240
783    676.9840
784    664.6880
785    645.3280
786    719.9600
787    629.2100
788    865.5090
789    684.1940
790    860.9300
791    576.3870
792    575.3420
793    437.1820
794    577.8640
795    445.3440
796    584.7310
797     17.5207
798    577.7550
799    552.5030
800    838.9730
801    417.0810
802    414.2320
803    549.9370
804    661.8910
Name: dist, dtype: float64

In [16]:
# Attach the looked-up distances as a new 'dist' column.
# tmp_dist is ordered like the rows of df_chl_out_2D but is labelled by its own
# 0..n-1 'points' counter, so the raw values are assigned by position instead
# of being aligned on labels.
df_chl_out_2D['dist'] = tmp_dist.values

# The message names the 'dist' column, which is the column checked here;
# the count should equal the one printed for tmp_dist.
print("after editing the dataframe the nan values in 'dist' is", df_chl_out_2D.dist.isnull().sum())

# visualize the float positions around the arabian sea region,
# coloured by their distance to the nearest coast
fig, ax = plt.subplots(figsize=(12, 10))
df_chl_out_2D.plot(kind='scatter', x='lon', y='lat', c='dist', cmap='RdBu_r', edgecolor='none', ax=ax, title='distance to the nearest coast')


after editing the dataframe the nan values in 'chl_ocx' is 0
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a756588>

In [17]:
# CSV output with an explicit index column
# df_chl_out_2D_modisa_3.csv -- every column of df_chl_out_2D, now including dist
# 3 represents 3 features: {temp, chlor_a, dist}
csv_with_dist = 'df_chl_out_2D_modisa_3.csv'
df_chl_out_2D.to_csv(csv_with_dist, sep=',', index_label='index')

# read the file back as a round-trip check and show its first rows
test = pd.read_csv(csv_with_dist, index_col='index')
test.head()


Out[17]:
id time lon ve var_lon var_tmp vn spd var_lat lat temp chlor_a chlor_a_log10 chl_rate chl_rate_log10 dist
index
15828 34721 2002-11-03 67.570000 3.303000 0.000135 0.001823 2.865500 8.211375 0.000069 12.647250 29.435500 0.123307 -0.909012 -0.010569 NaN 514.442
16081 34710 2002-11-05 63.160750 0.491500 0.000088 0.001596 10.219250 10.749875 0.000050 17.127000 28.991250 0.445077 -0.351565 0.033500 -1.474955 579.225
16320 11089 2002-11-07 64.835375 -15.401500 0.000105 0.003391 2.263500 16.392000 0.000056 14.366875 28.939625 0.182671 -0.738330 0.024246 -1.615360 855.675
16322 15707 2002-11-07 67.399125 -19.144750 0.000118 1000.000000 -21.695125 29.499000 0.000063 13.739500 NaN 0.161665 -0.791384 -0.004139 NaN 572.691
16336 34315 2002-11-07 57.375375 -49.749875 0.000085 0.004589 4.788500 52.729375 0.000048 5.613000 29.647125 0.115356 -0.937960 -0.009359 NaN 824.342

In [18]:
# summary
# 1. shift the negative longitudes by 360 so that every longitude is positive
# 2. convert the distance dataframe into an xarray dataset
# 3. select the nearest grid point of the dataset for each float position
#    and store the resulting distances back in the float dataframe

# TODO: consider saving the result of the 2D lookup as a binary file to save time
#

In [19]:
from datetime import datetime, timedelta

# calendar date that lies 184 days after 1 January 2002
year_start = datetime(year=2002, month=1, day=1)
year_start + timedelta(days=184)


Out[19]:
datetime.datetime(2002, 7, 4, 0, 0)