In [1]:
import datetime
import re
import zipfile
import os
import numpy as np
import scipy.interpolate
import pyproj
import netCDF4
import pandas
# pattern to extract the numeric station id from a .rlrdata filename
PSMSLNAME = re.compile(r'(?P<id>\d+)\.rlrdata$')
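The pattern above pulls the numeric station id out of each `.rlrdata` member name; a minimal check against a made-up path shows how it is meant to be used:
In [ ]:
# sanity check of the filename pattern (the path here is made up)
m = PSMSLNAME.search('rlr_monthly/data/1551.rlrdata')
assert m is not None and m.group('id') == '1551'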
In [2]:
# open the zip file
zf = zipfile.ZipFile('rlr_monthly.zip')
# create dataframes for each time series
dfs = []
for info in zf.filelist:
    # only loop over data files
    if not info.filename.endswith('rlrdata'):
        continue
    # open the file
    f = zf.open(info.filename)
    # load it as a csv file
    df = pandas.read_csv(f, sep=';', header=None, index_col=None)
    # set the columns (names taken from the PSMSL matlab parser)
    df.columns = ['year.month', 'waterlevel', 'missing', 'dataflag']
    # the station id is encoded in the filename
    id = int(PSMSLNAME.search(info.filename).group('id'))
    df["id"] = id
    dfs.append(df)
# combine all stations into one frame
series_df = pandas.concat(dfs)
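A quick shape-and-columns check after the concat is a cheap guard against parsing regressions; this is just a sketch of what such a check could look like:
In [ ]:
# the concatenated frame should carry the four data columns plus the station id
assert list(series_df.columns) == ['year.month', 'waterlevel', 'missing', 'dataflag', 'id']
print(len(series_df), 'monthly records')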
In [3]:
# read the station overview
f = zf.open('rlr_monthly/filelist.txt')
overview_df = pandas.read_csv(f, sep=';', header=None, index_col=None)
overview_df.columns = ['id', 'latitude', 'longitude', 'name', 'coastline', 'stationcode', 'stationflag']
overview_df["name"] = overview_df["name"].apply(str.strip)
In [4]:
# add the extra station series from the extra/ folder
dfs = []
for stationfile in os.listdir('extra'):
    if stationfile == 'stations.csv':
        continue
    df = pandas.read_csv(os.path.join('extra', stationfile))
    dfs.append(df)
series_df = pandas.concat(dfs + [series_df.copy()])
# and the matching station metadata
df = pandas.read_csv(os.path.join('extra', 'stations.csv'))
overview_df = pandas.concat([overview_df, df])
In [5]:
# coerce the time series types
# integer columns cannot hold missings (http://pandas.pydata.org/pandas-docs/stable/gotchas.html)
series_df["waterlevel"] = series_df.waterlevel.astype('double')
# -99999 is the PSMSL missing-value sentinel
series_df.loc[series_df["waterlevel"] == -99999, "waterlevel"] = np.nan
# date time transformation
series_df["year"] = np.floor(series_df["year.month"])
overview_df = overview_df.set_index('id')
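After the sentinel replacement no -99999 values should survive; a one-line check, purely as a sketch:
In [ ]:
# all sentinel values should now be NaN
assert not (series_df['waterlevel'] == -99999).any()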
In [6]:
# compute the month from the fractional year
# (plain numpy arrays here side-step a pandas memory error)
series_df["month"] = np.floor((np.array(series_df["year.month"]) - np.array(series_df["year"])) * 12) + 1
# make datetimes (first of each month)
f = lambda x: datetime.datetime(int(x["year"]), int(x["month"]), 1)
series_df["time"] = pandas.to_datetime(series_df[["year", "month"]].apply(f, axis=1))
# group by station id for later lookup
data_grouped = series_df.groupby(["id"])
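With the series grouped by station id, a single station's record can be pulled out with `get_group`; the id below is only an example:
In [ ]:
# fetch one station's monthly series (the id is illustrative)
example = data_grouped.get_group(20)
print(example[['time', 'waterlevel']].head())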
In [7]:
# Read the GIA file
ds = netCDF4.Dataset('./dsea250.1grid.ICE5Gv1.3_VM2_L90_2012.nc')
lon = ds.variables['Lon'][:]
lat = ds.variables['Lat'][:]
dsea = ds.variables['Dsea_250'][:]
print(ds.variables['Dsea_250'].dimensions)
ds.close()
# flip for increasing lat
dsea = dsea[::-1,:] # flipud
lon = np.mod(np.deg2rad(lon), np.pi*2)
lat = np.deg2rad(lat[::-1]) + 0.5*np.pi # flip
# interpolate on the earth
F = scipy.interpolate.RectSphereBivariateSpline(u=lat, v=lon, r=dsea)
overview_df["peltier"] = F.ev(np.deg2rad(overview_df.latitude) + 0.5*np.pi, np.mod(np.deg2rad(overview_df.longitude), np.pi*2))
In [8]:
# Read wind reanalysis
ds = netCDF4.Dataset('uwnd.10m.mon.mean.nc')
lon = ds.variables['lon'][:]
lat = ds.variables['lat'][:]
u = ds.variables['uwnd'][:]
time = netCDF4.num2date(ds.variables['time'][:], ds.variables['time'].units)
print(ds.variables['uwnd'].dimensions)
ds.close()
lon = np.mod(np.deg2rad(lon), np.pi*2)
lat = np.deg2rad(lat[::-1]) + 0.5*np.pi # flip
u = u[:,::-1,:]
ds = netCDF4.Dataset('vwnd.10m.mon.mean.nc')
v = ds.variables['vwnd'][:]
ds.close()
v = v[:,::-1,:]
In [9]:
# Interpolate on the earth, to each station
dfs = []
for i, t in enumerate(time):
    F_i = scipy.interpolate.RectSphereBivariateSpline(u=lat, v=lon, r=u[i,:,:])
    u_i = F_i.ev(np.deg2rad(overview_df.latitude) + 0.5*np.pi, np.mod(np.deg2rad(overview_df.longitude), np.pi*2))
    F_i = scipy.interpolate.RectSphereBivariateSpline(u=lat, v=lon, r=v[i,:,:])
    v_i = F_i.ev(np.deg2rad(overview_df.latitude) + 0.5*np.pi, np.mod(np.deg2rad(overview_df.longitude), np.pi*2))
    df = overview_df[[]].reset_index()
    df["u_i"] = u_i
    df["v_i"] = v_i
    df["time"] = t
    dfs.append(df)
df = pandas.concat(dfs)
uv_grouped = df.groupby(["id"])
In [10]:
# Do an inverse barometer correction
ds = netCDF4.Dataset('./slp.mnmean.real.nc')
time = netCDF4.num2date(ds.variables['time'][:],ds.variables['time'].units)
lon = ds.variables['lon'][:].astype('double')
lat = ds.variables['lat'][:].astype('double')
slp = ds.variables['slp'][:].astype('double')
print(ds.variables['slp'].dimensions)
ds.close()
ds = netCDF4.Dataset("landmask.nc")
landmask = ds.variables["mask"][:][::-1,:].astype("bool") # flip for increasing lat
ds.close()
In [11]:
slp_mask = np.tile(landmask[np.newaxis, :, :], (slp.shape[0], 1, 1))
slp_masked = np.ma.masked_array(slp, mask=slp_mask)
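np.tile materializes a full copy of the 2-D mask per time step; assuming the mask is never modified in place afterwards, broadcasting gives the same masked array without the copies:
In [ ]:
# equivalent to the tiled mask above, but without materializing the copies
slp_masked_alt = np.ma.masked_array(slp, mask=np.broadcast_to(landmask, slp.shape))
assert (slp_masked_alt.mask == slp_masked.mask).all()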
In [12]:
# area weights: cos(latitude), masked like the pressure field
area_weight = np.ma.masked_array(
    np.tile(np.cos(np.deg2rad(lat))[np.newaxis, :, np.newaxis], (slp_masked.shape[0], 1, slp_masked.shape[2])),
    slp_mask
)
# ocean-area-weighted global mean pressure per time step
glob_p = (slp_masked * area_weight).sum(1).sum(1)
glob_p /= area_weight.sum(1).sum(1)
# inverse barometer response to the local pressure anomaly
IB = -(slp_masked - glob_p[:, np.newaxis, np.newaxis]) * 0.9948
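The 0.9948 factor is the usual inverse barometer response, about 9.948 mm of sea level per hPa of pressure anomaly (expressed here in cm/hPa, matching pressure in hPa). A worked example:
In [ ]:
# a local pressure 10 hPa above the ocean mean depresses sea level by ~9.95 cm
anomaly_hpa = 10.0
print(-anomaly_hpa * 0.9948)  # -9.948 cm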
In [13]:
# Look up the closest ocean grid cell for each station
# (interpolation breaks near the coast because of the landmask)
wgs84 = pyproj.Geod(ellps='WGS84')
Lon, Lat = np.meshgrid(lon, lat)
lon1, lat1 = Lon.ravel(), Lat.ravel()
dfs = []
for id, row in overview_df.iterrows():
    lon2 = np.ones_like(Lon.ravel()) * row['longitude']
    lat2 = np.ones_like(Lat.ravel()) * row['latitude']
    # geodesic distance from every grid cell to the station
    az1, az2, dist = wgs84.inv(lon1, lat1, lon2, lat2)
    dist_masked = np.ma.masked_array(dist.reshape(Lon.shape), mask=landmask)
    idx = np.unravel_index(dist_masked.argmin(), dist_masked.shape)
    ib = IB[:, idx[0], idx[1]]
    df = pandas.DataFrame(data=dict(time=time, ib=ib))
    df['id'] = id
    dfs.append(df)
In [ ]:
ib_dfs = pandas.concat(dfs)
ib_grouped = ib_dfs.groupby(['id'])
In [ ]:
# merge water level, IB and wind series per station
merges = []
for station, row in overview_df.iterrows():
    uv = uv_grouped.get_group(station)
    ib = ib_grouped.get_group(station)
    data = data_grouped.get_group(station)
    merged = pandas.merge(data, ib, how="left", on=("time", "id"))
    merged = pandas.merge(merged, uv, how="left", on=("time", "id"))
    merges.append(merged)
dataset = pandas.concat(merges)
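Both joins are left joins on ("time", "id"), so as long as the wind and IB frames carry one row per station-month the monthly record count is preserved; a quick comparison, just as a sanity sketch:
In [ ]:
# left joins should preserve the number of monthly records
print(len(series_df), len(dataset))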
In [ ]:
dataset["u2"] = dataset["u_i"]**2
dataset["v2"] = dataset["v_i"]**2
In [44]:
output = dataset[['id', 'year', 'waterlevel', 'ib', 'u_i', 'v_i', "u2", "v2"]]
year_df = dataset[["id","year", "waterlevel", "ib", "u_i","v_i", "u2", "v2"]].groupby(['id', 'year']).mean()
year_df = year_df.reset_index()
year_df.rename(columns={'year': 'year.month'}, inplace=True)
In [45]:
output.to_csv("records.csv", index=False)
year_df.to_csv("records_annual.csv", index=False)
overview_df.to_csv("overview.csv", index=True, index_label="id")
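A read-back of the exported files makes sure they parse; a minimal round-trip sketch:
In [ ]:
# round-trip check of the exports
print(pandas.read_csv('records.csv').columns.tolist())
print(pandas.read_csv('overview.csv', index_col='id').head())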