In [1]:
import datetime
import re
import zipfile
import os

import numpy as np
import scipy.interpolate
import pyproj
import netCDF4
import pandas

PSMSLNAME = re.compile(r'(?P<id>\d+)\.rlrdata$')

PSMSL data

Load the monthly tide gauge records from the PSMSL (Permanent Service for Mean Sea Level) Revised Local Reference (RLR) dataset.


In [2]:
# open the zip file
zf = zipfile.ZipFile('rlr_monthly.zip')
# create a dataframe for each time series
dfs = []
for info in zf.infolist():
    # skip everything that is not a data file
    if not info.filename.endswith('rlrdata'):
        continue
    # open the file
    f = zf.open(info.filename)
    # load it as a csv file
    df = pandas.read_csv(f, sep=';', header=None, index_col=None)
    # set the columns (found in the matlab parser)
    df.columns = ['year.month', 'waterlevel', 'missing', 'dataflag']
    # the station id is encoded in the filename
    id = int(PSMSLNAME.search(info.filename).group('id'))
    df["id"] = id
    dfs.append(df)
# concatenate all stations into one long dataframe
series_df = pandas.concat(dfs)
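
A quick sanity check, assuming the files parsed as expected: one row per station per month, with waterlevel in mm relative to the RLR datum.

In [ ]:
# peek at the combined series: one row per station per month
print(series_df.shape)
series_df.head()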

In [3]:
# read the station overview
f = zf.open('rlr_monthly/filelist.txt')
# DataFrame.from_csv is deprecated; read_csv takes the same arguments here
overview_df = pandas.read_csv(f, sep=';', header=None, index_col=None)
overview_df.columns = ['id', 'latitude', 'longitude', 'name', 'coastline', 'stationcode', 'stationflag']
overview_df["name"] = overview_df["name"].apply(str.strip)

In [4]:
# add locally maintained extra stations from the 'extra' directory
dfs = []
for stationfile in os.listdir('extra'):
    if stationfile == 'stations.csv':
        continue
    df = pandas.read_csv(os.path.join('extra', stationfile))
    dfs.append(df)
series_df = pandas.concat(dfs + [series_df.copy()])

# DataFrame.append is deprecated; pandas.concat does the same thing
df = pandas.read_csv(os.path.join('extra', 'stations.csv'))
overview_df = pandas.concat([overview_df, df])

In [5]:
# clean up the time series
# integer columns cannot hold missings (http://pandas.pydata.org/pandas-docs/stable/gotchas.html)
series_df["waterlevel"] = series_df.waterlevel.astype('double')
# -99999 is the PSMSL missing-value flag; use .loc to avoid chained assignment
series_df.loc[series_df["waterlevel"] == -99999, "waterlevel"] = np.nan
# date time transformation
series_df["year"] = np.floor(series_df["year.month"])
overview_df = overview_df.set_index('id')
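
A quick check, assuming -99999 was the only missing-value flag: count how many monthly values are now NaN.

In [ ]:
# how many monthly waterlevels are missing after flagging?
print(series_df["waterlevel"].isnull().sum())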

In [6]:
# work on plain numpy arrays to avoid a memory error in the subtraction
series_df["month"] = np.floor((np.array(series_df["year.month"]) - np.array(series_df["year"]))*12) + 1
# make datetimes
f = lambda x: datetime.datetime(int(x["year"]), int(x["month"]), 1)
series_df["time"] = pandas.to_datetime(series_df[["year", "month"]].apply(f, axis=1))
# group the records by station id
data_grouped = series_df.groupby(["id"])

GIA

Correct the stations for glacial isostatic adjustment (GIA) using the Peltier ICE-5G v1.3 (VM2, L90) solution.


In [7]:
# Read the GIA file
ds = netCDF4.Dataset('./dsea250.1grid.ICE5Gv1.3_VM2_L90_2012.nc')
lon = ds.variables['Lon'][:]
lat = ds.variables['Lat'][:]
dsea = ds.variables['Dsea_250'][:]
print(ds.variables['Dsea_250'].dimensions)
ds.close()
# flip for increasing lat
dsea = dsea[::-1,:] # flipud
# RectSphereBivariateSpline expects longitude in [0, 2*pi) and a polar angle in (0, pi)
lon = np.mod(np.deg2rad(lon), np.pi*2)
lat = np.deg2rad(lat[::-1]) + 0.5*np.pi # flip and shift into (0, pi)
# interpolate on the earth and evaluate at the station locations
F = scipy.interpolate.RectSphereBivariateSpline(u=lat, v=lon, r=dsea)
overview_df["peltier"] = F.ev(np.deg2rad(overview_df.latitude) + 0.5*np.pi, np.mod(np.deg2rad(overview_df.longitude), np.pi*2))


(u'Lat', u'Lon')
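
The same degrees-to-spline-coordinates transform is applied to the grid and the stations above; a small helper (hypothetical, not part of the original notebook) names it in one place.

In [ ]:
# hypothetical helper: map (lat, lon) in degrees to the (u, v) coordinates
# used by RectSphereBivariateSpline: u in (0, pi), v in [0, 2*pi)
def sphere_coords(lat_deg, lon_deg):
    u = np.deg2rad(lat_deg) + 0.5 * np.pi  # shift latitude into (0, pi)
    v = np.mod(np.deg2rad(lon_deg), 2 * np.pi)
    return u, v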

Reanalysis

Interpolate the monthly mean 10 m wind fields (u and v components) to each station.


In [8]:
# Read wind reanalysis
ds = netCDF4.Dataset('uwnd.10m.mon.mean.nc')
lon = ds.variables['lon'][:]
lat = ds.variables['lat'][:]
u = ds.variables['uwnd'][:]
time = netCDF4.num2date(ds.variables['time'][:], ds.variables['time'].units)
print(ds.variables['uwnd'].dimensions)
ds.close()
lon = np.mod(np.deg2rad(lon), np.pi*2)
lat = np.deg2rad(lat[::-1]) + 0.5*np.pi # flip
u = u[:,::-1,:]
ds = netCDF4.Dataset('vwnd.10m.mon.mean.nc')
v = ds.variables['vwnd'][:]
ds.close()
v = v[:,::-1,:]


(u'time', u'lat', u'lon')

In [9]:
# Interpolate on the earth, to each station
# precompute the station coordinates once; the loop builds two splines per timestep
station_u = np.deg2rad(overview_df.latitude) + 0.5*np.pi
station_v = np.mod(np.deg2rad(overview_df.longitude), np.pi*2)
dfs = []
for i, t in enumerate(time):
    F_i = scipy.interpolate.RectSphereBivariateSpline(u=lat, v=lon, r=u[i,:,:])
    u_i = F_i.ev(station_u, station_v)
    F_i = scipy.interpolate.RectSphereBivariateSpline(u=lat, v=lon, r=v[i,:,:])
    v_i = F_i.ev(station_u, station_v)
    # start from just the station ids
    df = overview_df[[]].reset_index()
    df["u_i"] = u_i
    df["v_i"] = v_i
    df["time"] = t
    dfs.append(df)
df = pandas.concat(dfs)
uv_grouped = df.groupby(["id"])

Inverse barometer correction

IB (mm) = -9.948 * (SLP (mbar) - 1013.3)

where SLP is the local sea level pressure; the code below uses the time-varying, area-weighted global mean pressure over the oceans as the reference instead of the constant 1013.3.
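
As a worked example: a local pressure 10 mbar above the reference gives IB = -9.948 * (1023.3 - 1013.3) = -99.48 mm, i.e. roughly 1 cm of sea level depression per mbar of excess pressure.

In [ ]:
# worked example: +10 mbar over the reference depresses sea level by ~99.5 mm
print(-9.948 * (1023.3 - 1013.3))  # -99.48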


In [10]:
# Do an inverse barometer correction
ds = netCDF4.Dataset('./slp.mnmean.real.nc')
time = netCDF4.num2date(ds.variables['time'][:],ds.variables['time'].units)
lon = ds.variables['lon'][:].astype('double')
lat = ds.variables['lat'][:].astype('double')
slp = ds.variables['slp'][:].astype('double')
print(ds.variables['slp'].dimensions)
ds.close()

ds = netCDF4.Dataset("landmask.nc")
landmask = ds.variables["mask"][:][::-1,:].astype("bool") # no need to flip
ds.close()


(u'time', u'lat', u'lon')

In [11]:
# repeat the land mask over all timesteps and mask out land in the SLP field
slp_mask = np.tile(landmask[np.newaxis, :, :], (slp.shape[0], 1, 1))
slp_masked = np.ma.masked_array(slp, mask=slp_mask)

In [12]:
# cosine-of-latitude area weights (lat is in degrees, so convert with deg2rad)
area_weight = np.ma.masked_array(
    np.tile(np.cos(np.deg2rad(lat))[np.newaxis, :, np.newaxis], (slp_masked.shape[0], 1, slp_masked.shape[2])),
    slp_mask
)
# area-weighted global mean sea level pressure over the oceans
glob_p = (slp_masked*area_weight).sum(1).sum(1)
glob_p /= area_weight.sum(1).sum(1)

# IB in mm: -9.948 mm per mbar of local pressure excess over the global mean
IB = -(slp_masked - np.tile(glob_p[:, np.newaxis, np.newaxis], (1, slp_masked.shape[1], slp_masked.shape[2])))*9.948
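
A quick sanity check, assuming the SLP field is in mbar: the area-weighted ocean mean should sit near the standard atmosphere of ~1013 mbar.

In [ ]:
# the area-weighted global mean SLP should be close to ~1013 mbar
print(glob_p.mean())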

In [13]:
# Look up the closest ocean grid cell for each station (spline interpolation breaks on the land mask)
wgs84 = pyproj.Geod(ellps='WGS84')
Lon, Lat = np.meshgrid(lon, lat)

lon1, lat1 = Lon.ravel(), Lat.ravel()


dfs = []
for id, row in overview_df.iterrows():
    lon2, lat2 = np.ones_like(Lon.ravel())*row['longitude'], np.ones_like(Lat.ravel())*row['latitude']
    az1, az2, dist = wgs84.inv(lon1, lat1, lon2, lat2)
    dist_masked = np.ma.masked_array(dist.reshape(Lon.shape), mask=landmask[:,:])
    idx = np.unravel_index(dist_masked.argmin(), dist_masked.shape)
    ib = IB[:, idx[0], idx[1]]
    df = pandas.DataFrame(data=dict(time=time, ib=ib))
    df['id'] = id
    dfs.append(df)

In [ ]:
ib_dfs = pandas.concat(dfs)
ib_grouped = ib_dfs.groupby(['id'])

Merge data


In [ ]:
merges = []
for station, row in overview_df.iterrows():
    uv = uv_grouped.get_group(station)
    ib = ib_grouped.get_group(station)
    data = data_grouped.get_group(station)
    # left joins keep every waterlevel record, even where covariates are missing
    merged = pandas.merge(data, ib, how="left", on=("time", "id"))
    merged = pandas.merge(merged, uv, how="left", on=("time", "id"))
    merges.append(merged)
dataset = pandas.concat(merges)
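
A quick coverage check: the left joins keep every waterlevel record, so the covariates can be missing where the reanalysis does not overlap in time.

In [ ]:
# fraction of records without covariates after the left joins
print(dataset[["ib", "u_i", "v_i"]].isnull().mean())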

In [ ]:
dataset["u2"] = dataset["u_i"]**2
dataset["v2"] = dataset["v_i"]**2

Aggregate


In [44]:
output = dataset[["id", "year", "waterlevel", "ib", "u_i", "v_i", "u2", "v2"]]
year_df = output.groupby(["id", "year"]).mean()
year_df = year_df.reset_index()
# reuse the monthly column name so both csv files share the same schema
year_df.rename(columns={'year': 'year.month'}, inplace=True)

In [45]:
output.to_csv("records.csv", index=False)
year_df.to_csv("records_annual.csv", index=False)
overview_df.to_csv("overview.csv", index=True, index_label="id")
