In [48]:
# We need a few libraries
import io
import numpy as np
import pandas
import requests
import matplotlib.pyplot as plt
import matplotlib.style
matplotlib.style.use('ggplot')
%matplotlib inline
Let's start with downloading the data from the different tidal guages. We use the 6 main Dutch stations.
In [49]:
stations = [
['Vlissingen', 20, lambda x: x - (6976-46)],
['Hoek van Holland', 22, lambda x:x - (6994 - 121)],
['Den Helder', 23, lambda x: x - (6988-42)],
['Delfzijl', 24, lambda x: x - (6978-155)],
['Harlingen', 25, lambda x: x - (7036-122)],
['IJmuiden', 32, lambda x: x - (7033-83)],
]
# you could use the monthly data
monthly_url = 'http://www.psmsl.org/data/obtaining/rlr.monthly.data/%d.rlrdata'
# and you can use the annual dat
annual_url = 'http://www.psmsl.org/data/obtaining/rlr.annual.data/%d.rlrdata'
In [50]:
# Let's download them both
monthly_dfs = []
annual_dfs = []
for station, id, to_local in stations:
f = io.BytesIO(requests.get(monthly_url % (id,)).content)
df = pandas.read_csv(f, sep=';', names=('year', 'waterlevel', 'code', 'another code'))
# add a column with the station name
df['station'] = station
# convert to local coordinate system
df['nap'] = df['waterlevel'].apply(to_local)
# ignore the part before 1890
df = df[df.year >= 1890]
monthly_dfs.append(df)
f = io.BytesIO(requests.get(annual_url % (id,)).content)
df = pandas.read_csv(f, sep=';', names=('year', 'waterlevel', 'code', 'another code'))
df['station'] = station
df['nap'] = df['waterlevel'].apply(to_local)
df = df[df.year >= 1890]
annual_dfs.append(df)
Now we have downloaded the stations, we can create a new station with the mean. You can use many other techniques, but the mean works just fine here.
In [51]:
# compute the monthly mean
monthly_df = pandas.concat(monthly_dfs)
mean = monthly_df.groupby('year').mean().reset_index()
mean['station'] = 'mean'
monthly_df = pandas.concat([monthly_df, mean[['year', 'waterlevel', 'nap', 'station']]])
monthly_df = monthly_df.reset_index()
# add the year number
monthly_df['year_floor'] = np.floor(monthly_df['year'])
annual_df = pandas.concat(annual_dfs)
mean = annual_df.groupby('year').mean().reset_index()
mean['station'] = 'mean'
annual_df = pandas.concat([annual_df, mean[['year', 'waterlevel', 'nap', 'station']]])
annual_df = annual_df.reset_index()
# save the files to json
# add the annual mean also to the monthly means
monthly_df = monthly_df.merge(annual_df[['year', 'nap']], left_on='year_floor', right_on='year', suffixes=['', '_annual'])
monthly_df[['year', 'nap', 'nap_annual', 'station']].to_json('monthly.json', orient='records')
annual_df[['year', 'nap', 'station']].to_json('annual.json', orient='records')
df = annual_df
In [52]:
# in memory file
grouped = df.groupby(['station'])
In [53]:
fig, ax = plt.subplots(figsize=(10,7))
for station, df in grouped:
annual_jitter = np.random.uniform(-0.5, 0.5, size=len(df))
monthly_jitter = np.random.uniform(-0.5*1/12.0, 0.5*1/12.0, size=len(df))
if station == 'mean':
pch = '-'
alpha = 1.0
ax.plot(df['year'] + annual_jitter, df['nap']/10 , pch, alpha=alpha, label=station)
else:
pch = '.'
alpha = 0.5
ax.plot(df['year'] + annual_jitter, df['nap']/10 , pch, alpha=alpha, label=station)
ax.set_ylabel('waterlevel [cm NAP]')
ax.set_xlabel('year')
legend = ax.legend(loc='best')
legend.legendPatch.set_alpha(0.5)
legend.legendPatch.set_facecolor('white')
fig.savefig('stations.png')
In [54]:
import statsmodels.api as sm
df = grouped.get_group('mean')
In [55]:
y = df['waterlevel']
X = df['year']
X = sm.add_constant(X)
In [56]:
model = sm.OLS(y, X)
results = model.fit()
from IPython.display import display
display(results.summary())
In [57]:
def fit(startyear):
df_selected = df[df['year'] > startyear]
y = df_selected['waterlevel']
X = df_selected['year']
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
display(results.summary())
In [58]:
from IPython.html.widgets import interactive
In [59]:
interactive(fit, startyear=(1890, 1980, 10))
In [60]:
def plot(startyear):
df_selected = df[df['year'] > startyear]
y = df_selected['waterlevel']
X = df_selected['year']
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
fig, ax = plt.subplots()
ax.plot(df['year'], df['waterlevel'], '.', alpha=0.1)
ax.plot(df_selected['year'], results.fittedvalues)
ax.set_ylim(6400,7300)
In [61]:
interactive(plot, startyear=(1860, 1980, 10))
In [62]:
from statsmodels.sandbox.regression.predstd import wls_prediction_std
def plot(startyear):
df_selected = df[df['year'] > startyear]
y = df_selected['waterlevel']
X = df_selected['year']
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
fig, ax = plt.subplots()
ax.plot(df['year'], df['waterlevel'], '.', alpha=0.1)
ax.plot(df_selected['year'], results.fittedvalues)
fit, lower, upper = wls_prediction_std(results)
plt.fill_between(df_selected['year'], lower, upper, alpha=0.3)
ax.set_ylim(6400,7300)
In [63]:
import statsmodels.sandbox.regression.predstd
In [64]:
interactive(plot, startyear=(1860, 1980, 10))
In [65]:
%load_ext rpy2.ipython
In [66]:
%Rpush df
%R library('ggplot2')
Out[66]:
In [67]:
def test(startyear):
df_selected = df[df['year']>startyear]
%Rpush df_selected
%R print(ggplot(df, aes(year, waterlevel)) + geom_point(alpha=0.3) + stat_smooth(data=df_selected, method='loess'))
interactive(test, startyear=(1860, 1980, 10))
In [68]:
def plot(startyear):
df_selected = df[(df['year'] >= startyear) & (df['year'] < (startyear + 18.6*2))]
y = df_selected['waterlevel']
X = np.c_[df_selected['year']-1970, (df_selected['year']-1970)**2]
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
fig, ax = plt.subplots()
ax.plot(df['year'], df['waterlevel'], '.', alpha=0.3)
ax.plot(df_selected['year'], results.fittedvalues, linewidth=5)
In [69]:
interactive(plot, startyear=(1860, 1980, 5))
In [70]:
def plot(startyear, n_periods):
df_selected = df[(df['year'] >= startyear) & (df['year'] < (startyear + 18.6*n_periods))]
y = df_selected['waterlevel']
X = np.c_[
df_selected['year']-1970,
np.cos(2*np.pi*(df_selected['year']-1970)/18.613),
np.sin(2*np.pi*(df_selected['year']-1970)/18.613)
]
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
# e^{i\theta} = \cos\theta + i\sin\theta
phase = np.angle(complex(results.params['x2'], results.params['x3']))
ampl = np.sqrt(results.params['x2']**2 + results.params['x3']**2)
# plot it
fig, ax = plt.subplots()
ax.set_title("Phase (rad): %.1f, amplitude: %.1f\nA(cos): %.1f, B(sin): %.1f" % (phase, ampl, results.params['x2'], results.params['x3']))
ax.plot(df['year'], df['waterlevel'], '.', alpha=0.3)
ax.plot(df_selected['year'], results.fittedvalues, linewidth=5)
In [71]:
interactive(plot, startyear=(1860, 1980, 5), n_periods=(2,5))
In [72]:
df = annual_df[annual_df['station']=='mean']
hkv = pandas.read_csv('full_output_station_gemiddeld.csv', sep=';')
df = df.merge(hkv[['year', 'U2sin', 'U2cos']], on='year', suffixes=('', '_hkv'))
df = df.reset_index()
y = df['nap']
X = np.c_[
df['year']-1970,
np.cos(2*np.pi*(df['year']-1970)/18.613),
np.sin(2*np.pi*(df['year']-1970)/18.613),
df['U2cos'],
df['U2sin']
]
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
linear = results.params['const'] + results.params['x1'] * (df['year']-1970)
nodal = results.params['x2'] * np.cos(2*np.pi*(df['year']-1970)/18.613) + results.params['x3'] * np.sin(2*np.pi*(df['year']-1970)/18.613)
wind = results.params['x4'] * df['U2cos'] + results.params['x5'] * df['U2sin']
df['linear'] = linear
df['nodal'] = nodal
df['wind'] = wind
df['residual'] = results.resid
df['loess'] = sm.nonparametric.lowess(df['residual'], df['year'], frac=60.0/len(df))[:,1]
df['residual_loess'] = df['residual'] - df['loess']
In [73]:
df[['year', 'nap', 'station', 'linear', 'nodal', 'residual', 'wind', 'loess', 'residual_loess']].to_json('fit.json', orient='records')
In [74]:
y = df['nap']
X = np.c_[
np.ones(shape=(len(df),))
]
# compute semipartial correlations
rsquares = []
terms = [df['year']-1970,
np.c_[np.cos(2*np.pi*(df['year']-1970)/18.613), np.sin(2*np.pi*(df['year']-1970)/18.613)],
np.c_[df['U2cos'], df['U2sin']]]
names = ['linear', 'nodal', 'wind']
rsquare = 0
for name, term in zip(names, terms):
X = np.c_[X, term]
model = sm.OLS(y, X)
results = model.fit()
rsquares.append((name, results.rsquared - rsquare))
rsquare = results.rsquared
rsquares.append(('loess', 1- np.var(df['residual_loess'])/np.var(df['nap'])- rsquare))
rsquare = results.rsquared
rsquares.append(('residual', np.var(df['residual_loess'])/np.var(df['nap'])))
import json
json.dump(dict(rsquares), open('rsquares.json', 'w'))
rsquares
Out[74]:
In [75]:
a = results.summary(yname='waterlevel', xname=['constant', 'linear', 'nodal cos', 'nodal sin', 'wind cos', 'wind sin'])
#print(a.as_latex())
a
Out[75]:
In [76]:
def plot(degree):
fitted = pandas.DataFrame(dict(fit=results.resid, year=df['year']))
%Rpush fitted
%Rpush degree
%R l = predict(loess(fitted, degree=degree, control=loess.control(surface='direct')), se=TRUE, newdata=data.frame(year=seq(1890,2100)))
l = %Rget l
import pandas.rpy.common as com
l = com.load_data('l')
plt.fill_between(np.arange(1890, 2101), l['fit'] - l['se.fit']*1.96, l['fit'] + l['se.fit']*1.96, alpha=0.5)
plt.plot(np.arange(1890, 2101), l['fit'])
plt.ylim(-200,400)
plt.title('Loess model with degree %d' % degree)
plt.ylabel("sea surface height residual ['mm']")
interactive(plot, degree=(1,2))
In [77]:
# The broken linear model
y = df['nap']
X = np.c_[
df['year']-1970,
np.cos(2*np.pi*(df['year']-1970)/18.613),
np.sin(2*np.pi*(df['year']-1970)/18.613),
df['U2cos'],
df['U2sin'],
(df['year']-1990 > 0)*(df['year']-1990)
]
X = sm.add_constant(X)
model = sm.OLS(y, X)
fit = model.fit()
In [78]:
%Rpush df
%R fit <- lm(nap ~ year + I((year > 1990)*(year-1990)) + I(cos((2*pi*year-1970)/18.613)) + I(sin((2*pi*year-1970)/18.613) + U2cos + U2sin), data=df)
%R years <- seq(1890, 2100)
%R pred <- predict(fit, interval="confidence", newdata=data.frame(year=years, U2cos=rep(mean(df$U2cos), length(years)), U2sin=rep(mean(df$U2sin), length(years))))
pred = %Rget pred
%R print(summary(fit))
In [79]:
def compute(split):
%Rpush split
%Rpush df
%R fit <- lm(nap ~ year + I((year > split)*(year-split)) + I(cos((2*pi*year-1970)/18.613)) + I(sin((2*pi*year-1970)/18.613) + U2cos + U2sin), data=df)
%R years <- seq(1890, 2100)
%R pred <- predict(fit, interval="confidence", newdata=data.frame(year=years, U2cos=rep(mean(df$U2cos), length(years)), U2sin=rep(mean(df$U2sin), length(years))))
pred = %Rget pred
%R print(summary(fit))
interactive(compute, split=(1980, 2000))
In [80]:
plt.fill_between(np.arange(1890, 2101), np.array(pred)[:,1], np.array(pred)[:,2], alpha=0.5)
plt.plot(np.arange(1890, 2101), np.array(pred)[:,0])
Out[80]:
In [81]:
# The linear model
%Rpush df
%R fit <- lm(nap ~ year + I(cos((2*pi*year-1970)/18.613)) + I(sin((2*pi*year-1970)/18.613)) + U2cos + U2sin, data=df)
%R years <- seq(1890, 2100)
# Padd with means
%R U2cos = df$U2cos
%R U2cos[years>2012] = mean(df$U2cos)
%R U2sin = df$U2cos
%R U2sin[years>2012] = mean(df$U2sin)
%R pred <- predict(fit, interval="confidence", newdata=data.frame(year=years, U2cos=U2cos, U2sin=U2sin))
pred = %Rget pred
%R print(summary(fit))
In [124]:
fitted = pandas.DataFrame(dict(fit=results.resid, year=df['year']))
degree = 1
%Rpush fitted
%Rpush degree
%R l = predict(loess(fitted, degree=degree, control=loess.control(surface='direct')), se=TRUE, newdata=data.frame(year=seq(1890,2100)))
l = %Rget l
import pandas.rpy.common as com
l = com.load_data('l')
ci_loess = pandas.DataFrame(data=dict(year=np.arange(1890, 2101), se=l['se.fit'], fit=l['fit'], lwr=l['fit'] - l['se.fit']*1.96, upr=l['fit'] + l['se.fit']*1.96))
# compute angle
beta = (ci_loess[ci_loess['year'] == 2100].fit.item() - ci_loess[ci_loess['year'] == 2013].fit.item())/(2100-2013)
# compute std error (1.96 -> z 5%, 2 two-tailed)
se = ((ci_loess[ci_loess['year'] == 2100].upr.item() - ci_loess[ci_loess['year'] == 2100].lwr.item()))/1.96/2.0/(2100-2013)
beta, se
Out[124]:
In [114]:
fitted = pandas.DataFrame(dict(fit=results.resid, year=df['year']))
degree = 2
%Rpush fitted
%Rpush degree
%R fit <- loess(fitted, degree=degree, control=loess.control(surface='direct'))
%R l <- predict(fit, se=TRUE, newdata=data.frame(year=seq(1890,2100)))
import pandas.rpy.common as com
l = com.load_data('l')
fit = %Rget fit
ci_loess2 = pandas.DataFrame(data=dict(year=np.arange(1890, 2101), fit=l['fit'], lwr=l['fit'] - l['se.fit']*1.96, upr=l['fit'] + l['se.fit']*1.96))
#ci_loess = ci_loess.set_index('year')
In [84]:
ci_linear = com.load_data("pred")
ci_linear['year'] = np.arange(1890, 2101)
#ci_linear= ci_linear.set_index('year')
In [85]:
ci = pandas.merge(ci_loess, ci_loess2, on='year', suffixes=('_loess', '_loess2'))
ci = pandas.merge(ci_linear, ci, on='year', suffixes=('_linear', ''))
#ci['fit'].name('fit_linear')
#ci['lwr'].rename('lwr_linear')
#ci['upr'].rename('upr_linear')
ci.rename(columns={'fit':'fit_linear', 'lwr': 'lwr_linear', 'upr': 'upr_linear'}, inplace=True)
ci.to_csv('ci.csv')
In [86]:
plt.fill_between(np.arange(1890,2101), ci_linear['lwr'] + ci_loess['lwr'], ci_linear['upr'] + ci_loess['upr'])
plt.plot(np.arange(1890,2101), ci_linear['fit'] + ci_loess['fit'])
plt.plot(np.arange(1890,2101), ci_linear['fit'], color='black')
Out[86]:
In [40]:
plt.fill_between(np.arange(1890,2101), ci_linear['lwr'] , ci_linear['upr'] )
plt.plot(np.arange(1890,2101), ci_linear['fit'])
plt.fill_between(np.arange(1890,2101), ci_loess['lwr'] , ci_loess['upr'] )
plt.plot(np.arange(1890,2101), ci_loess['fit'])
Out[40]:
In [40]:
In [41]:
df = annual_df[annual_df['station']=='mean']
hkv = pandas.read_csv('full_output_station_gemiddeld.csv', sep=';')
df = df.merge(hkv[['year', 'U2sin', 'U2cos']], on='year', suffixes=('', '_hkv'))
df = df.reset_index()
y = df['nap']
plt.subplots(figsize=(10,6))
for split in np.arange(1990, 2000):
X = np.c_[
df['year']-1970,
(df['year']-split)*(df['year']>split),
np.cos(2*np.pi*(df['year']-1970)/18.613),
np.sin(2*np.pi*(df['year']-1970)/18.613),
df['U2cos'],
df['U2sin']
]
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
plt.plot(df['year'], results.fittedvalues, label=str(split), alpha=0.3)
In [42]:
# Based on the records of Amsterdam/Den Helder
df = pandas.read_csv('/Users/baart_f/src/oetpython/applications/sealevel//sealevel/static/data/extra/9000.csv')
# Create a plot
fig, ax1 = plt.subplots(1,1, figsize=(10,6))
# compute a trend
def trend(index):
# Select based on index
df_selected = df.ix[index]
# Create a linear model
y = df_selected['waterlevel']
X = np.c_[
df_selected['year.month'],
np.cos(2*np.pi*(df_selected['year.month']-1970)/18.613),
np.sin(2*np.pi*(df_selected['year.month']-1970)/18.613)
]
X = statsmodels.regression.linear_model.add_constant(X)
model= statsmodels.regression.linear_model.OLS(y, X)
# fit the model
result = model.fit()
# ignore if we have missings
if np.isnan(df_selected['year.month']).any():
return np.nan
# create a plot
ax1.plot(df_selected['year.month'], result.fittedvalues, 'k-', alpha=0.2, linewidth=3)
# return the trend
return result.params['x1']
df['waterlevel'][df['waterlevel'] == -99999] = np.nan
ax1.plot(df['year.month'], df['waterlevel'], '.')
ax1.set_xlabel('tijd [jaar]')
ax1.set_ylabel('zeespiegelniveau [m boven NAP]')
df = df[df['year.month']<1890]
betas = pandas.rolling_apply(np.array(df.index, dtype='int'),20, trend)
fig.savefig('amsterdam.pdf')
In [43]:
# standard error of beta_1 and confidence range
np.std(betas[~np.isnan(betas)]), 1.96 * 2 * np.std(betas[~np.isnan(betas)])
Out[43]:
In [44]:
# the same for the whole of amsterdam
resp = requests.get('http://www.psmsl.org/data/longrecords/amsterdam.sea.level')
f = io.BytesIO(resp.content)
df = pandas.read_fwf(f, widths=(10,20), names=('year.month', 'waterlevel'), skiprows=1)
df['waterlevel']/=10.0
# Create a plot
fig, ax1 = plt.subplots(1,1, figsize=(10,6))
ax1.plot(df['year.month'], df['waterlevel'], '.')
ax1.set_xlabel('tijd [jaar]')
ax1.set_ylabel('zeespiegelniveau [m boven NAP]')
betas = pandas.rolling_apply(np.array(df.index, dtype='int'),20, trend)
np.std(betas[~np.isnan(betas)]), 1.96 * 2 * np.std(betas[~np.isnan(betas)])
Out[44]:
In [44]: