In [100]:
# Imports
import numpy as np
import sys
sys.path.append('../') # This is where all the python files are!
from importlib import reload
import utils; reload(utils)
from utils import *
import keras_models; reload(keras_models)
from keras_models import *
from losses import crps_cost_function
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import animation
import seaborn as sns
sns.set_style('dark')
sns.set_context('poster')
from tqdm import tqdm_notebook as tqdm
from collections import OrderedDict
from IPython.display import HTML
import time
import pickle  # used below to load the pre-pickled data sets
import keras   # keras.models.load_model is used below
from keras.utils.generic_utils import get_custom_objects
metrics_dict = dict([(f.__name__, f) for f in [crps_cost_function]])
get_custom_objects().update(metrics_dict)
In [101]:
# Basic setup
DATA_DIR = '/Volumes/STICK/data/ppnn_data/' # Mac
# DATA_DIR = '/project/meteo/w2w/C7/ppnn_data/' # LMU
So what is a neural network anyway? Let's start by looking at a picture.
A neural network consists of several layers of interconnected nodes. Each node computes a weighted sum of all the nodes in the previous layer plus a bias:
$$\sum_j w_j x_j + b$$
Additionally, the output of each hidden node is passed through a non-linear activation function $g(z)$. One very simple and popular activation function is ReLU:
$$\mathrm{relu}(z) = \mathrm{max}(0, z)$$
Let's build a simplified network with one input and one output in pure Numpy.
In [102]:
# Create the data
n_samples = 50
x = np.expand_dims(np.random.uniform(0, 1, n_samples), -1)
y = np.sin(2 * x) + 0.5 * np.sin(15 * x)
plt.scatter(x, y);
x.shape
Out[102]:
In [103]:
# Initialize the weights and biases for the input --> hidden layer step
n_hidden = 200 # Number of nodes in hidden layer
w1 = np.random.normal(size=(1, n_hidden)) # a matrix
b1 = np.random.normal(size=n_hidden) # a vector
w1.shape, b1.shape
Out[103]:
In [104]:
# Do the first step
hidden = np.dot(x, w1) + b1
hidden.shape
Out[104]:
In [105]:
# Here comes the non-linearity
def relu(z):
    return np.maximum(0, z)
In [106]:
hidden = relu(hidden)
In [107]:
# Now the weights and biases for the hidden --> output step
w2 = np.random.normal(size=(n_hidden, 1))
b2 = np.random.normal(size=1)
w2.shape, b2.shape
Out[107]:
In [108]:
# Now the second step
preds = np.dot(hidden, w2) + b2
preds.shape
Out[108]:
In [109]:
plt.scatter(x, y);
plt.scatter(x, preds);
Now this is of course just some random garbage. The goal of the optimization algorithm is to change the weights and biases so that the output gets closer to the target. Mathematically, we are minimizing a loss function, for example the mean squared error:
$$L = \frac{1}{N_\mathrm{samples}} \sum_{i=1}^{N_\mathrm{samples}} (\mathrm{output}_i - \mathrm{target}_i)^2$$
To minimize this loss we use stochastic gradient descent. For this we need to compute the gradient of the loss with respect to all weights and biases, which basically means applying the chain rule of calculus. The algorithm that does this efficiently is called backpropagation.
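For our little one-hidden-layer network the chain rule works out to the following expressions (a sketch using the same symbols as the code below; the constant factor $2/N_\mathrm{samples}$ from the squared error is simply absorbed into the learning rate):
$$z_1 = x W_1 + b_1, \qquad h = \mathrm{relu}(z_1), \qquad \hat{y} = h W_2 + b_2$$
$$\delta_2 = \hat{y} - y, \qquad \frac{\partial L}{\partial W_2} = h^T \delta_2, \qquad \frac{\partial L}{\partial b_2} = \sum_i \delta_2^{(i)}$$
$$\delta_1 = (\delta_2 W_2^T) \odot \mathrm{relu}'(z_1), \qquad \frac{\partial L}{\partial W_1} = x^T \delta_1, \qquad \frac{\partial L}{\partial b_1} = \sum_i \delta_1^{(i)}$$
where $\odot$ denotes element-wise multiplication and the sums run over the samples in the batch.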
In [110]:
def mse(predictions, targets):
    return np.mean((predictions - targets) ** 2)
In [111]:
mse(preds, y)
Out[111]:
In [112]:
# Some helper function to reset the weights
def init_weights(n_hidden):
    w1 = np.random.normal(size=(1, n_hidden)) # a matrix
    b1 = np.random.normal(size=n_hidden) # a vector
    w2 = np.random.normal(size=(n_hidden, 1))
    b2 = np.random.normal(size=1)
    return [w1, w2], [b1, b2]
In [113]:
# First define the forward pass.
def forward_pass(x, weights, biases):
    hidden = relu(np.dot(x, weights[0]) + biases[0])
    return np.dot(hidden, weights[1]) + biases[1]
In [114]:
# Define the derivative of the loss function and the activation function
def dmse(predictions, targets):
    return predictions - targets

def drelu(z):
    return 1. * (z > 0)
In [115]:
def backprop_and_update(x, y, weights, biases, lr=1e-5):
    # Compute the predictions
    hidden = relu(np.dot(x, weights[0]) + biases[0])
    preds = np.dot(hidden, weights[1]) + biases[1]
    # Compute the loss
    loss = mse(preds, y)
    # Compute the gradients (deltas)
    delta2 = dmse(preds, y)
    dw2 = np.dot(hidden.T, delta2)
    db2 = np.sum(delta2, axis=0)
    delta1 = np.dot(delta2, weights[1].T) * drelu(hidden)
    dw1 = np.dot(x.T, delta1)
    db1 = np.sum(delta1, axis=0)
    # Update parameters
    weights[0] -= lr * dw1
    biases[0] -= lr * db1
    weights[1] -= lr * dw2
    biases[1] -= lr * db2
    return loss
In [118]:
weights, biases = init_weights(n_hidden)
In [119]:
n_steps = 50000
saved_preds = []
pbar = tqdm(total=n_steps)
for i in range(n_steps):
    loss = backprop_and_update(x, y, weights, biases, lr=1e-4)
    pbar.update(1)
    pbar.set_postfix(OrderedDict({'loss': loss}))
    if i % 500 == 0:
        saved_preds.append(forward_pass(x, weights, biases))
pbar.close()
In [120]:
saved_preds = np.array(saved_preds)
saved_preds.shape
Out[120]:
In [121]:
fig, ax = plt.subplots()
ax.scatter(x, y)
s = ax.scatter(x[:, 0], saved_preds[0, :, 0])
ax.set_ylim(0, 2)
def animate(i):
    y_i = saved_preds[i, :, 0]
    s.set_offsets(np.array([x[:, 0], y_i]).T)
plt.close();
In [122]:
ani = animation.FuncAnimation(fig, animate, np.arange(saved_preds.shape[0]),
interval=100)
In [123]:
HTML(ani.to_html5_video())
Out[123]:
In [124]:
network = Sequential([
    Dense(n_hidden, input_dim=1, activation='relu'),
    Dense(1)
])
network.summary()
In [125]:
network.compile(SGD(), 'mse')
In [126]:
network.fit(x, y, batch_size=x.shape[0], epochs=10)
Out[126]:
In post-processing we want to correct model biases by looking at past forecast/observation pairs. Specifically, if we are looking at probabilistic/ensemble forecasts, we want the forecast to be calibrated: for example, for all cases where the forecast says that the chance of rain is 40%, it should actually rain in 40% of those cases.
For this study we are looking at 48 h ensemble forecasts of temperature at around 500 DWD surface stations in Germany. Our forecasts are ECMWF 50-member ensemble forecasts taken from the TIGGE dataset, which contains forecasts from 2008 to now, upscaled to 40 km grid spacing. The forecast data was bilinearly interpolated to the station locations.
We will use all of 2015 for training the model and all of 2016 to test how well the model performs.
In [127]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates)
The raw ensemble contains 50 ensemble members. We take the mean and standard deviation of these 50 values, which is a good approximation since temperature is approximately normally distributed.
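The data sets loaded above already contain these summary statistics, but conceptually the reduction is just the following (a minimal sketch with a hypothetical raw ensemble array of shape (n_samples, 50); the real data is pre-processed):
raw_ens = np.random.normal(loc=15, scale=2, size=(1000, 50))  # made-up raw ensemble
ens_mean = raw_ens.mean(axis=1)  # ensemble mean per sample
ens_std = raw_ens.std(axis=1)    # ensemble spread per sample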
In [128]:
train_set.feature_names
Out[128]:
We have around 500 stations for every day, with some observation data missing. In total that makes around 180k samples.
In [129]:
len(np.unique(train_set.station_ids))
Out[129]:
In [130]:
train_set.features.shape, train_set.targets.shape
Out[130]:
In [ ]:
The goal of post-processing is to produce a sharp but reliable distribution.
In [131]:
plot_fc(train_set, 1001)
To measure the skill of the forecast, we use the CRPS:
$$ \mathrm{crps}(F, y) = \int_{-\infty}^{\infty} [F(t) - H(t-y)]^2\,\mathrm{d}t, $$
where $F(t)$ is the forecast CDF and $H(t-y)$ is the Heaviside function.
In [132]:
plot_fc(train_set, 1001, 'cdf')
For a normal distribution we can compute the CRPS analytically from the mean and standard deviation. Evaluating it for the raw ensemble mean and standard deviation gives the score we want to improve.
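crps_normal is provided by utils; for reference, a minimal sketch of the closed-form Gaussian CRPS (Gneiting et al. 2005) that such a function would implement (the exact implementation in utils may differ):
def crps_normal_sketch(mu, sigma, y):
    # CRPS of a N(mu, sigma^2) forecast for observation y
    z = (y - mu) / sigma
    return sigma * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi))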
In [133]:
np.mean(crps_normal(
test_set.features[:, 0] * test_set.scale_factors[0],
test_set.features[:, 1] * test_set.scale_factors[1],
test_set.targets
))
Out[133]:
The most common post-processing technique for this sort of problem is called Ensemble Model Output Statistics (EMOS; Gneiting et al. 2005). The goal is to find a distribution $$ \mathcal{N}(a + bX,\; c + dS^2), $$ where $X$ is the raw ensemble mean and $S$ is the raw ensemble standard deviation, such that $$ \min_{a, b, c, d} \frac{1}{N_\mathrm{samples}} \sum_{i = 1}^{N_\mathrm{samples}} \mathrm{crps}(\mathcal{N}(a + bX_i,\; c + dS_i^2), y_i). $$ The minimum over all samples is found using an optimization algorithm. We can also view this as a network graph.
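As a rough illustration (not the implementation used for the benchmarks below), the EMOS coefficients could be fit with a generic optimizer; raw_mean, raw_std and obs are hypothetical arrays of ensemble means, ensemble standard deviations and observations:
from scipy.optimize import minimize

def emos_objective(params, raw_mean, raw_std, obs):
    # Mean CRPS of N(a + b*X, c + d*S^2) over all training samples
    a, b, c, d = params
    mu = a + b * raw_mean
    sigma = np.sqrt(np.maximum(c + d * raw_std ** 2, 1e-6))  # keep the variance positive
    return np.mean(crps_normal(mu, sigma, obs))

# res = minimize(emos_objective, x0=[0., 1., 1., 1.], args=(raw_mean, raw_std, obs),
#                method='Nelder-Mead')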
There are two commonly used variants of EMOS: global EMOS, where all stations share the same coefficients and training happens over a rolling window of e.g. 25 days, and local EMOS, where each station is fit separately with a longer training window (e.g. 1 year).
The CRPS scores for 2016 are:
This is the benchmark for our networks.
Let's start by fitting a very simple fully connected network like this:
In [134]:
# Build the network using Keras
fc_model = Sequential([
    Dense(2, input_dim=2)
])
In [135]:
fc_model.summary()
In [136]:
fc_model.compile(Adam(0.1), crps_cost_function)
In [137]:
fc_model.fit(train_set.features, train_set.targets, epochs=20, batch_size=4096)
Out[137]:
In [138]:
# Now display the score for 2016
fc_model.evaluate(test_set.features, test_set.targets, 4096, verbose=0)
Out[138]:
So we basically get the same score as global EMOS, which is what we would expect.
The stations probably differ a lot in their post-processing characteristics, so we want to include this information somehow. In local EMOS, we fit a separate model for each station, but this takes a long time and does not make optimal use of all the training data.
Embeddings are a neural network technique that provides a natural way to include station information. An embedding is a mapping from a discrete object, in our case the station ID, to a vector. The elements of the vector are learned by the network just like the other weights and biases and represent some extra information about each station.
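Conceptually, an embedding is nothing more than a trainable lookup table, i.e. a matrix with one row per station that is indexed by the integer station ID (a toy illustration with made-up numbers, not part of the actual pipeline):
n_stations_demo, emb_size_demo = 5, 2  # made-up sizes for illustration
emb_matrix = np.random.normal(size=(n_stations_demo, emb_size_demo))  # learned by the network in practice
station_id = 3
emb_matrix[station_id]  # the embedding vector for this station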
In [139]:
emb_size = 2
max_id = int(np.max([train_set.cont_ids.max(), test_set.cont_ids.max()]))
max_id
Out[139]:
In [140]:
features_inp = Input(shape=(2,))
id_inp = Input(shape=(1,))
emb = Embedding(max_id+1, emb_size)(id_inp)
emb = Flatten()(emb)
x = Concatenate()([features_inp, emb])
outp = Dense(2)(x)
emb_model = Model([features_inp, id_inp], outp)
In [141]:
emb_model.summary()
In [142]:
emb_model.compile(Adam(0.1), crps_cost_function)
In [143]:
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets,
epochs=20, batch_size=4096);
In [144]:
emb_model.evaluate([test_set.features, test_set.cont_ids], test_set.targets, 4096, 0)
Out[144]:
This score is about 1% better than local EMOS, and the model is much faster to train.
In [145]:
preds = emb_model.predict([test_set.features, test_set.cont_ids], 4096)
In [146]:
plot_fc(test_set, 5, preds=preds)
In [ ]:
def create_emb_hidden_model(hidden_nodes, n_features=2, activation='relu'):
    features_inp = Input(shape=(n_features,))
    id_inp = Input(shape=(1,))
    emb = Embedding(max_id+1, emb_size)(id_inp)
    emb = Flatten()(emb)
    x = Concatenate()([features_inp, emb])
    for h in hidden_nodes:
        x = Dense(h, activation=activation)(x)
    outp = Dense(2)(x)
    return Model([features_inp, id_inp], outp)
In [ ]:
neural_net = create_emb_hidden_model([1024])
neural_net.summary()
In [ ]:
neural_net.compile(Adam(0.1), crps_cost_function)
In [ ]:
neural_net.fit([train_set.features, train_set.cont_ids], train_set.targets,
epochs=20, batch_size=4096)
In [ ]:
neural_net.evaluate([test_set.features, test_set.cont_ids], test_set.targets, 4096, 0)
For the simple input that we have, adding non-linearity doesn't help to improve the fit.
So far we have only used the temperature forecast as input but really we have a lot more variables from each forecast which might give us more information about the weather situation.
In traditional post-processing there are techniques to utilize these auxiliary variables, for example boosting.
Here are the benchmark scores from Sebastian's boosting experiments:
As a first attempt we can simply throw in these extra variables to our standard network and see what happens.
In [147]:
#aux_train_set, aux_test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
# aux_dict=aux_dict)
with open(DATA_DIR + 'pickled/aux_15_16.pkl', 'rb') as f:
    aux_train_set, aux_test_set = pickle.load(f)
In [148]:
print(aux_train_set.feature_names)
len(aux_train_set.feature_names)
Out[148]:
As with temperature, we took the ensemble mean and standard deviation of all auxiliary variables (except for the constants). Now we can build the same network as earlier but with 40 inputs.
In [149]:
aux_fc_model = Sequential([
    Dense(2, input_dim=40)
])
aux_fc_model.compile(Adam(0.1), crps_cost_function)
aux_fc_model.summary()
In [150]:
aux_fc_model.fit(aux_train_set.features, aux_train_set.targets, epochs=20, batch_size=4096)
Out[150]:
In [151]:
aux_fc_model.evaluate(aux_test_set.features, aux_test_set.targets, 4096, 0)
Out[151]:
So we get a big improvement over the 1.01 we obtained with temperature only. We are also doing better than global boosting. Next, let's include our station embeddings.
In [152]:
def create_emb_hidden_model(hidden_nodes, n_features=2, activation='relu'):
    features_inp = Input(shape=(n_features,))
    id_inp = Input(shape=(1,))
    emb = Embedding(max_id+1, emb_size)(id_inp)
    emb = Flatten()(emb)
    x = Concatenate()([features_inp, emb])
    for h in hidden_nodes:
        x = Dense(h, activation=activation)(x)
    outp = Dense(2)(x)
    return Model([features_inp, id_inp], outp)
In [153]:
aux_emb_model = create_emb_hidden_model([], n_features=40)
aux_emb_model.compile(Adam(0.01), crps_cost_function)
aux_emb_model.summary()
In [154]:
aux_emb_model.fit([aux_train_set.features, aux_train_set.cont_ids], aux_train_set.targets,
epochs=50, batch_size=1024);
In [156]:
aux_emb_model.evaluate([aux_test_set.features, aux_test_set.cont_ids],
aux_test_set.targets, 4096, 0)
Out[156]:
This is slightly worse than the local boosting algorithm.
In [157]:
nn_model = create_emb_hidden_model([100], n_features=40)
nn_model.compile(Adam(0.005), crps_cost_function)
nn_model.summary()
In [ ]:
#nn_model.fit([aux_train_set.features, aux_train_set.cont_ids], aux_train_set.targets,
# epochs=50, batch_size=4096);
In [ ]:
#nn_model.save(DATA_DIR + 'saved_models/nn_model.h5')
In [158]:
nn_model = keras.models.load_model(DATA_DIR + 'saved_models/nn_model.h5')
In [159]:
nn_model.evaluate([aux_test_set.features, aux_test_set.cont_ids],
aux_test_set.targets, 4096, 0)
Out[159]:
The added non-linearity gives us another few percent improvement compared to the local boosting algorithm. Why not try increasing the number of hidden layers and nodes?
In [160]:
better_nn = create_emb_hidden_model([512, 512], 40)
better_nn.compile(Adam(0.01), crps_cost_function)
better_nn.summary()
In [ ]:
#better_nn.fit([aux_train_set.features, aux_train_set.cont_ids], aux_train_set.targets,
# epochs=50, batch_size=1024);
In [ ]:
#better_nn.save(DATA_DIR + 'saved_models/better_nn.h5')
In [161]:
better_nn = keras.models.load_model(DATA_DIR + 'saved_models/better_nn.h5')
In [162]:
# Training score
better_nn.evaluate([aux_train_set.features, aux_train_set.cont_ids],
aux_train_set.targets, 4096, 0)
Out[162]:
In [163]:
# Test score
better_nn.evaluate([aux_test_set.features, aux_test_set.cont_ids],
aux_test_set.targets, 4096, 0)
Out[163]:
This is what is called overfitting and is a serious problem in machine learning. The model basically memorizes the training examples and does not generalize to unseen samples.
The model complexity is limited by the amount of training data!
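Besides adding more training data (which we do next), a standard countermeasure is to hold out part of the training set and stop training once the validation loss stops improving; the commented-out training call for the longer dataset below does exactly this. A hypothetical version for better_nn would look like:
from keras.callbacks import EarlyStopping

# better_nn.fit([aux_train_set.features, aux_train_set.cont_ids], aux_train_set.targets,
#               epochs=50, batch_size=1024, validation_split=0.2,
#               callbacks=[EarlyStopping(patience=2)])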
In [164]:
long_train_dates = ['2008-01-01', '2016-01-01']
#long_train_set, long_test_set = get_train_test_sets(DATA_DIR, long_train_dates, test_dates,
# aux_dict=aux_dict)
with open(DATA_DIR + 'pickled/aux_08-15_16.pkl', 'rb') as f:
    long_train_set, long_test_set = pickle.load(f)
In [165]:
long_train_set.features.shape
Out[165]:
In [166]:
nn_model = create_emb_hidden_model([500], n_features=40)
nn_model.compile(Adam(0.002), crps_cost_function)
nn_model.summary()
In [ ]:
#nn_model.fit([long_train_set.features, long_train_set.cont_ids], long_train_set.targets,
# epochs=100, batch_size=4096, validation_split=0.2,
# callbacks=[EarlyStopping(patience=2)]);
In [ ]:
#nn_model.save(DATA_DIR + 'saved_models/nn_model_long.h5')
In [167]:
nn_model = keras.models.load_model(DATA_DIR + 'saved_models/nn_model_long.h5')
In [168]:
nn_model.evaluate([long_test_set.features, long_test_set.cont_ids],
long_test_set.targets, 4096, 0)
Out[168]:
So we got another 4% improvement with more training data. We do not yet have benchmark scores for the traditional post-processing techniques for the long training period.
Neural networks provide a flexible and fast way of post-processing probabilistic NWP forecasts and are even a little better than the traditional techniques, courtesy of the added non-linearity.
This is, however, an academic dataset with a nicely behaved variable, temperature. Application-driven post-processing poses some extra challenges:
In [ ]: