In [26]:
# Imports
import sys
sys.path.append('../') # This is where all the python files are!
from importlib import reload
import utils; reload(utils)
from utils import *
import keras_models; reload(keras_models)
from keras_models import *
import losses; reload(losses)
from losses import crps_cost_function, crps_cost_function_seq
import timeit  # used for timing in train_and_valid below
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.layers import Input, Dense, merge, Embedding, Flatten, Dropout, \
    SimpleRNN, LSTM, TimeDistributed, GRU, Masking
from keras.layers.merge import Concatenate
from keras.models import Model, Sequential
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, Adam
In [3]:
# Use this if you want to limit the GPU RAM usage
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
In [4]:
# Basic setup
DATA_DIR = '/Volumes/STICK/data/ppnn_data/' # Mac
# DATA_DIR = '/project/meteo/w2w/C7/ppnn_data/' # LMU
results_dir = '../results/'
window_size = 25 # Days in rolling window
fclt = 48 # Forecast lead time in hours
In [4]:
seq_len=5
In [73]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
seq_len=seq_len, fill_value=-999.)
In [74]:
train_set.features.shape, train_set.targets.shape
Out[74]:
The arrays have dimensions [sample, time step, feature]
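As a rough illustration of how such rolling sequences could be built from a flat [time, feature] series (the real construction happens inside get_train_test_sets; make_sequences and its padding scheme here are just an assumption for exposition):
import numpy as np

def make_sequences(features_2d, seq_len, fill_value=-999.):
    """Hypothetical sketch: build [sample, time step, feature] windows from a
    [time, feature] array, padding the first seq_len-1 windows with fill_value."""
    n_time, n_feat = features_2d.shape
    padded = np.full((n_time + seq_len - 1, n_feat), fill_value, dtype=features_2d.dtype)
    padded[seq_len - 1:] = features_2d
    return np.stack([padded[i:i + seq_len] for i in range(n_time)])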
In [75]:
batch_size = 1024
hidden_nodes = 100 # Number of hidden nodes inside RNN cell
In [76]:
inp = Input(shape=(seq_len, 2, )) # time step, feature
x = GRU(hidden_nodes)(inp)
x = Dense(2, activation='linear')(x)
rnn_model = Model(inputs=inp, outputs=x)
In [77]:
rnn_model.compile(optimizer=Adam(0.01), loss=crps_cost_function)
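A quick note on the loss (my reading of it; see losses.py for the actual implementation): the two linear outputs are interpreted as the mean $\mu$ and standard deviation $\sigma$ of a Gaussian predictive distribution, and the closed-form CRPS of a Gaussian is minimized,

$\mathrm{CRPS}(\mathcal{N}(\mu, \sigma^2), y) = \sigma \left[ z\,(2\Phi(z) - 1) + 2\varphi(z) - \tfrac{1}{\sqrt{\pi}} \right], \quad z = \tfrac{y - \mu}{\sigma},$

where $\Phi$ and $\varphi$ are the standard normal CDF and PDF.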
In [78]:
rnn_model.summary()
In [79]:
rnn_model.fit(train_set.features, train_set.targets[:,-1], epochs=10, batch_size=batch_size,
validation_data=(test_set.features, test_set.targets[:,-1]))
Out[79]:
So we get a better train score and a worse validation score. This indicates overfitting.
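One straightforward way to guard against this would be early stopping on the validation loss, using the already-imported EarlyStopping callback (just a sketch, not run here):
# Stop training once the validation CRPS has not improved for 2 epochs
rnn_model.fit(train_set.features, train_set.targets[:, -1], epochs=50, batch_size=batch_size,
              validation_data=(test_set.features, test_set.targets[:, -1]),
              callbacks=[EarlyStopping(monitor='val_loss', patience=2)])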
In [80]:
inp = Input(shape=(seq_len, 2, )) # time step, feature
x = GRU(hidden_nodes, return_sequences=True)(inp)
x = TimeDistributed(Dense(2, activation='linear'))(x)
seq_rnn_model = Model(inputs=inp, outputs=x)
In [81]:
seq_rnn_model.summary()
In [82]:
seq_rnn_model.compile(optimizer=Adam(0.01), loss=crps_cost_function_seq,
sample_weight_mode="temporal")
In [148]:
def train_and_valid(model, train_set, test_set, epochs, batch_size, verbose=0, emb=False):
"""Write our own function to train and validate,
because the keras fit function cannot handle sample weights for training
and validation at the same time.
"""
train_inp = [train_set.features, train_set.cont_ids] if emb else train_set.features
test_inp = [test_set.features, test_set.cont_ids] if emb else test_set.features
for i in range(epochs):
print('Epoch:', i+1)
t1 = timeit.default_timer()
h = model.fit(train_inp, train_set.targets, epochs=1, batch_size=batch_size,
sample_weight=train_set.sample_weights, verbose=verbose)
t2 = timeit.default_timer()
print('Train loss: %.4f - Valid loss: %.4f - Time: %.1fs' % (h.history['loss'][0],
model.evaluate(test_inp, test_set.targets, batch_size=10000,
sample_weight=test_set.sample_weights, verbose=verbose),
t2 - t1))
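The sample weights come from get_train_test_sets and go together with sample_weight_mode="temporal" at compile time, so Keras expects one weight per sample and time step. Presumably they simply zero out the padded steps so the sequence CRPS ignores the fill values; a hand-rolled equivalent might look like this (illustrative assumption, not the actual utils code):
# Weight 1 for real time steps, 0 for padded ones (fill_value=-999.)
manual_weights = (train_set.features[:, :, 0] != -999.).astype('float32')  # shape [sample, time step]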
In [88]:
train_and_valid(seq_rnn_model, train_set, test_set, 10, batch_size)
As with the first RNN above, we seem to overfit to the dataset, though perhaps not as strongly. Let's now try a more complex model with a longer sequence length.
In [112]:
seq_len = 20
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
seq_len=seq_len, fill_value=-999.)
In [90]:
hidden_nodes = 200
In [91]:
inp = Input(shape=(seq_len, 2, )) # time step, feature
x = GRU(hidden_nodes, return_sequences=True)(inp)
x = TimeDistributed(Dense(2, activation='linear'))(x)
seq_rnn_model = Model(inputs=inp, outputs=x)
In [92]:
seq_rnn_model.summary()
In [93]:
seq_rnn_model.compile(optimizer=Adam(0.01), loss=crps_cost_function_seq,
sample_weight_mode="temporal")
In [94]:
train_and_valid(seq_rnn_model, train_set, test_set, 10, batch_size)
So again we are overfitting, but maybe there is something to be learned. Let's first add some regularization and then try a longer training set.
In [95]:
inp = Input(shape=(seq_len, 2, )) # time step, feature
x = GRU(hidden_nodes, return_sequences=True, recurrent_dropout=0.5)(inp)
x = TimeDistributed(Dense(2, activation='linear'))(x)
seq_rnn_model = Model(inputs=inp, outputs=x)
seq_rnn_model.compile(optimizer=Adam(0.001), loss=crps_cost_function_seq,
sample_weight_mode="temporal")
In [97]:
train_and_valid(seq_rnn_model, train_set, test_set, 10, batch_size)
So with dropout we get slightly better validation results, but we still start to overfit. There is plenty of room for hyperparameter tuning here (network size, dropout rate, learning rate, and so on).
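A small grid search over the obvious knobs would be a natural next step; a rough sketch (arbitrary values, using the build_seq_rnn helper defined a few cells further down):
# Hypothetical tuning loop: compare a few hidden sizes and recurrent dropout rates
for hn in [50, 100, 200]:
    for dr in [0.2, 0.5]:
        print('hidden_nodes=%d, recurrent_dropout=%.1f' % (hn, dr))
        model = build_seq_rnn(hn, 2, dropout=dr, lr=0.001)
        train_and_valid(model, train_set, test_set, 5, batch_size)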
In [98]:
train_dates_long = ['2008-01-01', '2016-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates_long, test_dates)
In [99]:
# Copied from fc_network notebook (signature extended to match the calls further below)
def build_fc_model(n_features=2, n_outputs=2, compile=False, lr=0.1):
    # Linear network: the input features are mapped directly to the two distribution parameters
    inp = Input(shape=(n_features,))
    x = Dense(n_outputs, activation='linear')(inp)
    model = Model(inputs=inp, outputs=x)
    if compile:
        model.compile(optimizer=Adam(lr), loss=crps_cost_function)
    return model
In [100]:
fc_model = build_fc_model()
fc_model.compile(optimizer=Adam(0.1), loss=crps_cost_function)
In [101]:
fc_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[101]:
Maybe a small improvement. Now let's test our sequence model with a longer training period.
In [102]:
seq_len = 20
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates_long, test_dates,
seq_len=seq_len, fill_value=-999.)
In [107]:
def build_seq_rnn(hidden_nodes, n_features, dropout=0, lr=0.01):
inp = Input(shape=(seq_len, n_features, )) # time step, feature
x = GRU(hidden_nodes, return_sequences=True, recurrent_dropout=dropout)(inp)
x = TimeDistributed(Dense(2, activation='linear'))(x)
seq_rnn_model = Model(inputs=inp, outputs=x)
seq_rnn_model.compile(optimizer=Adam(lr), loss=crps_cost_function_seq,
sample_weight_mode="temporal")
return seq_rnn_model
In [104]:
inp = Input(shape=(seq_len, 2, )) # time step, feature
x = GRU(hidden_nodes, return_sequences=True, recurrent_dropout=0.5)(inp)
x = TimeDistributed(Dense(2, activation='linear'))(x)
seq_rnn_model = Model(inputs=inp, outputs=x)
seq_rnn_model.compile(optimizer=Adam(0.001), loss=crps_cost_function_seq,
sample_weight_mode="temporal")
In [106]:
# This takes several minutes on the GPU
# Epoch counter: 7
train_and_valid(seq_rnn_model, train_set, test_set, 2, batch_size)
In [5]:
from collections import OrderedDict
aux_dict = OrderedDict()
aux_dict['data_aux_geo_interpolated.nc'] = ['orog',
'station_alt',
'station_lat',
'station_lon']
aux_dict['data_aux_pl500_interpolated_00UTC.nc'] = ['u_pl500_fc',
'v_pl500_fc',
'gh_pl500_fc']
aux_dict['data_aux_pl850_interpolated_00UTC.nc'] = ['u_pl850_fc',
'v_pl850_fc',
'q_pl850_fc']
aux_dict['data_aux_surface_interpolated_00UTC.nc'] = ['cape_fc',
'sp_fc',
'tcc_fc']
In [157]:
# Start with just one training year
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
seq_len=seq_len, fill_value=-999., aux_dict=aux_dict)
In [158]:
train_set.cont_ids.shape
Out[158]:
In [111]:
n_features = train_set.features.shape[-1]
n_features
Out[111]:
In [114]:
seq_rnn_model = build_seq_rnn(hidden_nodes, n_features, dropout=0.5, lr=0.01)
In [121]:
# Epoch counter: 8
train_and_valid(seq_rnn_model, train_set, test_set, 2, batch_size, verbose=1)
In [131]:
seq_rnn_model.summary()
In [187]:
def build_seq_rnn_with_embeddings(seq_len, hidden_nodes, n_features, emb_size, max_id,
recurrent_dropout=0, dropout=0, lr=0.01):
features_inp = Input(shape=(seq_len, n_features, )) # time step, feature
id_in = Input(shape=(seq_len,))
emb = Embedding(max_id + 1, emb_size)(id_in)
x = GRU(hidden_nodes, return_sequences=True, recurrent_dropout=recurrent_dropout)(features_inp)
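    # Concatenate the learned station embedding (emb_size values per time step) onto the GRU output at every time step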
x = Concatenate()([x, emb])
x = Dropout(dropout)(x)
x = TimeDistributed(Dense(2, activation='linear'))(x)
model = Model(inputs=[features_inp, id_in], outputs=x)
model.compile(optimizer=Adam(lr), loss=crps_cost_function_seq,
sample_weight_mode="temporal")
return model
In [163]:
emb_size = 5
max_id = int(np.max([train_set.cont_ids.max(), test_set.cont_ids.max()]))
hidden_nodes, max_id
Out[163]:
In [164]:
emb_rnn = build_seq_rnn_with_embeddings(seq_len, hidden_nodes, n_features, emb_size, max_id, recurrent_dropout=0.5)
In [165]:
emb_rnn.summary()
In [167]:
train_and_valid(emb_rnn, train_set, test_set, 5, batch_size, emb=True)
In [182]:
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
seq_len=2, fill_value=-999.)
In [191]:
emb_rnn = build_seq_rnn_with_embeddings(2, 10, 2, emb_size, max_id, recurrent_dropout=0)
In [192]:
emb_rnn.summary()
In [198]:
train_and_valid(emb_rnn, train_set, test_set, 5, batch_size, emb=True)
In [200]:
train_dates_long = ['2008-01-01', '2016-01-01']
train_set_long, test_set = get_train_test_sets(DATA_DIR, train_dates_long, test_dates,
seq_len=seq_len, fill_value=-999.)
In [204]:
emb_rnn = build_seq_rnn_with_embeddings(seq_len, 30, 2, emb_size, max_id, recurrent_dropout=0.3)
In [205]:
train_and_valid(emb_rnn, train_set_long, test_set, 15, batch_size, emb=True)
In [31]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
add_current_error=True)
In [32]:
train_set.feature_names
Out[32]:
In [33]:
train_set.features.shape
Out[33]:
In [34]:
test_set.features[10000:11000, :]
Out[34]:
In [35]:
fc_model = build_fc_model(5, 2, compile=True)
In [36]:
# Note: I am running this cell several times
fc_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[36]:
Ok wow, that is pretty incredible. But wait, maybe this current-error information is very similar to what the station embeddings provide.
In [7]:
emb_size = 3
max_id = int(np.max([train_set.cont_ids.max(), test_set.cont_ids.max()]))
max_id
In [38]:
emb_model = build_emb_model(5, 2, [], emb_size, max_id, compile=True,
lr=0.01)
In [42]:
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets,
epochs=10, batch_size=1024,
validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
Out[42]:
Alright, I need to check whether I am cheating, but for now let's try to build the best model.
In [6]:
from collections import OrderedDict
more_aux_dict = OrderedDict()
more_aux_dict['data_aux_geo_interpolated.nc'] = ['orog',
'station_alt',
'station_lat',
'station_lon']
more_aux_dict['data_aux_pl500_interpolated_00UTC.nc'] = ['u_pl500_fc',
'v_pl500_fc',
'gh_pl500_fc']
more_aux_dict['data_aux_pl850_interpolated_00UTC.nc'] = ['u_pl850_fc',
'v_pl850_fc',
'q_pl850_fc']
more_aux_dict['data_aux_surface_interpolated_00UTC.nc'] = ['cape_fc',
'sp_fc',
'tcc_fc']
more_aux_dict['data_aux_surface_more_interpolated_part1_00UTC.nc'] = [
'sshf_fc', 'slhf_fc', 'u10_fc','v10_fc'
]
more_aux_dict['data_aux_surface_more_interpolated_part2_00UTC.nc'] = [
'ssr_fc', 'str_fc', 'd2m_fc','sm_fc'
]
In [5]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
more_train_set, more_test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
aux_dict=more_aux_dict, add_current_error=True)
In [8]:
emb_size = 3
max_id = int(np.max([more_train_set.cont_ids.max(), more_test_set.cont_ids.max()]))
max_id
Out[8]:
In [22]:
emb_model = build_emb_model(more_train_set.features.shape[1], 2, [50], 3, max_id,
compile=True, lr=0.01)
In [17]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(emb_model, show_shapes=True).create(prog='dot', format='svg'))
Out[17]:
In [23]:
emb_model.fit([more_train_set.features, more_train_set.cont_ids], more_train_set.targets, epochs=30,
batch_size=4096, validation_split=0.0)
#callbacks=[EarlyStopping(monitor='val_loss',
# min_delta=0,
# patience=3)])
Out[23]:
In [24]:
emb_model.evaluate([more_test_set.features, more_test_set.cont_ids], more_test_set.targets, batch_size=10000)
Out[24]:
In [25]:
emb_model.summary()
In [54]:
long_train_dates = ['2008-01-01', '2016-01-01']
In [55]:
long_more_train_set, more_test_set = get_train_test_sets(DATA_DIR, long_train_dates, test_dates,
aux_dict=more_aux_dict,
add_current_error=True)
In [56]:
emb_model = build_emb_model(long_more_train_set.features.shape[1], 2, [50], 3, max_id,
compile=True, lr=0.01)
In [58]:
emb_model.fit([long_more_train_set.features, long_more_train_set.cont_ids], long_more_train_set.targets, epochs=50,
batch_size=4096, validation_split=0.2,
callbacks=[EarlyStopping(monitor='val_loss',
min_delta=0,
patience=2)])
Out[58]:
In [59]:
emb_model.evaluate([more_test_set.features, more_test_set.cont_ids], more_test_set.targets, batch_size=10000)
Out[59]:
In [27]:
# Test current error
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
add_current_error=True,
current_error_len=1)
In [28]:
train_set.feature_names
Out[28]:
In [29]:
fc_model = build_fc_model(4, 2, compile=True)
In [30]:
fc_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[30]:
The current error is the important feature, but one of the other two is enough!
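To pin down which of these features actually carries the signal, a simple leave-one-out check could be run (hypothetical sketch, refitting the small linear model without one column at a time):
n_feat = train_set.features.shape[1]
for drop_col in range(n_feat):
    cols = [c for c in range(n_feat) if c != drop_col]
    m = build_fc_model(len(cols), 2, compile=True)
    m.fit(train_set.features[:, cols], train_set.targets, epochs=10, batch_size=1024, verbose=0)
    score = m.evaluate(test_set.features[:, cols], test_set.targets, batch_size=10000, verbose=0)
    print('without %s: test CRPS = %.3f' % (train_set.feature_names[drop_col], score))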
In [ ]: