Now that we have established that we can reproduce the EMOS results using a network architecture trained with SGD, we can extend this approach to fully connected networks.
Here we will try several approaches: a fully connected linear network, networks with one or more hidden layers, station embeddings, additional (auxiliary) predictor variables, more training years, and simple data augmentation.
In [2]:
# Imports
import sys
sys.path.append('../') # This is where all the python files are!
from importlib import reload
import emos_network_theano; reload(emos_network_theano)
from emos_network_theano import EMOS_Network
from losses import crps_cost_function
import utils; reload(utils)
from utils import *
import keras_models; reload(keras_models)
from keras_models import *
from collections import OrderedDict
In [3]:
# Basic setup
DATA_DIR = '/Volumes/STICK/data/ppnn_data/' # Mac
# DATA_DIR = '/project/meteo/w2w/C7/ppnn_data/' # LMU
results_dir = '../results/'
window_size = 25 # Days in rolling window
fclt = 48 # Forecast lead time in hours
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
In [4]:
date_str = '2011-02-14'
train_set, test_set = get_train_test_sets(DATA_DIR, predict_date=date_str,
fclt=fclt, window_size=window_size)
In [6]:
fc_model = build_fc_model(2, 2, compile=True)
In [7]:
fc_model.summary()
Now we have 6 parameters instead of the 4 of the standard EMOS network: the fully connected layer also includes the two cross connections between the ensemble mean/standard deviation inputs and the two distribution parameters.
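For orientation, `build_fc_model(2, 2)` is essentially a single linear layer mapping the two inputs (ensemble mean and standard deviation) to the two outputs (mean and standard deviation of the predictive distribution). The cell below is only a hypothetical sketch of such a model, not the actual implementation in `keras_models.py`; the function name and the SGD learning rate are assumptions.
In [ ]:
# Hypothetical sketch of what build_fc_model(2, 2) amounts to (the real
# implementation lives in keras_models.py). A single Dense layer mapping the
# two inputs to the two outputs has 2 * 2 weights + 2 biases = 6 parameters.
import keras
from keras.models import Sequential
from keras.layers import Dense
from losses import crps_cost_function

def sketch_fc_model(n_features=2, n_outputs=2, lr=0.1):
    model = Sequential([Dense(n_outputs, input_dim=n_features, activation='linear')])
    model.compile(keras.optimizers.SGD(lr), loss=crps_cost_function)
    return model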
In [8]:
# Define some parameters
early_stopping_delta = 1e-4 # Minimum CRPS improvement; training stops once improvements fall below this
steps_max = 1000 # How many steps to fit at max
batch_size = train_set.features.shape[0] # Use the full training set in each step (full-batch gradient descent)
In [9]:
fc_model.fit(train_set.features, train_set.targets, epochs=steps_max,
batch_size=batch_size,
validation_data=[test_set.features, test_set.targets],
verbose=0,
callbacks=[EarlyStopping(monitor='loss',
min_delta=early_stopping_delta,
patience=2)]);
In [10]:
# Get train and test CRPS
(fc_model.evaluate(train_set.features, train_set.targets, batch_size, verbose=0),
fc_model.evaluate(test_set.features, test_set.targets, batch_size, verbose=0))
Out[10]:
For this particular day we get a score that is slightly better than the standard EMOS network.
In [11]:
date_str_start = '2016-01-01'
date_str_stop = '2017-01-01'
In [12]:
fc_model = build_fc_model(2, 2, compile=True, optimizer='sgd')
In [15]:
# Use the loop function in utils
train_crps_list, valid_crps_list, results_df = loop_over_days(
DATA_DIR,
fc_model,
date_str_start, date_str_stop,
window_size=window_size,
fclt=fclt,
epochs_max=steps_max,
early_stopping_delta=early_stopping_delta,
lr=0.1,
verbose=0)
In [16]:
np.mean(train_crps_list), np.mean(valid_crps_list)
Out[16]:
So we get a slightly better training score and a slightly worse test score, a sign of mild overfitting, but the differences are small.
In [17]:
results_df.to_csv(results_dir + 'fc_network_rolling_window.csv')
In [4]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates)
In [5]:
train_set.features.shape
Out[5]:
In [6]:
fc_model = build_fc_model(2, 2, compile=True)
In [14]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(fc_model).create(prog='dot', format='svg'))
Out[14]:
In [25]:
fc_model.compile(keras.optimizers.Adam(0.001), loss=crps_cost_function)
In [7]:
# Note: I am running this cell several times (40)
fc_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[7]:
In [8]:
fc_model.evaluate(test_set.features, test_set.targets, 4096)
Out[8]:
Very similar to the standard EMOS network. This indicates that there is not much additional information in the two extra (cross) connections we added.
In [27]:
preds = fc_model.predict(test_set.features)
results_df = create_results_df(test_set.date_strs, test_set.station_ids,
preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'fc_network_train_2015_pred_2016.csv')
In [48]:
hidden_model = build_hidden_model(2, 2, hidden_nodes=10, compile=True)
In [49]:
hidden_model.summary()
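For reference, the hidden-layer variant simply inserts one or more nonlinear Dense layers before the linear output layer. The following is only a hypothetical sketch; the actual `build_hidden_model` in `keras_models.py` may differ in activation choice and other details.
In [ ]:
# Rough sketch of a hidden-layer variant; not the actual build_hidden_model.
import keras
from keras.models import Sequential
from keras.layers import Dense
from losses import crps_cost_function

def sketch_hidden_model(n_features=2, n_outputs=2, hidden_nodes=(10,), lr=0.01):
    model = Sequential()
    model.add(Dense(hidden_nodes[0], input_dim=n_features, activation='relu'))
    for n in hidden_nodes[1:]:
        model.add(Dense(n, activation='relu'))        # further nonlinear layers
    model.add(Dense(n_outputs, activation='linear'))  # output layer: mu and sigma
    model.compile(keras.optimizers.Adam(lr), loss=crps_cost_function)
    return model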
In [34]:
hidden_model.compile(keras.optimizers.Adam(0.0001), loss=crps_cost_function)
In [50]:
# We can use the same data from above!
# Note I am running this cell several times
hidden_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[50]:
Again, the results are pretty similar. This indicates that for the given data, the added nonlinearity is not important.
In [36]:
preds = hidden_model.predict(test_set.features)
results_df = create_results_df(test_set.date_strs, test_set.station_ids,
preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'hidden_nn_train_2015_pred_2016.csv')
In [54]:
hidden_model = build_hidden_model(2, 2, hidden_nodes=[100, 100, 100], compile=True)
In [59]:
hidden_model.summary()
In [56]:
hidden_model.compile(keras.optimizers.Adam(0.0001), loss=crps_cost_function)
In [58]:
hidden_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=4096,
validation_data=[test_set.features, test_set.targets])
Out[58]:
So we can see that even for a model with around 20,000 parameters the training score only goes down by a few percent. For a simple bias and spread correction, a linear model seems fully sufficient.
In [9]:
emb_size = 3
max_id = int(np.max([train_set.cont_ids.max(), test_set.cont_ids.max()]))
max_id
Out[9]:
In [87]:
emb_model = build_emb_model(2, 2, [], emb_size, max_id, compile=True,
lr=0.01)
In [88]:
emb_model.summary()
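For context, the station-embedding model learns a small latent vector for each station and concatenates it with the ensemble features before the output layer. The sketch below is a hypothetical re-implementation of that idea, not the actual `build_emb_model` from `keras_models.py`.
In [ ]:
# Hypothetical sketch of a station-embedding model; the real build_emb_model
# may differ in details (hidden layers, initialization, etc.).
import keras
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from losses import crps_cost_function

def sketch_emb_model(n_features, n_outputs, emb_size, max_id, lr=0.01):
    features_in = Input(shape=(n_features,))
    id_in = Input(shape=(1,))                                 # continuous station index
    emb = Flatten()(Embedding(max_id + 1, emb_size)(id_in))   # one latent vector per station
    x = Concatenate()([features_in, emb])                     # append embedding to the ensemble features
    outputs = Dense(n_outputs, activation='linear')(x)        # mu and sigma
    model = Model([features_in, id_in], outputs)
    model.compile(keras.optimizers.Adam(lr), loss=crps_cost_function)
    return model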
In [92]:
# Ran this for 40 epochs
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets,
epochs=10, batch_size=1024,
validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
Out[92]:
In [93]:
preds = emb_model.predict([test_set.features, test_set.cont_ids])
results_df = create_results_df(test_set.date_strs, test_set.station_ids,
preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'embedding_fc_train_2015_pred_2016.csv')
In [94]:
def build_and_run_emb_model(emb_size):
    emb_model = build_emb_model(2, 2, [], emb_size, max_id, compile=True)
    emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets,
                  epochs=40, batch_size=1024, verbose=0,
                  validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
    print(emb_model.evaluate([train_set.features, train_set.cont_ids], train_set.targets, verbose=0),
          emb_model.evaluate([test_set.features, test_set.cont_ids], test_set.targets, verbose=0))
In [95]:
for emb_size in [1, 2, 3, 5, 10, 20]:
    print(emb_size)
    build_and_run_emb_model(emb_size)
Note that there is some variability: in our first experiment above with an embedding size of 3 we got a better score than here. For this very simple network an embedding size of three seems sufficient.
In [6]:
# The prepare_data function takes an ordered dict as an input
aux_dict = OrderedDict()
aux_dict['data_aux_geo_interpolated.nc'] = ['orog',
'station_alt',
'station_lat',
'station_lon']
aux_dict['data_aux_pl500_interpolated_00UTC.nc'] = ['u_pl500_fc',
'v_pl500_fc',
'gh_pl500_fc']
aux_dict['data_aux_pl850_interpolated_00UTC.nc'] = ['u_pl850_fc',
'v_pl850_fc',
'q_pl850_fc']
aux_dict['data_aux_surface_interpolated_00UTC.nc'] = ['cape_fc',
'sp_fc',
'tcc_fc']
In [4]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
aux_dict=aux_dict)
In [5]:
train_set.features.shape
Out[5]:
In [109]:
fc_model = build_fc_model(train_set.features.shape[1], 2, compile=True,
lr=0.01)
In [115]:
# Note that I am running this cell multiple times
fc_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[115]:
In [ ]:
preds = fc_model.predict(test_set.features)
results_df = create_results_df(test_set.date_strs, test_set.station_ids,
preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'embedding_fc_train_2015_pred_2016.csv')
In [154]:
hidden_model = build_hidden_model(train_set.features.shape[1], 2,
hidden_nodes=[50], compile=True)
In [155]:
hidden_model.summary()
In [157]:
# Note that I am running this cell multiple times
hidden_model.fit(train_set.features, train_set.targets, epochs=10, batch_size=1024,
validation_data=[test_set.features, test_set.targets])
Out[157]:
So we see a definite improvement using auxiliary variables. Again, the hidden layer does not seem to improve things a lot compared to the simple linear model.
In [7]:
more_aux_dict = aux_dict  # Note: this is an alias, not a copy, so the additions below also extend aux_dict
In [8]:
more_aux_dict['data_aux_surface_more_interpolated_part1_00UTC.nc'] = [
'sshf_fc', 'slhf_fc', 'u10_fc','v10_fc'
]
more_aux_dict['data_aux_surface_more_interpolated_part2_00UTC.nc'] = [
'ssr_fc', 'str_fc', 'd2m_fc','sm_fc'
]
In [6]:
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
In [7]:
more_train_set, more_test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
aux_dict=more_aux_dict)
In [8]:
more_train_set.features.shape
Out[8]:
In [4]:
import pickle
In [10]:
%ls $DATA_DIR
In [11]:
with open(DATA_DIR + 'aux_15_16.pkl', 'wb') as f:
    pickle.dump((more_train_set, more_test_set), f)
In [10]:
def save_pickle(fn, train_dates=['2015-01-01', '2016-01-01'], add_current_error=False,
                current_error_len=1):
    sets = get_train_test_sets(
        DATA_DIR, train_dates, test_dates, aux_dict=more_aux_dict,
        add_current_error=add_current_error, current_error_len=current_error_len
    )
    with open(DATA_DIR + fn, 'wb') as f:
        pickle.dump(sets, f)
In [12]:
save_pickle('aux_15_16_current30.pkl', ['2015-01-01', '2016-01-01'],
add_current_error=True, current_error_len=30)
In [18]:
save_pickle('aux_14-15_16.pkl', ['2014-01-01', '2016-01-01'])
save_pickle('aux_13-15_16.pkl', ['2013-01-01', '2016-01-01'])
save_pickle('aux_10-15_16.pkl', ['2010-01-01', '2016-01-01'])
save_pickle('aux_08-15_16.pkl', ['2008-01-01', '2016-01-01'])
In [19]:
save_pickle('aux_12-15_16.pkl', ['2012-01-01', '2016-01-01'])
save_pickle('aux_11-15_16.pkl', ['2011-01-01', '2016-01-01'])
In [10]:
fc_model = build_fc_model(more_train_set.features.shape[1], 2, compile=True,
lr=0.01)
In [15]:
# Note that I am running this cell multiple times
fc_model.fit(more_train_set.features, more_train_set.targets, epochs=10, batch_size=1024,
validation_data=[more_test_set.features, more_test_set.targets])
Out[15]:
In [ ]:
Adding these extra variables gets us another percent or so.
In [10]:
emb_model = build_emb_model(train_set.features.shape[1], 2, [], 3, max_id,
compile=True, lr=0.01)
In [11]:
emb_model.summary()
In [15]:
# Lower the learning rate of the already compiled optimizer
from keras import backend as K
K.set_value(emb_model.optimizer.lr, 0.001)
In [17]:
# Again I am running this multiple times
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets, epochs=10,
batch_size=1024,
validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
Out[17]:
In [16]:
emb_model = build_emb_model(train_set.features.shape[1], 2, [50], 3, max_id,
compile=True, lr=0.01)
In [149]:
# Again I am running this multiple times
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets, epochs=10,
batch_size=4096,
validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
Out[149]:
This is our best score so far. Here the nonlinearity does seem to make a difference compared to the simple linear model, but we also get some overfitting. (Using fewer or more hidden nodes does not seem to change things much.) Let's try out some techniques to address this.
Now we need to be very careful: so far I have been stopping when the validation score, which is computed on the test set, stops decreasing. THIS IS CHEATING!
Let's instead do the train/validation split within the training set only, and see if we can still find a good early stopping point.
In [41]:
max_id = int(np.max([train_set.cont_ids.max(), test_set.cont_ids.max()]))
In [35]:
emb_model = build_emb_model(train_set.features.shape[1], 2, [50], 3, max_id,
compile=True, lr=0.01)
In [36]:
# Again I am running this multiple times
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets, epochs=50,
batch_size=4096, validation_split=0.2,
callbacks=[EarlyStopping(monitor='val_loss',
min_delta=0,
patience=2)])
Out[36]:
In [37]:
emb_model.evaluate([test_set.features, test_set.cont_ids], test_set.targets, batch_size=10000)
Out[37]:
In [38]:
preds = emb_model.predict([test_set.features, test_set.cont_ids])
results_df = create_results_df(test_set.date_strs, test_set.station_ids,
preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'embedding_nn_aux_train_2015_pred_2016.csv')
In [40]:
long_train_dates = ['2008-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
long_train_set, test_set = get_train_test_sets(DATA_DIR, long_train_dates, test_dates,
aux_dict=aux_dict)
In [15]:
emb_model = build_emb_model(long_train_set.features.shape[1], 2, [50], 3, max_id,
compile=True, lr=0.01)
In [43]:
# Again I am running this multiple times
emb_model.fit([long_train_set.features, long_train_set.cont_ids], long_train_set.targets,
epochs=50,
batch_size=4096, validation_split=0.2,
callbacks=[EarlyStopping(monitor='val_loss',
min_delta=0,
patience=5)]);
Out[43]:
In [44]:
emb_model.evaluate([test_set.features, test_set.cont_ids], test_set.targets, batch_size=10000)
Out[44]:
In [45]:
preds = emb_model.predict([test_set.features, test_set.cont_ids])
results_df = create_results_df(test_set.date_strs, test_set.station_ids,
preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'embedding_nn_aux_train_2008-2015_pred_2016.csv')
In [46]:
long_more_train_set, more_test_set = get_train_test_sets(DATA_DIR, long_train_dates, test_dates,
aux_dict=more_aux_dict)
In [51]:
emb_model = build_emb_model(long_more_train_set.features.shape[1], 2, [100], 3, max_id,
compile=True, lr=0.01)
In [52]:
# Again I am running this multiple times
emb_model.fit([long_more_train_set.features, long_more_train_set.cont_ids],
long_more_train_set.targets,
epochs=50, batch_size=4096, validation_split=0.2,
callbacks=[EarlyStopping(monitor='val_loss',
min_delta=0,
patience=5)])
Out[52]:
In [53]:
emb_model.evaluate([more_test_set.features, more_test_set.cont_ids], more_test_set.targets,
batch_size=10000)
Out[53]:
In [54]:
preds = emb_model.predict([more_test_set.features, more_test_set.cont_ids])
results_df = create_results_df(more_test_set.date_strs, more_test_set.station_ids,
                               preds[:, 0], preds[:, 1])
results_df.to_csv(results_dir + 'embedding_nn_more_aux_train_2008-2015_pred_2016.csv')
In [24]:
with open(DATA_DIR + 'aux_15_16.pkl', 'rb') as f:
    train_set, test_set = pickle.load(f)
In [25]:
train_set.features.shape
Out[25]:
In [30]:
train_set.features.std(axis=0)
Out[30]:
In [31]:
train_set.features.mean(axis=0)[1::2]
Out[31]:
In [32]:
train_set.features.std(axis=0)[::2]
Out[32]:
In [34]:
# Per-feature scales for the augmentation noise below:
# the column means for every second feature and the column standard deviations for the others
scales = np.zeros(train_set.features.shape[1])
scales[1::2] = train_set.features.mean(axis=0)[1::2]
scales[::2] = train_set.features.std(axis=0)[::2]
In [55]:
# Augmented features: add Gaussian noise scaled to 5% of the per-feature scales
features_aug = train_set.features + np.random.normal(size=train_set.features.shape) * scales * 0.05
In [56]:
# Augmented targets: add Gaussian noise with standard deviation 0.1
targets_aug = train_set.targets + np.random.normal(scale=0.1, size=train_set.targets.shape)
In [57]:
train_set.targets[:5], targets_aug[:5]
Out[57]:
In [40]:
np.concatenate([train_set.features, features_aug], axis=0).shape
Out[40]:
In [58]:
emb_model = build_emb_model(train_set.features.shape[1], 2, [100], 3, max_id,
compile=True, lr=0.01)
In [45]:
emb_model.fit([train_set.features, train_set.cont_ids], train_set.targets,
epochs=50, batch_size=4096,
validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
Out[45]:
In [59]:
# Train on the original plus the augmented data (station ids are simply duplicated)
emb_model.fit([
    np.concatenate([train_set.features, features_aug], axis=0),
    np.concatenate([train_set.cont_ids, train_set.cont_ids])
], np.concatenate([train_set.targets, targets_aug]),
    epochs=50, batch_size=4096,
    validation_data=[[test_set.features, test_set.cont_ids], test_set.targets])
Out[59]:
In [69]:
# Test current error
train_dates = ['2015-01-01', '2016-01-01']
test_dates = ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates,
add_current_error=True,
current_error_len=3)
In [70]:
train_set.features.shape
Out[70]:
In [71]:
train_set.feature_names
Out[71]: