Categorical/binned output

Here we will use categorical output instead of the mean and standard deviation.


In [34]:
# Imports
from importlib import reload
import utils; reload(utils)
from utils import *
import losses; reload(losses)
from losses import approx_crps_cat, approx_crps_cat, crps_cost_function
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import OrderedDict

import keras
from keras.layers import Input, Dense, merge, Embedding, Flatten, Dropout
from keras.layers.merge import Concatenate
from keras.models import Model
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, Adam
from scipy.stats import binned_statistic


Anaconda environment: py36_gpu
Linux 4.4.0-101-generic

In [2]:
# Setup
# DATA_DIR = '/Volumes/STICK/data/ppnn_data/'  # Mac
DATA_DIR = '/project/meteo/w2w/C7/ppnn_data/'   # LMU
fclt = 48

Convert targets to categorical

We will one-hot-encode the target array


In [3]:
train_dates = ['2010-01-01', '2016-01-01']
test_dates =  ['2016-01-01', '2017-01-01']
train_set, test_set = get_train_test_sets(DATA_DIR, train_dates, test_dates)


train set contains 2191 days
test set contains 366 days

In [4]:
# Define the bins
bin_width = 0.5
bin_edges = np.arange(-30, 35 + bin_width, bin_width)
bin_edges.shape


Out[4]:
(131,)

Assign observations to bins, part 1: Always to the left


In [5]:
def convert_targets(targets, bin_edges):
    bin_idxs = binned_statistic(targets, targets, bins=bin_edges)[-1]
    return keras.utils.to_categorical(bin_idxs, num_classes=bin_edges.shape[0]-1)

In [6]:
train_cat_targets = convert_targets(train_set.targets, bin_edges)
test_cat_targets = convert_targets(test_set.targets, bin_edges)

In [7]:
print('Obs:', test_set.targets[0])
plt.bar(bin_edges[:-1] + 0.25, test_cat_targets[0], width=0.5, zorder=0.1)
plt.axvline(test_set.targets[0], c='r')
plt.show()


Obs: 4.3

In [8]:
def build_cat_model(n_features, hidden_nodes, n_bins):
    inp = Input(shape=(n_features,))
    x = Dense(hidden_nodes[0], activation='relu')(inp)
    if len(hidden_nodes) > 1:
        for h in hidden_nodes[1:]:
            x = Dense(h, activation='relu')(x)
    x = Dense(n_bins, activation='softmax')(x)
    return Model(inputs=inp, outputs=x)

In [18]:
model = build_cat_model(2, [50], bin_edges.shape[0]-1)

In [19]:
model.compile(optimizer=Adam(0.01), loss=approx_crps_cat(bin_width))

In [20]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_2 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                150       
_________________________________________________________________
dense_4 (Dense)              (None, 130)               6630      
=================================================================
Total params: 6,780
Trainable params: 6,780
Non-trainable params: 0
_________________________________________________________________

In [21]:
model.fit(train_set.features, train_cat_targets, epochs=5, batch_size=4096,
          validation_data=[test_set.features, test_cat_targets])


Train on 1092828 samples, validate on 182218 samples
Epoch 1/5
1092828/1092828 [==============================] - 6s - loss: 1.5224 - val_loss: 1.0572
Epoch 2/5
1092828/1092828 [==============================] - 5s - loss: 1.0797 - val_loss: 1.0367
Epoch 3/5
1092828/1092828 [==============================] - 5s - loss: 1.0715 - val_loss: 1.0295
Epoch 4/5
1092828/1092828 [==============================] - 5s - loss: 1.0703 - val_loss: 1.0290
Epoch 5/5
1092828/1092828 [==============================] - 5s - loss: 1.0700 - val_loss: 1.0289
Out[21]:
<keras.callbacks.History at 0x7f9540e24748>

In [24]:
preds = model.predict(test_set.features, batch_size=4096)

In [35]:
# Get correct CRPS
maybe_correct_cat_crps(preds, test_set.targets, bin_edges)


Out[35]:
1.0445440088439826

In [26]:
i = 10
plt.bar(bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
plt.bar(bin_edges[:-1] + 0.25, test_cat_targets[i], width=0.5, zorder=0.1)
plt.axvline(test_set.targets[i], c='r')
plt.show()


Assign observations to bins, part 2: Nearest edge


In [36]:
def convert_targets_nearest(targets, bin_edges):
    mat_bin_edges = np.repeat(np.atleast_2d(bin_edges), targets.shape[0], axis=0).T
    bin_idxs = np.argmin(np.abs(mat_bin_edges - targets),axis=0)
    return keras.utils.to_categorical(bin_idxs, num_classes=bin_edges.shape[0]-1)

In [37]:
train_cat_targets_nearest = convert_targets_nearest(train_set.targets, bin_edges)
test_cat_targets_nearest = convert_targets_nearest(test_set.targets, bin_edges)

In [38]:
model = build_cat_model(2, [50], bin_edges.shape[0]-1)

In [39]:
model.compile(optimizer=Adam(0.01), loss=approx_crps_cat(bin_width))

In [40]:
model.fit(train_set.features, train_cat_targets_nearest, epochs=5, batch_size=1024,
          validation_data=[test_set.features, test_cat_targets_nearest])


Train on 1092828 samples, validate on 182218 samples
Epoch 1/5
1092828/1092828 [==============================] - 10s - loss: 1.2403 - val_loss: 1.0672
Epoch 2/5
1092828/1092828 [==============================] - 10s - loss: 1.1153 - val_loss: 1.0643
Epoch 3/5
1092828/1092828 [==============================] - 10s - loss: 1.1146 - val_loss: 1.0724
Epoch 4/5
1092828/1092828 [==============================] - 9s - loss: 1.1139 - val_loss: 1.0678
Epoch 5/5
1092828/1092828 [==============================] - 9s - loss: 1.1118 - val_loss: 1.0646
Out[40]:
<keras.callbacks.History at 0x7f93fd5dcb00>

In [43]:
model.optimizer.lr = 1e-3

In [44]:
model.fit(train_set.features, train_cat_targets_nearest, epochs=5, batch_size=1024,
          validation_data=[test_set.features, test_cat_targets_nearest])


Train on 1092828 samples, validate on 182218 samples
Epoch 1/5
1092828/1092828 [==============================] - 10s - loss: 1.1100 - val_loss: 1.0639
Epoch 2/5
1092828/1092828 [==============================] - 10s - loss: 1.1096 - val_loss: 1.0639
Epoch 3/5
1092828/1092828 [==============================] - 10s - loss: 1.1080 - val_loss: 1.0597
Epoch 4/5
1092828/1092828 [==============================] - 10s - loss: 1.1074 - val_loss: 1.0606
Epoch 5/5
1092828/1092828 [==============================] - 10s - loss: 1.1057 - val_loss: 1.0537
Out[44]:
<keras.callbacks.History at 0x7f94232abf60>

In [45]:
preds = model.predict(test_set.features)

In [46]:
maybe_correct_cat_crps(preds, test_set.targets, bin_edges)


Out[46]:
1.060824475038503

In [103]:
for i in np.random.randint(0, 182218, 10):
    plt.bar(bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
    plt.bar(bin_edges[:-1] + 0.25, test_cat_targets_nearest[i], width=0.5, zorder=0.1)
    plt.axvline(test_set.targets[i], c='r')
    plt.axvline(test_set.features[i, 0] * test_set.scale_factors[0], c='g')
    plt.show()



In [101]:
plt.hist(test_set.targets, bins = 50);


Are we overfitting?


In [43]:
model = build_cat_model(2, [50], bin_edges.shape[0]-1)
model.compile(optimizer=Adam(0.01), loss=approx_crps_cat(bin_width))

In [45]:
model.fit(train_set.features, train_cat_targets_nearest, epochs=5, batch_size=1024,
              validation_split=0.2)


Train on 144679 samples, validate on 36170 samples
Epoch 1/5
144679/144679 [==============================] - 3s - loss: 1.1115 - val_loss: 1.1108
Epoch 2/5
144679/144679 [==============================] - 3s - loss: 1.1097 - val_loss: 1.1187
Epoch 3/5
144679/144679 [==============================] - 3s - loss: 1.1092 - val_loss: 1.1148
Epoch 4/5
144679/144679 [==============================] - 3s - loss: 1.1105 - val_loss: 1.1353
Epoch 5/5
144679/144679 [==============================] - 4s - loss: 1.1095 - val_loss: 1.1421
Out[45]:
<keras.callbacks.History at 0x12357aeb8>

Only at the end, try a more complex model.


In [49]:
model = build_cat_model(2, [500, 500], bin_edges.shape[0]-1)
model.compile(optimizer=Adam(0.01), loss=approx_crps_cat(bin_width))

In [50]:
model.fit(train_set.features, train_cat_targets_nearest, epochs=5, batch_size=1024,
              validation_split=0.2)


Train on 144679 samples, validate on 36170 samples
Epoch 1/5
144679/144679 [==============================] - 16s - loss: 1.3433 - val_loss: 1.3339
Epoch 2/5
144679/144679 [==============================] - 15s - loss: 1.1372 - val_loss: 1.1312
Epoch 3/5
144679/144679 [==============================] - 15s - loss: 1.1271 - val_loss: 1.0937
Epoch 4/5
144679/144679 [==============================] - 15s - loss: 1.1054 - val_loss: 1.1512
Epoch 5/5
144679/144679 [==============================] - 17s - loss: 1.0943 - val_loss: 1.1259
Out[50]:
<keras.callbacks.History at 0x109a345c0>

Bin size tuning


In [241]:
wide_bin_width = 1.5
wide_bin_edges = np.arange(-30, 35 + wide_bin_width, wide_bin_width)

In [242]:
wide_train_cat_targets = convert_targets_nearest(train_set.targets, wide_bin_edges)
wide_test_cat_targets = convert_targets_nearest(test_set.targets, wide_bin_edges)

In [243]:
model = build_cat_model(2, [50], wide_bin_edges.shape[0]-1)

In [244]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_38 (InputLayer)        (None, 2)                 0         
_________________________________________________________________
dense_61 (Dense)             (None, 50)                150       
_________________________________________________________________
dense_62 (Dense)             (None, 44)                2244      
=================================================================
Total params: 2,394
Trainable params: 2,394
Non-trainable params: 0
_________________________________________________________________

In [253]:
model.compile(optimizer=Adam(0.0001), loss=approx_crps)

In [254]:
model.fit(train_set.features, wide_train_cat_targets, epochs=5, batch_size=1024,
          validation_data=[test_set.features, wide_test_cat_targets])


Train on 180849 samples, validate on 182218 samples
Epoch 1/5
180849/180849 [==============================] - 2s - loss: 0.4007 - val_loss: 0.3866
Epoch 2/5
180849/180849 [==============================] - 2s - loss: 0.4000 - val_loss: 0.3857
Epoch 3/5
180849/180849 [==============================] - 2s - loss: 0.3994 - val_loss: 0.3850
Epoch 4/5
180849/180849 [==============================] - 2s - loss: 0.3988 - val_loss: 0.3844
Epoch 5/5
180849/180849 [==============================] - 2s - loss: 0.3983 - val_loss: 0.3841
Out[254]:
<keras.callbacks.History at 0x7fa46e695b00>

In [255]:
preds = model.predict(test_set.features)

In [69]:
i = 100447
plt.bar(wide_bin_edges[:-1] + 0.75, preds[i], width=1.5, alpha=0.8)
plt.bar(wide_bin_edges[:-1] + 0.75, wide_test_cat_targets[i], width=1.5, zorder=0.1)
plt.axvline(test_set.targets[i], c='r')
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-69-655979f123d5> in <module>()
      1 i = 100447
----> 2 plt.bar(wide_bin_edges[:-1] + 0.75, preds[i], width=1.5, alpha=0.8)
      3 plt.bar(wide_bin_edges[:-1] + 0.75, wide_test_cat_targets[i], width=1.5, zorder=0.1)
      4 plt.axvline(test_set.targets[i], c='r')
      5 plt.show()

NameError: name 'wide_bin_edges' is not defined

In [257]:
maybe_correct_cat_crps(preds, test_set.targets, wide_bin_edges)


Out[257]:
1.2046557611208457

How complex does a network have to be to produce a clean PDF


In [54]:
preds[:, 1].mean()


Out[54]:
2.2552676e-08

In [57]:
# Create dummy dataset of means and stds
dummy_means = np.random.rand(100000) * 40. + 5
dummy_std = np.random.rand(100000) * 3.

In [58]:
# Create the corresponding PDFs in bins
dummy_bin_edges = np.arange(0, 50 + bin_width, bin_width)

In [ ]:

Convert to relative bins


In [114]:
mean_error = test_set.features[:, 0] * test_set.scale_factors[0] - test_set.targets

In [115]:
mean_error.min(), mean_error.max()


Out[115]:
(-13.662431, 19.809189)

In [116]:
mean_error.argmin()


Out[116]:
162412

In [117]:
np.mean(np.abs(mean_error))


Out[117]:
1.4406393

In [118]:
train_conv_targets = train_set.targets - train_set.features[:, 0] * train_set.scale_factors[0]
test_conv_targets = test_set.targets - test_set.features[:, 0] * test_set.scale_factors[0]

In [120]:
train_conv_targets.min(), train_conv_targets.max()


Out[120]:
(-13.174971, 16.22398)

In [121]:
conv_bin_edges = np.arange(-15, 20 + bin_width, bin_width)

In [122]:
conv_train_cat_targets_nearest = convert_targets_nearest(train_conv_targets, conv_bin_edges)
conv_test_cat_targets_nearest = convert_targets_nearest(test_conv_targets, conv_bin_edges)

In [123]:
model = build_cat_model(2, [50], conv_bin_edges.shape[0]-1)
model.compile(optimizer=Adam(0.01), loss=approx_crps_cat(bin_width))

In [124]:
model.fit(train_set.features, conv_train_cat_targets_nearest, epochs=5, batch_size=1024,
          validation_data=[test_set.features, conv_test_cat_targets_nearest])


Train on 1092828 samples, validate on 182218 samples
Epoch 1/5
1092828/1092828 [==============================] - 17s - loss: 1.0699 - val_loss: 1.0088
Epoch 2/5
1092828/1092828 [==============================] - 16s - loss: 1.0445 - val_loss: 1.0160
Epoch 3/5
1092828/1092828 [==============================] - 16s - loss: 1.0440 - val_loss: 1.0072
Epoch 4/5
1092828/1092828 [==============================] - 16s - loss: 1.0436 - val_loss: 1.0215
Epoch 5/5
1092828/1092828 [==============================] - 16s - loss: 1.0434 - val_loss: 1.0186
Out[124]:
<keras.callbacks.History at 0x1268baac8>

In [125]:
preds = model.predict(test_set.features)

In [126]:
maybe_correct_cat_crps(preds, test_conv_targets, conv_bin_edges)


Out[126]:
1.0511612930624885

In [128]:
for i in np.random.randint(0, 182218, 10):
    print(i)
    print(test_set.targets[i], test_set.features[i, 0] * test_set.scale_factors[0])
    plt.bar(conv_bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
    plt.bar(conv_bin_edges[:-1] + 0.25, conv_test_cat_targets_nearest[i], width=0.5, zorder=0.1)
    plt.axvline(test_conv_targets[i], c='r')
    plt.show()


69598
12.1 10.2448
26259
6.9 6.52055
127710
20.8 20.3243
38035
-3.2 -0.0233978
143127
2.0 6.13449
32744
0.8 1.60835
55777
8.4 7.39353
171743
4.6 0.255672
134850
9.2 10.5802
179564
7.9 7.09409

Full ensemble features


In [62]:
train_set_full_ens, test_set_full_ens = get_train_test_sets(DATA_DIR, train_dates, test_dates,
                                                            full_ensemble_t=True)


train set contains 365 days
test set contains 366 days

In [64]:
train_set_full_ens.features.shape


Out[64]:
(180849, 50)

In [181]:
model = build_cat_model(50, [50], bin_edges.shape[0]-1)

In [182]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_31 (InputLayer)        (None, 50)                0         
_________________________________________________________________
dense_49 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_50 (Dense)             (None, 130)               6630      
=================================================================
Total params: 9,180
Trainable params: 9,180
Non-trainable params: 0
_________________________________________________________________

In [185]:
model.compile(optimizer=Adam(0.0001), loss=approx_crps)

In [186]:
model.fit(train_set_full_ens.features, train_cat_targets_nearest, epochs=10, batch_size=1024,
          validation_data=[test_set_full_ens.features, test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/10
180849/180849 [==============================] - 3s - loss: 1.1662 - val_loss: 1.1399
Epoch 2/10
180849/180849 [==============================] - 3s - loss: 1.1656 - val_loss: 1.1393
Epoch 3/10
180849/180849 [==============================] - 3s - loss: 1.1651 - val_loss: 1.1394
Epoch 4/10
180849/180849 [==============================] - 3s - loss: 1.1646 - val_loss: 1.1383
Epoch 5/10
180849/180849 [==============================] - 3s - loss: 1.1641 - val_loss: 1.1380
Epoch 6/10
180849/180849 [==============================] - 3s - loss: 1.1637 - val_loss: 1.1378
Epoch 7/10
180849/180849 [==============================] - 3s - loss: 1.1632 - val_loss: 1.1371
Epoch 8/10
180849/180849 [==============================] - 3s - loss: 1.1628 - val_loss: 1.1367
Epoch 9/10
180849/180849 [==============================] - 3s - loss: 1.1625 - val_loss: 1.1362
Epoch 10/10
180849/180849 [==============================] - 3s - loss: 1.1621 - val_loss: 1.1369
Out[186]:
<keras.callbacks.History at 0x7fa48caabef0>

In [187]:
preds = model.predict(test_set_full_ens.features)

In [188]:
maybe_correct_cat_crps(preds, test_set.targets, bin_edges)


Out[188]:
1.1306173910191841

In [189]:
i = 10
plt.bar(bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
plt.bar(bin_edges[:-1] + 0.25, test_cat_targets[i], width=0.5, zorder=0.1)
plt.axvline(test_set.targets[i], c='r')
plt.show()


With convolution


In [190]:
def build_cat_conv_model(hidden_nodes, feature_size, n_bins, maps):
    features_in = Input(shape=(feature_size, 1))
    conv = Conv1D(maps, feature_size)(features_in)
    conv = Flatten()(conv)
    x = Dense(hidden_nodes, activation='relu')(conv)
    x = Dense(n_bins, activation='softmax')(x)
    model = Model(inputs=features_in, outputs=x)
    return model

In [200]:
model = build_cat_conv_model(50, 50, bin_edges.shape[0]-1, 10)

In [201]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_34 (InputLayer)        (None, 50, 1)             0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 1, 10)             510       
_________________________________________________________________
flatten_13 (Flatten)         (None, 10)                0         
_________________________________________________________________
dense_55 (Dense)             (None, 50)                550       
_________________________________________________________________
dense_56 (Dense)             (None, 130)               6630      
=================================================================
Total params: 7,690
Trainable params: 7,690
Non-trainable params: 0
_________________________________________________________________

In [204]:
model.compile(optimizer=Adam(0.0001), loss=approx_crps)

In [205]:
model.fit(np.atleast_3d(train_set_full_ens.features), train_cat_targets_nearest, 
          epochs=10, batch_size=1024,
          validation_data=[np.atleast_3d(test_set_full_ens.features), test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/10
180849/180849 [==============================] - 4s - loss: 1.1959 - val_loss: 1.1233
Epoch 2/10
180849/180849 [==============================] - 3s - loss: 1.1956 - val_loss: 1.1215
Epoch 3/10
180849/180849 [==============================] - 3s - loss: 1.1956 - val_loss: 1.1217
Epoch 4/10
180849/180849 [==============================] - 3s - loss: 1.1956 - val_loss: 1.1225
Epoch 5/10
180849/180849 [==============================] - 3s - loss: 1.1934 - val_loss: 1.1185
Epoch 6/10
180849/180849 [==============================] - 3s - loss: 1.1890 - val_loss: 1.1168
Epoch 7/10
180849/180849 [==============================] - 3s - loss: 1.1887 - val_loss: 1.1178
Epoch 8/10
180849/180849 [==============================] - 3s - loss: 1.1885 - val_loss: 1.1171
Epoch 9/10
180849/180849 [==============================] - 3s - loss: 1.1884 - val_loss: 1.1167
Epoch 10/10
180849/180849 [==============================] - 3s - loss: 1.1883 - val_loss: 1.1171
Out[205]:
<keras.callbacks.History at 0x7fa47cdf3ac8>

In [ ]:

Full ensemble with mean and std output


In [113]:
def build_hidden_model(hidden_nodes, feature_size=2):
    inp = Input(shape=(feature_size,))
    x = Dense(hidden_nodes, activation='relu')(inp)
    x = Dense(2, activation='linear')(x)
    return Model(inputs=inp, outputs=x)

In [114]:
model = build_hidden_model(50, 50)
model.compile(optimizer=Adam(0.01), loss=crps_cost_function)

In [115]:
model.fit(train_set_full_ens.features, train_set_full_ens.targets, epochs=5, 
                 batch_size=1024,
                 validation_data=[test_set_full_ens.features, test_set_full_ens.targets])


Train on 180849 samples, validate on 182218 samples
Epoch 1/5
180849/180849 [==============================] - 1s - loss: 1.4411 - val_loss: 1.0460
Epoch 2/5
180849/180849 [==============================] - 1s - loss: 1.0766 - val_loss: 1.0239
Epoch 3/5
180849/180849 [==============================] - 1s - loss: 1.0731 - val_loss: 1.0242
Epoch 4/5
180849/180849 [==============================] - 1s - loss: 1.0711 - val_loss: 1.0422
Epoch 5/5
180849/180849 [==============================] - 1s - loss: 1.0709 - val_loss: 1.0318
Out[115]:
<keras.callbacks.History at 0x7fa4a4530978>

In [116]:
preds = model.predict(test_set_full_ens.features)

In [118]:
preds[10], test_set_full_ens.targets[10]


Out[118]:
(array([ 2.14922285, -1.70540333], dtype=float32), -0.60000002)

With station embeddings


In [129]:
def build_cat_emb_model(hidden_nodes, emb_size, max_id, feature_size, n_bins):
    features_in = Input(shape=(feature_size,))
    id_in = Input(shape=(1,))
    emb = Embedding(max_id + 1, emb_size)(id_in)
    emb = Flatten()(emb)
    x = Concatenate()([features_in, emb])
    x = Dense(hidden_nodes, activation='relu')(x)
    x = Dense(n_bins, activation='softmax')(x)
    model = Model(inputs=[features_in, id_in], outputs=x)
    return model

In [130]:
emb_size = 5
max_id = int(np.max([train_set.cont_ids.max(), test_set.cont_ids.max()]))
max_id


Out[130]:
536

With relative bins


In [131]:
model = build_cat_emb_model(100, emb_size, max_id, 2, conv_bin_edges.shape[0]-1)

In [132]:
model.compile(optimizer=Adam(0.001), loss=approx_crps_cat(bin_width))

In [133]:
model.fit([train_set.features, train_set.cont_ids], conv_train_cat_targets_nearest, 
              epochs=5, batch_size=1024, 
              validation_data=[[test_set.features, test_set.cont_ids], conv_test_cat_targets_nearest])


Train on 1092828 samples, validate on 182218 samples
Epoch 1/5
1092828/1092828 [==============================] - 20s - loss: 1.0948 - val_loss: 0.9109
Epoch 2/5
1092828/1092828 [==============================] - 19s - loss: 0.9418 - val_loss: 0.9069
Epoch 3/5
1092828/1092828 [==============================] - 19s - loss: 0.9391 - val_loss: 0.9057
Epoch 4/5
1092828/1092828 [==============================] - 19s - loss: 0.9374 - val_loss: 0.9071
Epoch 5/5
1092828/1092828 [==============================] - 19s - loss: 0.9358 - val_loss: 0.9071
Out[133]:
<keras.callbacks.History at 0x1225c6f28>

In [134]:
preds = model.predict([test_set.features, test_set.cont_ids])

In [135]:
maybe_correct_cat_crps(preds, test_conv_targets, conv_bin_edges)


Out[135]:
0.93468747827570597

In [136]:
for i in np.random.randint(0, 182218, 10):
    print(i)
    print(test_set.targets[i], test_set.features[i, 0] * test_set.scale_factors[0])
    plt.bar(conv_bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
    plt.bar(conv_bin_edges[:-1] + 0.25, conv_test_cat_targets_nearest[i], width=0.5, zorder=0.1)
    plt.axvline(test_conv_targets[i], c='r')
    plt.show()


22414
2.0 1.32883
12547
1.3 -0.955767
18650
9.9 8.75629
14403
1.6 1.57018
29467
-1.0 0.0790331
159501
3.5 3.59646
33520
-2.8 -2.84231
14755
1.3 2.12982
139630
8.5 10.2912
39821
3.0 1.32466

With full ensemble input


In [137]:
train_set_full_ens, test_set_full_ens = get_train_test_sets(DATA_DIR, train_dates, test_dates,
                                                            full_ensemble_t=True)


train set contains 2191 days
test set contains 366 days

In [138]:
model = build_cat_emb_model(100, emb_size, max_id, 50, conv_bin_edges.shape[0]-1)

In [139]:
model.compile(optimizer=Adam(0.001), loss=approx_crps_cat(bin_width))

In [141]:
model.fit([train_set_full_ens.features, train_set_full_ens.cont_ids], conv_train_cat_targets_nearest, 
              epochs=3, batch_size=1024, 
              validation_data=[[test_set_full_ens.features, test_set_full_ens.cont_ids], 
                               conv_test_cat_targets_nearest])


Train on 1092828 samples, validate on 182218 samples
Epoch 1/3
1092828/1092828 [==============================] - 21s - loss: 1.0665 - val_loss: 0.9268
Epoch 2/3
1092828/1092828 [==============================] - 21s - loss: 0.9519 - val_loss: 0.9114
Epoch 3/3
1092828/1092828 [==============================] - 22s - loss: 0.9474 - val_loss: 0.9115
Out[141]:
<keras.callbacks.History at 0x127051b38>

In [142]:
preds = model.predict([test_set_full_ens.features, test_set_full_ens.cont_ids])

In [143]:
maybe_correct_cat_crps(preds, test_conv_targets, conv_bin_edges)


Out[143]:
0.93664919338451591

In [144]:
for i in np.random.randint(0, 182218, 10):
    plt.bar(conv_bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
    plt.bar(conv_bin_edges[:-1] + 0.25, conv_test_cat_targets_nearest[i], width=0.5, zorder=0.1)
    plt.axvline(test_conv_targets[i], c='r')
    plt.show()



In [ ]:

Standard bins


In [135]:
model = build_cat_emb_model(100, emb_size, max_id, 2, bin_edges.shape[0]-1)

In [147]:
model.compile(optimizer=Adam(0.00001), loss=approx_crps)

In [148]:
model.fit([train_set.features, train_set.cont_ids], train_cat_targets_nearest, 
              epochs=10, batch_size=4096, 
              validation_data=[[test_set.features, test_set.cont_ids], test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/10
180849/180849 [==============================] - 3s - loss: 0.9701 - val_loss: 0.9442
Epoch 2/10
180849/180849 [==============================] - 2s - loss: 0.9701 - val_loss: 0.9441
Epoch 3/10
180849/180849 [==============================] - 2s - loss: 0.9701 - val_loss: 0.9440
Epoch 4/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9440
Epoch 5/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9439
Epoch 6/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9439
Epoch 7/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9438
Epoch 8/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9438
Epoch 9/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9437
Epoch 10/10
180849/180849 [==============================] - 2s - loss: 0.9700 - val_loss: 0.9437
Out[148]:
<keras.callbacks.History at 0x7fa49d4cff28>

In [149]:
preds = model.predict([test_set.features, test_set.cont_ids])

In [150]:
maybe_correct_cat_crps(preds, test_set.targets, bin_edges)


Out[150]:
0.95332045387173248

In [151]:
i = 10
plt.bar(bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8)
plt.bar(bin_edges[:-1] + 0.25, test_cat_targets[i], width=0.5, zorder=0.1)
plt.axvline(test_set.targets[i], c='r')
plt.show()


testing a convolution layer


In [153]:
from keras.layers import Conv1D

In [158]:
def build_cat_emb_conv_model(hidden_nodes, emb_size, max_id, feature_size, n_bins, maps):
    features_in = Input(shape=(feature_size, 1))
    id_in = Input(shape=(1,))
    emb = Embedding(max_id + 1, emb_size)(id_in)
    emb = Flatten()(emb)
    conv = Conv1D(maps, feature_size)(features_in)
    conv = Flatten()(conv)
    x = Concatenate()([conv, emb])
    x = Dense(hidden_nodes, activation='relu')(x)
    x = Dense(n_bins, activation='softmax')(x)
    model = Model(inputs=[features_in, id_in], outputs=x)
    return model

In [161]:
model = build_cat_emb_conv_model(100, emb_size, max_id, 50, bin_edges.shape[0]-1, 2)

In [162]:
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_27 (InputLayer)            (None, 50, 1)         0                                            
____________________________________________________________________________________________________
input_28 (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
conv1d_4 (Conv1D)                (None, 1, 2)          102         input_27[0][0]                   
____________________________________________________________________________________________________
embedding_7 (Embedding)          (None, 1, 5)          2685        input_28[0][0]                   
____________________________________________________________________________________________________
flatten_9 (Flatten)              (None, 2)             0           conv1d_4[0][0]                   
____________________________________________________________________________________________________
flatten_8 (Flatten)              (None, 5)             0           embedding_7[0][0]                
____________________________________________________________________________________________________
concatenate_5 (Concatenate)      (None, 7)             0           flatten_9[0][0]                  
                                                                   flatten_8[0][0]                  
____________________________________________________________________________________________________
dense_43 (Dense)                 (None, 100)           800         concatenate_5[0][0]              
____________________________________________________________________________________________________
dense_44 (Dense)                 (None, 130)           13130       dense_43[0][0]                   
====================================================================================================
Total params: 16,717
Trainable params: 16,717
Non-trainable params: 0
____________________________________________________________________________________________________

In [166]:
model.compile(optimizer=Adam(0.0001), loss=approx_crps)

In [167]:
model.fit([np.atleast_3d(train_set_full_ens.features), train_set_full_ens.cont_ids], 
          train_cat_targets_nearest, 
          epochs=10, batch_size=4096, 
          validation_data=[[np.atleast_3d(test_set_full_ens.features), test_set_full_ens.cont_ids], 
                           test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/10
180849/180849 [==============================] - 3s - loss: 1.1204 - val_loss: 1.0730
Epoch 2/10
180849/180849 [==============================] - 2s - loss: 1.1179 - val_loss: 1.0703
Epoch 3/10
180849/180849 [==============================] - 2s - loss: 1.1158 - val_loss: 1.0683
Epoch 4/10
180849/180849 [==============================] - 2s - loss: 1.1138 - val_loss: 1.0658
Epoch 5/10
180849/180849 [==============================] - 2s - loss: 1.1119 - val_loss: 1.0634
Epoch 6/10
180849/180849 [==============================] - 2s - loss: 1.1102 - val_loss: 1.0619
Epoch 7/10
180849/180849 [==============================] - 2s - loss: 1.1086 - val_loss: 1.0601
Epoch 8/10
180849/180849 [==============================] - 2s - loss: 1.1071 - val_loss: 1.0581
Epoch 9/10
180849/180849 [==============================] - 2s - loss: 1.1057 - val_loss: 1.0567
Epoch 10/10
180849/180849 [==============================] - 2s - loss: 1.1044 - val_loss: 1.0546
Out[167]:
<keras.callbacks.History at 0x7fa4940734e0>

Auxiliary data

Mean and std


In [208]:
aux_dict = OrderedDict()
aux_dict['data_aux_geo_interpolated.nc'] = ['orog', 
                                            'station_alt', 
                                            'station_lat', 
                                            'station_lon']
aux_dict['data_aux_pl500_interpolated_00UTC.nc'] = ['u_pl500_fc',
                                                    'v_pl500_fc',
                                                    'gh_pl500_fc']
aux_dict['data_aux_pl850_interpolated_00UTC.nc'] = ['u_pl850_fc',
                                                    'v_pl850_fc',
                                                    'q_pl850_fc']
aux_dict['data_aux_surface_interpolated_00UTC.nc'] = ['cape_fc',
                                                      'sp_fc',
                                                      'tcc_fc']

In [212]:
train_set_aux, test_set_aux = get_train_test_sets(DATA_DIR, train_dates, test_dates,
                                                  aux_dict=aux_dict)


train set contains 365 days
test set contains 366 days

In [214]:
model = build_cat_model(train_set_aux.features.shape[1], [50], bin_edges.shape[0]-1)

In [215]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_36 (InputLayer)        (None, 24)                0         
_________________________________________________________________
dense_57 (Dense)             (None, 50)                1250      
_________________________________________________________________
dense_58 (Dense)             (None, 130)               6630      
=================================================================
Total params: 7,880
Trainable params: 7,880
Non-trainable params: 0
_________________________________________________________________

In [223]:
model.compile(optimizer=Adam(0.0001), loss=approx_crps)

In [224]:
model.fit(train_set_aux.features, train_cat_targets_nearest, epochs=5, batch_size=1024,
          validation_data=[test_set_aux.features, test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/5
180849/180849 [==============================] - 3s - loss: 1.0580 - val_loss: 1.0450
Epoch 2/5
180849/180849 [==============================] - 3s - loss: 1.0578 - val_loss: 1.0457
Epoch 3/5
180849/180849 [==============================] - 3s - loss: 1.0576 - val_loss: 1.0456
Epoch 4/5
180849/180849 [==============================] - 3s - loss: 1.0575 - val_loss: 1.0435
Epoch 5/5
180849/180849 [==============================] - 3s - loss: 1.0573 - val_loss: 1.0438
Out[224]:
<keras.callbacks.History at 0x7fa47c186e10>

In [267]:
model = build_cat_emb_model(100, emb_size, max_id, train_set_aux.features.shape[1], 
                            bin_edges.shape[0]-1)

In [268]:
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_44 (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_10 (Embedding)         (None, 1, 5)          2685        input_44[0][0]                   
____________________________________________________________________________________________________
input_43 (InputLayer)            (None, 24)            0                                            
____________________________________________________________________________________________________
flatten_16 (Flatten)             (None, 5)             0           embedding_10[0][0]               
____________________________________________________________________________________________________
concatenate_8 (Concatenate)      (None, 29)            0           input_43[0][0]                   
                                                                   flatten_16[0][0]                 
____________________________________________________________________________________________________
dense_67 (Dense)                 (None, 100)           3000        concatenate_8[0][0]              
____________________________________________________________________________________________________
dense_68 (Dense)                 (None, 130)           13130       dense_67[0][0]                   
====================================================================================================
Total params: 18,815
Trainable params: 18,815
Non-trainable params: 0
____________________________________________________________________________________________________

In [274]:
model.compile(optimizer=Adam(0.00001), loss=approx_crps)

In [275]:
model.fit([train_set_aux.features, train_set_aux.cont_ids], train_cat_targets_nearest, 
              epochs=10, batch_size=4096, 
              validation_data=[[test_set_aux.features, test_set_aux.cont_ids], 
                               test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/10
180849/180849 [==============================] - 3s - loss: 0.9672 - val_loss: 0.9603
Epoch 2/10
180849/180849 [==============================] - 2s - loss: 0.9665 - val_loss: 0.9605
Epoch 3/10
180849/180849 [==============================] - 2s - loss: 0.9662 - val_loss: 0.9607
Epoch 4/10
180849/180849 [==============================] - 2s - loss: 0.9661 - val_loss: 0.9608
Epoch 5/10
180849/180849 [==============================] - 2s - loss: 0.9661 - val_loss: 0.9610
Epoch 6/10
180849/180849 [==============================] - 2s - loss: 0.9660 - val_loss: 0.9610
Epoch 7/10
180849/180849 [==============================] - 2s - loss: 0.9660 - val_loss: 0.9611
Epoch 8/10
180849/180849 [==============================] - 2s - loss: 0.9660 - val_loss: 0.9610
Epoch 9/10
180849/180849 [==============================] - 2s - loss: 0.9659 - val_loss: 0.9609
Epoch 10/10
180849/180849 [==============================] - 2s - loss: 0.9659 - val_loss: 0.9609
Out[275]:
<keras.callbacks.History at 0x7fa46f6fbcf8>

Full ensemble


In [226]:
train_set_aux_ens, test_set_aux_ens = get_train_test_sets(DATA_DIR, train_dates, test_dates,
                                                          aux_dict=aux_dict, 
                                                          full_ensemble_t=True)


train set contains 365 days
test set contains 366 days

In [276]:
train_set_aux_ens.features.shape


Out[276]:
(180849, 72)

In [280]:
model = build_cat_emb_model(100, emb_size, max_id, train_set_aux_ens.features.shape[1], 
                            bin_edges.shape[0]-1)

In [285]:
model.compile(optimizer=Adam(0.0001), loss=approx_crps)

In [286]:
model.fit([train_set_aux_ens.features, train_set_aux_ens.cont_ids], train_cat_targets_nearest, 
              epochs=10, batch_size=4096, 
              validation_data=[[test_set_aux_ens.features, test_set_aux_ens.cont_ids], 
                               test_cat_targets_nearest])


Train on 180849 samples, validate on 182218 samples
Epoch 1/10
180849/180849 [==============================] - 3s - loss: 0.9205 - val_loss: 0.9256
Epoch 2/10
180849/180849 [==============================] - 2s - loss: 0.9202 - val_loss: 0.9257
Epoch 3/10
180849/180849 [==============================] - 2s - loss: 0.9201 - val_loss: 0.9260
Epoch 4/10
180849/180849 [==============================] - 2s - loss: 0.9199 - val_loss: 0.9251
Epoch 5/10
180849/180849 [==============================] - 2s - loss: 0.9197 - val_loss: 0.9258
Epoch 6/10
180849/180849 [==============================] - 2s - loss: 0.9195 - val_loss: 0.9262
Epoch 7/10
180849/180849 [==============================] - 2s - loss: 0.9194 - val_loss: 0.9250
Epoch 8/10
180849/180849 [==============================] - 2s - loss: 0.9193 - val_loss: 0.9260
Epoch 9/10
180849/180849 [==============================] - 2s - loss: 0.9191 - val_loss: 0.9249
Epoch 10/10
180849/180849 [==============================] - 2s - loss: 0.9190 - val_loss: 0.9256
Out[286]:
<keras.callbacks.History at 0x7fa46d45cc18>

In [287]:
preds = model.predict([test_set_aux_ens.features, test_set_aux_ens.cont_ids])

In [288]:
maybe_correct_cat_crps(preds, test_set_aux_ens.targets, bin_edges)


Out[288]:
0.92655730152299154

XGBoost


In [145]:
import xgboost as xgb


/Users/stephanrasp/anaconda/envs/py36_keras/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [154]:
def convert_targets_nearest_idxs(targets, bin_edges):
    mat_bin_edges = np.repeat(np.atleast_2d(bin_edges), targets.shape[0], axis=0).T
    return np.argmin(np.abs(mat_bin_edges - targets),axis=0)

In [155]:
xgb_train_targets = convert_targets_nearest_idxs(train_conv_targets, conv_bin_edges)
xgb_test_targets = convert_targets_nearest_idxs(test_conv_targets, conv_bin_edges)

In [160]:
xgb_train_targets[:5]


Out[160]:
array([30, 33, 29, 35, 30])

In [156]:
dtrain = xgb.DMatrix(train_set.features, xgb_train_targets, 
                     feature_names=train_set.feature_names)
dtest = xgb.DMatrix(test_set.features, xgb_test_targets, 
                     feature_names=test_set.feature_names)

In [169]:
our_params={
            'eta':0.2,
            'subsample':0.8,
            'objective':'multi:softprob',
            'eval_metric': 'mlogloss',
            'num_class': conv_bin_edges.shape[0]-1
            }
watchlist  = [(dtest,'eval'), (dtrain,'train')]

In [170]:
bst=xgb.train(our_params, dtrain, 15, watchlist)


[0]	eval-mlogloss:3.77573	train-mlogloss:3.79603
[1]	eval-mlogloss:3.55214	train-mlogloss:3.5737
[2]	eval-mlogloss:3.40014	train-mlogloss:3.42155
[3]	eval-mlogloss:3.28701	train-mlogloss:3.30776
[4]	eval-mlogloss:3.19848	train-mlogloss:3.21844
[5]	eval-mlogloss:3.12733	train-mlogloss:3.14643
[6]	eval-mlogloss:3.06885	train-mlogloss:3.08704
[7]	eval-mlogloss:3.02004	train-mlogloss:3.03736
[8]	eval-mlogloss:2.97877	train-mlogloss:2.99532
[9]	eval-mlogloss:2.94371	train-mlogloss:2.95945
[10]	eval-mlogloss:2.91356	train-mlogloss:2.92853
[11]	eval-mlogloss:2.88757	train-mlogloss:2.90173
[12]	eval-mlogloss:2.86505	train-mlogloss:2.87848
[13]	eval-mlogloss:2.84545	train-mlogloss:2.85814
[14]	eval-mlogloss:2.82833	train-mlogloss:2.84032

In [171]:
preds_xgb = bst.predict(dtest)

In [172]:
maybe_correct_cat_crps(preds_xgb, test_conv_targets, conv_bin_edges)


Out[172]:
1.1241186175877742

In [175]:
for i in np.random.randint(0, 182218, 10):
    print(i)
    print(test_set.targets[i], test_set.features[i, 0] * test_set.scale_factors[0])
    plt.bar(conv_bin_edges[:-1] + 0.25, preds_xgb[i], width=0.5, alpha=0.8, color='navy')
    plt.bar(conv_bin_edges[:-1] + 0.25, preds[i], width=0.5, alpha=0.8, color='lightblue')
    plt.bar(conv_bin_edges[:-1] + 0.25, conv_test_cat_targets_nearest[i], width=0.5, zorder=0.1)
    plt.axvline(test_conv_targets[i], c='r')
    plt.show()


51970
8.3 6.36791
146430
4.3 3.80336
49493
0.9 2.75774
67465
7.4 6.4445
132167
11.7 13.4324
72286
10.5 10.3513
133893
13.1 14.2716
25253
0.5 0.162234
121419
18.9 18.6738
21629
-0.9 -1.11395

In [ ]: