Compare Forecasts



In [1]:

    
import os
import sys
import datetime
import pandas as pd
import numpy as np

from keras.models import load_model
from keras import backend as K

import pytz

# Setup for Latex Export: https://matplotlib.org/users/pgf.html. Need to import before pyplot
def figsize(scale):
    fig_width_pt = 469.755                          # Get this from LaTeX using \the\textwidth
    inches_per_pt = 1.0/72.27                       # Convert pt to inch
    golden_mean = (np.sqrt(5.0)-1.0)/2.0            # Aesthetic ratio (you could change this)
    fig_width = fig_width_pt*inches_per_pt*scale    # width in inches
    fig_height = fig_width*golden_mean              # height in inches
    fig_size = [fig_width,fig_height]
    return fig_size

import matplotlib as mpl
mpl.use('pgf')
pgf_with_rc_fonts = {
    "text.usetex": True,
    "font.family": "serif",
    "axes.labelsize": 10,               # LaTeX default is 10pt font.
    "font.size": 10,
    "legend.fontsize": 8,               # Make the legend/label fonts a little smaller
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "figure.figsize": figsize(0.9),     # default fig size of 0.9 textwidth
    #"font.serif": [],                   # use latex default serif font
    #"font.sans-serif": ["DejaVu Sans"], # use a specific sans-serif font
}
mpl.rcParams.update(pgf_with_rc_fonts)

import matplotlib.pyplot as plt


from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate

# Import custom module functions
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from lstm_load_forecasting import data, lstm

from ast import literal_eval

%matplotlib









    



Using TensorFlow backend.






    



Using matplotlib backend: TkAgg

TBATS Benchmark



In [2]:

    
tbats_fc = pd.read_csv(os.path.join('../Data', 'tbats_forecast_01022017-h5000.csv'))

starting = datetime.datetime(2017,2,1,0,0,0,0, tzinfo=pytz.utc )
forecasts = pd.DataFrame(data={"tbats_forecast": tbats_fc['tbats_fc'].values}, index=pd.date_range(starting, periods=5000, freq='60min'))

ARMA Forecast 1: ENTSOE + Calendar



In [3]:

    
arma_fc = pd.read_csv(os.path.join('../Data', 'arma_fc.csv'))
starting = datetime.datetime(2017,2,1,0,0,0,0, tzinfo=pytz.utc)
arma_forecasts = pd.DataFrame(data={"arma_forecast": arma_fc['x'].values}, index=pd.date_range(starting, periods=arma_fc.shape[0], freq='60min'))

ARMA Forecast 2: Weather + Calendar



In [4]:

    
arma_fc2 = pd.read_csv(os.path.join('../Data', 'arma_fc2.csv'))
starting = datetime.datetime(2017,2,1,0,0,0,0, tzinfo=pytz.utc)
arma_forecasts2 = pd.DataFrame(data={"arma_forecast2": arma_fc2['x'].values}, index=pd.date_range(starting, periods=arma_fc2.shape[0], freq='60min'))

Actual Load and ENTSOE Benchmark



In [5]:

    
#df = pd.read_csv(os.path.join('Data', 'fulldataset.csv'), sep=';', usecols=[0,1,2], parse_dates=[0], index_col = 0)
path = os.path.join('../Data', 'fulldataset.csv')
entsoe = data.load_dataset(path=path, modules=['entsoe'])
actual = data.load_dataset(path=path, modules=['actual'])

forecasts = forecasts.join(entsoe)
forecasts = forecasts.join(actual)
forecasts = forecasts.join(arma_forecasts)
forecasts = forecasts.join(arma_forecasts2)

LSTM Models

Settings



In [6]:

    
# Best models based on test results
res_path = os.path.abspath('../results/')
model_dir = os.path.abspath('../models/')
date = '20170616'
starting = datetime.datetime(2017,2,1,0,0,0,0, tzinfo=pytz.utc )
epochs = 20
retrain = False
min_delta = 0.006
patience = 2

Model 4 (ENTSO-E + Calendar)



In [7]:

    
model_id = 4

df4 = data.load_dataset(path=path, modules=['actual', 'entsoe', 'calendar'])
df4_scaled = df4.copy()
df4_scaled = df4_scaled.dropna()

# Get all float type columns
floats = [key for key in dict(df4_scaled.dtypes) if dict(df4_scaled.dtypes)[key] in ['float64']]
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(df4_scaled[floats])
df4_scaled[floats] = scaled_columns

df4_train = df4_scaled.loc[(df4_scaled.index < starting)].copy()
df4_test = df4_scaled.loc[df4_scaled.index >= starting].copy()
y_train = df4_train['actual'].copy()
X_train = df4_train.drop('actual', 1).copy()
y_test = df4_test['actual'].copy()
X_test = df4_test.drop('actual', 1).copy()

valid_results_4 = pd.read_csv(os.path.join(res_path, 'notebook_04/', str('04_results_' + date + '.csv')), delimiter=';')
test_results_4 = pd.read_csv(os.path.join(res_path, 'notebook_04/', str('04_test_results' + date + '.csv')), delimiter=';')
test_results_4 = test_results_4.sort_values('Mean absolute error', ascending=True)
best_model_4 = test_results_4.loc[0]['Model name']

config = valid_results_4.loc[valid_results_4['model_name'] == best_model_4]
batch_size = int(config['batch_train'].values[0])
size = int(y_test.shape[0] / batch_size)

# If manually want to retrain model with different settings
if retrain:
    layers = literal_eval(config['config'].values[0])
    layers = layers['layers']

    model4 = lstm.create_model(layers=layers, sample_size=X_train.shape[0], 
                               batch_size=config['batch_train'].values, timesteps=1, 
                               features=X_train.shape[1], loss='mse', optimizer='adam')
    history = lstm.train_model(model=model4, mode='fit', y=y_train, X=X_train, 
                           batch_size=batch_size, timesteps=1, epochs=epochs, 
                           rearrange=False, validation_split=0.2, verbose=1,
                           early_stopping=False
                         )
# Load trained model
elif not retrain:
    notebook = 'notebook_0' + str(model_id)
    mod_name = config['model_name'].values[0]
    filename = os.path.join(model_dir, notebook, (mod_name +'.h5'))
    model4 = load_model(filename)
    model4.reset_states()
    
scaled_predictions = lstm.get_predictions(model=model4, X=X_test[0:size*batch_size], batch_size=batch_size, timesteps=1, verbose=1)

mu = scaler.mean_[0]
sigma = scaler.scale_[0]

mod4_predictions = mu + sigma*scaled_predictions
df_mod4 = pd.DataFrame(data={"model4": mod4_predictions.flatten()}, index=pd.date_range(starting, periods=mod4_predictions.shape[0], freq='60min'))
if 'model4' in forecasts.columns:
    forecasts = forecasts.drop('model4', 1)
forecasts = forecasts.join(df_mod4)

K.clear_session()
import tensorflow as tf
tf.reset_default_graph()









    



2424/2512 [===========================>..] - ETA: 0s

Model 5 (Calendar + Weather)



In [8]:

    
model_id = 5

df5 = data.load_dataset(path=path, modules=['actual', 'calendar', 'weather'])
df5_scaled = df5.copy()
df5_scaled = df5_scaled.dropna()

# Get all float type columns
floats = [key for key in dict(df5_scaled.dtypes) if dict(df5_scaled.dtypes)[key] in ['float64']]
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(df5_scaled[floats])
df5_scaled[floats] = scaled_columns

df5_train = df5_scaled.loc[(df5_scaled.index < starting)].copy()
df5_test = df5_scaled.loc[df5_scaled.index >= starting].copy()
y_train = df5_train['actual'].copy()
X_train = df5_train.drop('actual', 1).copy()
y_test = df5_test['actual'].copy()
X_test = df5_test.drop('actual', 1).copy()

valid_results_5 = pd.read_csv(os.path.join(res_path, 'notebook_05/', str('05_results_' + date + '.csv')), delimiter=';')
test_results_5 = pd.read_csv(os.path.join(res_path, 'notebook_05/', str('05_test_results' + date + '.csv')), delimiter=';')
test_results_5 = test_results_5.sort_values('Mean absolute error', ascending=True)
best_model_5 = test_results_5.loc[0]['Model name']

config = valid_results_5.loc[valid_results_5['model_name'] == best_model_5]
batch_size = int(config['batch_train'].values[0])
size = int(y_test.shape[0] / batch_size)

# If manually want to retrain model with different settings
if retrain:
    layers = literal_eval(config['config'].values[0])
    layers = layers['layers']

    model5 = lstm.create_model(layers=layers, sample_size=X_train.shape[0], 
                               batch_size=config['batch_train'].values, timesteps=1, 
                               features=X_train.shape[1], loss='mse', optimizer='adam')
    history = lstm.train_model(model=model5, mode='fit', y=y_train, X=X_train, 
                           batch_size=batch_size, timesteps=1, epochs=epochs, 
                           rearrange=False, validation_split=0.2, verbose=1,
                           early_stopping=False
                         )
# Load trained model
elif not retrain:
    notebook = 'notebook_0' + str(model_id)
    mod_name = config['model_name'].values[0]
    filename = os.path.join(model_dir, notebook, (mod_name +'.h5'))
    model5 = load_model(filename)
    model5.reset_states()
    
scaled_predictions = lstm.get_predictions(model=model5, X=X_test[0:size*batch_size], batch_size=batch_size, timesteps=1, verbose=1)

mu = scaler.mean_[0]
sigma = scaler.scale_[0]

mod5_predictions = mu + sigma*scaled_predictions
df_mod5 = pd.DataFrame(data={"model5": mod5_predictions.flatten()}, index=pd.date_range(starting, periods=mod5_predictions.shape[0], freq='60min'))
if 'model5' in forecasts.columns:
    forecasts = forecasts.drop('model5', 1)
forecasts = forecasts.join(df_mod5)

K.clear_session()
import tensorflow as tf
tf.reset_default_graph()









    



2400/2512 [===========================>..] - ETA: 0s

LSTM Model 6 (All available data)



In [9]:

    
model_id = 6

df6 = data.load_dataset(path=path, modules=['all'])
df6_scaled = df6.copy()
df6_scaled = df6_scaled.dropna()

# Get all float type columns
floats = [key for key in dict(df6_scaled.dtypes) if dict(df6_scaled.dtypes)[key] in ['float64']]
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(df6_scaled[floats])
df6_scaled[floats] = scaled_columns

df6_train = df6_scaled.loc[(df6_scaled.index < starting)].copy()
df6_test = df6_scaled.loc[df6_scaled.index >= starting].copy()
y_train = df6_train['actual'].copy()
X_train = df6_train.drop('actual', 1).copy()
y_test = df6_test['actual'].copy()
X_test = df6_test.drop('actual', 1).copy()

valid_results_6 = pd.read_csv(os.path.join(res_path, 'notebook_06/', str('06_results_' + date + '.csv')), delimiter=';')
test_results_6 = pd.read_csv(os.path.join(res_path, 'notebook_06/', str('06_test_results' + date + '.csv')), delimiter=';')
test_results_6 = test_results_6.sort_values('Mean absolute error', ascending=True)
best_model_6 = test_results_6.loc[0]['Model name']

config = valid_results_6.loc[valid_results_6['model_name'] == best_model_6]
batch_size = int(config['batch_train'].values[0])
size = int(y_test.shape[0] / batch_size)

# If manually want to retrain model with different settings
if retrain:
    layers = literal_eval(config['config'].values[0])
    layers = layers['layers']

    model6 = lstm.create_model(layers=layers, sample_size=X_train.shape[0], 
                               batch_size=config['batch_train'].values, timesteps=1, 
                               features=X_train.shape[1], loss='mse', optimizer='adam')
    history = lstm.train_model(model=model6, mode='fit', y=y_train, X=X_train, 
                           batch_size=batch_size, timesteps=1, epochs=epochs, 
                           rearrange=False, validation_split=0.2, verbose=1,
                           early_stopping=False
                         )
# Load trained model
elif not retrain:
    notebook = 'notebook_0' + str(model_id)
    mod_name = config['model_name'].values[0]
    filename = os.path.join(model_dir, notebook, (mod_name +'.h5'))
    model6 = load_model(filename)
    model6.reset_states()
    
scaled_predictions = lstm.get_predictions(model=model6, X=X_test[0:size*batch_size], batch_size=batch_size, timesteps=1, verbose=1)

mu = scaler.mean_[0]
sigma = scaler.scale_[0]

mod6_predictions = mu + sigma*scaled_predictions
df_mod6 = pd.DataFrame(data={"model6": mod6_predictions.flatten()}, index=pd.date_range(starting, periods=mod6_predictions.shape[0], freq='60min'))
if 'model6' in forecasts.columns:
    forecasts = forecasts.drop('model6', 1)
forecasts = forecasts.join(df_mod6)

K.clear_session()
import tensorflow as tf
tf.reset_default_graph()









    



2400/2512 [===========================>..] - ETA: 0s

Table with Results



In [10]:

    
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

forecasts = forecasts.dropna()

results = {}
results[''] = ['MSE', 'MAE', 'MAPE']
results['entsoe'] = [mean_squared_error(forecasts['actual'], forecasts['entsoe']), 
                     mean_absolute_error(forecasts['actual'], forecasts['entsoe']),
                     mean_absolute_percentage_error(forecasts['actual'], forecasts['entsoe'])
                    ]
results['tbats'] = [mean_squared_error(forecasts['actual'], forecasts['tbats_forecast']), 
                    mean_absolute_error(forecasts['actual'], forecasts['tbats_forecast']),
                    mean_absolute_percentage_error(forecasts['actual'], forecasts['tbats_forecast'])
                   ]
results['arima'] = [mean_squared_error(forecasts['actual'], forecasts['arma_forecast']), 
                    mean_absolute_error(forecasts['actual'], forecasts['arma_forecast']),
                    mean_absolute_percentage_error(forecasts['actual'], forecasts['arma_forecast'])
                   ]
results['arima2'] = [mean_squared_error(forecasts['actual'], forecasts['arma_forecast2']), 
                    mean_absolute_error(forecasts['actual'], forecasts['arma_forecast2']),
                    mean_absolute_percentage_error(forecasts['actual'], forecasts['arma_forecast2'])
                   ]
results['m4-entsoe-cal'] = [mean_squared_error(forecasts['actual'], forecasts['model4']), 
                          mean_absolute_error(forecasts['actual'], forecasts['model4']),
                          mean_absolute_percentage_error(forecasts['actual'], forecasts['model4'])
                         ]
results['m5-cal-weather'] = [mean_squared_error(forecasts['actual'], forecasts['model5']), 
                          mean_absolute_error(forecasts['actual'], forecasts['model5']),
                          mean_absolute_percentage_error(forecasts['actual'], forecasts['model5'])
                         ]
results['m6-all'] = [mean_squared_error(forecasts['actual'], forecasts['model6']), 
                     mean_absolute_error(forecasts['actual'], forecasts['model6']),
                     mean_absolute_percentage_error(forecasts['actual'], forecasts['model6'])
                    ]
print(tabulate(results, headers='keys', numalign="right", tablefmt='latex_booktabs', floatfmt=".1f"))
print(tabulate(results, headers='keys', numalign="right", floatfmt=".2f"))









    



\begin{tabular}{lrrrrrrr}
\toprule
      &   entsoe &    tbats &    arima &   arima2 &   m4-entsoe-cal &   m5-cal-weather &   m6-all \\
\midrule
 MSE  & 427415.1 & 994304.1 & 398569.0 & 494220.7 &        175988.0 &         162298.2 & 180442.3 \\
 MAE  &    518.3 &    841.7 &    495.9 &    562.9 &           335.0 &            311.4 &    336.7 \\
 MAPE &      7.7 &     13.0 &      7.7 &      8.6 &             5.0 &              4.7 &      5.1 \\
\bottomrule
\end{tabular}
         entsoe      tbats      arima     arima2    m4-entsoe-cal    m5-cal-weather     m6-all
----  ---------  ---------  ---------  ---------  ---------------  ----------------  ---------
MSE   427415.12  994304.13  398569.04  494220.71        175987.98         162298.21  180442.25
MAE      518.27     841.72     495.88     562.88           334.96            311.43     336.68
MAPE       7.69      13.00       7.69       8.63             5.05              4.67       5.12



In [11]:

    
#%matplotlib qt
#plt.clf()
#plt.ion()

plt.figure()
plt.plot(forecasts.index, forecasts['entsoe'], label='ENTSOE Forecast')
plt.plot(forecasts.index, forecasts['actual'], label='Actual Load')
#plt.plot(forecasts.index, forecasts['tbats_forecast'], label='TBATS Forecast')
#plt.plot(forecasts.index, forecasts['arma_forecast'], label='ARIMA Forecast')
#plt.plot(forecasts.index, forecasts['model4'], label='Model 4 (ENTSO-E & Calendar)')
plt.plot(forecasts.index, forecasts['model5'], label='Model 5 (Calendar & Weather)')
#plt.plot(forecasts.index, forecasts['model6'], label='Model 6 (All)')
plt.title('Forecast Comparison: Test Data')
plt.ylabel('Electricity load (in MW)')
plt.xlabel('Date')
plt.legend(loc='upper left')
plt.show

#filename = plot_dir + model_name + 'top_model_predictions'
#plt.savefig(filename + '.pgf')
#plt.savefig(filename + '.pdf')









    Out[11]:





<function matplotlib.pyplot.show>



In [ ]: