In [5]:
# Model category name used throughout the subsequent analysis
model_cat_id = "01"
# Which features from the dataset should be loaded:
# ['all', 'actual', 'entsoe', 'weather_t', 'weather_i', 'holiday', 'weekday', 'hour', 'month']
features = ['actual', 'entsoe']
# LSTM Layer configuration
# ========================
# Stateful flag per layer: True or False
layer_conf = [ True, True, True ]
# Number of neurons per layer
cells = [[ 5, 10, 20, 30, 50, 75, 100, 125, 150 ], [0, 10, 20, 50], [0, 10, 15, 20]]
# Regularization per layer
dropout = [0, 0.1, 0.2]
# Batch size: number of samples used for one forward/backward pass
batch_size = [8]
# Number of timesteps the model should output per sample (the output dimension). Currently not implemented; defaults to 1.
timesteps = [1]
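The lists above span the hyperparameter grid that is searched further below. As a quick sanity check (assuming lstm.generate_combinations enumerates the full Cartesian product of per-layer cell counts, dropout rates and batch sizes, with 0 cells meaning the layer is omitted), the upper bound on the number of model configurations can be previewed:
In [ ]:
# Rough upper bound on the search-space size; the authoritative enumeration
# happens in lstm.generate_combinations further below.
n_layer_combos = 1
for layer_cells in cells:
    n_layer_combos *= len(layer_cells)                          # 9 * 4 * 4 = 144
n_total = n_layer_combos * len(dropout) * len(batch_size)       # 144 * 3 * 1 = 432
print('Hyperparameter grid size (upper bound): {}'.format(n_total))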
In [3]:
import os
import sys
import math
import itertools
import datetime as dt
import pytz
import time as t
import numpy as np
import pandas as pd
from pandas import read_csv
from numpy import newaxis
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.tsa import stattools
from tabulate import tabulate
import keras as keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM
from keras.callbacks import TensorBoard
from keras.utils import np_utils
from keras.models import load_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from IPython.display import HTML
from IPython.display import display
%matplotlib notebook
mpl.rcParams['figure.figsize'] = (9,5)
# Import custom module functions
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
from lstm_load_forecasting import data, lstm
In [6]:
# Directory with dataset
path = os.path.join(os.path.abspath(''), '../data/fulldataset.csv')
# Split date for train and test data. Since the TBATS and ARIMA benchmarks need two full cycles of all seasonalities, the split must be after Jan 01.
loc_tz = pytz.timezone('Europe/Zurich')
split_date = loc_tz.localize(dt.datetime(2017,2,1,0,0,0,0))
# Validation split percentage
validation_split = 0.2
# How many epochs in total
epochs = 30
# Verbosity level: 0 for per-model output only, 1 for a progress bar, ...
verbose = 0
# Dataframe containing the relevant data from training of all models
results = pd.DataFrame(columns=['model_name', 'config', 'dropout',
                                'train_loss', 'train_rmse', 'train_mae', 'train_mape',
                                'valid_loss', 'valid_rmse', 'valid_mae', 'valid_mape',
                                'test_rmse', 'test_mae', 'test_mape',
                                'epochs', 'batch_train', 'input_shape',
                                'total_time', 'time_step', 'splits'
                                ])
# Early stopping parameters
early_stopping = True
min_delta = 0.006
patience = 2
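The early stopping settings are handed to lstm.train_model below; presumably they are wired into Keras' EarlyStopping callback on the validation loss. A minimal sketch of the equivalent callback, for reference only:
In [ ]:
# Sketch only: training stops once val_loss has failed to improve by at least
# `min_delta` for `patience` consecutive epochs (the wiring inside
# lstm.train_model may differ in detail).
from keras.callbacks import EarlyStopping
early_stopping_cb = EarlyStopping(monitor='val_loss', min_delta=min_delta, patience=patience, verbose=1)
# Usage: model.fit(X, y, validation_split=validation_split, callbacks=[early_stopping_cb], ...)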
In [7]:
# Generate output folders and files
res_dir = '../results/notebook_' + model_cat_id + '/'
plot_dir = '../plots/notebook_' + model_cat_id + '/'
model_dir = '../models/notebook_' + model_cat_id + '/'
os.makedirs(res_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
output_table = res_dir + model_cat_id + '_results_' + t.strftime("%Y%m%d") + '.csv'
test_output_table = res_dir + model_cat_id + '_test_results' + t.strftime("%Y%m%d") + '.csv'
# Generate model combinations
models = []
models = lstm.generate_combinations(
    model_name=model_cat_id + '_', layer_conf=layer_conf, cells=cells, dropout=dropout,
    batch_size=batch_size, timesteps=[1])
In [11]:
# Load data and prepare for standardization
df = data.load_dataset(path=path, modules=features)
df_scaled = df.copy()
df_scaled = df_scaled.dropna()
# Get all float type columns and standardize them
floats = df_scaled.select_dtypes(include=['float64']).columns.tolist()
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(df_scaled[floats])
df_scaled[floats] = scaled_columns
# Split in train and test dataset
df_train = df_scaled.loc[(df_scaled.index < split_date )].copy()
df_test = df_scaled.loc[df_scaled.index >= split_date].copy()
# Split in features and label data
y_train = df_train['actual'].copy()
X_train = df_train.drop('actual', axis=1).copy()
y_test = df_test['actual'].copy()
X_test = df_test.drop('actual', axis=1).copy()
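Note that the target column 'actual' is standardized together with the other float columns, so losses, errors and predictions below are on the standardized scale. A small sketch (assuming 'actual' is among the scaled float columns) of how predictions could be mapped back to the original load scale:
In [ ]:
# Sketch: invert the standardization for the target column only.
# StandardScaler keeps one mean/scale per column, in the order of `floats`.
actual_idx = floats.index('actual')
mu, sigma = scaler.mean_[actual_idx], scaler.scale_[actual_idx]

def rescale(standardized_values):
    """Map standardized load values or predictions back to the original scale."""
    return standardized_values * sigma + mu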
In [6]:
start_time = t.time()
for idx, m in enumerate(models):
    stopper = t.time()
    print('========================= Model {}/{} ========================='.format(idx+1, len(models)))
    print(tabulate([['Starting with model', m['name']], ['Starting time', dt.datetime.fromtimestamp(stopper)]],
                   tablefmt="jira", numalign="right", floatfmt=".3f"))
    try:
        # Creating the Keras model
        model = lstm.create_model(layers=m['layers'], sample_size=X_train.shape[0], batch_size=m['batch_size'],
                                  timesteps=m['timesteps'], features=X_train.shape[1])
        # Training...
        history = lstm.train_model(model=model, mode='fit', y=y_train, X=X_train,
                                   batch_size=m['batch_size'], timesteps=m['timesteps'], epochs=epochs,
                                   rearrange=False, validation_split=validation_split, verbose=verbose,
                                   early_stopping=early_stopping, min_delta=min_delta, patience=patience)
        # Write results
        min_loss = np.min(history.history['val_loss'])
        min_idx = np.argmin(history.history['val_loss'])
        min_epoch = min_idx + 1
        if verbose > 0:
            print('______________________________________________________________________')
            print(tabulate([['Minimum validation loss at epoch', min_epoch, 'Time: {}'.format(t.time()-stopper)],
                            ['Training loss & MAE', history.history['loss'][min_idx], history.history['mean_absolute_error'][min_idx]],
                            ['Validation loss & MAE', history.history['val_loss'][min_idx], history.history['val_mean_absolute_error'][min_idx]],
                            ], tablefmt="jira", numalign="right", floatfmt=".3f"))
            print('______________________________________________________________________')
        result = [{'model_name': m['name'], 'config': m, 'train_loss': history.history['loss'][min_idx], 'train_rmse': 0,
                   'train_mae': history.history['mean_absolute_error'][min_idx], 'train_mape': 0,
                   'valid_loss': history.history['val_loss'][min_idx], 'valid_rmse': 0,
                   'valid_mae': history.history['val_mean_absolute_error'][min_idx], 'valid_mape': 0,
                   'test_rmse': 0, 'test_mae': 0, 'test_mape': 0, 'epochs': '{}/{}'.format(min_epoch, epochs), 'batch_train': m['batch_size'],
                   'input_shape': (X_train.shape[0], m['timesteps'], X_train.shape[1]), 'total_time': t.time()-stopper,
                   'time_step': 0, 'splits': str(split_date), 'dropout': m['layers'][0]['dropout']
                   }]
        results = results.append(result, ignore_index=True)
        # Saving the model and weights
        model.save(model_dir + m['name'] + '.h5')
        # Write results to csv
        results.to_csv(output_table, sep=';')
        #if not os.path.isfile(output_table):
        #    results.to_csv(output_table, sep=';')
        #else:  # it exists, so append without writing the header
        #    results.to_csv(output_table, mode='a', header=False, sep=';')
        K.clear_session()
        import tensorflow as tf
        tf.reset_default_graph()
    # Shouldn't catch all errors, but for now...
    except BaseException as e:
        print('=============== ERROR {}/{} ============='.format(idx+1, len(models)))
        print(tabulate([['Model:', m['name']], ['Config:', m]], tablefmt="jira", numalign="right", floatfmt=".3f"))
        print('Error: {}'.format(e))
        result = [{'model_name': m['name'], 'config': m, 'train_loss': str(e)}]
        results = results.append(result, ignore_index=True)
        results.to_csv(output_table, sep=';')
        continue
Select the top 5 models based on the mean absolute error on the validation data: http://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-error
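For reference, the mean absolute error used for the ranking is simply the average absolute deviation; a quick check against the mean_absolute_error import already loaded above:
In [ ]:
# MAE sanity check: mean(|y_true - y_pred|), here (0.5 + 0.0 + 0.5) / 3 = 1/3.
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 2.0, 2.5])
assert np.isclose(mean_absolute_error(y_true, y_pred), np.mean(np.abs(y_true - y_pred)))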
In [9]:
# Number of the selected top models
selection = 5
# Not needed if the models were trained in the same session; if run on the same day, output_table can be used directly
results_fn = res_dir + model_cat_id + '_results_' + '20170616' + '.csv'
results_csv = pd.read_csv(results_fn, delimiter=';')
top_models = results_csv.nsmallest(selection, 'valid_mae')
In [12]:
# Init test results table
test_results = pd.DataFrame(columns=['Model name', 'Mean absolute error', 'Mean squared error'])
# Init empty predictions
predictions = {}
# Loop through models
for index, row in top_models.iterrows():
    filename = model_dir + row['model_name'] + '.h5'
    model = load_model(filename)
    batch_size = int(row['batch_train'])
    # Calculate scores
    loss, mae = lstm.evaluate_model(model=model, X=X_test, y=y_test, batch_size=batch_size, timesteps=1, verbose=verbose)
    # Store results
    result = [{'Model name': row['model_name'],
               'Mean squared error': loss, 'Mean absolute error': mae
               }]
    test_results = test_results.append(result, ignore_index=True)
    # Generate predictions
    model.reset_states()
    model_predictions = lstm.get_predictions(model=model, X=X_test, batch_size=batch_size, timesteps=timesteps[0], verbose=verbose)
    # Save predictions
    predictions[row['model_name']] = model_predictions
    K.clear_session()
    import tensorflow as tf
    tf.reset_default_graph()
test_results = test_results.sort_values('Mean absolute error', ascending=True)
test_results = test_results.set_index(['Model name'])
if not os.path.isfile(test_output_table):
    test_results.to_csv(test_output_table, sep=';')
else:  # it exists, so append without writing the header
    test_results.to_csv(test_output_table, mode='a', header=False, sep=';')
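With the predictions collected, the best model can be compared against the test actuals. A plotting sketch (assuming lstm.get_predictions returns a flat array aligned with y_test, possibly truncated to a multiple of the batch size):
In [ ]:
# Sketch: plot the best model's (standardized) predictions against the test actuals.
best_name = test_results.index[0]                 # lowest MAE after the sorting above
best_pred = np.squeeze(predictions[best_name])
plt.figure()
plt.plot(y_test.values[:len(best_pred)], label='actual (standardized)')
plt.plot(best_pred, label=best_name)
plt.legend()
plt.show()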
In [13]:
print('Test dataset performance of the best {} (out of {} tested models):'.format(min(selection, len(models)), len(models)))
print(tabulate(test_results, headers='keys', tablefmt="grid", numalign="right", floatfmt=".3f"))
In [ ]: