In [1]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
In [2]:
# read the test batches from files created by the Preprocessing_NN_Data notebook
def read_batches(filepath, include_cl_kadij_input=True):
    batches = []
    number_read = -1
    while True:
        number_read += 1
        try:
            batch = pd.read_csv(filepath + '_' + str(number_read) + '.csv', index_col=0, parse_dates=True)
            if not include_cl_kadij_input:
                batch = batch.drop('cl_kadij_1', axis=1)
            batches.append(batch)
        except FileNotFoundError:
            break
    return batches

# make sure this matches what was used to train the model, or errors will occur due to different input sizes:
include_cl_kadij_input = True
# always read the cl_kadij_1 column here: it is needed for the basic prediction below
# and is dropped afterwards if the model was trained without it
test_batches = read_batches('data/test/batch', True)
basic_prediction = [batch['cl_kadij_1'].values for batch in test_batches]
if not include_cl_kadij_input:
    test_batches = [batch.drop('cl_kadij_1', axis=1) for batch in test_batches]
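# Optional sanity check (a sketch, not part of the original analysis): confirm that
# batch files were actually found on disk before continuing.
print(len(test_batches), 'test batches read')
if test_batches:
    print('columns of the first batch:', list(test_batches[0].columns))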
In [3]:
#print an example batch to see what they look like
batch = test_batches[0]
batch.head()
Out[3]:
In [4]:
# convert the dataframes in test_batches to the correct input and output shape (and turn them into arrays)
def convert_to_keras_input(batches):
    input_batches = []
    output_batches = []
    for batch in batches:
        input_df = batch.iloc[:, 1:]
        output_df = batch['cl_kadij_out']
        input_batches.append(input_df.values[:, np.newaxis, :])
        output_batches.append(output_df.values)
    return (input_batches, output_batches)

(input_test_batches, output_test_batches) = convert_to_keras_input(test_batches)
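# Quick shape check (a sketch): a Keras LSTM expects input of shape (samples, timesteps, features),
# so each converted batch should come out as (rows, 1, number_of_input_columns).
print('input shape:', input_test_batches[0].shape)
print('output shape:', output_test_batches[0].shape)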
In [5]:
# load the trained model from a file
model_filename = 'models/2layer_lstm_11cells'
model = keras.models.load_model(model_filename + '.h5')
with open(model_filename + '_info.txt') as info_file:
    info = info_file.read()
print('------- info of read model --------\n')
print(info)
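# Optionally inspect the loaded architecture as well (a sketch; summary() is a standard
# Keras model method that prints the layers, output shapes and parameter counts).
model.summary()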
In [6]:
# get the test results:
output_model = []
actual_value = []
for i in range(len(test_batches)):
    batch_model = model.predict(input_test_batches[i], batch_size=len(input_test_batches[i]))
    output_model.append([val for sublist in batch_model for val in sublist])
    actual_value.append(output_test_batches[i])
In [7]:
# load the scaler data from a file and define a function that transforms the cl_kadij data back to the actual values
scaler_data = pd.read_csv('data/scaler_data.csv', index_col=0)

def rescale_data(data, scaler_name):
    data_range = scaler_data.loc[scaler_name, 'data_range_']
    data_min = scaler_data.loc[scaler_name, 'data_min_']
    converted = np.array(data) * data_range + data_min
    return converted

def convert_cl_kadij_output(output):
    rescaled_output = rescale_data(output, 'cl_kadij_scaler')
    return np.exp(rescaled_output)
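# Optional round-trip check (a sketch): assuming the preprocessing notebook applied np.log
# followed by min-max scaling with this scaler, forward-transforming a made-up chloride
# value and converting it back should recover the original up to floating point error.
example_value = 150.0  # hypothetical value, not taken from the data
data_range = scaler_data.loc['cl_kadij_scaler', 'data_range_']
data_min = scaler_data.loc['cl_kadij_scaler', 'data_min_']
example_scaled = (np.log(example_value) - data_min) / data_range
print(convert_cl_kadij_output([example_scaled]))  # should print approximately [150.]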
In [8]:
# rescale and transform data back to original values:
output_model_scaled = [convert_cl_kadij_output(output) for output in output_model]
actual_value_scaled = [convert_cl_kadij_output(output) for output in actual_value]
basic_prediction_scaled = [convert_cl_kadij_output(output) for output in basic_prediction]
In [9]:
# define some functions to calculate analysis of test results
def calculate_MSE(predicted, actual, start_index=0):
    '''calculates the MSE and returns it as well as the number of predictions and batches used to calculate it
    returns (MSE, #predictions, #batches)
    @param predicted: array of arrays of outputs per batch
    @param actual: array of arrays of expected values per batch
    @param start_index: only values starting at the start_index will be used for MSE for each batch
    '''
    if len(predicted) != len(actual):
        raise ValueError('predicted and actual should contain the same number of batches')
    if start_index < 0:
        raise ValueError("can't start at a negative index")
    SE_sum = 0
    instance_count = 0
    batch_count = 0
    for i in range(len(predicted)):
        prediction = predicted[i]
        expected = actual[i]
        if len(prediction) <= start_index:
            continue
        SE = (prediction[start_index:] - expected[start_index:])**2
        SE_sum += np.sum(SE)
        instance_count += len(prediction[start_index:])
        batch_count += 1
    if instance_count == 0:
        raise ValueError('No output data available with this start_index')
    return (SE_sum / instance_count, instance_count, batch_count)

def calculate_prediction_skill(predicted, basic_predicted, actual, start_index=0):
    '''calculates the prediction skill compared to the basic prediction method and returns it
    returns (prediction_skill, MSE_model, MSE_basic, #predictions, #batches)'''
    (MSE_model, instance_count, batch_count) = calculate_MSE(predicted, actual, start_index)
    (MSE_basic, instance_count, batch_count) = calculate_MSE(basic_predicted, actual, start_index)
    prediction_skill = 1 - MSE_model / MSE_basic
    return (prediction_skill, MSE_model, MSE_basic, instance_count, batch_count)
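# Tiny worked example (a sketch with made-up numbers, not project data):
# actual values [2, 4], model predicts [2.5, 4.5], basic method predicts [1, 3]
# MSE_model = (0.5**2 + 0.5**2) / 2 = 0.25, MSE_basic = (1**2 + 1**2) / 2 = 1.0
# prediction skill = 1 - 0.25 / 1.0 = 0.75
print(calculate_prediction_skill([np.array([2.5, 4.5])],
                                 [np.array([1.0, 3.0])],
                                 [np.array([2.0, 4.0])]))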
In [10]:
# define a function to print the calculated analysis results:
def print_prediction_skill(prediction_skill, MSE_model, MSE_basic, instance_count, batch_count):
    print('prediction skill:', prediction_skill)
    print('MSE model:', MSE_model)
    print('MSE basic method:', MSE_basic)
    print('based on', instance_count, 'instances in', batch_count, 'batches')
In [11]:
#print analysis results for all test data:
print('--- Analysis done over all test data ---')
print_prediction_skill(*calculate_prediction_skill(output_model_scaled, basic_prediction_scaled, actual_value_scaled))
In [12]:
# calculate the prediction skill with differing start_index (start-up time ignored by analysis)
minimum_instances = 2000
minimum_batches = 200
index = 0
prediction_skills = []
MSEs_model = []
MSEs_basic = []
instance_counts = []
batch_counts = []
while True:
    (prediction_skill, MSE_model, MSE_basic, instance_count, batch_count) = calculate_prediction_skill(
        output_model_scaled, basic_prediction_scaled, actual_value_scaled, start_index=index)
    prediction_skills.append(prediction_skill)
    if batch_count < minimum_batches or instance_count < minimum_instances:
        break
    MSEs_model.append(MSE_model)
    MSEs_basic.append(MSE_basic)
    instance_counts.append(instance_count)
    batch_counts.append(batch_count)
    index += 1
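# Note (a sketch, not part of the original analysis): prediction_skills is appended
# before the break check, so it ends up with one more entry than the other lists;
# printing the lengths makes that explicit when comparing the plots below.
print(len(prediction_skills), 'prediction skills,', len(MSEs_model), 'MSE values')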
In [13]:
plt.plot(prediction_skills)
plt.xlabel('start_index')
plt.ylabel('prediction skill');
In [14]:
plt.plot(MSEs_model, 'b', label='MSE model')
plt.plot(MSEs_basic, 'r--', label='MSE basic method')
plt.xlabel('start_index')
plt.legend();
In [15]:
f, ax_array = plt.subplots(2, sharex=True)
f.tight_layout()
#f.xlabel('start_index')
ax_array[0].plot(instance_counts, label='number of instances')
ax_array[0].legend()
ax_array[1].plot(batch_counts, label='number of batches')
ax_array[1].legend()
f.text(0.5, 0.0, 'start_index', ha='center');
In [16]:
# old functions to calculate MSE and prediction skill still used by cell below
def print_MSE(predicted, actual, print_prefix='', start_index=0):
    '''calculates the MSE and prints it, returns the MSE'''
    errors = np.abs(predicted - actual)
    squared_errors = errors**2
    MSE = np.average(squared_errors)
    print(print_prefix + 'MSE:', MSE)
    return MSE

def print_prediction_skill_old(predicted, actual, basic_predicted, verbose=False):
    '''calculates the prediction skill compared to the basic prediction method of using the same value as yesterday
    the calculated prediction skill is printed and returned
    note that it calls the print_MSE function which will also print the MSE
    use verbose=True to also print the MSE of the basic prediction method'''
    MSE = print_MSE(predicted, actual)
    basic_errors = np.abs(actual - basic_predicted)
    basic_MSE = np.average(basic_errors**2)
    prediction_skill = 1 - MSE / basic_MSE
    if verbose:
        print('MSE basic method:', basic_MSE)
    print('prediction skill:', prediction_skill)
    return prediction_skill
In [17]:
batch_index = 0
print('batch size', len(test_batches[batch_index]))
batch_output_model = model.predict(input_test_batches[batch_index], batch_size=len(input_test_batches[batch_index]))
batch_output_model = [val for subarray in batch_output_model for val in subarray]
batch_output_actual_converted = convert_cl_kadij_output(output_test_batches[batch_index])
batch_output_model_converted = convert_cl_kadij_output(batch_output_model)
batch_output_basic_converted = basic_prediction_scaled[batch_index]
plt.plot(test_batches[batch_index].index.values, batch_output_actual_converted, 'b--', label='actual data')
plt.plot(test_batches[batch_index].index.values, batch_output_model_converted, 'y', label='predicted data')
plt.axhline(250, color='red')
plt.legend();
print('\n--- When analysing each data point in this batch ---')
print_prediction_skill(*calculate_prediction_skill([batch_output_model_converted], [batch_output_basic_converted], [batch_output_actual_converted]))
In [18]:
# print the indices of big (>100) batches to use in the cell above
batch_sizes = [len(batch) for batch in test_batches]
print(np.where(np.array(batch_sizes) > 100))