In [1]:
import keras
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
cl_kadij = np.genfromtxt("Cl_KadIJ_Merge_dt1h.tek", skip_header=29, skip_footer=1)
cl_lobith = np.genfromtxt("Cl_Lobith_Merge_dt1h.lss", skip_header=131, skip_footer=1)
afvoer_lobith = np.genfromtxt("Q_Lobith_Merge_dt1h.lss", skip_header=131, skip_footer=1)
stand_hvh = np.genfromtxt("h_HvH_Merge_dt1h.tek", skip_header=30, skip_footer=1)
In [3]:
print("shape van cl_kadij:", np.shape(cl_kadij))
print("shape van cl_lobith:", np.shape(cl_lobith))
print("shape van afvoer_lobith:", np.shape(afvoer_lobith))
print("shape van stand_hvh:", np.shape(stand_hvh))
In [4]:
print("eerste elementen:")
print(cl_kadij[1,:])
print(cl_lobith[1,:])
print(afvoer_lobith[1,:])
print(stand_hvh[1,:])
In [5]:
usable_indices = (cl_kadij[:,2] != 999.999) & (cl_lobith[:,3] != 999.999) & (afvoer_lobith[:,3] != 999.999) & (stand_hvh[:,2] != 999.999)
print("Aantal metingen aanwezig in allen:", np.sum(usable_indices))
In [6]:
# check that the data in all series is indeed for the same date-time at the same index:
if (np.sum(cl_kadij[:,:2] != cl_lobith[:,:2]) | np.sum(cl_lobith[:,:2] != afvoer_lobith[:,:2]) | np.sum(afvoer_lobith[:,:2] != stand_hvh[:,:2])):
    print("something is off")
In [7]:
#time_data = (cl_kadij[:,:2])[usable_indices]
time_data = ((np.genfromtxt("Cl_KadIJ_Merge_dt1h.tek", skip_header=29, skip_footer=1, dtype='str'))[:,:2])[usable_indices]
cl_kadij_data = (cl_kadij[:,2])[usable_indices]
cl_lobith_data = (cl_lobith[:,3])[usable_indices]
afvoer_lobith_data = (afvoer_lobith[:,3])[usable_indices]
stand_hvh_data = (stand_hvh[:,2])[usable_indices]
print("shapes:", np.shape(time_data), np.shape(cl_kadij_data), np.shape(cl_lobith_data), np.shape(afvoer_lobith_data), np.shape(stand_hvh_data))
In [8]:
# convert time_data into datetime objects (a list comprehension avoids the slow
# pattern of np.append inside a loop; parsing many rows may still take a while)
from datetime import datetime
datetime_data = np.array([datetime.strptime(d + " " + t, '%Y%m%d %H%M%S')
                          for d, t in time_data])
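In [ ]:
As an aside: if pandas is available, the same conversion can be done in one vectorized call, which is typically much faster than a Python-level loop. A minimal sketch under that assumption (not used further below):
In [ ]:
# hypothetical alternative: vectorized datetime parsing with pandas
import pandas as pd
combined = np.char.add(np.char.add(time_data[:,0], " "), time_data[:,1])
datetime_alt = pd.to_datetime(combined, format='%Y%m%d %H%M%S').to_pydatetime()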
In [ ]:
Train a model to predict the chloride level at Krimpen aan de IJssel from the data of one day earlier (at 0:00).
Notes:
In [9]:
indices = (time_data[:,1] == "000000")
output_raw = cl_kadij_data[indices]
input_raw = np.transpose(np.vstack((cl_lobith_data, afvoer_lobith_data, stand_hvh_data)))
input_raw = input_raw[indices]
datetime_raw = datetime_data[indices]
print("input shape:", np.shape(input_raw))
print("output shape:", np.shape(output_raw))
print("datetime shape:", np.shape(datetime_raw))
In [10]:
#second option: also take the previous value into account
input2_raw = np.transpose(np.vstack((cl_kadij_data, cl_lobith_data, afvoer_lobith_data, stand_hvh_data)))
input2_raw = input2_raw[indices]
print("input 2 shape:", np.shape(input2_raw))
In [11]:
#normalize the data
import sklearn.preprocessing
def normalize_data(data):
    '''returns (scaled_data, scaler)'''
    scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))
    scaler.fit(data)
    scaled_data = scaler.transform(data)
    return (scaled_data, scaler)
In [12]:
output_scaled, output_scaler = normalize_data(output_raw.reshape(-1,1))
print("min output:", output_scaler.data_min_)
print("range output:", output_scaler.data_range_)
input_scaled, input_scaler = normalize_data(input_raw)
print("min input:", input_scaler.data_min_)
print("range input:", input_scaler.data_range_)
input2_scaled, input2_scaler = normalize_data(input2_raw)
print("min input 2:", input2_scaler.data_min_)
print("range input 2:", input2_scaler.data_range_)
In [13]:
plt.axhline(y=250,color='red')
plt.title("raw output data")
plt.plot(output_raw);
In [14]:
yline = output_scaler.transform(np.ones((1,1)) * 250)
plt.axhline(y=yline,color='red')
plt.title("scaled output data")
plt.plot(output_scaled);
In [15]:
#note that there may be gaps, so let's find those if they exist
from datetime import timedelta
def delete_gaps(datetimes, desired_delta, input_dat, output_dat):
    '''returns (input_nogaps, output_nogaps, datetimes_input)
    also shifts the data such that the returned input and output are offset by one timedelta'''
    datetime_diff = datetimes[1:] - datetimes[:-1]
    desired_diff = datetime_diff == desired_delta
    print("detected", np.sum(~desired_diff), "gaps")
    #print(np.where(desired_diff == False))
    input_nogaps = input_dat[np.append(desired_diff, np.array([False]))]
    datetimes_input = datetimes[np.append(desired_diff, np.array([False]))]
    output_nogaps = output_dat[np.insert(desired_diff, 0, False)]
    return (input_nogaps, output_nogaps, datetimes_input)
In [16]:
input_full, output_full, datetimes_input = delete_gaps(datetime_raw, timedelta(days=1), input_scaled, output_scaled)
input2_full, _, _ = delete_gaps(datetime_raw, timedelta(days=1), input2_scaled, output_scaled)
print("final input shape:", np.shape(input_full))
print("final output shape:", np.shape(output_full))
print("final input 2 shape:", np.shape(input2_full))
In [17]:
#just split the data in three equal parts for training, test and validation
def split_data(data):
    '''returns (train_data, test_data, validate_data)'''
    size = int(len(data) / 3)
    train_data = data[:size]
    test_data = data[size:2*size]
    validate_data = data[2*size:]
    return (train_data, test_data, validate_data)
input_train , input_test, input_validate = split_data(input_full)
output_train, output_test, output_validate = split_data(output_full)
input2_train, input2_test, input2_validate = split_data(input2_full)
datetimes_train, datetimes_test, datetimes_validate = split_data(datetimes_input)
In [18]:
# note: this callback is constructed but never passed to a fit call below,
# so as written it has no effect on training
keras.callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=1,
    batch_size=32,
    write_graph=True,
    write_grads=True,
    write_images=True,
    embeddings_freq=0,
    embeddings_layer_names=None,
    embeddings_metadata=None
);
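In [ ]:
For the callback to actually log anything, it would need to be kept in a variable and handed to fit via the callbacks argument. A hedged sketch (not executed here):
In [ ]:
# hypothetical: attach the TensorBoard callback to a training run
tb_callback = keras.callbacks.TensorBoard(log_dir='./logs')
# model.fit(input_train[:,np.newaxis,:], output_train, epochs=5, batch_size=1,
#           verbose=2, callbacks=[tb_callback])
# then inspect the logs with: tensorboard --logdir ./logs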
In [19]:
model = keras.models.Sequential()
layer = keras.layers.LSTM(4, input_shape=(None, len(input_full[0])))
model.add(layer)
layer = keras.layers.Dense(1)
model.add(layer)
model.compile(loss='mean_squared_error', optimizer='adam')
In [20]:
model2 = keras.models.Sequential()
layer = keras.layers.LSTM(4, input_shape=(None, len(input2_full[0])))
model2.add(layer)
layer = keras.layers.Dense(1)
model2.add(layer)
model2.compile(loss='mean_squared_error', optimizer='adam')
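In [ ]:
Note that both models receive input of shape (samples, 1, features): the [:,np.newaxis,:] in the fit calls below adds a time axis of length 1, so the LSTM sees each sample as a sequence of a single timestep.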
In [21]:
print("first model:")
model.summary()
print("\nsecond model:")
model2.summary()
In [22]:
history = model.fit(input_train[:,np.newaxis,:], output_train, epochs=5, batch_size=1, verbose=2)
In [23]:
history2 = model2.fit(input2_train[:,np.newaxis,:], output_train, epochs=5, batch_size=1, verbose=2)
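In [ ]:
The validation split created earlier is never used during training. One hedged option (a sketch under that assumption, not run here) is to monitor it with Keras' EarlyStopping callback, so training stops once the validation loss stops improving:
In [ ]:
# hypothetical: use the validation set with early stopping
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
# history = model.fit(input_train[:,np.newaxis,:], output_train, epochs=50, batch_size=1,
#                     validation_data=(input_validate[:,np.newaxis,:], output_validate),
#                     callbacks=[early_stop], verbose=2)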
In [24]:
output_model = model.predict(input_test[:,np.newaxis,:])
output_model2 = model2.predict(input2_test[:,np.newaxis,:])
In [25]:
plt.axhline(y=yline,color='red')
plt.plot(datetimes_test, output_test,'b--',label='actual data')
plt.plot(datetimes_test, output_model, 'y',label='predicted data')
plt.title("model 1: no info previous day")
plt.xlim(datetimes_test[0], datetimes_test[-1])
plt.legend();
In [26]:
plt.axhline(y=yline,color='red')
plt.plot(datetimes_test, output_test,'b--',label='actual data')
plt.plot(datetimes_test, output_model2, 'y',label='predicted data')
plt.xlim(datetimes_test[0], datetimes_test[-1])
plt.title("model 2: with previous value as additional input")
plt.legend();
In [27]:
print("length of output:",len(output_model))
# a smaller slice of the data helps visibility; change selection to look at other data
selection = np.s_[200:250]
plt.axhline(y=yline,color='red')
plt.plot(datetimes_test[selection], output_test[selection],'b--',label='actual data')
plt.plot(datetimes_test[selection], output_model[selection], 'y',label='predicted data')
plt.xlim((datetimes_test[selection])[0], (datetimes_test[selection])[-1])
plt.gcf().autofmt_xdate()
plt.title("model 1: no info previous day")
plt.legend();
In [28]:
print("length of output:",len(output_model))
# a smaller slice of the data helps visibility; change selection to look at other data
selection = np.s_[200:250]
plt.axhline(y=yline,color='red')
plt.plot(datetimes_test[selection], output_test[selection],'b--',label='actual data')
plt.plot(datetimes_test[selection], output_model2[selection], 'y',label='predicted data')
plt.xlim((datetimes_test[selection])[0], (datetimes_test[selection])[-1])
plt.gcf().autofmt_xdate()
plt.title("model 2: with previous value as additional input")
plt.legend();
In [47]:
print('output model for test data point 200:',output_model2[200])
print('desired output for test data point 200:',output_test[200])
print('input given to model for test data point 200:', input2_test[200])
print('input given to model for test data point 201:', input2_test[201])
In [29]:
errors = np.abs(output_test - output_model)
MSE = errors ** 2
print("highest absolute SE:",np.max(MSE))
print("absolute MSE:",np.average(MSE))
# an error is a difference of two scaled values, so de-normalizing it only
# requires multiplying by the range (the minimum offset cancels out)
actual_errors = errors * output_scaler.data_range_
MSE_actual = actual_errors ** 2
print("non-normalized highest absolute SE:",np.max(MSE_actual))
print("non-normalized absolute MSE:",np.average(MSE_actual))
rel_errors = actual_errors / (output_test*output_scaler.data_range_ + output_scaler.data_min_)
print("highest relative error:",np.max(rel_errors))
print("average relative error:",np.average(rel_errors))
In [30]:
errors = np.abs(output_test - output_model2)
MSE2 = errors ** 2
print("highest absolute SE:",np.max(MSE2))
print("absolute MSE:",np.average(MSE2))
actual_errors = errors * output_scaler.data_range_  # again, errors only scale by the range
MSE_actual = actual_errors ** 2
print("non-normalized highest absolute SE:",np.max(MSE_actual))
print("non-normalized absolute MSE:",np.average(MSE_actual))
rel_errors = actual_errors / (output_test*output_scaler.data_range_ + output_scaler.data_min_)
print("highest relative error:",np.max(rel_errors))
print("average relative error:",np.average(rel_errors))
In [31]:
# compare with a baseline prediction: the same chloride level as yesterday
basic_error = np.abs(output_test[1:] - output_test[:-1])
MSE_basic = np.average(basic_error ** 2)
print("absolute MSE basic prediction method:", MSE_basic)
print("Prediction skill model 1:", 1 - np.average(MSE) / MSE_basic)
print("Prediction skill model 2:", 1 - np.average(MSE2) / MSE_basic)
In [32]:
hour_input_raw = np.transpose(np.vstack((cl_kadij_data, cl_lobith_data, afvoer_lobith_data, stand_hvh_data)))
print("shape raw input:", np.shape(hour_input_raw))
hour_output_raw = cl_kadij_data
print("shape raw output:", np.shape(hour_output_raw))
print("shape raw datetimes:", np.shape(datetime_data))
In [33]:
hour_output_scaled, hour_output_scaler = normalize_data(hour_output_raw.reshape(-1,1))
print("min output:", hour_output_scaler.data_min_)
print("range output:", hour_output_scaler.data_range_)
hour_input_scaled, hour_input_scaler = normalize_data(hour_input_raw)
print("min input:", hour_input_scaler.data_min_)
print("range input:", hour_input_scaler.data_range_)
In [34]:
hour_input_full, hour_output_full, hour_datetimes_full = delete_gaps(datetime_data, timedelta(hours=1), hour_input_scaled, hour_output_scaled)
print("final input shape:", np.shape(hour_input_full))
print("final output shape:", np.shape(hour_output_full))
In [35]:
hour_itrain, hour_itest, hour_ivalidate = split_data(hour_input_full)
hour_otrain, hour_otest, hour_ovalidate = split_data(hour_output_full)
hour_dttrain, hour_dttest, hour_dtvalidate = split_data(hour_datetimes_full)
In [36]:
hour_model = keras.models.Sequential()
layer = keras.layers.LSTM(4, input_shape=(None, len(hour_input_full[0])))
hour_model.add(layer)
layer = keras.layers.Dense(1)
hour_model.add(layer)
hour_model.compile(loss='mean_squared_error', optimizer='adam')
In [37]:
# will take about 140 seconds per epoch
hour_history = hour_model.fit(hour_itrain[:,np.newaxis,:], hour_otrain, epochs=5, batch_size=1, verbose=2)
In [38]:
output_hourmodel = hour_model.predict(hour_itest[:,np.newaxis,:])
In [39]:
yline_hour = hour_output_scaler.transform(np.ones((1,1)) * 250)
plt.axhline(y=yline_hour,color='red')
plt.plot(hour_dttest, hour_otest,'b--',label='actual data')
plt.plot(hour_dttest, output_hourmodel, 'y',label='predicted data')
plt.title("Prediction of cl 1 hour ahead")
plt.xlim(hour_dttest[0], hour_dttest[-1])
plt.legend();
In [40]:
print("length of output:",len(output_hourmodel))
# a smaller slice of the data helps visibility; change selection to look at other data
selection = np.s_[200*24:210*24]
plt.axhline(y=yline_hour,color='red')
plt.plot(hour_dttest[selection], hour_otest[selection],'b--',label='actual data')
plt.plot(hour_dttest[selection], output_hourmodel[selection], 'y',label='predicted data')
plt.xlim((hour_dttest[selection])[0], (hour_dttest[selection])[-1])
plt.gcf().autofmt_xdate()
plt.title("Prediction of cl 1 hour ahead (zoomed)")
plt.legend();
In [41]:
errors = np.abs(hour_otest - output_hourmodel)
MSEhour = errors ** 2
print("highest absolute SE:",np.max(MSEhour))
print("absolute MSE:",np.average(MSEhour))
actual_errors = errors * hour_output_scaler.data_range_  # errors only scale by the range
MSE_actual = actual_errors ** 2
print("non-normalized highest absolute SE:",np.max(MSE_actual))
print("non-normalized absolute MSE:",np.average(MSE_actual))
rel_errors = actual_errors / (hour_otest*hour_output_scaler.data_range_ + hour_output_scaler.data_min_)
print("highest relative error:",np.max(rel_errors))
print("average relative error:",np.average(rel_errors))
In [42]:
basic_error = np.abs(hour_otest[1:] - hour_otest[:-1])
MSE_basic = np.average(basic_error ** 2)
print("absolute MSE basic prediction method:", MSE_basic)
print("Prediction skill hour model:", 1 - np.average(MSEhour) / MSE_basic)
In [ ]: