In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import math
%matplotlib inline
In [2]:
# read the data from the csv file created by the Data_Preprocessing notebook
data = pd.read_csv('data/raw_data_unscaled.csv', index_col=0, parse_dates=True)
data.head()
Out[2]:
In [3]:
# use log for cl data
f, ax_array = plt.subplots(2)
f.tight_layout()
ax_array[0].hist(np.log(data['cl_kadij']), bins=100)
ax_array[0].set_title('log(cl_kadij)')
ax_array[1].hist(np.log(data['cl_lobith']), bins=100)
ax_array[1].set_title('log(cl_lobith)');
In [4]:
plt.axhline(np.log(250),color='red')
plt.plot(np.log(data['cl_kadij']));
In [5]:
# afv_lobith can be used as is, or, since it is a cubic quantity, we can take its cube root
f, ax_array = plt.subplots(2)
f.tight_layout()
ax_array[0].hist(data['afv_lobith'], bins=100)
ax_array[0].set_title('afv_lobith raw')
ax_array[1].hist((data['afv_lobith'])**(1.0/3.0), bins=100)
ax_array[1].set_title('afv_lobith cube root');
In [6]:
# the cum_stand data can be used as is
plt.hist(data.dropna()['cum_stand'], bins=100)
plt.title('cum_stand');
In [7]:
# Conclusion: use the cube root for afv_lobith and the log for both cl parameters
nn_data_raw = data.copy()
nn_data_raw['cl_kadij'] = np.log(nn_data_raw['cl_kadij'])
nn_data_raw['cl_lobith'] = np.log(nn_data_raw['cl_lobith'])
nn_data_raw['afv_lobith'] = (nn_data_raw['afv_lobith'])**(1.0/3.0)
nn_data_raw.drop(['stand_hvh'], axis=1, inplace=True)
nn_data_raw.head()
Out[7]:
In [8]:
# write this intermediate data to file in case we want the unscaled data later
nn_data_raw.to_csv('data/nn_data_raw_unscaled.csv')
In [9]:
# define a function to easily normalize the data:
def normalize_data(data):
    '''scales the non-NaN values of a data column to [0,1]
    returns (scaler, scaled_data)'''
    data = data.values
    data = data[~np.isnan(data)].reshape(-1,1)
    scaler = MinMaxScaler(feature_range=(0,1))
    data = scaler.fit_transform(data)
    return (scaler, data)
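As a quick sanity check (an added illustration, not part of the original pipeline), the scaler returned by normalize_data can map the scaled values straight back via its inverse_transform method; the round-trip error should be negligible.
In [ ]:
# illustrative sanity check: the returned scaler inverts the scaling
check_scaler, check_scaled = normalize_data(nn_data_raw['cl_kadij'])
round_trip = check_scaler.inverse_transform(check_scaled).flatten()
print('scaled range:', check_scaled.min(), '-', check_scaled.max())   # expected: 0.0 - 1.0
print('max round-trip error:', np.max(np.abs(round_trip - nn_data_raw['cl_kadij'].dropna().values)))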
In [10]:
# scale the data to interval (0,1)
nn_data_scaled = nn_data_raw.copy()
(cl_kadij_scaler, nn_data_scaled['cl_kadij']) = normalize_data(nn_data_raw['cl_kadij'])
(cl_lobith_scaler, nn_data_scaled['cl_lobith']) = normalize_data(nn_data_raw['cl_lobith'])
(afv_lobith_scaler, nn_data_scaled['afv_lobith']) = normalize_data(nn_data_raw['afv_lobith'])
(stand_scaler, stand_scaled_data) = normalize_data(nn_data_raw.dropna()['cum_stand'])
nn_data_scaled.loc[~np.isnan(nn_data_scaled['cum_stand']), 'cum_stand'] = stand_scaled_data.flatten()
nn_data_scaled.head(n=30)
Out[10]:
In [11]:
# write the scaled data to file in case we want it later
nn_data_scaled.to_csv('data/nn_data_raw_scaled.csv')
In [12]:
# write the scalers to file in case we want them later
index = ['cl_kadij_scaler', 'cl_lobith_scaler', 'afv_lobith_scaler', 'stand_scaler']
columns = ['data_max_', 'data_min_', 'data_range_']
scalers = [cl_kadij_scaler, cl_lobith_scaler, afv_lobith_scaler, stand_scaler]
scaler_values = np.zeros((4,3),dtype=float)  # new name so the 'data' DataFrame is not shadowed
for i in range(0,len(scalers)):
    scaler = scalers[i]
    scaler_values[i][0] = scaler.data_max_[0]
    scaler_values[i][1] = scaler.data_min_[0]
    scaler_values[i][2] = scaler.data_range_[0]
scaler_data = pd.DataFrame(data=scaler_values, columns=columns, index=index)
scaler_data.to_csv('data/scaler_data.csv')
scaler_data.head()
Out[12]:
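Should a scaler object itself be needed in a later notebook, one option (a sketch, assuming only the stored minimum and maximum are required) is to refit a fresh MinMaxScaler on exactly those two values, which reproduces the original linear mapping.
In [ ]:
# illustrative sketch: rebuild the cl_kadij scaler from the stored min and max
stored = pd.read_csv('data/scaler_data.csv', index_col=0)
rebuilt_scaler = MinMaxScaler(feature_range=(0,1))
rebuilt_scaler.fit([[stored.loc['cl_kadij_scaler','data_min_']],
                    [stored.loc['cl_kadij_scaler','data_max_']]])
print(rebuilt_scaler.data_min_[0], rebuilt_scaler.data_max_[0], rebuilt_scaler.data_range_[0])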
In [13]:
# the min-max scaling can be undone later from the stored scaler data, for example:
scaled_value = nn_data_scaled['cl_kadij'][0]
unscaled_value = scaled_value * scaler_data.loc['cl_kadij_scaler','data_range_'] + scaler_data.loc['cl_kadij_scaler','data_min_']
print('scaled_value', scaled_value)
print('unscaled_value', unscaled_value)
print('original value', nn_data_raw['cl_kadij'][0])
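Note that the cell above only undoes the min-max scaling; to return to the original units, the log (cl columns) or cube root (afv_lobith) has to be inverted as well. A small added illustration for cl_kadij:
In [ ]:
# illustrative: undo both the min-max scaling and the log transform for cl_kadij
print('back in original units:', np.exp(unscaled_value))
print('original data value   :', data['cl_kadij'][0])
# for afv_lobith the final step would be cubing (**3) instead of np.exp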
In the Correlation_Analysis notebook we found intervals (in days) over which the correlation between cl_kadij and each of the input parameters cl_lobith, afv_lobith and cum_stand is high. For the NN we therefore use, for each of these input parameters, the value from x days in the past, where x lies within the corresponding interval.
In [14]:
# use the intervals selected in the Correlation_Analysis notebook
afv_lobith_firstday = 6
afv_lobith_lastday = 9
cl_lobith_firstday = 3
cl_lobith_lastday = 6
stand_firstday = 1 # ideally 0, but that would require feeding predictions back in as inputs, so we avoid that for now
stand_lastday = 2
In [15]:
# define a function to add shifted data to a dataframe (note that it doesn't detect gaps!)
def add_shifted_data(result_data, raw_data, var_name, firstday, lastday, multiplier):
    '''adds shifted (lagged) data to the result_data dataframe
    returns a new dataframe containing the shifted data
    @param result_data: the dataframe to which the data should be added
    @param raw_data: the dataframe containing the input data to be shifted
    @param firstday & lastday: the data is shifted and added once for each number of days in the interval [firstday, lastday]
    @param var_name: the name of the variable in raw_data that is to be shifted
    @param multiplier: the number of rows per day; for hourly data use multiplier 24'''
    for i in range(firstday, lastday+1):
        result_data = pd.concat([result_data, raw_data.shift(i*multiplier)[var_name].rename(var_name + '_' + str(i))], axis=1)
    return result_data
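To make the shifting concrete, here is a small example on synthetic hourly data (the values and index are made up for illustration): with multiplier 24, the column var_2 at time t holds the value of var from exactly two days earlier.
In [ ]:
# illustrative example on synthetic hourly data (not part of the real pipeline)
toy_index = pd.date_range('2000-01-01', periods=4*24, freq='H')
toy = pd.DataFrame({'var': np.arange(len(toy_index))}, index=toy_index)
toy_shifted = add_shifted_data(pd.DataFrame(index=toy.index), toy, 'var', 1, 2, 24)
toy_shifted.iloc[[0, 24, 48, 71]]  # var_1 lags 24 rows (1 day), var_2 lags 48 rows (2 days)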
In [16]:
# define a function that removes a certain number of data items before and after each gap
def remove_gaps(data, timedelta, num_before, num_after):
    '''removes the last num_before rows before each gap and the first num_after rows after each gap
    returns a new dataframe in which the rows are removed
    @param data: the dataframe in which the rows should be deleted
    @param timedelta: the timedelta used to detect gaps
    @param num_before: the number of rows to remove just before each gap
    @param num_after: the number of rows to remove just after each gap'''
    gaps = (data.index.values[1:] - data.index.values[:-1]) != timedelta
    print('detected', np.sum(gaps), 'gaps')
    delete_these = np.zeros(data.shape[0], dtype=bool)
    gap_indices = np.where(gaps)[0] + 1
    for i in range(0,len(gap_indices)):
        gap_index = gap_indices[i]
        if i+1 >= len(gap_indices):
            next_gap_index = data.shape[0]  # last gap: allow removal up to the end of the data
        else:
            next_gap_index = gap_indices[i+1]
        range_limit = min(gap_index+num_after, next_gap_index)
        for j in range(gap_index, range_limit):
            delete_these[j] = True
        if i-1 < 0:
            prev_gap_index = 0
        else:
            prev_gap_index = gap_indices[i-1]
        for j in range(max(gap_index - num_before, prev_gap_index), gap_index):
            delete_these[j] = True
    print('removing', np.sum(delete_these), 'data entries')
    return data.drop(data.index[delete_these])
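Again a small example on synthetic data (made-up index and sizes): an hourly index with a few hours missing in the middle, where remove_gaps drops the requested number of rows on each side of the gap.
In [ ]:
# illustrative example: one gap in an otherwise hourly index
toy_idx = pd.date_range('2000-01-01', periods=48, freq='H')
toy_idx = toy_idx[(toy_idx < '2000-01-01 20:00') | (toy_idx > '2000-01-02 02:00')]
toy_gap = pd.DataFrame({'v': np.arange(len(toy_idx))}, index=toy_idx)
cleaned = remove_gaps(toy_gap, np.timedelta64(1,'h'), 2, 3)
print(len(toy_gap), '->', len(cleaned))  # 2 rows before and 3 rows after the gap are removed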
In [17]:
# construct a dataframe containing the wanted input parameters and output
nn_data = pd.concat([nn_data_scaled['cl_kadij'].rename('cl_kadij_out'),nn_data_scaled.shift(24)['cl_kadij'].rename('cl_kadij_1')], axis=1)
nn_data = add_shifted_data(nn_data, nn_data_scaled, 'cl_lobith', cl_lobith_firstday, cl_lobith_lastday, 24)
nn_data = add_shifted_data(nn_data, nn_data_scaled, 'afv_lobith', afv_lobith_firstday, afv_lobith_lastday, 24)
nn_data = add_shifted_data(nn_data, nn_data_scaled, 'cum_stand', stand_firstday, stand_lastday, 24)
nn_data = remove_gaps(nn_data, np.timedelta64(1,'h'), 0, 24 * np.max([cl_lobith_lastday, afv_lobith_lastday, stand_lastday]))
nn_data.dropna(inplace=True)
print('number of remaining data entries:',len(nn_data))
nn_data.head(n = 25)
Out[17]:
In [18]:
# write the resulting dataframe to csv in case we want it later:
nn_data.to_csv('data/nn_data_full_scaled.csv')
In [19]:
# define a function to get 1 day timestep batches from the data
def get_batches_for_time(data, time, verbose=2):
    '''takes the data entries at the provided time of day and divides them into batches of continuous (gapless) data'''
    nn_data = data.between_time(time, time)
    # True where two consecutive entries are exactly one day apart, i.e. where there is NO gap
    no_gap = (nn_data.index.values[1:] - nn_data.index.values[:-1]) == np.timedelta64(1,'D')
    if verbose == 2:
        print('detected', np.sum(~no_gap), 'gaps')
    gap_indices = np.where(~no_gap)[0] + 1
    gap_indices = np.append(gap_indices, nn_data.shape[0])
    last_index = 0
    batches = []
    batch_sizes = []
    for i in gap_indices:
        batches.append(nn_data.iloc[last_index:i])
        batch_sizes.append(nn_data.iloc[last_index:i].shape[0])
        last_index = i
    if verbose >= 1:
        print('created', len(batches), 'batches for time', time)
    if verbose == 2:
        print('batch lengths:')
        print(batch_sizes)
        print('minimum length', np.min(batch_sizes))
        print('maximum length', np.max(batch_sizes))
        print('total entries', np.sum(batch_sizes))
    return batches
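Before looping over all hours, the diagnostics for a single time of day can be inspected at verbose level 2 (added here purely for illustration; the chosen time is arbitrary).
In [ ]:
# illustrative: inspect the batch diagnostics for one time of day
_ = get_batches_for_time(nn_data, '12:00', verbose=2)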
In [20]:
# now get batches for each time of day (0:00, 1:00, etc.)
batches = []
for time in range(0,24):
    time_string = str(time) + ':00'
    batches += get_batches_for_time(nn_data, time_string, verbose=1)
print('total number of batches', len(batches))
In [21]:
# visualize the distribution
plt.title('distribution of the length of the batches')
plt.xlabel('length of batch')
plt.ylabel('number of batches')
plt.hist([len(batch) for batch in batches], bins=30);
In [22]:
# ONLY USE THIS CELL TO GENERATE A NEW SHUFFLE WHEN ABSOLUTELY NECESSARY
# # generate a random shuffle of the batches to make sure batches for the same time are no longer right after each other
# shuffled_batch_indices = np.arange(0,len(batches))
# np.random.shuffle(shuffled_batch_indices)
# print(shuffled_batch_indices)
# np.savetxt('data/shuffle', shuffled_batch_indices)
In [23]:
# read the provided shuffle from file and use it to divide the batches among test, train and validation data
shuffle = np.genfromtxt('data/shuffle', dtype=int)
if (len(batches) != len(shuffle)):
    print('The number of batches is different from the length of the stored shuffle')
    print('Use the cell above to generate a new shuffle, but be aware that this results in a different distribution of batches among train, test and validation data.')
    print('Old models should not be reused on the new data. If the old data is still present in the train, test and validate folders, move it first, otherwise the new batches will not be written.')
    raise ValueError('number of batches does not match the shuffle length')
train_batches = []
test_batches = []
validate_batches = []
# 2:1:1 split: half of the shuffled batches go to train, a quarter each to test and validate
for i in range(0,len(batches)):
    if i % 4 == 0 or i % 4 == 1:
        train_batches.append(batches[shuffle[i]])
    elif i % 4 == 2:
        test_batches.append(batches[shuffle[i]])
    else:
        validate_batches.append(batches[shuffle[i]])
print(np.sum([len(batch) for batch in train_batches]), 'train instances in', len(train_batches), 'batches')
print(np.sum([len(batch) for batch in test_batches]), 'test instances in', len(test_batches), 'batches')
print(np.sum([len(batch) for batch in validate_batches]), 'validate instances in', len(validate_batches), 'batches')
In [24]:
# visualize the batch size distribution for each set
bins = 30
f, axarr = plt.subplots(3, sharex=True)
f.tight_layout()
axarr[0].hist([len(batch) for batch in train_batches], bins=bins)
axarr[1].hist([len(batch) for batch in test_batches], bins=bins)
axarr[2].hist([len(batch) for batch in validate_batches], bins=bins)
axarr[0].set_title('train batch sizes')
axarr[1].set_title('test batch sizes')
axarr[2].set_title('validate batch sizes');
In [25]:
# define a function to quickly write an array of batches to a file:
def write_batches_to_file(batches, filename):
    '''writes each batch to its own csv file, named filename_<i>.csv'''
    for i in range(0,len(batches)):
        batches[i].to_csv(filename + '_' + str(i) + '.csv')
In [26]:
# write the train, test and validate batches to files for use later:
# note that if the total number of batches has become smaller, some of the old files may not be overwritten.
# this should only happen if the number of gaps is different
import os
if (os.listdir('data/train') != [] or os.listdir('data/test') != [] or os.listdir('data/validate') != []):
    print('Are you sure you want to overwrite the current data?')
    print('Note that if this notebook was re-executed, the data may end up shuffled in a different order, and this may mix the train, test and validate data of a previously trained and stored model!')
    print('Please back up the current train, test and validate directories first')
else:
    write_batches_to_file(train_batches, 'data/train/batch')
    write_batches_to_file(test_batches, 'data/test/batch')
    write_batches_to_file(validate_batches, 'data/validate/batch')
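For later notebooks, one way to read the stored batches back in is sketched below; the folder layout and file name pattern follow what write_batches_to_file produces above (an assumption about how later notebooks consume the data, not code taken from them).
In [ ]:
# illustrative sketch: load the stored train batches back in, in index order
import glob
train_files = sorted(glob.glob('data/train/batch_*.csv'),
                     key=lambda f: int(f.split('_')[-1].split('.')[0]))
loaded_train = [pd.read_csv(f, index_col=0, parse_dates=True) for f in train_files]
print('loaded', len(loaded_train), 'train batches,',
      np.sum([len(b) for b in loaded_train]), 'entries in total')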