In [1]:
import glob
import os
import pandas as pd
import random
import sys
import dateutil.parser
from datetime import datetime
import numpy as np
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils import np_utils, normalize
from sklearn import metrics as me
from sklearn.utils import shuffle
import sklearn.linear_model as lm
%matplotlib inline
In [2]:
# Load the dataset
PATH_DATA = "../data/running/APE_running_data"
def convert(date):
    """Convert an ISO 8601 date string to seconds since the Unix epoch."""
    dt = dateutil.parser.parse(date).replace(tzinfo=None)
    epoch = datetime.utcfromtimestamp(0)
    delta = dt - epoch
    return delta.total_seconds()
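# Quick sanity check (hypothetical input, not from the dataset): midnight UTC
# on 1970-01-02 is exactly 86400 seconds after the epoch
assert convert("1970-01-02T00:00:00Z") == 86400.0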
colnames = np.array(['time', 'elevation', 'distance', 'speed'])
datasets_all = []
os.chdir(PATH_DATA)
for file in glob.glob("*.tab"):
print("Processing {}".format(file))
dataset = np.genfromtxt(file, skip_header=1,delimiter='\t', converters={0: convert})
dataset[:,0] -= dataset[0,0]
dataset = pd.DataFrame(dataset,columns=colnames)
slope = np.array([])
window_size_half = 8
for j in dataset.index:
index = np.arange(j-window_size_half+1, j+window_size_half+1)
index = index[(index >= 0) & (index < len(dataset))]
dataset_part = dataset.iloc[index].dropna()
regr = lm.LinearRegression()
regr.fit(dataset_part.distance[:,np.newaxis], np.array(dataset_part.elevation))
slope = np.append(slope,regr.coef_)
dataset['slope'] = slope
if (len(dataset) > 300) == (len(dataset) < 900):
datasets_all.append(dataset)
print('\nDataset sample')
print(datasets_all[0][:10])
print(len(datasets_all))
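In [ ]:
# A minimal sketch with synthetic data (not part of the original pipeline),
# illustrating what the sliding-window regression above estimates: the fitted
# coefficient of elevation against distance is the grade of the track. On a
# track that climbs 1 m for every 10 m travelled, the slope comes out at ~0.1.
demo = pd.DataFrame({'distance': np.arange(0.0, 160.0, 10.0)})
demo['elevation'] = 0.1 * demo['distance']
demo_regr = lm.LinearRegression()
demo_regr.fit(demo.distance.values[:, np.newaxis], demo.elevation.values)
print(demo_regr.coef_)  # ~[0.1]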
In [12]:
# Plot features of one race
RACE_NUMBER = 0
# Retrieve the data of race
data = datasets_all[RACE_NUMBER]
print(data[:10])
# Plot the speed and the slope
# There are two different scales (left and right y axis)
fig_speed_slope, ax1_ss = plt.subplots(figsize=(20, 4))
plt.title('Speed and slope of race ' + str(RACE_NUMBER))
plt.xlabel('time [s]')
speed, = ax1_ss.plot(data['time'], data['speed'], color='darkgreen', linestyle='-')
ax1_ss.set_ylabel('speed [m/s]', color='darkgreen')
ax1_ss.tick_params('y', colors='darkgreen')
ax2_ss = ax1_ss.twinx()
slope, = ax2_ss.plot(data['time'], data['slope'], color='darkblue', linestyle='-')
ax2_ss.set_ylabel('slope', color='darkblue')
ax2_ss.tick_params('y', colors='darkblue')
p = [speed, slope]
ax1_ss.legend(p, [p_.get_label() for p_ in p], loc='upper right')
# Plot the elevation and the distance
fig_elevation_distance, ax1_ed = plt.subplots(figsize=(20, 4))
plt.title('Elevation and distance of race ' + str(RACE_NUMBER))
plt.xlabel('time [s]')
elevation, = ax1_ed.plot(data['time'], data['elevation'], color='darkmagenta', linestyle='-')
ax1_ed.set_ylabel('elevation [m]', color='darkmagenta')
ax1_ed.tick_params('y', colors='darkmagenta')
ax2_ed = ax1_ed.twinx()
distance, = ax2_ed.plot(data['time'], data['distance'], color='teal', linestyle='-')
ax2_ed.set_ylabel('distance [m]', color='teal')
ax2_ed.tick_params('y', colors='teal')
p = [elevation, distance]
ax1_ed.legend(p, [p_.get_label() for p_ in p], loc='upper center')
Out[12]:
[Two figures: speed and slope of race 0; elevation and distance of race 0]
In [16]:
# Statistics on data
# Concatenation of all races and statistics
# Count is not the same on every column due to missing values (NaN)
df = pd.concat(datasets_all)
print(df.describe())
In [7]:
# Number of points taken for each prediction
NB_POINTS = 10
# Number of races taken for the training set
NB_TRAINING = 20
# Number of races taken for the testing set
NB_TEST = 10
# Number of next points for the average goal
NB_POINTS_AVG = 1 # Must not be greater than NB_POINTS
# Shuffle data or not ?
SHUFFLE = True
# List of features (columns of dataframe)
FEATURES = ['time', 'speed', 'slope']
# Remove NaN values in dataset
dataset = [i.dropna() for i in datasets_all]
# Filter features
dataset = [x[FEATURES] for x in dataset]
# Shuffle the dataset
random.seed(42)
random.shuffle(dataset)
# Deep copy of the races
dataset_not_normalized = []
for race in dataset:
    dataset_not_normalized.append(race.copy())
# Normalization of each feature
# Min and max values for normalization are saved in order
# to normalize the data of the test set with those values
# Contains, for each feature the min and the max value of the feature of the training set
# These min and max values will be used to normalize the dataset
max_feature = dict.fromkeys(FEATURES, -sys.maxsize - 1)
min_feature = dict.fromkeys(FEATURES, sys.maxsize)
# Get the maximal and minimal values for each column of the training set
for race in range(NB_TRAINING):
    for column in FEATURES:
        max_local = np.amax(dataset[race][column])
        min_local = np.amin(dataset[race][column])
        if max_feature[column] < max_local:
            max_feature[column] = max_local
        if min_feature[column] > min_local:
            min_feature[column] = min_local
# Min-max normalisation
def norm_min_max(x, min_local, max_local):
    return (x - min_local) / (max_local - min_local)
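# Worked example (added): with min 2 and max 12, the value 7 maps to
# (7 - 2) / (12 - 2) = 0.5
assert norm_min_max(7, 2, 12) == 0.5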
# Apply the normalization
for race in dataset:
    for column in FEATURES:
        race[column] = race[column].apply(norm_min_max, min_local=min_feature[column], max_local=max_feature[column])
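# Sanity check (added): the training races must now lie in [0, 1] by
# construction; test races may fall slightly outside, since they reuse the
# min/max of the training set
assert all(dataset[race][column].between(0, 1).all()
           for race in range(NB_TRAINING) for column in FEATURES)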
seq = []
next_speed = []
# Creation of train set
for race in range(NB_TRAINING):
    print('Race={} ({} samples) (train)'.format(race, len(dataset[race])))
    seq_local = []
    next_speed_local = []
    for i in range(len(dataset[race]) - (NB_POINTS + 1)):
        # Build a vector containing the average slope of the points
        # NB_POINTS+1 to NB_POINTS+NB_POINTS_AVG (the ones we want to predict).
        # This vector is added to the input of the model, because the slope is
        # known over the whole run
        slopes_avg = np.mean(dataset[race][i+NB_POINTS:i+NB_POINTS+NB_POINTS_AVG]['slope'].values)
        slopes = np.reshape(NB_POINTS * [slopes_avg], (NB_POINTS, 1))
        matrix = np.hstack((dataset[race][i:i+NB_POINTS].values, slopes))
        seq_local.append(matrix)
        # The goal (speed) is not the normalized value
        # The goal is the average of the next NB_POINTS_AVG speeds
        speeds_avg = np.mean(dataset_not_normalized[race][i+NB_POINTS:i+NB_POINTS+NB_POINTS_AVG]['speed'].values)
        next_speed_local.append(speeds_avg)
    seq += seq_local
    next_speed += next_speed_local
# Transform the data into numpy arrays
trainX = np.array(seq)
trainY = np.array(next_speed)
# Shuffle data
if SHUFFLE:
    trainX, trainY = shuffle(trainX, trainY, random_state=42)
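# Sanity check (added): each sample is a window of NB_POINTS timesteps, each
# carrying the len(FEATURES) features plus the appended average future slope
print(trainX.shape)  # (n_train_samples, NB_POINTS, len(FEATURES) + 1)
print(trainY.shape)  # (n_train_samples,)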
seq_test = []
next_speed_test = []
# Creation of test set
for race in range(NB_TRAINING, NB_TRAINING + NB_TEST):
    print('Race={} ({} samples) (test)'.format(race, len(dataset[race])))
    seq_test_local = []
    next_speed_test_local = []
    for i in range(len(dataset[race]) - (NB_POINTS + 1)):
        slopes_avg = np.mean(dataset[race][i+NB_POINTS:i+NB_POINTS+NB_POINTS_AVG]['slope'].values)
        slopes = np.reshape(NB_POINTS * [slopes_avg], (NB_POINTS, 1))
        matrix = np.hstack((dataset[race][i:i+NB_POINTS].values, slopes))
        seq_test_local.append(matrix)
        # The goal (speed) is not the normalized value
        # The goal is the average of the next NB_POINTS_AVG speeds
        speeds_avg = np.mean(dataset_not_normalized[race][i+NB_POINTS:i+NB_POINTS+NB_POINTS_AVG]['speed'].values)
        next_speed_test_local.append(speeds_avg)
    seq_test += seq_test_local
    next_speed_test += next_speed_test_local
# Transform the data into numpy arrays
testX = np.array(seq_test)
testY = np.array(next_speed_test)
# Shuffle data
if SHUFFLE:
    testX, testY = shuffle(testX, testY, random_state=42)
In [8]:
BATCH_SIZE = 10
NB_EPOCHS = 100
NB_UNITS = 4
# Create the LSTM network: one LSTM layer reading windows of NB_POINTS
# timesteps, followed by a single linear output (the predicted speed)
model = Sequential()
model.add(LSTM(NB_UNITS, input_shape=(NB_POINTS, len(FEATURES) + 1)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()
In [9]:
history = model.fit(trainX, trainY, epochs=NB_EPOCHS, batch_size=BATCH_SIZE, verbose=1,
validation_data=(testX, testY))
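In [ ]:
# Optional check (not in the original notebook): read the final test loss
# back directly; it should match the last val_loss printed above
print(model.evaluate(testX, testY, verbose=0))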
In [10]:
# Plot the training and testing
plt.plot(history.history['loss'], label='Training')
plt.plot(history.history['val_loss'], label='Testing')
plt.xlabel('epochs')
plt.ylabel('mse')
plt.legend()
plt.grid()
In [11]:
# Plot one race with its prediction
RACE_NUMBER = NB_TRAINING # First race of testing set
# Actual values
actual = dataset_not_normalized[RACE_NUMBER]
print("Actual values")
print(actual[:10])
# Predict the future values
# testX may have been shuffled, so rebuild the input sequences of the wanted
# race in their original order instead of slicing testX
seq_race = []
for i in range(len(dataset[RACE_NUMBER]) - (NB_POINTS + 1)):
    slopes_avg = np.mean(dataset[RACE_NUMBER][i+NB_POINTS:i+NB_POINTS+NB_POINTS_AVG]['slope'].values)
    slopes = np.reshape(NB_POINTS * [slopes_avg], (NB_POINTS, 1))
    seq_race.append(np.hstack((dataset[RACE_NUMBER][i:i+NB_POINTS].values, slopes)))
predictY = model.predict(np.array(seq_race))
print("Predictions")
print(predictY[:10])
print("Standard deviation=" + str(np.std(predictY)))
# Plot the results
plt.figure(figsize=(20,4))
plt.suptitle('Prediction race ' + str(RACE_NUMBER))
plt.title('History=' + str(NB_POINTS) + ', Future=' + str(NB_POINTS_AVG))
plt.xlabel('time [s]')
plt.ylabel('speed [m/s]')
# Plot the predictions
plt.plot(np.arange(len(predictY)), predictY, 'g-', label='Predicted')
# Plot the actual values
plt.plot(np.arange(len(predictY)), actual['speed'], 'r-', label='Actual')
plt.legend()
Out[11]:
[Figure: predicted vs. actual speed of race 20]
In [8]:
# Information about software version
%load_ext version_information
%reload_ext version_information
%version_information numpy, matplotlib, keras, pandas, sklearn, tensorflow
Out[8]: