In [0]:
# Download the turbofan-engine predictive-maintenance dataset (train/test
# sensor histories plus ground-truth remaining cycles) from the
# umbertogriffo/Predictive-Maintenance-using-LSTM repository.
!wget -q https://raw.githubusercontent.com/umbertogriffo/Predictive-Maintenance-using-LSTM/master/Dataset/PM_test.txt -O PM_test.txt 
!wget -q https://raw.githubusercontent.com/umbertogriffo/Predictive-Maintenance-using-LSTM/master/Dataset/PM_train.txt -O PM_train.txt  
!wget -q https://raw.githubusercontent.com/umbertogriffo/Predictive-Maintenance-using-LSTM/master/Dataset/PM_truth.txt -O PM_truth.txt

In [38]:
# Confirm the downloaded data files are present in the working directory.
!ls


binary_model.h5		 Graph		     PM_train.txt
binary_submit_test.csv	 model_accuracy.png  PM_train.txt.1
binary_submit_train.csv  model_loss.png      PM_truth.txt
data			 model_verify.png    PM_truth.txt.1
datalab			 PM_test.txt	     regression_model.h5
filename.txt		 PM_test.txt.1	     submission.csv

Binary classification

Predict if an asset will fail within certain time frame (e.g. cycles)


In [0]:
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Setting seed for reproducibility of the numpy-driven parts (shuffling,
# weight init via Keras' numpy backend usage).
np.random.seed(1234)
# NOTE: hash randomization can only be disabled BEFORE the interpreter
# starts, so setting the variable here affects child processes only.
# For fully deterministic hashing, launch the kernel with PYTHONHASHSEED=0.
# (The original code assigned a plain Python variable, which had no effect.)
os.environ['PYTHONHASHSEED'] = '0'

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM

# define path to save the best binary-classification model checkpoint
model_path = 'binary_model.h5'

Data Ingestion


In [0]:
# Shared column layout of the PM_* data files: engine id, cycle counter,
# three operational settings and 21 sensor channels.  Defining it once
# removes the duplicated 26-name list the train and test readers shared.
ENGINE_COLUMNS = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + \
                 ['s' + str(i) for i in range(1, 22)]


def read_engine_data(path):
    """Read one space-separated engine data file into a DataFrame.

    Trailing separators on each row make pandas parse two extra empty
    columns (positions 26 and 27); drop them, then apply the shared
    column names.
    """
    df = pd.read_csv(path, sep=" ", header=None)
    df.drop(df.columns[[26, 27]], axis=1, inplace=True)
    df.columns = ENGINE_COLUMNS
    return df


# read training data - the aircraft engine run-to-failure data,
# ordered by engine and cycle so sliding windows are chronological
train_df = read_engine_data('PM_train.txt').sort_values(['id', 'cycle'])

# read test data - engine operating data without failure events recorded
test_df = read_engine_data('PM_test.txt')

# read ground truth data - true remaining cycles for each engine in the test data
truth_df = pd.read_csv('PM_truth.txt', sep=" ", header=None)
truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)

Data Preprocessing


In [0]:
#######
# TRAIN
#######
# Data Labeling - generate column RUL (Remaining Useful Life / time to failure):
# each engine's last observed cycle minus the current cycle, so RUL reaches 0
# on the final (failure) row of every engine.
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
train_df = train_df.merge(rul, on=['id'], how='left')
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df.drop('max', axis=1, inplace=True)
# generate label columns for training data
# we will only make use of "label1" for binary classification,
# while trying to answer the question: is a specific engine going to fail within w1 cycles?
# label1: 1 when RUL <= w1 (failure within 30 cycles), else 0.
# label2: three-class variant - 2 when RUL <= w0, 1 when w0 < RUL <= w1, else 0.
w1 = 30
w0 = 15
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0 )
train_df['label2'] = train_df['label1']
train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2

# MinMax normalization (from 0 to 1) of every feature column.  The raw
# 'cycle' column is kept unscaled for bookkeeping; a scaled copy is added
# as 'cycle_norm' so it can be used as a model feature.
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['id','cycle','RUL','label1','label2'])
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]), 
                             columns=cols_normalize, 
                             index=train_df.index)
# splice the scaled columns back next to the untouched ones, then restore
# the original column order
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
train_df = join_df.reindex(columns = train_df.columns)

######
# TEST
######
# MinMax normalization (from 0 to 1) using the scaler FIT ON TRAIN, so the
# test features share the training feature scale (no refitting).
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]), 
                            columns=cols_normalize, 
                            index=test_df.index)
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns = test_df.columns)
test_df = test_df.reset_index(drop=True)


# We use the ground truth dataset to generate labels for the test data.
# generate column max for test data: the last observed cycle per engine plus
# the true remaining cycles gives the cycle at which the engine would fail.
rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
truth_df.columns = ['more']
# NOTE(review): this pairing relies on truth_df rows being ordered by engine
# id 1..N, matching rul's groupby output order - verify against the data source.
truth_df['id'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)

# generate RUL for test data
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df.drop('max', axis=1, inplace=True)

# generate label columns w0 and w1 for test data (same thresholds as train)
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0 )
test_df['label2'] = test_df['label1']
test_df.loc[test_df['RUL'] <= w0, 'label2'] = 2

LSTM


In [42]:
# pick a large window size of 50 cycles
sequence_length = 50

# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length, seq_cols):
    """Slide a fixed-length window over one engine's history.

    Yields one (seq_length, len(seq_cols)) array per window position.
    Only windows that fit entirely inside the data are produced - no
    padding - so an engine with fewer than ``seq_length`` rows yields
    nothing, and an engine with N rows yields N - seq_length sequences
    (e.g. 192 rows with a window of 50 give 142 sequences, starting at
    rows 0..141 and stopping at rows 50..191).
    """
    values = id_df[seq_cols].values
    n_rows = values.shape[0]
    for begin in range(0, n_rows - seq_length):
        # half-open slice: rows [begin, begin + seq_length)
        yield values[begin:begin + seq_length, :]
        
# Feature columns fed to the network: the three operational settings, the
# normalized cycle counter and the 21 sensors (25 features in total).
sensor_cols = ['s' + str(i) for i in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm']
sequence_cols.extend(sensor_cols)

# Lazily build the sliding windows for every engine, then stack them into a
# single (samples, time steps, features) array for Keras.
seq_gen = (list(gen_sequence(train_df[train_df['id'] == engine_id], sequence_length, sequence_cols))
           for engine_id in train_df['id'].unique())
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
seq_array.shape

# function to generate labels
def gen_labels(id_df, seq_length, label):
    """Return the targets aligned with the windows from ``gen_sequence``.

    The window covering rows [k, k + seq_length) is paired with the label
    of row k + seq_length, so the first ``seq_length`` labels of an engine
    have no matching window and are dropped.  Returns an array of shape
    (num_rows - seq_length, len(label)).
    """
    labels = id_df[label].values
    total = labels.shape[0]
    # keep the tail of the label matrix, one row per generated window
    return labels[seq_length:total, :]

# Targets: one binary "fails within w1 cycles" label per training window.
label_gen = [gen_labels(train_df[train_df['id'] == engine_id], sequence_length, ['label1'])
             for engine_id in train_df['id'].unique()]
label_array = np.concatenate(label_gen).astype(np.float32)
label_array.shape

# Network: LSTM(100) -> Dropout -> LSTM(50) -> Dropout -> Dense(1, sigmoid).
# Dropout after each LSTM layer controls overfitting; a single sigmoid
# output unit suits the binary classification target.
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

model = Sequential()
model.add(LSTM(units=100,
               input_shape=(sequence_length, nb_features),
               return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50,
               return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=nb_out, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

# Train with early stopping on validation loss; the checkpoint keeps only
# the best weights on disk so the evaluation sections can reload them.
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                                           patience=10, verbose=0, mode='min')
save_best = keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss',
                                            save_best_only=True, mode='min', verbose=0)
history = model.fit(seq_array, label_array,
                    epochs=100, batch_size=200, validation_split=0.05, verbose=2,
                    callbacks=[stop_early, save_best])

# list all data in history
print(history.history.keys())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_11 (LSTM)               (None, 50, 100)           50400     
_________________________________________________________________
dropout_11 (Dropout)         (None, 50, 100)           0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 50)                30200     
_________________________________________________________________
dropout_12 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 51        
=================================================================
Total params: 80,651
Trainable params: 80,651
Non-trainable params: 0
_________________________________________________________________
None
Train on 14849 samples, validate on 782 samples
Epoch 1/100
 - 21s - loss: 0.2637 - acc: 0.8910 - val_loss: 0.1170 - val_acc: 0.9476
Epoch 2/100
 - 18s - loss: 0.1011 - acc: 0.9603 - val_loss: 0.1361 - val_acc: 0.9373
Epoch 3/100
 - 18s - loss: 0.0813 - acc: 0.9663 - val_loss: 0.0560 - val_acc: 0.9757
Epoch 4/100
 - 18s - loss: 0.0734 - acc: 0.9700 - val_loss: 0.0324 - val_acc: 0.9872
Epoch 5/100
 - 18s - loss: 0.0691 - acc: 0.9701 - val_loss: 0.0373 - val_acc: 0.9923
Epoch 6/100
 - 18s - loss: 0.0644 - acc: 0.9725 - val_loss: 0.0698 - val_acc: 0.9744
Epoch 7/100
 - 18s - loss: 0.0652 - acc: 0.9729 - val_loss: 0.0416 - val_acc: 0.9847
Epoch 8/100
 - 18s - loss: 0.0516 - acc: 0.9782 - val_loss: 0.0365 - val_acc: 0.9859
Epoch 9/100
 - 18s - loss: 0.0547 - acc: 0.9763 - val_loss: 0.0366 - val_acc: 0.9859
Epoch 10/100
 - 18s - loss: 0.0803 - acc: 0.9655 - val_loss: 0.0473 - val_acc: 0.9783
Epoch 11/100
 - 18s - loss: 0.0562 - acc: 0.9765 - val_loss: 0.0897 - val_acc: 0.9668
Epoch 12/100
 - 18s - loss: 0.0553 - acc: 0.9762 - val_loss: 0.0450 - val_acc: 0.9821
Epoch 13/100
 - 18s - loss: 0.0552 - acc: 0.9768 - val_loss: 0.0436 - val_acc: 0.9821
Epoch 14/100
 - 18s - loss: 0.0567 - acc: 0.9755 - val_loss: 0.0398 - val_acc: 0.9821
dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])

Model Evaluation on the Training set (the cell below scores the model on the same sequences it was trained on)


In [43]:
# summarize history for accuracy
fig_acc = plt.figure(figsize=(10, 10))
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
fig_acc.savefig("model_accuracy.png")

# summarize history for loss
# (fix: use a distinct figure name instead of reusing fig_acc)
fig_loss = plt.figure(figsize=(10, 10))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
fig_loss.savefig("model_loss.png")

# training metrics - note this evaluates on the TRAINING sequences, so the
# numbers are optimistic compared to the held-out test evaluation below
scores = model.evaluate(seq_array, label_array, verbose=1, batch_size=200)
print('Accuracy: {}'.format(scores[1]))

# make predictions and compute confusion matrix
y_pred = model.predict_classes(seq_array, verbose=1, batch_size=200)
y_true = label_array

test_set = pd.DataFrame(y_pred)
test_set.to_csv('binary_submit_train.csv', index = None)

print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(y_true, y_pred)
print(cm)

# compute precision and recall
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
print( 'precision = ', precision, '\n', 'recall = ', recall)


15631/15631 [==============================] - 6s 354us/step
Accurracy: 0.9825986903121486
15631/15631 [==============================] - 6s 392us/step
Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels
[[12345   186]
 [   86  3014]]
precision =  0.941875 
 recall =  0.9722580645161291

Model Evaluation on the held-out Test set (PM_test.txt with PM_truth.txt labels)


In [44]:
# We pick the last sequence for each id in the test data.
# Engines with fewer than sequence_length cycles are skipped entirely, both
# here for the features and below (via y_mask) for the labels.
seq_array_test_last = [test_df[test_df['id'] == engine_id][sequence_cols].values[-sequence_length:]
                       for engine_id in test_df['id'].unique()
                       if len(test_df[test_df['id'] == engine_id]) >= sequence_length]
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)

# Keep only the labels of engines whose history is at least sequence_length
# long, mirroring the filter applied to the sequences above.
y_mask = [len(test_df[test_df['id'] == engine_id]) >= sequence_length
          for engine_id in test_df['id'].unique()]
# the last label of each engine is the target for its last window
label_array_test_last = test_df.groupby('id')['label1'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0], 1).astype(np.float32)

# If the best iteration's model was saved during training, load and use it;
# fail loudly instead of hitting a confusing NameError further down.
if os.path.isfile(model_path):
    estimator = load_model(model_path)
else:
    raise FileNotFoundError("Trained model checkpoint not found at '{}'".format(model_path))

# test metrics
scores_test = estimator.evaluate(seq_array_test_last, label_array_test_last, verbose=2)
print('Accuracy: {}'.format(scores_test[1]))

# make predictions and compute confusion matrix
y_pred_test = estimator.predict_classes(seq_array_test_last)
y_true_test = label_array_test_last

test_set = pd.DataFrame(y_pred_test)
test_set.to_csv('binary_submit_test.csv', index = None)

print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(y_true_test, y_pred_test)
print(cm)

# compute precision, recall and F1
precision_test = precision_score(y_true_test, y_pred_test)
recall_test = recall_score(y_true_test, y_pred_test)
f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

# Plot the predicted labels (blue) against the actual labels (green) to
# verify the model's accuracy visually.
fig_verify = plt.figure(figsize=(10, 5))
plt.plot(y_pred_test, color="blue")
plt.plot(y_true_test, color="green")
plt.title('prediction')
plt.ylabel('value')
plt.xlabel('row')
plt.legend(['predicted', 'actual data'], loc='upper left')
plt.show()
fig_verify.savefig("model_verify.png")


Accurracy: 0.9569892415436365
Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels
[[67  1]
 [ 3 22]]
Precision:  0.9565217391304348 
 Recall:  0.88 
 F1-score: 0.9166666666666666

Regression

How many more cycles an in-service engine will last before it fails?


In [0]:
import keras
import keras.backend as K
from keras.layers.core import Activation
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import preprocessing

# Setting seed for reproducibility of the numpy-driven parts.
np.random.seed(1234)
# NOTE: hash randomization can only be disabled BEFORE the interpreter
# starts, so setting the variable here affects child processes only.
# For fully deterministic hashing, launch the kernel with PYTHONHASHSEED=0.
# (The original code assigned a plain Python variable, which had no effect.)
os.environ['PYTHONHASHSEED'] = '0'

# define path to save the best regression model checkpoint
model_path = 'regression_model.h5'

Data Ingestion


In [0]:
# Shared column layout of the PM_* data files: engine id, cycle counter,
# three operational settings and 21 sensor channels.  Defining it once
# removes the duplicated 26-name list the train and test readers shared.
ENGINE_COLUMNS = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + \
                 ['s' + str(i) for i in range(1, 22)]


def read_engine_data(path):
    """Read one space-separated engine data file into a DataFrame.

    Trailing separators on each row make pandas parse two extra empty
    columns (positions 26 and 27); drop them, then apply the shared
    column names.
    """
    df = pd.read_csv(path, sep=" ", header=None)
    df.drop(df.columns[[26, 27]], axis=1, inplace=True)
    df.columns = ENGINE_COLUMNS
    return df


# read training data - the aircraft engine run-to-failure data,
# ordered by engine and cycle so sliding windows are chronological
train_df = read_engine_data('PM_train.txt').sort_values(['id', 'cycle'])

# read test data - engine operating data without failure events recorded
test_df = read_engine_data('PM_test.txt')

# read ground truth data - true remaining cycles for each engine in the test data
truth_df = pd.read_csv('PM_truth.txt', sep=" ", header=None)
truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)

Data Preprocessing


In [47]:
##################################
# Data Preprocessing
##################################

#######
# TRAIN
#######
# Data Labeling - generate column RUL (Remaining Useful Life / time to failure):
# each engine's last observed cycle minus the current cycle, so RUL reaches 0
# on the final (failure) row of every engine.
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
train_df = train_df.merge(rul, on=['id'], how='left')
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df.drop('max', axis=1, inplace=True)

# generate label columns for training data
# we will only make use of "label1" for binary classification,
# while trying to answer the question: is a specific engine going to fail within w1 cycles?
# label1: 1 when RUL <= w1 (failure within 30 cycles), else 0.
# label2: three-class variant - 2 when RUL <= w0, 1 when w0 < RUL <= w1, else 0.
w1 = 30
w0 = 15
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0 )
train_df['label2'] = train_df['label1']
train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2

# MinMax normalization (from 0 to 1) of every feature column.  The raw
# 'cycle' column is kept unscaled for bookkeeping; a scaled copy is added
# as 'cycle_norm' so it can be used as a model feature.
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['id','cycle','RUL','label1','label2'])
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]), 
                             columns=cols_normalize, 
                             index=train_df.index)
# splice the scaled columns back and restore the original column order
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
train_df = join_df.reindex(columns = train_df.columns)

#train_df.to_csv('PredictiveManteinanceEngineTraining.csv', encoding='utf-8',index = None)

######
# TEST
######
# MinMax normalization (from 0 to 1) using the scaler FIT ON TRAIN, so the
# test features share the training feature scale (no refitting).
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]), 
                            columns=cols_normalize, 
                            index=test_df.index)
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns = test_df.columns)
test_df = test_df.reset_index(drop=True)
print(test_df.head())

# We use the ground truth dataset to generate labels for the test data.
# generate column max for test data: the last observed cycle per engine plus
# the true remaining cycles gives the cycle at which the engine would fail.
rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
truth_df.columns = ['more']
# NOTE(review): this pairing relies on truth_df rows being ordered by engine
# id 1..N, matching rul's groupby output order - verify against the data source.
truth_df['id'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)

# generate RUL for test data
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df.drop('max', axis=1, inplace=True)

# generate label columns w0 and w1 for test data (same thresholds as train)
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0 )
test_df['label2'] = test_df['label1']
test_df.loc[test_df['RUL'] <= w0, 'label2'] = 2

#test_df.to_csv('PredictiveManteinanceEngineValidation.csv', encoding='utf-8',index = None)

# pick a large window size of 50 cycles
sequence_length = 50

# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield every full-length sliding window over one engine's history.

    Each yielded array has shape (seq_length, len(seq_cols)).  No padding
    is applied: an engine with fewer than ``seq_length`` rows produces no
    windows, while an engine with N rows produces N - seq_length of them
    (192 rows and a window of 50 give 142 windows, the last one covering
    rows 141..190).
    """
    frame = id_df[seq_cols].values
    row_count = frame.shape[0]
    for first in range(row_count - seq_length):
        last = first + seq_length  # exclusive upper bound of the window
        yield frame[first:last, :]
        
# Feature columns fed to the network: the three operational settings, the
# normalized cycle counter and the 21 sensors (25 features in total).
sensor_cols = ['s' + str(i) for i in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm']
sequence_cols.extend(sensor_cols)

# Sanity check on engine 1: with 192 cycles and a 50-cycle window it should
# yield 192 - 50 = 142 two-dimensional arrays of 50 rows x 25 columns.
val = list(gen_sequence(train_df[train_df['id'] == 1], sequence_length, sequence_cols))
print(len(val))

# Turn every engine's history into sliding windows, then stack everything
# into one (samples, time steps, features) array for Keras.
seq_gen = (list(gen_sequence(train_df[train_df['id'] == engine_id], sequence_length, sequence_cols))
           for engine_id in train_df['id'].unique())
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
print(seq_array.shape)

# function to generate labels
def gen_labels(id_df, seq_length, label):
    """Return the target values aligned with ``gen_sequence``'s windows.

    Only full-length windows are generated, so the first ``seq_length``
    rows of an engine never serve as targets: the window over rows
    [k, k + seq_length) is paired with the label of row k + seq_length.
    The result has shape (num_rows - seq_length, len(label)).
    """
    target_matrix = id_df[label].values
    n = target_matrix.shape[0]
    # drop the leading seq_length labels; the remainder lines up one-to-one
    # with the generated windows
    return target_matrix[seq_length:n, :]

# Regression targets: the RUL value aligned with each training window.
label_gen = [gen_labels(train_df[train_df['id'] == engine_id], sequence_length, ['RUL'])
             for engine_id in train_df['id'].unique()]
label_array = np.concatenate(label_gen).astype(np.float32)
label_array.shape


   id  cycle  setting1  setting2  setting3   s1        s2        s3        s4  \
0   1      1  0.632184  0.750000       0.0  0.0  0.545181  0.310661  0.269413   
1   1      2  0.344828  0.250000       0.0  0.0  0.150602  0.379551  0.222316   
2   1      3  0.517241  0.583333       0.0  0.0  0.376506  0.346632  0.322248   
3   1      4  0.741379  0.500000       0.0  0.0  0.370482  0.285154  0.408001   
4   1      5  0.580460  0.500000       0.0  0.0  0.391566  0.352082  0.332039   

    s5     ...           s13       s14       s15  s16       s17  s18  s19  \
0  0.0     ...      0.220588  0.132160  0.308965  0.0  0.333333  0.0  0.0   
1  0.0     ...      0.264706  0.204768  0.213159  0.0  0.416667  0.0  0.0   
2  0.0     ...      0.220588  0.155640  0.458638  0.0  0.416667  0.0  0.0   
3  0.0     ...      0.250000  0.170090  0.257022  0.0  0.250000  0.0  0.0   
4  0.0     ...      0.220588  0.152751  0.300885  0.0  0.166667  0.0  0.0   

        s20       s21  cycle_norm  
0  0.558140  0.661834     0.00000  
1  0.682171  0.686827     0.00277  
2  0.728682  0.721348     0.00554  
3  0.666667  0.662110     0.00831  
4  0.658915  0.716377     0.01108  

[5 rows x 27 columns]
142
(15631, 50, 25)
Out[47]:
(15631, 1)

LSTM


In [48]:
def r2_keras(y_true, y_pred):
    """Coefficient of determination (R^2) as a Keras metric.

    Computes 1 - SS_res / SS_tot over the batch; K.epsilon() is added to
    the denominator to avoid division by zero on a constant batch.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - residual_ss / (total_ss + K.epsilon())

# Network: LSTM(100) -> Dropout -> LSTM(50) -> Dropout -> Dense(1, linear).
# Dropout after each LSTM layer controls overfitting; a single linear
# output unit suits the RUL regression target.
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

model = Sequential()
model.add(LSTM(units=100,
               input_shape=(sequence_length, nb_features),
               return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50,
               return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=nb_out))
model.add(Activation("linear"))
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae', r2_keras])

print(model.summary())

# Train with early stopping on validation loss; the checkpoint keeps only
# the best weights on disk so later cells can reload them.
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                                           patience=10, verbose=0, mode='min')
save_best = keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss',
                                            save_best_only=True, mode='min', verbose=0)
history = model.fit(seq_array, label_array,
                    epochs=100, batch_size=200, validation_split=0.05, verbose=2,
                    callbacks=[stop_early, save_best])

# list all data in history
print(history.history.keys())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_13 (LSTM)               (None, 50, 100)           50400     
_________________________________________________________________
dropout_13 (Dropout)         (None, 50, 100)           0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 50)                30200     
_________________________________________________________________
dropout_14 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 51        
_________________________________________________________________
activation_3 (Activation)    (None, 1)                 0         
=================================================================
Total params: 80,651
Trainable params: 80,651
Non-trainable params: 0
_________________________________________________________________
None
Train on 14849 samples, validate on 782 samples
Epoch 1/100
 - 21s - loss: 8710.0010 - mean_absolute_error: 74.5754 - r2_keras: -1.6909e+00 - val_loss: 8121.8596 - val_mean_absolute_error: 71.6617 - val_r2_keras: -2.4373e+00
Epoch 2/100
 - 18s - loss: 8034.3115 - mean_absolute_error: 70.6539 - r2_keras: -1.4766e+00 - val_loss: 7602.9425 - val_mean_absolute_error: 68.6500 - val_r2_keras: -2.1910e+00
Epoch 3/100
 - 18s - loss: 7524.7710 - mean_absolute_error: 67.7182 - r2_keras: -1.3202e+00 - val_loss: 7112.1014 - val_mean_absolute_error: 65.8176 - val_r2_keras: -1.9599e+00
Epoch 4/100
 - 18s - loss: 7048.6777 - mean_absolute_error: 64.9854 - r2_keras: -1.1715e+00 - val_loss: 6652.7900 - val_mean_absolute_error: 63.1844 - val_r2_keras: -1.7456e+00
Epoch 5/100
 - 18s - loss: 6601.8182 - mean_absolute_error: 62.4940 - r2_keras: -1.0333e+00 - val_loss: 6219.2346 - val_mean_absolute_error: 60.7198 - val_r2_keras: -1.5454e+00
Epoch 6/100
 - 18s - loss: 6175.6531 - mean_absolute_error: 60.0687 - r2_keras: -9.0191e-01 - val_loss: 5815.9571 - val_mean_absolute_error: 58.4472 - val_r2_keras: -1.3614e+00
Epoch 7/100
 - 18s - loss: 5789.4928 - mean_absolute_error: 57.9402 - r2_keras: -7.8312e-01 - val_loss: 5445.6075 - val_mean_absolute_error: 56.3806 - val_r2_keras: -1.1947e+00
Epoch 8/100
 - 18s - loss: 5433.2812 - mean_absolute_error: 55.9649 - r2_keras: -6.7090e-01 - val_loss: 5097.9849 - val_mean_absolute_error: 54.4635 - val_r2_keras: -1.0408e+00
Epoch 9/100
 - 18s - loss: 5087.7394 - mean_absolute_error: 54.1268 - r2_keras: -5.6419e-01 - val_loss: 4781.7566 - val_mean_absolute_error: 52.7457 - val_r2_keras: -9.0348e-01
Epoch 10/100
 - 18s - loss: 4782.8234 - mean_absolute_error: 52.4782 - r2_keras: -4.6896e-01 - val_loss: 4493.8863 - val_mean_absolute_error: 51.2095 - val_r2_keras: -7.8145e-01
Epoch 11/100
 - 18s - loss: 4520.8048 - mean_absolute_error: 51.1438 - r2_keras: -3.8861e-01 - val_loss: 4237.9736 - val_mean_absolute_error: 49.8721 - val_r2_keras: -6.7613e-01
Epoch 12/100
 - 18s - loss: 4265.3113 - mean_absolute_error: 49.8218 - r2_keras: -3.1054e-01 - val_loss: 4007.2776 - val_mean_absolute_error: 48.6982 - val_r2_keras: -5.8473e-01
Epoch 13/100
 - 18s - loss: 4049.1438 - mean_absolute_error: 48.7800 - r2_keras: -2.4342e-01 - val_loss: 3804.8454 - val_mean_absolute_error: 47.7064 - val_r2_keras: -5.0850e-01
Epoch 14/100
 - 18s - loss: 3861.7658 - mean_absolute_error: 47.8933 - r2_keras: -1.8561e-01 - val_loss: 3634.4994 - val_mean_absolute_error: 46.9115 - val_r2_keras: -4.4870e-01
Epoch 15/100
 - 18s - loss: 3700.7644 - mean_absolute_error: 47.1360 - r2_keras: -1.3603e-01 - val_loss: 3488.1949 - val_mean_absolute_error: 46.2744 - val_r2_keras: -4.0245e-01
Epoch 16/100
 - 18s - loss: 3574.7564 - mean_absolute_error: 46.7065 - r2_keras: -9.8649e-02 - val_loss: 3372.9783 - val_mean_absolute_error: 45.8273 - val_r2_keras: -3.7182e-01
Epoch 17/100
 - 18s - loss: 3468.2338 - mean_absolute_error: 46.3544 - r2_keras: -6.5342e-02 - val_loss: 3289.1135 - val_mean_absolute_error: 45.5572 - val_r2_keras: -3.5574e-01
Epoch 18/100
 - 18s - loss: 3398.6249 - mean_absolute_error: 46.1360 - r2_keras: -4.3301e-02 - val_loss: 3226.5197 - val_mean_absolute_error: 45.4293 - val_r2_keras: -3.5149e-01
Epoch 19/100
 - 18s - loss: 3344.9435 - mean_absolute_error: 46.1256 - r2_keras: -2.7634e-02 - val_loss: 3190.2404 - val_mean_absolute_error: 45.4337 - val_r2_keras: -3.5769e-01
Epoch 20/100
 - 18s - loss: 3314.9026 - mean_absolute_error: 46.1281 - r2_keras: -2.0270e-02 - val_loss: 3175.1814 - val_mean_absolute_error: 45.5011 - val_r2_keras: -3.6738e-01
Epoch 21/100
 - 18s - loss: 3309.8460 - mean_absolute_error: 46.2010 - r2_keras: -1.4463e-02 - val_loss: 3145.2847 - val_mean_absolute_error: 45.3516 - val_r2_keras: -3.6155e-01
Epoch 22/100
 - 18s - loss: 3091.8200 - mean_absolute_error: 43.7458 - r2_keras: 0.0512 - val_loss: 2291.1275 - val_mean_absolute_error: 34.9508 - val_r2_keras: 0.1706
Epoch 23/100
 - 18s - loss: 2369.9234 - mean_absolute_error: 34.8269 - r2_keras: 0.2750 - val_loss: 1979.3735 - val_mean_absolute_error: 31.6306 - val_r2_keras: 0.2622
Epoch 24/100
 - 18s - loss: 2099.4937 - mean_absolute_error: 31.8679 - r2_keras: 0.3560 - val_loss: 1779.4552 - val_mean_absolute_error: 29.2319 - val_r2_keras: 0.3263
Epoch 25/100
 - 18s - loss: 1999.8597 - mean_absolute_error: 30.3754 - r2_keras: 0.3895 - val_loss: 2106.0034 - val_mean_absolute_error: 34.2074 - val_r2_keras: 0.0656
Epoch 26/100
 - 18s - loss: 1787.9025 - mean_absolute_error: 28.5231 - r2_keras: 0.4533 - val_loss: 1726.4552 - val_mean_absolute_error: 27.9679 - val_r2_keras: 0.4115
Epoch 27/100
 - 18s - loss: 1652.6229 - mean_absolute_error: 27.1625 - r2_keras: 0.4948 - val_loss: 1597.3959 - val_mean_absolute_error: 28.9261 - val_r2_keras: 0.3235
Epoch 28/100
 - 18s - loss: 1538.5149 - mean_absolute_error: 25.9007 - r2_keras: 0.5293 - val_loss: 1404.8620 - val_mean_absolute_error: 25.4527 - val_r2_keras: 0.4353
Epoch 29/100
 - 18s - loss: 1419.7833 - mean_absolute_error: 24.8025 - r2_keras: 0.5679 - val_loss: 1541.5653 - val_mean_absolute_error: 28.2006 - val_r2_keras: 0.2923
Epoch 30/100
 - 18s - loss: 1389.6231 - mean_absolute_error: 24.5538 - r2_keras: 0.5754 - val_loss: 1183.8783 - val_mean_absolute_error: 23.5899 - val_r2_keras: 0.5376
Epoch 31/100
 - 18s - loss: 1273.9976 - mean_absolute_error: 23.3684 - r2_keras: 0.6107 - val_loss: 1238.4882 - val_mean_absolute_error: 23.8504 - val_r2_keras: 0.4607
Epoch 32/100
 - 18s - loss: 1187.3405 - mean_absolute_error: 22.2846 - r2_keras: 0.6374 - val_loss: 1028.1954 - val_mean_absolute_error: 21.7312 - val_r2_keras: 0.5986
Epoch 33/100
 - 18s - loss: 1115.3596 - mean_absolute_error: 21.5425 - r2_keras: 0.6600 - val_loss: 895.7364 - val_mean_absolute_error: 19.0417 - val_r2_keras: 0.6763
Epoch 34/100
 - 18s - loss: 1060.9882 - mean_absolute_error: 20.9098 - r2_keras: 0.6760 - val_loss: 998.6016 - val_mean_absolute_error: 18.5054 - val_r2_keras: 0.6522
Epoch 35/100
 - 18s - loss: 1021.3275 - mean_absolute_error: 20.4409 - r2_keras: 0.6890 - val_loss: 1025.7192 - val_mean_absolute_error: 22.1825 - val_r2_keras: 0.5929
Epoch 36/100
 - 18s - loss: 960.9678 - mean_absolute_error: 19.7225 - r2_keras: 0.7068 - val_loss: 925.1884 - val_mean_absolute_error: 20.7019 - val_r2_keras: 0.6325
Epoch 37/100
 - 18s - loss: 913.2022 - mean_absolute_error: 19.2978 - r2_keras: 0.7213 - val_loss: 1018.5966 - val_mean_absolute_error: 23.5789 - val_r2_keras: 0.5181
Epoch 38/100
 - 18s - loss: 891.2813 - mean_absolute_error: 18.9652 - r2_keras: 0.7284 - val_loss: 917.1815 - val_mean_absolute_error: 21.7860 - val_r2_keras: 0.6009
Epoch 39/100
 - 18s - loss: 856.2622 - mean_absolute_error: 18.6940 - r2_keras: 0.7384 - val_loss: 860.5664 - val_mean_absolute_error: 15.7096 - val_r2_keras: 0.7097
Epoch 40/100
 - 18s - loss: 828.3678 - mean_absolute_error: 18.3621 - r2_keras: 0.7466 - val_loss: 722.7817 - val_mean_absolute_error: 18.0577 - val_r2_keras: 0.7209
Epoch 41/100
 - 18s - loss: 810.4449 - mean_absolute_error: 18.1153 - r2_keras: 0.7519 - val_loss: 948.9633 - val_mean_absolute_error: 17.2773 - val_r2_keras: 0.6864
Epoch 42/100
 - 18s - loss: 789.2256 - mean_absolute_error: 17.7570 - r2_keras: 0.7588 - val_loss: 718.4913 - val_mean_absolute_error: 15.8869 - val_r2_keras: 0.7334
Epoch 43/100
 - 18s - loss: 754.0595 - mean_absolute_error: 17.4599 - r2_keras: 0.7695 - val_loss: 762.6585 - val_mean_absolute_error: 17.6853 - val_r2_keras: 0.7087
Epoch 44/100
 - 18s - loss: 748.4787 - mean_absolute_error: 17.4644 - r2_keras: 0.7716 - val_loss: 993.4524 - val_mean_absolute_error: 16.4050 - val_r2_keras: 0.6697
Epoch 45/100
 - 18s - loss: 737.3531 - mean_absolute_error: 17.1963 - r2_keras: 0.7745 - val_loss: 740.7459 - val_mean_absolute_error: 15.1654 - val_r2_keras: 0.7320
Epoch 46/100
 - 18s - loss: 724.6269 - mean_absolute_error: 17.0860 - r2_keras: 0.7779 - val_loss: 802.9401 - val_mean_absolute_error: 19.2805 - val_r2_keras: 0.6640
Epoch 47/100
 - 18s - loss: 706.3259 - mean_absolute_error: 16.9278 - r2_keras: 0.7843 - val_loss: 626.0231 - val_mean_absolute_error: 14.7378 - val_r2_keras: 0.7539
Epoch 48/100
 - 18s - loss: 700.0124 - mean_absolute_error: 16.8493 - r2_keras: 0.7867 - val_loss: 690.4829 - val_mean_absolute_error: 15.7618 - val_r2_keras: 0.7503
Epoch 49/100
 - 18s - loss: 683.8302 - mean_absolute_error: 16.6547 - r2_keras: 0.7913 - val_loss: 592.1512 - val_mean_absolute_error: 15.0523 - val_r2_keras: 0.7252
Epoch 50/100
 - 18s - loss: 664.4725 - mean_absolute_error: 16.3332 - r2_keras: 0.7969 - val_loss: 963.3605 - val_mean_absolute_error: 21.4327 - val_r2_keras: 0.4410
Epoch 51/100
 - 18s - loss: 655.3854 - mean_absolute_error: 16.2690 - r2_keras: 0.7997 - val_loss: 663.0953 - val_mean_absolute_error: 17.2818 - val_r2_keras: 0.6639
Epoch 52/100
 - 18s - loss: 647.0649 - mean_absolute_error: 16.1153 - r2_keras: 0.8027 - val_loss: 670.5359 - val_mean_absolute_error: 15.1590 - val_r2_keras: 0.7325
Epoch 53/100
 - 18s - loss: 645.7000 - mean_absolute_error: 16.2083 - r2_keras: 0.8024 - val_loss: 688.9800 - val_mean_absolute_error: 16.6193 - val_r2_keras: 0.6872
Epoch 54/100
 - 18s - loss: 630.2782 - mean_absolute_error: 15.9147 - r2_keras: 0.8073 - val_loss: 718.5986 - val_mean_absolute_error: 18.1086 - val_r2_keras: 0.6589
Epoch 55/100
 - 18s - loss: 615.2432 - mean_absolute_error: 15.8569 - r2_keras: 0.8117 - val_loss: 694.5187 - val_mean_absolute_error: 16.1241 - val_r2_keras: 0.7057
Epoch 56/100
 - 18s - loss: 598.8294 - mean_absolute_error: 15.6043 - r2_keras: 0.8169 - val_loss: 926.1040 - val_mean_absolute_error: 19.9053 - val_r2_keras: 0.4527
Epoch 57/100
 - 18s - loss: 604.8984 - mean_absolute_error: 15.6311 - r2_keras: 0.8148 - val_loss: 795.2358 - val_mean_absolute_error: 19.9324 - val_r2_keras: 0.6300
Epoch 58/100
 - 18s - loss: 583.2793 - mean_absolute_error: 15.3520 - r2_keras: 0.8214 - val_loss: 899.0561 - val_mean_absolute_error: 21.3399 - val_r2_keras: 0.4924
Epoch 59/100
 - 18s - loss: 578.3221 - mean_absolute_error: 15.2435 - r2_keras: 0.8225 - val_loss: 683.7928 - val_mean_absolute_error: 16.7156 - val_r2_keras: 0.7113
dict_keys(['val_loss', 'val_mean_absolute_error', 'val_r2_keras', 'loss', 'mean_absolute_error', 'r2_keras'])

Model Evaluation on Training set


In [49]:
# --- Training-history plots --------------------------------------------------
# Each figure gets its own variable (the original reused `fig_acc` for all
# three plots), and savefig() is called BEFORE plt.show(): some backends
# clear the figure on show(), which would write a blank image to disk.

# summarize history for R^2
fig_r2 = plt.figure(figsize=(10, 10))
plt.plot(history.history['r2_keras'])
plt.plot(history.history['val_r2_keras'])
plt.title('model r^2')
plt.ylabel('R^2')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig_r2.savefig("model_r2.png")
plt.show()

# summarize history for MAE
fig_mae = plt.figure(figsize=(10, 10))
plt.plot(history.history['mean_absolute_error'])
plt.plot(history.history['val_mean_absolute_error'])
plt.title('model MAE')
plt.ylabel('MAE')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig_mae.savefig("model_mae.png")
plt.show()

# summarize history for Loss
fig_loss = plt.figure(figsize=(10, 10))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig_loss.savefig("model_regression_loss.png")
plt.show()

# --- Metrics on the TRAINING sequences ---------------------------------------
# model was compiled with metrics=[mae, r2_keras], so evaluate() returns
# [loss, mae, r2] — hence the [1] / [2] indexing below.
scores = model.evaluate(seq_array, label_array, verbose=1, batch_size=200)
print('\nMAE: {}'.format(scores[1]))
print('\nR^2: {}'.format(scores[2]))

y_pred = model.predict(seq_array, verbose=1, batch_size=200)
y_true = label_array

# These are predictions on the TRAINING data (renamed from the misleading
# `test_set`); kept writing to 'submit_train.csv' for backward compatibility.
train_pred_df = pd.DataFrame(y_pred)
train_pred_df.to_csv('submit_train.csv', index=None)


15631/15631 [==============================] - 6s 360us/step

MAE: 13.9879766045937

R^2: 0.7912724254898692
15631/15631 [==============================] - 6s 407us/step

Evaluate on Test set


In [51]:
# --- Evaluation on the held-out test engines ---------------------------------
# We pick the last `sequence_length` cycles for each engine id in the test
# data. Engines with fewer than `sequence_length` cycles cannot form a full
# window and are excluded from both the sequences and the labels.
#
# The per-id cycle counts are computed ONCE via value_counts() (the original
# re-filtered the whole frame per id, twice, which is accidentally quadratic),
# and the loop variable no longer shadows the builtin `id`.
unique_ids = test_df['id'].unique()
id_lengths = test_df['id'].value_counts()
y_mask = [id_lengths[engine_id] >= sequence_length for engine_id in unique_ids]

seq_array_test_last = [test_df[test_df['id'] == engine_id][sequence_cols].values[-sequence_length:]
                       for engine_id, keep in zip(unique_ids, y_mask) if keep]
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)

# Similarly, we pick the labels: the last RUL value of each engine, keeping
# only the engines that passed the length filter above.
label_array_test_last = test_df.groupby('id')['RUL'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0], 1).astype(np.float32)

# if best iteration's model was saved then load and use it
if os.path.isfile(model_path):
    # r2_keras is a custom metric, so it must be supplied at load time.
    estimator = load_model(model_path, custom_objects={'r2_keras': r2_keras})

    # test metrics — evaluate() returns [loss, mae, r2] per the compile metrics
    scores_test = estimator.evaluate(seq_array_test_last, label_array_test_last, verbose=2)
    print('\nMAE: {}'.format(scores_test[1]))
    print('\nR^2: {}'.format(scores_test[2]))

    y_pred_test = estimator.predict(seq_array_test_last)
    y_true_test = label_array_test_last

    test_set = pd.DataFrame(y_pred_test)
    test_set.to_csv('submit_test.csv', index=None)

    # Plot in blue color the predicted data and in green color the
    # actual data to verify visually the accuracy of the model.
    # savefig() is called BEFORE plt.show() so non-inline backends do not
    # save a cleared (blank) figure.
    fig_verify = plt.figure(figsize=(10, 5))
    plt.plot(y_pred_test, color="blue")
    plt.plot(y_true_test, color="green")
    plt.title('prediction')
    plt.ylabel('value')
    plt.xlabel('row')
    plt.legend(['predicted', 'actual data'], loc='upper left')
    fig_verify.savefig("model_regression_verify.png")
    plt.show()


MAE: 12.394294656733031

R^2: 0.7993160370857485

References