Information about the problem is at http://ti.arc.nasa.gov/tech/dash/pcoe/prognostic-data-repository/publications/#turbofan and original data is at http://ti.arc.nasa.gov/tech/dash/pcoe/prognostic-data-repository/#turbofan
The data was originally generated using the Commercial Modular Aero-Propulsion System Simulations (C-MAPPS) system.
The approach used in the turbofan engine degradation dataset was then used in the PHM08 challenge. Information about other research on the C-MAPSS data is available at https://www.phmsociety.org/sites/phmsociety.org/files/phm_submission/2014/phmc_14_063.pdf
In [6]:
import h2o
import numpy as np
import pandas as pd
import seaborn as sns
import pykalman as pyk
%matplotlib inline
sns.set()
doGridSearch = False
In [45]:
# Input files don't have column names
dependent_vars = ['RemainingUsefulLife']
index_columns_names = ["UnitNumber","Cycle"]
operational_settings_columns_names = ["OpSet"+str(i) for i in range(1,4)]
sensor_measure_columns_names =["SensorMeasure"+str(i) for i in range(1,22)]
input_file_column_names = index_columns_names + operational_settings_columns_names + sensor_measure_columns_names
# And we are going to add these columns
kalman_smoothed_mean_columns_names =["SensorMeasureKalmanMean"+str(i) for i in range(1,22)]
In [46]:
train = pd.read_csv("train_FD001.txt", sep=r"\s*", header=None,
names=input_file_column_names, engine='python')
test = pd.read_csv("test_FD001.txt", sep=r"\s*", header=None,
names=input_file_column_names, engine='python')
test_rul = pd.read_csv("RUL_FD001.txt", header=None, names=['RemainingUsefulLife'])
test_rul.index += 1 # set the index to be the unit number in the test data set
test_rul.index.name = "UnitNumber"
In [47]:
# Calculate the remaining useful life for each training sample based on last measurement being zero remaining
grouped_train = train.groupby('UnitNumber', as_index=False)
useful_life_train = grouped_train.agg({'Cycle' : np.max })
useful_life_train.rename(columns={'Cycle': 'UsefulLife'}, inplace=True)
train_wfeatures = pd.merge(train, useful_life_train, on="UnitNumber")
train_wfeatures["RemainingUsefulLife"] = -(train_wfeatures.UsefulLife - train_wfeatures.Cycle)
train_wfeatures.drop('UsefulLife', axis=1, inplace=True)
In [48]:
sns.set_context("notebook", font_scale=5)
p = sns.pairplot(train_wfeatures.query('UnitNumber < 10'),
vars=["RemainingUsefulLife", "SensorMeasure4", "SensorMeasure3",
"SensorMeasure9", "SensorMeasure8", "SensorMeasure13"], size=10,
hue="UnitNumber", palette=sns.color_palette("husl", 9));
In [11]:
kalman_smoothed_mean_columns_names =["SensorMeasureKalmanMean"+str(i) for i in range(1,22)]
def calcSmooth(measures):
kf = pyk.KalmanFilter(initial_state_mean=measures[0], n_dim_obs=measures.shape[1])
(smoothed_state_means, smoothed_state_covariances) = kf.em(measures).smooth(measures)
return smoothed_state_means
def filterEachUnit(df):
dfout = df.copy()
for newcol in kalman_smoothed_mean_columns_names:
dfout[newcol] = np.nan
for unit in dfout.UnitNumber.unique():
print 'Processing Unit: ' + str(unit)
unitmeasures = dfout[dfout.UnitNumber == unit][sensor_measure_columns_names]
smoothed_state_means = calcSmooth( np.asarray( unitmeasures ) )
dfout.loc[dfout.UnitNumber == unit, kalman_smoothed_mean_columns_names] = smoothed_state_means
print 'Finished Unit: ' + str(unit)
return dfout
In [12]:
# Get picky about the order of output columns
test_output_cols = index_columns_names + operational_settings_columns_names + sensor_measure_columns_names + \
kalman_smoothed_mean_columns_names
train_output_cols = test_output_cols + dependent_vars
train_output = train_wkalman[train_output_cols]
test_output = test_wkalman[test_output_cols]
In [15]:
# Output the files, so we don't have to do the preprocessing again.
train_output.to_csv("train_FD001_preprocessed.csv", index=False)
test_output.to_csv("test_FD001_preprocessed.csv", index=False)
test_rul.to_csv("rul_FD001_preprocessed.csv", index=True)
In [3]:
h2o.init()
In [4]:
train_hex = h2o.import_file("/home/ec2-user/python-examples/cmapdata/train_FD001_preprocessed.csv")
test_hex = h2o.import_file("/home/ec2-user/python-examples/cmapdata/test_FD001_preprocessed.csv")
Use the operational settings and Kalman smoothed mean states as the independent features
Setup a fold column to great cross validation models from 90 units and cross validating on 10 units. This creates a 10-fold cross validation. The cross validation models are then used to create an ensemble model for predictions
In [5]:
xCols= operational_settings_columns_names + kalman_smoothed_mean_columns_names
yCol = dependent_vars
foldCol = "UnitNumberMod10"
train_hex[foldCol] = train_hex["UnitNumber"] % 10
In [8]:
def trainGLM(x, y, fold_column, alpha=0.5, penalty=1e-5):
return h2o.glm(x=x, y=y,fold_column=fold_column,family = "gaussian",alpha = [alpha], Lambda = [penalty])
def gridSearchGLM(x, y, fold_column, alphas = [0,0.5,1], penalties=np.logspace(-3,0,num=4)):
results = []
for alpha in alphas:
for penalty in penalties:
results.append( trainGLM(x, y, fold_column, alpha, penalty) )
return results
if doGridSearch:
glmModels = gridSearchGLM(train_hex[xCols] , train_hex[yCol] , train_hex[foldCol])
else:
# this is used to speed up the demonstration by just building the single model previously found
glmModels = [ trainGLM(train_hex[xCols] , train_hex[yCol] , train_hex[foldCol], alpha=1, penalty=0.01 )]
In [9]:
def extractBestModel(models):
bestMse = models[0].mse(xval=True)
result = models[0]
for model in models:
if model.mse(xval=True) < bestMse:
bestMse = model.mse(xval=True)
result = model
return result
bestModel = extractBestModel(glmModels)
bestModel
Out[9]:
In [10]:
def trainGBM(x, y, fold_column, learning_rate=0.1, ntrees=50, max_depth=5):
return h2o.gbm(x=x, y=y,fold_column=fold_column,distribution = "gaussian",
learn_rate=learning_rate, ntrees=ntrees, max_depth=max_depth)
def gridSearchGBM(x, y, fold_column, learning_rates = [0.1,0.03,0.01], ntrees=[10,30,100,300], max_depth=[1,3,5]):
results = []
for learning_rate in learning_rates:
for ntree in ntrees:
for depth in max_depth:
print "GBM: {learning rate: "+str(learning_rate)+"},{ntrees: "+str(ntree)+"},{max_depth: "+str(depth)+"}"
results.append( trainGBM(x, y, fold_column, learning_rate=learning_rate, ntrees=ntree, max_depth=depth) )
return results
In [25]:
if doGridSearch:
gbmModels = gridSearchGBM(train_hex[xCols] , train_hex[yCol] , train_hex[foldCol], \
learning_rates=[0.03,0.01,0.003], ntrees=[100,300,1000,3000], max_depth=[1,2,3,5])
else:
gbmModels = [trainGBM(train_hex[xCols] , train_hex[yCol] , train_hex[foldCol], \
learning_rate=0.01, ntrees=300, max_depth=5)]
In [26]:
bestGbmModel = extractBestModel(gbmModels)
Best model had depth 5, learning rate 0.01, and 300 trees
In [27]:
bestGbmModel.params
Out[27]:
Best GBM Model reported MSE on cross validation data as 1687, an improvement from GLM of 1954.
In [28]:
bestGbmModel
Out[28]:
In [29]:
train_hex["weights"] = 1
allModels = bestGbmModel.xvals
pred = sum([model.predict(train_hex) for model in allModels]) / len(allModels)
In [30]:
pred["actual"] = train_hex["RemainingUsefulLife"]
pred["unit"] = train_hex["UnitNumber"]
Ideally all points would be on the diagonal, indication prediction from data matched exactly the actual.
Also, it is important that the prediction gets more accurate the closer it gets to no useful life remaining.
Looking at a sample of the first 12 units.
Moved predictions from H2O to Python Pandas for plotting using Seaborn.
In [31]:
scored_df = pred.as_data_frame(use_pandas=True)
In [32]:
g=sns.lmplot(x="actual",y="predict",hue="unit",col="unit",data=scored_df[scored_df.unit < 13],col_wrap=3,fit_reg=False)
g = (g.set_axis_labels("Remaining Useful Life", "Predicted Useful Life")
.set(xlim=(-400, 200), ylim=(-400, 200),
xticks=[-200, 0, 200], yticks=[-200, 0, 200]))
In [34]:
testPreds = sum([model.predict(test_hex) for model in allModels]) / len(allModels)
Append the original index information (Cycle and UnitNumber) to the predicted values so we have them later.
In [35]:
testPreds["Cycle"] = test_hex["Cycle"]
testPreds["UnitNumber"] = test_hex["UnitNumber"]
Move the predictions over to Python Pandas for final analysis and scoring
In [36]:
testPreds_df = testPreds.as_data_frame(use_pandas=True)
Load up the actual Remaining Useful Life information.
In [39]:
actual_RUL = pd.read_csv("/home/ec2-user/python-examples/cmapdata/rul_FD001_preprocessed.csv")
The final scoring used in the competition is based on a single value per unit. We extract the last three predictions and use the mean of those (simple aggregation) and put the prediction back from remaining useful life in T-minus format to cycles remaining (positive).
In [37]:
def aggfunc(x):
return np.mean( x.order().tail(3) )
grouped_by_unit_preds = testPreds_df.groupby("UnitNumber", as_index=False)
predictedRUL = grouped_by_unit_preds.agg({'predict' : aggfunc })
predictedRUL.predict = -predictedRUL.predict
Add the prediction to the actual data frame, and use the scoring used in the PHMO8 competition (more penality for predicting more useful life than there is actual).
In [40]:
final = pd.concat([actual_RUL, predictedRUL.predict], axis=1)
In [41]:
def rowScore(row):
d = row.predict-row.RemainingUsefulLife
return np.exp( -d/10 )-1 if d < 0 else np.exp(d/13)-1
rowScores = final.apply(rowScore, axis=1)
This is the final score using PHM08 method of scoring.
In [42]:
sum(rowScores)
Out[42]:
In [43]:
sns.regplot("RemainingUsefulLife", "predict", data=final, fit_reg=False);
In [ ]: