In [44]:
source("loadData.R")
# load and pre-process the data
data <- loadData()
data <- preProcessData(data)
# keep only rows with no missing values
data <- data[complete.cases(data),]
# split the data, holding out the SHANKLE well as the blind test set
split <- splitData(data, "SHANKLE")
train <- split[[1]]
test <- split[[2]]
rm(split)
"Training Data"
head(train,3)
"Testing Data"
head(test,3)
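The splitData used above comes from loadData.R and is not shown here. A minimal sketch of what it might do, assuming it simply holds out the named well as the blind test set (the body below is hypothetical):

splitData <- function(data, testWell) {
    # hypothetical sketch: all observations from the named well become the test set
    testIdx <- data$Well.Name == testWell
    list(data[!testIdx, ], data[testIdx, ])
}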
In [45]:
source("preProcData.R")
# we only want data with complete observations (no missing features)
trainPrime <- train[complete.cases(train),]
# center and scale the petrophysical log features
logFeatures <- c("GR", "ILD_log10", "DeltaPHI", "PHIND", "PE")
trainPrime[logFeatures] <- lapply(trainPrime[logFeatures], function(x) (x - mean(x)) / sd(x))
# center and scale the same features in the test set
# (note: scaled with the test set's own statistics, not the training statistics)
testPrime <- test
testPrime[logFeatures] <- lapply(testPrime[logFeatures], function(x) (x - mean(x)) / sd(x))
head(trainPrime,3)
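Note that the test set above is standardized with its own means and standard deviations. An alternative that avoids using test-set statistics is to reuse the training statistics; a sketch (not what was run here):

trainComplete <- train[complete.cases(train), ]
for (f in logFeatures) {
    testPrime[[f]] <- (test[[f]] - mean(trainComplete[[f]])) / sd(trainComplete[[f]])
}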
In [46]:
source("accuracyMetrics.R")
source("preProcData.R")
# compute the micro-averaged F1 score from predicted and observed facies
myF1Metric <- function(data) {
    ConfM <- confusion_multi(data$pred, data$obs)
    M <- ConfM$confusionMatrix
    microF1_score <- microF1(M)
    names(microF1_score) <- "F1"
    microF1_score
}
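# confusion_multi and microF1 come from accuracyMetrics.R and are not shown.
# A hypothetical sketch of what they might compute (for single-label
# multiclass data, micro-averaged precision, recall, and F1 all reduce to
# overall accuracy), kept commented out so the sourced versions are used:
# confusion_multi <- function(pred, obs) {
#     lev <- union(levels(factor(obs)), levels(factor(pred)))
#     list(confusionMatrix=table(factor(pred, levels=lev), factor(obs, levels=lev)))
# }
# microF1 <- function(M) sum(diag(M)) / sum(M)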
# lag the log features, then drop identifier columns and the original
# (unlagged) features
preProcData <- function(data, l) {
    badFeatures <- c("Formation", "Well.Name", "Depth", "GR", "ILD_log10", "DeltaPHI", "PHIND", "PE", "isMarine")
    data <- lagData(data, l)
    data <- data[, !(names(data) %in% badFeatures)]
    data
}
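# lagData also comes from preProcData.R and is not shown. A hypothetical
# sketch, assuming it appends l depth-shifted copies of each log feature
# within each well (kept commented out so the sourced version is used):
# lagData <- function(data, l) {
#     logs <- c("GR", "ILD_log10", "DeltaPHI", "PHIND", "PE")
#     for (f in logs) for (k in seq_len(l)) {
#         data[[paste0(f, "_lag", k)]] <- ave(data[[f]], data$Well.Name,
#             FUN=function(x) c(rep(NA, k), head(x, -k)))
#     }
#     data
# }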
# preprocess the data with lag l, then split 80/20 into training and
# cross-validation sets; note this redefines the splitData used earlier,
# which split by well name
splitData <- function(data, l) {
    data <- preProcData(data, l)
    set.seed(1234)
    trainIndex <- createDataPartition(data$Facies, p=.8, list=FALSE)
    trainData <- data[trainIndex,]
    testData <- data[-trainIndex,]
    list(train=trainData, test=testData)
}
# tune the lag length by training a random forest for each candidate lag
# and keeping the model with the best micro-averaged F1 score
tuneModel <- function(fullTrain, l) {
    bestFit <- NA
    bestF1 <- 0
    fitControl <- trainControl(## 10-fold CV
                               method = "repeatedcv",
                               number = 10,
                               ## repeated ten times
                               repeats = 10)
    for (l_i in l) {
        print(paste("l=", l_i, "...Splitting data"))
        split <- splitData(fullTrain, l_i)
        trainData <- split[["train"]]
        cvTestData <- split[["test"]]
        print("Training model")
        set.seed(1234)
        fit <- train(Facies ~ ., data=trainData,
                     method="rf",
                     trControl=fitControl,
                     metric="Kappa")
        print("Predicting cross-validation values")
        cvTestData$Predicted <- predict(fit, newdata=cvTestData)
        print("Evaluating model")
        microF1_score <- myF1Metric(data.frame(pred=cvTestData$Predicted, obs=cvTestData$Facies))
        if (microF1_score > bestF1) {
            # track both the best score and the best model
            bestF1 <- microF1_score
            bestFit <- fit
        }
    }
    print(paste("Best F1-Score", bestF1))
    bestFit
}
# search over the candidate lag lengths
tunedFit <- tuneModel(trainPrime, c(10,16,20,24,30))
tunedFit
In [50]:
source("accuracyMetrics.R")
# apply the same lag transformation to the held-out test set
# (the lag must match the one used by the best tuned model)
testPrime <- lagData(testPrime, 30)
testPrime$Predicted <- predict(tunedFit, newdata=testPrime)
ConfM <- confusion_multi(testPrime$Predicted, testPrime$Facies)
M <- ConfM$confusionMatrix
microF1_score <- microF1(M)
print(paste("Multi F1:", microF1_score))
In [51]:
summary(tunedFit)
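For a caret train object, printing the fit shows the resampling results across the tuning grid; the final random forest and its variable importances can also be inspected:

print(tunedFit)      # cross-validation performance for each mtry tried
tunedFit$finalModel  # the underlying randomForest fit
varImp(tunedFit)     # importance of the lagged features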