In [1]:
source("loadData.R")
# load and clean the data
raw <- loadData()
"Raw data:"
format(head(raw,3), digits=3)
clean <- cleanData(raw)
"Cleaned data:"
format(head(clean,3), digits=3)
dataPrime <- data.frame()
wells <- unique(clean$Well.Name)
for (well_i in wells) {
data_i <- clean[clean$Well.Name == well_i,]
data_i$GR <- (data_i$GR - mean(data_i$GR, na.rm=T)) / sd(data_i$GR, na.rm=T)
data_i$ILD_log10 <- (data_i$ILD_log10 - mean(data_i$ILD_log10, na.rm=T)) / sd(data_i$ILD_log10, na.rm=T)
data_i$DeltaPHI <- (data_i$DeltaPHI - mean(data_i$DeltaPHI, na.rm=T)) / sd(data_i$DeltaPHI, na.rm=T)
data_i$PHIND <- (data_i$PHIND - mean(data_i$PHIND, na.rm=T)) / sd(data_i$PHIND, na.rm=T)
data_i$PE <- (data_i$PE - mean(data_i$PE, na.rm=T)) / sd(data_i$PE, na.rm=T)
dataPrime <- rbind(dataPrime, data_i)
}
cs <- dataPrime
rm(dataPrime)
"Centered and scaled data:"
format(head(cs,3), digits=3)
In [2]:
# we only want data with complete observations (no missing features)
# we also exclude the Recruit F9 well, as our derivative features won't make sense for them
cs_withPE <- cs[complete.cases(cs) & cs$Well.Name != "Recruit F9",]
cs_withoutPE <- cs[cs$Well.Name != "Recruit F9",c("Facies", "Formation", "Well.Name", "Depth", "GR",
"ILD_log10", "DeltaPHI", "PHIND", "isMarine", "RELPOS")]
"With PE log:"
paste("Rows:", nrow(cs_withPE))
format(head(cs_withPE, 3), digits=3)
"Without PE log:"
paste("Rows:", nrow(cs_withoutPE))
format(head(cs_withoutPE, 3), digits=3)
In [3]:
source("preProcData.R")
lag_withPE <- lagData(cs_withPE, 30)
lag_withoutPE <- lagData(cs_withoutPE, 30)
paste("With PE # columns before forward, backward, and central difference feature-building:", ncol(cs_withPE))
paste("With PE # columns after lag feature-building:", ncol(lag_withPE))
format(head(lag_withPE,3), digits=3)
paste("Without PE # columns before forward, backward, and central difference feature-building:", ncol(cs_withoutPE))
paste("Without PE # columns after lag feature-building:", ncol(lag_withoutPE))
format(head(lag_withoutPE,3), digits=3)
In [ ]:
library(caret)
source("accuracyMetrics.R")
t0 <- Sys.time()
fitControl <- trainControl(method="repeatedcv", number=10, repeats=10)
##############################
# with PE case
##############################
f1_withPE <- NULL
wells <- unique(lag_withPE$Well.Name)
wells <- wells[!wells %in% "Recruit F9"]
for (i in 1:(length(wells)-1)) {
for (j in (i+1):length(wells)) {
trainIndex <- lag_withPE$Well.Name != wells[i] & lag_withPE$Well.Name != wells[j]
train <- lag_withPE[trainIndex,]
test <- lag_withPE[!trainIndex,]
fit <- train(Facies ~ ., data=subset(train, select=-c(Well.Name, Depth)),
method="rf", metric="Kappa", tuneLength=10,
trControl=fitControl)
test$Predicted <- predict(fit, newdata=test)
f1_i <- myF1Metric(test$Predicted, test$Facies)
f1_withPE <- c(f1_withPE, f1_i)
print(paste("Test well 1:", wells[i], ", Test well 2:", wells[j],
", withPE f1-score:", f1_i))
print("-------------")
}
}
##############################
# without PE case
##############################
f1_withoutPE <- NULL
wells <- unique(lag_withoutPE$Well.Name)
wells <- wells[!wells %in% "Recruit F9"]
for (i in 1:(length(wells)-1)) {
for (j in (i+1):length(wells)) {
trainIndex <- lag_withoutPE$Well.Name != wells[i] & lag_withoutPE$Well.Name != wells[j]
train <- lag_withoutPE[trainIndex,]
test <- lag_withoutPE[!trainIndex,]
fit <- train(Facies ~ ., data=subset(train, select=-c(Well.Name, Depth)),
method="rf", metric="Kappa", tuneLength=10,
trControl=fitControl)
test$Predicted <- predict(fit, newdata=test)
f1_i <- myF1Metric(test$Predicted, test$Facies)
f1_withoutPE <- c(f1_withoutPE, f1_i)
print(paste("Test well 1:", wells[i], ", Test well 2:", wells[j],
", withoutPE f1-score:", f1_i))
print("-------------")
}
}
print("WITH PE")
print(paste("Minimum F1:", min(f1_withPE)))
print(paste("Average F1:", mean(f1_withPE)))
print(paste("Maximum F1:", max(f1_withPE)))
print("-------------")
print("WITHOUT PE")
print(paste("Minimum F1:", min(f1_withoutPE)))
print(paste("Average F1:", mean(f1_withoutPE)))
print(paste("Maximum F1:", max(f1_withoutPE)))
tn <- Sys.time()
print(tn-t0)