In [132]:
# function to load data
loadData <- function() {
    fname <- "../facies_vectors.csv"
    data <- read.csv(fname, colClasses=c(rep("factor",3), rep("numeric",6), "factor", "numeric"))
    data
}
# function to pre-process the data
preProcessData <- function(data) {
    # convert the NM_M channel into a binary channel, "isMarine"
    data$NM_M <- data$NM_M == "2"
    names(data)[10] <- "isMarine"
    # give the Facies channel more descriptive level names
    levels(data$Facies) <- c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS")
    data
}
# function to split the data into training and test sets by well
splitData <- function(data, testWell) {
    testIndex <- data$Well.Name == testWell
    train <- data[!testIndex,]
    test <- data[testIndex,]
    list(train, test)
}
In [136]:
# load and pre-process the data
data <- loadData()
data <- preProcessData(data)
# split the data, holding out the SHANKLE well for testing
split <- splitData(data, "SHANKLE")
train <- split[[1]]
test <- split[[2]]
rm(split)
"Training Data"
head(train)
"Testing Data"
head(test)
In [165]:
# subset data[top:base,] - mirroring edges as needed
# NOTE: assumes the window is shorter than the data, so at most one edge is crossed
subsetData <- function(data, top, base) {
    if (top >= 1 & base <= nrow(data)) {
        # requested subset lies entirely within the data range
        subset <- data[top:base,]
    } else if (top < 1) {
        # requested window starts before the top - mirror data near the top
        s1 <- data[2:(abs(top)+2),]
        #s1$Depth <- data$Depth[1] - abs(data$Depth[1] - s1$Depth)
        #s1$RELPOS <- data$RELPOS[1] + abs(data$RELPOS[1] - s1$RELPOS)
        #s1 <- s1[order(s1$Depth),]
        s2 <- data[1:base,]
        subset <- rbind(s1, s2)
    } else if (base > nrow(data)) {
        # requested window extends past the base - mirror data near the base
        s1 <- data[top:nrow(data),]
        s2 <- data[(nrow(data)-(base-nrow(data))):(nrow(data)-1),]
        #s2$Depth <- data$Depth[nrow(data)] + abs(data$Depth[nrow(data)] - s2$Depth)
        #s2$RELPOS <- data$RELPOS[nrow(data)] - abs(data$RELPOS[nrow(data)] - s2$RELPOS)
        #s2 <- s2[order(s2$Depth),]
        subset <- rbind(s1, s2)
    }
    subset
}
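A quick sanity check of the edge handling, using a hypothetical 10-row toy frame (not the contest data); all three calls should return a window of the full requested length:
In [ ]:
# an 8-row window should come back with 8 rows whether it sits inside
# the data or runs off either edge
d <- data.frame(Depth=1:10, x=rnorm(10))
c(inside=nrow(subsetData(d, 2, 9)),
  off_top=nrow(subsetData(d, -1, 6)),
  off_base=nrow(subsetData(d, 5, 12)))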
In [171]:
# pad data with l rows of mirrored context on each end
padData <- function(data, l) {
    dl <- data[2:(l+1),]
    #dl$Depth <- data$Depth[1] - abs(data$Depth[1] - dl$Depth)
    #dl$RELPOS <- data$RELPOS[1] + abs(data$RELPOS[1] - dl$RELPOS)
    #dl <- dl[order(dl$Depth),]
    dr <- data[(nrow(data)-l):(nrow(data)-1),]
    #dr$Depth <- data$Depth[nrow(data)] + abs(data$Depth[nrow(data)] - dr$Depth)
    #dr$RELPOS <- data$RELPOS[nrow(data)] - abs(data$RELPOS[nrow(data)] - dr$RELPOS)
    #dr <- dr[order(dr$Depth),]
    rbind(dl, data, dr)
}
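And a matching check for padData, again on a hypothetical toy frame:
In [ ]:
# padData should add l rows of context on each side: 10 + 2*4 = 18
d <- data.frame(Depth=1:10, x=rnorm(10))
nrow(padData(d, 4))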
In [179]:
# function to perform cross-correlation between two vectors
crossCorrelate <- function(a, b) {
    if (anyNA(a) || anyNA(b)) {
        print("missing values found")
        print(paste(a, b))
    }
    # calculate the cross-correlation between vectors a and b
    ccor <- ccf(a, b, lag.max=400, plot=F)
    # retrieve the maximum correlation and its associated lag
    corr <- max(ccor[["acf"]][,,1])
    lag <- ccor[["lag"]][,,1][which.max(ccor[["acf"]][,,1])]
    # return the maximum correlation and associated lag
    list(correlation=corr, lag=lag)
}
# apply a function "FUN" over columns of data frames "a" and "b"
# NOTE: FUN must take two arguments, two vectors of data
loopAcrossFeatures <- function(a, b, features, FUN) {
    # get the list of requested columns present in each data frame
    features_a <- names(a)[names(a) %in% features]
    features_b <- names(b)[names(b) %in% features]
    # ensure the a and b data frames have the same features
    if (!identical(features_a, features_b)) stop("Data frames do not have the same features.")
    # initialize the resulting data frame
    r <- data.frame()
    # loop through the available features
    for (feature in features_a) {
        # retrieve the feature vector of interest from each data frame
        av <- a[[feature]]
        bv <- b[[feature]]
        # apply the function to the two vectors, skipping features with missing values
        if (!anyNA(av) && !anyNA(bv)) {
            temp <- as.data.frame(FUN(av, bv))
            temp$feature <- feature
            # merge the result with those of the other features
            r <- rbind(temp, r)
        }
    }
    r
}
# function looks at the training data and determines which of the relevant testFeatures it includes
# returns the list of features which can be used for training or weighting models
# NOTE: it is very important that the given test well has data for every feature listed in
# testFeatures (this matters for the PE feature, which is missing from some wells)
whichFeatures <- function(train, testFeatures) {
    # ensure features have no missing data
    goodFeatures <- names(train)[sapply(train, function(x) {sum(is.na(x)) == 0})]
    # certain features we're just not interested in modeling
    badFeatures <- c("Formation", "Well.Name", "Depth")
    if (sum(train$Well.Name == "Recruit F9") == nrow(train)) badFeatures <- c(badFeatures, "RELPOS")
    goodFeatures <- goodFeatures[!goodFeatures %in% badFeatures]
    # finally, we only want to keep features which also exist in the test data set
    goodFeatures <- goodFeatures[goodFeatures %in% testFeatures]
    goodFeatures
}
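A minimal sketch of what crossCorrelate recovers, using toy vectors rather than the contest data: delaying a signal by a known number of samples should put the correlation peak at (roughly) that lag, with the sign following ccf's convention.
In [ ]:
# toy example: b is a copy of a delayed by 5 samples, so the peak of
# ccf(a, b) should land near lag -5 (a[t-5] best matches b[t])
set.seed(42)
a <- sin(seq(0, 20, length.out=500)) + rnorm(500, 0, 0.1)
b <- c(rep(0, 5), a[1:495])
crossCorrelate(a, b)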
In [245]:
# build three synthetic training wells (A, B, C); x encodes the facies (0, 1, 2)
l <- 16
train <- data.frame(Well.Name=c(rep("A",100), rep("B",100), rep("C",100)), x=rep(0,300))
train[50:60, "x"] <- 1
train[160:170, "x"] <- 1
train[270:280, "x"] <- 1
train[40:45, "x"] <- 2
train[150:155, "x"] <- 2
train[237:250, "x"] <- 2
train$Facies <- as.factor(train$x)
levels(train$Facies) <- c("Easy", "Medium", "Hard")
train$x <- train$x + rnorm(300, 0, .25)
test <- data.frame(Well.Name=rep("D",30), x=rep(0,30))
test[10:20, "x"] <- 1
test[1:5, "x"] <- 2
test$Facies <- as.factor(test$x)
levels(test$Facies) <- c("Easy", "Medium", "Hard")
test$x <- test$x + rnorm(30, 0, .25)
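As a quick sanity check on the synthetic data, tabulate the facies counts per well; each facies should appear in both the training wells and the test well:
In [ ]:
# facies counts per synthetic well
print(table(train$Well.Name, train$Facies))
print(table(test$Facies))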
In [244]:
# for each test sample, slide a window of length l down every training well
# and take the facies at the best-correlated position as that well's prediction
corrPredict <- function(train, test, l) {
    wells <- unique(train$Well.Name)
    print(wells)
    for (i in 1:nrow(test)) {
        # window of length l (assumed even) centred on test sample i
        top <- i - l / 2 + 1
        base <- i + l / 2
        test_i <- subsetData(test, top, base)
        for (well_j in wells) {
            train_j <- train[train$Well.Name == well_j,]
            cors <- data.frame()
            for (k in 1:nrow(train_j)) {
                # matching window of length l centred on training sample k
                top_k <- k - l / 2 + 1
                base_k <- k + l / 2
                train_jk <- subsetData(train_j, top_k, base_k)
                temp <- data.frame(cor=cor(test_i$x, train_jk$x), trainWell=well_j, k=k)
                cors <- rbind(cors, temp)
            }
            # record the facies and correlation at the best-matching position
            best_j <- cors[which.max(cors$cor),]
            test[i, paste0("Facies_", well_j)] <- train_j[best_j$k[1], "Facies"]
            test[i, paste0("Corr_", well_j)] <- best_j$cor[1]
        }
    }
    test
}
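An illustrative single run on the synthetic wells; each test sample picks up a predicted facies and the peak correlation from every training well:
In [ ]:
# one run with a 16-sample window; inspect the per-well predictions
head(corrPredict(train, test, 16)[, c("Facies", "Facies_A", "Corr_A")])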
In [242]:
library(caret)
# compute support-weighted precision, recall, and F1 from a confusion matrix
# NOTE: hard-coded for three classes, matching the synthetic facies above
accuracy_metrics <- function(cm, ytrue) {
    # initialize vectors for the precision, recall, and f1 metrics with zeros
    prec <- rep(0, 3)
    recall <- rep(0, 3)
    f1 <- rep(0, 3)
    # loop through the facies to compute precision, recall, and f1 for each
    beta <- 1
    for (i in 1:3) {
        prec[i] <- cm[i,i] / sum(cm[i,])
        recall[i] <- cm[i,i] / sum(cm[,i])
        f1[i] <- (1 + beta^2) * prec[i] * recall[i] / ((beta^2 * prec[i]) + recall[i])
    }
    # classes that are never predicted (or never occur) produce NaNs; score them as zero
    prec[is.na(prec)] <- 0
    recall[is.na(recall)] <- 0
    f1[is.na(f1)] <- 0
    # weight each class's metrics by its support (the count of true samples)
    support <- as.matrix(table(ytrue))
    tot_precision <- sum(prec * support) / sum(support)
    tot_recall <- sum(recall * support) / sum(support)
    tot_f1 <- sum(f1 * support) / sum(support)
    c(tot_precision, tot_recall, tot_f1)
}
eval_model <- function(ypred, ytrue) {
    cm <- confusionMatrix(ypred, ytrue)
    accuracy_metrics(as.matrix(cm[["table"]]), ytrue)
}
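A tiny hand-made example of the metrics, assuming two of three predictions are correct; the "Hard" class is never predicted, so its zeroed scores pull down the weighted averages:
In [ ]:
# toy check: weighted precision, recall, and F1 for 2/3 correct predictions
yt <- factor(c("Easy", "Medium", "Hard"), levels=c("Easy", "Medium", "Hard"))
yp <- factor(c("Easy", "Medium", "Medium"), levels=c("Easy", "Medium", "Hard"))
eval_model(yp, yt)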
In [247]:
# sweep over window lengths and report the F1-score per training well
ls <- c(16, 18, 20, 22, 24)
for (l in ls) {
    preds <- corrPredict(train, test, l)
    metrics_a <- eval_model(preds$Facies_A, preds$Facies)
    metrics_b <- eval_model(preds$Facies_B, preds$Facies)
    metrics_c <- eval_model(preds$Facies_C, preds$Facies)
    print(paste("l:", l))
    print(paste("A F1-Score:", metrics_a[3]))
    print(paste("B F1-Score:", metrics_b[3]))
    print(paste("C F1-Score:", metrics_c[3]))
}