In [132]:
# function to load data
loadData <- function() {
    fname <- "../facies_vectors.csv"
    data <- read.csv(fname, colClasses=c(rep("factor",3), rep("numeric",6), "factor", "numeric"))
    
    data
}

# function to pre-process the data
preProcessData <- function(data) {
    # convert NM_M channel into a binary channel "isMarine"
    data$NM_M <- data$NM_M == "2"
    names(data)[10] <- "isMarine"

    # make the Facies channel more descriptive
    levels(data$Facies) <- c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS")
    
    data
}

# function to split the data
splitData <- function(data, testWell) {
    testIndex <- data$Well.Name == testWell
    
    train <- data[!testIndex,]
    test <- data[testIndex,]
    split <- list(train, test)
    
    split
}

In [136]:
# load and pre-process the data
data <- loadData()
data <- preProcessData(data)

# split the data
split <- splitData(data, "SHANKLE")
train <- split[[1]]
test <- split[[2]]
rm(split)

"Training Data"
head(train)

"Testing Data"
head(test)


'Training Data'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793.0 77.45 0.664 9.9 11.915 4.6 FALSE 1.000
FSiS A1 SH SHRIMPLIN2793.5 78.26 0.661 14.2 12.565 4.1 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794.0 79.05 0.658 14.8 13.050 3.6 FALSE 0.957
FSiS A1 SH SHRIMPLIN2794.5 86.10 0.655 13.9 13.115 3.5 FALSE 0.936
FSiS A1 SH SHRIMPLIN2795.0 74.58 0.647 13.5 13.300 3.4 FALSE 0.915
FSiS A1 SH SHRIMPLIN2795.5 73.97 0.636 14.0 13.385 3.6 FALSE 0.894
'Testing Data'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
938CSiS A1 SH SHANKLE2774.5 98.36 0.642 -0.1 18.685 2.9 FALSE 1.000
939CSiS A1 SH SHANKLE2775.0 97.57 0.631 7.9 16.745 3.2 FALSE 0.984
940CSiS A1 SH SHANKLE2775.5 98.41 0.615 12.8 14.105 3.2 FALSE 0.968
941CSiS A1 SH SHANKLE2776.0 85.92 0.597 13.0 13.385 3.4 FALSE 0.952
942CSiS A1 SH SHANKLE2776.5 83.16 0.592 12.3 13.345 3.4 FALSE 0.935
943CSiS A1 SH SHANKLE2777.0 82.56 0.599 12.9 13.105 3.5 FALSE 0.919

In [165]:
# subset data - mirroring edges as needed
subsetData <- function(data, top, base) {

    # requested subset within data range
    if (top >= 1 & base <= nrow(data)) {
        subset <- data[top:base,]
        
    # requested data occurs before top - mirror data near top
    } else if (top < 1) {
        s1 <- data[2:(abs(top)+2),]
        #s1$Depth <- data$Depth[1] - abs(data$Depth[1] - s1$Depth)
        #s1$RELPOS <- data$RELPOS[1] + abs(data$RELPOS[1] - s1$RELPOS)
        #s1 <- s1[order(s1$Depth),]
        
        s2 <- data[1:base,]
        
        subset <- rbind(s1, s2)
        
    # requested data occurs after base - mirror data near base
    } else if (base > nrow(data)) {
        s1 <- data[top:nrow(data),]
        
        s2 <- data[(nrow(data)-(base-nrow(data))):(nrow(data)-1),]
        #s2$Depth <- data$Depth[nrow(data)] + abs(data$Depth[nrow(data)] - s2$Depth)
        #s2$RELPOS <- data$RELPOS[nrow(data)] - abs(data$RELPOS[nrow(data)] - s2$RELPOS)
        #s2 <- s2[order(s2$Depth),]
        
        subset <- rbind(s1, s2)
    }
    
    subset
}

In [171]:
padData <- function(data, l) {
    dl <- data[2:(l+1),]
    #dl$Depth <- data$Depth[1] - abs(data$Depth[1] - dl$Depth)
    #dl$RELPOS <- data$RELPOS[1] + abs(data$RELPOS[1] - dl$RELPOS)
    #dl <- dl[order(dl$Depth),]
    
    dr <- data[(nrow(data)-(l)):(nrow(data)-1),]
    #dr$Depth <- data$Depth[nrow(data)] + abs(data$Depth[nrow(data)] - dr$Depth)
    #dr$RELPOS <- data$RELPOS[nrow(data)] - abs(data$RELPOS[nrow(data)] - dr$RELPOS)
    #dr <- dr[order(dr$Depth),]
    
    rbind(dl, data, dr)
}

In [179]:
# function to perform crosscorrelation between two vectors
crossCorrelate <- function(a, b) {
    
    if (sum(is.na(a)) > 0 | sum(is.na(b)) > 0) {
        print("missing values found")
        print(paste(a,b))
    }
    
    # calculate cross-correlation between vectors a and b
    ccor <- ccf(a, b, lag.max=400, plot=F)

    # retrieve the maximum correlation and associated lag
    corr <- max(ccor[["acf"]][,,1])
    lag <- ccor[["lag"]][,,1][which.max(ccor[["acf"]][,,1])]
    
    # return maximum correlation and associated lag
    list(correlation=corr, lag=lag)
}

# apply a function "FUN" over columns of data frames "a" and "b"
# NOTE: FUN must take two arguments, two vectors of data
loopAcrossFeatures <- function(a, b, features, FUN) {
    
    # get list of columns for a and b dataframes
    features_a <- names(a)[names(a) %in% features]
    features_b <- names(b)[names(b) %in% features]

    # ensure a and b data frames have the same features
    try ((if (!all.equal(features_a, features_b)) stop("Error! Data frames do not have the same features.")))
    
    # initialize resulting data frame
    r <- data.frame()
    
    # loop through features
    for (feature in features) {
        # retrieve the feature vector of interest from each data frame
        av <- as.data.frame(a[,which(names(a) %in% feature)])
        bv <- as.data.frame(b[,which(names(b) %in% feature)])
        
        # apply the function to the two features, storing in a data frame
        if (sum(is.na(av[,1])) == 0 & sum(is.na(bv[,1])) == 0) {
            temp <- as.data.frame(FUN(av, bv))
            temp$feature <- feature

            # merge result with those of other features
            r <- rbind(temp,r)    
        }
        
    }
    
    r
}

# function looks at the training data and determines which of the relevant testFeatures it includes
# returns list of features which can be used for training or weighting models
# NOTE: VERY IMPORTANT that given test well has data for all features listed in testFeatures RE: PE feature
whichFeatures <- function(train, testFeatures) {
    
    # ensure features have actual data
    goodFeatures <- names(train)[apply(train, 2, function(x) {sum(is.na(x))==0})]
    
    # certain features we're just not interested in modeling
    badFeatures <- c("Formation", "Well.Name", "Depth")
    if (sum(train$Well.Name == "Recruit F9") == nrow(train)) badFeatures <- c(badFeatures, "RELPOS")
    goodFeatures <- goodFeatures[!goodFeatures %in% badFeatures]

    # finally, we only want to include features which also exist in the test data set
    goodFeatures <- goodFeatures[goodFeatures %in% testFeatures]

    goodFeatures
}

In [245]:
l <- 16

train <- data.frame(Well.Name=c(rep("A",100), rep("B",100), rep("C",100)), x=rep(0,300))
train[50:60, "x"] <- 1
train[160:170, "x"] <- 1
train[270:280, "x"] <- 1
train[40:45, "x"] <- 2
train[150:155, "x"] <- 2
train[237:250, "x"] <- 2
train$Facies <- as.factor(train$x)
levels(train$Facies) <- c("Easy", "Medium", "Hard")
train$x <- train$x + rnorm(300, 0, .25)

test <- data.frame(Well.Name=rep("D",30), x=rep(0,30))
test[10:20, "x"] <- 1
test[1:5, "x"] <- 2
test$Facies <- as.factor(test$x)
levels(test$Facies) <- c("Easy", "Medium", "Hard")
test$x <- test$x + rnorm(30, 0, .25)

In [244]:
corrPredict <- function(train, test, l) {
    wells <- unique(train$Well.Name)
    crossCorrs <- data.frame()
    print(wells)
    for (i in 1:nrow(test)) {
        top <- i - l / 2 + 1
        base <- i + l / 2
        test_i <- subsetData(test, top, base)
        crossCorrs_i <- data.frame()

        for (well_j in wells) {
            train_j <- train[train$Well.Name == well_j,]
            cors <- data.frame(corsq=numeric(), trainWell=factor(), k=numeric())

            for (k in 1:nrow(train_j)) {
                top_k <- k - l / 2 + 1
                base_k <- k + l / 2
                train_jk <- subsetData(train_j, top_k, base_k)

                temp <- data.frame(cor=cor(test_i$x, train_jk$x), trainWell=well_j, k=k)
                cors <- rbind(cors, temp)
            }

            best_j <- cors[which.max(cors$cor),]
            test[i, paste0("Facies_", well_j)] <- train_j[best_j$k[1], "Facies"]
            test[i, paste0("Corr_", well_j)] <- best_j$cor[1]
        }    
    }    
    
    test
}

In [242]:
library(caret)

accuracy_metrics <- function(cm, ytrue) {
    
    # initialize vectors for precision, recall, and f1 metrics with zeros
    prec <- rep(0,3)
    recall <- rep(0,3)
    f1 <- rep(0,3)

    # loop through facies to compute precision, recall, and f1 for each facies
    beta <- 1
    for (i in 1:3) {
        prec[i] <- cm[i,i] / sum(cm[i,])
        recall[i] <- cm[i,i] / sum(cm[,i])
        f1[i] <- (1 + beta^2) * prec[i] * recall[i] / ((beta^2 * prec[i]) + recall[i])
    }
    
    prec[is.na(prec)] <- 0
    recall[is.na(recall)] <- 0
    f1[is.na(f1)] <- 0
    
    support <- as.matrix(table(ytrue))
    tot_precision <- sum(prec * support) / sum(support)
    tot_recall <- sum(recall * support) / sum(support)
    tot_f1 <- sum(f1 * support) / sum(support)
    
    c(tot_precision, tot_recall, tot_f1)
}

eval_model <- function(ypred, ytrue) {
    cm <- confusionMatrix(ypred, ytrue)
    accuracy_metrics(as.matrix(cm[["table"]]), ytrue)
}

In [247]:
ls <- c(16,18,20,22,24)

for (l in ls) {
    t <- corrPredict(train, test, l)
    metrics_a <- eval_model(t$Facies_A, t$Facies)
    metrics_b <- eval_model(t$Facies_B, t$Facies)
    metrics_c <- eval_model(t$Facies_C, t$Facies)

    print(paste("l:", l))
    print(paste("A F1-Score:", metrics_a[3]))
    print(paste("B F1-Score:", metrics_b[3]))
    print(paste("C F1-Score:", metrics_c[3]))
}


[1] A B C
Levels: A B C
[1] "l: 16"
[1] "A F1-Score: 0.758222222222222"
[1] "B F1-Score: 0.728888888888889"
[1] "C F1-Score: 0.736108550947261"
[1] A B C
Levels: A B C
[1] "l: 18"
[1] "A F1-Score: 0.732098765432099"
[1] "B F1-Score: 0.73262599469496"
[1] "C F1-Score: 0.778877171215881"
[1] A B C
Levels: A B C
[1] "l: 20"
[1] "A F1-Score: 0.728647214854111"
[1] "B F1-Score: 0.760831122900088"
[1] "C F1-Score: 0.676574074074074"
[1] A B C
Levels: A B C
[1] "l: 22"
[1] "A F1-Score: 0.80212096332786"
[1] "B F1-Score: 0.832478632478633"
[1] "C F1-Score: 0.675462962962963"
[1] A B C
Levels: A B C
[1] "l: 24"
[1] "A F1-Score: 0.884571428571429"
[1] "B F1-Score: 0.88151828847481"
[1] "C F1-Score: 0.628581871345029"