notebook.community

Edit and run



In [132]:

    
# function to load data
loadData <- function() {
    fname <- "../facies_vectors.csv"
    data <- read.csv(fname, colClasses=c(rep("factor",3), rep("numeric",6), "factor", "numeric"))
    
    data
}

# function to pre-process the data
preProcessData <- function(data) {
    # convert NM_M channel into a binary channel "isMarine"
    data$NM_M <- data$NM_M == "2"
    names(data)[10] <- "isMarine"

    # make the Facies channel more descriptive
    levels(data$Facies) <- c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS")
    
    data
}

# function to split the data
splitData <- function(data, testWell) {
    testIndex <- data$Well.Name == testWell
    
    train <- data[!testIndex,]
    test <- data[testIndex,]
    split <- list(train, test)
    
    split
}



In [136]:

    
# load and pre-process the data
data <- loadData()
data <- preProcessData(data)

# split the data
split <- splitData(data, "SHANKLE")
train <- split[[1]]
test <- split[[2]]
rm(split)

"Training Data"
head(train)

"Testing Data"
head(test)









    




'Training Data'






    





Facies Formation Well.Name Depth GR ILD_log10 DeltaPHI PHIND PE isMarine RELPOS

	FSiS     A1 SH    SHRIMPLIN 2793.0   77.45    0.664     9.9     11.915   4.6      FALSE    1.000    
	FSiS     A1 SH    SHRIMPLIN 2793.5   78.26    0.661    14.2     12.565   4.1      FALSE    0.979    
	FSiS     A1 SH    SHRIMPLIN 2794.0   79.05    0.658    14.8     13.050   3.6      FALSE    0.957    
	FSiS     A1 SH    SHRIMPLIN 2794.5   86.10    0.655    13.9     13.115   3.5      FALSE    0.936    
	FSiS     A1 SH    SHRIMPLIN 2795.0   74.58    0.647    13.5     13.300   3.4      FALSE    0.915    
	FSiS     A1 SH    SHRIMPLIN 2795.5   73.97    0.636    14.0     13.385   3.6      FALSE    0.894    









    




'Testing Data'






    





Facies Formation Well.Name Depth GR ILD_log10 DeltaPHI PHIND PE isMarine RELPOS

	938 CSiS   A1 SH  SHANKLE 2774.5 98.36  0.642  -0.1   18.685 2.9    FALSE  1.000  
	939 CSiS   A1 SH  SHANKLE 2775.0 97.57  0.631   7.9   16.745 3.2    FALSE  0.984  
	940 CSiS   A1 SH  SHANKLE 2775.5 98.41  0.615  12.8   14.105 3.2    FALSE  0.968  
	941 CSiS   A1 SH  SHANKLE 2776.0 85.92  0.597  13.0   13.385 3.4    FALSE  0.952  
	942 CSiS   A1 SH  SHANKLE 2776.5 83.16  0.592  12.3   13.345 3.4    FALSE  0.935  
	943 CSiS   A1 SH  SHANKLE 2777.0 82.56  0.599  12.9   13.105 3.5    FALSE  0.919



In [165]:

    
# subset data - mirroring edges as needed
subsetData <- function(data, top, base) {

    # requested subset within data range
    if (top >= 1 & base <= nrow(data)) {
        subset <- data[top:base,]
        
    # requested data occurs before top - mirror data near top
    } else if (top < 1) {
        s1 <- data[2:(abs(top)+2),]
        #s1$Depth <- data$Depth[1] - abs(data$Depth[1] - s1$Depth)
        #s1$RELPOS <- data$RELPOS[1] + abs(data$RELPOS[1] - s1$RELPOS)
        #s1 <- s1[order(s1$Depth),]
        
        s2 <- data[1:base,]
        
        subset <- rbind(s1, s2)
        
    # requested data occurs after base - mirror data near base
    } else if (base > nrow(data)) {
        s1 <- data[top:nrow(data),]
        
        s2 <- data[(nrow(data)-(base-nrow(data))):(nrow(data)-1),]
        #s2$Depth <- data$Depth[nrow(data)] + abs(data$Depth[nrow(data)] - s2$Depth)
        #s2$RELPOS <- data$RELPOS[nrow(data)] - abs(data$RELPOS[nrow(data)] - s2$RELPOS)
        #s2 <- s2[order(s2$Depth),]
        
        subset <- rbind(s1, s2)
    }
    
    subset
}



In [171]:

    
padData <- function(data, l) {
    dl <- data[2:(l+1),]
    #dl$Depth <- data$Depth[1] - abs(data$Depth[1] - dl$Depth)
    #dl$RELPOS <- data$RELPOS[1] + abs(data$RELPOS[1] - dl$RELPOS)
    #dl <- dl[order(dl$Depth),]
    
    dr <- data[(nrow(data)-(l)):(nrow(data)-1),]
    #dr$Depth <- data$Depth[nrow(data)] + abs(data$Depth[nrow(data)] - dr$Depth)
    #dr$RELPOS <- data$RELPOS[nrow(data)] - abs(data$RELPOS[nrow(data)] - dr$RELPOS)
    #dr <- dr[order(dr$Depth),]
    
    rbind(dl, data, dr)
}



In [179]:

    
# function to perform crosscorrelation between two vectors
crossCorrelate <- function(a, b) {
    
    if (sum(is.na(a)) > 0 | sum(is.na(b)) > 0) {
        print("missing values found")
        print(paste(a,b))
    }
    
    # calculate cross-correlation between vectors a and b
    ccor <- ccf(a, b, lag.max=400, plot=F)

    # retrieve the maximum correlation and associated lag
    corr <- max(ccor[["acf"]][,,1])
    lag <- ccor[["lag"]][,,1][which.max(ccor[["acf"]][,,1])]
    
    # return maximum correlation and associated lag
    list(correlation=corr, lag=lag)
}

# apply a function "FUN" over columns of data frames "a" and "b"
# NOTE: FUN must take two arguments, two vectors of data
loopAcrossFeatures <- function(a, b, features, FUN) {
    
    # get list of columns for a and b dataframes
    features_a <- names(a)[names(a) %in% features]
    features_b <- names(b)[names(b) %in% features]

    # ensure a and b data frames have the same features
    try ((if (!all.equal(features_a, features_b)) stop("Error! Data frames do not have the same features.")))
    
    # initialize resulting data frame
    r <- data.frame()
    
    # loop through features
    for (feature in features) {
        # retrieve the feature vector of interest from each data frame
        av <- as.data.frame(a[,which(names(a) %in% feature)])
        bv <- as.data.frame(b[,which(names(b) %in% feature)])
        
        # apply the function to the two features, storing in a data frame
        if (sum(is.na(av[,1])) == 0 & sum(is.na(bv[,1])) == 0) {
            temp <- as.data.frame(FUN(av, bv))
            temp$feature <- feature

            # merge result with those of other features
            r <- rbind(temp,r)    
        }
        
    }
    
    r
}

# function looks at the training data and determines which of the relevant testFeatures it includes
# returns list of features which can be used for training or weighting models
# NOTE: VERY IMPORTANT that given test well has data for all features listed in testFeatures RE: PE feature
whichFeatures <- function(train, testFeatures) {
    
    # ensure features have actual data
    goodFeatures <- names(train)[apply(train, 2, function(x) {sum(is.na(x))==0})]
    
    # certain features we're just not interested in modeling
    badFeatures <- c("Formation", "Well.Name", "Depth")
    if (sum(train$Well.Name == "Recruit F9") == nrow(train)) badFeatures <- c(badFeatures, "RELPOS")
    goodFeatures <- goodFeatures[!goodFeatures %in% badFeatures]

    # finally, we only want to include features which also exist in the test data set
    goodFeatures <- goodFeatures[goodFeatures %in% testFeatures]

    goodFeatures
}



In [245]:

    
l <- 16

train <- data.frame(Well.Name=c(rep("A",100), rep("B",100), rep("C",100)), x=rep(0,300))
train[50:60, "x"] <- 1
train[160:170, "x"] <- 1
train[270:280, "x"] <- 1
train[40:45, "x"] <- 2
train[150:155, "x"] <- 2
train[237:250, "x"] <- 2
train$Facies <- as.factor(train$x)
levels(train$Facies) <- c("Easy", "Medium", "Hard")
train$x <- train$x + rnorm(300, 0, .25)

test <- data.frame(Well.Name=rep("D",30), x=rep(0,30))
test[10:20, "x"] <- 1
test[1:5, "x"] <- 2
test$Facies <- as.factor(test$x)
levels(test$Facies) <- c("Easy", "Medium", "Hard")
test$x <- test$x + rnorm(30, 0, .25)



In [244]:

    
corrPredict <- function(train, test, l) {
    wells <- unique(train$Well.Name)
    crossCorrs <- data.frame()
    print(wells)
    for (i in 1:nrow(test)) {
        top <- i - l / 2 + 1
        base <- i + l / 2
        test_i <- subsetData(test, top, base)
        crossCorrs_i <- data.frame()

        for (well_j in wells) {
            train_j <- train[train$Well.Name == well_j,]
            cors <- data.frame(corsq=numeric(), trainWell=factor(), k=numeric())

            for (k in 1:nrow(train_j)) {
                top_k <- k - l / 2 + 1
                base_k <- k + l / 2
                train_jk <- subsetData(train_j, top_k, base_k)

                temp <- data.frame(cor=cor(test_i$x, train_jk$x), trainWell=well_j, k=k)
                cors <- rbind(cors, temp)
            }

            best_j <- cors[which.max(cors$cor),]
            test[i, paste0("Facies_", well_j)] <- train_j[best_j$k[1], "Facies"]
            test[i, paste0("Corr_", well_j)] <- best_j$cor[1]
        }    
    }    
    
    test
}



In [242]:

    
library(caret)

accuracy_metrics <- function(cm, ytrue) {
    
    # initialize vectors for precision, recall, and f1 metrics with zeros
    prec <- rep(0,3)
    recall <- rep(0,3)
    f1 <- rep(0,3)

    # loop through facies to compute precision, recall, and f1 for each facies
    beta <- 1
    for (i in 1:3) {
        prec[i] <- cm[i,i] / sum(cm[i,])
        recall[i] <- cm[i,i] / sum(cm[,i])
        f1[i] <- (1 + beta^2) * prec[i] * recall[i] / ((beta^2 * prec[i]) + recall[i])
    }
    
    prec[is.na(prec)] <- 0
    recall[is.na(recall)] <- 0
    f1[is.na(f1)] <- 0
    
    support <- as.matrix(table(ytrue))
    tot_precision <- sum(prec * support) / sum(support)
    tot_recall <- sum(recall * support) / sum(support)
    tot_f1 <- sum(f1 * support) / sum(support)
    
    c(tot_precision, tot_recall, tot_f1)
}

eval_model <- function(ypred, ytrue) {
    cm <- confusionMatrix(ypred, ytrue)
    accuracy_metrics(as.matrix(cm[["table"]]), ytrue)
}



In [247]:

    
ls <- c(16,18,20,22,24)

for (l in ls) {
    t <- corrPredict(train, test, l)
    metrics_a <- eval_model(t$Facies_A, t$Facies)
    metrics_b <- eval_model(t$Facies_B, t$Facies)
    metrics_c <- eval_model(t$Facies_C, t$Facies)

    print(paste("l:", l))
    print(paste("A F1-Score:", metrics_a[3]))
    print(paste("B F1-Score:", metrics_b[3]))
    print(paste("C F1-Score:", metrics_c[3]))
}









    



[1] A B C
Levels: A B C
[1] "l: 16"
[1] "A F1-Score: 0.758222222222222"
[1] "B F1-Score: 0.728888888888889"
[1] "C F1-Score: 0.736108550947261"
[1] A B C
Levels: A B C
[1] "l: 18"
[1] "A F1-Score: 0.732098765432099"
[1] "B F1-Score: 0.73262599469496"
[1] "C F1-Score: 0.778877171215881"
[1] A B C
Levels: A B C
[1] "l: 20"
[1] "A F1-Score: 0.728647214854111"
[1] "B F1-Score: 0.760831122900088"
[1] "C F1-Score: 0.676574074074074"
[1] A B C
Levels: A B C
[1] "l: 22"
[1] "A F1-Score: 0.80212096332786"
[1] "B F1-Score: 0.832478632478633"
[1] "C F1-Score: 0.675462962962963"
[1] A B C
Levels: A B C
[1] "l: 24"
[1] "A F1-Score: 0.884571428571429"
[1] "B F1-Score: 0.88151828847481"
[1] "C F1-Score: 0.628581871345029"

Facies	Formation	Well.Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	isMarine	RELPOS
FSiS	A1 SH	SHRIMPLIN	2793.0	77.45	0.664	9.9	11.915	4.6	FALSE	1.000
FSiS	A1 SH	SHRIMPLIN	2793.5	78.26	0.661	14.2	12.565	4.1	FALSE	0.979
FSiS	A1 SH	SHRIMPLIN	2794.0	79.05	0.658	14.8	13.050	3.6	FALSE	0.957
FSiS	A1 SH	SHRIMPLIN	2794.5	86.10	0.655	13.9	13.115	3.5	FALSE	0.936
FSiS	A1 SH	SHRIMPLIN	2795.0	74.58	0.647	13.5	13.300	3.4	FALSE	0.915
FSiS	A1 SH	SHRIMPLIN	2795.5	73.97	0.636	14.0	13.385	3.6	FALSE	0.894

	Facies	Formation	Well.Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	isMarine	RELPOS
938	CSiS	A1 SH	SHANKLE	2774.5	98.36	0.642	-0.1	18.685	2.9	FALSE	1.000
939	CSiS	A1 SH	SHANKLE	2775.0	97.57	0.631	7.9	16.745	3.2	FALSE	0.984
940	CSiS	A1 SH	SHANKLE	2775.5	98.41	0.615	12.8	14.105	3.2	FALSE	0.968
941	CSiS	A1 SH	SHANKLE	2776.0	85.92	0.597	13.0	13.385	3.4	FALSE	0.952
942	CSiS	A1 SH	SHANKLE	2776.5	83.16	0.592	12.3	13.345	3.4	FALSE	0.935
943	CSiS	A1 SH	SHANKLE	2777.0	82.56	0.599	12.9	13.105	3.5	FALSE	0.919