In [1]:
source("loadData.R")

# load and clean the data
raw <- loadData()

"Raw data:"
format(head(raw,3), digits=3)

clean <- cleanData(raw)

"Cleaned data:"
format(head(clean,3), digits=3)

dataPrime <- data.frame()
wells <- unique(clean$Well.Name)

for (well_i in wells) {
    data_i <- clean[clean$Well.Name == well_i,]
    
    data_i$GR <- (data_i$GR - mean(data_i$GR, na.rm=T)) / sd(data_i$GR, na.rm=T)
    data_i$ILD_log10 <- (data_i$ILD_log10 - mean(data_i$ILD_log10, na.rm=T)) / sd(data_i$ILD_log10, na.rm=T)
    data_i$DeltaPHI <- (data_i$DeltaPHI - mean(data_i$DeltaPHI, na.rm=T)) / sd(data_i$DeltaPHI, na.rm=T)
    data_i$PHIND <- (data_i$PHIND - mean(data_i$PHIND, na.rm=T)) / sd(data_i$PHIND, na.rm=T)
    data_i$PE <- (data_i$PE - mean(data_i$PE, na.rm=T)) / sd(data_i$PE, na.rm=T)
    
    dataPrime <- rbind(dataPrime, data_i)
}

cs <- dataPrime
rm(dataPrime)

"Centered and scaled data:"
format(head(cs,3), digits=3)


'Raw data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPENM_MRELPOS
3 A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 1 1.000
3 A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 1 0.979
3 A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 1 0.957
'Cleaned data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 FALSE 1.000
FSiS A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 FALSE 0.957
'Centered and scaled data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793 0.216 0.01855 0.512 -0.0487 0.421 FALSE 1.000
FSiS A1 SH SHRIMPLIN2794 0.237 0.00567 1.517 0.0736 -0.133 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794 0.258 -0.00721 1.657 0.1648 -0.687 FALSE 0.957

In [2]:
source("preProcData.R")

paste("# columns before forward, backward, and central difference feature-building:", ncol(cs))

lag <- lagData(cs, 30)

paste("# columns after lag feature-building:", ncol(lag))
format(head(lag,3), digits=3)


'# columns before forward, backward, and central difference feature-building: 11'
'# columns after lag feature-building: 191'
FaciesFormationWell.NameDepthRELPOSGR_n15ILD_log10_n15DeltaPHI_n15PHIND_n15isMarine_n15...DeltaPHI_14PHIND_14isMarine_14PE_14GR_15ILD_log10_15DeltaPHI_15PHIND_15isMarine_15PE_15
FSiS A1 SH SHRIMPLIN2793 1.000 0.23435 -0.827 0.956 -0.0487 FALSE ... 1.540 -0.0487 FALSE 0.421 0.234 -0.827 0.956 -0.0487 FALSE 0.421
FSiS A1 SH SHRIMPLIN2794 0.979 -0.00292 -0.952 0.419 0.0736 FALSE ... 1.307 0.0736 FALSE -0.133 0.555 -0.604 1.540 0.0736 FALSE -0.133
FSiS A1 SH SHRIMPLIN2794 0.957 -0.15735 -1.038 0.255 0.1648 FALSE ... 0.839 0.1648 FALSE -0.687 0.376 -0.527 1.307 0.1648 FALSE -0.687

Training functions


In [89]:
source("trainingFunctions.R")

buildBlendedModel <- function(train, test, p=.25, recruit_wgt=.5) {
    blendedModel <- list()

    test_wells <- unique(test$Well.Name)
    
    # build a blended model for each well in the test data
    for (well_i in test_wells) {
        test_i <- test[test$Well.Name == well_i,]
        test_iso <- max(test$Depth) - min(test$Depth)
        
        # if test well has no PE log - remove PE features from training data
        if (sum(is.na(test_i$PE_0)) > 0) {
            train_i <- subset(train, select=-c(PE_n15, PE_n14, PE_n13, PE_n12, PE_n11, PE_n10, PE_n9, PE_n8, PE_n7, PE_n6, PE_n5,
                                      PE_n4, PE_n3, PE_n2, PE_n1, PE_0, PE_1, PE_2, PE_3, PE_4, PE_5, PE_6, PE_7, PE_8,
                                      PE_9, PE_10, PE_11, PE_12, PE_13, PE_14, PE_15))
        } else {
            train_i <- train
        }
        
        # train and weight models
        blendedModel[[well_i]][["fits"]] <- trainBlendedModel(train_i)
        blendedModel[[well_i]][["weights"]] <- weightBlendedModel(train_i, test_iso, p, recruit_wgt)
    }
    
    blendedModel
}

Evaluation functions


In [90]:
source("evaluationFunctions.R")

predictBlendedModel <- function(test, 
                                blendedModel, 
                                classes=c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS")
                               ) {
    
    testPrime <- data.frame()
    
    test_wells <- unique(test$Well.Name)
    
    for (well_i in test_wells) {
        test_i <- test[test$Well.Name == well_i,]
        votes <- tallyVotes(test_i, blendedModel[[well_i]], classes)
        test_i$Predicted <- electClass(test_i, votes)
        
        testPrime <- rbind(testPrime, test_i)
    }
    
    testPrime
}

Tuning blending parameters


In [ ]:
source("accuracyMetrics.R")
options(warn=-1)

t0 <- Sys.time()

train <- lag[lag$Well.Name != "SHRIMPLIN" & lag$Well.Name != "CHURCHMAN BIBLE",]
test <- lag[lag$Well.Name == "SHRIMPLIN" | lag$Well.Name == "CHURCHMAN BIBLE",]

ps <- c(.5)
rws <- c(.5)

for (p in ps) {
    for (rw in rws) {
        blendedModel <- buildBlendedModel(train, test, p, rw)
        testPrime <- predictBlendedModel(test, blendedModel)
        f1 <- myF1Metric(testPrime$Predicted, testPrime$Facies)
        
        print(paste("inv dist parameter p:", p, ", recruit weight:", rw, ", f1-score", round(f1,4)))
        print("-------------")
    }
}

tn <- Sys.time()
print(tn-t0)


Loading required package: randomForest
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin

Cross-validation


In [20]:
source("accuracyMetrics.R")

t0 <- Sys.time()

f1 <- NULL

wells <- unique(lag$Well.Name)
wells <- wells[!wells %in% "Recruit F9"]

for (i in 1:(length(wells)-1)) {
    for (j in (i+1):length(wells)) {
        trainIndex <- lag$Well.Name != wells[i] & lag$Well.Name != wells[j]
        train <- lag[trainIndex,]
        test <- lag[!trainIndex,]

        blendedModel <- buildBlendedModel(train, test)
        testPrime <- predictBlendedModel(test, blendedModel)
        
        f1_i <- myF1Metric(testPrime$Predicted, testPrime$Facies)
        f1 <- c(f1, f1_i)
        
        print(paste("Test well 1:", wells[i], ", Test well 2:", wells[j], ", f1-score:", f1_i))
        print("-------------")
    }
}

print(paste("Minimum F1:", min(f1)))
print(paste("Average F1:", mean(f1)))
print(paste("Maximum F1:", max(f1)))

tn <- Sys.time()
print(tn-t0)


[1] "Test well 1: SHRIMPLIN , Test well 2: ALEXANDER D , f1-score: 0.644444444444444"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: SHANKLE , f1-score: 0.616783216783217"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: LUKE G U , f1-score: 0.603351955307263"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: KIMZEY A , f1-score: 0.678474114441417"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: CROSS H CATTLE , f1-score: 0.674715909090909"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: NOLAN , f1-score: 0.573226544622426"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: NEWBY , f1-score: 0.520763187429854"
[1] "-------------"
[1] "Test well 1: SHRIMPLIN , Test well 2: CHURCHMAN BIBLE , f1-score: 0.63001485884101"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: SHANKLE , f1-score: 0.625"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: LUKE G U , f1-score: 0.653498871331828"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: KIMZEY A , f1-score: 0.56447963800905"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: CROSS H CATTLE , f1-score: 0.646666666666667"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: NOLAN , f1-score: 0.579908675799087"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: NEWBY , f1-score: 0.57558790593505"
[1] "-------------"
[1] "Test well 1: ALEXANDER D , Test well 2: CHURCHMAN BIBLE , f1-score: 0.599201065246338"
[1] "-------------"
[1] "Test well 1: SHANKLE , Test well 2: LUKE G U , f1-score: 0.605504587155963"
[1] "-------------"
[1] "Test well 1: SHANKLE , Test well 2: KIMZEY A , f1-score: 0.638680659670165"
[1] "-------------"
[1] "Test well 1: SHANKLE , Test well 2: CROSS H CATTLE , f1-score: 0.627300613496933"
[1] "-------------"
[1] "Test well 1: SHANKLE , Test well 2: NOLAN , f1-score: 0.646808510638298"
[1] "-------------"
[1] "Test well 1: SHANKLE , Test well 2: NEWBY , f1-score: 0.540939597315436"
[1] "-------------"
[1] "Test well 1: SHANKLE , Test well 2: CHURCHMAN BIBLE , f1-score: 0.579207920792079"
[1] "-------------"
[1] "Test well 1: LUKE G U , Test well 2: KIMZEY A , f1-score: 0.650190114068441"
[1] "-------------"
[1] "Test well 1: LUKE G U , Test well 2: CROSS H CATTLE , f1-score: 0.648337595907928"
[1] "-------------"
[1] "Test well 1: LUKE G U , Test well 2: NOLAN , f1-score: 0.626146788990826"
[1] "-------------"
[1] "Test well 1: LUKE G U , Test well 2: NEWBY , f1-score: 0.546472564389698"
[1] "-------------"
[1] "Test well 1: LUKE G U , Test well 2: CHURCHMAN BIBLE , f1-score: 0.617198335644938"
[1] "-------------"
[1] "Test well 1: KIMZEY A , Test well 2: CROSS H CATTLE , f1-score: 0.6289592760181"
[1] "-------------"
[1] "Test well 1: KIMZEY A , Test well 2: NOLAN , f1-score: 0.632152588555858"
[1] "-------------"
[1] "Test well 1: KIMZEY A , Test well 2: NEWBY , f1-score: 0.539181286549708"
[1] "-------------"
[1] "Test well 1: KIMZEY A , Test well 2: CHURCHMAN BIBLE , f1-score: 0.6256"
[1] "-------------"
[1] "Test well 1: CROSS H CATTLE , Test well 2: NOLAN , f1-score: 0.61119293078056"
[1] "-------------"
[1] "Test well 1: CROSS H CATTLE , Test well 2: NEWBY , f1-score: 0.610570236439499"
[1] "-------------"
[1] "Test well 1: CROSS H CATTLE , Test well 2: CHURCHMAN BIBLE , f1-score: 0.575277337559429"
[1] "-------------"
[1] "Test well 1: NOLAN , Test well 2: NEWBY , f1-score: 0.563463819691578"
[1] "-------------"
[1] "Test well 1: NOLAN , Test well 2: CHURCHMAN BIBLE , f1-score: 0.635514018691589"
[1] "-------------"
[1] "Test well 1: NEWBY , Test well 2: CHURCHMAN BIBLE , f1-score: 0.543055555555556"
[1] "-------------"
[1] "Minimum F1: 0.520763187429854"
[1] "Average F1: 0.607718649773921"
[1] "Maximum F1: 0.678474114441417"
Time difference of 8.727254 mins