In [44]:
source("loadData.R")

# load and pre-process the data
data <- loadData()
data <- preProcessData(data)
data <- data[complete.cases(data),]

# split the data
split <- splitData(data, "SHANKLE")
train <- split[[1]]
test <- split[[2]]
rm(split)

"Training Data"
head(train,3)

"Testing Data"
head(test,3)


'Training Data'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793.0 77.45 0.664 9.9 11.915 4.6 FALSE 1.000
FSiS A1 SH SHRIMPLIN2793.5 78.26 0.661 14.2 12.565 4.1 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794.0 79.05 0.658 14.8 13.050 3.6 FALSE 0.957
'Testing Data'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
938CSiS A1 SH SHANKLE2774.5 98.36 0.642 -0.1 18.685 2.9 FALSE 1.000
939CSiS A1 SH SHANKLE2775.0 97.57 0.631 7.9 16.745 3.2 FALSE 0.984
940CSiS A1 SH SHANKLE2775.5 98.41 0.615 12.8 14.105 3.2 FALSE 0.968

In [45]:
source("preProcData.R")

# we only want data with complete observations (no missing features)
trainPrime <- train[complete.cases(train),]

# center and scale the petrophysical log features
trainPrime$GR <- (trainPrime$GR - mean(trainPrime$GR)) / sd(trainPrime$GR)
trainPrime$ILD_log10 <- (trainPrime$ILD_log10 - mean(trainPrime$ILD_log10)) / sd(trainPrime$ILD_log10)
trainPrime$DeltaPHI <- (trainPrime$DeltaPHI - mean(trainPrime$DeltaPHI)) / sd(trainPrime$DeltaPHI)
trainPrime$PHIND <- (trainPrime$PHIND - mean(trainPrime$PHIND)) / sd(trainPrime$PHIND)
trainPrime$PE <- (trainPrime$PE - mean(trainPrime$PE)) / sd(trainPrime$PE)

# center and scale the petrophysical log features
testPrime <- test
testPrime$GR <- (testPrime$GR - mean(testPrime$GR)) / sd(testPrime$GR)
testPrime$ILD_log10 <- (testPrime$ILD_log10 - mean(testPrime$ILD_log10)) / sd(testPrime$ILD_log10)
testPrime$DeltaPHI <- (testPrime$DeltaPHI - mean(testPrime$DeltaPHI)) / sd(testPrime$DeltaPHI)
testPrime$PHIND <- (testPrime$PHIND - mean(testPrime$PHIND)) / sd(testPrime$PHIND)
testPrime$PE <- (testPrime$PE - mean(testPrime$PE)) / sd(testPrime$PE)

head(trainPrime,3)


FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN 2793.0 0.3543263 0.08003752 1.217824 -0.162920681 0.8883685 FALSE 1.000
FSiS A1 SH SHRIMPLIN 2793.5 0.3799504 0.06763682 2.069998 -0.074959992 0.3291584 FALSE 0.979
FSiS A1 SH SHRIMPLIN 2794.0 0.4049418 0.05523612 2.188906 -0.009327786-0.2300518 FALSE 0.957

In [46]:
source("accuracyMetrics.R")
source("preProcData.R")

myF1Metric <- function(data) {
    ConfM <- confusion_multi(data$pred, data$obs)
    M <- ConfM$confusionMatrix
    microF1_score <- microF1(M)
    names(microF1_score) <- "F1"
    
    microF1_score
}

preProcData <- function(data, l) {
    badFeatures <- c("Formation", "Well.Name", "Depth", "GR", "ILD_log10", "DeltaPHI", "PHIND", "PE", "isMarine")
    
    data <- lagData(data, l)
    data <- data[, !(names(data) %in% badFeatures)]
    
    data
}

splitData <- function(data, l) {
    data <- preProcData(data, l)
    
    set.seed(1234)
    trainIndex <- createDataPartition(data$Facies, p=.8, list=F)
    trainData <- data[trainIndex,]
    testData <- data[-trainIndex,]
    
    list(train=trainData, test=testData)
}

tuneModel <- function(train, l) {
    
    bestFit <- NA
    bestF1 <- 0

    fitControl <- trainControl(## 10-fold CV
                       method = "repeatedcv",
                       number = 10,
                       ## repeated ten times
                       repeats = 10)

    for (l_i in l) {
        print(paste("l=", l_i, "...Splitting data"))
        split <- splitData(data, l_i)
        trainData <- split[["train"]]
        cvTestData <- split[["test"]]
        
        print("Training model")
        set.seed(1234)
        fit <- train(Facies ~ ., data=trainData,
                    method="rf",
                    trControl=fitControl,
                    metric="Kappa")
        
        print("Predicting cross-validation values")
        cvTestData$Predicted <- predict(fit, newdata=cvTestData)
        
        print("Evaluating model")
        microF1_score <- myF1Metric(data.frame(pred=cvTestData$Predicted, obs=cvTestData$Facies))
        if (microF1_score > bestF1) {
            bestFit <- fit
        }
    }
    
    print(paste("Best F1-Score", bestF1))
    bestFit
}

tunedFit <- tuneModel(trainPrime, c(10,16,20,24,30))
tunedFit


[1] "l= 10 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 16 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 20 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 24 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 30 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "Best F1-Score 0"
Random Forest 

2590 samples
 187 predictor
   9 classes: 'SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 2332, 2331, 2333, 2329, 2330, 2332, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
    2   0.7402541  0.6910370
   94   0.7956318  0.7580663
  187   0.7885591  0.7497247

Kappa was used to select the optimal model using  the largest value.
The final value used for the model was mtry = 94. 

In [50]:
source("accuracyMetrics.R")

testPrime <- lagData(testPrime, 30)
testPrime$Predicted <- predict(tunedFit, newdata=testPrime)

ConfM <- confusion_multi(testPrime$Predicted, testPrime$Facies)
M <- ConfM$confusionMatrix
microF1_score <- microF1(M)
print(paste("Multi F1:", microF1_score))


[1] "Multi F1: 0.444444444444444"

In [51]:
summary(tunedFit)


                Length Class      Mode     
call                4  -none-     call     
type                1  -none-     character
predicted        2590  factor     numeric  
err.rate         5000  -none-     numeric  
confusion          90  -none-     numeric  
votes           23310  matrix     numeric  
oob.times        2590  -none-     numeric  
classes             9  -none-     character
importance        187  -none-     numeric  
importanceSD        0  -none-     NULL     
localImportance     0  -none-     NULL     
proximity           0  -none-     NULL     
ntree               1  -none-     numeric  
mtry                1  -none-     numeric  
forest             14  -none-     list     
y                2590  factor     numeric  
test                0  -none-     NULL     
inbag               0  -none-     NULL     
xNames            187  -none-     character
problemType         1  -none-     character
tuneValue           1  data.frame list     
obsLevels           9  -none-     character