notebook.community

Edit and run



In [44]:

    
source("loadData.R")

# load and pre-process the data
data <- loadData()
data <- preProcessData(data)
data <- data[complete.cases(data),]

# split the data
split <- splitData(data, "SHANKLE")
train <- split[[1]]
test <- split[[2]]
rm(split)

"Training Data"
head(train,3)

"Testing Data"
head(test,3)









    




'Training Data'






    





Facies Formation Well.Name Depth GR ILD_log10 DeltaPHI PHIND PE isMarine RELPOS

	FSiS     A1 SH    SHRIMPLIN 2793.0   77.45    0.664     9.9     11.915   4.6      FALSE    1.000    
	FSiS     A1 SH    SHRIMPLIN 2793.5   78.26    0.661    14.2     12.565   4.1      FALSE    0.979    
	FSiS     A1 SH    SHRIMPLIN 2794.0   79.05    0.658    14.8     13.050   3.6      FALSE    0.957    









    




'Testing Data'






    





Facies Formation Well.Name Depth GR ILD_log10 DeltaPHI PHIND PE isMarine RELPOS

	938 CSiS   A1 SH  SHANKLE 2774.5 98.36  0.642  -0.1   18.685 2.9    FALSE  1.000  
	939 CSiS   A1 SH  SHANKLE 2775.0 97.57  0.631   7.9   16.745 3.2    FALSE  0.984  
	940 CSiS   A1 SH  SHANKLE 2775.5 98.41  0.615  12.8   14.105 3.2    FALSE  0.968



In [45]:

    
source("preProcData.R")

# we only want data with complete observations (no missing features)
trainPrime <- train[complete.cases(train),]

# center and scale the petrophysical log features
trainPrime$GR <- (trainPrime$GR - mean(trainPrime$GR)) / sd(trainPrime$GR)
trainPrime$ILD_log10 <- (trainPrime$ILD_log10 - mean(trainPrime$ILD_log10)) / sd(trainPrime$ILD_log10)
trainPrime$DeltaPHI <- (trainPrime$DeltaPHI - mean(trainPrime$DeltaPHI)) / sd(trainPrime$DeltaPHI)
trainPrime$PHIND <- (trainPrime$PHIND - mean(trainPrime$PHIND)) / sd(trainPrime$PHIND)
trainPrime$PE <- (trainPrime$PE - mean(trainPrime$PE)) / sd(trainPrime$PE)

# center and scale the petrophysical log features
testPrime <- test
testPrime$GR <- (testPrime$GR - mean(testPrime$GR)) / sd(testPrime$GR)
testPrime$ILD_log10 <- (testPrime$ILD_log10 - mean(testPrime$ILD_log10)) / sd(testPrime$ILD_log10)
testPrime$DeltaPHI <- (testPrime$DeltaPHI - mean(testPrime$DeltaPHI)) / sd(testPrime$DeltaPHI)
testPrime$PHIND <- (testPrime$PHIND - mean(testPrime$PHIND)) / sd(testPrime$PHIND)
testPrime$PE <- (testPrime$PE - mean(testPrime$PE)) / sd(testPrime$PE)

head(trainPrime,3)









    





Facies Formation Well.Name Depth GR ILD_log10 DeltaPHI PHIND PE isMarine RELPOS

	FSiS        A1 SH       SHRIMPLIN   2793.0      0.3543263   0.08003752  1.217824    -0.162920681  0.8883685  FALSE       1.000       
	FSiS        A1 SH       SHRIMPLIN   2793.5      0.3799504   0.06763682  2.069998    -0.074959992  0.3291584  FALSE       0.979       
	FSiS        A1 SH       SHRIMPLIN   2794.0      0.4049418   0.05523612  2.188906    -0.009327786 -0.2300518  FALSE       0.957



In [46]:

    
source("accuracyMetrics.R")
source("preProcData.R")

myF1Metric <- function(data) {
    ConfM <- confusion_multi(data$pred, data$obs)
    M <- ConfM$confusionMatrix
    microF1_score <- microF1(M)
    names(microF1_score) <- "F1"
    
    microF1_score
}

preProcData <- function(data, l) {
    badFeatures <- c("Formation", "Well.Name", "Depth", "GR", "ILD_log10", "DeltaPHI", "PHIND", "PE", "isMarine")
    
    data <- lagData(data, l)
    data <- data[, !(names(data) %in% badFeatures)]
    
    data
}

splitData <- function(data, l) {
    data <- preProcData(data, l)
    
    set.seed(1234)
    trainIndex <- createDataPartition(data$Facies, p=.8, list=F)
    trainData <- data[trainIndex,]
    testData <- data[-trainIndex,]
    
    list(train=trainData, test=testData)
}

tuneModel <- function(train, l) {
    
    bestFit <- NA
    bestF1 <- 0

    fitControl <- trainControl(## 10-fold CV
                       method = "repeatedcv",
                       number = 10,
                       ## repeated ten times
                       repeats = 10)

    for (l_i in l) {
        print(paste("l=", l_i, "...Splitting data"))
        split <- splitData(data, l_i)
        trainData <- split[["train"]]
        cvTestData <- split[["test"]]
        
        print("Training model")
        set.seed(1234)
        fit <- train(Facies ~ ., data=trainData,
                    method="rf",
                    trControl=fitControl,
                    metric="Kappa")
        
        print("Predicting cross-validation values")
        cvTestData$Predicted <- predict(fit, newdata=cvTestData)
        
        print("Evaluating model")
        microF1_score <- myF1Metric(data.frame(pred=cvTestData$Predicted, obs=cvTestData$Facies))
        if (microF1_score > bestF1) {
            bestFit <- fit
        }
    }
    
    print(paste("Best F1-Score", bestF1))
    bestFit
}

tunedFit <- tuneModel(trainPrime, c(10,16,20,24,30))
tunedFit









    



[1] "l= 10 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 16 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 20 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 24 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "l= 30 ...Splitting data"
[1] "Training model"
[1] "Predicting cross-validation values"
[1] "Evaluating model"
[1] "Best F1-Score 0"






    





Random Forest 

2590 samples
 187 predictor
   9 classes: 'SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 2332, 2331, 2333, 2329, 2330, 2332, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
    2   0.7402541  0.6910370
   94   0.7956318  0.7580663
  187   0.7885591  0.7497247

Kappa was used to select the optimal model using  the largest value.
The final value used for the model was mtry = 94.



In [50]:

    
source("accuracyMetrics.R")

testPrime <- lagData(testPrime, 30)
testPrime$Predicted <- predict(tunedFit, newdata=testPrime)

ConfM <- confusion_multi(testPrime$Predicted, testPrime$Facies)
M <- ConfM$confusionMatrix
microF1_score <- microF1(M)
print(paste("Multi F1:", microF1_score))









    



[1] "Multi F1: 0.444444444444444"



In [51]:

    
summary(tunedFit)









    





                Length Class      Mode     
call                4  -none-     call     
type                1  -none-     character
predicted        2590  factor     numeric  
err.rate         5000  -none-     numeric  
confusion          90  -none-     numeric  
votes           23310  matrix     numeric  
oob.times        2590  -none-     numeric  
classes             9  -none-     character
importance        187  -none-     numeric  
importanceSD        0  -none-     NULL     
localImportance     0  -none-     NULL     
proximity           0  -none-     NULL     
ntree               1  -none-     numeric  
mtry                1  -none-     numeric  
forest             14  -none-     list     
y                2590  factor     numeric  
test                0  -none-     NULL     
inbag               0  -none-     NULL     
xNames            187  -none-     character
problemType         1  -none-     character
tuneValue           1  data.frame list     
obsLevels           9  -none-     character

Facies	Formation	Well.Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	isMarine	RELPOS
FSiS	A1 SH	SHRIMPLIN	2793.0	77.45	0.664	9.9	11.915	4.6	FALSE	1.000
FSiS	A1 SH	SHRIMPLIN	2793.5	78.26	0.661	14.2	12.565	4.1	FALSE	0.979
FSiS	A1 SH	SHRIMPLIN	2794.0	79.05	0.658	14.8	13.050	3.6	FALSE	0.957

	Facies	Formation	Well.Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	isMarine	RELPOS
938	CSiS	A1 SH	SHANKLE	2774.5	98.36	0.642	-0.1	18.685	2.9	FALSE	1.000
939	CSiS	A1 SH	SHANKLE	2775.0	97.57	0.631	7.9	16.745	3.2	FALSE	0.984
940	CSiS	A1 SH	SHANKLE	2775.5	98.41	0.615	12.8	14.105	3.2	FALSE	0.968