In [1]:
source("loadData.R")

# load and clean the data
raw <- loadData()

"Raw data:"
format(head(raw,3), digits=3)

clean <- cleanData(raw)

"Cleaned data:"
format(head(clean,3), digits=3)

dataPrime <- data.frame()
wells <- unique(clean$Well.Name)

for (well_i in wells) {
    data_i <- clean[clean$Well.Name == well_i,]
    
    data_i$GR <- (data_i$GR - mean(data_i$GR, na.rm=T)) / sd(data_i$GR, na.rm=T)
    data_i$ILD_log10 <- (data_i$ILD_log10 - mean(data_i$ILD_log10, na.rm=T)) / sd(data_i$ILD_log10, na.rm=T)
    data_i$DeltaPHI <- (data_i$DeltaPHI - mean(data_i$DeltaPHI, na.rm=T)) / sd(data_i$DeltaPHI, na.rm=T)
    data_i$PHIND <- (data_i$PHIND - mean(data_i$PHIND, na.rm=T)) / sd(data_i$PHIND, na.rm=T)
    data_i$PE <- (data_i$PE - mean(data_i$PE, na.rm=T)) / sd(data_i$PE, na.rm=T)
    
    dataPrime <- rbind(dataPrime, data_i)
}

cs <- dataPrime
rm(dataPrime)

"Centered and scaled data:"
format(head(cs,3), digits=3)


'Raw data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPENM_MRELPOS
3 A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 1 1.000
3 A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 1 0.979
3 A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 1 0.957
'Cleaned data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 FALSE 1.000
FSiS A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 FALSE 0.957
'Centered and scaled data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793 0.216 0.01855 0.512 -0.0487 0.421 FALSE 1.000
FSiS A1 SH SHRIMPLIN2794 0.237 0.00567 1.517 0.0736 -0.133 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794 0.258 -0.00721 1.657 0.1648 -0.687 FALSE 0.957

In [2]:
# we only want data with complete observations (no missing features)
# we also exclude the Recruit F9 well, as our derivative features won't make sense for them
cs_withPE <- cs[complete.cases(cs) & cs$Well.Name != "Recruit F9",]
cs_withoutPE <- cs[cs$Well.Name != "Recruit F9",c("Facies", "Formation", "Well.Name", "Depth", "GR", 
                          "ILD_log10", "DeltaPHI", "PHIND", "isMarine", "RELPOS")]

"With PE log:"
paste("Rows:", nrow(cs_withPE))
format(head(cs_withPE, 3), digits=3)

"Without PE log:"
paste("Rows:", nrow(cs_withoutPE))
format(head(cs_withoutPE, 3), digits=3)


'With PE log:'
'Rows: 3164'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793 0.216 0.01855 0.512 -0.0487 0.421 FALSE 1.000
FSiS A1 SH SHRIMPLIN2794 0.237 0.00567 1.517 0.0736 -0.133 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794 0.258 -0.00721 1.657 0.1648 -0.687 FALSE 0.957
'Without PE log:'
'Rows: 4069'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDisMarineRELPOS
FSiS A1 SH SHRIMPLIN2793 0.216 0.01855 0.512 -0.0487 FALSE 1.000
FSiS A1 SH SHRIMPLIN2794 0.237 0.00567 1.517 0.0736 FALSE 0.979
FSiS A1 SH SHRIMPLIN2794 0.258 -0.00721 1.657 0.1648 FALSE 0.957

In [3]:
source("preProcData.R")

lag_withPE <- lagData(cs_withPE, 30)
lag_withoutPE <- lagData(cs_withoutPE, 30)

paste("With PE # columns before forward, backward, and central difference feature-building:", ncol(cs_withPE))
paste("With PE # columns after lag feature-building:", ncol(lag_withPE))
format(head(lag_withPE,3), digits=3)

paste("Without PE # columns before forward, backward, and central difference feature-building:", ncol(cs_withoutPE))
paste("Without PE # columns after lag feature-building:", ncol(lag_withoutPE))
format(head(lag_withoutPE,3), digits=3)


'With PE # columns before forward, backward, and central difference feature-building: 11'
'With PE # columns after lag feature-building: 191'
FaciesFormationWell.NameDepthRELPOSGR_n15ILD_log10_n15DeltaPHI_n15PHIND_n15isMarine_n15...DeltaPHI_14PHIND_14isMarine_14PE_14GR_15ILD_log10_15DeltaPHI_15PHIND_15isMarine_15PE_15
FSiS A1 SH SHRIMPLIN2793 1.000 0.23435 -0.827 0.956 -0.0487 FALSE ... 1.540 -0.0487 FALSE 0.421 0.234 -0.827 0.956 -0.0487 FALSE 0.421
FSiS A1 SH SHRIMPLIN2794 0.979 -0.00292 -0.952 0.419 0.0736 FALSE ... 1.307 0.0736 FALSE -0.133 0.555 -0.604 1.540 0.0736 FALSE -0.133
FSiS A1 SH SHRIMPLIN2794 0.957 -0.15735 -1.038 0.255 0.1648 FALSE ... 0.839 0.1648 FALSE -0.687 0.376 -0.527 1.307 0.1648 FALSE -0.687
'Without PE # columns before forward, backward, and central difference feature-building: 10'
'Without PE # columns after lag feature-building: 160'
FaciesFormationWell.NameDepthRELPOSGR_n15ILD_log10_n15DeltaPHI_n15PHIND_n15isMarine_n15...GR_14ILD_log10_14DeltaPHI_14PHIND_14isMarine_14GR_15ILD_log10_15DeltaPHI_15PHIND_15isMarine_15
FSiS A1 SH SHRIMPLIN2793 1.000 0.23435 -0.827 0.956 -0.0487 FALSE ... 0.555 -0.604 1.540 -0.0487 FALSE 0.234 -0.827 0.956 -0.0487 FALSE
FSiS A1 SH SHRIMPLIN2794 0.979 -0.00292 -0.952 0.419 0.0736 FALSE ... 0.376 -0.527 1.307 0.0736 FALSE 0.555 -0.604 1.540 0.0736 FALSE
FSiS A1 SH SHRIMPLIN2794 0.957 -0.15735 -1.038 0.255 0.1648 FALSE ... 0.066 -0.424 0.839 0.1648 FALSE 0.376 -0.527 1.307 0.1648 FALSE

Cross-Validation


In [ ]:
library(caret)
source("accuracyMetrics.R")

t0 <- Sys.time()

fitControl <- trainControl(method="repeatedcv", number=10, repeats=10)

##############################
# with PE case
##############################
f1_withPE <- NULL

wells <- unique(lag_withPE$Well.Name)
wells <- wells[!wells %in% "Recruit F9"]

for (i in 1:(length(wells)-1)) {
    for (j in (i+1):length(wells)) {
        trainIndex <- lag_withPE$Well.Name != wells[i] & lag_withPE$Well.Name != wells[j]
        train <- lag_withPE[trainIndex,]
        test <- lag_withPE[!trainIndex,]

        fit <- train(Facies ~ ., data=subset(train, select=-c(Well.Name, Depth)), 
                     method="rf", metric="Kappa", tuneLength=10, 
                     trControl=fitControl)
        
        test$Predicted <- predict(fit, newdata=test)
        f1_i <- myF1Metric(test$Predicted, test$Facies)
        f1_withPE <- c(f1_withPE, f1_i)
        
        print(paste("Test well 1:", wells[i], ", Test well 2:", wells[j], 
                    ", withPE f1-score:", f1_i))
        print("-------------")
    }
}

##############################
# without PE case
##############################
f1_withoutPE <- NULL

wells <- unique(lag_withoutPE$Well.Name)
wells <- wells[!wells %in% "Recruit F9"]

for (i in 1:(length(wells)-1)) {
    for (j in (i+1):length(wells)) {
        trainIndex <- lag_withoutPE$Well.Name != wells[i] & lag_withoutPE$Well.Name != wells[j]
        train <- lag_withoutPE[trainIndex,]
        test <- lag_withoutPE[!trainIndex,]
        
        fit <- train(Facies ~ ., data=subset(train, select=-c(Well.Name, Depth)), 
                     method="rf", metric="Kappa", tuneLength=10, 
                     trControl=fitControl)
        
        test$Predicted <- predict(fit, newdata=test)
        f1_i <- myF1Metric(test$Predicted, test$Facies)
        f1_withoutPE <- c(f1_withoutPE, f1_i)    
        
        print(paste("Test well 1:", wells[i], ", Test well 2:", wells[j], 
                    ", withoutPE f1-score:", f1_i))
        print("-------------")        
    }
}

print("WITH PE")
print(paste("Minimum F1:", min(f1_withPE)))
print(paste("Average F1:", mean(f1_withPE)))
print(paste("Maximum F1:", max(f1_withPE)))
print("-------------")
print("WITHOUT PE")
print(paste("Minimum F1:", min(f1_withoutPE)))
print(paste("Average F1:", mean(f1_withoutPE)))
print(paste("Maximum F1:", max(f1_withoutPE)))

tn <- Sys.time()
print(tn-t0)


Warning message:
"package 'caret' was built under R version 3.2.5"Loading required package: lattice
Loading required package: ggplot2
Loading required package: randomForest
Warning message:
"package 'randomForest' was built under R version 3.2.5"randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin