Getting and cleaning the data


In [1]:
options(warn=-1)

suppressMessages(library(dplyr))
source("loadData.R")

# load data
"Raw data:"
raw <- loadData()
format(head(raw,3), digits=3)

# clean data
"Cleaned data:"
clean <- cleanData(raw)
format(head(clean,3), digits=3)


'Raw data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPENM_MRELPOS
3 A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 1 1.000
3 A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 1 0.979
3 A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 1 0.957
'Cleaned data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOSNPHIDPHI
FSiS A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 FALSE 1.000 16.9 6.96
FSiS A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 FALSE 0.979 19.7 5.46
FSiS A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 FALSE 0.957 20.5 5.65

Feature generation & Imputation


In [3]:
library(dplyr)
library(randomForest)
library(lattice)
library(ggplot2)
library(caret)

source("preProcData.R")

t0 <- Sys.time()

df <- clean[clean$Well.Name != "Recruit F9",]

# formation averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation), 
             GR_FmAvg = mean(GR),
             ILD_FmAvg = mean(ILD_log10), 
             DeltaPHI_FmAvg = mean(DeltaPHI), 
             PHIND_FmAvg = mean(PHIND), 
             PE_FmAvg = mean(PE, na.rm=T),
             NPHI_FmAvg = mean(NPHI),
             DPHI_FmAvg = mean(DPHI))
# ------------------------------------------------------------------------------------

# facies averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Facies),
             GR_FaciesAvg = mean(GR),
             ILD_FaciesAvg = mean(ILD_log10),
             DeltaPHI_FaciesAvg = mean(DeltaPHI),
             PHIND_FaciesAvg = mean(PHIND),
             PE_FaciesAvg = mean(PE, na.rm=T),
             NPHI_FaciesAvg = mean(NPHI),
             DPHI_FaciesAvg = mean(DPHI))
# ------------------------------------------------------------------------------------

# well averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Well.Name),
             GR_WellAvg = mean(GR),
             ILD_WellAvg = mean(ILD_log10),
             DeltaPHI_WellAvg = mean(DeltaPHI),
             PHIND_WellAvg = mean(PHIND),
             NPHI_WellAvg = mean(NPHI),
             DPHI_WellAvg = mean(DPHI))
# ------------------------------------------------------------------------------------

# lag log data
# ------------------------------------------------------------------------------------
df <- lagData(df, 30, c("GR", "ILD_log10", "DeltaPHI", "PHIND", "isMarine", 
                        "PE_FaciesAvg"))
# ------------------------------------------------------------------------------------

# calculate formation thickness
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation, Well.Name), FmThickness=max(Depth)-min(Depth)+.5)
# ------------------------------------------------------------------------------------

# calculate difference/derivatives
# ------------------------------------------------------------------------------------
df <- firstDiff(df, F)
# ------------------------------------------------------------------------------------

format(head(df, 5), digits=3)
df_fg <- df


FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarine...DeltaPHI_forDiffPHIND_forDiffGR_bacDiffILD_bacDiffDeltaPHI_bacDiffPHIND_bacDiffGR_cenDiffILD_cenDiffDeltaPHI_cenDiffPHIND_cenDiff
FSiS A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 FALSE ... 4.3 0.650 0.00 0.000 0.0 0.000 0.00 0.0000 0.00 0.000
FSiS A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 FALSE ... 0.6 0.485 0.81 -0.003 4.3 0.650 0.80 -0.0030 2.45 0.568
FSiS A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 FALSE ... -0.9 0.065 0.79 -0.003 0.6 0.485 3.92 -0.0030 -0.15 0.275
FSiS A1 SH SHRIMPLIN2794 86.1 0.655 13.9 13.1 3.5 FALSE ... -0.4 0.185 7.05 -0.003 -0.9 0.065 -2.23 -0.0055 -0.65 0.125
FSiS A1 SH SHRIMPLIN2795 74.6 0.647 13.5 13.3 3.4 FALSE ... 0.5 0.085 -11.52 -0.008 -0.4 0.185 -6.06 -0.0095 0.05 0.135

In [4]:
t0 <- Sys.time()
df <- df_fg

# split data on whether or not PE has a valid value
# ------------------------------------------------------------------------------------
PE_impute_train <- df[!is.na(df$PE),]
PE_impute_test <- df[is.na(df$PE),]
# ------------------------------------------------------------------------------------

tn <- Sys.time()
"Time elapsed:"
print(tn-t0)


'Time elapsed:'
Time difference of 0.08700514 secs

In [5]:
t0 <- Sys.time()
set.seed(1234)

# fit a random forest regressor (like ar4 and HouMath) to impute missing PE
# ------------------------------------------------------------------------------------
PE_wells <- unique(PE_impute_train$Well.Name)
resamp_index <- list()

for (w in PE_wells) {
    resamp_index[[w]] <- which(PE_impute_train$Well.Name != w)
}

fitControl <- trainControl(method="cv", index=resamp_index)
fit <- train(PE ~ ., data=PE_impute_train, 
             method="rf", trControl=fitControl,
             tuneLength=10, ntree=50, importance=T)

"Before imputing:"
format(head(PE_impute_test,3), digits=3)

print(fit)
print(fit[["resample"]])
PE_impute_test$PE <- predict(fit, PE_impute_test)

"After imputing:"
format(head(PE_impute_test,3), digits=3)
# ------------------------------------------------------------------------------------

tn <- Sys.time()
"Time elapsed:"
print(tn-t0)


'Before imputing:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarine...DeltaPHI_forDiffPHIND_forDiffGR_bacDiffILD_bacDiffDeltaPHI_bacDiffPHIND_bacDiffGR_cenDiffILD_cenDiffDeltaPHI_cenDiffPHIND_cenDiff
FSiS A1 SH ALEXANDER D2888 88.7 0.612 6.7 10.6 NA FALSE ... 4.3 1.910 0.00 0.000 0.0 0.000 0.00 0.0000 0.00 0.000
FSiS A1 SH ALEXANDER D2888 92.7 0.583 11.0 12.5 NA FALSE ... 1.0 0.895 4.00 -0.029 4.3 1.910 2.92 -0.0165 2.65 1.402
FSiS A1 SH ALEXANDER D2888 94.5 0.579 12.0 13.4 NA FALSE ... -0.5 0.340 1.83 -0.004 1.0 0.895 1.30 -0.0020 0.25 0.617
Random Forest 

3164 samples
 225 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2693, 2715, 2703, 2663, 2749, 2701, ... 
Resampling results across tuning parameters:

  mtry  RMSE       Rsquared 
    2   0.5515111  0.6555024
   29   0.5075496  0.7119138
   57   0.5103292  0.7068156
   85   0.5156431  0.6985850
  113   0.5121128  0.6995662
  140   0.5152494  0.7063695
  168   0.5255717  0.6923438
  196   0.5211819  0.6947750
  224   0.5286081  0.6878713
  252   0.5180770  0.6999451

RMSE was used to select the optimal model using  the smallest value.
The final value used for the model was mtry = 29. 
       RMSE  Rsquared        Resample
1 0.3956320 0.7586792         SHANKLE
2 0.4942847 0.5384159        LUKE G U
3 0.3335226 0.7772389  CROSS H CATTLE
4 0.7481171 0.8162802       SHRIMPLIN
5 0.5436056 0.6820090           NOLAN
6 0.3740222 0.7106842           NEWBY
7 0.6636627 0.7000891 CHURCHMAN BIBLE
'After imputing:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarine...DeltaPHI_forDiffPHIND_forDiffGR_bacDiffILD_bacDiffDeltaPHI_bacDiffPHIND_bacDiffGR_cenDiffILD_cenDiffDeltaPHI_cenDiffPHIND_cenDiff
FSiS A1 SH ALEXANDER D2888 88.7 0.612 6.7 10.6 3.41 FALSE ... 4.3 1.910 0.00 0.000 0.0 0.000 0.00 0.0000 0.00 0.000
FSiS A1 SH ALEXANDER D2888 92.7 0.583 11.0 12.5 3.42 FALSE ... 1.0 0.895 4.00 -0.029 4.3 1.910 2.92 -0.0165 2.65 1.402
FSiS A1 SH ALEXANDER D2888 94.5 0.579 12.0 13.4 3.33 FALSE ... -0.5 0.340 1.83 -0.004 1.0 0.895 1.30 -0.0020 0.25 0.617
'Time elapsed:'
Time difference of 1.641159 hours

In [6]:
# bring it all back in one data frame
# ------------------------------------------------------------------------------------
t0 <- Sys.time()

"Imputed data:"
df_impute <- subset(rbind(PE_impute_train, PE_impute_test), select=c("Facies", "Formation", "Well.Name", "Depth",
                                                                      "GR", "ILD_log10", "DeltaPHI", "PHIND", "PE",
                                                                      "isMarine", "RELPOS", "NPHI", "DPHI"))
format(head(df_impute,3), digits=3)
# ------------------------------------------------------------------------------------

tn <- Sys.time()
"Time elapsed:"
print(tn-t0)


'Imputed data:'
FaciesFormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOSNPHIDPHI
FSiS A1 SH SHRIMPLIN2793 77.5 0.664 9.9 11.9 4.6 FALSE 1.000 16.9 6.96
FSiS A1 SH SHRIMPLIN2794 78.3 0.661 14.2 12.6 4.1 FALSE 0.979 19.7 5.46
FSiS A1 SH SHRIMPLIN2794 79.0 0.658 14.8 13.1 3.6 FALSE 0.957 20.5 5.65
'Time elapsed:'
Time difference of 0.2320139 secs

Cross-validation


In [111]:
library(dplyr)
source("preProcData.R")

df <- df_impute

# center data by subtracting mean by well
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Well.Name),
             GR = GR - mean(GR),
             ILD_log10 = ILD_log10 - mean(ILD_log10),
             DeltaPHI = DeltaPHI - mean(DeltaPHI),
             PHIND = PHIND - mean(PHIND),
             PE = PE - mean(PE),
             NPHI = NPHI - mean(NPHI),
             DPHI = DPHI - mean(DPHI)
            )
# ------------------------------------------------------------------------------------

# formation averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation), 
             PE_FmAvg = mean(PE, na.rm=T),
             NPHI_FmAvg = mean(NPHI),
             DPHI_FmAvg = mean(DPHI))
# ------------------------------------------------------------------------------------

# project well facies onto each other
# ------------------------------------------------------------------------------------
wells <- unique(df$Well.Name)
for (w in wells) {
    df[,paste0("Facies", "_", w)] <- NA
}
for (wi in wells) {
    df_i <- df[df$Well.Name == wi,]
    formations <- unique(df_i$Formation)
    
    for (wj in wells[!wells %in% c(wi)]) {
        df_j <- df[df$Well.Name == wj,]
        formations <- formations[formations %in% unique(df_j$Formation)]
        
        for (f in formations) {
            df_if <- df_i[df_i$Formation == f,]
            df_jf <- df_j[df_j$Formation == f,]
            
            for (i in 1:nrow(df_if)) {
                depth <- df_if$Depth[i]
                relpos_i <- df_if$RELPOS[i]
                row_j <- which.min(abs(relpos_i - df_jf$RELPOS))
                df[df$Well.Name == wi & df$Depth == depth, paste0("Facies", "_", wj)] <- df_jf$Facies[row_j]
            }
        }
    }
    
    jwells <- wells[!wells %in% c(wi)]
    for (i in 1:nrow(df_i)) {
        depth <- df_i$Depth[i]
        votes <- NULL
        
        for (wj in jwells) {
            votes <- c(votes, df[df$Well.Name == wi & df$Depth == depth, paste0("Facies", "_", wj)])
        }
        
        vs <- unique(votes)
        elected <- unique(votes)[which.max(table(votes))]
        df[df$Well.Name == wi & df$Depth == depth, paste0("Facies", "_", wi)] <- ifelse(length(elected)==0, 2, elected[[1]][1])
    }
}

df[is.na(df[,"Facies_SHRIMPLIN"]),"Facies_SHRIMPLIN"] <- 2
df[,"Facies_SHRIMPLIN"] <- as.factor(unlist(df[,"Facies_SHRIMPLIN"]))
# ------------------------------------------------------------------------------------

# calculate formation thickness
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation, Well.Name), FmThickness=(max(Depth)-min(Depth)+.5))
# ------------------------------------------------------------------------------------

df_fe <- subset(df, select=c(Facies, Formation, Well.Name, GR, ILD_log10, DeltaPHI, PHIND, PE, isMarine, 
                             RELPOS, NPHI, DPHI, PE_FmAvg, NPHI_FmAvg, DPHI_FmAvg, Facies_SHRIMPLIN, 
                             FmThickness))
format(head(df_fe, 5), digits=3)


FaciesFormationWell.NameGRILD_log10DeltaPHIPHINDPEisMarineRELPOSNPHIDPHIPE_FmAvgNPHI_FmAvgDPHI_FmAvgFacies_SHRIMPLINFmThickness
FSiS A1 SH SHRIMPLIN 8.04 0.00432 2.19 -0.259 0.38 FALSE 1.000 0.837 -1.35 -0.581 3 0.141 2 21.5
FSiS A1 SH SHRIMPLIN 8.85 0.00132 6.49 0.391 -0.12 FALSE 0.979 3.637 -2.85 -0.581 3 0.141 2 21.5
FSiS A1 SH SHRIMPLIN 9.64 -0.00168 7.09 0.876 -0.62 FALSE 0.957 4.422 -2.67 -0.581 3 0.141 2 21.5
FSiS A1 SH SHRIMPLIN16.69 -0.00468 6.19 0.941 -0.72 FALSE 0.936 4.037 -2.15 -0.581 3 0.141 2 21.5
FSiS A1 SH SHRIMPLIN 5.17 -0.01268 5.79 1.126 -0.82 FALSE 0.915 4.022 -1.77 -0.581 3 0.141 2 21.5

In [113]:
names(df_fe)


  1. 'Facies'
  2. 'Formation'
  3. 'Well.Name'
  4. 'GR'
  5. 'ILD_log10'
  6. 'DeltaPHI'
  7. 'PHIND'
  8. 'PE'
  9. 'isMarine'
  10. 'RELPOS'
  11. 'NPHI'
  12. 'DPHI'
  13. 'PE_FmAvg'
  14. 'NPHI_FmAvg'
  15. 'DPHI_FmAvg'
  16. 'Facies_SHRIMPLIN'
  17. 'FmThickness'

In [132]:
library(randomForest)
library(caret)
source("accuracyMetrics.R")

t0 <- Sys.time()

df <- df_fe

# generate resampling list (cross-validation by well)
# ------------------------------------------------------------------------------------
resamp_list <- list()

for (w in unique(df$Well.Name)) {
    resamp_list[[w]] <- which(df$Well.Name != w)
}
# ------------------------------------------------------------------------------------

weights <- table(df$Facies[df$Well.Name %in% c("SHRIMPLIN", "CHURCHMAN BIBLE")])

# perform cross-validation by building model and outputting results
# ------------------------------------------------------------------------------------
set.seed(1234)
fitControl <- trainControl(index=resamp_list, method="cv", summaryFunction=myF1MetricCaret)
fit <- train(Facies ~ ., data=subset(df, select=-c(Well.Name)), method="rf",
             trControl=fitControl, metric="F1", ntree=100, classwt=weights,
             tuneLength=10)
# ------------------------------------------------------------------------------------

print(fit)
print(fit[["resample"]])

tn <- Sys.time()
"Time elapsed:"
print(tn-t0)


Random Forest 

4069 samples
  15 predictor
   9 classes: 'SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 3598, 3620, 3608, 3568, 3654, 3606, ... 
Resampling results across tuning parameters:

  mtry  F1       
   2    0.5241269
   5    0.5596335
   9    0.5754837
  12    0.5731957
  16    0.5770228
  19    0.5822769
  23    0.5714296
  26    0.5736143
  30    0.5654781
  34    0.5583793

F1 was used to select the optimal model using  the largest value.
The final value used for the model was mtry = 19. 
         F1        Resample
1 0.5545657         SHANKLE
2 0.6629464        LUKE G U
3 0.6295503       SHRIMPLIN
4 0.6180049           NOLAN
5 0.5313175           NEWBY
6 0.5000000 CHURCHMAN BIBLE
7 0.5400000  CROSS H CATTLE
8 0.6072961     ALEXANDER D
9 0.5968109        KIMZEY A
'Time elapsed:'
Time difference of 6.408033 mins

In [161]:
library(dplyr)
source("preProcData.R")

# load the hold-out test set
testName <- "../validation_data_nofacies.csv"
df_test <- read.csv(testName, colClasses=c(rep("factor",2), rep("numeric",6), "factor", "numeric"))
df_test <- cleanData(df_test)
df_test$Facies <- NA

# center data by subtracting mean by well
# ------------------------------------------------------------------------------------
df_test <- mutate(group_by(df_test, Well.Name),
                 GR = GR - mean(GR),
                 ILD_log10 = ILD_log10 - mean(ILD_log10),
                 DeltaPHI = DeltaPHI - mean(DeltaPHI),
                 PHIND = PHIND - mean(PHIND),
                 PE = PE - mean(PE),
                 NPHI = NPHI - mean(NPHI),
                 DPHI = DPHI - mean(DPHI)
            )
# ------------------------------------------------------------------------------------

# formation averages of log data
# ------------------------------------------------------------------------------------
df_test <- mutate(group_by(df_test, Formation), 
                 PE_FmAvg = mean(PE, na.rm=T),
                 NPHI_FmAvg = mean(NPHI),
                 DPHI_FmAvg = mean(DPHI))
# ------------------------------------------------------------------------------------

# project SHRIMPLIN facies onto wells
# ------------------------------------------------------------------------------------
testWells <- unique(df_test$Well.Name)
df_SHRIMPLIN <- df[df$Well.Name == "SHRIMPLIN",]
for (w in testWells) {
    dfw <- df_test[df_test$Well.Name == w,]
    formations <- unique(dfw$Formation)
    formations <- formations[formations %in% unique(df_SHRIMPLIN$Formation)]
    
    for (f in formations) {
        dfwf <- dfw[dfw$Formation == f,]
        df_SHRIMPLINf <- df_SHRIMPLIN[df_SHRIMPLIN$Formation == f,]
        
        for (i in 1:nrow(dfwf)) {
            depth <- dfwf$Depth[i]
            relpos <- dfwf$RELPOS[i]
            row_shrimplin <- which.min(abs(relpos-df_SHRIMPLINf$RELPOS))
            df_test[df_test$Well.Name == w & df_test$Depth == depth, "Facies_SHRIMPLIN"] <- df_SHRIMPLINf$Facies[row_shrimplin]
        }
    }
}

levels(df_test$Facies_SHRIMPLIN) <- c(1:9)
# ------------------------------------------------------------------------------------

# calculate formation thickness
# ------------------------------------------------------------------------------------
df_test <- mutate(group_by(df_test, Formation, Well.Name), FmThickness=(max(Depth)-min(Depth)+.5))
# ------------------------------------------------------------------------------------

head(df_test)


FormationWell.NameDepthGRILD_log10DeltaPHIPHINDPEisMarineRELPOSNPHIDPHIFaciesPE_FmAvgNPHI_FmAvgDPHI_FmAvgFacies_SHRIMPLINFmThickness
A1 SH STUART 2808.0 9.456099 -0.082466240.009493671-0.7239451 -0.1531477 FALSE 1.000 -0.7191983 -0.728692 NA -0.6339616 1.906383 -0.3449711 3 21.5
A1 SH STUART 2808.5 20.432099 -0.127466243.209493671 0.5760549 -0.4031477 FALSE 0.978 2.1808017 -1.028692 NA -0.6339616 1.906383 -0.3449711 3 21.5
A1 SH STUART 2809.0 26.079099 -0.146466246.109493671 2.2260549 -0.6801477 FALSE 0.956 5.2808017 -0.828692 NA -0.6339616 1.906383 -0.3449711 3 21.5
A1 SH STUART 2809.5 23.851099 -0.119466246.209493671 1.8760549 -0.7671477 FALSE 0.933 4.9808017 -1.228692 NA -0.6339616 1.906383 -0.3449711 3 21.5
A1 SH STUART 2810.0 19.151099 -0.074466245.409493671 0.9760549 -0.7241477 FALSE 0.911 3.6808017 -1.728692 NA -0.6339616 1.906383 -0.3449711 3 21.5
A1 SH STUART 2810.5 17.135099 -0.045466243.609493671 0.8760549 -0.6581477 FALSE 0.889 2.6808017 -0.928692 NA -0.6339616 1.906383 -0.3449711 3 21.5

In [169]:
df_test$Predicted <- predict(fit, df_test)
df_test$PredictedAsNum <- match(df_test$Predicted, c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS"))

output <- subset(df_test, select=c(Formation, Well.Name, Depth, Predicted, PredictedAsNum))
format(head(output), digits=3)

# save predictions to *.csv file
write.csv(output, "jpoirier008_submission001.csv", row.names=F)


FormationWell.NameDepthPredictedPredictedAsNum
A1 SH STUART2808 FSiS 3
A1 SH STUART2808 FSiS 3
A1 SH STUART2809 FSiS 3
A1 SH STUART2810 FSiS 3
A1 SH STUART2810 FSiS 3
A1 SH STUART2810 FSiS 3