In [1]:
options(warn=-1)
suppressMessages(library(dplyr))
source("loadData.R")
# load data
"Raw data:"
raw <- loadData()
format(head(raw,3), digits=3)
# clean data
"Cleaned data:"
clean <- cleanData(raw)
format(head(clean,3), digits=3)
In [3]:
library(dplyr)
library(randomForest)
library(lattice)
library(ggplot2)
library(caret)
source("preProcData.R")
t0 <- Sys.time()
df <- clean[clean$Well.Name != "Recruit F9",]
# formation averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation),
GR_FmAvg = mean(GR),
ILD_FmAvg = mean(ILD_log10),
DeltaPHI_FmAvg = mean(DeltaPHI),
PHIND_FmAvg = mean(PHIND),
PE_FmAvg = mean(PE, na.rm=T),
NPHI_FmAvg = mean(NPHI),
DPHI_FmAvg = mean(DPHI))
# ------------------------------------------------------------------------------------
# facies averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Facies),
GR_FaciesAvg = mean(GR),
ILD_FaciesAvg = mean(ILD_log10),
DeltaPHI_FaciesAvg = mean(DeltaPHI),
PHIND_FaciesAvg = mean(PHIND),
PE_FaciesAvg = mean(PE, na.rm=T),
NPHI_FaciesAvg = mean(NPHI),
DPHI_FaciesAvg = mean(DPHI))
# ------------------------------------------------------------------------------------
# well averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Well.Name),
GR_WellAvg = mean(GR),
ILD_WellAvg = mean(ILD_log10),
DeltaPHI_WellAvg = mean(DeltaPHI),
PHIND_WellAvg = mean(PHIND),
NPHI_WellAvg = mean(NPHI),
DPHI_WellAvg = mean(DPHI))
# ------------------------------------------------------------------------------------
# lag log data
# ------------------------------------------------------------------------------------
df <- lagData(df, 30, c("GR", "ILD_log10", "DeltaPHI", "PHIND", "isMarine",
"PE_FaciesAvg"))
# ------------------------------------------------------------------------------------
# calculate formation thickness
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation, Well.Name), FmThickness=max(Depth)-min(Depth)+.5)
# ------------------------------------------------------------------------------------
# calculate difference/derivatives
# ------------------------------------------------------------------------------------
df <- firstDiff(df, F)
# ------------------------------------------------------------------------------------
format(head(df, 5), digits=3)
df_fg <- df
In [4]:
t0 <- Sys.time()
df <- df_fg
# split data on whether or not PE has a valid value
# ------------------------------------------------------------------------------------
PE_impute_train <- df[!is.na(df$PE),]
PE_impute_test <- df[is.na(df$PE),]
# ------------------------------------------------------------------------------------
tn <- Sys.time()
"Time elapsed:"
print(tn-t0)
In [5]:
t0 <- Sys.time()
set.seed(1234)
# fit a random forest regressor (like ar4 and HouMath) to impute missing PE
# ------------------------------------------------------------------------------------
PE_wells <- unique(PE_impute_train$Well.Name)
resamp_index <- list()
for (w in PE_wells) {
resamp_index[[w]] <- which(PE_impute_train$Well.Name != w)
}
fitControl <- trainControl(method="cv", index=resamp_index)
fit <- train(PE ~ ., data=PE_impute_train,
method="rf", trControl=fitControl,
tuneLength=10, ntree=50, importance=T)
"Before imputing:"
format(head(PE_impute_test,3), digits=3)
print(fit)
print(fit[["resample"]])
PE_impute_test$PE <- predict(fit, PE_impute_test)
"After imputing:"
format(head(PE_impute_test,3), digits=3)
# ------------------------------------------------------------------------------------
tn <- Sys.time()
"Time elapsed:"
print(tn-t0)
In [6]:
# bring it all back in one data frame
# ------------------------------------------------------------------------------------
t0 <- Sys.time()
"Imputed data:"
df_impute <- subset(rbind(PE_impute_train, PE_impute_test), select=c("Facies", "Formation", "Well.Name", "Depth",
"GR", "ILD_log10", "DeltaPHI", "PHIND", "PE",
"isMarine", "RELPOS", "NPHI", "DPHI"))
format(head(df_impute,3), digits=3)
# ------------------------------------------------------------------------------------
tn <- Sys.time()
"Time elapsed:"
print(tn-t0)
In [111]:
library(dplyr)
source("preProcData.R")
df <- df_impute
# center data by subtracting mean by well
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Well.Name),
GR = GR - mean(GR),
ILD_log10 = ILD_log10 - mean(ILD_log10),
DeltaPHI = DeltaPHI - mean(DeltaPHI),
PHIND = PHIND - mean(PHIND),
PE = PE - mean(PE),
NPHI = NPHI - mean(NPHI),
DPHI = DPHI - mean(DPHI)
)
# ------------------------------------------------------------------------------------
# formation averages of log data
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation),
PE_FmAvg = mean(PE, na.rm=T),
NPHI_FmAvg = mean(NPHI),
DPHI_FmAvg = mean(DPHI))
# ------------------------------------------------------------------------------------
# project well facies onto each other
# ------------------------------------------------------------------------------------
wells <- unique(df$Well.Name)
for (w in wells) {
df[,paste0("Facies", "_", w)] <- NA
}
for (wi in wells) {
df_i <- df[df$Well.Name == wi,]
formations <- unique(df_i$Formation)
for (wj in wells[!wells %in% c(wi)]) {
df_j <- df[df$Well.Name == wj,]
formations <- formations[formations %in% unique(df_j$Formation)]
for (f in formations) {
df_if <- df_i[df_i$Formation == f,]
df_jf <- df_j[df_j$Formation == f,]
for (i in 1:nrow(df_if)) {
depth <- df_if$Depth[i]
relpos_i <- df_if$RELPOS[i]
row_j <- which.min(abs(relpos_i - df_jf$RELPOS))
df[df$Well.Name == wi & df$Depth == depth, paste0("Facies", "_", wj)] <- df_jf$Facies[row_j]
}
}
}
jwells <- wells[!wells %in% c(wi)]
for (i in 1:nrow(df_i)) {
depth <- df_i$Depth[i]
votes <- NULL
for (wj in jwells) {
votes <- c(votes, df[df$Well.Name == wi & df$Depth == depth, paste0("Facies", "_", wj)])
}
vs <- unique(votes)
elected <- unique(votes)[which.max(table(votes))]
df[df$Well.Name == wi & df$Depth == depth, paste0("Facies", "_", wi)] <- ifelse(length(elected)==0, 2, elected[[1]][1])
}
}
df[is.na(df[,"Facies_SHRIMPLIN"]),"Facies_SHRIMPLIN"] <- 2
df[,"Facies_SHRIMPLIN"] <- as.factor(unlist(df[,"Facies_SHRIMPLIN"]))
# ------------------------------------------------------------------------------------
# calculate formation thickness
# ------------------------------------------------------------------------------------
df <- mutate(group_by(df, Formation, Well.Name), FmThickness=(max(Depth)-min(Depth)+.5))
# ------------------------------------------------------------------------------------
df_fe <- subset(df, select=c(Facies, Formation, Well.Name, GR, ILD_log10, DeltaPHI, PHIND, PE, isMarine,
RELPOS, NPHI, DPHI, PE_FmAvg, NPHI_FmAvg, DPHI_FmAvg, Facies_SHRIMPLIN,
FmThickness))
format(head(df_fe, 5), digits=3)
In [113]:
names(df_fe)
In [132]:
library(randomForest)
library(caret)
source("accuracyMetrics.R")
t0 <- Sys.time()
df <- df_fe
# generate resampling list (cross-validation by well)
# ------------------------------------------------------------------------------------
resamp_list <- list()
for (w in unique(df$Well.Name)) {
resamp_list[[w]] <- which(df$Well.Name != w)
}
# ------------------------------------------------------------------------------------
weights <- table(df$Facies[df$Well.Name %in% c("SHRIMPLIN", "CHURCHMAN BIBLE")])
# perform cross-validation by building model and outputting results
# ------------------------------------------------------------------------------------
set.seed(1234)
fitControl <- trainControl(index=resamp_list, method="cv", summaryFunction=myF1MetricCaret)
fit <- train(Facies ~ ., data=subset(df, select=-c(Well.Name)), method="rf",
trControl=fitControl, metric="F1", ntree=100, classwt=weights,
tuneLength=10)
# ------------------------------------------------------------------------------------
print(fit)
print(fit[["resample"]])
tn <- Sys.time()
"Time elapsed:"
print(tn-t0)
In [161]:
library(dplyr)
source("preProcData.R")
# load the hold-out test set
testName <- "../validation_data_nofacies.csv"
df_test <- read.csv(testName, colClasses=c(rep("factor",2), rep("numeric",6), "factor", "numeric"))
df_test <- cleanData(df_test)
df_test$Facies <- NA
# center data by subtracting mean by well
# ------------------------------------------------------------------------------------
df_test <- mutate(group_by(df_test, Well.Name),
GR = GR - mean(GR),
ILD_log10 = ILD_log10 - mean(ILD_log10),
DeltaPHI = DeltaPHI - mean(DeltaPHI),
PHIND = PHIND - mean(PHIND),
PE = PE - mean(PE),
NPHI = NPHI - mean(NPHI),
DPHI = DPHI - mean(DPHI)
)
# ------------------------------------------------------------------------------------
# formation averages of log data
# ------------------------------------------------------------------------------------
df_test <- mutate(group_by(df_test, Formation),
PE_FmAvg = mean(PE, na.rm=T),
NPHI_FmAvg = mean(NPHI),
DPHI_FmAvg = mean(DPHI))
# ------------------------------------------------------------------------------------
# project SHRIMPLIN facies onto wells
# ------------------------------------------------------------------------------------
testWells <- unique(df_test$Well.Name)
df_SHRIMPLIN <- df[df$Well.Name == "SHRIMPLIN",]
for (w in testWells) {
dfw <- df_test[df_test$Well.Name == w,]
formations <- unique(dfw$Formation)
formations <- formations[formations %in% unique(df_SHRIMPLIN$Formation)]
for (f in formations) {
dfwf <- dfw[dfw$Formation == f,]
df_SHRIMPLINf <- df_SHRIMPLIN[df_SHRIMPLIN$Formation == f,]
for (i in 1:nrow(dfwf)) {
depth <- dfwf$Depth[i]
relpos <- dfwf$RELPOS[i]
row_shrimplin <- which.min(abs(relpos-df_SHRIMPLINf$RELPOS))
df_test[df_test$Well.Name == w & df_test$Depth == depth, "Facies_SHRIMPLIN"] <- df_SHRIMPLINf$Facies[row_shrimplin]
}
}
}
levels(df_test$Facies_SHRIMPLIN) <- c(1:9)
# ------------------------------------------------------------------------------------
# calculate formation thickness
# ------------------------------------------------------------------------------------
df_test <- mutate(group_by(df_test, Formation, Well.Name), FmThickness=(max(Depth)-min(Depth)+.5))
# ------------------------------------------------------------------------------------
head(df_test)
In [169]:
df_test$Predicted <- predict(fit, df_test)
df_test$PredictedAsNum <- match(df_test$Predicted, c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS"))
output <- subset(df_test, select=c(Formation, Well.Name, Depth, Predicted, PredictedAsNum))
format(head(output), digits=3)
# save predictions to *.csv file
write.csv(output, "jpoirier008_submission001.csv", row.names=F)