Please refer to the GitHub repository for course materials: github.com/akzaidi/R-cadence
- Modeling with the RevoScaleR package.
- Using RevoScaleR to train a model.
- Using the rxPredict function to test/score a model.
- rxDataStep and xdfs.
- RevoScaleR modeling and datastep functions: rxLinMod, rxGlm, rxLogit, rxDTree, rxDForest, rxSplit, and rxPredict.

Typical Modeling Lifecycle:
- Use the ScaleR modeling functions on the train set to estimate your model.
- Use rxPredict to validate/score your results.
In [ ]:
# Copy the bundled sample mortgage data into the working directory, wrap the
# local copy in an XDF data source, and preview its variables and first rows.
mort_path <- file.path(rxGetOption("sampleDataDir"), "mortDefaultSmall.xdf")
file.copy(from = mort_path, to = "mortgage.xdf", overwrite = TRUE)
mort_xdf <- RxXdfData("mortgage.xdf")
rxGetInfo(mort_xdf, numRows = 5, getVarInfo = TRUE)
In [ ]:
# Add `default_flag`, a labelled factor version of the 0/1 `default` column,
# writing the result back into the same XDF file.
rxDataStep(inData = mort_xdf,
           outFile = mort_xdf,
           overwrite = TRUE,
           transforms = list(
             default_flag = factor(
               ifelse(default == 1, "default", "current"),
               # RevoScaleR applies transforms one chunk at a time, so the
               # levels are pinned here; otherwise a chunk containing only one
               # outcome would produce a factor with a different level set.
               levels = c("current", "default")
             )
           ))
rxGetInfo(mort_xdf, numRows = 3, getVarInfo = TRUE)
Use rxSplit to create partitions. rxSplit splits an input .xdf into multiple .xdfs, similar in spirit to the split function in base R.
In [ ]:
# Randomly label each row of `xdf` as "train" or "validate", then split the
# data into one file per label (two passes over the data).
#
# Args:
#   xdf:            data source used as both input and output of rxDataStep,
#                   then as the input of rxSplit.
#   partition_size: probability that a row is labelled "train".
#   ...:            extra arguments forwarded to rxDataStep.
#
# Returns:
#   The value returned by rxSplit().
create_partition <- function(xdf = mort_xdf,
                             partition_size = 0.75, ...) {
  # Pass 1: write the random train/validate label back into `xdf` itself.
  rxDataStep(inData = xdf,
             outFile = xdf,
             transforms = list(
               trainvalidate = factor(
                 ifelse(rbinom(.rxNumRows, size = 1, prob = splitperc),
                        "train", "validate"),
                 # Transforms run chunk by chunk; pin the levels so every
                 # chunk yields the same factor even if one label is absent.
                 levels = c("train", "validate")
               )
             ),
             transformObjects = list(splitperc = partition_size),
             overwrite = TRUE, ...)

  # Pass 2: split on the label written above.
  splitDS <- rxSplit(inData = xdf,
                     #outFilesBase = ,
                     outFileSuffixes = c("train", "validate"),
                     splitByFactor = "trainvalidate",
                     overwrite = TRUE)
  splitDS
}
While the above example does what we want it to do, it's not very efficient. It requires two passes over the data, first to add the trainvalidate column, and then another to split it into train and validate sets. We could do all of that in a single step if we pass the transforms directly to rxSplit.
In [ ]:
# Randomly label each row of `xdf` as "train" or "validate" and split on that
# label in a single rxSplit call (the label is created by rxSplit's own
# transforms, so no separate rxDataStep pass is needed).
#
# Args:
#   xdf:            data source passed to rxSplit as `inData`.
#   partition_size: probability that a row is labelled "train".
#   ...:            extra arguments forwarded to rxSplit
#                   (e.g. reportProgress = 0).
#
# Returns:
#   The value returned by rxSplit().
create_partition <- function(xdf = mort_xdf,
                             partition_size = 0.75, ...) {
  splitDS <- rxSplit(inData = xdf,
                     transforms = list(
                       trainvalidate = factor(
                         ifelse(rbinom(.rxNumRows, size = 1, prob = splitperc),
                                "train", "validate"),
                         # Transforms run chunk by chunk; pin the levels so
                         # every chunk yields the same factor.
                         levels = c("train", "validate")
                       )
                     ),
                     transformObjects = list(splitperc = partition_size),
                     outFileSuffixes = c("train", "validate"),
                     splitByFactor = "trainvalidate",
                     overwrite = TRUE,
                     # Previously `...` was accepted but silently dropped.
                     ...)
  splitDS
}
In [ ]:
# Build the partitions, label the two outputs, and inspect each one.
mort_split <- setNames(
  create_partition(reportProgress = 0),
  c("train", "validate")
)
lapply(mort_split, rxGetInfo)
RevoScaleR provides a plethora of modeling functions to choose from: decision trees, ensemble trees, linear models, and generalized linear models
In [ ]:
# Build a model formula `resp_var ~ <all remaining columns>`.
#
# Args:
#   xdf:          any object whose names() are the candidate column names.
#   resp_var:     name of the response column (left-hand side).
#   vars_to_skip: column names to leave out of the right-hand side.
#
# Returns:
#   A formula object.
#
# Columns are excluded by exact name. (Substring/regex matching would, for
# example, drop `yearsEmploy` whenever the response is `year`.)
make_form <- function(xdf = mort_xdf,
                      resp_var = "default_flag",
                      vars_to_skip = c("default", "trainvalidate")) {
  features <- setdiff(names(xdf), c(resp_var, vars_to_skip))
  if (length(features) == 0L) {
    stop("no predictor columns left after removing the response and ",
         "skipped variables", call. = FALSE)
  }
  form <- as.formula(paste(resp_var, paste0(features, collapse = " + "),
                           sep = " ~ "))
  form
}
## Turns out, RevoScaleR already has a function for this.
## Drop the raw 0/1 `default` column (the source of `default_flag`) so the
## response does not leak into the predictors.
formula(mort_xdf, depVar = "default_flag", varsToDrop = c("default", "trainvalidate"))
In [ ]:
# Show the formula built with all defaults: mort_xdf as the data source,
# default_flag as the response, and default/trainvalidate skipped.
make_form()
# Fit a model by calling `model` on a formula and a data source.
#
# Args:
#   xdf_data: data passed to the modeling function as `data`.
#   form:     model formula; by default built from `xdf_data` via make_form().
#   model:    modeling function to call (rxLogit unless overridden).
#   ...:      extra arguments handed through to `model`.
#
# Returns:
#   Whatever `model` returns.
estimate_model <- function(xdf_data = mort_split[["train"]],
                           form = make_form(xdf_data),
                           model = rxLogit, ...) {
  fitted_model <- model(form, data = xdf_data, ...)
  fitted_model
}
In [ ]:
# Fit the default model type (rxLogit) on the training partition.
default_model_logit <- estimate_model(
  xdf_data = mort_split$train,
  reportProgress = 0
)
summary(default_model_logit)
In [ ]:
# Fit a decision tree (rxDTree) on the training partition.
default_model_tree <- estimate_model(
  xdf_data = mort_split$train,
  model = rxDTree,
  minBucket = 10,
  reportProgress = 0
)
summary(default_model_tree)

# Plot the tree view built by createTreeView().
library(RevoTreeView)
plot(createTreeView(default_model_tree))
In [ ]:
# Set the stringsAsFactors option and clear any scored.xdf left by an earlier
# run.
options(stringsAsFactors = TRUE)
if (file.exists("scored.xdf")) {
  file.remove("scored.xdf")
}
Use the rxPredict function to score/validate our results.
In [ ]:
# Score the validation partition with the logit model, writing the
# predictions (plus the model variables and `default`) to scored.xdf.
default_logit_scored <- rxPredict(
  default_model_logit,
  data = mort_split$validate,
  outData = "scored.xdf",
  writeModelVars = TRUE,
  extraVarsToWrite = "default",
  predVarNames = "pred_logit_default"
)
rxGetInfo(default_logit_scored, numRows = 2)
In [ ]:
# Plot the ROC computed from the logit predictions against the observed
# `default` column.
plot(
  rxRoc(
    actualVarName = "default",
    predVarNames = "pred_logit_default",
    data = default_logit_scored
  )
)
In [ ]:
# Score the validation partition with the tree model, writing to the same
# scored.xdf used for the logit predictions.
default_tree_scored <- rxPredict(
  default_model_tree,
  data = mort_split$validate,
  outData = "scored.xdf",
  writeModelVars = TRUE,
  predVarNames = c("pred_tree_current", "pred_tree_default")
)
In [ ]:
# Draw ROC curves for the logit and tree predictions together.
rxRocCurve(
  actualVarName = "default",
  predVarNames = c("pred_logit_default", "pred_tree_default"),
  data = default_tree_scored
)
Two of the most predictive algorithms in the RevoScaleR package are the rxBTrees and rxDForest algorithms, for gradient boosted decision trees and random forests, respectively.
Use the above functions and estimate a model for each of those algorithms, and add them to the default_tree_scored dataset to visualize ROC and AUC metrics.
In [ ]:
## Starter code
# Exercise template: the `?` and the blank argument values below are
# placeholders. This cell will not run until they are filled in.
# Forest model: estimated on the training partition via estimate_model().
default_model_forest <- estimate_model(mort_split$train,
model = ?,
nTree = 100,
importance = ,
### any other args?,
reportProgress = 0)
# Score the validation partition with the forest model into scored.xdf.
default_forest_scored <- rxPredict(default_model_forest,
mort_split$validate,
"scored.xdf",
type = 'prob',
predVarNames = c("pred_forest_current", "pred_forest_default", "pred_default"))
## same for rxBTrees
# Boosted-trees model: `model` and `nTree` are left for the reader to supply.
default_model_gbm <- estimate_model(mort_split$train,
model = ,
importance = TRUE,
nTree = ,
### any other args?,
reportProgress = 0)
# Score the validation partition with the boosted model into scored.xdf.
default_gbm_scored <- rxPredict(default_model_gbm,
mort_split$validate,
"scored.xdf",
predVarNames = c("pred_gbm_default"))
In [ ]:
#
# rxRocCurve(actualVarName = "default",
# predVarNames = c("pred_tree_default",
# "pred_logit_default",
# "pred_forest_default",
# "pred_gbm_default"),
# data = 'scored.xdf')
rx functions will work on non-xdf data sources
In [ ]:
# Copy the bundled 2009 sample CSV into the working directory and wrap the
# local copy in a text data source.
csv_path <- file.path(rxGetOption("sampleDataDir"), "mortDefaultSmall2009.csv")
file.copy(from = csv_path, to = "mortDefaultSmall2009.csv", overwrite = TRUE)
mort_csv <- RxTextData("mortDefaultSmall2009.csv")
In [ ]:
# Fit a tree with ccDebt as the response, leaving default_flag and
# trainvalidate out of the predictors.
tree_model_ccdebt <- estimate_model(
  xdf_data = mort_split$train,
  form = make_form(
    xdf = mort_split$train,
    resp_var = "ccDebt",
    vars_to_skip = c("default_flag", "trainvalidate")
  ),
  model = rxDTree
)
# plot(RevoTreeView::createTreeView(tree_model_ccdebt))
In [ ]:
# Remove predictions left over from an earlier run, if any.
if (file.exists("mort2009predictions.xdf")) {
  file.remove("mort2009predictions.xdf")
}
In [ ]:
# Score the CSV data source with the ccDebt tree, writing the predictions and
# model variables to an XDF file, then open and preview that file.
rxPredict(
  tree_model_ccdebt,
  data = mort_csv,
  outData = "mort2009predictions.xdf",
  writeModelVars = TRUE
)
mort_2009_pred <- RxXdfData("mort2009predictions.xdf")
rxGetInfo(mort_2009_pred, numRows = 1)
Use rxFactors to convert year to a factor variable.
In [ ]:
# Convert `year` to a factor, writing the result to mort_year.xdf.
mort_xdf_factor <- rxFactors(
  inData = mort_xdf,
  outFile = "mort_year.xdf",
  factorInfo = "year",
  overwrite = TRUE
)
In [ ]:
# Preview the variable info and first rows of the factor-converted data.
rxGetInfo(mort_xdf_factor, numRows = 4, getVarInfo = TRUE)
In [ ]:
# Fit a multiclass tree with the factor `year` as the response.
# `default_flag` is listed explicitly alongside `default`, so its exclusion
# does not depend on "default" happening to match it as a substring.
tree_multiclass_year <- estimate_model(
  xdf_data = mort_xdf_factor,
  form = make_form(
    mort_xdf_factor,
    "year",
    vars_to_skip = c("default", "default_flag", "trainvalidate")
  ),
  model = rxDTree
)
In [ ]:
# Score the factor-converted data with the year tree, writing the predictions
# and model variables to multi.xdf (replacing any existing file).
multiclass_preds <- rxPredict(
  tree_multiclass_year,
  data = mort_xdf_factor,
  outData = "multi.xdf",
  writeModelVars = TRUE,
  overwrite = TRUE
)
In [ ]:
# Preview the first rows of the multiclass predictions.
rxGetInfo(data = multiclass_preds, numRows = 3)