# In [1]:
# Import packages
library('swat')
library('ggplot2')
library('repr')
# In [2]:
# Start a CAS session
# This uses REST (protocol=http)
# NOTE(review): replace both placeholders with your CAS host and port before
# running — as written this line is a template and will not parse.
# caslib="public" makes "public" the session's active caslib.
sess <- swat::CAS('<your CAS server here>', <your CAS server port here>, protocol='http', caslib="public")
# In [3]:
# Shared helper settings used by every modeling cell below
gcaslib <- "public"             # caslib that holds the prepped table
prepped_data <- "bank_prepped"  # analysis-ready input table
target <- c("b_tgt")            # binary response variable

# Categorical predictors
class_inputs <- c("cat_input1", "cat_input2", "demog_ho", "demog_genf", "demog_genm")

# Interval predictors (rfm4..rfm12 generated rather than spelled out)
interval_inputs <- c(
  "im_demog_age", "im_demog_homeval", "im_demog_inc", "demog_pr",
  "log_rfm1", "rfm2", "log_im_rfm3", paste0("rfm", 4:12)
)

# Every variable that must be treated as a class (categorical) variable
class_vars <- c(target, class_inputs)
# In [4]:
# Load the regression action set and fit a stepwise logistic model
loadActionSet(sess, "regression")

# Model specification: binary target b_tgt (event level "1") regressed on
# all categorical and interval inputs
logit_model <- list(
  depVars=list(list(name="b_tgt", options=list(event="1"))),
  effects=list(list(vars=c(class_inputs, interval_inputs)))
)

cas.regression.logistic(
  sess,
  table=list(name=prepped_data, caslib=gcaslib),
  class=list(vars=class_vars),
  model=logit_model,
  partByVar=list(name="_partind_", train="1", valid="0"),
  selection=list(method="STEPWISE"),
  output=list(
    casOut=list(name="_scored_logistic", replace=TRUE),
    copyVars=list("account", "b_tgt", "_partind_")
  )
)

# Derive p_b_tgt0 (non-event) and rename _pred_ to p_b_tgt1 (event)
# so downstream assessment finds both probability columns
cas.dataStep.runCode(
  sess,
  code="data _scored_logistic; set _scored_logistic; p_b_tgt0=1-_pred_; rename _pred_=p_b_tgt1; run;"
)
# In [5]:
# 1. Load the GBM model (ASTORE) created in SAS Visual Analytics from the
#    "models" caslib into the session's casuser caslib
cas.table.loadTable(
sess,
caslib="models",
path="Gradient_Boosting_VA.sashdat",
casout=list(name="gbm_astore_model", caslib="casuser", replace=TRUE)
)
# 2. Data-prep score code exported from VA: recreates the rounded helper
#    columns (_va_calculated_54_*) that the ASTORE expects as inputs
cas.dataStep.runCode(
sess,
code="data bank_part_post;
set bank_part(caslib='public');
_va_calculated_54_1=round('b_tgt'n,1.0);
_va_calculated_54_2=round('demog_genf'n,1.0);
_va_calculated_54_3=round('demog_ho'n,1.0);
_va_calculated_54_4=round('_PartInd_'n,1.0);
run;"
)
# 3. Score the prepared table with the ASTORE, writing to _scored_gbm
loadActionSet(sess, "astore")
cas.astore.score(
sess,
table=list(name="bank_part_post"),
rstore=list(name="gbm_astore_model"),
out=list(name="_scored_gbm", replace=TRUE),
copyVars=list("account", "_partind_", "b_tgt")
)
# 4. Rename the VA-generated probability columns to the p_b_tgt0/p_b_tgt1
#    names the shared assessment step expects
cas.dataStep.runCode(
sess,
code="data _scored_gbm;
set _scored_gbm;
rename p__va_calculated_54_10=p_b_tgt0
p__va_calculated_54_11=p_b_tgt1;
run;"
)
# In [6]:
# Load the decisionTree action set (provides forestScore)
loadActionSet(sess, "decisionTree")
# Score the prepped table with the forest_model table (built in SAS Studio);
# vote="PROB" requests probability-based voting
cas.decisionTree.forestScore(
sess,
table=list(name=prepped_data, caslib=gcaslib),
modelTable=list(name="forest_model", caslib="public"),
casOut=list(name="_scored_rf", replace=TRUE),
copyVars=list("account", "b_tgt", "_partind_"),
vote="PROB"
)
# Derive p_b_tgt0/p_b_tgt1: _rf_predp_ is the probability of the predicted
# level stored in _rf_predname_, so assign it to the matching column and
# take the complement for the other level
cas.dataStep.runCode(
sess,
code="data _scored_rf;
set _scored_rf;
if _rf_predname_=1 then do;
p_b_tgt1=_rf_predp_;
p_b_tgt0=1-p_b_tgt1;
end;
if _rf_predname_=0 then do;
p_b_tgt0=_rf_predp_;
p_b_tgt1=1-p_b_tgt0;
end;
run;"
)
# In [7]:
# Score the prepped table with the pre-trained SVM ASTORE held in the
# "public" caslib, writing results to _scored_svm
loadActionSet(sess, "astore")
svm_input <- list(name = prepped_data, caslib = gcaslib)
svm_store <- list(name = "svm_astore_model", caslib = "public")
cas.astore.score(
  sess,
  table = svm_input,
  rstore = svm_store,
  out = list(name = "_scored_svm", replace = TRUE),
  copyVars = list("account", "_partind_", "b_tgt")
)
# In [8]:
# Assess a scored model on the validation partition (_partind_ = 0).
# `prefix` selects the scored CAS table "_scored_<prefix>" (e.g. "logistic").
# Returns the cas.percentile.assess result (FitStat, ROCInfo, LIFTInfo).
assess.model <- function(prefix) {
  scored_tbl <- list(
    name = paste0('_scored_', prefix),
    where = '_partind_=0'  # validation rows only
  )
  cas.percentile.assess(
    sess,
    table = scored_tbl,
    inputs = "p_b_tgt1",   # predicted probability of the event level
    response = "b_tgt",
    event = "1",
    pVar = list("p_b_tgt0"),
    pEvent = list("0")
  )
}
# Assess each model on the validation partition and unpack the three result
# tables: FitStat (fit statistics), ROCInfo (ROC/AUC), LIFTInfo (lift by depth)
# Logistic regression (trained via the R API above)
lrAssess <- assess.model("logistic")
lr_fitstat <- lrAssess$FitStat
lr_rocinfo <- lrAssess$ROCInfo
lr_liftinfo <- lrAssess$LIFTInfo
# Autotuned forest (scored from forest_model)
rfAssess <- assess.model("rf")
rf_fitstat <- rfAssess$FitStat
rf_rocinfo <- rfAssess$ROCInfo
rf_liftinfo <- rfAssess$LIFTInfo
# Gradient boosting (VA ASTORE)
gbmAssess <- assess.model("gbm")
gbm_fitstat <- gbmAssess$FitStat
gbm_rocinfo <- gbmAssess$ROCInfo
gbm_liftinfo<- gbmAssess$LIFTInfo
# Support vector machine (ASTORE)
svmAssess <- assess.model("svm")
svm_fitstat <- svmAssess$FitStat
svm_rocinfo <- svmAssess$ROCInfo
svm_liftinfo<- svmAssess$LIFTInfo
# In [9]:
# Tag each model's assessment output with a human-readable label so the
# combined plots can distinguish the four models
lr_liftinfo$model <- "Logistic (R API)"
lr_rocinfo$model <- "Logistic (R API)"
rf_liftinfo$model <- "Autotuned Forest (SAS Studio)"
rf_rocinfo$model <- "Autotuned Forest (SAS Studio)"
gbm_liftinfo$model <- "Gradient Boosting (SAS VA)"
gbm_rocinfo$model <- "Gradient Boosting (SAS VA)"
svm_liftinfo$model <- "SVM (SAS Studio)"
svm_rocinfo$model <- "SVM (SAS Studio)"
# Stack all models' results into one data frame each; rbind's data.frame
# method accepts any number of frames, so no nesting is needed
all_liftinfo <- rbind(lr_liftinfo, rf_liftinfo, gbm_liftinfo, svm_liftinfo)
all_rocinfo <- rbind(lr_rocinfo, rf_rocinfo, gbm_rocinfo, svm_rocinfo)
# In [10]:
# Rank models by AUC (the C statistic) computed on the validation partition;
# the last expression auto-prints the sorted table in the notebook
print("----- AUC (using validation data) -----")
all_rocinfo2 <- unique(all_rocinfo[, c("model", "C")])
all_rocinfo2[order(all_rocinfo2$C, decreasing = TRUE), ]
# In [11]:
# Render plots as PNG so they display in all browsers
options(jupyter.plot_mimetypes = "image/png")
# Plot dimensions (inches) for the repr-based notebook display
options(repr.plot.width = 6, repr.plot.height = 4)

# ROC curves for all models, with the chance diagonal for reference
roc_df <- all_rocinfo[c('FPR', 'Sensitivity', 'model')]
ggplot(roc_df, aes(x = as.numeric(FPR), y = as.numeric(Sensitivity), color = model)) +
  geom_line() +
  geom_abline(linetype = 'dashed') +
  coord_fixed() +
  labs(title = 'ROC Curve (using validation data)',
       x = 'False Positive Rate', y = 'True Positive Rate')

# Lift curves for all models
lift_df <- all_liftinfo[c('Depth', 'Lift', 'model')]
ggplot(lift_df, aes(x = as.numeric(Depth), y = as.numeric(Lift), color = model)) +
  geom_line() +
  labs(title = 'Lift Chart (using validation data)', x = 'Depth', y = 'Lift')