In [1]:
# Import packages
from swat import *
from pprint import pprint
from swat.render import render_html
from matplotlib import pyplot as plt
import pandas as pd
import sys
%matplotlib inline
In [2]:
# Start a CAS session
cashost='<your CAS server here>'
casport=<your CAS server port here>
casauth="~/.authinfo"
sess = CAS(cashost, casport, authinfo=casauth, caslib="public")
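In [ ]:
# Optional sanity check: confirm the session is connected before proceeding.
# A minimal sketch using the builtins serverStatus action; the output is informational only.
sess.serverstatus()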
In [3]:
# Set helper variables
gcaslib="public"
prepped_data="bank_prepped"
target = {"b_tgt"}
class_inputs = {"cat_input1", "cat_input2", "demog_ho", "demog_genf"}
interval_inputs = {"im_demog_age", "im_demog_homeval", "im_demog_inc", "demog_pr", "log_rfm1", "rfm2", "log_im_rfm3", "rfm4", "rfm5", "rfm6", "rfm7", "rfm8", "rfm9", "rfm10", "rfm11", "rfm12"}
class_vars = target | class_inputs
In [4]:
# Load action set
sess.loadactionset(actionset="regression")
# Train Logistic Regression
lr = sess.regression.logistic(
    table={"name":prepped_data, "caslib":gcaslib},
    classVars=[{"vars":class_vars}],
    model={
        "depVars":[{"name":"b_tgt", "options":{"event":"1"}}],
        "effects":[{"vars":class_inputs | interval_inputs}]
    },
    partByVar={"name":"_partind_", "train":"1", "valid":"0"},
    selection={"method":"STEPWISE"},
    output={"casOut":{"name":"_scored_logistic", "replace":True},
            "copyVars":{"account", "b_tgt", "_partind_"}}
)
# Output model statistics
render_html(lr)
# Compute p_b_tgt0 and p_b_tgt1 for assessment
sess.dataStep.runCode(
    code="data _scored_logistic; set _scored_logistic; p_b_tgt0=1-_pred_; rename _pred_=p_b_tgt1; run;"
)
Out[4]:
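In [ ]:
# Optional check: preview a few rows of the scored logistic table to confirm that
# p_b_tgt0 and p_b_tgt1 are present. A minimal sketch using the table.fetch action.
sess.table.fetch(table={"name":"_scored_logistic"}, to=5)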
In [5]:
# 1. Load GBM model (ASTORE) created in VA
sess.loadTable(
    caslib="models", path="Gradient_Boosting_VA.sashdat",
    casout={"name":"gbm_astore_model", "caslib":"casuser", "replace":True}
)
# 2. Score code from VA (for data preparation)
sess.dataStep.runCode(
    code="""data bank_part_post;
        set bank_part(caslib='public');
        _va_calculated_54_1=round('b_tgt'n,1.0);
        _va_calculated_54_2=round('demog_genf'n,1.0);
        _va_calculated_54_3=round('demog_ho'n,1.0);
        _va_calculated_54_4=round('_PartInd_'n,1.0);
    run;"""
)
# 3. Score using ASTORE
sess.loadactionset(actionset="astore")
sess.astore.score(
    table={"name":"bank_part_post"},
    rstore={"name":"gbm_astore_model"},
    out={"name":"_scored_gbm", "replace":True},
    copyVars={"account", "_partind_", "b_tgt"}
)
# 4. Rename p_b_tgt0 and p_b_tgt1 for assessment
sess.dataStep.runCode(
    code="""data _scored_gbm;
        set _scored_gbm;
        rename p__va_calculated_54_10=p_b_tgt0
               p__va_calculated_54_11=p_b_tgt1;
    run;"""
)
Out[5]:
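In [ ]:
# Optional check: describe the imported ASTORE to list its required input variables
# and the probability columns it produces (the columns renamed above).
# A minimal sketch using the astore.describe action.
sess.astore.describe(rstore={"name":"gbm_astore_model"})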
In [6]:
# Load action set
sess.loadactionset(actionset="decisionTree")
# Score using forest_model table
sess.decisionTree.forestScore(
    table={"name":prepped_data, "caslib":gcaslib},
    modelTable={"name":"forest_model", "caslib":"public"},
    casOut={"name":"_scored_rf", "replace":True},
    copyVars={"account", "b_tgt", "_partind_"},
    vote="PROB"
)
# Derive p_b_tgt0 and p_b_tgt1: _rf_predp_ holds the probability of the predicted
# level stored in _rf_predname_
sess.dataStep.runCode(
    code="""data _scored_rf;
        set _scored_rf;
        if _rf_predname_=1 then do;
            p_b_tgt1=_rf_predp_;
            p_b_tgt0=1-p_b_tgt1;
        end;
        if _rf_predname_=0 then do;
            p_b_tgt0=_rf_predp_;
            p_b_tgt1=1-p_b_tgt0;
        end;
    run;"""
)
Out[6]:
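In [ ]:
# Optional check: list the columns of the scored forest table to confirm that
# p_b_tgt0 and p_b_tgt1 were derived. A minimal sketch using the table.columnInfo action.
sess.table.columnInfo(table="_scored_rf")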
In [7]:
# Score using ASTORE
sess.loadactionset(actionset="astore")
sess.astore.score(
    table={"name":prepped_data, "caslib":gcaslib},
    rstore={"name":"svm_astore_model", "caslib":"public"},
    out={"name":"_scored_svm", "replace":True},
    copyVars={"account", "_partind_", "b_tgt"}
)
Out[7]:
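In [ ]:
# The SVM ASTORE writes p_b_tgt0 and p_b_tgt1 directly, so no rename step is needed
# before assessment. Optional check (a minimal sketch): preview the scored SVM table.
sess.table.fetch(table={"name":"_scored_svm"}, to=5)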
In [8]:
# Assess models
def assess_model(prefix):
    return sess.percentile.assess(
        table={
            "name":"_scored_" + prefix,
            "where":"strip(put(_partind_, best.))='0'"
        },
        inputs=[{"name":"p_b_tgt1"}],
        response="b_tgt",
        event="1",
        pVar={"p_b_tgt0"},
        pEvent={"0"}
    )
lrAssess = assess_model(prefix="logistic")
lr_fitstat  = lrAssess.FitStat
lr_rocinfo  = lrAssess.ROCInfo
lr_liftinfo = lrAssess.LIFTInfo

rfAssess = assess_model(prefix="rf")
rf_fitstat  = rfAssess.FitStat
rf_rocinfo  = rfAssess.ROCInfo
rf_liftinfo = rfAssess.LIFTInfo

gbmAssess = assess_model(prefix="gbm")
gbm_fitstat  = gbmAssess.FitStat
gbm_rocinfo  = gbmAssess.ROCInfo
gbm_liftinfo = gbmAssess.LIFTInfo

svmAssess = assess_model(prefix="svm")
svm_fitstat  = svmAssess.FitStat
svm_rocinfo  = svmAssess.ROCInfo
svm_liftinfo = svmAssess.LIFTInfo
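In [ ]:
# Optional: stack the validation fit statistics from each model into a single table
# for side-by-side comparison. A minimal sketch; the FitStat results are returned as
# pandas DataFrames, so pd.concat applies directly.
all_fitstat = pd.concat(
    [lr_fitstat, rf_fitstat, gbm_fitstat, svm_fitstat],
    keys=["Logistic", "Forest", "Gradient Boosting", "SVM"]
)
all_fitstat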
In [9]:
# Add new variable to indicate type of model
lr_liftinfo["model"]="Logistic (Python API)"
lr_rocinfo["model"]='Logistic (Python API)'
rf_liftinfo["model"]="Autotuned Forest (SAS Studio)"
rf_rocinfo["model"]="Autotuned Forest (SAS Studio)"
gbm_liftinfo["model"]="Gradient Boosting (SAS VA)"
gbm_rocinfo["model"]="Gradient Boosting (SAS VA)"
svm_liftinfo["model"]="SVM (SAS Studio)"
svm_rocinfo["model"]="SVM (SAS Studio)"
# Combine the per-model lift and ROC results (pd.concat replaces DataFrame.append,
# which was removed in recent pandas releases)
all_liftinfo = pd.concat([lr_liftinfo, rf_liftinfo, gbm_liftinfo, svm_liftinfo],
                         ignore_index=True)
all_rocinfo = pd.concat([lr_rocinfo, rf_rocinfo, gbm_rocinfo, svm_rocinfo],
                        ignore_index=True)
print("AUC (using validation data)".center(80, '-'))
all_rocinfo[["model", "C"]].drop_duplicates(keep="first").sort_values(by="C", ascending=False)
Out[9]:
In [10]:
# Draw ROC charts
plt.figure()
for key, grp in all_rocinfo.groupby("model"):
    plt.plot(grp["FPR"], grp["Sensitivity"], label=key)
plt.plot([0,1], [0,1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.legend(loc="best")
plt.title("ROC Curve (using validation data)")
plt.show()
# Draw lift charts
plt.figure()
for key, grp in all_liftinfo.groupby("model"):
    plt.plot(grp["Depth"], grp["Lift"], label=key)
plt.xlabel("Depth")
plt.ylabel("Lift")
plt.grid(True)
plt.legend(loc="best")
plt.title("Lift Chart (using validation data)")
plt.show()
In [ ]:
# Close the CAS session
# sess.close()