This is a simple end-to-end example of how you can use SAS Viya for analysis. The example follows these steps:
- Import the needed packages and start a CAS session.
- Load and explore the HMEQ data.
- Impute missing values and partition the data into training and validation sets.
- Train a decision tree, a forest, a gradient boosting model, and a neural network.
- Assess and compare the models with ROC curves and cumulative lift charts.
Documentation for all of the CAS actions used in this example is available in the SAS Viya documentation.
In this code we import the needed packages and assign variables for the modeling details that are used later in the analysis.
In [ ]:
import os
import pandas as pd
import swat
import sys
from matplotlib import pyplot as plt
%matplotlib inline
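# Modeling metadata: the target, the nominal (class) inputs, and the interval inputs for the hmeq data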
target = "bad"
class_inputs = ["reason", "job"]
class_vars = [target] + class_inputs
interval_inputs = ["im_clage", "clno", "im_debtinc", "loan", "mortdue", "value", "im_yoj", "im_ninq", "derog", "im_delinq"]
all_inputs = interval_inputs + class_inputs
indata = 'hmeq'
In this code we assign values for cashost and casport, which are then used to establish a CAS session named sess, and we load the action sets that are needed for this example. Depending on your deployment, you might also need to supply authentication details (the casauth value); a sketch follows the cell.
In [ ]:
cashost='localhost'
casport=5570
sess = swat.CAS(cashost, casport)
# Load the needed action sets for this example:
sess.loadactionset('datastep')
sess.loadactionset('datapreprocess')
sess.loadactionset('cardinality')
sess.loadactionset('sampling')
sess.loadactionset('regression')
sess.loadactionset('decisiontree')
sess.loadactionset('neuralnet')
sess.loadactionset('svm')
sess.loadactionset('astore')
sess.loadactionset('percentile')
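The session above connects without explicit credentials. The sketch below is not part of the original example and uses placeholder values; it shows two common ways to pass authentication details to swat.CAS.
In [ ]:
# A minimal sketch with placeholder credentials; adjust to your deployment.
# Option 1: an explicit user name and password
# sess = swat.CAS(cashost, casport, username='myuser', password='mypassword')
# Option 2: an .authinfo file referenced through the authinfo= parameter
# sess = swat.CAS(cashost, casport, authinfo='~/.authinfo')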
In [ ]:
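# Use the hmeq table if it is already loaded in CAS; otherwise upload the example CSV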
indata = sess.CASTable('hmeq')
if not indata.tableexists().exists:
    indata = sess.upload_file('http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv', casout=indata)
In [ ]:
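# Compute summary statistics for the numeric variables in the table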
indata.summary()
In [ ]:
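# Compute cardinality and missing-value counts per variable, keeping only variables with missing values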
tbl_data_card = sess.CASTable('data_card', replace=True)
indata.cardinality.summarize(cardinality=tbl_data_card)
tbl_data_card = tbl_data_card.query('_NMISS_ > 0')
tbl_data_card.head()
In [ ]:
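# Compute the percentage of missing values per variable and plot it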
tbl_data_card['PERCENT_MISSING'] = (tbl_data_card['_NMISS_'] / tbl_data_card['_NOBS_']) * 100
ax = tbl_data_card[['_VARNAME_', 'PERCENT_MISSING']].to_frame().set_index('_VARNAME_').plot.bar(
title='Percentage of Missing Values', figsize=(15,7)
)
ax.set_ylabel('Percent Missing')
ax.set_xlabel('Variable Names');
In [ ]:
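# Impute missing values: mean for clage, median for delinq, and fixed replacement values for ninq, debtinc, and yoj.
# The new imputed variables are prefixed with im_.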
hmeq_prepped = sess.CASTable('hmeq_prepped', replace=True)
indata.datapreprocess.transform(
casout=hmeq_prepped,
copyallvars=True,
outvarsnameglobalprefix='im',
requestpackages=[
{'impute': {'method': 'mean'}, 'inputs': ['clage']},
{'impute': {'method': 'median'}, 'inputs': ['delinq']},
{'impute': {'method': 'value', 'valuescontinuous': [2]}, 'inputs': ['ninq']},
{'impute': {'method': 'value', 'valuescontinuous': [35.0, 7, 2]}, 'inputs': ['debtinc', 'yoj']}
]
)
In [ ]:
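# Stratified sampling by the target: 70% of each class is flagged as training (_partind_ = 1), the rest as validation (_partind_ = 0)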
hmeq_part = sess.CASTable('hmeq_part', replace=True)
hmeq_prepped.groupby(target).sampling.stratified(
output=dict(casout=hmeq_part, copyvars='all'),
samppct=70,
partind=True
)
In this code block we do the following:
- Train a decision tree on the training partition and save the model to the CAS table tree_model. The model table is used in the subsequent step, but it could just as easily have been used a day, week, or month from now (a sketch of persisting the model table follows the cell).
- Score the partitioned data with the tree_model table that was created in the previous step.
- Create the p_bad0 and p_bad1 probability columns from the scored output.
In [ ]:
hmeq_part_1 = hmeq_part.query('_partind_ = 1')
tree_model = sess.CASTable('tree_model', replace=True)
scored_tree = sess.CASTable('_scored_tree', replace=True)
hmeq_part_1.decisiontree.dtreetrain(
inputs=all_inputs,
target='bad',
nominals=class_vars,
crit='gain',
prune=True,
varImp=True,
missing='useinsearch',
casout=tree_model
)
# Score
hmeq_part.decisiontree.dtreescore(
modeltable=tree_model,
casout=scored_tree,
copyvars=['bad', '_partind_']
)
# Create p_bad0 and p_bad1 as _dt_predp_ is the probability of event in _dt_predname_
scored_tree['p_bad1'] = scored_tree.eval("ifn( strip(_dt_predname_) = '1', _dt_predp_, 1-_dt_predp_ )")
scored_tree['p_bad0'] = scored_tree.eval("ifn( strip(_dt_predname_) = '0', 1-_dt_predp_, _dt_predp_ )")
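Because the fitted model is stored as an ordinary CAS table, it can be persisted and reloaded whenever it is needed again, which is what makes reuse a day, week, or month later possible. The following sketch is not part of the original example; it assumes a writable caslib named casuser.
In [ ]:
# A minimal sketch of saving the model table to disk and reloading it later (assumed caslib 'casuser'):
# sess.table.save(table=tree_model, name='tree_model.sashdat', caslib='casuser', replace=True)
# In a later session, reload it before calling dtreescore:
# tree_model = sess.table.loadtable(path='tree_model.sashdat', caslib='casuser',
#                                   casout={'name': 'tree_model', 'replace': True}).casTable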
In this code block we do the following:
- Train a forest on the training partition and save the model to the CAS table forest_model. The model table is used in the subsequent step, but it could just as easily have been used a day, week, or month from now.
- Score the partitioned data with the forest_model table that was created in the previous step.
- Create the p_bad0 and p_bad1 probability columns from the scored output.
In [ ]:
forest_model = sess.CASTable('forest_model', replace=True)
scored_rf = sess.CASTable('_scored_rf', replace=True)
hmeq_part_1.decisiontree.foresttrain(
inputs=all_inputs,
nominals=class_vars,
target='bad',
ntree=50,
nbins=20,
leafsize=5,
maxlevel=21,
crit='gainratio',
varimp=True,
missing='useinsearch',
vote='prob',
casout=forest_model
)
# Score
hmeq_part.decisiontree.forestscore(
modeltable=forest_model,
casout=scored_rf,
copyvars=['bad', '_partind_'],
vote='prob'
)
# Create p_bad0 and p_bad1 as _rf_predp_ is the probability of event in _rf_predname_
scored_rf['p_bad1'] = scored_rf.eval("ifn( strip(_rf_predname_) = '1', _rf_predp_, 1-_rf_predp_ )")
scored_rf['p_bad0'] = scored_rf.eval("ifn( strip(_rf_predname_) = '0', 1-_rf_predp_, _rf_predp_ )")
In this code block we do the following:
- Train a gradient boosting model on the training partition and save the model to the CAS table gb_model. The model table is used in the subsequent step, but it could just as easily have been used a day, week, or month from now.
- Score the partitioned data with the gb_model table that was created in the previous step.
- Create the p_bad0 and p_bad1 probability columns from the scored output.
In [ ]:
gb_model = sess.CASTable('gb_model', replace=True)
scored_gb = sess.CASTable('_scored_gb', replace=True)
hmeq_part_1.decisiontree.gbtreetrain(
inputs=all_inputs,
nominals=class_vars,
target=target,
ntree=10,
nbins=20,
maxlevel=6,
varimp=True,
missing='useinsearch',
casout=gb_model
)
# Score
hmeq_part.decisiontree.gbtreescore(
modeltable=gb_model,
casout=scored_gb,
copyvars=[target, '_partind_']
)
# Create p_bad0 and p_bad1 as _gbt_predp_ is the probability of event in _gbt_predname_
scored_gb['p_bad1'] = scored_gb.eval("ifn( strip(_gbt_predname_) = '1', _gbt_predp_, 1-_gbt_predp_ )")
scored_gb['p_bad0'] = scored_gb.eval("ifn( strip(_gbt_predname_) = '0', 1-_gbt_predp_, _gbt_predp_ )")
In this code block we do the following:
- Train a neural network on the training partition, using the validation partition (_partind_ = 0) as the validation table, and save the model to the CAS table nnet_model. The model table is used in the subsequent step, but it could just as easily have been used a day, week, or month from now.
- Score the partitioned data with the nnet_model table that was created in the previous step.
- Create the p_bad0 and p_bad1 probability columns from the scored output.
In [ ]:
hmeq_part_0 = hmeq_part.query('_partind_ = 0')
nnet_model = sess.CASTable('nnet_model', replace=True)
scored_nn = sess.CASTable('_scored_nn', replace=True)
hmeq_part_1.neuralnet.anntrain(
validtable=hmeq_part_0,
inputs=all_inputs,
nominals=class_vars,
target="bad",
hiddens=[9],
acts=['tanh'],
combs=['linear'],
targetact='softmax',
errorfunc='entropy',
std='midrange',
randdist='uniform',
scaleinit=1,
nloopts={
'optmlopt': {'maxiters': 250, 'fconv': 1e-10},
'lbfgsopt': {'numcorrections': 6},
'printopt': {'printlevel': 'printdetail'},
'validate': {'frequency': 1}
},
casout=nnet_model
)
# Score
hmeq_part.neuralnet.annscore(
modeltable=nnet_model,
casout=scored_nn,
copyvars=['bad', '_partind_']
)
# Create p_bad0 and p_bad1 as _nn_predp_ is the probability of event in _nn_predname_
scored_nn['p_bad1'] = scored_nn.eval("ifn( strip(_nn_predname_) = '1', _nn_predp_, 1-_nn_predp_ )")
scored_nn['p_bad0'] = scored_nn.eval("ifn( strip(_nn_predname_) = '0', 1-_nn_predp_, _nn_predp_ )")
In [ ]:
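# Assess each model's predicted probabilities on the validation partition (_partind_ = 0) with the percentile.assess action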
def assess_model(t):
    return sess.percentile.assess(
        table=t.query('_partind_ = 0'),
        inputs=['p_bad1'],
        response='bad',
        event='1',
        pvar=['p_bad0'],
        pevent=['0']
    )
tree_assess = assess_model(scored_tree)
tree_fitstat = tree_assess.FitStat
tree_rocinfo = tree_assess.ROCInfo
tree_liftinfo = tree_assess.LIFTInfo
rf_assess = assess_model(scored_rf)
rf_fitstat = rf_assess.FitStat
rf_rocinfo = rf_assess.ROCInfo
rf_liftinfo = rf_assess.LIFTInfo
gb_assess = assess_model(scored_gb)
gb_fitstat = gb_assess.FitStat
gb_rocinfo = gb_assess.ROCInfo
gb_liftinfo = gb_assess.LIFTInfo
nn_assess = assess_model(scored_nn)
nn_fitstat = nn_assess.FitStat
nn_rocinfo = nn_assess.ROCInfo
nn_liftinfo = nn_assess.LIFTInfo
In [ ]:
# Add new variable to indicate type of model
tree_liftinfo['model'] = 'DecisionTree'
tree_rocinfo['model'] = 'DecisionTree'
rf_liftinfo['model'] = 'Forest'
rf_rocinfo['model'] = 'Forest'
gb_liftinfo['model'] = 'GradientBoosting'
gb_rocinfo['model'] = 'GradientBoosting'
nn_liftinfo['model'] = 'NeuralNetwork'
nn_rocinfo['model'] = 'NeuralNetwork'
# Concatenate data
all_liftinfo = pd.concat([rf_liftinfo, gb_liftinfo, nn_liftinfo, tree_liftinfo], ignore_index=True)
all_rocinfo = pd.concat([rf_rocinfo, gb_rocinfo, nn_rocinfo, tree_rocinfo], ignore_index=True)
In [ ]:
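# Compare the models by the area under the ROC curve (the C statistic), sorted best first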
all_rocinfo[['model', 'C']].drop_duplicates(keep='first').sort_values(by='C', ascending=False)
In [ ]:
# Draw ROC charts
plt.figure(figsize=(15, 5))
for key, grp in all_rocinfo.groupby('model'):
    plt.plot(grp['FPR'], grp['Sensitivity'], label=key)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend(loc='best')
plt.title('ROC Curve')
plt.show()
# Draw lift charts
plt.figure(figsize=(15, 5))
for key, grp in all_liftinfo.groupby('model'):
    plt.plot(grp['Depth'], grp['CumLift'], label=key)
plt.xlabel('Depth')
plt.ylabel('Cumulative Lift')
plt.grid(True)
plt.legend(loc='best')
plt.title('Cumulative Lift Chart')
plt.show();
In [ ]:
# This is the same as sess.endsession(); sess.close();
sess.terminate()