
This is a simple end-to-end example of how you can use SAS Viya for analysis. The example follows these steps:

  1. Importing the needed R packages
  2. Starting a CAS session on an already running CAS server
  3. Load the needed CAS Action Sets
  4. Loading data from the local file system to the CAS server
  5. Explore the data
  6. Impute missing values
  7. Partition the data into training and validation partitions
  8. Build a decision tree
  9. Build a neural network
  10. Build a decision forest
  11. Build a gradient boost
  12. Assess the models
  13. Build ROC charts

Set Up the R Notebook for Analysis

Documentation Links:

Load the needed packages to complete analysis

In [ ]:
# Load necessary packages
library(swat)      # CAS(), loadActionSet(), defCasTable() and the cas.* action wrappers
library(ggplot2)   # ggplot(), facet_wrap(), geom_histogram(), geom_col(), geom_line()
library(reshape2)  # melt(), used to reshape the numeric columns for plotting

# Turn off the cas.print.messages option
# NOTE(review): presumably this suppresses CAS server messages in cell output - confirm
options(cas.print.messages = FALSE)

Connect to CAS

This is the simplest way to connect to CAS; for more details, see the [documentation].

In [ ]:
# Connect to the CAS server on localhost, port 5570.
# `conn` is the connection object passed to the CAS calls in the later cells.
conn <- CAS('localhost', 5570)

Load Actionsets

In [ ]:
# Action sets to load up front
actionsets <- c('sampling', 'fedsql', 'decisionTree', 'neuralNet', 'percentile')
# Load each action set into the CAS session, one call per name
for (i in actionsets) {
    loadActionSet(conn, i)
}

Load data from CSV

For this trial you cannot add files to the system, so the data is read directly from a URL instead.

In [ ]:
# Read the HMEQ example CSV from the SAS support site through the CAS connection.
# `castbl` is the handle used for the data in the exploration and imputation cells.
castbl <- cas.read.csv(conn, 'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv')

Explore the data

In [ ]:

In [ ]:

Visualize Numeric Variables

In [ ]:
# Bring data locally
# obs = nrow(castbl) asks for as many observations as the CAS table has rows
df <- to.casDataFrame(castbl, obs = nrow(castbl))
# Use reshape2's melt to help with data formatting
# Only the numeric columns are kept; with id.vars = NULL every one of them is
# stacked into a long variable/value layout
d <- melt(df[sapply(df, is.numeric)], id.vars=NULL)
# One 25-bin histogram per variable, each panel with its own x-axis scale
ggplot(d, aes(x = value)) +
    facet_wrap(~variable,scales = 'free_x') +
    geom_histogram(fill = 'blue', bins = 25)

Count Missing Values

In [ ]:
# Get the number of missing values for all variables
# Only the Column and NMiss columns of the Distinct result table are kept
tbl <- cas.simple.distinct(castbl)$Distinct[,c('Column', 'NMiss')]

In [ ]:
# Easy way to get missing values for numeric variables

In [ ]:
# Visualize the missing data
# PctMiss is the missing count divided by the number of rows in castbl,
# i.e. a fraction between 0 and 1 rather than a percentage
tbl$PctMiss <- tbl$NMiss/nrow(castbl)
# Bar chart with one bar per column; hjust = 0.5 centres the title
ggplot(tbl, aes(Column, PctMiss)) +
    geom_col(fill = 'blue') +
    ggtitle('Pct Missing Values') +
    theme(plot.title = element_text(hjust = 0.5))

Impute Missing Values

In [ ]:
# Impute missing values
# Continuous inputs are filled with the median, nominal inputs with the mode.
# Every column except the first one is imputed, and all original columns are
# copied into the output table 'hmeq', which is replaced if it already exists.
# NOTE(review): the call head and closing parenthesis were missing from this
# cell; reconstructed as the dataPreprocess impute action on castbl - confirm
# against the original notebook.
cas.dataPreprocess.impute(castbl,
    methodContinuous = 'MEDIAN',
    methodNominal = 'MODE',
    inputs = colnames(castbl)[-1],
    copyAllVars = TRUE,
    casOut = list(name = 'hmeq',
                replace = TRUE)
)

Partition the data into Training and Validation

In [ ]:
# Partition the data
# Samples 30 percent of 'hmeq' and, with partind = TRUE, writes a partition
# indicator column; the result overwrites 'hmeq' and carries all variables.
# The model cells train on _PartInd_ = 0 and assess on _PartInd_ = 1.
# NOTE(review): the call head and closing parenthesis were missing from this
# cell; reconstructed as simple random sampling (sampling.srs). If stratified
# sampling was intended, swap the action and add a groupBy - confirm.
cas.sampling.srs(conn,
    table = 'hmeq',
    samppct = 30,
    partind = TRUE,
    output = list(casOut = list(name = 'hmeq', replace = TRUE), copyVars = 'ALL')
)

Variable Mapping

Define variables with generic labels and reuse them between model calls.

In [ ]:
# Note: I do not want to hard code any of my variable names.
indata <- 'hmeq'

# Get variable info and types
# head(..., -1) drops the last row of the column listing.
# NOTE(review): presumably that last column is the _PartInd_ partition flag,
# which must not be used as a model input - confirm.
colinfo <- head(cas.table.columnInfo(conn, table = indata)$ColumnInfo, -1)

# My target variable is the first column
target <- colinfo$Column[1]

# For models that can inherently handle missing values (ex: Decision Tree)
# inputs: every column except the target
# nominals: the target plus every column whose Type is 'varchar'
inputs <- colinfo$Column[-1]
nominals <- c(target, subset(colinfo, Type == 'varchar')$Column)

# For models that cannot handle missing values (ex: Neural Network)
# Keep only the columns whose name contains 'IMP_'
imp.inputs <- grep('IMP_', inputs, value = TRUE)
imp.nominals <- c(target, grep('IMP_', nominals, value = TRUE))

Build Models

Decision Tree

In [ ]:
    # Train the decision tree model on the rows where _PartInd_ = 0 and store
    # it in the CAS table 'dt_model', which the scoring cell reads back as
    # paste0('dt', '_model').
    # NOTE(review): the call head and closing parenthesis were missing from
    # this cell; reconstructed as dtreeTrain to pair with
    # cas.decisionTree.dtreeScore in the scoring cell - confirm.
    cas.decisionTree.dtreeTrain(conn,
        table = list(name = indata, where = '_PartInd_ = 0'),
        target = target,
        inputs = inputs,
        nominals = nominals,
        varImp = TRUE,
        casOut = list(name = 'dt_model', replace = TRUE)
    )


In [ ]:
# Train the forest model
# Fits on the rows where _PartInd_ = 0 and stores the model in the CAS table
# 'rf_model', which the scoring cell reads back as paste0('rf', '_model').
# NOTE(review): the call head and closing parenthesis were missing from this
# cell; reconstructed as forestTrain to pair with cas.decisionTree.forestScore
# in the scoring cell - confirm.
cas.decisionTree.forestTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    casOut = list(name = 'rf_model', replace = TRUE)
)

Gradient Boosting

In [ ]:
    # Train the gradient boosting model on the rows where _PartInd_ = 0 and
    # store it in the CAS table 'gbt_model', which the scoring cell reads back
    # as paste0('gbt', '_model').
    # NOTE(review): the call head and closing parenthesis were missing from
    # this cell; reconstructed as gbtreeTrain to pair with
    # cas.decisionTree.gbtreeScore in the scoring cell - confirm.
    cas.decisionTree.gbtreeTrain(conn,
        table = list(name = indata, where = '_PartInd_ = 0'),
        target = target,
        inputs = inputs,
        nominals = nominals,
        casOut = list(name = 'gbt_model', replace = TRUE)
    )

Neural Network

In [ ]:
    # Train the neural network on the rows where _PartInd_ = 0, using only the
    # imputed inputs (imp.inputs / imp.nominals), and store it in the CAS
    # table 'nn_model', which the scoring cell reads back as
    # paste0('nn', '_model').
    # NOTE(review): the call head and closing parenthesis were missing from
    # this cell; reconstructed as annTrain to pair with cas.neuralNet.annScore
    # in the scoring cell - confirm.
    cas.neuralNet.annTrain(conn,
        table = list(name = indata, where = '_PartInd_ = 0'),
        target = target,
        inputs = imp.inputs,
        hidden = 7,
        nominals = imp.nominals,
        casOut = list(name = 'nn_model', replace = TRUE)
    )

Score the Models

In [ ]:
# Short keys for the four models; they are used to build the
# '<key>_model' and '<key>_scored' table names.
models <- c('dt','rf','gbt','nn')

# Score action for each model, looked up by its short key
scores <- c(cas.decisionTree.dtreeScore, cas.decisionTree.forestScore,
            cas.decisionTree.gbtreeScore, cas.neuralNet.annScore)
names(scores) <- models

# Function to help automate prediction process on new data
# Builds the argument list for one score call.
#   model: short model key ('dt', 'rf', 'gbt' or 'nn')
# Returns a named list that scores the `indata` table with '<model>_model',
# copies the target and _PartInd_ columns, and writes to '<model>_scored'.
score.params <- function(model) {
    list(
        object       = defCasTable(conn, indata),
        modelTable   = list(name = paste0(model, '_model')),
        copyVars     = list(target, '_PartInd_'),
        assessonerow = TRUE,
        casOut       = list(name = paste0(model, '_scored'), replace = TRUE)
    )
}

# Run every score action with its matching argument list
lapply(models, function(x) {do.call(scores[[x]], score.params(x))})

Assess Models

Confusion Matrix

In [ ]:
# Load the percentile actionset for scoring
loadActionSet(conn, 'percentile')

# Useful function for model assessment
# Assesses one scored table.
#   model: short model key ('dt', 'rf', 'gbt' or 'nn')
# Reads '<model>_scored' restricted to _PartInd_ = 1 (the rows the models were
# not trained on), with the '_<model>_P_           1' column as input, `target`
# as the response and '1' as the event level.
# Returns the action result; the caller reads its $ROCInfo element.
# NOTE(review): the call head and the closing brace were missing from this
# cell; reconstructed as the percentile assess action - confirm.
assess.model <- function(model) {
    cas.percentile.assess(conn,
        table    = list(name = paste0(model,'_scored'),
                        where = '_PartInd_ = 1'),
        inputs   = paste0('_', model, '_P_           1'),
        response = target,
        event    = '1')
}

# Display names, in the same order as `models`
model.names <- c('Decision Tree', 'Random Forest',
                 'Gradient Boosting', 'Neural Network')

# Assess every model, tag its ROC rows with the display name, and stack the
# per-model tables once at the end instead of growing roc.df inside a loop.
roc.df <- do.call(rbind, lapply(seq_along(models), function(i) {
    roc <- assess.model(models[i])$ROCInfo
    roc$Model <- model.names[i]
    roc
}))

# Manipulate the dataframe
# Keep only the rows whose cutoff rounds to 0.5 and reset the row names
compare <- subset(roc.df, round(roc.df$CutOff, 2) == 0.5)
rownames(compare) <- NULL


In [ ]:
# Build a dataframe to compare the misclassification rates
# Misclassification is computed as 1 minus the ACC column
compare$Misclassification <- 1 - compare$ACC
# Sort ascending so the lowest misclassification comes first; keep two columns
miss <- compare[order(compare$Misclassification), c('Model','Misclassification')]
# Reset the row names left over from the reordering
rownames(miss) <- NULL


In [ ]:
# Add a new column to be used as the ROC curve label
# Label format: "<Model> - <C rounded to 3 decimals>"
# NOTE(review): C is presumably the area under the ROC curve - confirm
roc.df$Models <- paste(roc.df$Model, round(roc.df$C, 3), sep = ' - ')

# Create the ROC curve
# FPR and Sensitivity are converted with as.numeric() before plotting;
# one line per label, distinguished by colour
ggplot(data = roc.df[c('FPR', 'Sensitivity', 'Models')],
    aes(x = as.numeric(FPR), y = as.numeric(Sensitivity), colour = Models)) +
    geom_line() +
    labs(x = 'False Positive Rate', y = 'True Positive Rate')

End the session

In [ ]:
# End the session
# NOTE(review): this cell contained only the comment, so the CAS session was
# never closed; reconstructed as the session endSession action - confirm.
cas.session.endSession(conn)

In [ ]: