Overview

This is a simple end to end example of how you can use SAS Viya for analysis The example follows these steps:

  1. Importing the needed R packages
  2. Starting a CAS session on an already running CAS server
  3. Load the needed CAS Action Sets
  4. Loading data from the local file system to the CAS server
  5. Explore the data
  6. Impute missing values
  7. Partition the data into training and validation partitions
  8. Build a decision tree
  9. Build a neural network
  10. Build a decision forest
  11. Build a gradient boost
  12. Assess the models
  13. Build ROC charts

Set Up the R Notebook for Analysis

Documentation Links:

Load the needed packages to complete analysis


In [ ]:
# Load necessary packages
library('swat')
library('ggplot2')
library('reshape2')
options(cas.print.messages = FALSE)

Connect to CAS

This is the simpliest way to connect to CAS for more details see the [documentation]


In [ ]:
conn <- CAS('localhost', 5570)

Load Actionsets


In [ ]:
actionsets <- c('sampling', 'fedsql', 'decisionTree', 'neuralNet', 'percentile')
for(i in actionsets){
    loadActionSet(conn, i)
}

Load data from CSV

For this trial you cannot add files to the system


In [ ]:
castbl <- cas.read.csv(conn, 'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv')

Explore the data


In [ ]:
head(castbl)

In [ ]:
summary(castbl)

Visualize Numeric Variables


In [ ]:
# Bring data locally
df <- to.casDataFrame(castbl, obs = nrow(castbl))
# Use reshape2's melt to help with data formatting
d <- melt(df[sapply(df, is.numeric)], id.vars=NULL)
ggplot(d, aes(x = value)) +
    facet_wrap(~variable,scales = 'free_x') +
    geom_histogram(fill = 'blue', bins = 25)

Count Missing Values


In [ ]:
# Get the number of missing values for all variables
tbl <- cas.simple.distinct(castbl)$Distinct[,c('Column', 'NMiss')]
tbl

In [ ]:
# Easy way to get missing values for numeric variables
cas.nmiss(castbl)

In [ ]:
# Visualize the missing data
tbl$PctMiss <- tbl$NMiss/nrow(castbl)
ggplot(tbl, aes(Column, PctMiss)) +
    geom_col(fill = 'blue') +
    ggtitle('Pct Missing Values') +
    theme(plot.title = element_text(hjust = 0.5))

Impute Missing Values


In [ ]:
# Impute missing values
cas.dataPreprocess.impute(castbl,
    methodContinuous = 'MEDIAN',
    methodNominal = 'MODE',
    inputs = colnames(castbl)[-1],
    copyAllVars = TRUE,
    casOut = list(name = 'hmeq', 
                replace = TRUE)
)

Partition the data into Training and Validation


In [ ]:
# Partition the data
cas.sampling.srs(conn,
    table = 'hmeq',
    samppct = 30,
    partind = TRUE,
    output = list(casOut = list(name = 'hmeq', replace = T), copyVars = 'ALL')
)

Variable Mapping

Define varibles with generic labels and reuse between model calls


In [ ]:
#Note: I do not want to hard code any of my variable names.
indata <- 'hmeq'

# Get variable info and types
colinfo <- head(cas.table.columnInfo(conn, table = indata)$ColumnInfo, -1)

# My target variable is the first column
target <- colinfo$Column[1]

# For models that can inherently handle missing values (ex: Decision Tree)
inputs <- colinfo$Column[-1]
nominals <- c(target, subset(colinfo, Type == 'varchar')$Column)

# For models that cannot handle missing values (ex: Neural Network)
imp.inputs <- grep('IMP_', inputs, value = T)
imp.nominals <- c(target, grep('IMP_', nominals, value = T))

Build Models

Decision Tree


In [ ]:
cas.decisionTree.dtreeTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    varImp = TRUE,
    casOut = list(name = 'dt_model', replace = TRUE)
)

Forest


In [ ]:
# Train the forest model
cas.decisionTree.forestTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    casOut = list(name = 'rf_model', replace = TRUE)
)

Gradient Boosting


In [ ]:
cas.decisionTree.gbtreeTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    casOut = list(name = 'gbt_model', replace = TRUE)
)

Neural Network


In [ ]:
cas.neuralNet.annTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = imp.inputs,
    hidden = 7,
    nominals = imp.nominals,
    casOut = list(name = 'nn_model', replace = TRUE)
)

Score the Models


In [ ]:
models <- c('dt','rf','gbt','nn')
scores <- c(cas.decisionTree.dtreeScore, cas.decisionTree.forestScore, 
            cas.decisionTree.gbtreeScore, cas.neuralNet.annScore)
names(scores) <- models

# Function to help automate prediction process on new data
score.params <- function(model){return(list(
    object       = defCasTable(conn, indata),
    modelTable   = list(name = paste0(model, '_model')),
    copyVars     = list(target, '_PartInd_'),
    assessonerow = TRUE,
    casOut       = list(name = paste0(model, '_scored'), replace = T)
))}
lapply(models, function(x) {do.call(scores[[x]], score.params(x))})

Assess Models

Confusion Matrix


In [ ]:
# Load the percentile actionset for scoring
loadActionSet(conn, 'percentile')

# Useful function for model assessment
assess.model <- function(model){
    cas.percentile.assess(conn,
        table    = list(name = paste0(model,'_scored'), 
                        where = '_PartInd_ = 1'),
        inputs   = paste0('_', model, '_P_           1'),
        response = target,
        event    = '1')
}

model.names <- c('Decision Tree', 'Random Forest', 
                 'Gradient Boosting', 'Neural Network')
roc.df <- data.frame()
for (i in 1:length(models)){
    tmp <- (assess.model(models[i]))$ROCInfo
    tmp$Model <- model.names[i] 
    roc.df <- rbind(roc.df, tmp)
}

# Manipulate the dataframe
compare <- subset(roc.df, round(roc.df$CutOff, 2) == 0.5)
rownames(compare) <- NULL
compare[,c('Model','TP','FP','FN','TN')]

Misclassification


In [ ]:
# Build a dataframe to compare the misclassification rates
compare$Misclassification <- 1 - compare$ACC
miss <- compare[order(compare$Misclassification), c('Model','Misclassification')]
rownames(miss) <- NULL
miss

ROC


In [ ]:
# Add a new column to be used as the ROC curve label
roc.df$Models <- paste(roc.df$Model, round(roc.df$C, 3), sep = ' - ')

# Create the ROC curve
ggplot(data = roc.df[c('FPR', 'Sensitivity', 'Models')],
    aes(x = as.numeric(FPR), y = as.numeric(Sensitivity), colour = Models)) +
    geom_line() +
    labs(x = 'False Positive Rate', y = 'True Positive Rate')

End the session


In [ ]:
# End the session
cas.session.endSession(conn)

In [ ]: