This is a simple end-to-end example of how you can use SAS Viya for analysis. The example follows these steps:

1. Connect to CAS and load the needed action sets
2. Load the HMEQ data into CAS and explore it
3. Impute missing values and partition the data into training and validation sets
4. Train a decision tree, random forest, gradient boosting, and neural network model
5. Score and assess the models, comparing misclassification rates and ROC curves
In [ ]:
# Load necessary packages
library('swat')
library('ggplot2')
library('reshape2')
options(cas.print.messages = FALSE)
This is the simplest way to connect to CAS; for more details, see the [documentation].
In [ ]:
conn <- CAS('localhost', 5570)
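If you need to supply credentials explicitly, `CAS()` also accepts user name and password arguments (or you can keep them in an authinfo file). A minimal sketch with placeholder host and credentials:
In [ ]:
# Hypothetical example: connect with explicit credentials
# (host name, user, and password are placeholders -- substitute your own)
# conn <- CAS('cas-server.example.com', 5570,
#             username = 'myuser', password = 'mypassword')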
In [ ]:
actionsets <- c('sampling', 'fedsql', 'decisionTree', 'neuralNet', 'percentile')
for(i in actionsets){
    loadActionSet(conn, i)
}
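If you want to confirm the action sets are available, the builtins action set (loaded by default) can list them. This is an optional check:
In [ ]:
# Optional check: list the action sets loaded in this session
cas.builtins.actionSetInfo(conn)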
In [ ]:
# Load the hmeq data set into CAS directly from a URL
castbl <- cas.read.csv(conn, 'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv')
In [ ]:
head(castbl)
In [ ]:
summary(castbl)
In [ ]:
# Bring data locally
df <- to.casDataFrame(castbl, obs = nrow(castbl))
# Use reshape2's melt to help with data formatting
d <- melt(df[sapply(df, is.numeric)], id.vars=NULL)
ggplot(d, aes(x = value)) +
    facet_wrap(~variable, scales = 'free_x') +
    geom_histogram(fill = 'blue', bins = 25)
In [ ]:
# Get the number of missing values for all variables
tbl <- cas.simple.distinct(castbl)$Distinct[,c('Column', 'NMiss')]
tbl
In [ ]:
# Easy way to get missing values for numeric variables
cas.nmiss(castbl)
In [ ]:
# Visualize the missing data
tbl$PctMiss <- tbl$NMiss/nrow(castbl)
ggplot(tbl, aes(Column, PctMiss)) +
    geom_col(fill = 'blue') +
    ggtitle('Pct Missing Values') +
    theme(plot.title = element_text(hjust = 0.5))
In [ ]:
# Impute missing values
cas.dataPreprocess.impute(castbl,
    methodContinuous = 'MEDIAN',
    methodNominal = 'MODE',
    inputs = colnames(castbl)[-1],
    copyAllVars = TRUE,
    casOut = list(name = 'hmeq', replace = TRUE)
)
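As an optional sanity check, you can point a CASTable reference at the imputed output table ('hmeq', per the casOut above) and recount missing values; the new IMP_ columns should report zero missing.
In [ ]:
# Optional check: the imputed copies (IMP_ columns) should have no missing values
cas.nmiss(defCasTable(conn, 'hmeq'))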
In [ ]:
# Partition the data
cas.sampling.srs(conn,
    table = 'hmeq',
    samppct = 30,
    partind = TRUE,
    output = list(casOut = list(name = 'hmeq', replace = T), copyVars = 'ALL')
)
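Optionally, verify the split: about 30% of the rows should fall in the validation partition (_PartInd_ = 1). A quick frequency count using the simple action set:
In [ ]:
# Optional check: distribution of the partition indicator
cas.simple.freq(defCasTable(conn, 'hmeq'), inputs = '_PartInd_')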
In [ ]:
# Note: I do not want to hard-code any of my variable names.
indata <- 'hmeq'
# Get variable info and types
colinfo <- head(cas.table.columnInfo(conn, table = indata)$ColumnInfo, -1)
# My target variable is the first column
target <- colinfo$Column[1]
# For models that can inherently handle missing values (ex: Decision Tree)
inputs <- colinfo$Column[-1]
nominals <- c(target, subset(colinfo, Type == 'varchar')$Column)
# For models that cannot handle missing values (ex: Neural Network)
imp.inputs <- grep('IMP_', inputs, value = T)
imp.nominals <- c(target, grep('IMP_', nominals, value = T))
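Before training, it can help to print the derived lists to confirm the target, inputs, and nominal variables were picked up as expected:
In [ ]:
# Optional: inspect the derived variable lists
list(target = target, inputs = inputs, nominals = nominals)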
In [ ]:
# Train the decision tree model
cas.decisionTree.dtreeTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    varImp = TRUE,
    casOut = list(name = 'dt_model', replace = TRUE)
)
In [ ]:
# Train the forest model
cas.decisionTree.forestTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    casOut = list(name = 'rf_model', replace = TRUE)
)
In [ ]:
# Train the gradient boosting model
cas.decisionTree.gbtreeTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = inputs,
    nominals = nominals,
    casOut = list(name = 'gbt_model', replace = TRUE)
)
In [ ]:
# Train the neural network model
cas.neuralNet.annTrain(conn,
    table = list(name = indata, where = '_PartInd_ = 0'),
    target = target,
    inputs = imp.inputs,
    hidden = 7,
    nominals = imp.nominals,
    casOut = list(name = 'nn_model', replace = TRUE)
)
In [ ]:
models <- c('dt','rf','gbt','nn')
scores <- c(cas.decisionTree.dtreeScore, cas.decisionTree.forestScore,
cas.decisionTree.gbtreeScore, cas.neuralNet.annScore)
names(scores) <- models
# Function to help automate prediction process on new data
score.params <- function(model){return(list(
object = defCasTable(conn, indata),
modelTable = list(name = paste0(model, '_model')),
copyVars = list(target, '_PartInd_'),
assessonerow = TRUE,
casOut = list(name = paste0(model, '_scored'), replace = T)
))}
lapply(models, function(x) {do.call(scores[[x]], score.params(x))})
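To see what the scored output looks like, you can preview one of the scored tables; the names follow the paste0(model, '_scored') convention used above:
In [ ]:
# Optional: preview the gradient boosting scored table
head(defCasTable(conn, 'gbt_scored'))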
In [ ]:
# The percentile action set (used for assessment) was already loaded above;
# reloading it is harmless
loadActionSet(conn, 'percentile')
# Useful function for model assessment
assess.model <- function(model){
    cas.percentile.assess(conn,
        table = list(name = paste0(model,'_scored'),
                     where = '_PartInd_ = 1'),
        inputs = paste0('_', model, '_P_ 1'),
        response = target,
        event = '1')
}
model.names <- c('Decision Tree', 'Random Forest',
                 'Gradient Boosting', 'Neural Network')
roc.df <- data.frame()
for (i in 1:length(models)){
    tmp <- (assess.model(models[i]))$ROCInfo
    tmp$Model <- model.names[i]
    roc.df <- rbind(roc.df, tmp)
}
# Compare confusion-matrix counts at a probability cutoff of 0.5
compare <- subset(roc.df, round(roc.df$CutOff, 2) == 0.5)
rownames(compare) <- NULL
compare[,c('Model','TP','FP','FN','TN')]
In [ ]:
# Build a dataframe to compare the misclassification rates
compare$Misclassification <- 1 - compare$ACC
miss <- compare[order(compare$Misclassification), c('Model','Misclassification')]
rownames(miss) <- NULL
miss
In [ ]:
# Add a new column to be used as the ROC curve label
roc.df$Models <- paste(roc.df$Model, round(roc.df$C, 3), sep = ' - ')
# Create the ROC curve
ggplot(data = roc.df[c('FPR', 'Sensitivity', 'Models')],
       aes(x = as.numeric(FPR), y = as.numeric(Sensitivity), colour = Models)) +
    geom_line() +
    labs(x = 'False Positive Rate', y = 'True Positive Rate')
In [ ]:
# End the session
cas.session.endSession(conn)