A financial services company offers a home equity line of credit to its clients. The company has extended several thousand lines of credit in the past, and many of these accepted applicants (approximately 20%) have defaulted on their loans. By using geographic, demographic, and financial variables, the company wants to build a model to predict whether an applicant will default.
After analyzing the data, the company selected a subset of 12 predictor (or input) variables to model whether each applicant defaulted. The response (or target) variable (BAD) indicates whether an applicant defaulted on the home equity line of credit. These variables, along with their model role, measurement level, and description, are shown in the following table.
Name | Model Role | Measurement Level | Description |
---|---|---|---|
BAD | Target | Binary | 1 = applicant defaulted on loan or delinquent, 0 = applicant paid loan |
CLAGE | Input | Interval | Age of oldest credit line in months |
CLNO | Input | Interval | Number of credit lines |
DEBTINC | Input | Interval | Debt-to-income ratio |
DELINQ | Input | Interval | Number of delinquent credit lines |
DEROG | Input | Interval | Number of derogatory reports |
JOB | Input | Nominal | Occupational categories |
LOAN | Input | Interval | Amount of loan request |
MORTDUE | Input | Interval | Amount due on existing mortgage |
NINQ | Input | Interval | Number of recent credit inquiries |
REASON | Input | Binary | DebtCon = debt consolidation, HomeImp = home improvement |
VALUE | Input | Interval | Value of current property |
YOJ | Input | Interval | Years at present job |
In [258]:
library(swat)
library(ggplot2)
library(reshape2)
library(xgboost)
library(caret)
In [259]:
#CAS(hostname,port,username,password)
conn <- CAS("server", 8777, "student", "Metadata0", protocol="http")
In [260]:
castbl <- cas.read.csv(conn, "D:/Workshop/Winsas/AX17HRV/hmeq.csv")
#Create variable for Dataset name
indata <- 'hmeq'
In [261]:
#Use the dim and names functions to view the table size and column names
dim(castbl)
names(castbl)
In [262]:
#Use head and tail function to view the first and last 6 observations
head(castbl)
tail(castbl)
In [263]:
#Use summary function to get variable summary
summary(castbl)
In [264]:
# Bring data locally
df <- to.casDataFrame(castbl, obs = nrow(castbl))
# Use reshape2's melt to help with data formatting
d <- melt(df[sapply(df, is.numeric)], id.vars=NULL)
#Plot data with ggplot
ggplot(d, aes(x = value)) + facet_wrap(~variable,scales = 'free_x') + geom_histogram(fill = 'blue', bins = 25)
In [265]:
tbl <- cas.simple.distinct(castbl)$Distinct[,c('Column', 'NMiss')]
tbl
In [266]:
tbl$PctMiss <- tbl$NMiss/nrow(castbl)
ggplot(tbl, aes(Column, PctMiss)) + geom_col(fill = 'blue') +
ggtitle('Pct Missing Values') + theme(plot.title = element_text(hjust = 0.5))
In [267]:
# Impute missing values with the median for continuous variables and most frequent for nominal variables
cas.dataPreprocess.impute(castbl,
methodContinuous = 'MEDIAN',
methodNominal = 'MODE',
inputs = colnames(castbl)[-1],
copyAllVars = TRUE,
casOut = list(name = indata, replace = TRUE)
)
In [268]:
# Load the sampling actionset
loadActionSet(conn, 'sampling')
In [269]:
# Partition the data
cas.sampling.srs(conn,
table = indata,
samppct = 70,
SEED=51188,
partind = TRUE,
output = list(casOut = list(name = indata, replace = T), copyVars = 'ALL')
)
In [270]:
# Load the fedsql actionset
loadActionSet(conn, 'fedsql')
In [271]:
# Make sure the partition worked correctly using SQL
cas.fedsql.execDirect(conn, query = paste0("
SELECT
CASE WHEN _PartInd_ = 1 THEN 'Training' ELSE 'Validation' END AS name,
_PartInd_,
COUNT(*) AS obs
FROM ", indata, "
GROUP BY
CASE WHEN _PartInd_ = 1 THEN 'Training' ELSE 'Validation' END,
_PartInd_;
"))
In [272]:
# Get variable info and types
colinfo <- head(cas.table.columnInfo(conn, table = indata)$ColumnInfo, -1)
colinfo
# Target variable is the first column
target <- colinfo$Column[1]
# Get all variables
inputs <- colinfo$Column[-1]
nominals <- c(target, subset(colinfo, Type == 'varchar')$Column)
# Get only imputed variables
inputs <- grep('IMP_', inputs, value = T)
nominals <- c(target, grep('IMP_', nominals, value = T))
inputs
nominals
In [273]:
# Load the decsion tree actionset
loadActionSet(conn, 'decisionTree')
In [274]:
# Train the decision tree model
cas.decisionTree.dtreeTrain(conn,
table = list(name = indata, where = '_PartInd_ = 1'),
target = target,
inputs = inputs,
nominals = nominals,
varImp = TRUE,
casOut = list(name = 'dt_model', replace = TRUE)
)
In [275]:
# Train the random forest model
cas.decisionTree.forestTrain(conn,
table = list(name = indata, where = '_PartInd_ = 1'),
target = target,
inputs = inputs,
nominals = nominals,
casOut = list(name = 'rf_model', replace = TRUE)
)
In [276]:
# Train the gradient boosting model
cas.decisionTree.gbtreeTrain(conn,
table = list(name = indata, where = '_PartInd_ = 1'),
target = target,
inputs = inputs,
nominals = nominals,
casOut = list(name = 'gbt_model', replace = TRUE)
)
In [277]:
# Load the neuralNet actionset
loadActionSet(conn, 'neuralNet')
In [278]:
# Build a neural network model
cas.neuralNet.annTrain(conn,
table = list(name = indata, where = '_PartInd_ = 1'),
target = target,
inputs = inputs,
nominals = nominals,
casOut = list(name = 'nn_model', replace = TRUE)
)
In [279]:
#Create table object for the imputed and partitioned data
pardata = defCasTable(conn, indata)
#Score the decision tree model
dt_score_obj = cas.decisionTree.dtreeScore(
pardata[pardata$`_PartInd_` == 0,],
model = "dt_model",
casout = list(name="dt_scored",replace=TRUE),
copyVars = list(target),
encodename = TRUE,
assessonerow = TRUE
)
#Score the random forest model
rf_score_obj = cas.decisionTree.forestScore(
pardata[pardata$`_PartInd_` == 0,],
model = "rf_model",
casout = list(name="rf_scored",replace=TRUE),
copyVars = list(target),
encodename = TRUE,
assessonerow = TRUE
)
#Score the gradient boosting model
gbt_score_obj = cas.decisionTree.gbtreeScore(
pardata[pardata$`_PartInd_` == 0,],
model = "gbt_model",
casout = list(name="gbt_scored",replace=TRUE),
copyVars = list(target),
encodename = TRUE,
assessonerow = TRUE
)
#Score the neural network model
nn_score_obj = cas.neuralNet.annScore(
pardata[pardata$`_PartInd_` == 0,],
model = "nn_model",
casout = list(name="nn_scored",replace=TRUE),
copyVars = list(target),
encodename = TRUE,
assessonerow = TRUE
)
#View selected fields from the output object
dt_score_obj$OutputCasTables
dt_score_obj$ScoreInfo
rf_score_obj$OutputCasTables
rf_score_obj$ScoreInfo
rf_score_obj$OutputCasTables
gbt_score_obj$ScoreInfo
rf_score_obj$OutputCasTables
nn_score_obj$ScoreInfo
In [280]:
# Load the percentile actionset for scoring
loadActionSet(conn, 'percentile')
In [281]:
# Create table objects from the scoring output and assess each model
dt_scored = defCasTable(conn, tablename = "dt_scored")
cas.percentile.assess(
dt_scored,
inputs = paste("P_", target, "1", sep = ""),
casout = list(name="dt_assess",replace=TRUE),
response = target,
event = "1"
)
rf_scored = defCasTable(conn, tablename = "rf_scored")
cas.percentile.assess(
rf_scored,
inputs = paste("P_", target, "1", sep = ""),
casout = list(name="rf_assess",replace=TRUE),
response = target,
event = "1"
)
gbt_scored = defCasTable(conn, tablename = "gbt_scored")
cas.percentile.assess(
gbt_scored,
inputs = paste("P_", target, "1", sep = ""),
casout = list(name="gbt_assess",replace=TRUE),
response = target,
event = "1"
)
nn_scored = defCasTable(conn, tablename = "nn_scored")
cas.percentile.assess(
nn_scored,
inputs = paste("P_", target, "1", sep = ""),
casout = list(name="nn_assess",replace=TRUE),
response = target,
event = "1"
)
In [282]:
#Create table objects from the assess output,
#bring data to the client,
#and add new variable to data frame indicating model name
dt_assess_ROC = defCasTable(conn, tablename = "dt_assess_ROC")
dt_assess_ROC <- to.casDataFrame(dt_assess_ROC, obs = nrow(dt_assess_ROC))
dt_assess_ROC$Model = 'Decision Tree'
rf_assess_ROC = defCasTable(conn, tablename = "rf_assess_ROC")
rf_assess_ROC <- to.casDataFrame(rf_assess_ROC, obs = nrow(rf_assess_ROC))
rf_assess_ROC$Model = 'Random Forest'
gbt_assess_ROC = defCasTable(conn, tablename = "gbt_assess_ROC")
gbt_assess_ROC <- to.casDataFrame(gbt_assess_ROC, obs = nrow(gbt_assess_ROC))
gbt_assess_ROC$Model = 'Gradient Boosting'
nn_assess_ROC = defCasTable(conn, tablename = "nn_assess_ROC")
nn_assess_ROC <- to.casDataFrame(nn_assess_ROC, obs = nrow(nn_assess_ROC))
nn_assess_ROC$Model = 'Neural Network'
#Combine data frames and view confusion matrix at a %50 cutoff
df_assess = rbind(dt_assess_ROC, rf_assess_ROC, gbt_assess_ROC, nn_assess_ROC)
compare <- subset(roc.df, CutOff == 0.5)
rownames(compare) <- NULL
compare[,c('Model','TP','FP','FN','TN')]
In [283]:
# Build a data frame to compare the misclassification rates
compare$Misclassification <- 1 - compare$ACC
miss <- compare[order(compare$Misclassification), c('Model','Misclassification')]
rownames(miss) <- NULL
miss
In [284]:
# Add a new column to be used as the ROC curve label
df_assess$Models <- paste(df_assess$Model, round(df_assess$'_C_', 3), sep = ' - ')
#Subset the data frame with only three variables
df_roc = df_assess[c('_FPR_', '_Sensitivity_', 'Models')]
colnames(df_roc) = c("FPR", "Sensitivity", "Models")
# Create the ROC curve
ggplot(data = newdf,
aes(x = FPR, y = Sensitivity, colour = Models)) + geom_line() +
labs(x = 'False Positive Rate', y = 'True Positive Rate')
In [285]:
# # Score the models
# models <- c('dt','rf','gbt','nn')
# scores <- c(cas.decisionTree.dtreeScore, cas.decisionTree.forestScore, cas.decisionTree.gbtreeScore, cas.neuralNet.annScore)
# names(scores) <- models
# # Function to help automate prediction process on new data
# score.params <- function(model){return(list(
# object = defCasTable(conn, indata),
# modelTable = list(name = paste0(model, '_model')),
# copyVars = list(target, '_PartInd_'),
# assessonerow = TRUE,
# casOut = list(name = paste0(model, '_scored'), replace = T)
# ))}
# lapply(models, function(x) {do.call(scores[[x]], score.params(x))})
# # Useful function for model assessment
# assess.model <- function(model){
# cas.percentile.assess(conn,
# table = list(name = paste0(model,'_scored'), where = '_PartInd_ = 0'),
# inputs = paste0('_', model, '_P_ 1'),
# response = target,
# event = '1')
# }
# model.names <- c('Decision Tree', 'Random Forest', 'Gradient Boosting', 'Neural Network')
# roc.df <- data.frame()
# for (i in 1:length(models)){
# tmp <- (assess.model(models[i]))$ROCInfo
# tmp$Model <- model.names[i]
# roc.df <- rbind(roc.df, tmp)
# }
# # Manipulate the dataframe
# compare <- subset(roc.df, CutOff == 0.5)
# rownames(compare) <- NULL
# compare[,c('Model','TP','FP','FN','TN')]
# # Add a new column to be used as the ROC curve label
# roc.df$Models <- paste(roc.df$Model, round(roc.df$C, 3), sep = ' - ')
# # Create the ROC curve
# ggplot(data = roc.df[c('FPR', 'Sensitivity', 'Models')],
# aes(x = as.numeric(FPR), y = as.numeric(Sensitivity), colour = Models)) + geom_line() +
# labs(x = 'False Positive Rate', y = 'True Positive Rate')
In [286]:
# Bring data locally and make sure it's in the right format
df <- to.casDataFrame(defCasTable(conn, indata), obs = nrow(castbl))
df <- df[,c(target, inputs, '_PartInd_')]
# Create dummy variables through one-hot encoding
df.dum <- df[,nominals[-1]]
dummies <- dummyVars('~ .', data = df.dum)
df.ohe <- as.data.frame(predict(dummies, newdata = df))
df.all.combined <- cbind(df[,-c(which(colnames(df) %in% nominals[-1]))], df.ohe)
# Split into training and validation
train <- df.all.combined[df.all.combined['_PartInd_'] == 1,]
valid <- df.all.combined[df.all.combined['_PartInd_'] == 0,]
# Train the XGBoost model
set.seed(101112)
bst <- xgboost(
data = data.matrix(train[,-1]),
label = data.matrix(train[,1]),
missing = 'NAN',
nround = 25,
objective = "binary:logistic",
eta = 0.1,
max_depth = 5,
subsample = 0.5,
colsample_bytree = 0.5
)
In [287]:
# Create a dataframe with the misclassification rate for XGBoost
pred <- as.numeric(predict(bst, data.matrix(valid[,-1]), missing = 'NAN') > 0.5)
Misclassification <- mean(as.numeric(pred > 0.5) != valid[,1])
xgb <- data.frame(cbind(Model = 'R - XGBoost', Misclassification))
xgb
In [288]:
# Combine the assessments and order by most accurate on validation data
err <- data.frame(rbind(miss, xgb))
err[,-1] <- round(as.numeric(as.character(err[,-1])),7)
err <- err[order(err[,-1]),]
rownames(err) <- NULL
err
In [289]:
# Save the champion model for later use
cas.table.save(conn, table = list(name = 'gbt_model'), name = 'Best_Model_gbt', replace = T)
In [290]:
cas.session.endSession(conn)