In [21]:
# Setup cell: load the shared helper functions, then call two of them.
# Prerequisite: the devtools and pacman packages must be installed manually first.
# Pull in the helper function definitions from GitHub and evaluate them here.
# NOTE(review): this runs remote code from a moving `master` branch with no
# integrity check; consider pinning a commit and passing `sha1 =` to source_url().
devtools::source_url('https://raw.githubusercontent.com/jsphyg/Machine_Learning_Notebooks/master/myRfunctions.R')
# Disabled local alternative to the remote source above:
#source("C:\\Work\\myRfunctions.R")
# Helpers not defined in this file (presumably provided by myRfunctions.R).
# Judging by their names they report the run date and install/attach the
# packages used below -- TODO confirm against that file.
fnRunDate()
fnInstallPackages()
In [22]:
# Load the built-in iris data frame into the workspace.
data("iris")
# Convert it to a tibble and keep it under the generic name `dataset`.
dataset <- as_tibble(x = iris)
In [23]:
# Print a compact overview of `dataset`: one line per column with its type and first values.
glimpse(dataset)
In [9]:
#**# define the target
target <- dplyr::select(dataset, Species)
head(target)
In [10]:
#**# define the predictors
predictors <- dplyr::select(dataset, -dplyr::one_of(c("Species")))
head(predictors)
In [11]:
# Pass the class labels to the project helper fnClassDistribution().
# NOTE(review): the helper is defined outside this file; presumably it reports
# the count/share of observations per Species level -- confirm in myRfunctions.R.
fnClassDistribution(Class = dataset$Species)
In [12]:
# Descriptive statistics for every column of `dataset`.
# `check = TRUE` makes psych::describe() check for non-numeric columns
# (here the Species factor). TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
psych::describe(dataset, check = TRUE)
In [18]:
# Column data types at a glance.
visdat::vis_dat(dataset)
# Cell-by-cell best guess of the data type, inferred from the values themselves.
visdat::vis_guess(dataset)
# Missing-value map: rows clustered by missingness, columns ordered by how much
# is missing. TRUE is spelled out because `T` is a reassignable variable.
visdat::vis_miss(dataset, cluster = TRUE, sort_miss = TRUE)
In [19]:
# Table plot of all columns, with the rows sorted by Species.
tabplot::tableplot(dataset, sortCol = Species)
In [16]:
# Pairwise scatterplot matrix of the four measurement columns, with an
# ellipse drawn per class.
# Switch lattice to its white-background colour theme first.
trellis.par.set(
  theme = col.whitebg(),
  warn = FALSE
)
caret::featurePlot(
  x = dataset[, 1:4],
  y = dataset$Species,
  plot = "ellipse"
)
In [69]:
# Density plots for each attribute, one curve per class value.
# relation = "free" lets every panel use its own x and y axis limits.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
# Namespace-qualified like the other featurePlot() calls in this file, so the
# call does not depend on caret being attached.
caret::featurePlot(
  x = dataset[, 1:4],
  y = dataset$Species,
  plot = "density",
  scales = scales
)
In [70]:
# Box-and-whisker plots of each attribute, split by class.
caret::featurePlot(
  x = dataset[, 1:4],
  y = dataset$Species,
  plot = "box"
)
In [71]:
# Strip plots of each attribute by class, with jitter enabled.
caret::featurePlot(
  x = dataset[, 1:4],
  y = dataset$Species,
  plot = "strip",
  jitter = TRUE
)
In [14]:
# Hold out a validation set: a class-stratified 80% of the rows is kept for
# training and the remaining 20% is set aside for the final check.
# Seed the RNG first so the random split is identical on every run.
set.seed(7)
# NOTE: despite its name, validation_index holds the rows KEPT for training.
validation_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
validation <- dataset[-validation_index, ]
# `dataset` is overwritten with its training subset, so re-running this cell
# without reloading the data would split the already reduced table again.
dataset <- dataset[validation_index, ]
In [74]:
# Model formula: Species is the response, all remaining columns are predictors.
formula <- Species ~ .
In [75]:
# Fit six classifiers with 10-fold cross-validation, tuned on accuracy.
control <- caret::trainControl(method = "cv", number = 10)
metric <- "Accuracy"
# The RNG is re-seeded with the SAME value before every fit so that each model
# is resampled on identical folds; comparing the fits with resamples() is only
# meaningful when the folds match.

# a) linear algorithms
# LDA
set.seed(7)
fit.lda <- caret::train(
  formula, data = dataset, method = "lda",
  metric = metric, trControl = control
)

# b) nonlinear algorithms
# CART
set.seed(7)
fit.cart <- caret::train(
  formula, data = dataset, method = "rpart",
  metric = metric, trControl = control
)
# kNN
set.seed(7)
fit.knn <- caret::train(
  formula, data = dataset, method = "knn",
  metric = metric, trControl = control
)

# c) advanced algorithms
# SVM
set.seed(7)
fit.svm <- caret::train(
  formula, data = dataset, method = "svmRadial",
  metric = metric, trControl = control
)
# Random Forest
set.seed(7)
fit.rf <- caret::train(
  formula, data = dataset, method = "rf",
  metric = metric, trControl = control
)
# XGBoost (linear booster); seeded like the others so its folds match theirs
set.seed(7)
fit.xgb <- caret::train(
  formula, data = dataset, method = "xgbLinear",
  metric = metric, trControl = control
)
In [76]:
# d) compare algorithms
# Collect the cross-validation results of all six fits, keyed by a short
# model label, then print the summary of the collected results.
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf,
    xgb = fit.xgb
  )
)
summary(results)
In [77]:
# Dot plot of the collected resampling results, for a visual model comparison.
dotplot(results)
In [78]:
# Summarize the model treated as best.
# NOTE(review): LDA is hard-coded here; confirm it is still the top performer
# in the comparison output before relying on it.
# Printing the fitted object shows its estimated (cross-validated) accuracy.
print(fit.lda)
In [82]:
# Final check on the held-out rows: predict their class with the LDA fit and
# cross-tabulate the predictions against the true Species labels.
set.seed(13)
predictions <- predict(
  fit.lda,
  newdata = validation
)
confusionMatrix(
  data = predictions,
  reference = validation$Species
)
In [ ]: