Iris Flowers Multi-class Classification

Goal: Predict species of iris plant
Dataset: a database of diseased soybean plants. 150 observations
Predictors: 4 features
Target: Species. Iris plants are classified into 3 categories

Results

~ 100% accuracy



In [21]:

    
# first install packages devtools and pacman manually

#pull in source functions from github
devtools::source_url('https://raw.githubusercontent.com/jsphyg/Machine_Learning_Notebooks/master/myRfunctions.R')
#source("C:\\Work\\myRfunctions.R")
fnRunDate()
fnInstallPackages()









    



SHA-1 hash of file is 6de800cc2b55861becf89ae2e1c302676677008e






    




'Project last run on Wed Sep 20 8:22:03 AM 2017'






    




'Package install completed'



In [22]:

    
#**# attach the iris dataset to the environment
data(iris)
# rename the dataset
dataset <- as_tibble(iris)



In [23]:

    
glimpse(dataset)









    



Observations: 150
Variables: 5
$ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4,...
$ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
$ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
$ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
$ Species      <fctr> setosa, setosa, setosa, setosa, setosa, setosa, setos...



In [9]:

    
#**# define the target
target <- dplyr::select(dataset, Species)
head(target)









    





Species

	setosa
	setosa
	setosa
	setosa
	setosa
	setosa



In [10]:

    
#**# define the predictors
predictors <- dplyr::select(dataset, -dplyr::one_of(c("Species")))
head(predictors)









    





Sepal.Length Sepal.Width Petal.Length Petal.Width

	5.1 3.5 1.4 0.2
	4.9 3.0 1.4 0.2
	4.7 3.2 1.3 0.2
	4.6 3.1 1.5 0.2
	5.0 3.6 1.4 0.2
	5.4 3.9 1.7 0.4



In [11]:

    
fnClassDistribution(Class = dataset$Species)









    





freq percentage

	setosa 50      33.33333
	versicolor 50      33.33333
	virginica 50      33.33333



In [12]:

    
psych::describe(dataset, check = T)









    





vars n mean sd median trimmed mad min max range skew kurtosis se

	Sepal.Length 1         150       5.843333  0.8280661 5.80      5.808333  1.03782   4.3       7.9       3.6        0.3086407 -0.6058125 0.06761132
	Sepal.Width 2         150       3.057333  0.4358663 3.00      3.043333  0.44478   2.0       4.4       2.4        0.3126147  0.1387047 0.03558833
	Petal.Length 3         150       3.758000  1.7652982 4.35      3.760000  1.85325   1.0       6.9       5.9       -0.2694109 -1.4168574 0.14413600
	Petal.Width 4         150       1.199333  0.7622377 1.30      1.184167  1.03782   0.1       2.5       2.4       -0.1009166 -1.3581792 0.06223645
	Species* 5         150       2.000000  0.8192319 2.00      2.000000  1.48260   1.0       3.0       2.0        0.0000000 -1.5199333 0.06689001



In [18]:

    
# the data types
  visdat::vis_dat(dataset)
  # a best guess at the data types looking at the data
  visdat::vis_guess(dataset)
  # what's missing? view NAs
  visdat::vis_miss(dataset, cluster = T, sort_miss = T)



In [19]:

    
tabplot::tableplot(
  dataset, sortCol = Species)



In [16]:

    
# scatterplot matrix
trellis.par.set(theme = col.whitebg(), warn = FALSE)
caret::featurePlot(x=dataset[, 1:4], y=dataset$Species, plot="ellipse")



In [69]:

    
# density plots for each attribute by class value
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=dataset[, 1:4], y=dataset$Species, plot="density", scales=scales)



In [70]:

    
# scatterplot matrix
caret::featurePlot(x=dataset[, 1:4], y=dataset$Species, plot="box")



In [71]:

    
caret::featurePlot(x=dataset[, 1:4], y=dataset$Species, plot="strip", jitter = TRUE)



In [14]:

    
validation_index <- createDataPartition(dataset$Species, p=0.80, list=FALSE)
validation <- dataset[-validation_index,]
dataset <- dataset[validation_index,]



In [74]:

    
formula <- Species ~ .



In [75]:

    
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"

# a) linear algorithms
# LDA
set.seed(7)
fit.lda <- train(formula, data=dataset, method="lda", metric=metric, trControl=control)
# b) nonlinear algorithms
# CART
set.seed(7)
fit.cart <- train(formula, data=dataset, method="rpart", metric=metric, trControl=control)
# kNN
set.seed(7)
fit.knn <- train(formula, data=dataset, method="knn", metric=metric, trControl=control)
# c) advanced algorithms
# SVM
set.seed(7)
fit.svm <- train(formula, data=dataset, method="svmRadial", metric=metric, trControl=control)
# Random Forest
set.seed(7)
fit.rf <- train(formula, data=dataset, method="rf", metric=metric, trControl=control)
set.seed(13)
fit.xgb <- caret::train(formula, data = dataset, method = "xgbLinear",  metric = metric, trControl = control)









    



Loading required package: xgboost
Warning message:
"package 'xgboost' was built under R version 3.3.3"
Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice



In [76]:

    
# d) compare algorithms
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf, xgb=fit.xgb))
summary(results)









    





Call:
summary.resamples(object = results)

Models: lda, cart, knn, svm, rf, xgb 
Number of resamples: 10 

Accuracy 
       Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
lda  0.9167  0.9375 1.0000 0.9750  1.0000    1    0
cart 0.8333  0.9167 0.9167 0.9250  0.9792    1    0
knn  0.8333  1.0000 1.0000 0.9750  1.0000    1    0
svm  0.8333  0.9167 0.9583 0.9500  1.0000    1    0
rf   0.8333  0.9167 0.9583 0.9417  1.0000    1    0
xgb  0.8333  0.9167 0.9583 0.9500  1.0000    1    0

Kappa 
      Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
lda  0.875  0.9062 1.0000 0.9625  1.0000    1    0
cart 0.750  0.8750 0.8750 0.8875  0.9688    1    0
knn  0.750  1.0000 1.0000 0.9625  1.0000    1    0
svm  0.750  0.8750 0.9375 0.9250  1.0000    1    0
rf   0.750  0.8750 0.9375 0.9125  1.0000    1    0
xgb  0.750  0.8750 0.9375 0.9250  1.0000    1    0



In [77]:

    
dotplot(results)



In [78]:

    
# summarize Best Model
#  Output estimated accuracy of a model.
print(fit.lda)









    



Linear Discriminant Analysis 

120 samples
  4 predictor
  3 classes: 'setosa', 'versicolor', 'virginica' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
Resampling results:

  Accuracy  Kappa 
  0.975     0.9625



In [82]:

    
set.seed(13)
predictions <- predict(fit.lda, newdata=validation)
confusionMatrix(predictions, validation$Species)









    





Confusion Matrix and Statistics

            Reference
Prediction   setosa versicolor virginica
  setosa         10          0         0
  versicolor      0         10         0
  virginica       0          0        10

Overall Statistics
                                     
               Accuracy : 1          
                 95% CI : (0.8843, 1)
    No Information Rate : 0.3333     
    P-Value [Acc > NIR] : 4.857e-15  
                                     
                  Kappa : 1          
 Mcnemar's Test P-Value : NA         

Statistics by Class:

                     Class: setosa Class: versicolor Class: virginica
Sensitivity                 1.0000            1.0000           1.0000
Specificity                 1.0000            1.0000           1.0000
Pos Pred Value              1.0000            1.0000           1.0000
Neg Pred Value              1.0000            1.0000           1.0000
Prevalence                  0.3333            0.3333           0.3333
Detection Rate              0.3333            0.3333           0.3333
Detection Prevalence        0.3333            0.3333           0.3333
Balanced Accuracy           1.0000            1.0000           1.0000



In [ ]:

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
5.1	3.5	1.4	0.2
4.9	3.0	1.4	0.2
4.7	3.2	1.3	0.2
4.6	3.1	1.5	0.2
5.0	3.6	1.4	0.2
5.4	3.9	1.7	0.4

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Sepal.Length	1	150	5.843333	0.8280661	5.80	5.808333	1.03782	4.3	7.9	3.6	0.3086407	-0.6058125	0.06761132
Sepal.Width	2	150	3.057333	0.4358663	3.00	3.043333	0.44478	2.0	4.4	2.4	0.3126147	0.1387047	0.03558833
Petal.Length	3	150	3.758000	1.7652982	4.35	3.760000	1.85325	1.0	6.9	5.9	-0.2694109	-1.4168574	0.14413600
Petal.Width	4	150	1.199333	0.7622377	1.30	1.184167	1.03782	0.1	2.5	2.4	-0.1009166	-1.3581792	0.06223645
Species*	5	150	2.000000	0.8192319	2.00	2.000000	1.48260	1.0	3.0	2.0	0.0000000	-1.5199333	0.06689001

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
5.1	3.5	1.4	0.2
4.9	3.0	1.4	0.2
4.7	3.2	1.3	0.2
4.6	3.1	1.5	0.2
5.0	3.6	1.4	0.2
5.4	3.9	1.7	0.4

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
5.1	3.5	1.4	0.2
4.9	3.0	1.4	0.2
4.7	3.2	1.3	0.2
4.6	3.1	1.5	0.2
5.0	3.6	1.4	0.2
5.4	3.9	1.7	0.4