Outliers

Libraries and Datasets


In [14]:
# Evaluate the shared helper script and load the iris data set from the
# author's repository. Helpers called in later cells (e.g. outliers.boxplot,
# curvature.max, dt.categ_mapping) are presumably defined by this script --
# TODO confirm.
source("https://raw.githubusercontent.com/eogasawara/mylibrary/master/myclustering.R")
load(url("https://github.com/eogasawara/mylibrary/raw/master/data/iris.RData"))

# Copy of iris without the Species column, used by the later cells.
irisn <- iris[, names(iris) != "Species", drop = FALSE]

Outliers by boxplot


In [15]:
# Row mask produced by outliers.boxplot() (defined outside this file). It is
# used directly as a logical row index below, so it is assumed to be a logical
# vector with one entry per row of iris -- TODO confirm.
out <- outliers.boxplot(iris)

# Copy of iris with the flagged rows removed.
myiris <- iris[!out, ]

# Show the first flagged rows.
head(iris[out, ])

# Row positions of the flagged observations; which() already selects the TRUE
# entries, so no comparison against TRUE is needed.
boutliers <- which(out)
print(boutliers)


   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
16          5.7         4.4          1.5         0.4     setosa
33          5.2         4.1          1.5         0.1     setosa
34          5.5         4.2          1.4         0.2     setosa
61          5.0         2.0          3.5         1.0 versicolor
[1] 16 33 34 61

Outliers by clustering


In [16]:
# Sorted k-nearest-neighbour distances (k = 10) of the observations in irisn.
knn_dist <- sort(dbscan::kNNdist(irisn, k = 10))

# curvature.max() is defined outside this file; its `y` component is used
# below both as the reference line on the plot and as the DBSCAN eps.
# seq_along() replaces 1:length(), which misbehaves on empty input.
cm <- curvature.max(seq_along(knn_dist), knn_dist, do_plot = FALSE)

# Plot the kNN distances and mark the selected threshold with a dashed line.
options(repr.plot.width = 5, repr.plot.height = 4)
dbscan::kNNdistplot(irisn, k = 10)
abline(h = cm$y, lty = 2)

# Cluster with the selected eps; rows with cluster label 0 are reported as
# outliers.
dbs3n <- fpc::dbscan(irisn, eps = cm$y, MinPts = 10)

# NOTE(review): this adds a `cluster` column to irisn, so the later
# lofactor(), prcomp() and pairs() calls on irisn will include it alongside
# the measurements -- confirm that is intended.
irisn$cluster <- dbs3n$cluster
dbsoutliers <- which(irisn$cluster == 0)
print(dbsoutliers)


 [1]  16  42  58  61  88  94  99 106 107 108 109 110 118 119 123 126 130 131 132
[20] 135 136 149

Outliers by local outlier factor (LOF)


In [17]:
# Outlier score per row of irisn, computed by lofactor() (not defined in this
# file) with k = 5.
# NOTE(review): if the clustering cell has been run, irisn also carries the
# `cluster` column at this point and it is part of the input -- confirm that
# is intended.
outlier.scores <- lofactor(irisn, k = 5)

# Distribution of the scores, then the scores from highest to lowest.
plot(density(outlier.scores))
sorted_scores <- sort(outlier.scores, decreasing = TRUE)
plot(sorted_scores)

# Row positions of the five highest scores. head() returns fewer entries
# instead of NA padding when there are fewer than five rows.
looutliers <- head(order(outlier.scores, decreasing = TRUE), 5L)
print(looutliers)


[1] 42 16 23 63 25

In [18]:
# One label per observation: "." for ordinary rows, the row number for the
# rows listed in looutliers. Built with positive indexing so that an empty
# looutliers leaves every label as "." (negative indexing with an empty
# vector would select nothing and leave all row numbers in place).
n <- nrow(irisn)
labels <- rep(".", n)
labels[looutliers] <- looutliers

# Biplot of the principal components of irisn, with the labels above.
biplot(prcomp(irisn), cex = 0.8, xlabs = labels)



In [19]:
# Scatterplot matrix of irisn: rows listed in looutliers are drawn as red "+",
# all other rows as black ".". `n` is the row count set in the previous cell.
options(repr.plot.width = 7, repr.plot.height = 7)
pch <- replace(rep(".", n), looutliers, "+")
col <- replace(rep("black", n), looutliers, "red")
pairs(irisn, pch = pch, col = col)


Outliers by regression


In [20]:
# dt.categ_mapping() is defined outside this file; the formula below relies on
# it producing the columns Speciessetosa, Speciesversicolor, Speciesvirginica.
mycm <- dt.categ_mapping(iris, "Species")

# Sort the rows by the four measurements, in this priority order.
mycm <- mycm[order(
  mycm[["Sepal.Length"]],
  mycm[["Sepal.Width"]],
  mycm[["Petal.Length"]],
  mycm[["Petal.Width"]]
), ]

# NOTE(review): the response is the arithmetic sum of the three Species*
# columns. If those are 0/1 indicator columns the sum is 1 for every row and
# the fit is degenerate -- confirm the intended response.
mod <- lm(Speciessetosa + Speciesversicolor + Speciesvirginica ~ ., data = mycm)

# Diagnostic plot number 4 of plot.lm (Cook's distance).
options(repr.plot.width = 4, repr.plot.height = 3)
plot(mod, which = 4)



In [ ]: