In [14]:
# Fetch the helper script and the data set from GitHub (network access needed).
# NOTE(review): helpers called further down (outliers.boxplot, curvature.max,
# dt.categ_mapping, lofactor) are not defined in this file; presumably this
# script provides or attaches them - confirm.
source("https://raw.githubusercontent.com/eogasawara/mylibrary/master/myclustering.R")

# Restores the objects stored in iris.RData; the code below expects `iris`.
load(url("https://github.com/eogasawara/mylibrary/raw/master/data/iris.RData"))

# Working copy of the data with the Species column removed.
irisn <- iris
irisn[["Species"]] <- NULL
Outliers by boxplot
In [15]:
# Row-wise outlier flags from outliers.boxplot(), a helper defined outside this
# file. `out` is used below as a row index, so it is expected to hold one flag
# per row of `iris`.
out <- outliers.boxplot(iris)

# Peek at the first flagged rows.
head(iris[out, ])

# Copy of the data without the flagged rows.
myiris <- iris[!out, ]

# Row positions of the flagged observations.
boutliers <- which(out == TRUE)
print(boutliers)
Outliers by clustering
In [16]:
# Outlier detection with DBSCAN: observations that end up in no cluster are
# reported as outliers.

# Sorted 10-nearest-neighbour distances; eps is read off this curve at the
# point returned by curvature.max() (helper defined outside this file).
knn_dist <- sort(dbscan::kNNdist(irisn, k = 10))
cm <- curvature.max(seq_along(knn_dist), knn_dist, do_plot = FALSE)

# Show the k-NN distance curve with the chosen eps as a dashed line.
options(repr.plot.width = 5, repr.plot.height = 4)
dbscan::kNNdistplot(irisn, k = 10)
abline(h = cm$y, lty = 2)

dbs3n <- fpc::dbscan(irisn, eps = cm$y, MinPts = 10)

# fpc::dbscan codes noise points as cluster 0. The labels are read straight
# from the result instead of being attached to `irisn`: adding a `cluster`
# column would leak the labels into every later analysis that uses `irisn` as
# its feature table, and into this cell's own distances when it is re-run.
dbsoutliers <- which(dbs3n$cluster == 0)
print(dbsoutliers)
Outliers by local outlier factor (LOF)
In [17]:
# Outlier detection with the local outlier factor (LOF).

# Score the measurement columns only: if a `cluster` label column has been
# attached to `irisn`, it must not take part in the distance computation.
lof_features <- irisn[, setdiff(names(irisn), "cluster"), drop = FALSE]
outlier.scores <- lofactor(lof_features, k = 5)

# Distribution of the scores, then the scores from highest to lowest.
plot(density(outlier.scores))
sorted_scores <- sort(outlier.scores, decreasing = TRUE)
plot(sorted_scores, ylab = "LOF score (descending)")

# Row positions of the five highest scores. head() returns fewer entries
# (instead of padding with NA) when the data has fewer than five rows.
looutliers <- head(order(outlier.scores, decreasing = TRUE), 5)
print(looutliers)
In [18]:
# PCA biplot in which only the LOF outliers are labelled with their row number;
# every other observation is drawn as ".".
n <- nrow(irisn)

# Start from all dots and fill in the outlier positions. Negative indexing
# (labels[-looutliers]) would select nothing when `looutliers` is empty and
# leave every point labelled with its number.
labels <- rep(".", n)
labels[looutliers] <- as.character(looutliers)

# Run the PCA on the measurement columns only; a `cluster` label column, if one
# has been attached to `irisn`, is not a feature.
pca_features <- irisn[, setdiff(names(irisn), "cluster"), drop = FALSE]
biplot(prcomp(pca_features), cex = .8, xlabs = labels)
In [19]:
# Scatterplot matrix of `irisn` with the LOF outliers drawn as red "+" and all
# other observations as black ".". Relies on `n` and `looutliers` from the
# preceding cells.
options(repr.plot.width = 7, repr.plot.height = 7)

pch <- replace(rep(".", n), looutliers, "+")
col <- replace(rep("black", n), looutliers, "red")

pairs(irisn, pch = pch, col = col)
Outliers by regression
In [20]:
# Regression-based outlier check: fit a linear model and inspect diagnostic
# plot 4 of plot.lm (Cook's distance).

# dt.categ_mapping() is a helper defined outside this file; the formula below
# expects its result to contain Speciessetosa, Speciesversicolor and
# Speciesvirginica columns.
mycm <- dt.categ_mapping(iris, "Species")

# Sort the rows by the four measurement columns.
mycm <- mycm[
  order(
    mycm[["Sepal.Length"]], mycm[["Sepal.Width"]],
    mycm[["Petal.Length"]], mycm[["Petal.Width"]]
  ),
]

# NOTE(review): the response is the arithmetic sum of the three Species*
# columns. If those are one-hot indicators (as their names suggest) the sum is
# 1 on every row, leaving nothing for the model to explain and making the
# Cook's distance plot uninformative - confirm the intended response.
mod <- lm(Speciessetosa + Speciesversicolor + Speciesvirginica ~ ., data = mycm)

options(repr.plot.width = 4, repr.plot.height = 3)
plot(mod, which = 4)
In [ ]: