In [16]:
# Load the helper library straight from GitHub (needs network access).
# The helpers called in this notebook (loadlibrary, plot_size, sample.*, ...)
# are presumably defined by this script -- confirm against the library.
source("https://raw.githubusercontent.com/eogasawara/mylibrary/master/myPreprocessing.R")

# Packages used by the cells below; the load order is kept as-is.
loadlibrary("RColorBrewer")
loadlibrary("dplyr")
loadlibrary("gridExtra")
loadlibrary("reshape")

# Palette: every other colour among the first nine of the 11-colour
# "Spectral" scheme, i.e. positions 1, 3, 5, 7 and 9.
col.set <- brewer.pal(n = 11, name = "Spectral")
mycolors <- col.set[seq(1, 9, by = 2)]

# Default plot size for the notebook (arguments passed positionally: 4, 3).
plot_size(4, 3)
In [17]:
# Draw two samples from iris: one with sample.random() and one with
# sample.stratified() on "Species". Both results expose a `sample` element.
# NOTE(review): the sampling proportion and any RNG seeding are decided inside
# the library helpers -- confirm there if reproducibility matters.
sampler <- sample.random(iris)
head(sampler$sample)
samples <- sample.stratified(iris, "Species")

# Species counts, one row per source: full dataset, random sample,
# stratified sample.
tbl <- rbind(
  table(iris$Species),
  table(sampler$sample$Species),
  table(samples$sample$Species)
)
rownames(tbl) <- c("dataset", "random sample", "stratified sample")
head(tbl)

# Per-class difference between the dataset counts (row 1) and every row.
# sweep() aligns the dataset counts with the columns (classes). A plain
# `tbl[1, ] - tbl` recycles the vector down the columns instead, pairing the
# i-th class count with the i-th row, which is only right when all classes
# have the same size.
tbl <- -sweep(tbl, 2, tbl[1, ])

# Keep only the two sample rows; the dataset row is all zeros by construction.
tbl <- tbl[2:3, ]
head(tbl)
In [18]:
# Number of folds, defined once so the split calls, the per-fold tables and
# the row labels cannot drift apart.
n_folds <- 4

# Split iris into folds twice: with sample.random_kfold() and with
# sample.stratified_kfold() on "Species".
foldsr <- sample.random_kfold(iris, k = n_folds)
foldss <- sample.stratified_kfold(iris, "Species", k = n_folds)

# Species counts of the first n_folds elements of `folds`, one matrix row per
# fold. Built in a single rbind instead of growing a matrix inside a loop.
species_counts_by_fold <- function(folds) {
  do.call(
    rbind,
    lapply(seq_len(n_folds), function(i) table(folds[[i]]$Species))
  )
}

tblr <- species_counts_by_fold(foldsr)
rownames(tblr) <- rep("random sampling", n_folds)
head(tblr)

tbls <- species_counts_by_fold(foldss)
rownames(tbls) <- rep("stratified sampling", n_folds)
head(tbls)
In [19]:
# Flag rows of iris with outliers.boxplot(); `out` is used below as a row index.
# NOTE(review): presumably a logical vector with one entry per row (it is
# negated with `!`) -- confirm against the library.
out <- outliers.boxplot(iris)
# Keep the rows that were not flagged.
myiris <- iris[!out,]
# Preview the flagged rows.
head(iris[out,])
In [20]:
# Three rescaled versions of iris: min-max, z-score with default settings,
# and z-score called with nmean = 0.5 and nsd = 0.5/2.698.
# Each result exposes the transformed data frame as `$data`.
myirisM <- normalize.minmax(iris)
myirisZ <- normalize.zscore(iris)
myirisZS <- normalize.zscore(iris, nmean = 0.5, nsd = 0.5 / 2.698)

# Density plot of Sepal.Width for one data frame, drawn in the first palette
# colour. The select() call is identical for all four panels.
# NOTE(review): the select() asks for Sepal.Width under two names
# (`variable` and `value`); what plot.density() receives depends on the
# installed dplyr version -- confirm.
sepal_width_density <- function(data) {
  plot.density(
    data %>% select(variable = "Sepal.Width", value = Sepal.Width),
    color = mycolors[1]
  )
}

# Panels B and D additionally get xlim(0, 1) on the x axis.
grfA <- sepal_width_density(iris)
grfB <- sepal_width_density(myirisM$data) + xlim(0, 1)
grfC <- sepal_width_density(myirisZ$data)
grfD <- sepal_width_density(myirisZS$data) + xlim(0, 1)

# Use a wider plot for the four side-by-side panels, then restore the
# notebook's usual plot size.
plot_size(8, 3)
grid.arrange(grfA, grfB, grfC, grfD, ncol = 4)
plot_size(4, 3)
In [21]:
# Preview columns 1-4 of iris (the four numeric measurements).
head(iris[,1:4])
# Run dt.pca() on iris, passing "Species" as the second argument.
# NOTE(review): presumably "Species" names the class attribute that is kept
# out of the projection -- confirm against the library.
mypca <- dt.pca(iris, "Species")
# Inspect the `pca` element of the result and the `pca.transf` element of its
# `transf` entry.
head(mypca$pca)
head(mypca$transf$pca.transf)
# Scatter plot built from PC1 (as x), PC2 (as value) and Species (as variable),
# using the first three palette colours.
plot.scatter(mypca$pca %>% select(x=PC1, value=PC2, variable=Species), colors=mycolors[1:3])
Discretization is the process of transferring continuous functions, models, variables, and equations into discrete counterparts.
Smoothing is a technique that creates an approximating function that attempts to capture important patterns in the data while leaving out noise or other fine-scale structures/rapid phenomena.
An important part of discretization/smoothing is setting up the bins used to perform the approximation.
In [22]:
# Bin Sepal.Length into n = 2 bins with three helpers: smoothing.interval,
# smoothing.freq and smoothing.cluster.
bi <- smoothing.interval(iris$Sepal.Length, n = 2)
bf <- smoothing.freq(iris$Sepal.Length, n = 2)
bc <- smoothing.cluster(iris$Sepal.Length, n = 2)

# One row per strategy: label, the `interval` values (one decimal), and
# entropy_group() of the bin factor against Species (two decimals).
show_row(c(
  'interval: ',
  sprintf("%.1f", bi$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bi$bins_factor, iris$Species))
))
show_row(c(
  'freq: ',
  sprintf("%.1f", bf$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bf$bins_factor, iris$Species))
))
show_row(c(
  'cluster: ',
  sprintf("%.1f", bc$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bc$bins_factor, iris$Species))
))
In [23]:
# Run smoothing.opt() with smoothing.freq on each of the four measurements.
# NOTE(review): presumably smoothing.opt() picks the number of bins itself --
# confirm against the library.
bsl <- smoothing.opt(iris$Sepal.Length, smoothing = smoothing.freq)
bsw <- smoothing.opt(iris$Sepal.Width, smoothing = smoothing.freq)
bpl <- smoothing.opt(iris$Petal.Length, smoothing = smoothing.freq)
bpw <- smoothing.opt(iris$Petal.Width, smoothing = smoothing.freq)

# One row per attribute: label, the `interval` values (one decimal), and
# entropy_group() of the bin factor against Species (two decimals).
show_row(c(
  'Sepal.Length: ',
  sprintf("%.1f", bsl$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bsl$bins_factor, iris$Species))
))
show_row(c(
  'Sepal.Width: ',
  sprintf("%.1f", bsw$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bsw$bins_factor, iris$Species))
))
show_row(c(
  'Petal.Length: ',
  sprintf("%.1f", bpl$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bpl$bins_factor, iris$Species))
))
show_row(c(
  'Petal.Width: ',
  sprintf("%.1f", bpw$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bpw$bins_factor, iris$Species))
))
In [24]:
# Run smoothing.opt() with smoothing.cluster on each of the four measurements.
# This reassigns bsl, bsw, bpl and bpw.
bsl <- smoothing.opt(iris$Sepal.Length, smoothing = smoothing.cluster)
bsw <- smoothing.opt(iris$Sepal.Width, smoothing = smoothing.cluster)
bpl <- smoothing.opt(iris$Petal.Length, smoothing = smoothing.cluster)
bpw <- smoothing.opt(iris$Petal.Width, smoothing = smoothing.cluster)

# One row per attribute: label, the `interval` values (one decimal), and
# entropy_group() of the bin factor against Species (two decimals).
show_row(c(
  'Sepal.Length: ',
  sprintf("%.1f", bsl$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bsl$bins_factor, iris$Species))
))
show_row(c(
  'Sepal.Width: ',
  sprintf("%.1f", bsw$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bsw$bins_factor, iris$Species))
))
show_row(c(
  'Petal.Length: ',
  sprintf("%.1f", bpl$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bpl$bins_factor, iris$Species))
))
show_row(c(
  'Petal.Width: ',
  sprintf("%.1f", bpw$interval),
  'entropy: ',
  sprintf("%.2f", entropy_group(bpw$bins_factor, iris$Species))
))
In [25]:
# Force an unbalanced dataset by keeping rows 1-20, 51-100 and 110-120 of
# iris (20 setosa, 50 versicolor and 11 virginica rows).
myiris <- iris[c(1:20, 51:100, 110:120), ]

# Rebalance on "Species" with each of the two balancing helpers.
myiris.bo <- balance.oversampling(myiris, "Species")
myiris.bs <- balance.subsampling(myiris, "Species")

# Species counts before and after balancing, one row per variant.
tbl <- rbind(
  table(myiris$Species),
  table(myiris.bo$Species),
  table(myiris.bs$Species)
)
rownames(tbl) <- c('unbalanced', 'oversampling', 'subsampling')
head(tbl)
In [26]:
# Apply dt.categ_mapping() to the random sample drawn earlier in the notebook
# (`sampler$sample`), passing "Species" as the attribute name.
# NOTE(review): presumably this expands the categorical Species column into
# per-level indicator columns -- confirm against the library.
mycm <- dt.categ_mapping(sampler$sample, "Species")
# Preview the first rows of the result.
head(mycm)
In [ ]: