In [1]:
## Session info: show the R version, platform, and loaded packages
## so the analysis environment is recorded alongside the results.
sessionInfo()
Out[1]:
In [2]:
## Attach kernlab, load its `spam` dataset into the workspace,
## and tabulate how many emails fall into each class.
library(kernlab)
data("spam", package = "kernlab")
table(spam[["type"]])
Out[2]:
In [3]:
## Partition the rows into training and test sets.
## Each row gets an independent Bernoulli(0.6) draw, so the split is only
## approximately 60% training / 40% testing; the fixed seed makes it repeatable.
set.seed(3035)
trainInd <- as.logical(rbinom(n = nrow(spam), size = 1, prob = 0.6))
table(trainInd)
trainSet <- spam[trainInd, ]
testSet <- spam[!trainInd, ]
Out[3]:
In [4]:
## Explore the data: peek at the first few column names.
head(colnames(trainSet))
## Column names are the words that appear in emails;
## values represent the frequency of their appearance in each email.
Out[4]:
In [5]:
## Hierarchically cluster the columns of the training set.
## The last column is dropped before transposing, so that dist() compares
## variables (rows after t()) rather than emails.
hc <- hclust(
  d = dist(x = t(trainSet[, -ncol(trainSet)]))
)
plot(hc)
In [21]:
## Repeat the clustering on log10(x + 1)-transformed values.
## ncol(trainSet) is the index of the last column, which holds the email
## class, so the negative index excludes it.
hc <- hclust(
  d = dist(x = t(log10(1 + trainSet[, -ncol(trainSet)])))
)
plot(hc)
In [ ]: