In [1]:
## Record the R version, platform, locale and loaded packages of this session:
sessionInfo()


Out[1]:
R version 3.1.1 (2014-07-10)
Platform: x86_64-pc-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
[1] base64_1.1     digest_0.6.4   evaluate_0.5.5 IRdisplay_0.1  IRkernel_0.1  
[6] jsonlite_0.9.8 rzmq_0.7.0     stringr_0.6.2  uuid_0.1-1    

In [2]:
## Attach kernlab and bring its bundled `spam` dataset into the workspace.

library("kernlab")

data("spam")

## Class balance: number of emails per label.
table(spam[["type"]])


Out[2]:
nonspam    spam 
   2788    1813 

In [3]:
## Partition the rows into training and test sets.
## Each row is assigned to training independently with probability 0.6,
## so the split is approximately (not exactly) 60% / 40%.

set.seed(3035)

## One Bernoulli draw per row; TRUE marks a training row.
trainInd <- as.logical(rbinom(n = nrow(spam), size = 1, prob = 0.6))
table(trainInd)

testSet <- spam[!trainInd, ]
trainSet <- spam[trainInd, ]


Out[3]:
trainInd
FALSE  TRUE 
 1807  2794 

In [4]:
## Exploring the data: peek at the first few column names.
head(colnames(trainSet))

## The column names are words that appear in emails;
## the values are the frequency of their appearance in each email.


Out[4]:
[1] "make"    "address" "all"     "num3d"   "our"     "over"   

Cluster the words to see which ones tend to appear together in an email.


In [5]:
## Hierarchical clustering of the words using their raw frequencies.
## Drop the last column of trainSet before clustering; the index comes from
## trainSet itself (ncol) rather than from the width of a different object.
wordFreq <- trainSet[, -ncol(trainSet)]

## t() puts one word per row, so dist() measures distances between words
## (columns of the data) rather than between emails (rows).
hc <- hclust(dist(t(wordFreq)))

## Explicit labels: the default x label would be the deparsed dist(...) call.
plot(hc, main = "Word clusters (raw frequencies)", xlab = "Word")



In [21]:
## Hierarchical clustering of the words on log10(frequency + 1).
## ncol(trainSet) is the index of the last column, which is the email class;
## it is excluded so that only the frequency columns are clustered.
## The +1 offset keeps log10() finite for any zero frequencies.
logWordFreq <- log10(trainSet[, -ncol(trainSet)] + 1)

## t() puts one word per row, so dist() measures distances between words.
hc <- hclust(dist(t(logWordFreq)))

## Explicit labels: the default x label would be the deparsed dist(...) call.
plot(hc, main = "Word clusters (log10(frequency + 1))", xlab = "Word")



In [ ]: