Fuzzy Clustering

Vahid Mirjalili, Data Mining Researcher


In [7]:
library("e1071")

source("clusterPurity.R")

In [2]:
### create a sample data set: 3 clusters, 6 dimensions
df <- data.frame(
  w=c(rep(1,100), rep(2, 100), rep(3, 100)),
  x1=c(rnorm(100, -1, 1), rnorm(100, 1, 1), rnorm(100, 0, 1)),
  x2=c(rnorm(100, -2.5, 2), rnorm(100, 2.5, 2), rnorm(100, 0, 2)),
  x3=c(rnorm(100, 0, 1), rnorm(100, 2, 1), rnorm(100, 1, 1)),
  x4=c(rnorm(100, 0, 2), rnorm(100, 5, 2), rnorm(100, 2.5, 1)),
  x5=c(rnorm(100, -2, 1), rnorm(100, 1, 1), rnorm(100, 0.5, 1)),
  x6=c(rnorm(100, -3, 2), rnorm(100, 1.4, 2), rnorm(100, -0.3, 1.6))
)

In [3]:
## plot the sample data set:

plot(df[,-1], col=df$w)



In [6]:
res.fcm <- cmeans(df[,-1], centers=3)

str(res.fcm)


List of 7
 $ centers    : num [1:3, 1:6] -0.129 0.661 -0.659 -0.111 2.317 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "1" "2" "3"
  .. ..$ : chr [1:6] "x1" "x2" "x3" "x4" ...
 $ size       : int [1:3] 97 103 100
 $ cluster    : int [1:300] 3 3 3 3 3 3 3 3 3 3 ...
 $ membership : num [1:300, 1:3] 0.3796 0.1804 0.2756 0.3135 0.0693 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : NULL
  .. ..$ : chr [1:3] "1" "2" "3"
 $ iter       : num 23
 $ withinerror: num 7.69
 $ call       : language cmeans(x = df[, -1], centers = 3)
 - attr(*, "class")= chr "fclust"

In [9]:
## Evaluate the purity of each obtained hard-clusters
## compraed to ground-truth cluster labels

res.fcm.split <- split(df$w, as.factor(res.fcm$cluster))
res.fcm.split

sapply(res.fcm.split, clustPurity)


Out[9]:
$`1`
 [1] 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[39] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[77] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3

$`2`
  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3

$`3`
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3
Out[9]:
        1         2         3 
0.8247423 0.8446602 0.9600000 

In [11]:
p1 <- cmeans(df[,c("x1", "x2")], centers=3)
p2 <- cmeans(df[,c("x3", "x4")], centers=3)
p3 <- cmeans(df[,c("x5", "x6")], centers=3)

sapply(split(df$w, as.factor(p1$cluster)), clustPurity)
sapply(split(df$w, as.factor(p2$cluster)), clustPurity)
sapply(split(df$w, as.factor(p3$cluster)), clustPurity)


Out[11]:
        1         2         3 
0.7252747 0.4705882 0.7444444 
Out[11]:
        1         2         3 
0.5974843 0.9285714 0.9718310 
Out[11]:
        1         2         3 
0.5454545 0.9506173 0.6734694 

In [12]:
write.csv(df, "sample.data/df.csv", col.names=F, row.names=F)

write.csv(p1$membership, "sample.data/p1_memb.csv", col.names=F, row.names=F)
write.csv(p2$membership, "sample.data/p2_memb.csv", col.names=F, row.names=F)
write.csv(p3$membership, "sample.data/p3_memb.csv", col.names=F, row.names=F)


simpleWarning in write.csv(df, "sample.data/df.csv", col.names = F, row.names = F): attempt to set 'col.names' ignored
simpleWarning in write.csv(p1$membership, "sample.data/p1_memb.csv", col.names = F, : attempt to set 'col.names' ignored
simpleWarning in write.csv(p2$membership, "sample.data/p2_memb.csv", col.names = F, : attempt to set 'col.names' ignored
simpleWarning in write.csv(p3$membership, "sample.data/p3_memb.csv", col.names = F, : attempt to set 'col.names' ignored

In [ ]: