As usual, we start by loading the rmagic extension, which lets us run R code from this notebook.


In [1]:
%load_ext rmagic

In [2]:
%%R

# Load the buggy and robust program traces and derive the class labels.
# Each trace file is a gzipped, tab-separated table without a header row.

# Directory (relative to the working directory) holding the trace files.
dir <- "19-05-2014"

# Keep text columns as character vectors instead of factors.
options(stringsAsFactors = FALSE)

# read.csv() opens an unopened gzfile() connection itself and closes it when
# it is done, so no connection is left open. gzfile() already decompresses, so
# no additional gzcon()/textConnection() wrapper is needed.
buggy_program_events <- read.csv(
  gzfile(file.path(dir, "buggy_traces.csv.gz")),
  sep = "\t", header = FALSE
)

robust_program_events <- read.csv(
  gzfile(file.path(dir, "robust_traces.csv.gz")),
  sep = "\t", header = FALSE
)

print(nrow(robust_program_events))
print(nrow(buggy_program_events))

# With stringsAsFactors disabled the first column is not a factor, so levels()
# would return NULL. sort(unique(...)) yields the same sorted distinct values
# that levels() gives for a factor.
programs <- c(
  sort(unique(buggy_program_events[, 1])),
  sort(unique(robust_program_events[, 1]))
)

# Class labels taken from column 4, robust rows first and buggy rows second.
# Any value other than "R" or "B" becomes NA.
cats <- factor(
  c(robust_program_events[, 4], buggy_program_events[, 4]),
  levels = c("R", "B")
)

#write.csv(programs,paste(dir,"programs.csv", sep="/"))


[1] 182
[1] 546

TODO: Add an explanation about program traces as documents.

Now we load the tm package and create the corpora from the "documents".


In [3]:
%%R

library(tm)

# One corpus per trace column: column 2 feeds mut_corpus and column 3 feeds
# evs_corpus. Robust rows are placed before buggy rows, the same order used
# when the class labels (cats) were built.
mut_corpus <- Corpus(VectorSource(
  c(robust_program_events[, 2], buggy_program_events[, 2])
))
evs_corpus <- Corpus(VectorSource(
  c(robust_program_events[, 3], buggy_program_events[, 3])
))

# Show a short summary of each corpus.
print(mut_corpus)
print(evs_corpus)


A corpus with 728 text documents
A corpus with 728 text documents

Now it is time to create the document-term matrices and convert them to data frames, adding the corresponding class to each document. The "inspect" function, used here to convert a document-term matrix into a data frame, also prints information that we do not care about, so we discard the printed output in this step.


In [4]:
%%R

# Build one document-term matrix per corpus and convert each to a data frame
# with one row per document, one column per term, plus a "class" column.

#library("RWeka")
#options(mc.cores=1)

mut_dm <- DocumentTermMatrix(mut_corpus)

# as.matrix() returns the dense document-by-term counts directly. inspect() is
# a display function (newer tm releases print only a sample of the matrix), so
# it is not used for conversion, and nothing needs to be silenced with sink().
mut_dm_df <- as.data.frame(as.matrix(mut_dm))
rownames(mut_dm_df) <- seq_len(nrow(mut_dm))
mut_dm_df["class"] <- cats

#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters=" "))
evs_dm <- DocumentTermMatrix(
  evs_corpus,
  control = list(bounds = list(global = c(1, Inf)))
) #, tokenize = BigramTokenizer))
print(evs_dm)

# Total count of every event term over all documents, most frequent first.
sorted_dm <- sort(colSums(as.matrix(evs_dm)), decreasing = TRUE)

evs_dm_df <- as.data.frame(as.matrix(evs_dm))
evs_dm_df["class"] <- cats


A document-term matrix (728 documents, 782 terms)

Non-/sparse entries: 27754/541542
Sparsity           : 95%
Maximal term length: 36 
Weighting          : term frequency (tf)

To finish preparing the data, we need to make sure we are using the same variables for all the corpora in the mutation and event data.


In [5]:
%%R

# Split both feature tables by class label, then join them side by side.

# Mutation features, one table per class.
mut_robust_cases <- mut_dm_df[mut_dm_df[["class"]] == "R", ]
mut_buggy_cases <- mut_dm_df[mut_dm_df[["class"]] == "B", ]

# Event features, one table per class.
evs_robust_cases <- evs_dm_df[evs_dm_df[["class"]] == "R", ]
evs_buggy_cases <- evs_dm_df[evs_dm_df[["class"]] == "B", ]

# The mutation table's own "class" column is dropped before binding, so the
# combined table carries a single "class" column (the one from the event table).
both_robust_cases <- cbind(
  mut_robust_cases[, colnames(mut_robust_cases) != "class"],
  evs_robust_cases
)
both_buggy_cases <- cbind(
  mut_buggy_cases[, colnames(mut_buggy_cases) != "class"],
  evs_buggy_cases
)

# Row counts per class.
print(nrow(both_robust_cases))
print(nrow(both_buggy_cases))

# Column counts per class.
print(ncol(both_robust_cases))
print(ncol(both_buggy_cases))


[1] 397
[1] 331
[1] 1362
[1] 1362

In [6]:
%%R

# Use the combined (mutation + event) tables as the working datasets for the
# train/test split.
buggy_cases <- both_buggy_cases
robust_cases <- both_robust_cases

#rm(buggy_program_events)
#gc()

Now we are ready to select the training and test sets.


In [7]:
%%R

# Build class-balanced training and test sets.
# train_size rows of each class go to training. The remaining buggy rows form
# the buggy test set, and the same number of robust rows is drawn for testing.
# Leftover robust rows are kept in robust_more_test.
# The split is random (no seed is set), so it differs between runs.

train_size <- 250
test_size <- nrow(buggy_cases) - train_size

# Fail early instead of silently selecting NA rows when there is not enough
# data for the requested sizes.
stopifnot(
  test_size >= 0,
  nrow(robust_cases) >= train_size + test_size
)

print(train_size)
print(test_size)

# Positions inside a shuffled index vector: training rows first, test rows
# next. seq_len() yields an empty vector for a zero size, whereas a:b would
# count backwards.
train_pos <- seq_len(train_size)
test_pos <- train_size + seq_len(test_size)

# buggy train and test

n <- nrow(buggy_cases)
rsample <- sample(n)

train_sample <- rsample[train_pos]
test_sample <- rsample[test_pos]

buggy_train <- buggy_cases[train_sample, ]
buggy_test <- buggy_cases[test_sample, ]

print(nrow(buggy_train))
print(nrow(buggy_test))

# robust train and test

n <- nrow(robust_cases)
rsample <- sample(n)

# The same sizes as for the buggy class are used to keep the datasets balanced.
train_sample <- rsample[train_pos]
test_sample <- rsample[test_pos]
# Everything after the train and test positions; empty when nothing is left.
more_test_sample <- rsample[seq_len(n) > train_size + test_size]

robust_train <- robust_cases[train_sample, ]
robust_test <- robust_cases[test_sample, ]
robust_more_test <- robust_cases[more_test_sample, ]

print(nrow(robust_train))
print(nrow(robust_test))

train <- rbind(buggy_train, robust_train)
test <- rbind(buggy_test, robust_test)
#more_test = robust_more_test

# xy_train keeps the "class" column; the test set is split into features and
# labels.
xy_train <- train
x_test <- test[, names(test) != "class"]
y_test <- test[, "class"]

#print(more_test[1,])


[1] 250
[1] 81
[1] 250
[1] 81
[1] 250
[1] 81

Finally, we are ready to train and test a knn model:


In [27]:
%%R

# Load the additional "filtered" traces and build their mutation and event
# document-term data frames, each with a "class" column.

# read.csv() opens and closes the gzfile() connection itself, so no
# connection is left open.
more_program_events <- read.csv(
  gzfile(file.path(dir, "filtered_traces.csv.gz")),
  sep = "\t", header = FALSE
)

# Class labels from column 4. This replaces the earlier value of cats.
cats <- factor(more_program_events[, 4], levels = c("R", "B"))

mut_more_corpus <- Corpus(VectorSource(more_program_events[, 2]))
evs_more_corpus <- Corpus(VectorSource(more_program_events[, 3]))

mut_more_dm <- DocumentTermMatrix(mut_more_corpus)
evs_more_dm <- DocumentTermMatrix(evs_more_corpus)

print(evs_more_dm)

# as.matrix() returns the dense document-by-term counts directly. inspect() is
# a display function (newer tm releases print only a sample of the matrix), so
# it is not used for conversion, and nothing needs to be silenced with sink().
mut_more_dm_df <- as.data.frame(as.matrix(mut_more_dm))
mut_more_dm_df["class"] <- cats

evs_more_dm_df <- as.data.frame(as.matrix(evs_more_dm))
evs_more_dm_df["class"] <- cats


A document-term matrix (2333 documents, 1026 terms)

Non-/sparse entries: 61664/2331994
Sparsity           : 97%
Maximal term length: 34 
Weighting          : term frequency (tf)

In [28]:
%%R

# Build a class-balanced extra test set from the filtered traces.

gc()

# Join mutation and event features. The mutation table's "class" column is
# dropped so the result has a single "class" column.
more_cases <- cbind(
  mut_more_dm_df[, names(mut_more_dm_df) != "class"],
  evs_more_dm_df
)

robust_cases <- more_cases[more_cases$class == "R", ]
buggy_cases <- more_cases[more_cases$class == "B", ]

# Take the same number of rows from each class. Using the smaller class size
# avoids NA rows when there are fewer buggy than robust cases. When buggy cases
# are at least as numerous, this equals the robust count as before.
n <- min(nrow(robust_cases), nrow(buggy_cases))

# sample.int() draws the same permutation as sample() for a positive row count
# and also copes with an empty table. seq_len(n) is empty for n == 0.
rsample <- sample.int(nrow(robust_cases))
robust_cases <- robust_cases[rsample[seq_len(n)], ]

rsample <- sample.int(nrow(buggy_cases))
buggy_cases <- buggy_cases[rsample[seq_len(n)], ]

print(nrow(robust_cases))
print(nrow(buggy_cases))

more_test <- rbind(robust_cases, buggy_cases)

# Features and labels of the extra test set.
x_more_test <- more_test[, names(more_test) != "class"]
y_more_test <- more_test[, "class"]


[1] 837
[1] 837

In [62]:
%%R

# Reconcile the column sets of the training data and the extra test data.

x_more_test_vars <- names(x_more_test)

# Training feature names, i.e. every column except the label.
xy_train_vars <- names(xy_train)[names(xy_train) != "class"]

# Training features absent from the extra test features are added to
# x_more_test as all-zero columns.
missing_vars <- xy_train_vars[is.na(match(xy_train_vars, x_more_test_vars))]
x_more_test[, missing_vars] <- 0

# From here on, missing_vars holds the opposite difference: columns of
# more_test (label excluded) that do not occur in xy_train.
missing_vars <- names(more_test)[
  !(names(more_test) %in% names(xy_train)) & names(more_test) != "class"
]
#print(names(more_test))
#print()

Alternatively, we can train an SVM:


In [64]:
%%R
# Train an SVM on the extra (filtered) data and report its accuracy on the
# original training table.

# svm() comes from e1071 and confusionMatrix() from caret. Loading them here
# keeps the cell runnable on its own.
library("e1071")
library("caret")

#print(names(tail(sorted_dm, -700)))

# The loop is a leftover from a sweep over the number of excluded event terms.
# It currently runs once, with n = 0. Note that it overwrites the global n.
for (n in c(0)) { #seq(25,100,25)){
  # exc_vars and inc_vars are only used by the commented-out model below.
  exc_vars <- names(sorted_dm) #names(tail(sorted_dm, -n))
  inc_vars <- names(xy_train)[!(names(xy_train) %in% exc_vars)]

  #m = svm(class ~., data=xy_train[,inc_vars], gamma=0.001, cost=100)
  m <- svm(class ~ ., data = more_test, gamma = 0.001, cost = 100)
  #m = tune.svm(class~., data = more_test,  gamma = 10^(-5:-1), cost = 10^(1:2))
  #print(summary(m))
  #m = m$best.model

  # Add the columns the model was trained on but xy_train lacks, filled with
  # zeros, so that predict() finds every variable it needs.
  x_test_aug <- xy_train
  x_test_aug[, missing_vars] <- 0

  z <- predict(m, x_test_aug)

  #print(m)
  print(n)
  # The first element of $overall is the accuracy.
  print(confusionMatrix(table(pred = z, true = xy_train[, "class"]))$overall[1])

  #z = predict(m,x_more_test)
  #print(confusionMatrix(table(pred=z, true=y_more_test))$overall[1])
}

# warnings() reports the warnings collected so far, whereas warning() would
# raise a new, empty warning.
print(warnings())

#x_more_test = more_test[,names(test) != "class"]
#y_more_test  = more_test[,"class"]

#m = svm(class ~., data=xy_train, gamma=0.1, cost=10)#, kernel="linear")
#m = svm(class ~., data=xy_train, gamma=0.001, cost=100)#, kernel="linear")

#m = tune.svm(class~., data = xy_train,  gamma = 10^(-5:-1), cost = 10^(1:2))
#print(summary(m))
#m = m$best.model

#m = svm(class ~., data=xy_train, gamma=0.01, cost=100)#, kernel="linear")

#z = predict(m,x_test)
#print(z)
#print(y_test)
#print(confusionMatrix(table(pred=z, true=y_test)))

#z = predict(m,x_more_test)
#print(z)
#print(confusionMatrix(table(pred=z, true=y_more_test)))


[1] 0
Accuracy 
    0.55 
[1] ""

In [12]:
%%R

# Rank the input variables of the fitted SVM `m` and print the top 50 names.
# The score of a variable is the absolute value of its entry in
# t(coefs) %*% SV, i.e. the coefficient-weighted sum over support vectors.
scores <- t(abs(t(m$coefs) %*% m$SV))

# sort() with index.return = TRUE returns a list with components x and ix.
# The component is spelled out as $ix instead of relying on partial matching.
inds <- sort(scores, decreasing = TRUE, index.return = TRUE)$ix

# scores has one row per variable, so its row names are the variable names.
# head() returns at most 50 names and never pads with NA when fewer exist.
print(head(rownames(scores)[inds], 50))


 [1] "X.strlen.ret_val.num32b8."          "X.strlen.0.hptr32."                
 [3] "X.strchr.1.num32b8."                "X.strchr.0.lptr32."                
 [5] "X.strcpy.ret_val.hptr32."           "X.strcpy.0.hptr32."                
 [7] "X.strlen.0.gptr32."                 "X.memcpy.1.hptr32."                
 [9] "X.strcpy.1.hptr32."                 "X.strchr.ret_val.nptr32."          
[11] "X.__ctype_b_loc.ret_val.fptr32."    "X.__ctype_b_loc.0.top32."          
[13] "X.memcpy.0.hptr32."                 "X.memcpy.ret_val.hptr32."          
[15] "X.memcpy.2.num32b8."                "X._io_getc.ret_val.num32b8."       
[17] "X.strchr.ret_val.lptr32."           "X.fread.1.num32b8."                
[19] "X.fread.3.hptr32."                  "X.fread.2.num32b8."                
[21] "X.fread.0.sptr32."                  "X.fread.ret_val.num32b0."          
[23] "X.strlen.0.lptr32."                 "X.tolower.0.num32b8."              
[25] "X.tolower.ret_val.num32b8."         "X._io_getc.0.hptr32."              
[27] "X.memcpy.1.lptr32."                 "X.realloc.ret_val.hptr32."         
[29] "X.realloc.1.num32b8."               "X.strcmp.0.hptr32."                
[31] "X.memset.0.hptr32."                 "X.memset.ret_val.hptr32."          
[33] "X.realloc.0.nptr32."                "X.memset.2.num32b16."              
[35] "X.memset.1.num32b0."                "X.__errno_location.0.top32."       
[37] "X.__errno_location.ret_val.fptr32." "X.fgets.0.sptr32."                 
[39] "X.fgets.ret_val.sptr32."            "X.fgets.2.hptr32."                 
[41] "X.fgets.1.num32b16."                "X.strcmp.1.sptr32."                
[43] "X.vfprintf.0.lptr32."               "X.vfprintf.2.sptr32."              
[45] "X.vfprintf.ret_val.num32b8."        "X.fprintf.1.gptr32."               
[47] "X.vfprintf.1.gptr32."               "X.fprintf.0.lptr32."               
[49] "X.fputc.0.num32b8."                 "X.fputc.1.lptr32."                 

In [13]:
%%R
# Persist the classifier and the list of variable names next to the trace data.

# save() does not create missing directories, so make sure "<dir>/svms" exists.
svm_dir <- file.path(dir, "svms")
dir.create(svm_dir, showWarnings = FALSE, recursive = TRUE)

# NOTE(review): m is fitted on more_test in the SVM cell, while m_vars records
# the columns of xy_train. Confirm which column set consumers of mvars.data
# expect.
m_vars <- names(xy_train)

save(m_vars, file = file.path(svm_dir, "mvars.data"))
save(m, file = file.path(svm_dir, "mutation-event-classifier.svm"))

In [ ]: