As usual, we start doing some magic to load R scripts.
In [1]:
%load_ext rmagic
In [2]:
%%R
dir = "19-05-2014"
options(stringsAsFactors=F)
# Read a tab-separated, gzip-compressed trace file into a data frame.
# gzfile() already decompresses on its own, so the original gzcon() wrapper
# was redundant; the connection is closed on exit so it doesn't leak.
read_traces = function(name) {
  con = gzfile(paste(dir, name, sep="/"), open="r")
  on.exit(close(con), add=TRUE)
  read.csv(textConnection(readLines(con)), sep="\t", header = F)
}
buggy_program_events = read_traces("buggy_traces.csv.gz")
robust_program_events = read_traces("robust_traces.csv.gz")
print(nrow(robust_program_events))
print(nrow(buggy_program_events))
# With stringsAsFactors=F the first column is character, so levels() returns
# NULL and `programs` would silently end up empty; unique() yields the
# actual program identifiers.
programs = c(unique(buggy_program_events[,1]), unique(robust_program_events[,1]))
# Class labels ("R" = robust, "B" = buggy) in the same robust-then-buggy
# order used when the two datasets are concatenated in later cells.
cats = factor(c(robust_program_events[,4], buggy_program_events[,4]), levels = c("R","B"))
#write.csv(programs,paste(dir,"programs.csv", sep="/"))
TODO: Add an explanation about program traces as documents.
Now, we load the tm package and create the corpora from the "documents".
In [3]:
%%R
library(tm)
# One "document" per trace: column 2 carries the mutation tokens, column 3
# the event tokens. Robust traces come first, then buggy ones, matching the
# order of the `cats` labels built earlier.
mutation_texts = c(robust_program_events[,2], buggy_program_events[,2])
event_texts = c(robust_program_events[,3], buggy_program_events[,3])
mut_corpus = Corpus(VectorSource(mutation_texts))
evs_corpus = Corpus(VectorSource(event_texts))
print(mut_corpus)
print(evs_corpus)
Now, it is time to create the document-term matrices and convert them to data frames, adding their corresponding classes. The function "inspect", used to convert a document-term matrix into a data frame, prints some information that we don't care about, so we discard the printed messages in this step.
In [4]:
%%R
#library("RWeka")
#options(mc.cores=1)
# Turn each corpus into a document-term matrix, then into a data frame with
# the class label attached.
# NOTE: the original used as.data.frame(inspect(dtm)) wrapped in
# sink("/dev/null") to silence inspect()'s printing; as.matrix() performs
# the same coercion without printing, so the sink() hack (non-portable on
# Windows, and dangerous if an error fires between sink calls) is gone.
mut_dm = DocumentTermMatrix(mut_corpus)
mut_dm_df = as.data.frame(as.matrix(mut_dm))
rownames(mut_dm_df) = 1:nrow(mut_dm)
mut_dm_df["class"] = cats
#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters=" "))
evs_dm = DocumentTermMatrix(evs_corpus, control = list(bounds = list(global = c(1,Inf))))#, tokenize = BigramTokenizer))
print(evs_dm)
# Event-term frequencies in decreasing order; reused later for the
# feature-exclusion experiment in the SVM cell.
sorted_dm = sort(colSums(as.matrix(evs_dm)), decreasing=TRUE)
evs_dm_df = as.data.frame(as.matrix(evs_dm))
#rownames(evs_dm_df) = 1:nrow(evs_dm)
#evs_dm_df = evs_dm_df[,cols]
evs_dm_df["class"] = cats
To finish preparing the data, we need to make sure we are using the same variables for all the corpora in the mutation and event data.
In [5]:
%%R
# Split both feature frames by class label, then glue the mutation features
# to the event features. Row order is preserved by the logical subsetting,
# so the cbind keeps mutation and event rows aligned per trace; only the
# event frame's "class" column is kept to avoid a duplicate.
robust_mut = mut_dm_df$class == "R"
buggy_mut = mut_dm_df$class == "B"
mut_robust_cases = mut_dm_df[robust_mut,]
mut_buggy_cases = mut_dm_df[buggy_mut,]
robust_evs = evs_dm_df$class == "R"
buggy_evs = evs_dm_df$class == "B"
evs_robust_cases = evs_dm_df[robust_evs,]
evs_buggy_cases = evs_dm_df[buggy_evs,]
feature_cols = names(mut_robust_cases) != "class"
both_robust_cases = cbind(mut_robust_cases[,feature_cols], evs_robust_cases)
both_buggy_cases = cbind(mut_buggy_cases[,feature_cols], evs_buggy_cases)
print(nrow(both_robust_cases))
print(nrow(both_buggy_cases))
print(ncol(both_robust_cases))
print(ncol(both_buggy_cases))
In [6]:
%%R
# Use the combined (mutation + event) feature tables as the working
# robust/buggy case sets for the train/test split below.
robust_cases = both_robust_cases
buggy_cases = both_buggy_cases
# Freeing the raw trace table would reclaim memory; left disabled.
#rm(buggy_program_events)
#gc()
Now, we are ready to select the train and test sets.
In [7]:
%%R
# Build a balanced training set (train_size buggy + train_size robust) and
# a test set from the remaining cases. The test size is driven by the
# number of buggy cases; robust cases beyond that are kept aside.
train_size = 250
test_size = nrow(buggy_cases) - train_size
print(train_size)
print(test_size)
# Buggy cases: shuffle once, first train_size rows train, next test_size test.
shuffled = sample(nrow(buggy_cases))
train_idx = head(shuffled, train_size)
test_idx = shuffled[train_size + seq_len(test_size)]
buggy_train = buggy_cases[train_idx,]
buggy_test = buggy_cases[test_idx,]
print(nrow(buggy_train))
print(nrow(buggy_test))
# Robust cases: same scheme; only train_size rows go to training so the
# training set stays balanced. The leftover tail becomes extra test data.
shuffled = sample(nrow(robust_cases))
train_idx = head(shuffled, train_size)
test_idx = shuffled[train_size + seq_len(test_size)]
more_test_idx = shuffled[(train_size + test_size + 1):nrow(robust_cases)]
robust_train = robust_cases[train_idx,]
robust_test = robust_cases[test_idx,]
robust_more_test = robust_cases[more_test_idx,]
print(nrow(robust_train))
print(nrow(robust_test))
train = rbind(buggy_train, robust_train)
test = rbind(buggy_test, robust_test)
#more_test = robust_more_test
xy_train = train
#xy_train[,"class"] = factor(train[,"class"])
x_test = test[,names(test) != "class"]
y_test = test[,"class"]
Finally, we are ready to train and test a k-NN model:
In [27]:
%%R
# Load a held-out set of traces ("filtered") and build the same
# mutation/event document-term data frames for it.
# gzfile() decompresses on its own and read.csv() opens and closes an
# unopened connection itself, so neither the gzcon() wrapper nor manual
# connection management is needed.
more_program_events = read.csv(gzfile(paste(dir, "filtered_traces.csv.gz", sep="/")),
                               sep="\t", header = F)
# NOTE(review): this overwrites the global `cats` used by earlier cells —
# intentional here since those cells have already consumed it.
cats = factor(more_program_events[,4], levels = c("R","B"))
mut_more_corpus = Corpus(VectorSource(more_program_events[,2]))
evs_more_corpus = Corpus(VectorSource(more_program_events[,3]))
mut_more_dm = DocumentTermMatrix(mut_more_corpus)
evs_more_dm = DocumentTermMatrix(evs_more_corpus)
print(evs_more_dm)
# as.matrix() coerces the DTM without inspect()'s printing side effect,
# so the sink("/dev/null") hack is no longer necessary.
mut_more_dm_df = as.data.frame(as.matrix(mut_more_dm))
mut_more_dm_df["class"] = cats
evs_more_dm_df = as.data.frame(as.matrix(evs_more_dm))
evs_more_dm_df["class"] = cats
In [28]:
%%R
gc()
# Join mutation and event features for the held-out traces (only the event
# frame's "class" column is kept), then assemble a balanced test set.
more_cases = cbind(mut_more_dm_df[,names(mut_more_dm_df) != "class"], evs_more_dm_df)
robust_cases = more_cases[more_cases$class == "R",]
buggy_cases = more_cases[more_cases$class == "B",]
# Balance: keep min(#robust, #buggy) randomly chosen cases of each class.
# The original used n = nrow(robust_cases) to index buggy_cases too, which
# produces NA rows whenever there are fewer buggy than robust cases.
n = min(nrow(robust_cases), nrow(buggy_cases))
robust_cases = robust_cases[sample(nrow(robust_cases))[1:n],]
buggy_cases = buggy_cases[sample(nrow(buggy_cases))[1:n],]
print(nrow(robust_cases))
print(nrow(buggy_cases))
more_test = rbind(robust_cases, buggy_cases)
x_more_test = more_test[,names(more_test) != "class"]
y_more_test = more_test[,"class"]
In [62]:
%%R
# Align the feature sets of the training data and the extra test data.
# First direction: features the model was trained on but the extra test
# set lacks are added as all-zero columns, so predict() sees the full
# training vocabulary.
xy_train_vars = names(xy_train)
xy_train_vars = xy_train_vars[xy_train_vars != "class"]
x_more_test_vars = names(x_more_test)
absent_in_test = xy_train_vars[! xy_train_vars %in% x_more_test_vars]
x_more_test[,absent_in_test] = 0
# Second direction: features present in the extra test set but unseen at
# training time; these are zero-filled into the training frame by the SVM
# cell further down.
missing_vars = names(more_test)[!(names(more_test) %in% names(xy_train))]
missing_vars = missing_vars[missing_vars != "class"]
... or an SVM:
In [64]:
%%R
#library("e1071")   # provides svm() — loaded earlier in the session
#library("caret")   # provides confusionMatrix()
#print(names(tail(sorted_dm, -700)))
# Train an SVM on the held-out ("more") data and evaluate it on the
# original training set. The loop over n is a leftover feature-selection
# experiment: with n == 0 every term in sorted_dm is excluded from
# inc_vars, and inc_vars itself is unused by the active svm() call.
for (n in c(0)){#seq(25,100,25)){
exc_vars = names(sorted_dm)#names(tail(sorted_dm, -n))
inc_vars = names(xy_train)[!(names(xy_train) %in% exc_vars)]
#m = svm(class ~., data=xy_train[,inc_vars], gamma=0.001, cost=100)
m = svm(class ~., data=more_test, gamma=0.001, cost=100)
#m = tune.svm(class~., data = more_test, gamma = 10^(-5:-1), cost = 10^(1:2))
#print(summary(m))
#m = m$best.model
# Zero-fill the features the model saw that the evaluation frame lacks
# (missing_vars was computed in the alignment cell above).
x_test_aug = xy_train
x_test_aug[,missing_vars] = 0
z = predict(m,x_test_aug)
#print(m)
print(n)
print(confusionMatrix(table(pred=z, true=xy_train[,"class"]))$overall[1])
#z = predict(m,x_more_test)
#print(confusionMatrix(table(pred=z, true=y_more_test))$overall[1])
}
# warnings() (plural) lists the warnings accumulated during the loop;
# the original print(warning()) instead *raised* a new, empty warning.
print(warnings())
#x_more_test = more_test[,names(test) != "class"]
#y_more_test = more_test[,"class"]
#m = svm(class ~., data=xy_train, gamma=0.1, cost=10)#, kernel="linear")
#m = svm(class ~., data=xy_train, gamma=0.001, cost=100)#, kernel="linear")
#m = tune.svm(class~., data = xy_train, gamma = 10^(-5:-1), cost = 10^(1:2))
#print(summary(m))
#m = m$best.model
#m = svm(class ~., data=xy_train, gamma=0.01, cost=100)#, kernel="linear")
#z = predict(m,x_test)
#print(z)
#print(y_test)
#print(confusionMatrix(table(pred=z, true=y_test)))
#z = predict(m,x_more_test)
#print(z)
#print(confusionMatrix(table(pred=z, true=y_more_test)))
In [12]:
%%R
# Feature-importance proxy: |t(coefs) %*% SV| approximates the magnitude
# of the (primal) weight per feature; assumes a binary model so m$coefs
# has a single column — TODO confirm for multi-class.
scores = t(abs(t(m$coefs) %*% m$SV))
# Permutation that sorts the scores descending. NOTE(review): `$i` relies
# on R's partial matching of the `ix` component returned by index.return.
inds = sort(scores, decreasing=TRUE, index.return = TRUE)$i
# scores is a one-column matrix, so scores[inds,] drops to a named vector;
# print the 50 highest-scoring feature names.
print(names(scores[inds,][1:50]))
In [13]:
%%R
# Persist the trained classifier together with the exact feature names it
# was trained on, so prediction code can rebuild matching input columns.
m_vars = names(xy_train)
save(m_vars, file = file.path(dir, "svms", "mvars.data"))
save(m, file = file.path(dir, "svms", "mutation-event-classifier.svm"))
In [ ]: