In [2]:
%load_ext rmagic
In [3]:
%%R
dir = "24-04-2014"
mycvs = paste(dir, "robust_program_events.csv", sep="/")
robust_program_events = read.csv(mycvs, sep="\t")
mycvs = paste(dir, "buggy_program_events.csv", sep="/")
buggy_program_events = read.csv(mycvs, sep="\t")
print(nrow(robust_program_events))
print(nrow(buggy_program_events))
**TODO**: add an explanation of how program traces are treated as documents.
Now we load the tm package and create the corpora from these "documents".
In [4]:
%%R
library(tm)

# Build a corpus from the first column of an events table: each program
# trace becomes one "document" for the text-mining pipeline.
as_corpus <- function(events) Corpus(VectorSource(events[, 1]))

robust_corpus <- as_corpus(robust_program_events)
buggy_corpus <- as_corpus(buggy_program_events)

print(robust_corpus)
print(buggy_corpus)
Now it is time to create the document-term matrices and convert them to data frames, adding their class labels.
In [7]:
%%R
robust_dm <- DocumentTermMatrix(robust_corpus)
buggy_dm <- DocumentTermMatrix(buggy_corpus)

# Convert a DocumentTermMatrix to a plain data frame and tag every row
# with its class label. as.matrix() is the supported conversion; the
# previous inspect() + sink("/dev/null") trick depended on inspect()
# returning the full matrix — which newer tm versions no longer do —
# and "/dev/null" is not portable to Windows.
dtm_to_df <- function(dm, label) {
  df <- as.data.frame(as.matrix(dm))
  rownames(df) <- seq_len(nrow(df))
  df["class"] <- label
  df
}

robust_dm_df <- dtm_to_df(robust_dm, "robust")
buggy_dm_df <- dtm_to_df(buggy_dm, "buggy")
But we need to make sure we are using the same variables (terms) for both corpora.
In [8]:
%%R
# Align the two term frames on the union of their columns; terms absent
# from one corpus come back as NA and are zero-filled below.
dm_df <- merge(robust_dm_df, buggy_dm_df, all = TRUE, sort = FALSE)
dm_df[is.na(dm_df)] <- 0

# Split back into the two classes using the label column added earlier.
robust_cases <- subset(dm_df, class == "robust")
buggy_cases <- subset(dm_df, class == "buggy")

print(nrow(robust_cases))
print(nrow(buggy_cases))
Now we are ready to select the train and test sets.
In [12]:
%%R
# Number of cases per class placed in the training set. A fixed-size
# split (75 — not 100, as the old comment claimed) keeps the training
# set balanced between classes.
# NOTE(review): the split is deterministic — the original sample() call
# was commented out — so results are reproducible but not randomized.
n_train <- 75

# split_cases: the first n_train rows become the train set, the rest test.
split_cases <- function(cases, n_train) {
  list(train = cases[1:n_train, ],
       test = cases[(n_train + 1):nrow(cases), ])
}

# buggy train and test
buggy_split <- split_cases(buggy_cases, n_train)
buggy_train <- buggy_split$train
buggy_test <- buggy_split$test
print(nrow(buggy_train))
print(nrow(buggy_test))

# robust train and test
robust_split <- split_cases(robust_cases, n_train)
robust_train <- robust_split$train
robust_test <- robust_split$test
print(nrow(robust_train))
print(nrow(robust_test))

# Combine the per-class splits into the final train/test sets.
train <- rbind(buggy_train, robust_train)
test <- rbind(buggy_test, robust_test)
Finally, we are ready to train and test a k-NN model:
In [16]:
%%R
library("class")

# Separate the predictor columns from the class label for both splits.
feature_cols <- names(train) != "class"
x_train <- train[, feature_cols]
x_test <- test[, names(test) != "class"]
y_train <- train[, "class"]
y_test <- test[, "class"]

# Evaluate k-NN over a range of neighborhood sizes, printing the
# confusion matrix for each k.
for (k in seq_len(10)) {
  print(k)
  predictions <- knn(x_train, x_test, y_train, k, use.all = FALSE)
  print(table(predictions, y_test))
}
Or a support vector machine (SVM):
In [33]:
%%R
library("e1071")
xy_train = train#[,names(train) != "class"]
xy_train[,"class"] = factor(train[,"class"])
x_test = test[,names(test) != "class"]
#y_train = train[,"class"]
y_test = test[,"class"]
m = svm(class ~., data=xy_train, gamma=0.001, cost=10)
#tuned = tune.svm(class~., data = xy_train, gamma = 10^(-6:-1), cost = 10^(1:2))
#print(summary(tuned))
z = predict(m,x_test)
print(table(z, y_test))