In [ ]:
# Load the random forest implementation used throughout this notebook.
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less obvious message.
library(randomForest)
In [ ]:
# Fix the RNG seed so that everything random below is reproducible.
set.seed(20121228)

# Read the training data. stringsAsFactors is spelled out because R >= 4.0
# no longer turns string columns into factors by default, while the rest of
# this script (see the Embarked re-levelling below) expects factor columns.
train <- read.csv("train.csv", stringsAsFactors = TRUE)

# Columns that are removed from the data before modelling.
drops <- c("PassengerId", "Name", "Cabin", "Ticket")

# The response is converted to a factor so it is treated as a class label.
train <- transform(train, Survived = as.factor(Survived))
train <- train[, !(names(train) %in% drops)]

# Remove rows whose Embarked value is blank, then rebuild the factor so the
# now-unused empty level is dropped from its level set.
train <- subset(train, Embarked != "")
train <- transform(train, Embarked = as.factor(as.character(Embarked)))

summary(train)
In [ ]:
# This is for checking the "Embarked" feature
# e <- train$Embarked
# str(e)
In [ ]:
# Hold-out split: of the roughly 889 rows, 720 randomly drawn row positions
# are used for fitting; the rows that were not drawn serve as validation data.
train_row <- sample(seq_len(nrow(train)), size = 720)
In [ ]:
# Fit the forest with Survived as the response and every other column as a
# predictor, restricted to the sampled rows in train_row. Rows containing
# missing values are discarded via na.omit.
rf.titanic <- randomForest(
  Survived ~ .,
  data = train,
  na.action = na.omit,
  subset = train_row
)
In [ ]:
# Dump the internal structure of the fitted model object for inspection.
str(object = rf.titanic)
In [ ]:
# Show the rows sitting at the 2nd and 4th positions of the sampled index.
train[train_row[c(2, 4)], ]
In [ ]:
# Share of the model's stored predictions that agree with the true label.
# The stored predictions are named, and those names are used to look up the
# matching rows of train.
sum(rf.titanic$predicted == train[names(rf.titanic$predicted), "Survived"]) /
  length(rf.titanic$predicted)
In [ ]:
# Extract tree number 1 of the fitted forest for inspection.
getTree(rf.titanic, k = 1)
In [ ]:
# Predict a single row (the 2nd sampled training row) with predict.all = TRUE
# and inspect what comes back. The stray trailing comma of the original call,
# which passed an empty extra argument to predict(), has been removed.
res <- predict(rf.titanic, newdata = train[train_row[2], ], predict.all = TRUE)
str(res)
In [ ]:
# Predict every row that was sampled for training.
pred_train <- predict(rf.titanic, newdata = train[train_row, ])

# How many rows received no prediction, and a peek at the first predictions.
sum(is.na(pred_train))
head(pred_train)

# Rows without a prediction are counted as class 0.
pred_train[is.na(pred_train)] <- 0

# Fraction of the sampled rows whose (filled-in) prediction matches the label.
print("Correct rate on training set: ")
rate <- sum(train$Survived[train_row] == pred_train) / length(pred_train)
print(rate)
length(pred_train)
In [ ]:
# Predict the held-out rows, i.e. every row that is not in train_row.
pred_validation <- predict(rf.titanic, newdata = train[-train_row, ])
length(pred_validation)

# Rows without a prediction are counted and then treated as class 0.
sum(is.na(pred_validation))
pred_validation[is.na(pred_validation)] <- 0

# True labels of the same held-out rows.
res_validation <- train[-train_row, "Survived"]

# Fraction of held-out rows whose (filled-in) prediction matches the label.
print("Correct rate on validation set: ")
print(sum(pred_validation == res_validation) / length(res_validation))
In [ ]:
# Read the test data. stringsAsFactors is spelled out because R >= 4.0 no
# longer converts string columns to factors by default, and the training
# data in this script is handled as factors.
test <- read.csv("test.csv", stringsAsFactors = TRUE)

# Add a placeholder Survived column. One entry is set to 1 so that, once
# converted, the factor has both levels "0" and "1".
# NOTE(review): predict() presumably does not need a response column in the
# new data - confirm before removing this placeholder.
test <- transform(test, Survived = 0)
test[1, "Survived"] <- 1
test <- transform(test, Survived = as.factor(Survived))

# Remove the same columns that were removed from the training data.
test <- test[, !(names(test) %in% drops)]

summary(test)
In [ ]:
# Apply the fitted forest to the prepared test data and inspect the result.
pred <- predict(rf.titanic, newdata = test)
str(object = pred)
In [ ]:
# Build the submission table.
# The ids are taken from the test file itself instead of the hard-coded range
# 892:1309, so the table stays correct if the file has other ids or another
# row count. Assumes test.csv has a PassengerId column - the length check
# below fails loudly if it does not line up with the predictions.
test_ids <- read.csv("test.csv")$PassengerId
stopifnot(length(test_ids) == length(pred))

# Decode the factor through its labels ("0"/"1") rather than through its
# internal integer codes, so the result does not depend on level order.
result.titanic <- data.frame(
  PassengerId = test_ids,
  Survived = as.numeric(as.character(pred))
)
In [ ]:
# Sanity-check the submission table: column summaries, structure, first rows.
summary(object = result.titanic)
str(object = result.titanic)
head(x = result.titanic)
In [ ]:
# Number of passengers that ended up without a prediction.
sum(is.na(result.titanic$Survived))

# Fill those missing predictions with 0.
result.titanic[is.na(result.titanic$Survived), "Survived"] <- 0
In [ ]:
# Display the complete submission table after the missing values were filled.
result.titanic
In [ ]:
# Save the submission table as CSV, without the row-name column.
write.csv(
  x = result.titanic,
  file = "my_random_forest_2.csv",
  row.names = FALSE
)
The reason looks pretty simple: there are "NA" values in the training set.