In [ ]:
library(tidyverse)
library(mice)
library(randomForest)

In [ ]:
# Read the train/test CSVs, stack them into one table, and fill in gaps
# following Megan Risdal's method: two Embarked entries and one Fare entry
# are set by hand, and Age is replaced by the values mice produces with
# method = "rf".
set.seed(20121228)
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)
full <- bind_rows(train, test)

# Hand-picked replacements at fixed row positions of the combined table.
full$Fare[1044] <- 8.05
full$Embarked[c(62, 830)] <- "C"

# Convert the categorical columns to factors.
factor_vars <- c("Pclass", "Sex", "Embarked", "Survived")
full[factor_vars] <- lapply(full[factor_vars], as.factor)

# Run mice on every column except the ones listed here, then copy only the
# completed Age column back into the combined table.
mice_mod <- mice(
    full[, setdiff(
        names(full),
        c("PassengerId", "Name", "Ticket", "Cabin", "Survived")
    )],
    method = "rf"
)
mice_output <- complete(mice_mod)
full$Age <- mice_output$Age
summary(full)

In [ ]:
# Rows with a known Survived value form the training table; rows where
# Survived is NA form the table to predict on.
train_nona <- full[!is.na(full$Survived), ]
test_nona <- full[is.na(full$Survived), ]

In [ ]:
# Columns removed from the training table.
drops <- c("PassengerId", "Name", "Cabin", "Ticket")
train_nona <- train_nona[, setdiff(names(train_nona), drops)]
# Randomly pick 70% of the row positions (rounded down); the remaining rows
# are left out of this index vector.
train_row <- sample.int(nrow(train_nona), floor(0.7 * nrow(train_nona)))

In [ ]:
# Sweep mtry over 1..nf. For each value, fit a 500-tree random forest on the
# rows in train_row and record the misclassification rate on those rows
# (train.err) and on the remaining rows (valid.err). Element i of each error
# vector corresponds to mtry = i.
nf <- ncol(train_nona) - 1  # all columns except one (the response, Survived)

# The two row subsets do not change between iterations, so build them once.
train_part <- train_nona[train_row, ]
valid_part <- train_nona[-train_row, ]
n_valid <- nrow(valid_part)
n_train <- nrow(train_part)

train.err <- double(nf)
valid.err <- double(nf)
for (mtry in seq_len(nf)) {
    fit <- randomForest(Survived ~ ., data = train_nona, subset = train_row,
                        mtry = mtry, ntree = 500)
    # Share of held-out rows whose predicted class differs from the label.
    pred <- predict(fit, valid_part)
    valid.err[mtry] <- sum(pred != valid_part$Survived) / n_valid
    # Same measure on the rows the forest was fitted on.
    pred <- predict(fit, train_part)
    train.err[mtry] <- sum(pred != train_part$Survived) / n_train
    # Progress indicator: print each finished mtry value on one line.
    cat(mtry, "")
}

In [ ]:
# Display the misclassification rates per mtry value (element i belongs to
# mtry = i): held-out rows first, then the rows used for fitting.
valid.err
train.err

In [ ]:
# Plot both error curves against mtry: the held-out error in the default
# colour and the training error in red, each as points joined by a line.
err <- data.frame(nf = seq_len(nf), valid = valid.err, train = train.err)
ggplot(err, aes(x = nf)) +
    geom_point(aes(y = valid)) +
    geom_line(aes(y = valid)) +
    geom_point(aes(y = train), color = "red") +
    geom_line(aes(y = train), color = "red") +
    labs(x = "# of features used in each split", y = "error")

In [ ]:
# Fit the final forest on every row of train_nona (no rows held out),
# with mtry fixed at 3 and 500 trees.
rf.titanic <- randomForest(
    Survived ~ .,
    data = train_nona,
    ntree = 500,
    mtry = 3
)

In [ ]:
# Remove the columns named in drops from the rows to predict on, then get
# the fitted forest's predictions for them.
test_nona <- test_nona[, setdiff(names(test_nona), drops)]
pred <- predict(rf.titanic, newdata = test_nona)

In [ ]:
# Build the submission table: one row per passenger without a Survived label,
# holding the passenger id and the predicted class as 0/1.
result.titanic <- data.frame(
    # Take the ids from the same rows the predictions were made for, instead
    # of assuming a fixed id range.
    PassengerId = full$PassengerId[is.na(full$Survived)],
    # Convert via the level labels ("0"/"1") rather than the factor's
    # internal integer codes, so the result does not depend on level order.
    Survived = as.integer(as.character(pred))
)
# Number of missing predictions in the table.
sum(is.na(result.titanic$Survived))

In [ ]:
# Save the submission table as a CSV file without a row-name column.
write.csv(
    result.titanic,
    file = "my_random_forest_3_nona_validation.csv",
    row.names = FALSE
)