In [ ]:
library(tidyverse)
library(mice)

In [ ]:
# Read both CSV files, keeping text columns as character vectors rather than
# factors, then stack them row-wise into `full` so that the cleaning steps
# below operate on a single data frame.
train <- read.csv(file = "train.csv", stringsAsFactors = FALSE)
test <- read.csv(file = "test.csv", stringsAsFactors = FALSE)
full <- bind_rows(train, test)

In [ ]:
# Quick look at the combined data: column types first, then per-column
# summary statistics (which also report NA counts for numeric columns).
str(object = full)
summary(object = full)

In [ ]:
# Show the rows whose Embarked value is the empty string. Row names are kept,
# so the printed labels are the positions of those rows in `full`.
full[which(full$Embarked == ""), , drop = FALSE]

In [ ]:
# Browse rows 50 through 100 of the data when sorted by Ticket.
full[order(full$Ticket)[50:100], ]

In [ ]:
# Fill the missing Embarked values with "C", following Megan's analysis.
# Rows are selected by condition (empty string or NA) rather than by
# hard-coded row positions, so the fix still hits the right rows if the input
# files or their row order change.
full$Embarked[is.na(full$Embarked) | full$Embarked == ""] <- "C"

In [ ]:
# Show the rows where Fare is missing.
full[is.na(full$Fare), , drop = FALSE]

In [ ]:
# Replace every missing Fare with 8.05. Rows are selected with is.na() rather
# than a hard-coded row position, so the fix does not depend on row order.
# NOTE(review): 8.05 is presumably a typical fare for comparable passengers;
# confirm where the value comes from.
full$Fare[is.na(full$Fare)] <- 8.05

In [ ]:
# Columns that hold categories rather than quantities; convert each one to a
# factor in place, then re-check the column types.
factor_vars <- c("Pclass", "Sex", "Embarked")
full[factor_vars] <- lapply(full[factor_vars], as.factor)
str(full)

In [ ]:
# Fix the RNG seed so the imputation is reproducible.
set.seed(129)

# Run mice with the random-forest method ("rf") on every column of `full`
# except the ones listed here, which are left out of the imputation model.
mice_mod <- mice(
  full[, setdiff(
    names(full),
    c("PassengerId", "Name", "Ticket", "Cabin", "Survived")
  )],
  method = "rf"
)

In [ ]:
# Extract a completed (imputed) data set from the mice fit. The call is
# namespace-qualified because tidyr (attached via tidyverse) also exports a
# function named complete(); which one wins would otherwise depend on the
# order of the library() calls.
mice_output <- mice::complete(mice_mod)

# Draw the Age densities before and after imputation side by side. The
# previous graphics settings are saved and restored so the two-panel layout
# does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
hist(full$Age, freq = FALSE, main = "Age: Original Data",
     col = "darkgreen", ylim = c(0, 0.04))
hist(mice_output$Age, freq = FALSE, main = "Age: MICE Output",
     col = "lightgreen", ylim = c(0, 0.04))
par(old_par)

In [ ]:
# Overwrite Age in `full` with the imputed column, then count how many
# missing ages remain.
full[["Age"]] <- mice_output[["Age"]]
sum(is.na(full[["Age"]]))

In [ ]:
# Family size = siblings/spouses + parents/children + the passenger.
# NOTE(review): the column is added to `train` only, not to `full` - confirm
# that this is intended.
train$Fsize <- with(train, SibSp + Parch + 1)

In [ ]:
# Check that the new Fsize column is present in `train`.
str(object = train)

In [ ]:
# Bar chart of passenger counts per family size, with one bar per Survived
# value placed side by side.
ggplot(train, aes(x = Fsize, fill = factor(Survived))) +
  geom_bar(stat = "count", position = "dodge") +
  scale_x_continuous(breaks = 1:11) +
  labs(x = "Family Size")