In [ ]:
library(tidyverse)
library(mice)
In [ ]:
# Load the training and test CSV files, keeping text columns as character
# vectors rather than factors.
train <- read.csv(file = "train.csv", stringsAsFactors = FALSE)
test <- read.csv(file = "test.csv", stringsAsFactors = FALSE)

# Stack both sets into one data frame so cleaning is applied once to all
# rows; bind_rows() fills columns missing from either input with NA.
full <- dplyr::bind_rows(train, test)
In [ ]:
# Inspect the combined data: str() shows each column's type and first
# values, summary() shows per-column summary statistics.
str(full)
summary(full)
In [ ]:
# Show the rows whose Embarked value is an empty string. which() drops NA
# comparisons, and the original row names are kept in the output.
full[which(full$Embarked == ""), ]
In [ ]:
# Display rows 50 through 100 of the data when sorted by Ticket.
full[order(full$Ticket)[50:100], ]
In [ ]:
# Following Megan's analysis, assign "C" to every passenger whose Embarked
# value is blank. Rows are selected by condition instead of by hard-coded
# row positions, so the fix still targets the right passengers if the row
# order or the input files change. which() ignores NA comparisons.
full$Embarked[which(full$Embarked == "")] <- "C"
In [ ]:
# Show the rows that have no Fare recorded.
full[is.na(full$Fare), ]
In [ ]:
# Replace every missing Fare with 8.05. Rows are selected with is.na()
# instead of a hard-coded row position, so the fix does not depend on row
# order.
# NOTE(review): 8.05 is a fixed constant here; presumably it is a typical
# fare for comparable passengers taken from the referenced analysis --
# confirm.
full$Fare[is.na(full$Fare)] <- 8.05
In [ ]:
# Columns to be stored as factors.
factor_vars <- c("Pclass", "Sex", "Embarked")

# Convert each listed column to a factor in place, then re-check the
# structure to confirm the new column types.
full[factor_vars] <- lapply(full[factor_vars], as.factor)
str(full)
In [ ]:
# Fix the RNG seed so the imputation is reproducible.
set.seed(129)

# Run mice with the "rf" method on every column except PassengerId, Name,
# Ticket, Cabin and Survived.
mice_mod <- mice(
  full[, setdiff(
    names(full),
    c("PassengerId", "Name", "Ticket", "Cabin", "Survived")
  )],
  method = "rf"
)
In [ ]:
# Extract the completed (imputed) data set from the mice model.
mice_output <- complete(mice_mod)

# Draw the Age density histograms side by side, original vs. imputed, on the
# same y-axis range so the two distributions can be compared visually.
# freq uses FALSE rather than the shorthand F, which is an ordinary variable
# that can be reassigned.
par(mfrow = c(1, 2))
hist(full$Age, freq = FALSE, main = 'Age: Original Data',
     col = 'darkgreen', ylim = c(0, 0.04))
hist(mice_output$Age, freq = FALSE, main = 'Age: MICE Output',
     col = 'lightgreen', ylim = c(0, 0.04))
In [ ]:
# Overwrite Age with the imputed values from the completed mice output.
full[["Age"]] <- mice_output[["Age"]]

# Count the Age values that are still missing.
sum(is.na(full[["Age"]]))
In [ ]:
# Family size per passenger: SibSp + Parch, plus 1 for the passenger.
# NOTE(review): presumably SibSp counts siblings/spouses and Parch counts
# parents/children aboard -- confirm against the data dictionary.
train$Fsize <- with(train, SibSp + Parch + 1)
In [ ]:
# Check the structure of train, which now includes the Fsize column.
str(train)
In [ ]:
# Bar chart of passenger counts by family size, with one bar per Survived
# value drawn side by side. geom_bar() counts rows by default.
ggplot(train, mapping = aes(x = Fsize, fill = factor(Survived))) +
  geom_bar(position = "dodge") +
  scale_x_continuous(breaks = 1:11) +
  labs(x = "Family Size")