In [1]:
source("C:\\Work\\myRfunctions.R")
fnRunDate()
fnInstallPackages()
In [2]:
# titanic_train and titanic_test come from the titanic package
train <- as_tibble(titanic_train)
test <- as_tibble(titanic_test)
dim(train)
dim(test)
In [3]:
glimpse(train)
In [4]:
fnClassDistribution(Class = train$Survived)
In [5]:
# Combine test and train, tagging each row with its source so we can split them back out later
dataset <- dplyr::full_join(
  dplyr::mutate(train, d_set = "train"),
  dplyr::mutate(test, d_set = "test")
)
In [6]:
glimpse(dataset)
In [7]:
# change blanks to NAs
dataset[dataset == ""] <- NA
In [8]:
Hmisc::describe(dataset, listunique=1)
In [9]:
psych::describe(dataset, check = T)
In [10]:
VIM::aggr(dataset, prop = FALSE, combined = TRUE, numbers = TRUE, sortVars = TRUE, sortCombs = TRUE)
In [11]:
# summarize correlations between input variables
options(warn=-1)
PerformanceAnalytics::chart.Correlation(dplyr::select_if(dataset, is.numeric), histogram=TRUE, pch=".")
In [12]:
tabplot::tableplot(dataset, sortCol = Survived)
In [11]:
# Cross-tabulate Sex against Survived
gmodels::CrossTable(
  x = dataset$Sex,      # categorical or continuous
  y = dataset$Survived  # needs to be categorical
)
In [16]:
# Feature Engineering
# Extract the title from Name (the text between ", " and the first ".") into a new variable
dataset <- dplyr::mutate(
  dataset,
  Title = stringr::str_sub(
    Name,
    stringr::str_locate(Name, ",")[, 1] + 2,
    stringr::str_locate(Name, "\\.")[, 1] - 1
  )
)
fnClassDistribution(Class = dataset$Title)
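The same extraction can be written as a single regex instead of two str_locate calls; a minimal sketch, assuming stringr's default ICU engine (which supports lookbehind):
In [ ]:
# Hypothetical alternative: capture everything between ", " and the first "."
dataset <- dplyr::mutate(dataset, Title = stringr::str_extract(Name, "(?<=, )[^.]+"))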
In [ ]:
# Flag likely mothers: a female honorific plus at least one parent/child aboard
dataset <- dataset %>%
  mutate(Mother = ifelse(Title %in% c("Mrs", "Mme", "the Countess", "Dona", "Lady") &
                           Parch > 0, "Yes", "No"))
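A quick tabulation to confirm the new flag (a sketch):
In [ ]:
table(dataset$Mother, useNA = "ifany")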
In [14]:
# Count the number of characters in Cabin and Name
dataset$Cabin_nchar <- as.numeric(nchar(dataset$Cabin))
dataset$Name_nchar <- as.numeric(nchar(dataset$Name))
In [15]:
# Is the missingness in Cabin itself informative? Flag it as a feature.
dataset$Cabin_isna <- as.numeric(is.na(dataset$Cabin))
head(dataset$Cabin_isna)
In [16]:
# Create character versions of a few variables that could be used as character or integer
dataset$Pclass_c <- as.character(dataset$Pclass)
dataset$SibSp_c <- as.character(dataset$SibSp)
dataset$Parch_c <- as.character(dataset$Parch)
ggplot(data = dataset,
       aes(x = Age,                 # can be continuous or categorical; is supposed to be continuous
           y = as.factor(Title))) + # needs to be discrete
  geom_joy(scale = .8, na.rm = TRUE) +
  theme_joy(center_axis_labels = TRUE)
In [17]:
# Create dummy variables (fullRank = TRUE drops one reference level per factor)
dmy <- caret::dummyVars(
  " ~ Title + Sex + Embarked + Pclass_c + SibSp_c + Parch_c + Pclass:Sex + Embarked:Sex + Embarked:Pclass_c + Pclass:Cabin_isna",
  data = dataset, fullRank = TRUE
)
dataset_d <- data.frame(predict(dmy, newdata = dataset))
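A quick look at what dummyVars produced (a sketch):
In [ ]:
dim(dataset_d)         # rows should equal nrow(dataset)
head(names(dataset_d)) # a few of the generated dummy/interaction column names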
In [18]:
# Recombine the dummy columns with the main data
dataset <- cbind(dataset, dataset_d)
In [19]:
# Split the train and test sets back out
sktrain <- dplyr::filter(dataset, d_set == "train")
sktest <- dplyr::filter(dataset, d_set == "test")
# Drop the sktest target column, which is full of NAs
sktest$Survived <- NULL
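A quick check that the split recovered the original shapes (a sketch):
In [ ]:
nrow(sktrain) # should equal nrow(train)
nrow(sktest)  # should equal nrow(test)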
In [20]:
# Keep only the numeric columns for modelling
sktrain <- dplyr::select_if(sktrain, is.numeric)
sktest <- dplyr::select_if(sktest, is.numeric)
In [21]:
# Remove the id field
sktrain <- dplyr::select(sktrain, -dplyr::one_of("PassengerId"))
# Prefix the target levels so they are syntactically valid names (caret requires this when classProbs = TRUE)
sktrain$Survived <- factor(paste0("class", sktrain$Survived))
dplyr::glimpse(sktrain)
In [22]:
# Split out a validation dataset:
# createDataPartition picks 67% of the rows, stratified by class
validation_index <- createDataPartition(sktrain$Survived, p = 0.67, list = FALSE)
# hold out the remaining 33% for validation
validation <- sktrain[-validation_index, ]
# use the selected 67% for training and testing the models
sktrainval <- sktrain[validation_index, ]
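createDataPartition samples within each class, so the class balance should be roughly the same in both partitions; a quick check (a sketch):
In [ ]:
prop.table(table(sktrainval$Survived)) # class proportions in the modelling partition
prop.table(table(validation$Survived)) # should be close to the proportions above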
In [23]:
# 10-fold cross-validation with 3 repeats.
# Up-sampling is applied within each resample (alternatives: "down", "rose", "smote").
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        sampling = "up",
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE)
metric <- "ROC"
In [24]:
# RF
set.seed(7)
#fit.cforest <- train(Survived~., data = sktrain, method = "rf", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# C5.0
set.seed(7)
fit.c5 <- train(Survived~., data = sktrainval, method = "C5.0", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# GLMNET
set.seed(7)
fit.glmnet <- train(Survived~., data = sktrainval, method = "glmnet", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# KNN
set.seed(7)
#fit.knn <- train(Survived~., data = sktrain, method = "knn", preProcess = c('zv', "center", "scale",'medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# SVM
set.seed(7)
#fit.svm <- train(Survived~., data = sktrain, method = "svmRadial", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
#xgb
set.seed(9)
fit.xgb <- train(Survived~., data = sktrainval, method = "xgbTree", preProcess =c('medianImpute','BoxCox','zv'), metric = metric, trControl = control, na.action = na.pass)
In [25]:
# Compare results
results <- resamples(list(
#RF = fit.cforest,
C5 = fit.c5,
GLMNET = fit.glmnet,
#KNN = fit.knn,
xgb = fit.xgb
#SVM = fit.svm
))
summary(results)
bwplot(results)
dotplot(results)
varImp(fit.glmnet)
varImp(fit.c5)
varImp(fit.xgb)
#varImp(fit.cforest)
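The resampled metrics can also be compared formally: caret's diff() runs paired tests across the resamples, which helps judge whether the gaps above are real or noise. A minimal sketch:
In [ ]:
summary(diff(results)) # paired comparisons of resampled ROC/Sens/Spec between models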
In [26]:
# Test predictive power on the unseen validation set
validation$prediction <- predict(fit.c5, newdata = validation, na.action = na.pass)
# Check the accuracy of the C5.0 model
confusionMatrix(validation$prediction, validation$Survived, positive = "class1") # convention: the positive class is the rarer one
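Since the models were tuned on ROC, it is worth checking the validation AUC directly as well. A minimal sketch, assuming the pROC package is installed:
In [ ]:
probs <- predict(fit.c5, newdata = validation, type = "prob", na.action = na.pass)
roc_obj <- pROC::roc(response = validation$Survived, predictor = probs$class1,
                     levels = c("class0", "class1"))
pROC::auc(roc_obj)
plot(roc_obj)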
In [27]:
# Set up k-fold cross-validation and the metric, this time with a random hyperparameter search
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        sampling = "up",
                        search = "random",
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE)
metric <- "ROC"
set.seed(13)
fit.c5s <- train(Survived ~ ., data = sktrainval, method = "C5.0",
                 preProcess = c('zv', 'medianImpute', 'BoxCox'),
                 metric = metric,
                 trControl = control,
                 na.action = na.pass,
                 tuneLength = 20) # with search = "random", evaluate 20 random parameter combinations
In [28]:
fit.c5s
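caret can also plot the random-search results to show which C5.0 parameter combinations scored best; a quick sketch:
In [ ]:
ggplot(fit.c5s) # ROC for each sampled combination of tuning parameters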
In [34]:
# Test predictive power on the unseen validation set
validation$prediction <- predict(fit.c5s, newdata = validation, na.action = na.pass)
# Check the accuracy of the tuned C5.0 model
confusionMatrix(validation$prediction, validation$Survived, positive = "class1") # convention: the positive class is the rarer one
In [30]:
# Build the final model on the full training data
# 10-fold cross-validation with 3 repeats
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        sampling = "up", # up down rose smote
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE)
metric <- "ROC"
set.seed(7)
fit.c5fin <- train(Survived ~ ., data = sktrain, method = "C5.0",
                   preProcess = c('zv', 'medianImpute', 'BoxCox'),
                   metric = metric, trControl = control, na.action = na.pass)
In [31]:
sktest$prediction <- predict(fit.c5fin, newdata = sktest, na.action = na.pass)
glimpse(sktest)
nrow(data.frame(sktest))
In [32]:
# Convert the predicted class labels back to 0/1 for the submission file
sktest$prediction <- ifelse(sktest$prediction == 'class0', 0, 1)
In [33]:
# Build the submission file: PassengerId plus the 0/1 prediction
my_solution <- dplyr::select(sktest,
                             PassengerId,
                             Survived = prediction)
readr::write_csv(x = data.frame(my_solution), path = "C:\\Work\\my_solution.csv")
head(my_solution, n = 20)
tail(my_solution, n = 20)
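A quick sanity check before uploading (a sketch; Kaggle expects one prediction per test passenger):
In [ ]:
stopifnot(nrow(my_solution) == nrow(test)) # one row per test passenger
stopifnot(!anyNA(my_solution$Survived))    # no missing predictions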
Right now this works better when trained on less data: when I split out a validation set and train on the remainder, the final submission scores better than a model trained on the full training data. That may mean the full-data model overfits, or it may just be noise in the leaderboard split.