In [1]:
# Load the h2o package without its startup banner, then start a local
# H2O cluster (or connect to one that is already running).
# nthreads = -1 asks H2O to use all available CPU cores (see h2o.init docs).
suppressPackageStartupMessages({
  library(h2o)
})
h2o.init(nthreads = -1)
In [2]:
# Read the Titanic CSV from the local working directory into an H2O frame
titanic <- h2o.importFile(path = "kaggle_titanic.csv")
In [3]:
# Take a first look at the data: show the first 10 rows
head(titanic, n = 10)
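Why we need to transform: 'Survived' is stored as the numbers 0 and 1, but those values are class labels rather than quantities, so we will convert the column to a categorical variable.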
In [4]:
# Describe the 'Survived' column before any transformation
h2o.describe(titanic[['Survived']])
In [5]:
# Plot a histogram of 'Survived' with h2o.hist()
h2o.hist(titanic[['Survived']])
In [6]:
# Count the 0s and 1s in 'Survived' with h2o.table()
h2o.table(titanic[['Survived']])
In [7]:
# Convert 'Survived' to categorical variable
titanic[['Survived']] <- as.factor(titanic[['Survived']])
In [8]:
# Describe 'Survived' again after the conversion.
# The column is now an 'enum' -- the Java name for a categorical variable.
h2o.describe(titanic[['Survived']])
Now do the same for 'Pclass': explore the column, then convert it to a categorical variable.
In [9]:
# Describe the 'Pclass' column before any transformation
h2o.describe(titanic[['Pclass']])
In [10]:
# Plot a histogram of 'Pclass' with h2o.hist()
h2o.hist(titanic[['Pclass']])
In [11]:
# Count the 1s, 2s and 3s in 'Pclass' with h2o.table()
h2o.table(titanic[['Pclass']])
In [12]:
# Convert 'Pclass' to categorical variable
titanic[['Pclass']] <- as.factor(titanic[['Pclass']])
In [13]:
# Describe 'Pclass' again after the conversion.
# The column is now an 'enum' -- the Java name for a categorical variable.
h2o.describe(titanic[['Pclass']])