In [1]:
# Bring up a local H2O cluster and connect this session to it.
# nthreads=-1 requests all available CPU cores.
import h2o

h2o.init(nthreads=-1)
In [2]:
# Load the Titanic data from a local CSV file into an H2O frame
titanic_csv = "kaggle_titanic.csv"
titanic = h2o.import_file(titanic_csv)
In [3]:
# Preview the first 10 rows of the dataset
titanic.head(rows=10)
Out[3]:
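Why we need to transform: 'Survived' only takes the values 0 and 1, so it is a class label rather than a quantity. Below we inspect the column and then convert it to a categorical variable.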
In [4]:
# Summarize the 'Survived' column as it was imported
survived_col = titanic['Survived']
survived_col.summary()
In [5]:
# Plot a histogram of 'Survived' with hist()
survived_col = titanic['Survived']
survived_col.hist()
In [6]:
# Count the 0s and 1s in 'Survived' with table()
survived_counts = titanic['Survived'].table()
survived_counts
Out[6]:
In [7]:
# Replace 'Survived' with its categorical (factor) version
survived_factor = titanic['Survived'].asfactor()
titanic['Survived'] = survived_factor
In [8]:
# Summarize 'Survived' again after the conversion.
# The column is now reported as 'enum' (the name for a categorical
# variable in Java).
survived_col = titanic['Survived']
survived_col.summary()
Now do the same for 'Pclass': explore the column, then convert it to a categorical variable.
In [9]:
# Summarize the 'Pclass' column as it was imported
pclass_col = titanic['Pclass']
pclass_col.summary()
In [10]:
# Plot a histogram of 'Pclass' with hist()
pclass_col = titanic['Pclass']
pclass_col.hist()
In [11]:
# Count the 1s, 2s and 3s in 'Pclass' with table()
pclass_counts = titanic['Pclass'].table()
pclass_counts
Out[11]:
In [12]:
# Replace 'Pclass' with its categorical (factor) version
pclass_factor = titanic['Pclass'].asfactor()
titanic['Pclass'] = pclass_factor
In [13]:
# Summarize 'Pclass' again after the conversion to categorical
pclass_col = titanic['Pclass']
pclass_col.summary()