Objective:
Wine Quality Dataset:
Algorithms:
Full Technical Reference:
In [1]:
# Start and connect to a local H2O cluster.
# The package is attached inside suppressPackageStartupMessages() so its
# start-up banner does not clutter the notebook output.
suppressPackageStartupMessages(library(h2o))
# nthreads = -1: per the H2O documentation this asks for all available CPU cores.
h2o.init(nthreads = -1)
In [2]:
# Load the white-wine quality CSV (path relative to the working directory)
# into the H2O cluster, then preview the first five rows.
wine <- h2o.importFile(path = "winequality-white.csv")
head(wine, n = 5)
In [3]:
# Predictors: every column of the frame except the target column 'quality'.
features <- setdiff(colnames(wine), 'quality')
features
In [4]:
# Partition the H2O frame into a training set and a hold-out test set so the
# models can be evaluated on data they were not trained on (out-of-sample).
# The fixed seed keeps the partition the same on every run.
wine_split <- h2o.splitFrame(data = wine, ratios = 0.8, seed = 1234)
wine_train <- wine_split[[1]]  # first part: the 0.8 ratio requested above
wine_test  <- wine_split[[2]]  # second part: the remainder, held out for testing
In [5]:
# Show the dimensions of the training frame to sanity-check the split
dim(wine_train)
In [6]:
# Show the dimensions of the hold-out test frame to sanity-check the split
dim(wine_test)
In [7]:
# Fit a Generalized Linear Model (GLM) on the training frame.
# Only the family is specified ('gaussian', since 'quality' is modelled as a
# numeric response); every other setting is left at its default.
glm_default <- h2o.glm(
    x              = features,
    y              = 'quality',
    training_frame = wine_train,
    family         = 'gaussian',
    model_id       = 'glm_default'
)
In [8]:
# Print the fitted GLM (as the cell's last expression it is displayed);
# used here to check the model performance on the training dataset
glm_default
In [9]:
# Score the GLM against the hold-out test frame and display the metrics.
h2o.performance(model = glm_default, newdata = wine_test)
In [10]:
# Fit a Distributed Random Forest (DRF) on the training frame with default
# settings; the seed is fixed so the run can be repeated.
drf_default <- h2o.randomForest(
    x              = features,
    y              = 'quality',
    training_frame = wine_train,
    seed           = 1234,
    model_id       = 'drf_default'
)
In [11]:
# Print the fitted DRF model (as the cell's last expression it is displayed)
# to check its summary
drf_default
In [12]:
# Score the DRF model against the hold-out test frame and display the metrics.
h2o.performance(model = drf_default, newdata = wine_test)
In [13]:
# Fit a Gradient Boosting Machine (GBM) on the training frame with default
# settings; the seed is fixed so the run can be repeated.
gbm_default <- h2o.gbm(
    x              = features,
    y              = 'quality',
    training_frame = wine_train,
    seed           = 1234,
    model_id       = 'gbm_default'
)
In [14]:
# Print the fitted GBM model (as the cell's last expression it is displayed)
# to check its summary
gbm_default
In [15]:
# Score the GBM model against the hold-out test frame and display the metrics.
h2o.performance(model = gbm_default, newdata = wine_test)
In [16]:
# Build a Deep Learning (Deep Neural Networks, DNN) model with default settings.
# A fixed seed is supplied, matching the DRF and GBM cells, so the random
# number generator starts from the same state on every run of the notebook.
# NOTE: the H2O documentation states that multi-threaded deep learning is only
# fully deterministic when reproducible = TRUE is also set (single-threaded and
# slower); add it here if bit-for-bit repeatability is required.
dnn_default <- h2o.deeplearning(
    x              = features,
    y              = 'quality',
    training_frame = wine_train,
    seed           = 1234,
    model_id       = 'dnn_default'
)
In [17]:
# Print the fitted DNN model (as the cell's last expression it is displayed)
# to check its summary
dnn_default
In [18]:
# Score the DNN model against the hold-out test frame and display the metrics.
h2o.performance(model = dnn_default, newdata = wine_test)
In [19]:
# Generate GLM predictions for the hold-out test frame and preview them.
yhat_test_glm <- h2o.predict(object = glm_default, newdata = wine_test)
head(yhat_test_glm)
In [20]:
# Generate DRF predictions for the hold-out test frame and preview them.
yhat_test_drf <- h2o.predict(object = drf_default, newdata = wine_test)
head(yhat_test_drf)
In [21]:
# Generate GBM predictions for the hold-out test frame and preview them.
yhat_test_gbm <- h2o.predict(object = gbm_default, newdata = wine_test)
head(yhat_test_gbm)
In [22]:
# Generate DNN predictions for the hold-out test frame and preview them.
yhat_test_dnn <- h2o.predict(object = dnn_default, newdata = wine_test)
head(yhat_test_dnn)