Objective:
Wine Quality Dataset:
Algorithms:
Full Technical Reference:
In [1]:
# Launch a local H2O cluster (or attach to one that is already running).
# nthreads=-1 asks H2O to use all available CPU cores.
import h2o
h2o.init(nthreads=-1)
In [2]:
# Load the white-wine quality CSV from the working directory into an H2O frame
wine = h2o.import_file("winequality-white.csv")
# Preview the first five rows
wine.head(rows=5)
Out[2]:
In [3]:
# Define the features (predictors) used by every model below
# Start from a copy of all column names so that all available information is used
features = list(wine.columns)
# Drop the target 'quality' from the predictors (otherwise there is nothing to predict);
# list.remove raises ValueError if the column is missing, which surfaces a wrong file early
features.remove('quality')
# Display the final list of predictor names
features
Out[3]:
In [4]:
# Partition the H2O frame into a training set and a hold-out test set
# so that each model can be evaluated on rows it never saw during training
wine_split = wine.split_frame(ratios=[0.8], seed=1234)
# First piece (about 80% of the rows) is used for training;
# the remaining piece (about 20%) is held out for evaluation
wine_train, wine_test = wine_split
In [5]:
# Show the dimensions of the training split as a sanity check on the split
wine_train.shape
Out[5]:
In [6]:
# Show the dimensions of the test split as a sanity check on the split
wine_test.shape
Out[6]:
In [7]:
# Generalized Linear Model (GLM) with default settings
# Bring the GLM estimator class into scope
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Configure the GLM for regression (gaussian family)
glm_default = H2OGeneralizedLinearEstimator(model_id='glm_default', family='gaussian')
# Fit the model on the training split, with 'quality' as the response
glm_default.train(training_frame=wine_train, x=features, y='quality')
In [8]:
# Display the trained GLM to review its performance on the training dataset
glm_default
Out[8]:
In [9]:
# Score the GLM on the held-out test dataset and show its performance metrics
glm_default.model_performance(test_data=wine_test)
Out[9]:
In [10]:
# Distributed Random Forest (DRF) with default settings
# Bring the DRF estimator class into scope
from h2o.estimators.random_forest import H2ORandomForestEstimator
# Configure the DRF for regression; the seed is fixed for reproducibility
drf_default = H2ORandomForestEstimator(seed=1234, model_id='drf_default')
# Fit the model on the training split, with 'quality' as the response
drf_default.train(training_frame=wine_train, x=features, y='quality')
In [11]:
# Display the trained DRF model to review its summary
drf_default
Out[11]:
In [12]:
# Score the DRF on the held-out test dataset and show its performance metrics
drf_default.model_performance(test_data=wine_test)
Out[12]:
In [13]:
# Gradient Boosting Machines (GBM) with default settings
# Bring the GBM estimator class into scope
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Configure the GBM for regression; the seed is fixed for reproducibility
gbm_default = H2OGradientBoostingEstimator(seed=1234, model_id='gbm_default')
# Fit the model on the training split, with 'quality' as the response
gbm_default.train(training_frame=wine_train, x=features, y='quality')
In [14]:
# Display the trained GBM model to review its summary
gbm_default
Out[14]:
In [15]:
# Score the GBM on the held-out test dataset and show its performance metrics
gbm_default.model_performance(test_data=wine_test)
Out[15]:
In [16]:
# Deep Learning (Deep Neural Networks, DNN) with default settings
# Bring the DNN estimator class into scope
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Configure the DNN for regression
dnn_default = H2ODeepLearningEstimator(model_id='dnn_default')
# Not run here: to reproduce results exactly, also pass reproducible = True
# when constructing the estimator. The model is then built using a single
# thread, which could be very slow.
# Fit the model on the training split, with 'quality' as the response
dnn_default.train(training_frame=wine_train, x=features, y='quality')
In [17]:
# Display the trained DNN model to review its summary
dnn_default
Out[17]:
In [18]:
# Score the DNN on the held-out test dataset and show its performance metrics
dnn_default.model_performance(test_data=wine_test)
Out[18]:
In [19]:
# Generate GLM predictions for the test set
yhat_test_glm = glm_default.predict(test_data=wine_test)
# Preview the first five predictions
yhat_test_glm.head(rows=5)
Out[19]:
In [20]:
# Generate DRF predictions for the test set
yhat_test_drf = drf_default.predict(test_data=wine_test)
# Preview the first five predictions
yhat_test_drf.head(rows=5)
Out[20]:
In [21]:
# Generate GBM predictions for the test set
yhat_test_gbm = gbm_default.predict(test_data=wine_test)
# Preview the first five predictions
yhat_test_gbm.head(rows=5)
Out[21]:
In [22]:
# Generate DNN predictions for the test set
yhat_test_dnn = dnn_default.predict(test_data=wine_test)
# Preview the first five predictions
yhat_test_dnn.head(rows=5)
Out[22]: