Gradient-Boosted Trees with Grid Search (XGBoost)


In [1]:
# restart your notebook if prompted on Colab
try:
    import verta
except ImportError:
    !pip install verta

This example features:

  • XGBoost's native API for cross-validating and training gradient-boosted trees
  • scikit-learn's ParameterGrid utility for iterating over a hyperparameter grid
  • verta's Python client integrated into the grid search loop
  • verta's Python client retrieving the best run from the grid search to calculate and log full training accuracy

In [2]:
HOST = "app.verta.ai"

PROJECT_NAME = "Wine Multiclassification"
EXPERIMENT_NAME = "Boosted Trees"

In [3]:
# uncomment and fill in to supply your Verta credentials
# import os
# os.environ['VERTA_EMAIL'] = 
# os.environ['VERTA_DEV_KEY'] = 

Imports


In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import multiprocessing
import time

import numpy as np
import pandas as pd

import sklearn
from sklearn import datasets
from sklearn import model_selection

import xgboost as xgb

Log Workflow

Prepare Data


In [5]:
data = datasets.load_wine()

X = data['data']
y = data['target']

dtrain = xgb.DMatrix(X, label=y)

In [6]:
df = pd.DataFrame(np.hstack((X, y.reshape(-1, 1))),
                  columns=data['feature_names'] + ['class'])

df.head()
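
As a quick sanity check: the wine dataset has 178 samples, 13 features, and 3 classes, which is what num_class in the grid below must match.

In [ ]:
# not part of the original workflow: confirm dimensions and class balance
print("samples x features:", X.shape)   # (178, 13)
print("class counts:", np.bincount(y))  # three cultivar classes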

Prepare Hyperparameters


In [7]:
grid = model_selection.ParameterGrid({
    'eta': [0.5, 0.7],
    'max_depth': [1, 2, 3],
    'objective': ['multi:softmax'],  # multiclass objective for merror/mlogloss
    'num_class': [3],                # the wine dataset has 3 classes
})
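
ParameterGrid expands the dictionary into its full cross-product, yielding each combination as a plain dict; a quick illustration:

In [ ]:
# 2 eta values x 3 depths x 1 objective x 1 num_class = 6 combinations
print(len(grid))
print(grid[0])  # e.g. {'eta': 0.5, 'max_depth': 1, ...}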

Instantiate Client


In [8]:
from verta import Client
from verta.utils import ModelAPI

client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

Run Validation


In [9]:
def run_experiment(hyperparams):
    run = client.set_experiment_run()
    
    # log training data
    run.log_dataset("train_data", df)
    
    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    
    # run cross validation on hyperparameters
    cv_history = xgb.cv(hyperparams, dtrain,
                        nfold=5,
                        metrics=("merror", "mlogloss"))

    # log observations from each iteration
    for _, iteration in cv_history.iterrows():
        for obs, val in iteration.items():
            run.log_observation(obs, val)
            
    # log error from the final cross-validation iteration
    final_val_error = cv_history['test-merror-mean'].iloc[-1]
    run.log_metric("val_error", final_val_error)
    print("{} Mean error: {:.4f}".format(hyperparams, final_val_error))
    
# NOTE: relies on fork-based process start (the default on Linux/Colab)
with multiprocessing.Pool() as pool:
    pool.map(run_experiment, grid)
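
Each run now holds its cross-validation curves as observations; a minimal sketch of reading them back, assuming the client's get_observation() accessor:

In [ ]:
# a sketch: pull one run's per-iteration series back out
some_run = expt.expt_runs[0]
print(some_run.get_hyperparameters())
print(some_run.get_observation("test-merror-mean"))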

Revisit Workflow

Retrieve Best Run


In [10]:
best_run = expt.expt_runs.sort("metrics.val_error", descending=False)[0]
print("Validation Error: {:.4f}".format(best_run.get_metric("val_error")))

best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

Train on Full Dataset


In [11]:
best_hyperparams.pop('num_class', None)  # XGBClassifier infers this from y

model = xgb.XGBClassifier(**best_hyperparams)
model.fit(X, y)
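
An illustrative spot check (not in the original workflow) that the retrained model predicts sensibly before its accuracy is logged:

In [ ]:
# compare a few predictions against the ground-truth labels
print(model.predict(X[:5]))
print(y[:5])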

Calculate and Log Accuracy on Full Training Set


In [12]:
train_acc = model.score(X, y)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

Log Model for Deployment


In [13]:
# create deployment artifacts
model_api = ModelAPI(X, model.predict(X))
requirements = ["scikit-learn", "xgboost"]

best_run.log_model(model, model_api=model_api)
best_run.log_requirements(requirements)

Make Live Predictions

Deploy Model Through Web App

Displaying the run below prints its link into the Verta web app, where the model can be deployed.

In [14]:
best_run

Load Deployed Model


In [15]:
from verta._demo_utils import DeployedModel

deployed_model = DeployedModel(HOST, best_run.id)

Query Deployed Model


In [16]:
# loops indefinitely to simulate live traffic; interrupt the kernel to stop
for x in itertools.cycle(np.random.permutation(X).tolist()):
    print(deployed_model.predict([x]))
    time.sleep(.5)