Gradient-Boosted Trees with Grid Search (XGBoost)


In [1]:
# restart your notebook if prompted on Colab
try:
    import verta
except ImportError:
    !pip install verta

This example features:

  • XGBoost's native API for cross-validating and training gradient-boosted trees
  • scikit-learn's ParameterGrid utility for iterating over a hyperparameter grid
  • verta's Python client integrated into the grid search loop
  • verta's Python client retrieving the best run from the grid search to calculate and log full training accuracy

In [2]:
HOST = "app.verta.ai"

PROJECT_NAME = "Wine Multiclassification"
EXPERIMENT_NAME = "Boosted Trees"

In [3]:
# uncomment and fill in to supply your Verta credentials
# import os
# os.environ['VERTA_EMAIL'] = 
# os.environ['VERTA_DEV_KEY'] = 

Imports


In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import multiprocessing
import time

import numpy as np
import pandas as pd

import sklearn
from sklearn import datasets
from sklearn import model_selection

import xgboost as xgb

Log Workflow

Prepare Data


In [5]:
data = datasets.load_wine()

X = data['data']
y = data['target']

dtrain = xgb.DMatrix(X, label=y)

In [6]:
df = pd.DataFrame(np.hstack((X, y.reshape(-1, 1))),
                  columns=data['feature_names'] + ['class'])

df.head()
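
As a quick sanity check: the wine dataset has 178 samples, 13 features, and 3 classes, which is what num_class in the grid below must match.

In [ ]:
# not part of the original workflow: confirm dimensions and class balance
print("samples x features:", X.shape)   # (178, 13)
print("class counts:", np.bincount(y))  # three cultivar classes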

Prepare Hyperparameters


In [7]:
grid = model_selection.ParameterGrid({
    'eta': [0.5, 0.7],
    'max_depth': [1, 2, 3],
    'objective': ['multi:softmax'],  # multiclass objective for merror/mlogloss
    'num_class': [3],                # the wine dataset has 3 classes
})
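
ParameterGrid expands the dictionary into its full cross-product, yielding each combination as a plain dict; a quick illustration:

In [ ]:
# 2 eta values x 3 depths x 1 objective x 1 num_class = 6 combinations
print(len(grid))
print(grid[0])  # e.g. {'eta': 0.5, 'max_depth': 1, ...}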

Instantiate Client


In [8]:
from verta import Client
from verta.utils import ModelAPI

client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

Run Validation


In [9]:
def run_experiment(hyperparams):
    run = client.set_experiment_run()
    
    # log training data
    run.log_dataset("train_data", df)
    
    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    
    # run cross validation on hyperparameters
    cv_history = xgb.cv(hyperparams, dtrain,
                        nfold=5,
                        metrics=("merror", "mlogloss"))

    # log observations from each iteration
    for _, iteration in cv_history.iterrows():
        for obs, val in iteration.items():
            run.log_observation(obs, val)
            
    # log error from the final cross-validation iteration
    final_val_error = cv_history['test-merror-mean'].iloc[-1]
    run.log_metric("val_error", final_val_error)
    print("{} Mean error: {:.4f}".format(hyperparams, final_val_error))
    
# NOTE: relies on fork-based process start (the default on Linux/Colab)
with multiprocessing.Pool() as pool:
    pool.map(run_experiment, grid)
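
Each run now holds its cross-validation curves as observations; a minimal sketch of reading them back, assuming the client's get_observation() accessor:

In [ ]:
# a sketch: pull one run's per-iteration series back out
some_run = expt.expt_runs[0]
print(some_run.get_hyperparameters())
print(some_run.get_observation("test-merror-mean"))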

Revisit Workflow

Retrieve Best Run


In [10]:
best_run = expt.expt_runs.sort("metrics.val_error", descending=False)[0]
print("Validation Error: {:.4f}".format(best_run.get_metric("val_error")))

best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

Train on Full Dataset


In [11]:
best_hyperparams.pop('num_class', None)  # XGBClassifier infers this from y

model = xgb.XGBClassifier(**best_hyperparams)
model.fit(X, y)
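
An illustrative spot check (not in the original workflow) that the retrained model predicts sensibly before its accuracy is logged:

In [ ]:
# compare a few predictions against the ground-truth labels
print(model.predict(X[:5]))
print(y[:5])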

Calculate and Log Accuracy on Full Training Set


In [12]:
train_acc = model.score(X, y)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

Log Model for Deployment


In [13]:
# create deployment artifacts
model_api = ModelAPI(X, model.predict(X))
requirements = ["scikit-learn", "xgboost"]

best_run.log_model(model, model_api=model_api)
best_run.log_requirements(requirements)

Make Live Predictions

Deploy Model Through Web App

Displaying the run below prints its link into the Verta web app, where the model can be deployed.

In [14]:
best_run

Load Deployed Model


In [15]:
from verta._demo_utils import DeployedModel

deployed_model = DeployedModel(HOST, best_run.id)

Query Deployed Model


In [16]:
# loops indefinitely to simulate live traffic; interrupt the kernel to stop
for x in itertools.cycle(np.random.permutation(X).tolist()):
    print(deployed_model.predict([x]))
    time.sleep(.5)