Logistic Regression with Grid Search (scikit-learn)


In [1]:
# Install the verta client only if it cannot already be imported.
# On Colab, restart the notebook if prompted after the install finishes.
try:
    import verta
except ImportError:
    !pip install verta

This example features:

  • scikit-learn's LogisticRegression model
  • scikit-learn's GridSearchCV utility for performing grid search and cross-validation
  • verta's Python client logging the grid search results
  • verta's Python client retrieving the best run from the grid search to calculate and log full training accuracy

In [2]:
# Verta host; passed to Client(...) and DeployedModel(...) further down.
HOST = "app.verta.ai"

# Names passed to client.set_project(...) and client.set_experiment(...).
PROJECT_NAME = "Iris Multiclassification"
EXPERIMENT_NAME = "Logistic Regression"

In [3]:
# Optional: uncomment the lines below and fill in the two values to set the
# VERTA_EMAIL / VERTA_DEV_KEY environment variables from inside the notebook.
# NOTE(review): presumably the verta Client reads these to authenticate -- confirm.
# Never commit real credentials with the notebook.
# import os
# os.environ['VERTA_EMAIL'] = 
# os.environ['VERTA_DEV_KEY'] =

Imports


In [4]:
from __future__ import print_function

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import time

import six

import numpy as np
import pandas as pd

import sklearn
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

Log Workflow

Prepare Data


In [5]:
# Load the iris dataset and pull out the feature matrix and the class labels.
data = datasets.load_iris()
X, y = data['data'], data['target']

In [6]:
# Assemble features and labels into one table: the label vector is reshaped
# into a single column and appended to the right of the feature matrix.
column_names = data['feature_names'] + ['species']
label_column = y.reshape(-1, 1)
df = pd.DataFrame(np.hstack((X, label_column)), columns=column_names)

# Preview the first rows.
df.head()

Prepare Hyperparameters


In [7]:
# Hyperparameter grid handed to GridSearchCV; every combination of the values
# below is cross-validated.
grid = {
    'C': [1e-4, 1e-3, 1e-2],
    'solver': ['lbfgs'],
    # max_iter is an iteration count and must be an integer: scikit-learn
    # documents it as an int, and recent releases reject floats such as 1e4.
    'max_iter': [10000, 100000],
}

Instantiate Client


In [8]:
from verta import Client
from verta.utils import ModelAPI

# Create a client for the Verta host configured above.
client = Client(HOST)
# Select the project and experiment by name. `client` is used below to create
# experiment runs, and `expt` is used later to look up the best run.
# NOTE(review): presumably set_project/set_experiment create the entity when it
# does not exist yet -- confirm against the verta client docs.
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

Run Validation


In [9]:
# Cross-validate every hyperparameter combination in `grid` using 5 folds.
# Training-fold scores are not recorded; only the validation scores are
# consumed by the logging cell that follows.
model = linear_model.LogisticRegression(multi_class='auto')
grid_search = model_selection.GridSearchCV(
    estimator=model,
    param_grid=grid,
    cv=5,
    return_train_score=False,
)
grid_search.fit(X, y)

In [10]:
# One row per hyperparameter combination evaluated by the grid search.
results = pd.DataFrame(grid_search.cv_results_)

# Build the per-fold score column names from the fitted search instead of
# hard-coding the fold count, so this cell stays correct if `cv` is changed.
# The list is the same for every row, so it is computed once up front.
fold_score_keys = ["split{}_test_score".format(i)
                   for i in range(grid_search.n_splits_)]

# Log each hyperparameter combination as its own experiment run.
for _, run_result in results.iterrows():
    run = client.set_experiment_run()

    # log training data
    run.log_dataset("train_data", df)

    # log hyperparameters
    run.log_hyperparameters(run_result['params'])

    # log accuracy for each validation fold
    for obs_key in fold_score_keys:
        run.log_observation("fold_acc", run_result[obs_key])

    # log summary stats of validation
    run.log_metric("val_acc_mean", run_result['mean_test_score'])
    run.log_metric("val_acc_std", run_result['std_test_score'])

Revisit Workflow

Retrieve Best Run


In [11]:
# Order this experiment's runs by mean validation accuracy, highest first,
# and keep the top one.
runs_by_val_acc = expt.expt_runs.sort("metrics.val_acc_mean", descending=True)
best_run = runs_by_val_acc[0]

best_val_acc = best_run.get_metric("val_acc_mean")
print("Validation Accuracy: {:.4f}".format(best_val_acc))

# These hyperparameters are reused to refit the model on the full dataset.
best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

Train on Full Dataset


In [12]:
# Refit on the entire dataset using the hyperparameters of the best run.
model = linear_model.LogisticRegression(
    multi_class='auto',
    **best_hyperparams
)
model.fit(X=X, y=y)

Calculate and Log Accuracy on Full Training Set


In [13]:
# Accuracy on the same data the model was fit on, attached to the best run.
train_predictions = model.predict(X)
train_acc = metrics.accuracy_score(y, train_predictions)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

Log Model for Deployment


In [14]:
# Deployment artifacts: ModelAPI is given the training inputs together with
# the model's predictions on them, plus the list of required packages.
example_outputs = model.predict(X)
model_api = ModelAPI(X, example_outputs)
requirements = ["scikit-learn"]

# Attach the fitted model and its requirements to the best run.
best_run.log_model(model, model_api=model_api)
best_run.log_requirements(requirements)

Make Live Predictions

Deploy Model Through Web App


In [15]:
# Display the best run; the deployment step itself is done in the web app,
# not from code in this notebook.
best_run

Load Deployed Model


In [16]:
# NOTE: DeployedModel is imported from an underscore-prefixed (private) verta module.
from verta._demo_utils import DeployedModel

# Object used below to send prediction requests, built from the host and the best run's id.
deployed_model = DeployedModel(HOST, best_run.id)

Query Deployed Model


In [17]:
# Send the dataset rows to the deployed model one at a time, cycling back to
# the first row after the last. itertools.cycle never ends, so this cell runs
# until the kernel is interrupted.
feature_rows = X.tolist()
for row in itertools.cycle(feature_rows):
    prediction = deployed_model.predict([row])
    print(prediction)
    time.sleep(.5)