Logistic Regression with Grid Search (scikit-learn)


In [1]:
# restart your notebook if prompted on Colab
try:
    import verta
except ImportError:
    !pip install verta
    import verta

In [2]:
verta.__version__

This example features:

  • scikit-learn's LogisticRegression model
  • verta's Python client logging grid search results
  • verta's Python client retrieving the best run from the grid search to calculate full training accuracy
  • predictions against a deployed model

In [3]:
HOST = "http://localhost:3000"

PROJECT_NAME = "Webinar 1 - Census Income Classification"
EXPERIMENT_NAME = "Logistic Regression"

Imports


In [4]:
from __future__ import print_function

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import os
import time

import six

import numpy as np
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

In [5]:
try:
    import wget
except ImportError:
    !pip install wget  # you may need pip3
    import wget

Log Workflow

Instantiate Client


In [6]:
from verta import Client
from verta.utils import ModelAPI

client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

In [7]:
repo = client.set_repository('Webinar 1 - Census Demo')
commit = repo.get_commit(branch='master').new_branch("log-reg")

Prepare Data


In [8]:
dataset = client.set_dataset(name="Census Income", type="s3", workspace="Demos")
version = dataset.create_version(bucket_name="verta-starter")

In [9]:
DATASET_PATH = "./"

train_data_filename = DATASET_PATH + "census-train.csv"
test_data_filename = DATASET_PATH + "census-test.csv"

def download_starter_dataset(bucket_name):
    # download each CSV only if it is not already present locally
    if not os.path.isfile(train_data_filename):
        train_data_url = "http://s3.amazonaws.com/" + bucket_name + "/census-train.csv"
        wget.download(train_data_url)

    if not os.path.isfile(test_data_filename):
        test_data_url = "http://s3.amazonaws.com/" + bucket_name + "/census-test.csv"
        wget.download(test_data_url)

download_starter_dataset("verta-starter")

In [10]:
df_train = pd.read_csv(train_data_filename)
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:, -1]

df_train.head()

Prepare Hyperparameters


In [11]:
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

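For reference, itertools.product takes the Cartesian product of the candidate values, so hyperparam_sets holds one dict per combination. A quick sketch of the expansion (ordering shown assumes Python 3.7+ insertion-ordered dicts):

# equivalent expansion of the grid above: 2 C values x 1 solver x 2 max_iter values = 4 runs
# [{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 15},
#  {'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 28},
#  {'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 15},
#  {'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 28}]
print(len(hyperparam_sets))  # 4
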
Train Models


In [12]:
def run_experiment(hyperparams, index):
    from verta.code import Notebook
    from verta.configuration import Hyperparameters
    from verta.dataset import S3
    from verta.environment import Python

    code_ver = Notebook()
    config_ver = Hyperparameters(hyperparams)
    dataset_ver_train = S3("s3://{}/{}".format("verta-starter", "census-train.csv"))
    dataset_ver_test = S3("s3://{}/{}".format("verta-starter", "census-test.csv"))
    env_ver = Python()

    commit.update("notebooks/census-s3-example", code_ver)
    commit.update("config/hyperparams", config_ver)
    commit.update("data/train-data", dataset_ver_train)
    commit.update("data/test-data", dataset_ver_test)
    commit.update("env/python", env_ver)
    commit.save("Hyperparameter tuning Run: " + str(index))

    # create object to track experiment run
    run = client.set_experiment_run()
    
    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')
    
    # create and train model on the training portion of the validation split
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)
    
    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))
    
    # save and log model
    run.log_model(model)
    
    # log dataset snapshot as version
    run.log_dataset_version("train", version)
    
    # log Git information as code version
    run.log_code()
    
    run.log_commit(
        commit,
        {
            'notebook': "notebooks/census-s3-example",
            'hyperparameters': "config/hyperparams",
            'training_data': "data/train-data",
            'test_data': "data/test-data",
            'python_env': "env/python",
        },
    )
    
for i, hyperparams in enumerate(hyperparam_sets):
    run_experiment(hyperparams, i)

In [13]:
for c in commit.log():
    print(c)

In [14]:
commit.revert()

In [15]:
master = repo.get_commit(branch='master')
master.merge(commit)

Revisit Workflow

Retrieve Best Run


In [16]:
best_run = expt.expt_runs.sort("metrics.val_acc", descending=True)[0]
print("Validation Accuracy: {:.4f}".format(best_run.get_metric("val_acc")))

best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

Train on Full Dataset


In [17]:
model = linear_model.LogisticRegression(multi_class='auto', **best_hyperparams)
model.fit(X_train, y_train)

Calculate Accuracy on Full Training Set


In [18]:
train_acc = model.score(X_train, y_train)
print("Training accuracy: {:.4f}".format(train_acc))