In [1]:
# restart your notebook if prompted on Colab
try:
import verta
except ImportError:
!pip install verta
In [2]:
# Display the version of the verta client imported above.
verta.__version__
This example features:
- scikit-learn's `LogisticRegression` model
In [3]:
# Address passed to verta's Client; points at a locally running instance.
HOST = "http://localhost:3000"

# Names handed to client.set_project() / client.set_experiment() further down.
PROJECT_NAME = "Webinar 1 - Census Income Classification"
EXPERIMENT_NAME = "Logistic Regression"
In [4]:
from __future__ import print_function
import warnings
from sklearn.exceptions import ConvergenceWarning
# The hyperparameter sweep in this notebook uses small max_iter values (15 and
# 28), so presumably lbfgs stops before converging; silence those warnings, and
# FutureWarnings, to keep the cell output readable.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import itertools
import os
import time
import six
import numpy as np
import pandas as pd
import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
In [5]:
# wget is used below to fetch the census CSVs; install it on demand if the
# import fails, then import it so the name is bound in this kernel.
try:
    import wget
except ImportError:
    !pip install wget # you may need pip3
    import wget
In [6]:
from verta import Client
from verta.utils import ModelAPI
# Connect to the Verta backend at HOST, then set the active project and
# experiment by name (presumably created if they do not exist yet -- confirm
# against the verta client docs).
client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)
In [7]:
# Set the versioning repository, then start a "log-reg" branch from the commit
# at the head of master. `commit` is the working commit that run_experiment()
# updates and saves.
# NOTE(review): re-running this cell may fail if the "log-reg" branch already
# exists -- confirm new_branch() semantics.
repo = client.set_repository('Webinar 1 - Census Demo')
commit = repo.get_commit(branch='master').new_branch("log-reg")
In [8]:
# Register the S3-backed "Census Income" dataset in the "Demos" workspace and
# create a version of it for the "verta-starter" bucket. `version` is attached
# to every experiment run via run.log_dataset_version().
dataset = client.set_dataset(name="Census Income", type="s3", workspace="Demos")
version = dataset.create_version(bucket_name="verta-starter")
In [9]:
# Directory that holds the local copies of the census CSV files.
DATASET_PATH = "./"

# Local file paths for the training and test splits.
train_data_filename = os.path.join(DATASET_PATH, "census-train.csv")
test_data_filename = os.path.join(DATASET_PATH, "census-test.csv")
def download_starter_dataset(bucket_name):
    """Download the census train/test CSVs from S3 unless they already exist locally.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket; files are fetched from
        ``http://s3.amazonaws.com/<bucket_name>/<file name>``.

    Notes
    -----
    Reads the notebook-level ``train_data_filename`` and ``test_data_filename``
    to decide where each file lives. Returns nothing; its only effect is
    writing the missing files to disk.
    """
    targets = (
        ("census-train.csv", train_data_filename),
        ("census-test.csv", test_data_filename),
    )
    for remote_name, local_path in targets:
        # Anything already present at the target path means there is nothing to
        # fetch, which keeps re-running the cell cheap.
        if os.path.exists(local_path):
            continue
        url = "http://s3.amazonaws.com/" + bucket_name + "/" + remote_name
        # Write to the exact path that was checked above (and is read later)
        # instead of wget's default of the current working directory, so the
        # function stays correct if DATASET_PATH is changed.
        wget.download(url, out=local_path)
# Fetch the starter CSVs from the public "verta-starter" bucket.
download_starter_dataset(bucket_name="verta-starter")
In [10]:
# Read the training CSV. Every column except the last is used as a feature;
# the last column is the target that the models are fit against.
df_train = pd.read_csv(train_data_filename)
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# Preview the first rows as the cell output.
df_train.head()
In [11]:
# Candidate values for each LogisticRegression keyword argument.
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}

# Expand the grid: one dict per element of the Cartesian product of the
# candidate lists, keyed by the hyperparameter names.
hyperparam_sets = [
    {name: value for name, value in zip(hyperparam_candidates, combination)}
    for combination in itertools.product(*hyperparam_candidates.values())
]
In [12]:
def run_experiment(hyperparams, index):
    """Version one hyperparameter configuration, then train and log a model for it.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments for ``linear_model.LogisticRegression``. They are also
        stored as a ``Hyperparameters`` blob in the commit and logged on the run.
    index : int
        Position of this configuration in the sweep; only used to label the
        commit message.

    Notes
    -----
    Relies on the notebook-level ``client``, ``commit``, ``version``,
    ``X_train`` and ``y_train``. Returns nothing; results are recorded through
    the verta client and printed.
    """
    from verta.code import Notebook
    from verta.configuration import Hyperparameters
    from verta.dataset import S3
    from verta.environment import Python

    # build the versioned blobs for this configuration
    code_ver = Notebook()
    config_ver = Hyperparameters(hyperparams)
    dataset_ver_train = S3("s3://{}/{}".format("verta-starter", "census-train.csv"))
    dataset_ver_test = S3("s3://{}/{}".format("verta-starter", "census-test.csv"))
    env_ver = Python()

    # stage every blob under its path in the commit, then save the commit
    commit.update("notebooks/census-s3-example", code_ver)
    commit.update("config/hyperparams", config_ver)
    commit.update("data/train-data", dataset_ver_train)
    commit.update("data/test-data", dataset_ver_test)
    commit.update("env/python", env_ver)
    commit.save("Hyperparameter tuning Run: " + str(index))

    # create object to track experiment run
    run = client.set_experiment_run()

    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')

    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    # Fit on the training portion of the split only. Fitting on the full
    # X_train would let the model see the validation rows and inflate val_acc,
    # which is the metric used to pick the best run.
    model.fit(X_val_train, y_val_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))

    # save and log model
    run.log_model(model)

    # log dataset snapshot as version
    run.log_dataset_version("train", version)

    # log Git information as code version
    run.log_code()

    # associate the saved commit with this run, naming each blob path
    run.log_commit(
        commit,
        {
            'notebook': "notebooks/census-s3-example",
            'hyperparameters': "config/hyperparams",
            'training_data': "data/train-data",
            'test_data': "data/test-data",
            'python_env': "env/python",
        },
    )
# Run the sweep: one commit and one tracked run per hyperparameter combination.
for i, hyperparams in enumerate(hyperparam_sets):
    run_experiment(hyperparams, index=i)
In [13]:
# Print every entry that commit.log() yields for the working commit.
for logged_commit in commit.log():
    print(logged_commit)
In [14]:
# Call revert() on the working commit.
# NOTE(review): presumably this undoes the most recent hyperparameter-tuning
# commit on the branch -- confirm against the verta Commit.revert docs.
commit.revert()
In [15]:
# Fetch the commit at the head of master and merge the working branch commit
# into it.
master = repo.get_commit(branch='master')
master.merge(commit)
In [16]:
# Order the experiment's runs by logged validation accuracy, highest first,
# and keep the top one together with the hyperparameters it was logged with.
runs_by_val_acc = expt.expt_runs.sort("metrics.val_acc", descending=True)
best_run = runs_by_val_acc[0]
best_hyperparams = best_run.get_hyperparameters()

print("Validation Accuracy: {:.4f}".format(best_run.get_metric("val_acc")))
print("Hyperparameters: {}".format(best_hyperparams))
In [17]:
# Refit on the full training set using the best run's hyperparameters.
model = linear_model.LogisticRegression(
    multi_class='auto',
    **best_hyperparams
).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model
In [18]:
# Accuracy on the same data the model was fit on, so this is a training score
# rather than a held-out estimate.
train_acc = model.score(X_train, y_train)
print("Training accuracy: {:.4f}".format(train_acc))