In [0]:
import datetime
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import time
from tensorflow import keras
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
In [2]:
# First, download the data
# We've made it publicly available in Google Cloud Storage
!gsutil cp gs://ml-design-patterns/mushrooms.csv .
In [3]:
mushroom_data = pd.read_csv('mushrooms.csv')
mushroom_data.head()
Out[3]:
To keep things simple, we'll first convert the label column to numeric and then use pd.get_dummies() to convert the rest of the data to numeric.
In [0]:
# 1 = edible, 0 = poisonous
mushroom_data.loc[mushroom_data['class'] == 'p', 'class'] = 0
mushroom_data.loc[mushroom_data['class'] == 'e', 'class'] = 1
In [0]:
labels = mushroom_data.pop('class')
In [0]:
dummy_data = pd.get_dummies(mushroom_data)
In [0]:
# Split the data
train_size = int(len(mushroom_data) * .8)
train_data = dummy_data[:train_size]
test_data = dummy_data[train_size:]
train_labels = labels[:train_size].astype(int)
test_labels = labels[train_size:].astype(int)
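Note that this is a simple positional split. If you'd rather randomize it, scikit-learn's train_test_split is a common alternative; the cell below is an optional sketch of that approach.
In [0]:
# Optional: a shuffled, reproducible train/test split using scikit-learn's
# train_test_split utility (random_state pins the shuffle for reproducibility)
from sklearn.model_selection import train_test_split

train_data, test_data, train_labels, test_labels = train_test_split(
    dummy_data, labels.astype(int), test_size=0.2, random_state=42)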
Next, we'll build our Scikit-learn model and define the hyperparameters we want to optimize using grid search.
In [0]:
model = RandomForestClassifier()
In [0]:
grid_vals = {
    'max_depth': [5, 10, 100],
    'n_estimators': [100, 150, 200]
}
In [0]:
grid_search = GridSearchCV(model, param_grid=grid_vals, scoring='accuracy')
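By default, GridSearchCV scores each candidate with k-fold cross-validation (five folds in recent scikit-learn versions). If you'd like to make the folds explicit or run trials in parallel, cv and n_jobs are standard arguments; the cell below is an optional variant.
In [0]:
# Optional variant: explicit 5-fold cross-validation, with trials
# parallelized across all available cores via n_jobs=-1
grid_search = GridSearchCV(model, param_grid=grid_vals, scoring='accuracy',
                           cv=5, n_jobs=-1)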
In [20]:
# Train the model while running hyperparameter trials
grid_search.fit(train_data.values, train_labels.values)
Out[20]:
Let's see which hyperparameters resulted in the best accuracy.
In [21]:
grid_search.best_params_
Out[21]:
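best_params_ is a plain dict keyed by the parameter names we defined above. You can also inspect the cross-validated score of that combination; the cell below is an optional extra.
In [0]:
# Optional: mean cross-validated accuracy of the best parameter combination
grid_search.best_score_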
Finally, we can generate some test predictions on our model and evaluate its accuracy.
In [0]:
grid_predict = grid_search.predict(test_data.values)
In [0]:
grid_acc = accuracy_score(test_labels.values, grid_predict)
grid_f = f1_score(test_labels.values, grid_predict)
In [26]:
print('Accuracy: ', grid_acc)
print('F1-Score: ', grid_f)
keras-tuner
To show how this works, we'll train a model on the MNIST handwritten digit dataset, which is available directly in Keras. For more details, see this Keras tuner guide.
In [31]:
!pip install keras-tuner --quiet
In [0]:
import kerastuner as kt
In [27]:
# Get the mnist data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
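The raw MNIST pixel values are integers from 0 to 255. The tuner will run on them as-is, but scaling inputs to [0, 1] is a common optional preprocessing step that tends to make training more stable.
In [0]:
# Optional: scale pixel values from [0, 255] to [0, 1]
x_train, x_test = x_train / 255.0, x_test / 255.0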
In [0]:
def build_model(hp):
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(hp.Int('first_hidden', 128, 256, step=32), activation='relu'),
        keras.layers.Dense(hp.Int('second_hidden', 16, 128, step=32), activation='relu'),
        keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Float('learning_rate', .005, .01, sampling='log')),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    return model
In [22]:
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=30
)
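By default, keras-tuner writes trial results to a local directory with an auto-generated name. If you want the results somewhere predictable, or want to resume a search later, directory and project_name are standard optional arguments; the names below are arbitrary placeholders.
In [0]:
# Optional variant: persist trial results under a named directory so the
# search can be inspected or resumed later (both names are placeholders)
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=30,
    directory='tuner_results',
    project_name='mnist_tuning'
)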
In [0]:
tuner.search(x_train, y_train, validation_split=0.1, epochs=10)
In [0]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
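From here, a common next step is to rebuild the model with the winning hyperparameters and train it to convergence; the cell below sketches that using keras-tuner's hypermodel.build().
In [0]:
# Rebuild a fresh model from the best hyperparameters, retrain, and evaluate
best_model = tuner.hypermodel.build(best_hps)
best_model.fit(x_train, y_train, validation_split=0.1, epochs=10)
best_model.evaluate(x_test, y_test)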
In this section we'll show you how to scale your hyperparameter optimization by running it on Google Cloud's AI Platform. You'll need a Cloud account with AI Platform Training enabled to run this section.
We'll be using PyTorch to build a regression model. To train it, we'll use the BigQuery natality dataset. We've made a subset of this data available in a public Cloud Storage bucket, which we'll download from within the training job.
In [0]:
from google.colab import auth
auth.authenticate_user()
In the cells below, replace your-project-id with the ID of your Cloud project, and your-gcs-bucket with the name of your Cloud Storage bucket.
In [98]:
!gcloud config set project your-project-id
In [0]:
BUCKET_URL = 'gs://your-gcs-bucket'
To run this on AI Platform, we'll need to package up our model code in Python's package format, which includes an empty __init__.py file and a setup.py to install dependencies (in this case PyTorch, Scikit-learn, and Pandas).
In [100]:
!mkdir trainer
!touch trainer/__init__.py
In [101]:
%%writefile setup.py
from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = ['torch>=1.5', 'scikit-learn>=0.20', 'pandas>=1.0']

setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='My training application package.'
)
Below, we're copying our model training code to a model.py file in our trainer package directory. This code runs training and, after training completes, reports the model's final loss to Cloud HyperTune.
In [107]:
%%writefile trainer/model.py
import argparse
import hypertune
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
def get_args():
    """Argument parser.

    Returns:
        Dictionary of arguments.
    """
    parser = argparse.ArgumentParser(description='PyTorch natality model')
    parser.add_argument('--job-dir',  # handled automatically by AI Platform
                        help='GCS location to write checkpoints and export '
                             'models')
    parser.add_argument('--lr',  # Specified in the config file
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',  # Specified in the config file
                        type=float,
                        default=0.5,
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--hidden-layer-size',  # Specified in the config file
                        type=int,
                        default=8,
                        help='hidden layer size')
    args = parser.parse_args()
    return args
def train_model(args):
    # Get the data
    natality = pd.read_csv('https://storage.googleapis.com/ml-design-patterns/natality.csv')
    natality = natality.dropna()
    natality = shuffle(natality, random_state=2)

    natality_labels = natality['weight_pounds']
    natality = natality.drop(columns=['weight_pounds'])

    train_size = int(len(natality) * 0.8)
    traindata_natality = natality[:train_size]
    trainlabels_natality = natality_labels[:train_size]
    testdata_natality = natality[train_size:]
    testlabels_natality = natality_labels[train_size:]

    # Normalize and convert to PT tensors
    normalized_train = normalize(np.array(traindata_natality.values), axis=0)
    normalized_test = normalize(np.array(testdata_natality.values), axis=0)

    train_x = torch.Tensor(normalized_train)
    train_y = torch.Tensor(np.array(trainlabels_natality))
    test_x = torch.Tensor(normalized_test)
    test_y = torch.Tensor(np.array(testlabels_natality))

    # Define our data loaders
    train_dataset = torch.utils.data.TensorDataset(train_x, train_y)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_dataset = torch.utils.data.TensorDataset(test_x, test_y)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Define the model, while tuning the size of our hidden layer
    model = nn.Sequential(nn.Linear(len(train_x[0]), args.hidden_layer_size),
                          nn.ReLU(),
                          nn.Linear(args.hidden_layer_size, 1))
    criterion = nn.MSELoss()

    # Tune hyperparameters in our optimizer
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    epochs = 10
    for e in range(epochs):
        for batch_id, (data, label) in enumerate(train_dataloader):
            optimizer.zero_grad()
            y_pred = model(data)
            label = label.view(-1, 1)
            loss = criterion(y_pred, label)
            loss.backward()
            optimizer.step()

    val_mse = 0
    num_batches = 0

    # Evaluate MSE on our test set
    with torch.no_grad():
        for i, (data, label) in enumerate(test_dataloader):
            num_batches += 1
            y_pred = model(data)
            mse = criterion(y_pred, label.view(-1, 1))
            val_mse += mse.item()

    avg_val_mse = (val_mse / num_batches)

    # Report the metric we're optimizing for to AI Platform's HyperTune service
    # In this example, we're minimizing loss on our test set
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='val_mse',
        metric_value=avg_val_mse,
        global_step=epochs
    )
def main():
    args = get_args()
    print('in main', args)
    train_model(args)

if __name__ == '__main__':
    main()
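Before submitting the job, it can be worth smoke-testing the trainer module locally. The cell below is an optional check; note that model.py imports hypertune, so it assumes the cloudml-hypertune package is installed in the local environment.
In [0]:
# Optional: run one local training pass to catch errors before submitting.
# cloudml-hypertune provides the hypertune module that model.py imports.
!pip install cloudml-hypertune --quiet
!python -m trainer.model --lr=0.01 --momentum=0.5 --hidden-layer-size=16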
In [108]:
%%writefile config.yaml
trainingInput:
  hyperparameters:
    goal: MINIMIZE
    maxTrials: 10
    maxParallelTrials: 5
    hyperparameterMetricTag: val_mse
    enableTrialEarlyStopping: TRUE
    params:
    - parameterName: lr
      type: DOUBLE
      minValue: 0.0001
      maxValue: 0.1
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: momentum
      type: DOUBLE
      minValue: 0.0
      maxValue: 1.0
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: hidden-layer-size
      type: INTEGER
      minValue: 8
      maxValue: 32
      scaleType: UNIT_LINEAR_SCALE
In [0]:
MAIN_TRAINER_MODULE = "trainer.model"
TRAIN_DIR = os.getcwd() + '/trainer'
JOB_DIR = BUCKET_URL + '/output'
REGION = "us-central1"
In [0]:
# Create a unique job name (run this each time you submit a job)
JOB_NAME = 'caip_training_' + str(int(time.time()))
The command below will submit your training job to AI Platform. To view the logs and the results of each HyperTune trial, visit your Cloud console.
In [111]:
# Configure and submit the training job
!gcloud ai-platform jobs submit training $JOB_NAME \
--scale-tier basic \
--package-path $TRAIN_DIR \
--module-name $MAIN_TRAINER_MODULE \
--job-dir $JOB_DIR \
--region $REGION \
--runtime-version 2.1 \
--python-version 3.7 \
--config config.yaml
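You can also check on the job from the notebook itself; the commands below are standard gcloud subcommands for describing a job and streaming its logs.
In [0]:
# Optional: inspect the job's status and stream its logs
!gcloud ai-platform jobs describe $JOB_NAME
!gcloud ai-platform jobs stream-logs $JOB_NAME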
Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License