This example guides you through:
Before you can deploy MNIST you will need to obtain credentials that allow creating a storage account, and also obtain credentials to an Azure Container Registry.
Run the following command in Bash using the Azure CLI. You may also want to use the Cloud Shell in your browser at shell.azure.com.
# Creates an Azure Active Directory Service Principal
az ad sp create-for-rbac --name kubeflow
In [ ]:
import os

# Placeholder configuration for the Azure deployment. Replace every
# '<...>' value with your own before running the rest of the notebook.
_azure_settings = {
    # The Kubernetes namespace into which you installed Kubeflow.
    'TARGET_NAMESPACE': '<your-kubeflow-namespace-name>',
    # The credentials you obtained from the newly created Service Principal.
    'AZ_CLIENT_ID': '<your-service-principal-client-id>',
    'AZ_CLIENT_SECRET': '<your-service-principal-client-secret>',
    'AZ_TENANT_ID': '<your-service-principal-tenant-id>',
    # Your Azure Subscription ID.
    'AZ_SUBSCRIPTION_ID': '<your-azure-subscription-id>',
    # If you haven't already created an Azure Container Registry (ACR),
    # follow the instructions at
    # https://docs.microsoft.com/azure/container-registry/container-registry-get-started-azure-cli
    'ACR_NAME': '<your-azure-container-registry-name>',
    'ACR_RESOURCE_GROUP_NAME': '<your-azure-container-registry-resource-group-name>',
    # The existing resource group where a storage account should be created
    # to hold all our data.
    'STORAGE_ACCOUNT_NAME': '<your-globally-unique-storage-account-name>',
    'STORAGE_RESOURCE_GROUP_NAME': '<your-storage-resource-group-name>',
    'STORAGE_RESOURCE_LOCATION': '<your-storage-account-resource-location>',
}

# Export everything so later cells (and the shell magics) can read them.
for _key, _value in _azure_settings.items():
    os.environ[_key] = _value
# Stores the Service Principal as a Kubernetes Secret for reuse.
# The ${...} references are expanded by the shell from the environment
# variables exported in the credentials cell above.
!kubectl create secret generic -n ${TARGET_NAMESPACE} azcreds \
--from-literal=AZ_CLIENT_ID=${AZ_CLIENT_ID} \
--from-literal=AZ_CLIENT_SECRET=${AZ_CLIENT_SECRET} \
--from-literal=AZ_TENANT_ID=${AZ_TENANT_ID} \
--from-literal=AZ_SUBSCRIPTION_ID=${AZ_SUBSCRIPTION_ID}
# Stores credentials for accessing the private Azure Container Registry;
# the Service Principal client id/secret double as the registry login.
!kubectl create secret docker-registry -n ${TARGET_NAMESPACE} acrcreds \
--docker-server=${ACR_NAME}.azurecr.io \
--docker-username=${AZ_CLIENT_ID} \
--docker-password=${AZ_CLIENT_SECRET}
# Attach the registry secret to the default-editor service account so
# pods launched under it can pull images from the private ACR.
!kubectl patch serviceaccount default-editor -n ${TARGET_NAMESPACE} \
-p "{\"imagePullSecrets\": [{\"name\": \"acrcreds\"}]}"
Run the following command in Bash using the Azure CLI or use the Cloud Shell
# Gives the service principal permission to create storage accounts in the desired resource group
export AZ_CLIENT_ID='<your-service-principal-client-id>'
export AZ_SUBSCRIPTION_ID='<your-azure-subscription-id>'
export STORAGE_RESOURCE_GROUP_NAME='<your-storage-resource-group-name>'
az role assignment create --assignee $AZ_CLIENT_ID --scope /subscriptions/$AZ_SUBSCRIPTION_ID/resourceGroups/$STORAGE_RESOURCE_GROUP_NAME --role "Storage Account Contributor"
There is a delta between existing distributed mnist examples and what's needed to run well as a TFJob.
Basically, we must use
tf.estimator.train_and_evaluate
to enable model exporting and serving. The resulting model is model.py.
In [ ]:
import logging
from importlib import reload
import notebook_setup
# Re-run the project-local setup module so edits to notebook_setup.py
# are picked up without restarting the kernel.
reload(notebook_setup)
# Configure this notebook for the Azure platform (the exact side effects
# are defined in the sibling notebook_setup.py module).
notebook_setup.notebook_setup(platform='azure')
In [ ]:
import k8s_util
# Force a reload of the kubeflow package; since kubeflow is a multi
# namespace module, the reload done in notebook_setup may not be
# sufficient to pick up changes.
import kubeflow
reload(kubeflow)
from kubernetes import client as k8s_client
from kubernetes import config as k8s_config
from kubeflow.tfjob.api import tf_job_client as tf_job_client_module
from IPython.core.display import display, HTML
import yaml
In [ ]:
import os
from kubernetes import client as k8s_client
from kubernetes.client import rest as k8s_rest
from kubeflow import fairing
from kubeflow.fairing import utils as fairing_utils
from kubeflow.fairing.builders import append
from kubeflow.fairing.deployers import job
from kubeflow.fairing.preprocessors import base as base_preprocessor

# Output containers built by fairing are pushed to the private Azure
# Container Registry (ACR) configured in the credentials cell above.
# You can use any docker container registry instead of ACR.
AZURE_ACR_NAME = os.environ.get('ACR_NAME')
if not AZURE_ACR_NAME:
    # Fail fast: without this check the registry would silently become
    # the bogus host 'None.azurecr.io' and the image push would fail later.
    raise ValueError(
        "The ACR_NAME environment variable is not set; "
        "set it in the credentials cell above before running this one.")
DOCKER_REGISTRY = '{}.azurecr.io'.format(AZURE_ACR_NAME)

namespace = fairing_utils.get_current_k8s_namespace()
logging.info(f"Running in namespace {namespace}")
logging.info(f"Using docker registry {DOCKER_REGISTRY}")
In [ ]:
# TODO(https://github.com/kubeflow/fairing/issues/426): We should get rid of this once the default
# Kaniko image is updated to a newer image than 0.7.0.
# Pin the in-cluster image builder to a known-good Kaniko release.
from kubeflow.fairing import constants
constants.constants.KANIKO_IMAGE = "gcr.io/kaniko-project/executor:v0.14.0"
In [ ]:
from kubeflow.fairing.builders import cluster
# output_map lists extra files to add to the Docker build context.
# It is a map from source location to the destination inside the context.
output_map = {
"Dockerfile.model": "Dockerfile",
"model.py": "model.py"
}
# Pass-through preprocessor: no notebook conversion is performed, we just
# ship the Dockerfile and model.py listed above into the build context.
preprocessor = base_preprocessor.BasePreProcessor(
command=["python"], # The base class will set this.
input_files=[],
path_prefix="/app", # irrelevant since we aren't preprocessing any files
output_map=output_map)
preprocessor.preprocess()
In [ ]:
STORAGE_GROUP_NAME = os.environ.get('STORAGE_RESOURCE_GROUP_NAME')
STORAGE_ACCOUNT_NAME = os.environ.get('STORAGE_ACCOUNT_NAME')
AZURE_REGION = os.environ.get('STORAGE_RESOURCE_LOCATION')
# Build the training image in-cluster, staging the Docker context in an
# Azure storage account and pushing the result to the private ACR.
# We use a custom Dockerfile, so base_image is left empty below.
cluster_builder = cluster.cluster.ClusterBuilder(registry=DOCKER_REGISTRY,
base_image="", # base_image is set in the Dockerfile
preprocessor=preprocessor,
image_name="mnist",
dockerfile_path="Dockerfile",
pod_spec_mutators=[fairing.cloud.azure.add_acr_config, fairing.cloud.azure.add_azure_files],
context_source=cluster.azurestorage_context.StorageContextSource(region=AZURE_REGION, storage_account_name=STORAGE_ACCOUNT_NAME, resource_group_name=STORAGE_GROUP_NAME))
cluster_builder.build()
logging.info(f"Built image {cluster_builder.image_tag}")
Run the following in your local Bash terminal with Azure CLI or in the Azure Cloud Shell
export AZ_STORAGE_ACCOUNT_NAME="<your-storage-account-name>"
export AZ_STORAGE_RESOURCE_GROUP="<your-storage-account-resource-group>"
export AZ_SHARE_NAME="mnist"
STORAGE_KEY=$(az storage account keys list --resource-group $AZ_STORAGE_RESOURCE_GROUP --account-name $AZ_STORAGE_ACCOUNT_NAME --query "[0].value" -o tsv)
az storage share create --name $AZ_SHARE_NAME --account-name $AZ_STORAGE_ACCOUNT_NAME --account-key $STORAGE_KEY
echo $STORAGE_KEY
In [ ]:
# Insert the Storage Key in the command below to save the storage access credentials as a Kubernetes secret
# NOTE(review): %env appears to store the value literally as typed, quotes
# included — confirm, and paste the key without surrounding quotes if so.
%env STORAGE_KEY='<your-storage-account-access-key>'
# NOTE(review): $TARGET_NAMESPACE was exported in an earlier cell, but
# $AZ_STORAGE_ACCOUNT_NAME was only exported in the external Bash shell —
# verify it is set in this notebook's environment before running.
!kubectl create secret generic azure-share-secret --namespace $TARGET_NAMESPACE --from-literal=azurestorageaccountname=$AZ_STORAGE_ACCOUNT_NAME --from-literal=azurestorageaccountkey=$STORAGE_KEY
In [ ]:
import uuid

# Random suffix so the notebook can be re-run without TFJob name collisions.
train_name = f"mnist-train-{uuid.uuid4().hex[:4]}"

num_ps = 1        # number of parameter-server replicas
num_workers = 2   # number of worker replicas
model_dir = "/mnt/azure/mnist"           # checkpoints / TensorBoard events
export_path = "/mnt/azure/mnist/export"  # SavedModel export for serving
train_steps = 200
batch_size = 100
learning_rate = .01

# Image built by cluster_builder in the earlier cell.
image = cluster_builder.image_tag

# TFJob spec: one Chief plus num_ps Ps and num_workers Worker replicas, all
# sharing the Azure Files volume for checkpoints and the exported model.
# (Fix: Worker replicas now uses num_workers instead of a hard-coded 1.)
train_spec = f"""apiVersion: kubeflow.org/v1
kind: TFJob
metadata:
  name: {train_name}
spec:
  tfReplicaSpecs:
    Ps:
      replicas: {num_ps}
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          serviceAccount: default-editor
          containers:
          - name: tensorflow
            command:
            - python
            - /opt/model.py
            - --tf-model-dir={model_dir}
            - --tf-export-dir={export_path}
            - --tf-train-steps={train_steps}
            - --tf-batch-size={batch_size}
            - --tf-learning-rate={learning_rate}
            image: {image}
            workingDir: /opt
            volumeMounts:
            - name: azure
              mountPath: /mnt/azure
              readOnly: false
          volumes:
          - name: azure
            azureFile:
              secretName: azure-share-secret
              shareName: mnist
              readOnly: false
          restartPolicy: OnFailure
    Chief:
      replicas: 1
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          serviceAccount: default-editor
          containers:
          - name: tensorflow
            command:
            - python
            - /opt/model.py
            - --tf-model-dir={model_dir}
            - --tf-export-dir={export_path}
            - --tf-train-steps={train_steps}
            - --tf-batch-size={batch_size}
            - --tf-learning-rate={learning_rate}
            image: {image}
            workingDir: /opt
            volumeMounts:
            - name: azure
              mountPath: /mnt/azure
              readOnly: false
          volumes:
          - name: azure
            azureFile:
              secretName: azure-share-secret
              shareName: mnist
              readOnly: false
          restartPolicy: OnFailure
    Worker:
      replicas: {num_workers}
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          serviceAccount: default-editor
          containers:
          - name: tensorflow
            command:
            - python
            - /opt/model.py
            - --tf-model-dir={model_dir}
            - --tf-export-dir={export_path}
            - --tf-train-steps={train_steps}
            - --tf-batch-size={batch_size}
            - --tf-learning-rate={learning_rate}
            image: {image}
            workingDir: /opt
            volumeMounts:
            - name: azure
              mountPath: /mnt/azure
              readOnly: false
          volumes:
          - name: azure
            azureFile:
              secretName: azure-share-secret
              shareName: mnist
              readOnly: false
          restartPolicy: OnFailure
"""
# NOTE: the job is submitted below via TFJobClient; the stray
# "kubectl apply -f {FILE}" leftover (not valid Python) was removed.
In [ ]:
# Client for managing TFJob custom resources (create / wait / status).
tf_job_client = tf_job_client_module.TFJobClient()
In [ ]:
# Parse the YAML spec and submit the TFJob; training runs asynchronously.
tf_job_body = yaml.safe_load(train_spec)
tf_job = tf_job_client.create(tf_job_body, namespace=namespace)
logging.info(f"Created job {namespace}.{train_name}")
In [ ]:
# Inspect the full TFJob resource, including its status conditions.
!kubectl get tfjobs -o yaml {train_name}
In [ ]:
tb_name = "mnist-tensorboard"

# TensorBoard Deployment reading event files from the shared Azure Files
# volume that the TFJob writes checkpoints to.
tb_deploy = f"""apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: mnist-tensorboard
  name: {tb_name}
  namespace: {namespace}
spec:
  selector:
    matchLabels:
      app: mnist-tensorboard
  template:
    metadata:
      labels:
        app: mnist-tensorboard
        version: v1
    spec:
      serviceAccount: default-editor
      containers:
      - command:
        - /usr/local/bin/tensorboard
        - --logdir={model_dir}
        - --port=80
        image: tensorflow/tensorflow:1.15.2-py3
        name: tensorboard
        ports:
        - containerPort: 80
        volumeMounts:
        - name: azure
          mountPath: /mnt/azure
          readOnly: false
      volumes:
      - name: azure
        azureFile:
          secretName: azure-share-secret
          shareName: mnist
          readOnly: false
"""

# ClusterIP Service in front of the TensorBoard pod.
tb_service = f"""apiVersion: v1
kind: Service
metadata:
  labels:
    app: mnist-tensorboard
  name: {tb_name}
  namespace: {namespace}
spec:
  ports:
  - name: http-tb
    port: 80
    targetPort: 80
  selector:
    app: mnist-tensorboard
  type: ClusterIP
"""

# Istio VirtualService exposing TensorBoard through the Kubeflow gateway
# under /mnist/<namespace>/tensorboard/.
tb_virtual_service = f"""apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: {tb_name}
  namespace: {namespace}
spec:
  gateways:
  - kubeflow/kubeflow-gateway
  hosts:
  - '*'
  http:
  - match:
    - uri:
        prefix: /mnist/{namespace}/tensorboard/
    rewrite:
      uri: /
    route:
    - destination:
        host: {tb_name}.{namespace}.svc.cluster.local
        port:
          number: 80
    timeout: 300s
"""

tb_specs = [tb_deploy, tb_service, tb_virtual_service]
In [ ]:
# Create (or replace) the TensorBoard Deployment, Service and VirtualService.
k8s_util.apply_k8s_specs(tb_specs, k8s_util.K8S_CREATE_OR_REPLACE)
Recall we are forwarding the cluster-internal ingress.
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80
To access TensorBoard, manually visit the path: /mnist/your-kubeflow-namespace/tensorboard/
In [ ]:
# Block until the TFJob reaches a terminal condition, then report it.
# (Fix: the if/else bodies were missing their indentation, which is a
# SyntaxError in Python.)
tf_job = tf_job_client.wait_for_condition(train_name, expected_condition=["Succeeded", "Failed"], namespace=namespace)
if tf_job_client.is_job_succeeded(train_name, namespace):
    logging.info(f"TFJob {namespace}.{train_name} succeeded")
else:
    raise ValueError(f"TFJob {namespace}.{train_name} failed")
In [ ]:
import os

deploy_name = "mnist-model"
model_base_path = export_path
# The web ui defaults to mnist-service so if you change it you will
# need to change it in the UI as well to send predictions to the model
model_service = "mnist-service"

# TensorFlow Serving Deployment: serves the exported SavedModel from the
# shared Azure Files volume over gRPC (9000) and REST (8500).
deploy_spec = f"""apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: mnist
  name: {deploy_name}
  namespace: {namespace}
spec:
  selector:
    matchLabels:
      app: mnist-model
  template:
    metadata:
      # TODO(jlewi): Right now we disable the istio side car because otherwise ISTIO rbac will prevent the
      # UI from sending RPCs to the server. We should create an appropriate ISTIO rbac authorization
      # policy to allow traffic from the UI to the model server.
      # https://istio.io/docs/concepts/security/#target-selectors
      annotations:
        sidecar.istio.io/inject: "false"
      labels:
        app: mnist-model
        version: v1
    spec:
      serviceAccount: default-editor
      containers:
      - args:
        - --port=9000
        - --rest_api_port=8500
        - --model_name=mnist
        - --model_base_path={model_base_path}
        - --monitoring_config_file=/var/config/monitoring_config.txt
        command:
        - /usr/bin/tensorflow_model_server
        env:
        - name: modelBasePath
          value: {model_base_path}
        image: tensorflow/serving:1.15.0
        imagePullPolicy: IfNotPresent
        livenessProbe:
          initialDelaySeconds: 30
          periodSeconds: 30
          tcpSocket:
            port: 9000
        name: mnist
        ports:
        - containerPort: 9000
        - containerPort: 8500
        resources:
          limits:
            cpu: "1"
            memory: 1Gi
          requests:
            cpu: "1"
            memory: 1Gi
        volumeMounts:
        - mountPath: /var/config/
          name: model-config
        - name: azure
          mountPath: /mnt/azure
      volumes:
      - configMap:
          name: {deploy_name}
        name: model-config
      - name: azure
        azureFile:
          secretName: azure-share-secret
          shareName: mnist
          readOnly: false
"""

# Service exposing the model server, annotated for Prometheus scraping.
service_spec = f"""apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/path: /monitoring/prometheus/metrics
    prometheus.io/port: "8500"
    prometheus.io/scrape: "true"
  labels:
    app: mnist-model
  name: {model_service}
  namespace: {namespace}
spec:
  ports:
  - name: grpc-tf-serving
    port: 9000
    targetPort: 9000
  - name: http-tf-serving
    port: 8500
    targetPort: 8500
  selector:
    app: mnist-model
  type: ClusterIP
"""

# ConfigMap mounted at /var/config/ enabling TF Serving's Prometheus
# metrics endpoint (doubled braces escape literal {{ }} in the f-string).
monitoring_config = f"""kind: ConfigMap
apiVersion: v1
metadata:
  name: {deploy_name}
  namespace: {namespace}
data:
  monitoring_config.txt: |-
    prometheus_config: {{
      enable: true,
      path: "/monitoring/prometheus/metrics"
    }}
"""

model_specs = [deploy_spec, service_spec, monitoring_config]
In [ ]:
# Create (or replace) the model Deployment, Service and monitoring ConfigMap.
k8s_util.apply_k8s_specs(model_specs, k8s_util.K8S_CREATE_OR_REPLACE)
In [ ]:
ui_name = "mnist-ui"

# Web UI Deployment: a small Flask app that sends predictions to the
# model service and renders the results.
ui_deploy = f"""apiVersion: apps/v1
kind: Deployment
metadata:
  name: {ui_name}
  namespace: {namespace}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mnist-web-ui
  template:
    metadata:
      labels:
        app: mnist-web-ui
    spec:
      containers:
      - image: gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225
        name: web-ui
        ports:
        - containerPort: 5000
      serviceAccount: default-editor
"""

# ClusterIP Service for the UI. (Fix: removed the leftover empty
# `annotations:` key from metadata.)
ui_service = f"""apiVersion: v1
kind: Service
metadata:
  name: {ui_name}
  namespace: {namespace}
spec:
  ports:
  - name: http-mnist-ui
    port: 80
    targetPort: 5000
  selector:
    app: mnist-web-ui
  type: ClusterIP
"""

# Istio VirtualService exposing the UI through the Kubeflow gateway
# under /mnist/<namespace>/ui/.
ui_virtual_service = f"""apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: {ui_name}
  namespace: {namespace}
spec:
  gateways:
  - kubeflow/kubeflow-gateway
  hosts:
  - '*'
  http:
  - match:
    - uri:
        prefix: /mnist/{namespace}/ui/
    rewrite:
      uri: /
    route:
    - destination:
        host: {ui_name}.{namespace}.svc.cluster.local
        port:
          number: 80
    timeout: 300s
"""

ui_specs = [ui_deploy, ui_service, ui_virtual_service]
In [ ]:
# Create (or replace) the web UI Deployment, Service and VirtualService.
k8s_util.apply_k8s_specs(ui_specs, k8s_util.K8S_CREATE_OR_REPLACE)
Recall we are forwarding the cluster-internal ingress.
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80
To access the web UI, manually visit the path: /mnist/your-kubeflow-namespace/ui/
In [ ]: