This is a sample for the Kubeflow PyTorchJob SDK (`kubeflow-pytorchjob`).
The notebook shows how to use the Kubeflow PyTorchJob SDK to create, get, wait for, check, and delete a PyTorchJob.
In [1]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements
from kubeflow.pytorchjob import constants
from kubeflow.pytorchjob import utils
from kubeflow.pytorchjob import V1ReplicaSpec
from kubeflow.pytorchjob import V1PyTorchJob
from kubeflow.pytorchjob import V1PyTorchJobSpec
from kubeflow.pytorchjob import PyTorchJobClient
Define the namespace in which the PyTorchJob will be created. When the SDK is running inside the cluster, the function below returns the namespace it is currently running in; otherwise it falls back to the `default` namespace.
In [2]:
# Target namespace for the job; reused by the client calls below so that
# every operation addresses the same namespace.
namespace = utils.get_default_target_namespace()
The demo creates a PyTorchJob with one master replica and one worker replica to run the MNIST sample.
In [3]:
# One container definition is reused by both replica types; the image runs
# the distributed MNIST sample and is started with the gloo backend argument.
container = V1Container(
    name="pytorch",
    image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
    args=["--backend", "gloo"],
)


def _single_replica_spec():
    """Return a fresh one-replica spec that restarts on failure and runs `container`."""
    pod_spec = V1PodSpec(containers=[container])
    return V1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(spec=pod_spec),
    )


# Master and Worker are configured identically; each gets its own spec object.
master = _single_replica_spec()
worker = _single_replica_spec()

# Assemble the PyTorchJob resource from its metadata and replica specs.
job_metadata = V1ObjectMeta(name="pytorch-dist-mnist-gloo", namespace=namespace)
job_spec = V1PyTorchJobSpec(
    clean_pod_policy="None",
    pytorch_replica_specs={"Master": master, "Worker": worker},
)
pytorchjob = V1PyTorchJob(
    api_version="kubeflow.org/v1",
    kind="PyTorchJob",
    metadata=job_metadata,
    spec=job_spec,
)
In [4]:
# Instantiate the SDK client and submit the PyTorchJob object built above.
# The job carries its target namespace in its metadata.
pytorchjob_client = PyTorchJobClient()
pytorchjob_client.create(pytorchjob)
Out[4]:
In [5]:
# Look the job up in the namespace it was created in, instead of relying on
# the client's default namespace (which may differ if `namespace` is changed).
pytorchjob_client.get('pytorch-dist-mnist-gloo', namespace=namespace)
Out[5]:
In [6]:
# Query the current status of the job in the target namespace.
pytorchjob_client.get_job_status('pytorch-dist-mnist-gloo', namespace=namespace)
Out[6]:
In [7]:
# Wait for the job to finish before checking its result.
# NOTE(review): watch=True presumably reports status changes while waiting — confirm against the SDK docs.
pytorchjob_client.wait_for_job('pytorch-dist-mnist-gloo', namespace=namespace, watch=True)
In [8]:
# Check whether the job succeeded; run this after the wait above.
pytorchjob_client.is_job_succeeded('pytorch-dist-mnist-gloo', namespace=namespace)
Out[8]:
In [9]:
# Retrieve the training logs for the job.
# NOTE(review): which pod(s) the logs come from depends on the SDK defaults — confirm.
pytorchjob_client.get_logs('pytorch-dist-mnist-gloo', namespace=namespace)
In [10]:
# Clean up: delete the job from the namespace it was created in, instead of
# relying on the client's default namespace (which may differ if `namespace` is changed).
pytorchjob_client.delete('pytorch-dist-mnist-gloo', namespace=namespace)
Out[10]:
In [ ]: