In [ ]:
from kubeflow import fairing
from kubeflow.fairing import TrainJob
from kubeflow.fairing.backends import KubeflowGKEBackend
from kubeflow.fairing.kubernetes.utils import get_resource_mutator

Executing a Python file


In [ ]:
%%writefile train.py
print("hello world!")

In [ ]:
job = TrainJob("train.py", backend=KubeflowGKEBackend())
job.submit()

Executing a Python function


In [ ]:
def train():
    print("simple train job!")

In [ ]:
job = TrainJob(train, backend=KubeflowGKEBackend())
job.submit()

Executing a complete notebook


In [ ]:
%%writefile requirements.txt
papermill
jupyter

In [ ]:
job = TrainJob("train.ipynb", backend=KubeflowGKEBackend(), input_files=["requirements.txt"])
job.submit()

Executing with a large number of CPUs and a large amount of memory

Your Kubernetes cluster should have a node pool that can satisfy these resource requests. For example, to schedule a job with 90 CPUs and 600 GB of memory, you need a node pool created with the n1-highmem-96 machine type (96 vCPUs, 624 GB memory) in GCP.
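If your cluster does not already have such a node pool, one way to add it is with the gcloud CLI. This is a minimal sketch, assuming a hypothetical GKE cluster named my-cluster in zone us-central1-a; adjust the pool name, cluster, zone, machine type, and node count for your setup.


In [ ]:
# Hypothetical cluster, zone, and pool names; replace with your own values.
!gcloud container node-pools create high-mem-pool \
    --cluster=my-cluster \
    --zone=us-central1-a \
    --machine-type=n1-highmem-96 \
    --num-nodes=1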


In [ ]:
import multiprocessing
import os

def train():
    # Number of CPUs visible to the process.
    print("CPU count: {}".format(multiprocessing.cpu_count()))
    # Total physical memory in GiB (page size * number of physical pages).
    print("Memory: {:.1f} GiB".format(
        os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024. ** 3)))

train()

In [ ]:
job = TrainJob(train, base_docker_image=None, docker_registry=None, backend=KubeflowGKEBackend(),
               pod_spec_mutators=[get_resource_mutator(cpu=90, memory=600)])
job.submit()
