In [4]:
from kubernetes.client import V1ObjectMeta
import kubeflow.katib as kc
from kubeflow.katib import constants
from kubeflow.katib import utils
from kubeflow.katib import V1alpha3AlgorithmSetting
from kubeflow.katib import V1alpha3AlgorithmSetting
from kubeflow.katib import V1alpha3AlgorithmSpec
from kubeflow.katib import V1alpha3CollectorSpec
from kubeflow.katib import V1alpha3EarlyStoppingSetting
from kubeflow.katib import V1alpha3EarlyStoppingSpec
from kubeflow.katib import V1alpha3Experiment
from kubeflow.katib import V1alpha3ExperimentCondition
from kubeflow.katib import V1alpha3ExperimentList
from kubeflow.katib import V1alpha3ExperimentSpec
from kubeflow.katib import V1alpha3ExperimentStatus
from kubeflow.katib import V1alpha3FeasibleSpace
from kubeflow.katib import V1alpha3FileSystemPath
from kubeflow.katib import V1alpha3FilterSpec
from kubeflow.katib import V1alpha3GoTemplate
from kubeflow.katib import V1alpha3GraphConfig
from kubeflow.katib import V1alpha3Metric
from kubeflow.katib import V1alpha3MetricsCollectorSpec
from kubeflow.katib import V1alpha3NasConfig
from kubeflow.katib import V1alpha3ObjectiveSpec
from kubeflow.katib import V1alpha3Observation
from kubeflow.katib import V1alpha3Operation
from kubeflow.katib import V1alpha3OptimalTrial
from kubeflow.katib import V1alpha3ParameterAssignment
from kubeflow.katib import V1alpha3ParameterSpec
from kubeflow.katib import V1alpha3SourceSpec
from kubeflow.katib import V1alpha3Suggestion
from kubeflow.katib import V1alpha3SuggestionCondition
from kubeflow.katib import V1alpha3SuggestionList
from kubeflow.katib import V1alpha3SuggestionSpec
from kubeflow.katib import V1alpha3SuggestionStatus
from kubeflow.katib import V1alpha3TemplateSpec
from kubeflow.katib import V1alpha3Trial
from kubeflow.katib import V1alpha3TrialAssignment
from kubeflow.katib import V1alpha3TrialCondition
from kubeflow.katib import V1alpha3TrialList
from kubeflow.katib import V1alpha3TrialSpec
from kubeflow.katib import V1alpha3TrialStatus
from kubeflow.katib import V1alpha3TrialTemplate

In [5]:
algorithmsettings = V1alpha3AlgorithmSetting(
    name= None,
    value = None
    )
algorithm = V1alpha3AlgorithmSpec(
    algorithm_name = "random",
    algorithm_settings = [algorithmsettings]
  )

# Metric Collector
collector = V1alpha3CollectorSpec(kind = "TensorFlowEvent")
FileSystemPath = V1alpha3FileSystemPath(kind = "/train" , path = "Directory")
metrics_collector_spec = V1alpha3MetricsCollectorSpec(
    collector = collector,
    source = FileSystemPath)

# Objective
objective = V1alpha3ObjectiveSpec(
    goal = 0.9999,
    objective_metric_name = "accuracy_1",
    type = "maximize")

# Parameters
feasible_space = V1alpha3FeasibleSpace(min = "100", max = "200")
parameters = [V1alpha3ParameterSpec(
    feasible_space = feasible_space, 
    name = "--batch_size",
    parameter_type = "int"
    )]

# Trialtemplate
go_template = V1alpha3GoTemplate(
    raw_template =   "apiVersion: \"kubeflow.org/v1\"\nkind: TFJob\nmetadata:\n  name: {{.Trial}}\n  namespace: {{.NameSpace}}\nspec:\n tfReplicaSpecs:\n  Worker:\n    replicas: 1\n    restartPolicy: OnFailure\n    template:\n      spec:\n        containers:\n          - name: tensorflow\n            image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\n            imagePullPolicy: Always\n            command:\n              - \"python\"\n              - \"/var/tf_mnist/mnist_with_summaries.py\"\n              - \"--log_dir=/train/metrics\"\n              {{- with .HyperParameters}}\n              {{- range .}}\n              - \"{{.Name}}={{.Value}}\"\n              {{- end}}\n              {{- end}}"
    )


trial_template= V1alpha3TrialTemplate(go_template=go_template)


# Experiment
experiment = V1alpha3Experiment(
    api_version="kubeflow.org/v1alpha3",
    kind="Experiment",
    metadata=V1ObjectMeta(name="tfjob-example",namespace="anonymous"),

    spec=V1alpha3ExperimentSpec(
         algorithm = algorithm,
         max_failed_trial_count=3,
         max_trial_count=12,
         metrics_collector_spec= metrics_collector_spec ,
         objective = objective,
         parallel_trial_count=4,
         parameters = parameters ,
         trial_template = trial_template
    )
)

In [6]:
namespace = kc.utils.get_default_target_namespace()

In [7]:
kclient = kc.KatibClient()

Create Experiment


In [8]:
kclient.create_experiment(experiment,namespace=namespace)


Katib Experiment link here
Out[8]:
{'apiVersion': 'kubeflow.org/v1alpha3',
 'kind': 'Experiment',
 'metadata': {'creationTimestamp': '2020-03-04T10:50:02Z',
  'generation': 1,
  'name': 'tfjob-example',
  'namespace': 'anonymous',
  'resourceVersion': '7037041',
  'selfLink': '/apis/kubeflow.org/v1alpha3/namespaces/anonymous/experiments/tfjob-example',
  'uid': 'e6d578d8-5e05-11ea-8d71-42010aa00012'},
 'spec': {'algorithm': {'algorithmName': 'random', 'algorithmSettings': [{}]},
  'maxFailedTrialCount': 3,
  'maxTrialCount': 12,
  'metricsCollectorSpec': {'collector': {'kind': 'TensorFlowEvent'},
   'source': {'fileSystemPath': {'kind': 'Directory',
     'path': '/var/log/katib/tfevent/'},
    'kind': '/train',
    'path': 'Directory'}},
  'objective': {'goal': 0.9999,
   'objectiveMetricName': 'accuracy_1',
   'type': 'maximize'},
  'parallelTrialCount': 4,
  'parameters': [{'feasibleSpace': {'max': '200', 'min': '100'},
    'name': '--batch_size',
    'parameterType': 'int'}],
  'trialTemplate': {'goTemplate': {'rawTemplate': 'apiVersion: "kubeflow.org/v1"\nkind: TFJob\nmetadata:\n  name: {{.Trial}}\n  namespace: {{.NameSpace}}\nspec:\n tfReplicaSpecs:\n  Worker:\n    replicas: 1\n    restartPolicy: OnFailure\n    template:\n      spec:\n        containers:\n          - name: tensorflow\n            image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\n            imagePullPolicy: Always\n            command:\n              - "python"\n              - "/var/tf_mnist/mnist_with_summaries.py"\n              - "--log_dir=/train/metrics"\n              {{- with .HyperParameters}}\n              {{- range .}}\n              - "{{.Name}}={{.Value}}"\n              {{- end}}\n              {{- end}}'}}}}

Get Single Experiment


In [9]:
kclient.get_experiment(name="tfjob-example", namespace=namespace)


Out[9]:
{'apiVersion': 'kubeflow.org/v1alpha3',
 'kind': 'Experiment',
 'metadata': {'creationTimestamp': '2020-03-04T10:50:02Z',
  'finalizers': ['update-prometheus-metrics'],
  'generation': 2,
  'name': 'tfjob-example',
  'namespace': 'anonymous',
  'resourceVersion': '7037043',
  'selfLink': '/apis/kubeflow.org/v1alpha3/namespaces/anonymous/experiments/tfjob-example',
  'uid': 'e6d578d8-5e05-11ea-8d71-42010aa00012'},
 'spec': {'algorithm': {'algorithmName': 'random', 'algorithmSettings': [{}]},
  'maxFailedTrialCount': 3,
  'maxTrialCount': 12,
  'metricsCollectorSpec': {'collector': {'kind': 'TensorFlowEvent'},
   'source': {'fileSystemPath': {'kind': 'Directory',
     'path': '/var/log/katib/tfevent/'}}},
  'objective': {'goal': 0.9999,
   'objectiveMetricName': 'accuracy_1',
   'type': 'maximize'},
  'parallelTrialCount': 4,
  'parameters': [{'feasibleSpace': {'max': '200', 'min': '100'},
    'name': '--batch_size',
    'parameterType': 'int'}],
  'trialTemplate': {'goTemplate': {'rawTemplate': 'apiVersion: "kubeflow.org/v1"\nkind: TFJob\nmetadata:\n  name: {{.Trial}}\n  namespace: {{.NameSpace}}\nspec:\n tfReplicaSpecs:\n  Worker:\n    replicas: 1\n    restartPolicy: OnFailure\n    template:\n      spec:\n        containers:\n          - name: tensorflow\n            image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\n            imagePullPolicy: Always\n            command:\n              - "python"\n              - "/var/tf_mnist/mnist_with_summaries.py"\n              - "--log_dir=/train/metrics"\n              {{- with .HyperParameters}}\n              {{- range .}}\n              - "{{.Name}}={{.Value}}"\n              {{- end}}\n              {{- end}}'}}},
 'status': {'completionTime': None,
  'conditions': [{'lastTransitionTime': '2020-03-04T10:50:02Z',
    'lastUpdateTime': '2020-03-04T10:50:02Z',
    'message': 'Experiment is created',
    'reason': 'ExperimentCreated',
    'status': 'True',
    'type': 'Created'}],
  'currentOptimalTrial': {'bestTrialName': '',
   'observation': {'metrics': None},
   'parameterAssignments': None},
  'startTime': '2020-03-04T10:50:02Z'}}

Get all Experiments


In [10]:
kclient.get_experiment(namespace=namespace)


Out[10]:
{'apiVersion': 'kubeflow.org/v1alpha3',
 'items': [{'apiVersion': 'kubeflow.org/v1alpha3',
   'kind': 'Experiment',
   'metadata': {'creationTimestamp': '2020-03-04T10:46:49Z',
    'finalizers': ['update-prometheus-metrics'],
    'generation': 1,
    'name': 'bayesianoptimization',
    'namespace': 'anonymous',
    'resourceVersion': '7036404',
    'selfLink': '/apis/kubeflow.org/v1alpha3/namespaces/anonymous/experiments/bayesianoptimization',
    'uid': '7360157b-5e05-11ea-8d71-42010aa00012'},
   'spec': {'algorithm': {'algorithmName': 'bayesianoptimization',
     'algorithmSettings': [{'name': 'random_state', 'value': '10'}]},
    'maxFailedTrialCount': 3,
    'maxTrialCount': 5,
    'metricsCollectorSpec': {'collector': {'kind': 'StdOut'}},
    'objective': {'goal': 0.9999,
     'objectiveMetricName': 'Validation-accuracy',
     'type': 'maximize'},
    'parallelTrialCount': 5,
    'parameters': [{'feasibleSpace': {'list': ['sgd', 'adam', 'ftrl']},
      'name': '--optimizer',
      'parameterType': 'categorical'},
     {'feasibleSpace': {'max': '0.03', 'min': '0.01'},
      'name': '--lr',
      'parameterType': 'double'},
     {'feasibleSpace': {'max': '5', 'min': '2'},
      'name': '--num-layers',
      'parameterType': 'int'}],
    'trialTemplate': {'goTemplate': {'rawTemplate': 'apiVersion: "batch/v1"\nkind: Job\nmetadata:\n  name: {{.Trial}}\n  namespace: {{.NameSpace}}\nspec:\n  template:\n    spec:\n      containers:\n      - name: {{.Trial}}\n        image: docker.io/kubeflowkatib/mxnet-mnist\n        command:\n        - "python3"\n        - "/opt/mxnet-mnist/mnist.py"\n        - "--batch-size=64"\n        {{- with .HyperParameters}}\n        {{- range .}}\n        - "{{.Name}}={{.Value}}"\n        {{- end}}\n        {{- end}}\n      restartPolicy: Never'}}},
   'status': {'conditions': [{'lastTransitionTime': '2020-03-04T10:46:49Z',
      'lastUpdateTime': '2020-03-04T10:46:49Z',
      'message': 'Experiment is created',
      'reason': 'ExperimentCreated',
      'status': 'True',
      'type': 'Created'},
     {'lastTransitionTime': '2020-03-04T10:47:03Z',
      'lastUpdateTime': '2020-03-04T10:47:03Z',
      'message': 'Experiment is running',
      'reason': 'ExperimentRunning',
      'status': 'True',
      'type': 'Running'}],
    'currentOptimalTrial': {'bestTrialName': 'bayesianoptimization-d5blztvn',
     'observation': {'metrics': [{'name': 'Validation-accuracy',
        'value': 0.978404}]},
     'parameterAssignments': [{'name': '--optimizer', 'value': 'sgd'},
      {'name': '--lr', 'value': '0.021196507397563884'},
      {'name': '--num-layers', 'value': '4'}]},
    'runningTrialList': ['bayesianoptimization-gs7b6g25'],
    'startTime': '2020-03-04T10:46:49Z',
    'succeededTrialList': ['bayesianoptimization-52tsscjq',
     'bayesianoptimization-65fnsn5t',
     'bayesianoptimization-d5blztvn',
     'bayesianoptimization-s8qwszvr'],
    'trials': 5,
    'trialsRunning': 1,
    'trialsSucceeded': 4}},
  {'apiVersion': 'kubeflow.org/v1alpha3',
   'kind': 'Experiment',
   'metadata': {'creationTimestamp': '2020-03-04T10:50:02Z',
    'finalizers': ['update-prometheus-metrics'],
    'generation': 2,
    'name': 'tfjob-example',
    'namespace': 'anonymous',
    'resourceVersion': '7037043',
    'selfLink': '/apis/kubeflow.org/v1alpha3/namespaces/anonymous/experiments/tfjob-example',
    'uid': 'e6d578d8-5e05-11ea-8d71-42010aa00012'},
   'spec': {'algorithm': {'algorithmName': 'random',
     'algorithmSettings': [{}]},
    'maxFailedTrialCount': 3,
    'maxTrialCount': 12,
    'metricsCollectorSpec': {'collector': {'kind': 'TensorFlowEvent'},
     'source': {'fileSystemPath': {'kind': 'Directory',
       'path': '/var/log/katib/tfevent/'}}},
    'objective': {'goal': 0.9999,
     'objectiveMetricName': 'accuracy_1',
     'type': 'maximize'},
    'parallelTrialCount': 4,
    'parameters': [{'feasibleSpace': {'max': '200', 'min': '100'},
      'name': '--batch_size',
      'parameterType': 'int'}],
    'trialTemplate': {'goTemplate': {'rawTemplate': 'apiVersion: "kubeflow.org/v1"\nkind: TFJob\nmetadata:\n  name: {{.Trial}}\n  namespace: {{.NameSpace}}\nspec:\n tfReplicaSpecs:\n  Worker:\n    replicas: 1\n    restartPolicy: OnFailure\n    template:\n      spec:\n        containers:\n          - name: tensorflow\n            image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\n            imagePullPolicy: Always\n            command:\n              - "python"\n              - "/var/tf_mnist/mnist_with_summaries.py"\n              - "--log_dir=/train/metrics"\n              {{- with .HyperParameters}}\n              {{- range .}}\n              - "{{.Name}}={{.Value}}"\n              {{- end}}\n              {{- end}}'}}},
   'status': {'completionTime': None,
    'conditions': [{'lastTransitionTime': '2020-03-04T10:50:02Z',
      'lastUpdateTime': '2020-03-04T10:50:02Z',
      'message': 'Experiment is created',
      'reason': 'ExperimentCreated',
      'status': 'True',
      'type': 'Created'}],
    'currentOptimalTrial': {'bestTrialName': '',
     'observation': {'metrics': None},
     'parameterAssignments': None},
    'startTime': '2020-03-04T10:50:02Z'}}],
 'kind': 'ExperimentList',
 'metadata': {'continue': '',
  'resourceVersion': '7037091',
  'selfLink': '/apis/kubeflow.org/v1alpha3/namespaces/anonymous/experiments'}}

Get experiment status


In [11]:
kclient.get_experiment_status(name="tfjob-example", namespace=namespace)


Out[11]:
'Created'

Is experiment succeeded


In [12]:
kclient.is_experiment_succeeded(name="tfjob-example", namespace=namespace)


Out[12]:
False

List Trials of a experiment


In [18]:
kclient.list_trials(name="tfjob-example", namespace=namespace)


Out[18]:
[{'name': 'tfjob-example-fl9h8gpv', 'status': 'Running'},
 {'name': 'tfjob-example-gg2wl8gl', 'status': 'Running'},
 {'name': 'tfjob-example-gh6s9htw', 'status': 'Created'},
 {'name': 'tfjob-example-w774npf9', 'status': 'Running'}]

List all Experiments


In [19]:
kclient.list_experiments(namespace=namespace)


Out[19]:
[{'name': 'bayesianoptimization', 'status': 'Succeeded'},
 {'name': 'tfjob-example', 'status': 'Running'}]

Get Optimal Hyperparameter


In [ ]:
kclient.get_optimal_hyperparmeters(name="tfjob-example", namespace=namespace)

Delete experiment


In [13]:
kclient.delete_experiment(name="tfjob-example", namespace=namespace)