Single host, multiple GPUs (8 × K80)


In [2]:
%%bash
# Submit a Cloud ML Engine training job (single host, 8 GPUs).
# Fill in PROJECT_ID / BUCKET_ID / REGION before running.
set -euo pipefail

PROJECT_ID="YOUR-PROJECT-ID"
BUCKET_ID="YOUR-BUCKET-ID"
REGION="YOUR-REGION"
# https://cloud.google.com/ml-engine/docs/tensorflow/regions

TRAINER_PACKAGE_PATH="$(pwd)/project/trainer"
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="sample_model_${now}"          # must be unique per submission
MAIN_TRAINER_MODULE="trainer.task"
# NOTE(review): JOB_DIR was previously assigned twice; only the second value
# was ever used. Keep the effective one.
JOB_DIR="gs://${BUCKET_ID}/sample_model_job_dir"
# NOTE(review): unused — pass via --staging-bucket if staging is needed.
PACKAGE_STAGING_PATH="gs://${BUCKET_ID}/staging"
# NOTE(review): unused here; BASIC has no GPUs, so the 8-GPU machine config
# must come from config.yaml (trainingInput.scaleTier: CUSTOM etc.) — confirm.
SCALE_TIER=BASIC
RUNTIME_VERSION="1.11"
# https://cloud.google.com/ml-engine/docs/tensorflow/runtime-version-list

# Everything after the bare `--` is forwarded to trainer.task as user args.
gcloud ml-engine jobs submit training "$JOB_NAME" \
  --package-path "$TRAINER_PACKAGE_PATH" \
  --module-name "$MAIN_TRAINER_MODULE" \
  --job-dir "$JOB_DIR" \
  --project "$PROJECT_ID" \
  --region "$REGION" \
  --runtime-version "$RUNTIME_VERSION" \
  --config config.yaml \
  -- \
  --train_data_pattern "gs://${BUCKET_ID}/data/cifar10_data_00*" \
  --eval_data_pattern "gs://${BUCKET_ID}/data/cifar10_data_01*" \
  --max_steps 10000 \
  --num_gpus 8 \
  --output_dir "gs://${BUCKET_ID}/model"


jobId: sample_model_20181221_112757
state: QUEUED
Job [sample_model_20181221_112757] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe sample_model_20181221_112757

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs sample_model_20181221_112757