This notebook illustrates how to use the tensor2tensor library to carry out from-scratch, distributed training of an English-German translator. The trained model is then deployed to Cloud ML Engine and used to translate new pieces of text.
In [ ]:
%bash
pip install tensor2tensor
In [8]:
import os
PROJECT = 'cloud-training-demos' # REPLACE WITH YOUR PROJECT ID
BUCKET = 'cloud-training-demos-ml' # REPLACE WITH YOUR BUCKET NAME
REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1
# this is what this notebook is demonstrating
PROBLEM = 'my_translate_problem'
# for bash
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION
os.environ['PROBLEM'] = PROBLEM
In [ ]:
! gcloud config set project $PROJECT
In [ ]:
%bash
wget http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
wget http://data.statmt.org/wmt17/translation-task/dev.tgz
In [2]:
!ls *.tgz
In [27]:
%bash
rm -rf t2t
mkdir -p t2t/ende
In [28]:
!pwd
In [29]:
%%writefile t2t/ende/problem.py
import tensorflow as tf
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import translate
from tensor2tensor.utils import registry

#TOPDIR="gs://{}/translate_ende/".format("BUCKET_NAME")
TOPDIR = "file:///content/t2t"  # Make sure this matches the !pwd above

_ENDE_TRAIN_DATASETS = [
    [
        "{}/training-parallel-nc-v12.tgz".format(TOPDIR),
        ("training/news-commentary-v12.de-en.en",
         "training/news-commentary-v12.de-en.de")
    ],
]
_ENDE_TEST_DATASETS = [
    [
        "{}/dev.tgz".format(TOPDIR),
        ("dev/newstest2013.en", "dev/newstest2013.de")
    ],
]

@registry.register_problem
class MyTranslateProblem(translate.TranslateProblem):
  @property
  def targeted_vocab_size(self):
    return 2**13  # 8192

  @property
  def vocab_name(self):
    return "vocab.english_to_german"

  def generator(self, data_dir, tmp_dir, train):
    symbolizer_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
        sources=_ENDE_TRAIN_DATASETS)
    datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
    tag = "train" if train else "dev"
    data_path = translate.compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
    return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                     symbolizer_vocab, text_encoder.EOS_ID)

  @property
  def input_space_id(self):
    return problem.SpaceID.EN_TOK

  @property
  def target_space_id(self):
    return problem.SpaceID.DE_TOK
In [30]:
%%writefile t2t/ende/__init__.py
from . import problem
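As a quick sanity check (a hypothetical cell, not part of the training pipeline), we can confirm that importing the package registers the problem under the snake_case name derived from the MyTranslateProblem class name; this assumes registry.list_problems() is available in your tensor2tensor version:
In [ ]:
import sys
sys.path.append('t2t')  # make the `ende` package importable from the notebook
from ende import problem  # importing triggers @registry.register_problem
from tensor2tensor.utils import registry
print('my_translate_problem' in registry.list_problems())  # expect: True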
In [31]:
%%writefile t2t/setup.py
from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = [
    'tensor2tensor'
]

setup(
    name='ende',
    version='0.1',
    author='Google',
    author_email='training-feedback@cloud.google.com',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='My Translate Problem',
    requires=[]
)
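When we submit the training job later, gcloud stages and builds this package for us, but as an optional local check you can verify that setup.py produces a source distribution:
In [ ]:
%bash
cd t2t && python setup.py sdist   # writes ende-0.1.tar.gz under t2t/dist/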
In [ ]:
%bash
DATA_DIR=./t2t_data
TMP_DIR=$DATA_DIR/tmp
rm -rf $DATA_DIR $TMP_DIR
mkdir -p $DATA_DIR $TMP_DIR
# Generate data
t2t-datagen \
--t2t_usr_dir=./t2t/ende \
--problem=$PROBLEM \
--data_dir=$DATA_DIR \
--tmp_dir=$TMP_DIR
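t2t-datagen downloads the tarballs into TMP_DIR, builds the subword vocabulary, and writes sharded TFRecord files for the train and dev splits into DATA_DIR. A quick listing (paths as defined above) shows what was generated:
In [ ]:
!ls t2t_data | head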
In [ ]:
%bash
DATA_DIR=./t2t_data
gsutil -m rm -r gs://${BUCKET}/translate_ende/
gsutil -m cp ${DATA_DIR}/${PROBLEM}* ${DATA_DIR}/vocab* gs://${BUCKET}/translate_ende/data
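Before granting Cloud ML Engine access, you can verify that the shards and vocabulary actually landed in the bucket:
In [ ]:
!gsutil ls gs://${BUCKET}/translate_ende/data | head -5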
In [ ]:
%bash
PROJECT_ID=$PROJECT
AUTH_TOKEN=$(gcloud auth print-access-token)
SVC_ACCOUNT=$(curl -X GET -H "Content-Type: application/json" \
-H "Authorization: Bearer $AUTH_TOKEN" \
https://ml.googleapis.com/v1/projects/${PROJECT_ID}:getConfig \
| python -c "import json; import sys; response = json.load(sys.stdin); \
print(response['serviceAccount'])")
echo "Authorizing the Cloud ML Service account $SVC_ACCOUNT to access files in $BUCKET"
gsutil -m defacl ch -u $SVC_ACCOUNT:R gs://$BUCKET
gsutil -m acl ch -u $SVC_ACCOUNT:R -r gs://$BUCKET # error message (if bucket is empty) can be ignored
gsutil -m acl ch -u $SVC_ACCOUNT:W gs://$BUCKET
In [ ]:
%bash
wget https://raw.githubusercontent.com/tensorflow/tensor2tensor/master/tensor2tensor/bin/t2t-trainer
mv t2t-trainer t2t/ende/t2t-trainer.py
In [38]:
!touch t2t/__init__.py
In [39]:
!find t2t
Let's test that the Python package works. Since we are running locally, we'll try it out on a subset of the original data.
In [ ]:
%bash
BASE=gs://${BUCKET}/translate_ende/data
OUTDIR=gs://${BUCKET}/translate_ende/subset
gsutil -m rm -r $OUTDIR
gsutil -m cp \
${BASE}/${PROBLEM}-train-0008* \
${BASE}/${PROBLEM}-dev-00000* \
${BASE}/vocab* \
$OUTDIR
In [ ]:
%bash
OUTDIR=./trained_model
rm -rf $OUTDIR
export PYTHONPATH=${PYTHONPATH}:${PWD}/t2t
python -m ende.t2t-trainer \
--data_dir=gs://${BUCKET}/translate_ende/subset \
--problems=$PROBLEM \
--model=transformer \
--hparams_set=transformer_base_single_gpu \
--output_dir=$OUTDIR --job-dir=$OUTDIR
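To inspect the loss curve of this local smoke test, you can point TensorBoard at the output directory; a minimal sketch, assuming TensorBoard is installed alongside TensorFlow on the notebook host:
In [ ]:
%bash
nohup tensorboard --logdir=./trained_model --port=8080 > tb.log 2>&1 &
echo "TensorBoard started; browse to port 8080 on the notebook host"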
In [ ]:
%bash
OUTDIR=gs://${BUCKET}/translate_ende/model
JOBNAME=t2t_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
--region=$REGION \
--staging-bucket=gs://$BUCKET \
--scale-tier=BASIC_GPU \
--module-name=ende.t2t-trainer \
--package-path=${PWD}/t2t/ende \
--job-dir=$OUTDIR \
--runtime-version=1.4 \
-- \
--data_dir=gs://${BUCKET}/translate_ende/data \
--problems=my_translate_problem \
--model=transformer \
--hparams_set=transformer_base_single_gpu \
--output_dir=$OUTDIR
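The submission cell returns as soon as the job is queued; training itself runs asynchronously on Cloud ML Engine. You can follow the job with gcloud, substituting the JOBNAME echoed above (shown here as a placeholder):
In [ ]:
%bash
# Replace <JOBNAME> with the job name echoed by the submission cell.
gcloud ml-engine jobs describe <JOBNAME>
# Or stream the training logs as they arrive:
# gcloud ml-engine jobs stream-logs <JOBNAME>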
Copyright 2018 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.