In [0]:
# @title Copyright 2020 The ALBERT Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
ALBERT is "A Lite" version of BERT, a popular unsupervised language representation learning algorithm. ALBERT uses parameter-reduction techniques that allow for large-scale configurations, overcome previous memory limitations, and achieve better behavior with respect to model degradation.
For a technical description of the algorithm, see our paper:
https://arxiv.org/abs/1909.11942
Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut
This Colab demonstrates using a free Colab Cloud TPU to fine-tune GLUE tasks built on top of pretrained ALBERT models and to run predictions on the fine-tuned model. The Colab demonstrates loading pretrained ALBERT models from both TF Hub and checkpoints.
Note: You will need a GCP (Google Compute Engine) account and a GCS (Google Cloud Storage) bucket for this Colab to run.
Please follow the Google Cloud TPU quickstart for how to create a GCP account and a GCS bucket. You have $300 of free credit to get started with any GCP product. You can learn more about Cloud TPU at https://cloud.google.com/tpu/docs.
This notebook is hosted on GitHub. To view it in its original repository, after opening the notebook, select File > View on GitHub.
Create a Cloud Storage bucket for your TensorBoard logs at http://console.cloud.google.com/storage and fill in the BUCKET parameter in the "Parameters" section below.
On the main menu, click Runtime and select Change runtime type. Set "TPU" as the hardware accelerator.
In [0]:
# TODO(lanzhzh): Add support for 2.x.
%tensorflow_version 1.x
import os
import pprint
import json
import tensorflow as tf
assert "COLAB_TPU_ADDR" in os.environ, "ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!"
TPU_ADDRESS = "grpc://" + os.environ["COLAB_TPU_ADDR"]
TPU_TOPOLOGY = "2x2"
print("TPU address is", TPU_ADDRESS)
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
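The cell above targets TF 1.x (note the TODO about 2.x support). As a rough orientation only, a TF 2.x equivalent of the TPU initialization would look roughly like the sketch below; the resolver and strategy names are standard TF 2.x APIs, but this notebook has not been ported to 2.x, so treat it as a sketch rather than a drop-in replacement.
In [0]:
# Sketch only: approximate TF 2.x equivalent of the TPU setup above.
# Guarded so it is skipped on the TF 1.x runtime this notebook actually uses.
if tf.__version__.startswith("2."):
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")  # auto-detects the Colab TPU
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.experimental.TPUStrategy(resolver)
  print("TPU devices:", tf.config.list_logical_devices("TPU"))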
In [0]:
#TODO(lanzhzh): Add pip support
import sys
!test -d albert || git clone https://github.com/google-research/albert albert
if 'albert' not in sys.path:
  sys.path += ['albert']
!pip install sentencepiece
In [0]:
# Please find the full list of tasks and their fine-tuning hyperparameters
# here https://github.com/google-research/albert/blob/master/run_glue.sh
BUCKET = "albert_tutorial_glue" #@param { type: "string" }
TASK = 'MRPC' #@param {type:"string"}
# Available pretrained model checkpoints:
# base, large, xlarge, xxlarge
ALBERT_MODEL = 'base' #@param {type:"string"}
TASK_DATA_DIR = 'glue_data'
BASE_DIR = "gs://" + BUCKET
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BUCKET.")
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
OUTPUT_DIR = 'gs://{}/albert-tfhub/models/{}'.format(BUCKET, TASK)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
# Download glue data.
! test -d download_glue_repo || git clone https://gist.github.com/60c2bdb54d156a41194446737ce03e2e.git download_glue_repo
!python download_glue_repo/download_glue_data.py --data_dir=$TASK_DATA_DIR --tasks=$TASK
print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))
ALBERT_MODEL_HUB = 'https://tfhub.dev/google/albert_' + ALBERT_MODEL + '/3'
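As an optional sanity check, you can load the TF Hub handle defined above directly with tensorflow_hub and list its signatures. This is just an illustration of the "from TF Hub" loading path mentioned earlier; the run_classifier script below does the actual loading itself.
In [0]:
# Optional sanity check: load the ALBERT module from TF Hub and list its signatures.
# This only illustrates the TF Hub loading path; run_classifier below loads the module itself.
import tensorflow_hub as hub

albert_module = hub.Module(ALBERT_MODEL_HUB)
print("Signatures:", albert_module.get_signature_names())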
Now let's run the fine-tuning script. If you use the default MRPC task, this should finish in around 10 minutes and you should get an accuracy of around 86.5.
In [0]:
os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR
!python -m albert.run_classifier \
--data_dir="glue_data/" \
--output_dir=$OUTPUT_DIR \
--albert_hub_module_handle=$ALBERT_MODEL_HUB \
--spm_model_file="from_tf_hub" \
--do_train=True \
--do_eval=True \
--do_predict=False \
--max_seq_length=512 \
--optimizer=adamw \
--task_name=$TASK \
--warmup_step=200 \
--learning_rate=2e-5 \
--train_step=800 \
--save_checkpoints_steps=100 \
--train_batch_size=32 \
--tpu_name=$TPU_ADDRESS \
--use_tpu=True
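Once fine-tuning and evaluation finish, the metrics are written to the output directory on GCS. Assuming the script writes an eval_results.txt file there (the BERT-style run_classifier convention), you can print it with a sketch like the one below; adjust the filename if your run produces a different one.
In [0]:
# Sketch: print the evaluation metrics written by run_classifier.
# Assumes an eval_results.txt file in OUTPUT_DIR (BERT-style convention); adjust if needed.
eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
if tf.gfile.Exists(eval_file):
  with tf.gfile.GFile(eval_file, "r") as f:
    print(f.read())
else:
  print("No eval_results.txt found in", OUTPUT_DIR)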