In [ ]:
# Copyright 2019 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations that obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks.
The original paper can be found here: https://arxiv.org/abs/1810.04805.
NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and Tensor Cores on V100 GPUs for faster training times while maintaining target accuracy.
Here we run QA fine-tuning on a pre-trained BERT model. For fine-tuning we will use the SQuAD 1.1 dataset, which contains 100,000+ question-answer pairs on 500+ articles.
In [ ]:
import os
import sys
data_dir = '/workspace/bert/data/download'
# SQuAD json for training
train_file = os.path.join(data_dir, 'squad/v1.1/train-v1.1.json')
# json for inference
predict_file = os.path.join(data_dir, 'squad/v1.1/dev-v1.1.json')
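As a quick sanity check (assuming the SQuAD v1.1 files have already been placed under `data_dir` by the repository's download scripts), we can peek at the nested structure of the training JSON: each article contains paragraphs, and each paragraph pairs a context passage with its question-answer entries.
In [ ]:
import json
# Inspect the first question-answer pair of the first article in train-v1.1.json.
with open(train_file) as f:
    squad = json.load(f)
article = squad['data'][0]
paragraph = article['paragraphs'][0]
qa = paragraph['qas'][0]
print('Title:   ', article['title'])
print('Context: ', paragraph['context'][:200], '...')
print('Question:', qa['question'])
print('Answer:  ', qa['answers'][0]['text'])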
Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single precision to retain as much information as possible in critical parts of the network. Since the introduction of Tensor Cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.
For more information about how to train using mixed precision, see the Mixed Precision Training paper and NVIDIA's Training With Mixed Precision documentation.
In this notebook we control mixed precision execution with the following flag:
In [ ]:
use_fp16 = True
import os
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" if use_fp16 else "0"
# For detailed debugging, uncomment the following line:
#os.environ["TF_CPP_VMODULE"]="auto_mixed_precision=2"
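On TensorFlow releases that expose it (1.14 and later), the same automatic mixed precision graph rewrite can also be enabled programmatically by wrapping the optimizer. The notebook itself relies on the environment variable above; the helper below is only an illustrative sketch of that alternative.
In [ ]:
import tensorflow as tf

def wrap_with_amp(optimizer, enable=use_fp16):
    # Illustrative alternative to the env var: wrap an optimizer with TF's
    # automatic mixed precision graph rewrite (requires TF 1.14+).
    if enable:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
    return optimizer

# Example usage (not part of this notebook's training path):
# opt = wrap_with_amp(tf.train.AdamOptimizer(5e-6))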
Based on the model size, we have the following two default configurations of BERT.
Model | Encoder layers | Hidden unit size | Attention heads | Feed-forward filter size | Max sequence length | Parameters
---|---|---|---|---|---|---
BERT-Base | 12 | 768 | 12 | 4 x 768 | 512 | 110M
BERT-Large | 24 | 1024 | 16 | 4 x 1024 | 512 | 330M
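As a rough cross-check of the parameter counts in the table, here is a back-of-the-envelope estimate. It is only a sketch: it assumes the standard 30,522-token WordPiece vocabulary and ignores biases, layer norms and the pooler, so it comes in slightly under the exact totals.
In [ ]:
# Approximate parameter count: embeddings + L encoder layers
# (attention projections + feed-forward); biases and layer norms ignored.
def approx_bert_params(num_layers, hidden_size, vocab_size=30522, max_position=512):
    embeddings = (vocab_size + max_position + 2) * hidden_size              # token + position + segment
    per_layer = 4 * hidden_size ** 2 + 2 * hidden_size * (4 * hidden_size)  # attention + feed-forward
    return embeddings + num_layers * per_layer

print('BERT-Base  ~ %dM parameters' % (approx_bert_params(12, 768) // 10**6))    # ~108M
print('BERT-Large ~ %dM parameters' % (approx_bert_params(24, 1024) // 10**6))   # ~333M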
We will use the large pre-trained model available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com). There are many configurations available; in particular we will download and use the following:
bert_tf_large_fp16_384
which is pre-trained on the Wikipedia and BookCorpus datasets. We will fine-tune it on the SQuAD 1.1 dataset.
Let's create the folder for the pre-trained model and download it from NGC:
In [ ]:
# bert_tf_large_fp16_384
DATA_DIR_FP16 = '/workspace/bert/data/download/pretrained_model_fp16'
!mkdir -p $DATA_DIR_FP16
!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_for_tensorflow.zip \
https://api.ngc.nvidia.com/v2/models/nvidia/bert_for_tensorflow/versions/1/zip
!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_for_tensorflow.zip
In the code that follows we will refer to this model.
In [ ]:
notebooks_dir = '/workspace/bert/notebooks'
working_dir = '/workspace/bert'
if working_dir not in sys.path:
    sys.path.append(working_dir)
init_checkpoint = os.path.join(data_dir, 'pretrained_model_fp16/model.ckpt-1000000')
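Optionally, we can confirm that the checkpoint was unpacked where `init_checkpoint` expects it by listing a few of its variables (a quick sanity check, assuming the NGC zip above extracted the `model.ckpt-1000000` files into `pretrained_model_fp16`).
In [ ]:
import tensorflow as tf
# List a handful of variable names and shapes stored in the checkpoint.
for name, shape in tf.train.list_variables(init_checkpoint)[:5]:
    print(name, shape)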
In [ ]:
import run_squad
import json
import tensorflow as tf
import modeling
import tokenization
import time
import random
import optimization
tf.logging.set_verbosity(tf.logging.INFO)
# Create the output directory where all the results are saved.
output_dir = os.path.join(working_dir, 'results')
tf.gfile.MakeDirs(output_dir)
# The config json file corresponding to the pre-trained BERT model.
# This specifies the model architecture.
bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')
# The vocabulary file that the BERT model was trained on.
vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')
# Whether to lower case the input text.
# Should be True for uncased models and False for cased models.
do_lower_case = True
# Total batch size for predictions
predict_batch_size = 1
params = dict([('batch_size', predict_batch_size)])
# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
max_seq_length = 384
# When splitting up a long document into chunks, how much stride to take between chunks.
doc_stride = 128
# The maximum number of tokens for the question.
# Questions longer than this will be truncated to this length.
max_query_length = 64
# This is a workaround to be able to use tf.flags from within a notebook:
flags = tf.flags
if 'f' not in tf.flags.FLAGS:
    tf.app.flags.DEFINE_string('f', '', 'kernel')
FLAGS = flags.FLAGS
# FLAGS.verbose_logging = True
# The total number of n-best predictions to generate in the nbest_predictions.json output file.
n_best_size = 20
# The maximum length of an answer that can be generated.
# This is needed because the start and end predictions are not conditioned on one another.
max_answer_length = 30
# The initial learning rate for Adam
learning_rate = 5e-6
# Total batch size for training
train_batch_size = 3
# Proportion of training to perform linear learning rate warmup for
warmup_proportion = 0.1
# Total number of training epochs to perform (results will improve if trained for more epochs)
num_train_epochs = 2
global_batch_size = train_batch_size
training_hooks = []
training_hooks.append(run_squad.LogTrainRunHook(global_batch_size, 0))
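The `doc_stride` setting above controls the sliding window used to split contexts that are longer than `max_seq_length`. Here is a minimal sketch of the idea on a plain token list; the real `run_squad` code additionally reserves room for the question and the `[CLS]`/`[SEP]` special tokens.
In [ ]:
# Simplified sliding-window split: consecutive chunks start doc_stride tokens
# apart, so they overlap by (max_seq_length - doc_stride) tokens.
def split_into_chunks(tokens, chunk_length, stride):
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tokens[start:start + chunk_length])
        if start + chunk_length >= len(tokens):
            break
        start += stride
    return chunks

example_tokens = ['tok%d' % i for i in range(1000)]   # made-up document
chunks = split_into_chunks(example_tokens, max_seq_length, doc_stride)
print('%d chunks, overlapping by %d tokens' % (len(chunks), max_seq_length - doc_stride))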
Let's create the tokenizer and the training tf_record:
In [ ]:
# Validate the casing config consistency with the checkpoint name.
tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)
# Create the tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Load the configuration from file
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
config = tf.ConfigProto(log_device_placement=True)
run_config = tf.estimator.RunConfig(
    model_dir=output_dir,
    session_config=config,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=1)
# Read the training examples from the training file:
train_examples = run_squad.read_squad_examples(input_file=train_file, is_training=True)
num_train_steps = int(len(train_examples) / global_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
# Pre-shuffle the input to avoid having to make a very large shuffle
# buffer in the `input_fn`.
rng = random.Random(12345)
rng.shuffle(train_examples)
start_index = 0
end_index = len(train_examples)
tmp_filenames = os.path.join(output_dir, "train.tf_record")
# We write to a temporary file to avoid storing very large constant tensors
# in memory.
train_writer = run_squad.FeatureWriter(
    filename=tmp_filenames,
    is_training=True)
run_squad.convert_examples_to_features(
    examples=train_examples[start_index:end_index],
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=True,
    output_fn=train_writer.process_feature)
train_writer.close()
tf.logging.info("***** Running training *****")
tf.logging.info(" Num orig examples = %d", end_index - start_index)
tf.logging.info(" Num split examples = %d", train_writer.num_features)
tf.logging.info(" Batch size = %d", train_batch_size)
tf.logging.info(" Num steps = %d", num_train_steps)
tf.logging.info(" LR = %f", learning_rate)
del train_examples
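With the tokenizer created, we can also see WordPiece tokenization in action on an arbitrary sentence (the sentence below is made up, purely for illustration):
In [ ]:
# Out-of-vocabulary words are split into sub-word pieces prefixed with '##'.
sample_sentence = "BERT fine-tuning on SQuAD is straightforward."
sample_tokens = tokenizer.tokenize(sample_sentence)
print(sample_tokens)
print(tokenizer.convert_tokens_to_ids(sample_tokens))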
We need to define the model function for the Estimator:
In [ ]:
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    (start_logits, end_logits) = run_squad.create_model(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        use_one_hot_embeddings=False)
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
        (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        seq_length = modeling.get_shape_list(input_ids)[1]
        def compute_loss(logits, positions):
            one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            loss = -tf.reduce_mean(tf.reduce_sum(one_hot_positions * log_probs, axis=-1))
            return loss
        start_positions = features["start_positions"]
        end_positions = features["end_positions"]
        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2.0
        train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, None, False, use_fp16)
        output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "unique_ids": unique_ids,
            "start_logits": start_logits,
            "end_logits": end_logits,
        }
        output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    return output_spec

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params=params)
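To make the loss computed inside `model_fn` concrete, here is a tiny NumPy restatement of the same start/end cross-entropy on made-up logits (values are arbitrary and for illustration only):
In [ ]:
import numpy as np

# Same idea as compute_loss: cross-entropy between the softmax over all
# sequence positions and the one-hot gold start/end position, averaged.
def span_loss(logits, gold_position):
    log_probs = logits - np.log(np.sum(np.exp(logits)))   # log-softmax
    return -log_probs[gold_position]

start_logits_toy = np.array([0.1, 2.0, 0.3, -1.0])   # arbitrary values
end_logits_toy = np.array([0.0, 0.5, 3.0, 0.2])
toy_total_loss = (span_loss(start_logits_toy, 1) + span_loss(end_logits_toy, 2)) / 2.0
print('toy span loss:', toy_total_loss)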
In [ ]:
train_input_fn = run_squad.input_fn_builder(
    input_file=tmp_filenames,
    batch_size=train_batch_size,
    seq_length=max_seq_length,
    is_training=True,
    drop_remainder=True,
    hvd=None)
train_start_time = time.time()
estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=num_train_steps)
train_time_elapsed = time.time() - train_start_time
train_time_wo_startup = training_hooks[-1].total_time
avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_wo_startup if train_time_wo_startup else 0
tf.logging.info("-----------------------------")
tf.logging.info("Total Training Time = %0.2f Training Time W/O start up overhead = %0.2f "
"Sentences processed = %d", train_time_elapsed, train_time_wo_startup,
num_train_steps * global_batch_size)
tf.logging.info("Training Performance = %0.4f sentences/sec", avg_sentences_per_second)
tf.logging.info("-----------------------------")
In [ ]:
eval_examples = run_squad.read_squad_examples(
    input_file=predict_file, is_training=False)
eval_writer = run_squad.FeatureWriter(
    filename=os.path.join(output_dir, "eval.tf_record"),
    is_training=False)
eval_features = []

def append_feature(feature):
    eval_features.append(feature)
    eval_writer.process_feature(feature)

# Loads a data file into a list of InputBatch's
run_squad.convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=False,
    output_fn=append_feature)
eval_writer.close()
tf.logging.info("***** Running predictions *****")
tf.logging.info(" Num orig examples = %d", len(eval_examples))
tf.logging.info(" Num split examples = %d", len(eval_features))
tf.logging.info(" Batch size = %d", predict_batch_size)
predict_input_fn = run_squad.input_fn_builder(
    input_file=eval_writer.filename,
    batch_size=predict_batch_size,
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=False)
all_results = []
eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]
eval_start_time = time.time()
for result in estimator.predict(
        predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=None):
    unique_id = int(result["unique_ids"])
    start_logits = [float(x) for x in result["start_logits"].flat]
    end_logits = [float(x) for x in result["end_logits"].flat]
    all_results.append(
        run_squad.RawResult(
            unique_id=unique_id,
            start_logits=start_logits,
            end_logits=end_logits))
eval_time_elapsed = time.time() - eval_start_time
eval_time_wo_startup = eval_hooks[-1].total_time
num_sentences = eval_hooks[-1].count * predict_batch_size
avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup
tf.logging.info("-----------------------------")
tf.logging.info("Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f "
"Sentences processed = %d", eval_time_elapsed, eval_time_wo_startup,
num_sentences)
tf.logging.info("Inference Performance = %0.4f sentences/sec", avg_sentences_per_second)
tf.logging.info("-----------------------------")
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")
run_squad.write_predictions(eval_examples, eval_features, all_results,
                            n_best_size, max_answer_length,
                            do_lower_case, output_prediction_file,
                            output_nbest_file, output_null_log_odds_file)
tf.logging.info("Inference Results:")
# Here we show only the prediction results, nbest prediction is also available in the output directory
results = ""
with open(output_prediction_file, 'r') as json_file:
    data = json.load(json_file)
    for question in eval_examples:
        results += "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(question.qas_id, question.question_text, data[question.qas_id])
from IPython.display import display, HTML
display(HTML("<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>".format(results)))
In [ ]:
!python /workspace/bert/data/download/squad/v1.1/evaluate-v1.1.py \
$predict_file \
$output_dir/predictions.json
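The official `evaluate-v1.1.py` script prints a JSON object containing the exact match and F1 scores. If you want those numbers programmatically, a small sketch (assuming that output format) is:
In [ ]:
import json, subprocess

# Capture the evaluation script's stdout and parse the EM / F1 scores.
eval_output = subprocess.check_output([
    sys.executable, '/workspace/bert/data/download/squad/v1.1/evaluate-v1.1.py',
    predict_file, os.path.join(output_dir, 'predictions.json')])
scores = json.loads(eval_output.decode())
print('Exact match: %.2f   F1: %.2f' % (scores['exact_match'], scores['f1']))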