In [ ]:
# Copyright 2019 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks.
The original paper can be found here: https://arxiv.org/abs/1810.04805.
We can run inference on a fine-tuned BERT model for tasks like Question Answering.
Here we use a BERT model fine-tuned on the SQuAD 2.0 dataset, which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions.
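For reference, each question/paragraph pair is packed into a single input sequence before inference (this is the standard SQuAD-style packing; dp.convert_examples_to_features below builds the actual tensors):

tokens:       [CLS] q1 ... qM [SEP] d1 ... dN [SEP] [PAD] ...
segment_ids:    0   0 ...  0    0    1 ...  1    1     0  ...
input_mask:     1   1 ...  1    1    1 ...  1    1     0  ...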
In [ ]:
paragraph_text = "The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space
Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spac
ecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on t
he Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Proje
ct Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 t
o 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehi
cles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and th
e Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975."
In [ ]:
question_text = "What project put the first Americans into space?"
#question_text = "What year did the first manned Apollo flight occur?"
#question_text = "What President is credited with the original notion of putting Americans in space?"
#question_text = "Who did the U.S. collaborate with on an Earth orbit mission in 1975?"
In this example we ask our BERT model questions related to the following paragraph:
The Apollo Program
"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975."
The questions and the answers we expect are shown below:
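(Answers are taken directly from the paragraph above.)

Q: What project put the first Americans into space? -> Project Mercury
Q: What year did the first manned Apollo flight occur? -> 1968
Q: What President is credited with the original notion of putting Americans in space? -> Dwight D. Eisenhower
Q: Who did the U.S. collaborate with on an Earth orbit mission in 1975? -> The Soviet Union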
In [ ]:
import data_processing as dp
import tokenization
#Large
#tokenizer = tokenization.FullTokenizer(vocab_file="./data/uncased_L-24_H-1024_A-16/vocab.txt", do_lower_case=True)
#Base
tokenizer = tokenization.FullTokenizer(vocab_file="./data/uncased_L-12_H-768_A-12/vocab.txt", do_lower_case=True)
# The maximum number of tokens for the question. Questions longer than this will be truncated to this length.
max_query_length = 64
# When splitting up a long document into chunks, how much stride to take between chunks.
doc_stride = 128
# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter
# than this will be padded.
max_seq_length = 384
# Extract tokens from the paragraph
doc_tokens = dp.convert_doc_tokens(paragraph_text)
# Extract features from the paragraph and question
features = dp.convert_examples_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length)
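To see what doc_stride does, here is a minimal sketch of the sliding-window chunking that BERT-style SQuAD pipelines use for passages longer than max_seq_length (illustrative only; dp.convert_examples_to_features implements the real logic):
In [ ]:
# Sketch: split num_tokens paragraph tokens into overlapping windows.
# max_tokens_for_doc leaves room for the question, [CLS] and two [SEP] tokens.
def doc_spans(num_tokens, max_tokens_for_doc, doc_stride):
    spans = []
    start = 0
    while start < num_tokens:
        length = min(num_tokens - start, max_tokens_for_doc)
        spans.append((start, length))
        if start + length == num_tokens:
            break
        start += min(length, doc_stride)
    return spans

# e.g. a 1000-token paragraph with the settings above yields overlapping
# (start, length) windows 128 tokens apart:
print(doc_spans(1000, max_seq_length - max_query_length - 3, doc_stride))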
In [ ]:
import tensorrt as trt
# Logger used by the TensorRT runtime and engine below
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
In [ ]:
import ctypes
# Load the TensorRT plugin library and the custom BERT plugins with
# RTLD_GLOBAL so TensorRT can resolve the plugin symbols at runtime.
nvinfer = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
cm = ctypes.CDLL("./build/libcommon.so", mode=ctypes.RTLD_GLOBAL)
pg = ctypes.CDLL("./build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL)
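If you want to confirm the custom plugins registered correctly, the TensorRT plugin registry can be inspected (an optional check, assuming the libraries above register their plugin creators on load):
In [ ]:
# Optional sanity check: list the plugin creators TensorRT can now see.
registry = trt.get_plugin_registry()
for creator in registry.plugin_creator_list:
    print(creator.name, creator.plugin_version)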
In [ ]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time
# For this example we are going to use batch size 1
max_batch_size = 1
# Load the BERT Engine
# with open("./bert_python.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
with open("./bert_python_base.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
print("List engine binding:")
for binding in engine:
print(" - {}: {}, Shape {}, {}".format(
"Input" if engine.binding_is_input(binding) else "Output",
binding,
engine.get_binding_shape(binding),
engine.get_binding_dtype(binding)))
# Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
shape_input_0 = (max_batch_size,) + tuple(engine.get_binding_shape(0))
shape_input_1 = (max_batch_size,) + tuple(engine.get_binding_shape(1))
shape_input_2 = (max_batch_size,) + tuple(engine.get_binding_shape(2))
h_input_0 = cuda.pagelocked_empty(shape_input_0, dtype=np.int32)
h_input_1 = cuda.pagelocked_empty(shape_input_1, dtype=np.int32)
h_input_2 = cuda.pagelocked_empty(shape_input_2, dtype=np.int32)
shape_output = (max_batch_size,) + tuple(engine.get_binding_shape(3))
h_output = cuda.pagelocked_empty(shape_output, dtype=np.float32)
# Allocate device memory for inputs and outputs.
d_input_0 = cuda.mem_alloc(h_input_0.nbytes)
d_input_1 = cuda.mem_alloc(h_input_1.nbytes)
d_input_2 = cuda.mem_alloc(h_input_2.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()
print("\nRunning Inference...")
with engine.create_execution_context() as context:
    eval_start_time = time.time()
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input_0, features["input_ids"], stream)
    cuda.memcpy_htod_async(d_input_1, features["segment_ids"], stream)
    cuda.memcpy_htod_async(d_input_2, features["input_mask"], stream)
    # Run inference.
    context.execute_async(bindings=[int(d_input_0), int(d_input_1), int(d_input_2), int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream so the host output is valid before timing stops.
    stream.synchronize()
    eval_time_elapsed = time.time() - eval_start_time
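Note that the single timed run above also pays one-off warm-up costs. If you want a steadier latency figure, here is a minimal sketch that averages several runs, reusing the device buffers already populated above:
In [ ]:
# Sketch: average several runs for a more stable latency estimate.
n_runs = 10
with engine.create_execution_context() as context:
    start = time.time()
    for _ in range(n_runs):
        context.execute_async(bindings=[int(d_input_0), int(d_input_1), int(d_input_2), int(d_output)], stream_handle=stream.handle)
    stream.synchronize()
avg_time = (time.time() - start) / n_runs
print("Average latency over {} runs: {:.2f} ms".format(n_runs, avg_time * 1000.0))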
Now that we have the inference results, let's extract the answer to our question.
In [ ]:
start_logits = h_output[0,0,0]
end_logits = h_output[0,1,0]
# The total number of n-best predictions to consider when searching for the answer
n_best_size = 20
# The maximum length of an answer that can be generated. This is needed
# because the start and end predictions are not conditioned on one another
max_answer_length = 30
(prediction, nbest_json, scores_diff_json) = dp.get_predictions(
    doc_tokens, features, start_logits, end_logits,
    n_best_size, max_answer_length)
print("-----------------------------")
print("Running Inference in {:.3f} Sentences/Sec".format(1.0/eval_time_elapsed))
print("-----------------------------")
print("Answer: '{}'".format(prediction))
print("with prob: {:.3f}%".format(nbest_json[0]['probability']*100.0))