In [0]:
# Copyright 2019 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
NVIDIA TensorRT is a C++ library that facilitates high-performance inference on NVIDIA graphics processing units (GPUs). TensorRT takes a trained network, which consists of a network definition and a set of trained parameters, and produces a highly optimized runtime engine that performs inference for that network.
TensorFlow™ integration with TensorRT™ (TF-TRT) optimizes and executes compatible subgraphs, leaving TensorFlow to execute the remaining graph. You can still use TensorFlow's wide and flexible feature set, while TensorRT parses the model and applies optimizations to the portions of the graph wherever possible.
In this notebook, we demonstrate the process of creating a TF-TRT optimized model from a ResNet-50 Keras saved model.
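At a high level, the conversion workflow boils down to the TF 2.0 TrtGraphConverterV2 API. The following cell is only a preview sketch of that workflow; the directory names are placeholders, and the actual conversions are performed later in this notebook.
In [0]:
# Preview sketch of the TF-TRT 2.0 workflow used later in this notebook.
# 'my_saved_model' and 'my_tftrt_model' are placeholder directory names.
from tensorflow.python.compiler.tensorrt import trt_convert as trt

params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP32)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir='my_saved_model',
    conversion_params=params)
converter.convert()               # replace compatible subgraphs with TensorRT engines
converter.save('my_tftrt_model')  # write the optimized SavedModel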
Before running this notebook, please set the Colab runtime environment to GPU via the menu Runtime => Change runtime type => GPU.
This demo works on any NVIDIA GPU with CUDA cores, though for improved FP16 and INT8 inference a Volta, Turing, or newer generation GPU with Tensor Cores is desirable. On Google Colab this normally means a T4 GPU. If you are assigned an older K80 GPU, trying again at a later time may get you a T4 GPU.
In [0]:
!nvidia-smi
In [0]:
!pip install pillow matplotlib
!pip install tensorflow-gpu==2.0.0
In [1]:
import tensorflow as tf
print("Tensorflow version: ", tf.version.VERSION)
In [0]:
%%bash
wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
dpkg -i nvidia-machine-learning-repo-*.deb
apt-get update
apt-get install -y libnvinfer5
In [3]:
# check TensorRT version
print("TensorRT version: ")
!dpkg -l | grep nvinfer
A successful TensorRT installation looks like:
TensorRT version:
ii libnvinfer5 5.1.5-1+cuda10.1 amd64 TensorRT runtime libraries
In [4]:
from tensorflow.python.client import device_lib
def check_tensor_core_gpu_present():
    local_device_protos = device_lib.list_local_devices()
    for line in local_device_protos:
        if "compute capability" in str(line):
            compute_capability = float(line.physical_device_desc.split("compute capability: ")[-1])
            if compute_capability >= 7.0:
                return True
    return False

tensor_core_gpu = check_tensor_core_gpu_present()
print("Tensor Core GPU Present:", tensor_core_gpu)
In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.compiler.tensorrt import trt_convert as trt
from tensorflow.python.saved_model import tag_constants
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
In [0]:
!mkdir ./data
!wget -O ./data/img0.JPG "https://d17fnq9dkz9hgj.cloudfront.net/breed-uploads/2018/08/siberian-husky-detail.jpg?bust=1535566590&width=630"
!wget -O ./data/img1.JPG "https://www.hakaimagazine.com/wp-content/uploads/header-gulf-birds.jpg"
!wget -O ./data/img2.JPG "https://www.artis.nl/media/filer_public_thumbnails/filer_public/00/f1/00f1b6db-fbed-4fef-9ab0-84e944ff11f8/chimpansee_amber_r_1920x1080.jpg__1920x1080_q85_subject_location-923%2C365_subsampling-2.jpg"
!wget -O ./data/img3.JPG "https://www.familyhandyman.com/wp-content/uploads/2018/09/How-to-Avoid-Snakes-Slithering-Up-Your-Toilet-shutterstock_780480850.jpg"
In [7]:
from tensorflow.keras.preprocessing import image
fig, axes = plt.subplots(nrows=2, ncols=2)

for i in range(4):
    img_path = './data/img%d.JPG' % i
    img = image.load_img(img_path, target_size=(224, 224))
    plt.subplot(2, 2, i+1)
    plt.imshow(img)
    plt.axis('off')
In [8]:
model = ResNet50(weights='imagenet')
In [9]:
for i in range(4):
    img_path = './data/img%d.JPG' % i
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    preds = model.predict(x)
    # decode the results into a list of tuples (class, description, probability)
    # (one such list for each sample in the batch)
    print('{} - Predicted: {}'.format(img_path, decode_predictions(preds, top=3)[0]))

    plt.subplot(2, 2, i+1)
    plt.imshow(img)
    plt.axis('off')
    plt.title(decode_predictions(preds, top=3)[0][0][1])
TF-TRT takes a TensorFlow SavedModel as input, so we first re-export the Keras model as a SavedModel.
In [10]:
# Save the entire model as a SavedModel.
model.save('resnet50_saved_model')
In [11]:
!saved_model_cli show --all --dir resnet50_saved_model
In [0]:
model = tf.keras.models.load_model('resnet50_saved_model')
In [13]:
img_path = './data/img0.JPG' # Siberian_husky
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
preds = model.predict(x)
# decode the results into a list of tuples (class, description, probability)
# (one such list for each sample in the batch)
print('{} - Predicted: {}'.format(img_path, decode_predictions(preds, top=3)[0]))
plt.subplot(2,2,1)
plt.imshow(img);
plt.axis('off');
plt.title(decode_predictions(preds, top=3)[0][0][1])
In [14]:
batch_size = 8
batched_input = np.zeros((batch_size, 224, 224, 3), dtype=np.float32)
for i in range(batch_size):
    img_path = './data/img%d.JPG' % (i % 4)
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    batched_input[i, :] = x
batched_input = tf.constant(batched_input)
print('batched_input shape: ', batched_input.shape)
In [0]:
# Benchmarking throughput
N_warmup_run = 50
N_run = 1000
elapsed_time = []
# Warm-up runs let one-time initialization costs settle before timing.
for i in range(N_warmup_run):
    preds = model.predict(batched_input)

for i in range(N_run):
    start_time = time.time()
    preds = model.predict(batched_input)
    end_time = time.time()
    elapsed_time = np.append(elapsed_time, end_time - start_time)
    if i % 50 == 0:
        print('Step {}: {:4.1f}ms'.format(i, (elapsed_time[-50:].mean()) * 1000))
print('Throughput: {:.0f} images/s'.format(N_run * batch_size / elapsed_time.sum()))
In [16]:
print('Converting to TF-TRT FP32...')
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP32,
    max_workspace_size_bytes=8000000000)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir='resnet50_saved_model',
    conversion_params=conversion_params)
converter.convert()
converter.save(output_saved_model_dir='resnet50_saved_model_TFTRT_FP32')
print('Done Converting to TF-TRT FP32')
In [17]:
!saved_model_cli show --all --dir resnet50_saved_model_TFTRT_FP32
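Beyond inspecting the signatures with saved_model_cli, we can check how much of the graph TF-TRT actually converted by counting the TRTEngineOp nodes, each of which replaces one compatible subgraph. The cell below is a minimal sketch, assuming the FP32 model directory created above.
In [0]:
# Sketch: count the TRTEngineOp nodes in the converted SavedModel, assuming
# 'resnet50_saved_model_TFTRT_FP32' was created by the conversion cell above.
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2

loaded = tf.saved_model.load('resnet50_saved_model_TFTRT_FP32', tags=[tag_constants.SERVING])
graph_func = loaded.signatures['serving_default']
frozen_func = convert_variables_to_constants_v2(graph_func)

graph_def = frozen_func.graph.as_graph_def()
all_nodes = list(graph_def.node)
for func in graph_def.library.function:  # engines may sit inside library functions
    all_nodes.extend(func.node_def)
print('TRTEngineOp nodes:', sum(1 for n in all_nodes if n.op == 'TRTEngineOp'))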
Next, we load and test the TF-TRT FP32 model.
In [0]:
def predict_tftrt(input_saved_model):
    """Runs prediction on a single image and shows the result.

    input_saved_model (string): Name of the input model stored in the current dir
    """
    img_path = './data/img0.JPG'  # Siberian_husky
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    x = tf.constant(x)

    saved_model_loaded = tf.saved_model.load(input_saved_model, tags=[tag_constants.SERVING])
    signature_keys = list(saved_model_loaded.signatures.keys())
    print(signature_keys)

    infer = saved_model_loaded.signatures['serving_default']
    print(infer.structured_outputs)

    labeling = infer(x)
    preds = labeling['probs'].numpy()
    print('{} - Predicted: {}'.format(img_path, decode_predictions(preds, top=3)[0]))
    plt.subplot(2, 2, 1)
    plt.imshow(img)
    plt.axis('off')
    plt.title(decode_predictions(preds, top=3)[0][0][1])
In [19]:
predict_tftrt('resnet50_saved_model_TFTRT_FP32')
In [0]:
def benchmark_tftrt(input_saved_model):
    saved_model_loaded = tf.saved_model.load(input_saved_model, tags=[tag_constants.SERVING])
    infer = saved_model_loaded.signatures['serving_default']

    N_warmup_run = 50
    N_run = 1000
    elapsed_time = []

    for i in range(N_warmup_run):
        labeling = infer(batched_input)

    for i in range(N_run):
        start_time = time.time()
        labeling = infer(batched_input)
        # prob = labeling['probs'].numpy()
        end_time = time.time()
        elapsed_time = np.append(elapsed_time, end_time - start_time)
        if i % 50 == 0:
            print('Step {}: {:4.1f}ms'.format(i, (elapsed_time[-50:].mean()) * 1000))

    print('Throughput: {:.0f} images/s'.format(N_run * batch_size / elapsed_time.sum()))
In [0]:
benchmark_tftrt('resnet50_saved_model_TFTRT_FP32')
In [22]:
print('Converting to TF-TRT FP16...')
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,
    max_workspace_size_bytes=8000000000)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir='resnet50_saved_model',
    conversion_params=conversion_params)
converter.convert()
converter.save(output_saved_model_dir='resnet50_saved_model_TFTRT_FP16')
print('Done Converting to TF-TRT FP16')
In [23]:
predict_tftrt('resnet50_saved_model_TFTRT_FP16')
In [0]:
benchmark_tftrt('resnet50_saved_model_TFTRT_FP16')
Creating a TF-TRT INT8 model requires a small calibration dataset. Ideally, this dataset should be representative of the data seen in production: it is used to build a histogram of activation values for each layer of the network, from which effective 8-bit quantization ranges are chosen.
Here, for demonstration purposes, we use only the four images we downloaded earlier as the calibration set. In production, this set should be more representative of the production data; a sketch of such a calibration input function follows.
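The cell below is an illustrative sketch only and is not executed in this notebook. The names calibration_files (a list of image paths) and num_calibration_batches are hypothetical placeholders, not variables defined elsewhere in this notebook.
In [0]:
# Illustrative sketch, not executed here: a calibration input function over a
# larger, more representative dataset. `calibration_files` (a list of image
# paths) and `num_calibration_batches` are hypothetical placeholders.
def production_calibration_input_fn():
    for i in range(num_calibration_batches):
        batch = np.zeros((batch_size, 224, 224, 3), dtype=np.float32)
        for j in range(batch_size):
            img = image.load_img(calibration_files[i * batch_size + j],
                                 target_size=(224, 224))
            batch[j] = preprocess_input(image.img_to_array(img))
        yield (tf.constant(batch),)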
Colab's memory limit sometimes causes TensorRT to crash. To proceed, first restart the runtime by pressing CTRL+M, selecting Runtime -> Restart runtime..., or simply executing the next cell.
In [0]:
import os

# Restart the Colab runtime by killing the current Python process;
# Colab automatically starts a fresh kernel afterwards.
os.kill(os.getpid(), 9)
In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.compiler.tensorrt import trt_convert as trt
from tensorflow.python.saved_model import tag_constants
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
In [2]:
batch_size = 8
batched_input = np.zeros((batch_size, 224, 224, 3), dtype=np.float32)
for i in range(batch_size):
    img_path = './data/img%d.JPG' % (i % 4)
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    batched_input[i, :] = x
batched_input = tf.constant(batched_input)
print('batched_input shape: ', batched_input.shape)
In [3]:
print('Converting to TF-TRT INT8...')
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.INT8,
    max_workspace_size_bytes=8000000000,
    use_calibration=True)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir='resnet50_saved_model',
    conversion_params=conversion_params)

def calibration_input_fn():
    yield (batched_input, )

converter.convert(calibration_input_fn=calibration_input_fn)
converter.save(output_saved_model_dir='resnet50_saved_model_TFTRT_INT8')
print('Done Converting to TF-TRT INT8')
In [0]:
def predict_tftrt(input_saved_model):
    """Runs prediction on a single image and shows the result.

    input_saved_model (string): Name of the input model stored in the current dir
    """
    img_path = './data/img0.JPG'  # Siberian_husky
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    x = tf.constant(x)

    saved_model_loaded = tf.saved_model.load(input_saved_model, tags=[tag_constants.SERVING])
    signature_keys = list(saved_model_loaded.signatures.keys())
    print(signature_keys)

    infer = saved_model_loaded.signatures['serving_default']
    print(infer.structured_outputs)

    labeling = infer(x)
    preds = labeling['probs'].numpy()
    print('{} - Predicted: {}'.format(img_path, decode_predictions(preds, top=3)[0]))
    plt.subplot(2, 2, 1)
    plt.imshow(img)
    plt.axis('off')
    plt.title(decode_predictions(preds, top=3)[0][0][1])
In [5]:
predict_tftrt('resnet50_saved_model_TFTRT_INT8')
In [0]:
def benchmark_tftrt(input_saved_model):
    saved_model_loaded = tf.saved_model.load(input_saved_model, tags=[tag_constants.SERVING])
    infer = saved_model_loaded.signatures['serving_default']

    N_warmup_run = 50
    N_run = 1000
    elapsed_time = []

    for i in range(N_warmup_run):
        labeling = infer(batched_input)

    for i in range(N_run):
        start_time = time.time()
        labeling = infer(batched_input)
        # prob = labeling['probs'].numpy()
        end_time = time.time()
        elapsed_time = np.append(elapsed_time, end_time - start_time)
        if i % 50 == 0:
            print('Step {}: {:4.1f}ms'.format(i, (elapsed_time[-50:].mean()) * 1000))

    print('Throughput: {:.0f} images/s'.format(N_run * batch_size / elapsed_time.sum()))
In [0]:
benchmark_tftrt('resnet50_saved_model_TFTRT_INT8')