TensorRT is NVIDIA's high-performance deep learning inference optimizer and runtime. It takes trained models from frameworks like PyTorch, TensorFlow, or ONNX and optimizes them specifically for NVIDIA GPUs through layer fusion, precision calibration, and kernel auto-tuning. For CUDA developers deploying ML models, TensorRT can provide 2-10x speedup over native framework inference. It automatically fuses layers, selects optimal kernels, and leverages Tensor Cores for mixed-precision inference - all while maintaining accuracy within acceptable bounds. This guide covers TensorRT model conversion, INT8 quantization, dynamic shapes, custom plugins, and optimization strategies for production ML deployment.
CUDA Integration: TensorRT compiles neural networks to optimized execution plans using cuDNN, cuBLAS, and custom CUDA kernels. It profiles different kernel implementations and selects the fastest for your specific GPU, batch size, and precision. The engine is serialized for fast loading in production.
Install TensorRT via pip or from NVIDIA.
# Install via pip (includes Python API)
pip install tensorrt
# Or install via conda
conda install -c conda-forge tensorrt
# For specific CUDA version
pip install tensorrt-cu12 # CUDA 12
pip install tensorrt-cu11 # CUDA 11
# Verify installation
python -c "import tensorrt as trt; print(f'TensorRT {trt.__version__}')"
# Check CUDA version
python -c "import tensorrt as trt; print(trt.get_build_info())"
# Install additional tools
pip install onnx onnx-graphsurgeon # For ONNX models
pip install torch2trt # PyTorch to TensorRT converter

Basic TensorRT conversion and inference from ONNX.
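If your model starts in PyTorch, export it to ONNX first. A minimal sketch, assuming torch and torchvision are installed; the ResNet-50 model, input shape, and model.onnx file name are illustrative placeholders for your own network:

# Hypothetical export step - swap in your own model and input shape
import torch
import torchvision

model = torchvision.models.resnet50(weights=None).eval()
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",             # consumed by build_engine_from_onnx below
    input_names=["input"],
    output_names=["output"],
    opset_version=17,
)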
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
# Create logger (required)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def build_engine_from_onnx(onnx_path, fp16=False):
    """Build TensorRT engine from ONNX model."""
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)
    # Parse ONNX model
    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    # Build engine config
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB
    # Enable FP16 if requested
    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)
    # Build engine
    serialized_engine = builder.build_serialized_network(network, config)
    # Deserialize
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    return engine
# Inference wrapper
class TRTInference:
    def __init__(self, engine):
        self.engine = engine
        self.context = engine.create_execution_context()
        # Allocate buffers
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()
        for i in range(engine.num_io_tensors):
            tensor_name = engine.get_tensor_name(i)
            size = trt.volume(engine.get_tensor_shape(tensor_name))
            dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
            # Allocate host (pinned) and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            # execute_async_v3 reads I/O addresses from the context
            self.context.set_tensor_address(tensor_name, int(device_mem))
            if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
                self.inputs.append({'host': host_mem, 'device': device_mem})
            else:
                self.outputs.append({'host': host_mem, 'device': device_mem})

    def infer(self, input_data):
        # Copy input to device
        np.copyto(self.inputs[0]['host'], input_data.ravel())
        cuda.memcpy_htod_async(
            self.inputs[0]['device'],
            self.inputs[0]['host'],
            self.stream
        )
        # Run inference
        self.context.execute_async_v3(stream_handle=self.stream.handle)
        # Copy output to host
        cuda.memcpy_dtoh_async(
            self.outputs[0]['host'],
            self.outputs[0]['device'],
            self.stream
        )
        self.stream.synchronize()
        return self.outputs[0]['host']
# Usage
engine = build_engine_from_onnx('model.onnx', fp16=True)
inference = TRTInference(engine)
# Run inference
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
output = inference.infer(input_data)

Optimize the model with INT8 precision using a calibration dataset.
import os
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """INT8 calibrator for quantization."""
    def __init__(self, calibration_data, cache_file='calibration.cache'):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.calibration_data = calibration_data
        self.cache_file = cache_file
        self.current_index = 0
        # Allocate device memory for one batch
        self.batch_size = calibration_data[0].shape[0]
        self.device_input = cuda.mem_alloc(calibration_data[0].nbytes)

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        if self.current_index < len(self.calibration_data):
            batch = self.calibration_data[self.current_index]
            cuda.memcpy_htod(self.device_input, batch)
            self.current_index += 1
            return [int(self.device_input)]
        return None

    def read_calibration_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, 'rb') as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        with open(self.cache_file, 'wb') as f:
            f.write(cache)
def build_int8_engine(onnx_path, calibration_data):
    """Build INT8 quantized engine."""
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)
    # Parse ONNX
    with open(onnx_path, 'rb') as model:
        parser.parse(model.read())
    # Configure for INT8
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 2 << 30)  # 2GB
    config.set_flag(trt.BuilderFlag.INT8)
    # Set calibrator
    calibrator = Int8Calibrator(calibration_data)
    config.int8_calibrator = calibrator
    # Build engine
    print("Building INT8 engine (this may take a while)...")
    serialized_engine = builder.build_serialized_network(network, config)
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    # Save engine
    with open('model_int8.trt', 'wb') as f:
        f.write(serialized_engine)
    return engine
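# Example usage (sketch): calibration batches are NumPy arrays shaped like the
# model input; a few hundred representative, preprocessed samples are usually enough.
# The 'model.onnx' path and (8, 3, 224, 224) shape are illustrative assumptions.
calibration_data = [
    np.random.randn(8, 3, 224, 224).astype(np.float32)  # replace with real preprocessed batches
    for _ in range(10)
]
int8_engine = build_int8_engine('model.onnx', calibration_data)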
# Dynamic shapes support
def build_dynamic_engine(onnx_path):
    """Build engine with dynamic input shapes."""
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, 'rb') as model:
        parser.parse(model.read())
    config = builder.create_builder_config()
    # Configure dynamic shapes
    profile = builder.create_optimization_profile()
    profile.set_shape(
        'input',                  # must match the ONNX input tensor name
        min=(1, 3, 224, 224),     # Minimum shape
        opt=(8, 3, 224, 224),     # Optimal shape
        max=(32, 3, 224, 224)     # Maximum shape
    )
    config.add_optimization_profile(profile)
    # Build
    serialized_engine = builder.build_serialized_network(network, config)
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    return engine
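# Using a dynamic-shape engine (sketch): pick a concrete input shape per request
# before running. Assumes the network input is named 'input', matching the
# optimization profile above, and that buffers are sized for the chosen shape.
dynamic_engine = build_dynamic_engine('model.onnx')
dyn_context = dynamic_engine.create_execution_context()
dyn_context.set_input_shape('input', (4, 3, 224, 224))  # any shape within [min, max]
# ...then set tensor addresses and call execute_async_v3 as in TRTInference above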
# Custom plugin example (skeleton only - a complete plugin must also implement
# clone/serialize/format methods and register an IPluginCreator with the plugin registry)
class CustomPlugin(trt.IPluginV2DynamicExt):
    """Example custom TensorRT plugin."""
    def __init__(self):
        trt.IPluginV2DynamicExt.__init__(self)

    def get_output_dimensions(self, output_index, inputs, expr_builder):
        # Output shape matches the first input's shape
        return inputs[0]

    def configure_plugin(self, inp, out):
        pass

    def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
        # Launch your custom CUDA kernel here on the given stream
        pass
# Benchmark
import time
def benchmark_engine(engine, input_data, iterations=1000):
    """Benchmark TensorRT engine."""
    inference = TRTInference(engine)
    # Warmup
    for _ in range(10):
        _ = inference.infer(input_data)
    # Benchmark
    start = time.time()
    for _ in range(iterations):
        _ = inference.infer(input_data)
    end = time.time()
    avg_time = (end - start) / iterations * 1000
    throughput = iterations / (end - start)
    print(f"Average latency: {avg_time:.2f} ms")
    print(f"Throughput: {throughput:.2f} inferences/sec")

INT8 provides a 2-4x speedup over FP16 with minimal accuracy loss. It requires a calibration dataset, but the extra step is worth it for production.
TensorRT tunes kernels for your batch size. Build separate engines for different batch sizes or use dynamic shapes with optimal batch.
TensorRT automatically fuses layers and mixes FP16/INT8. Ensure you don't disable these optimizations.
Multiple inference contexts with different streams enable concurrent batch processing for higher throughput.
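A minimal sketch of this pattern, reusing the TRTInference wrapper above (each instance already owns its own execution context, stream, and buffers); the enqueue helper here is illustrative, not part of TensorRT:

# Overlap two requests by enqueueing on both streams before synchronizing.
def enqueue(worker, batch):
    np.copyto(worker.inputs[0]['host'], batch.ravel())
    cuda.memcpy_htod_async(worker.inputs[0]['device'], worker.inputs[0]['host'], worker.stream)
    worker.context.execute_async_v3(stream_handle=worker.stream.handle)
    cuda.memcpy_dtoh_async(worker.outputs[0]['host'], worker.outputs[0]['device'], worker.stream)

worker_a, worker_b = TRTInference(engine), TRTInference(engine)
batch = np.random.randn(1, 3, 224, 224).astype(np.float32)
enqueue(worker_a, batch)        # both inferences are now in flight on separate streams
enqueue(worker_b, batch)
worker_a.stream.synchronize()   # wait for each stream only when its result is needed
worker_b.stream.synchronize()
out_a, out_b = worker_a.outputs[0]['host'], worker_b.outputs[0]['host']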
Use the trtexec tool to profile your model and understand which layers are bottlenecks.
Building engines is slow. Serialize to file and deserialize in production for instant loading.
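A minimal save/load sketch, using the TRT_LOGGER defined earlier; the model_fp16.trt file name is illustrative:

def save_engine(engine, path):
    # ICudaEngine.serialize() returns a host buffer that can be written directly
    with open(path, 'wb') as f:
        f.write(engine.serialize())

def load_engine(path):
    runtime = trt.Runtime(TRT_LOGGER)
    with open(path, 'rb') as f:
        return runtime.deserialize_cuda_engine(f.read())

# save_engine(engine, 'model_fp16.trt')    # once, at build time
# engine = load_engine('model_fp16.trt')   # at service startup, far faster than rebuilding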
| Task | Speedup | Notes |
|---|---|---|
| ResNet-50 FP16 (batch=1) | 2.5x | vs PyTorch eager |
| ResNet-50 INT8 (batch=1) | 5x | vs PyTorch eager |
| BERT-Base FP16 | 3x | vs ONNX Runtime |
| YOLOv5 INT8 | 4x | vs PyTorch |
With proper calibration, INT8 typically loses <1% accuracy on classification and <2% on detection tasks. Always validate on your specific model and dataset.
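One way to validate, sketched under the assumption that fp32_infer and int8_infer are TRTInference wrappers built from the two engines and that val_batches/val_labels are your own held-out data:

def top1_accuracy(infer, batches, labels):
    correct, total = 0, 0
    for batch, label in zip(batches, labels):
        logits = infer.infer(batch).reshape(batch.shape[0], -1)
        correct += int((logits.argmax(axis=1) == label).sum())
        total += batch.shape[0]
    return correct / total

# acc_fp32 = top1_accuracy(fp32_infer, val_batches, val_labels)
# acc_int8 = top1_accuracy(int8_infer, val_batches, val_labels)
# print(f"Accuracy drop: {(acc_fp32 - acc_int8) * 100:.2f}%")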
PyTorch models can be converted directly with torch2trt, or exported to ONNX and then built with TensorRT. torch.compile with a TensorRT backend is also available in PyTorch 2.0+.
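A sketch of the torch2trt path, assuming torch, torchvision, and torch2trt are installed and a CUDA GPU is available; the ResNet-50 model and input shape are illustrative:

import torch
import torchvision
from torch2trt import torch2trt

model = torchvision.models.resnet50(weights="IMAGENET1K_V1").eval().cuda()
x = torch.randn(1, 3, 224, 224).cuda()
model_trt = torch2trt(model, [x], fp16_mode=True)  # returns a module usable like the original
y = model_trt(x)
# torch.compile(model, backend="tensorrt") is another route once torch_tensorrt is
# installed (the exact backend name can vary by version).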
TensorRT profiles many kernel implementations to find the fastest. This is done once - serialize the engine and reuse. Building can take minutes but inference is optimized.
Engines are specific to both the GPU architecture and the TensorRT version used to build them. Build on the same GPU model as your deployment target, or rebuild for each target GPU type.
| Alternative | Trade-off vs TensorRT |
|---|---|
| ONNX Runtime | Cross-platform, easier to use, less optimized |
| Native PyTorch/TensorFlow inference | Training framework, less optimized inference |
| cuDNN/custom CUDA kernels | For custom kernels, not full models |
Optimize your TensorRT CUDA code with RightNow AI - get real-time performance suggestions and memory analysis.