NVIDIA TensorRT

10.16.1.11 · active · verified Thu Apr 16

NVIDIA TensorRT is a Python library and C++ SDK for high-performance deep learning inference. It optimizes trained neural networks for deployment on NVIDIA GPUs, focusing on throughput, latency, and memory efficiency. The current version is 10.16.1.11. NVIDIA typically releases minor updates to TensorRT frequently, often monthly or bi-monthly, with major versions released annually.

Common errors

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to build a simple TensorRT engine for an identity operation. It uses the `cuda-python` library for CUDA memory management, reflecting modern TensorRT usage. The process involves creating a logger, builder, network definition, configuration, defining input/output tensors, building the engine, and then performing a basic inference with device memory management.

import tensorrt as trt
import numpy as np
from cuda import cudart # Using cuda-python as per release notes

# 1. Create Logger
# Module-level logger shared by builder, runtime, and engine; WARNING
# suppresses TensorRT's verbose INFO output during the build.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine():
    """Build a TensorRT engine containing a single identity layer.

    Returns:
        trt.ICudaEngine: a deserialized engine ready for inference.

    Raises:
        RuntimeError: if building or deserializing the engine fails.
    """
    # 2. Create Builder
    builder = trt.Builder(TRT_LOGGER)

    # 3. Create NetworkDefinition.
    # TensorRT 10 removed implicit-batch mode (and with it the
    # EXPLICIT_BATCH creation flag); every network is explicit-batch,
    # so no creation flags are passed.
    network = builder.create_network(0)

    # 4. Create BuilderConfig.
    # `max_workspace_size` was removed in TensorRT 10; the workspace
    # limit is configured through the memory-pool API instead.
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)  # 1 MiB

    # 5. Define input tensor (a fixed 1x3x16x16 float32 input)
    input_tensor = network.add_input(name="input_tensor", dtype=trt.float32, shape=(1, 3, 16, 16))

    # Add an identity layer (input -> output directly)
    output_tensor = network.add_identity(input_tensor).get_output(0)

    # 6. Name the output *before* marking it, so the engine binding
    # carries the intended name.
    output_tensor.name = "output_tensor"
    network.mark_output(output_tensor)

    # 7. Build. `builder.build_engine()` was removed in TensorRT 10:
    # build a serialized plan, then deserialize it with a Runtime.
    plan = builder.build_serialized_network(network, config)
    if plan is None:
        raise RuntimeError("Failed to build TensorRT engine")
    # NOTE(review): recent TensorRT Python bindings keep the runtime
    # alive through the engine reference — confirm for the deployed
    # version if the engine outlives this frame.
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(plan)
    if engine is None:
        raise RuntimeError("Failed to deserialize TensorRT engine")
    return engine

def _cuda_check(err):
    """Raise RuntimeError if a cudart status code is not cudaSuccess."""
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"CUDA runtime error: {err}")


def main():
    """Build the identity engine and run one host->device->host round trip."""
    engine = None
    context = None
    device_input = None
    device_output = None
    try:
        engine = build_engine()
        print("TensorRT engine built successfully!")

        # Execution context for the already in-memory engine.
        context = engine.create_execution_context()

        # Host buffers; for an identity op the output shape equals the input.
        host_input = np.random.rand(1, 3, 16, 16).astype(np.float32)
        host_output = np.empty_like(host_input)

        # Allocate device memory. cudart calls return a (status, ...) tuple;
        # ignoring the status lets a failed cudaMalloc surface only as a
        # crash in a later memcpy, so check every call.
        err, device_input = cudart.cudaMalloc(host_input.nbytes)
        _cuda_check(err)
        err, device_output = cudart.cudaMalloc(host_output.nbytes)
        _cuda_check(err)

        # Copy input to device
        _cuda_check(cudart.cudaMemcpy(
            device_input, host_input.ctypes.data, host_input.nbytes,
            cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)[0])

        # Execute inference. execute_v2 takes device pointers ordered as the
        # engine's I/O tensors (here: the single input, then the output).
        bindings = [int(device_input), int(device_output)]
        context.execute_v2(bindings)

        # Copy output back to host
        _cuda_check(cudart.cudaMemcpy(
            host_output.ctypes.data, device_output, host_output.nbytes,
            cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)[0])

        print(f"Input shape: {host_input.shape}")
        print(f"Output shape: {host_output.shape}")
        print(f"Input (first 5 elements): {host_input.flatten()[:5]}")
        print(f"Output (first 5 elements): {host_output.flatten()[:5]}")

    except Exception as e:
        # Top-level boundary for the demo: report, then fall through to cleanup.
        print(f"An error occurred: {e}")
    finally:
        # Free device memory first, then drop the TensorRT objects; the
        # context must not outlive the engine it was created from.
        if device_input is not None:
            cudart.cudaFree(device_input)
        if device_output is not None:
            cudart.cudaFree(device_output)
        del context
        del engine

# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    main()

view raw JSON →