The ONNX-LRE library provides a machine learning runtime environment for executing ONNX (Open Neural Network Exchange) models.
ONNX-LRE C++ APIs offer an easy-to-use interface to onboard and execute ONNX models from LEIP Optimize.
Inference Options
ONNX-LRE supports three different input formats for inference:
Each approach offers different tradeoffs between ease of use, performance, and integration complexity. See the examples below for practical usage patterns.
These examples demonstrate usage of the ONNX-LRE library.
Examples
Example 1: DLPack Tensors with Smart Pointers
#include <onnx_lre/onnx_lre.hpp>
#include <memory>
#include <functional>
#include <iostream>
// Custom deleter for use with std::unique_ptr: releases a DLPack tensor by
// invoking the release callback embedded in the DLManagedTensor itself.
struct DLTensorDeleter {
    void operator()(DLManagedTensor* tensor) const {
        // Guard clauses: nothing to do for a null handle or a tensor that
        // ships without a deleter callback.
        if (tensor == nullptr) {
            return;
        }
        if (tensor->deleter == nullptr) {
            return;
        }
        tensor->deleter(tensor);
    }
};
// Owning smart-pointer alias for DLPack-managed tensors.
using DLTensorPtr = std::unique_ptr<DLManagedTensor, DLTensorDeleter>;
int main() {
try {
std::vector<DLTensorPtr> outputTensors;
for (auto* tensor : engine.getOutput()) {
outputTensors.emplace_back(tensor);
}
for (const auto& tensor : outputTensors) {
if (!tensor) continue;
const auto& dl_tensor = tensor->dl_tensor;
std::cout << "Shape: [";
for (int j = 0; j < dl_tensor.ndim; j++) {
std::cout << dl_tensor.shape[j] << " ";
}
std::cout << "]" << std::endl;
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}
The LatentRuntimeEngine class provides a C++ interface to load and run ONNX models using ONNX Runtime...
Definition onnx_lre.hpp:253
@ CPU
CPU execution - universal fallback with no special hardware requirements.
Definition onnx_lre.hpp:174
Configuration parameters for the inference engine.
Definition onnx_lre.hpp:233
ExecutionProvider executionProvider
Specifies the execution provider (e.g., CPU, CUDA, TensorRT). Defaults to the best available EP.
Definition onnx_lre.hpp:235
Example 2: Using ONNX Runtime Tensors with RAII
This approach uses ONNX Runtime's tensor types with automatic memory management:
#include <onnx_lre/onnx_lre.hpp>
#include <memory>
#include <iostream>
// Example 2 entry point: builds dummy inputs as ONNX Runtime Ort::Value
// tensors (0.5f-filled), runs inference, and prints each output's shape.
// NOTE(review): `engine` is assumed to be a LatentRuntimeEngine constructed
// earlier in the full example; the setup lines are not shown in this snippet.
int main() {
    try {
        const auto& inputShapes = engine.getInputShapes();
        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "example");
        Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(
            OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
        // BUG FIX: CreateTensor over a user buffer does NOT copy the data,
        // so the backing storage must outlive the Ort::Value. The original
        // declared `data` inside the loop, destroying each buffer before
        // infer() ran. Keep all buffers alive in an outer container instead
        // (moving a std::vector preserves its heap buffer, so the raw
        // pointers handed to CreateTensor stay valid).
        std::vector<std::vector<float>> inputData;
        inputData.reserve(engine.getNumberOfInputs());
        std::vector<Ort::Value> inputTensors;
        for (size_t i = 0; i < engine.getNumberOfInputs(); i++) {
            // Element count for this input; dynamic dims (<= 0) count as 1.
            size_t totalElements = 1;
            for (auto dim : inputShapes[i]) {
                totalElements *= (dim > 0) ? dim : 1;
            }
            inputData.emplace_back(totalElements, 0.5f);
            auto& data = inputData.back();
            // BUG FIX: the typed Ort::Value::CreateTensor<T> overload takes
            // the ELEMENT count, not a byte count. The original passed
            // data.size() * sizeof(float), over-stating the buffer length 4x.
            inputTensors.push_back(Ort::Value::CreateTensor<float>(
                memInfo, data.data(), data.size(),
                inputShapes[i].data(), inputShapes[i].size()));
        }
        engine.infer(inputTensors);
        auto outputTensors = engine.getOutputOrt();
        for (size_t i = 0; i < outputTensors.size(); i++) {
            auto info = outputTensors[i].GetTensorTypeAndShapeInfo();
            std::cout << "Output " << i << " shape: [";
            for (auto dim : info.GetShape()) {
                std::cout << dim << " ";
            }
            std::cout << "]" << std::endl;
        }
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
@ Float16
16-bit floating point - reduced precision, ~50% memory reduction, faster on compatible hardware
Definition onnx_lre.hpp:185
@ CUDA
NVIDIA CUDA - GPU acceleration without TensorRT optimizations.
Definition onnx_lre.hpp:173
Precision precision
Specifies the precision type for model execution. Defaults to the best precision runtime can run.
Definition onnx_lre.hpp:236
Example 3: Using CUDA Graphs for Optimized Inference
This example demonstrates how to leverage CUDA graphs for optimized inference performance with static input shapes:
#include <onnx_lre/onnx_lre.hpp>
#include <dlpack/dlpack.h>
#include <torch/torch.h>
#include <ATen/DLConvertor.h>
#include <memory>
#include <iostream>
// Example 3 entry point: repeatedly runs inference with freshly randomized
// CUDA inputs of fixed shape so that (per the surrounding text) CUDA Graph
// capture/replay can optimize the steady-state loop.
// NOTE(review): `engine` is assumed to be a LatentRuntimeEngine constructed
// earlier in the full example; the setup lines are not shown in this snippet.
int main() {
try {
// Query the model's (static) input shapes once up front.
const auto& inputShapes = engine.getInputShapes();
torch::Tensor dummy_input;
std::vector<DLManagedTensor *> input_tensors, output_tensors;
// Warm-up pass: presumably the first infer() triggers graph capture on a
// CUDA-Graph-enabled engine — confirm against the engine's docs.
for (const auto& shape : inputShapes) {
dummy_input = torch::randn(shape, torch::device(torch::kCUDA));
// at::toDLPack exports an owning DLManagedTensor* handle.
input_tensors.push_back(at::toDLPack(dummy_input));
}
engine.infer(input_tensors);
const int numInferences = 100;
for (int i = 0; i < numInferences; i++) {
// NOTE(review): clear() drops the raw DLManagedTensor* handles without
// invoking their deleters — presumably engine.infer() takes ownership
// and releases them; verify against the infer() contract, otherwise
// this loop leaks one set of input tensors per iteration.
input_tensors.clear();
for (const auto& shape : inputShapes) {
dummy_input = torch::randn(shape, torch::device(torch::kCUDA));
input_tensors.push_back(at::toDLPack(dummy_input));
}
engine.infer(input_tensors);
// NOTE(review): output handles are overwritten each iteration without
// explicit release — assumed engine-owned; confirm getOutput()'s
// ownership semantics.
output_tensors = engine.getOutput();
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}
@ Float32
32-bit floating point - highest precision, largest memory footprint
Definition onnx_lre.hpp:184
std::optional< bool > enableCudaGraph
Enables CUDA Graph optimization for inference. When true, static models use CUDA Graphs for faster execution.
Definition onnx_lre.hpp:240
Example 4: Using CUDA Streams for multi-stream inference
This example demonstrates how to leverage CUDA Streams to run two models in parallel:
#include <onnx_lre/onnx_lre.hpp>
#include <dlpack/dlpack.h>
#include <torch/torch.h>
#include <ATen/DLConvertor.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
#include <iostream>
#include <thread>
// Example 4 entry point: runs two engines concurrently, each pinned to its
// own CUDA stream by a dedicated thread (CUDAGuard + stream binding).
// NOTE(review): `engine1`, `engine2`, `options1`, and `options2` are assumed
// to be set up earlier in the full example; those lines are not shown here.
int main() {
    try {
        // One independent stream per model so their kernels can overlap.
        auto stream1 = at::cuda::getStreamFromPool();
        auto stream2 = at::cuda::getStreamFromPool();
        options2 = options1;
        const auto& inputShapes1 = engine1.getInputShapes();
        const auto& inputShapes2 = engine2.getInputShapes();
        std::vector<torch::Tensor> dummy_inputs1;
        std::vector<torch::Tensor> dummy_inputs2;
        // Allocate each model's input tensors on its own stream.
        {
            c10::cuda::CUDAGuard guard1(stream1.device());
            at::cuda::setCurrentCUDAStream(stream1);
            for (const auto& shape : inputShapes1) {
                dummy_inputs1.push_back(torch::randn(shape, torch::device(torch::kCUDA)));
            }
        }
        {
            c10::cuda::CUDAGuard guard2(stream2.device());
            at::cuda::setCurrentCUDAStream(stream2);
            for (const auto& shape : inputShapes2) {
                dummy_inputs2.push_back(torch::randn(shape, torch::device(torch::kCUDA)));
            }
        }
        std::vector<DLManagedTensor*> input_tensors1, input_tensors2, output_tensors1, output_tensors2;
        for (const auto& tensor : dummy_inputs1) {
            input_tensors1.push_back(at::toDLPack(tensor));
        }
        for (const auto& tensor : dummy_inputs2) {
            input_tensors2.push_back(at::toDLPack(tensor));
        }
        // Warm-up inferences, run sequentially.
        engine1.infer(input_tensors1);
        engine2.infer(input_tensors2);
        const int numInferences = 50;
        for (int i = 0; i < numInferences; ++i) {
            // Refresh input data in place, then re-export DLPack handles.
            for (auto& tensor : dummy_inputs1) {
                tensor.normal_();
            }
            for (auto& tensor : dummy_inputs2) {
                tensor.normal_();
            }
            input_tensors1.clear();
            input_tensors2.clear();
            for (const auto& tensor : dummy_inputs1) {
                input_tensors1.push_back(at::toDLPack(tensor));
            }
            for (const auto& tensor : dummy_inputs2) {
                input_tensors2.push_back(at::toDLPack(tensor));
            }
            // Run both models in parallel; each thread binds its stream
            // before inferring. Reference captures are safe here because
            // both threads are joined within this loop iteration.
            std::thread thread1([&]() {
                c10::cuda::CUDAGuard guard(stream1.device());
                at::cuda::setCurrentCUDAStream(stream1);
                engine1.infer(input_tensors1);
                // Consistency fix: the output accessor used everywhere else
                // in these examples is getOutput(), not getOutputs().
                output_tensors1 = engine1.getOutput();
            });
            std::thread thread2([&]() {
                c10::cuda::CUDAGuard guard(stream2.device());
                at::cuda::setCurrentCUDAStream(stream2);
                engine2.infer(input_tensors2);
                output_tensors2 = engine2.getOutput();
            });
            thread1.join();
            thread2.join();
        }
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
@ TensorRT
NVIDIA TensorRT - highest performance for supported operations with optimization passes.
Definition onnx_lre.hpp:172
void * cudaStream
Definition onnx_lre.hpp:243