Optimum ONNX

0.1.0 · active · verified Wed Apr 15

Optimum ONNX is a specialized extension of the Hugging Face Optimum library, providing a streamlined interface for exporting Hugging Face Transformer models (and other architectures like Diffusers, Timm, Sentence Transformers) to the ONNX format. It facilitates efficient inference and deployment using ONNX Runtime, including features like graph optimization and quantization. Currently at version 0.1.0, it sees regular updates to support new Hugging Face models and ensure compatibility with underlying libraries like PyTorch and Transformers.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates the core workflow: exporting a Hugging Face model to ONNX using `ORTModelForSequenceClassification.from_pretrained(export=True)`, saving the exported model and tokenizer, then loading the ONNX model back and performing inference with the `pipeline` helper from `optimum.onnxruntime`.

import os
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime import pipeline as ORTPipeline # Alias to avoid conflict with transformers.pipeline if imported
from transformers import AutoTokenizer

# Checkpoint to export, and the directory where the ONNX artifacts are written.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
export_dir = "./tmp/onnx_model"

# Step 1 - pull the Transformers checkpoint and convert it to ONNX on the fly.
print(f"Exporting model {checkpoint} to ONNX...")
exported = ORTModelForSequenceClassification.from_pretrained(checkpoint, export=True)
hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Step 2 - persist the ONNX graph and the tokenizer so they can be reloaded together.
os.makedirs(export_dir, exist_ok=True)
exported.save_pretrained(export_dir)
hf_tokenizer.save_pretrained(export_dir)
print(f"Model and tokenizer saved to {export_dir}")

# Step 3 - reload the serialized ONNX model for inference.
print(f"Loading ONNX model from {export_dir} for inference...")
reloaded = ORTModelForSequenceClassification.from_pretrained(export_dir, file_name="model.onnx")
reloaded_tokenizer = AutoTokenizer.from_pretrained(export_dir)

# Step 4 - run end-to-end inference through the ONNX Runtime-backed pipeline.
classifier = ORTPipeline("text-classification", model=reloaded, tokenizer=reloaded_tokenizer)
prediction = classifier("I love using Hugging Face Optimum ONNX!")
print(f"Inference result: {prediction}")

# Example with a quantized model (if applicable)
# from optimum.onnxruntime.configuration import AutoQuantizationConfig
# from optimum.onnxruntime import ORTQuantizer
# qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
# quantizer = ORTQuantizer.from_pretrained(exported)
# quantizer.quantize(save_dir=export_dir, quantization_config=qconfig)
# quantized_model = ORTModelForSequenceClassification.from_pretrained(export_dir, file_name="model_quantized.onnx")
# classifier_quant = ORTPipeline("text-classification", model=quantized_model, tokenizer=reloaded_tokenizer)
# results_quant = classifier_quant("I love using Hugging Face Optimum ONNX with quantization!")
# print(f"Quantized inference result: {results_quant}")

view raw JSON →