Sherpa ONNX

1.12.38 · active · verified Thu Apr 16

sherpa-onnx is a next-generation speech recognition (ASR) and text-to-speech (TTS) toolkit built with k2 and ONNX Runtime. It provides high-performance, cross-platform inference for various state-of-the-art speech models, enabling real-time and offline processing. The library is actively maintained with frequent minor releases, often multiple times a week, reflecting rapid development and integration of new models and features. The current version is 1.12.38.

Common errors

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to perform offline speech recognition using `sherpa-onnx`. It first ensures that a sample ASR model and an audio file are downloaded locally, then configures and initializes an `OfflineRecognizer`. Finally, it processes the sample audio file and prints the transcribed text. For GPU inference, install `onnxruntime-gpu` and ensure CUDA is properly set up.

import os
import wave
import urllib.request
from sherpa_onnx import OfflineRecognizer, OfflineRecognizerConfig, FeatureConfig, SpeakerEmbeddingExtractorConfig, OfflineStream

# --- Configuration and Model Download ---
# This example uses a small, popular ASR model.
# You can find more models at https://k2-fsa.github.io/sherpa-onnx/index.html

# Directory where the extracted model files are expected to live.
MODEL_DIR = "./sherpa-onnx-models/csukuangfj/sherpa-onnx-offline-zh-en-conformer-mix-init-transducer-2023-12-13"
# Release tarball containing the transducer model parts and token table.
MODEL_URL = "https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.12.38/sherpa-onnx-offline-zh-en-conformer-mix-init-transducer-2023-12-13.tar.bz2"
# The tarball is saved into MODEL_DIR's parent directory.
MODEL_TAR_FILE = os.path.join(os.path.dirname(MODEL_DIR), os.path.basename(MODEL_URL))
# Files that must all exist directly inside MODEL_DIR for the
# download/extract step below to be skipped.
# NOTE(review): release tarballs typically unpack into their own
# top-level directory; if so, these files would land one level below
# MODEL_DIR and this check would never pass — verify the archive layout.
MODEL_FILES = {
    "encoder": "encoder-epoch-99-avg-1.onnx",
    "decoder": "decoder-epoch-99-avg-1.onnx",
    "joiner": "joiner-epoch-99-avg-1.onnx",
    "tokens": "tokens.txt"
}

# Sample clip used for the recognition demo; the wave-module asserts
# further down require 16 kHz, mono, 16-bit PCM.
AUDIO_FILE = "./sherpa-onnx-models/test.wav"
AUDIO_URL = "https://github.com/k2-fsa/sherpa-onnx/raw/master/sherpa-onnx/python/test.wav"

def download_file_if_not_exists(url, filename):
    """Download *url* to *filename* unless the file already exists.

    Creates the destination directory if needed. A no-op when the target
    file is already present, so repeated runs skip the download.
    """
    if os.path.exists(filename):
        return
    print(f"Downloading {os.path.basename(filename)} from {url}...")
    # os.makedirs("") raises FileNotFoundError, so only create the
    # directory when the filename actually has a directory component.
    dest_dir = os.path.dirname(filename)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    urllib.request.urlretrieve(url, filename)
    print("Download complete.")

def extract_tar_bz2(tar_path, extract_path):
    """Extract a .tar.bz2 archive into *extract_path*.

    Skipped when *extract_path* already exists and is non-empty, so
    repeated runs do not re-extract.
    """
    if os.path.exists(extract_path) and os.listdir(extract_path):
        return
    print(f"Extracting {os.path.basename(tar_path)} to {extract_path}...")
    import tarfile  # local import mirrors the quickstart's minimal top-level imports
    with tarfile.open(tar_path, "r:bz2") as tar:
        try:
            # The "data" filter (PEP 706) rejects path-traversal and other
            # dangerous members in downloaded archives; it becomes the
            # default in Python 3.14.
            tar.extractall(path=extract_path, filter="data")
        except TypeError:
            # Older Python without the filter= parameter.
            tar.extractall(path=extract_path)
    print("Extraction complete.")

# --- Fetch model and audio assets (each step is a no-op when present) ---
os.makedirs(MODEL_DIR, exist_ok=True)

# Download and extract only when some expected model file is missing.
missing = [
    name for name in MODEL_FILES.values()
    if not os.path.exists(os.path.join(MODEL_DIR, name))
]
if missing:
    download_file_if_not_exists(MODEL_URL, MODEL_TAR_FILE)
    extract_tar_bz2(MODEL_TAR_FILE, MODEL_DIR)

# Sample waveform used by the recognition demo below.
download_file_if_not_exists(AUDIO_URL, AUDIO_FILE)

# --- Recognizer Configuration ---
# NOTE(review): the config class names and fields below come from the
# `sherpa_onnx` import above and cannot be verified here — confirm them
# against the installed sherpa-onnx Python API (recent versions expose
# factory helpers such as OfflineRecognizer.from_transducer()).
feat_config = FeatureConfig(sample_rate=16000, feature_dim=80)

recognizer_config = OfflineRecognizerConfig(
    feat_config=feat_config,
    model_config={
        # Transducer model parts plus the token table, all inside MODEL_DIR.
        "encoder": os.path.join(MODEL_DIR, MODEL_FILES["encoder"]),
        "decoder": os.path.join(MODEL_DIR, MODEL_FILES["decoder"]),
        "joiner": os.path.join(MODEL_DIR, MODEL_FILES["joiner"]),
        "tokens": os.path.join(MODEL_DIR, MODEL_FILES["tokens"]),
        "num_threads": 1, # Use 1 thread for CPU inference
        "debug": False
    },
    # Empty dicts request default LM / transducer settings.
    lm_config={},
    transducer_config={},
    decode_config={
        # Beam search over the 4 best partial hypotheses; "greedy_search"
        # would be the faster, lower-accuracy alternative.
        "method": "modified_beam_search",
        "num_active_paths": 4
    }
)

# --- Create Recognizer and Process Audio ---
# NOTE(review): recent sherpa-onnx versions construct recognizers via
# classmethods (e.g. OfflineRecognizer.from_transducer) — confirm that
# passing a config object positionally is supported by this release.
recognizer = OfflineRecognizer(recognizer_config)

# Read the audio file
# The asserts pin the input format: 16 kHz, mono, 16-bit PCM.
with wave.open(AUDIO_FILE, "rb") as f:
    assert f.getframerate() == 16000, f.getframerate()
    assert f.getnchannels() == 1, f.getnchannels()
    assert f.getsampwidth() == 2, f.getsampwidth()
    n_samples = f.getnframes()
    audio_bytes = f.readframes(n_samples)  # raw little-endian int16 PCM

# Create an audio stream and pass the audio data
stream = recognizer.create_stream()
# Data expects float32 array, scale int16 to float32 range
import numpy as np
# Normalize int16 PCM into [-1, 1) float32 before feeding the stream.
samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
stream.accept_waveform(16000, samples)

# Decode the stream
recognizer.decode_stream(stream)

# Get the result
result = stream.result.text
print(f"Recognition Result: {result}")

view raw JSON →