{"id":28006,"library":"omnivoice","title":"OmniVoice","description":"OmniVoice is a zero-shot text-to-speech library using diffusion language models. It supports multilingual TTS with voice cloning from short audio samples. Current version 0.1.5, actively maintained. Requires Python >= 3.10.","status":"active","version":"0.1.5","language":"python","source_language":"en","source_url":"https://github.com/k2-fsa/OmniVoice","tags":["text-to-speech","zero-shot","voice-cloning","multilingual","diffusion"],"install":[{"cmd":"pip install omnivoice","lang":"bash","label":"Install from PyPI"}],"dependencies":[{"reason":"Core dependency for model inference and training.","package":"torch","optional":false},{"reason":"Audio processing backend; resampling and loading audio files.","package":"torchaudio","optional":false},{"reason":"Required for tokenizer and model components.","package":"transformers","optional":false}],"imports":[{"note":"Direct import path changed in early versions.","wrong":"from omnivoice.model import OmniVoice","symbol":"OmniVoice","correct":"from omnivoice import OmniVoice"},{"note":"Inference function available at top-level.","wrong":"from omnivoice.inference import infer","symbol":"infer","correct":"from omnivoice import infer"}],"quickstart":{"code":"from omnivoice import OmniVoice, infer\n\n# Load model\nmodel = OmniVoice.from_pretrained(\"k2-fsa/OmniVoice\")\n\n# Synthesize speech\naudio = infer(model, text=\"Hello world\", reference_audio=\"ref.wav\", reference_text=\"The quick brown fox\")\n\n# Save to file\nimport torchaudio\ntorchaudio.save(\"output.wav\", audio.unsqueeze(0), 24000)","lang":"python","description":"Basic TTS inference with voice cloning."},"warnings":[{"fix":"Set OMNIVOICE_CACHE_DIR or download model files manually.","message":"Model loading without internet will fail if cache is missing. Use local pretrained path explicitly.","severity":"breaking","affected_versions":"<=0.1.5"},{"fix":"Resample audio to 24000 Hz and convert to mono before passing.","message":"Reference audio must be monophonic and at 24kHz sample rate. Mismatch causes quality degradation.","severity":"gotcha","affected_versions":"all"},{"fix":"Set device='cpu' explicitly when using MPS.","message":"Inference on MPS (Apple Silicon) may fail due to unsupported operations. Use CPU or CUDA.","severity":"gotcha","affected_versions":"<=0.1.5"},{"fix":"Remove `load_asr=True` from `OmniVoice.from_pretrained`.","message":"The `load_asr` argument in model loading is deprecated. ASR model is now loaded automatically.","severity":"deprecated","affected_versions":">=0.1.5"}],"env_vars":null,"last_verified":"2026-05-09T00:00:00.000Z","next_check":"2026-08-07T00:00:00.000Z","problems":[{"fix":"Trim reference audio to 3-30 seconds and ensure the text corresponds exactly.","cause":"Reference audio and text lengths do not align, or audio is too long (>30s recommended).","error":"RuntimeError: Audio length mismatch"},{"fix":"Install torchaudio >= 0.12: pip install --upgrade torchaudio","cause":"torchaudio version is too old (<0.12) for resample function.","error":"AttributeError: module 'torchaudio' has no attribute 'resample'"},{"fix":"Use 'from omnivoice import OmniVoice' instead of 'from omnivoice.model import OmniVoice'.","cause":"Incorrect import path; older documentation showed wrong path.","error":"ImportError: cannot import name 'OmniVoice' from 'omnivoice'"},{"fix":"Convert reference audio to mono with torchaudio.functional.to_mono().","cause":"Passing stereo audio as reference; expects mono.","error":"ValueError: The truth value of an array with more than one element is ambiguous"},{"fix":"Ensure internet connection for first download, or set OMNIVOICE_CACHE_DIR to a valid path.","cause":"Model not downloaded or cache path misconfigured.","error":"FileNotFoundError: No such file or directory: 'path/to/model'"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}