WhisperX

3.8.5 · active · verified Sun Apr 12

WhisperX is a Python library that provides time-accurate Automatic Speech Recognition (ASR) using OpenAI's Whisper model, enhanced with speaker diarization. It supports a range of models, languages, and device configurations (CPU/GPU) to offer high-quality transcription with precise timestamps and speaker identification. The current version is 3.8.5, and it maintains an active release cadence with frequent updates.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates loading an ASR model, transcribing a sample audio file (downloaded automatically for runnability), and aligning the transcription. It also includes an optional step for speaker diarization, which requires a Hugging Face authentication token for model downloads. Before running this example, ensure `requests` is installed (`pip install requests`) and that `ffmpeg` is available on your system.

import whisperx
import torch
import os
from pathlib import Path
import requests

# --- Setup for a runnable example ---
# Path for the temporary audio file
temp_audio_path = Path("temp_whisperx_example.wav")
# A small WAV file from Mozilla DeepSpeech samples
wav_url = "https://github.com/mozilla/DeepSpeech/raw/master/samples/audio/8455-210777-0068.wav"

# Download a small audio file if it doesn't exist
if not temp_audio_path.exists():
    print(f"Downloading sample audio from {wav_url}...")
    try:
        # A timeout keeps the script from hanging indefinitely on a stalled connection.
        response = requests.get(wav_url, stream=True, timeout=30)
        response.raise_for_status()
        with open(temp_audio_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Sample audio downloaded.")
    except requests.exceptions.RequestException as e:
        # Remove any partially written file so a rerun retries the download
        # instead of mistaking the truncated file for valid audio.
        if temp_audio_path.exists():
            temp_audio_path.unlink()
        print(f"Failed to download sample audio: {e}")
        print("Please ensure you have an internet connection or manually place a WAV file at temp_whisperx_example.wav")
        # raise SystemExit works even when the site module's exit() helper is absent.
        raise SystemExit(1)

# --- WhisperX Core Logic ---
# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # Half precision is the fast path on CUDA hardware.
    compute_type = "float16"
else:
    # int8 keeps CPU (or low-VRAM GPU) inference tractable.
    compute_type = "int8"
batch_size = 16  # Reduce if low on GPU VRAM

print(f"\nLoading WhisperX model ('base') on {device} with {compute_type} precision...")
# The 'base' model keeps download size and memory needs small for a quickstart.
asr_model = whisperx.load_model("base", device, compute_type=compute_type, language="en")

print(f"Loading audio from {temp_audio_path}...")
waveform = whisperx.load_audio(str(temp_audio_path))

print("Transcribing audio...")
transcription = asr_model.transcribe(waveform, batch_size=batch_size)

print("Loading alignment model and aligning segments...")
# The alignment model is chosen from the language Whisper detected/was given.
align_model, align_metadata = whisperx.load_align_model(language_code=transcription["language"], device=device)
aligned_result = whisperx.align(transcription["segments"], align_model, align_metadata, waveform, device)

print("\nTranscription Result (Aligned):")
for segment in aligned_result["segments"]:
    print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}")

# Optional: Add diarization for speaker assignment
# Diarization models from Hugging Face may require an auth token.
# Set your Hugging Face token as an environment variable (e.g., HF_TOKEN="hf_xxxx")
hf_token = os.environ.get("HF_TOKEN", "")

if hf_token:
    print("\nPerforming diarization (speaker assignment)...")
    # Diarization requires an internet connection to download models
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
    diarize_segments = diarize_model(str(temp_audio_path), min_speakers=1, max_speakers=2)
    # Fix: the WhisperX public API is assign_word_speakers (there is no
    # whisperx.assign_speakers); it merges speaker labels into the aligned result.
    result_with_speakers = whisperx.assign_word_speakers(diarize_segments, aligned_result)

    print("\nTranscription with Speakers:")
    for segment in result_with_speakers["segments"]:
        print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment.get('speaker', 'UNKNOWN')}: {segment['text']}")
else:
    print("\nSkipping diarization: HF_TOKEN environment variable not found. Diarization models may require it.")

# --- Cleanup ---
# Best-effort removal of the temporary audio file; the guard prevents a
# FileNotFoundError from crashing the script if the file is already gone.
if temp_audio_path.exists():
    temp_audio_path.unlink()
print("\nWhisperX quickstart completed.")

view raw JSON →