Skip to content

Quick Start

Behnam Ebrahimi edited this page Mar 29, 2026 · 1 revision

Quick Start

Python — Simplest Usage

from whisper_mlx import LightningWhisperMLX

# Initialize with batched decoding for maximum speed
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)

# Transcribe audio
result = whisper.transcribe("audio.mp3")
print(result["text"])

Python — With Options

from whisper_mlx import LightningWhisperMLX

whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)

# With word-level timestamps
result = whisper.transcribe("audio.mp3", language="en", word_timestamps=True)

# Access word timings
for segment in result["segments"]:
    for word in segment.get("words", []):
        print(f"  [{word['start']:.2f} -> {word['end']:.2f}] {word['word']}")

Python — Full API

from whisper_mlx import transcribe

result = transcribe(
    "audio.mp3",
    path_or_hf_repo="mlx-community/whisper-turbo",
    batch_size=6,
    language="en",
    word_timestamps=True,
)

print(result["text"])
for segment in result["segments"]:
    print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text']}")

CLI — Basic

# Basic transcription
vayu audio.mp3

# Batched decoding (3-5x faster)
vayu audio.mp3 --batch-size 12

# Specify model and output format
vayu audio.mp3 --model mlx-community/distil-whisper-large-v3 --output-format srt

# Multiple files
vayu audio1.mp3 audio2.mp3 --output-dir ./transcripts

# Word-level timestamps
vayu audio.mp3 --word-timestamps True

# Translate to English
vayu audio.mp3 --task translate

Result Format

The Python API returns a dictionary with the following structure; the CLI writes the same transcription data in the format selected by --output-format:

{
    "text": "Full transcription text...",
    "segments": [
        {
            "id": 0,
            "seek": 0,
            "start": 0.0,
            "end": 5.2,
            "text": " Segment text",
            "tokens": [50364, 1025, ...],
            "temperature": 0.0,
            "avg_logprob": -0.42,
            "compression_ratio": 1.8,
            "no_speech_prob": 0.05,
            "words": [  # Only with word_timestamps=True
                {"word": "Segment", "start": 0.1, "end": 0.5, "probability": 0.98},
                {"word": "text", "start": 0.5, "end": 0.9, "probability": 0.95}
            ]
        }
    ],
    "language": "en"
}

Clone this wiki locally