
Integration Recipes

Behnam Ebrahimi edited this page Mar 29, 2026 · 2 revisions

Recipes for integrating Vayu into larger workflows and applications.

FFmpeg Preprocessing

Convert Any Format to Optimal Input

# Convert to 16kHz mono WAV (optimal for Vayu)
ffmpeg -i input.mkv -vn -ar 16000 -ac 1 -f wav output.wav
vayu output.wav --batch-size 12

Extract Audio from Video

# Extract audio track from MP4
ffmpeg -i video.mp4 -vn -acodec libmp3lame audio.mp3
vayu audio.mp3 --batch-size 12 -f srt

Trim Before Transcribing

# Extract 5 minutes starting at 1:30:00
ffmpeg -i long_recording.mp3 -ss 1:30:00 -t 5:00 -acodec copy segment.mp3
vayu segment.mp3 --batch-size 12

Pipe Directly (No Intermediate File)

ffmpeg -i video.mkv -vn -ar 16000 -ac 1 -f wav - | vayu - --output-name transcript
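The same pipeline can be driven from Python with the standard `subprocess` module. A minimal sketch — the helper names are illustrative, and it assumes `ffmpeg` and `vayu` are on `PATH`:

```python
import subprocess

def ffmpeg_decode_cmd(input_path: str) -> list[str]:
    """ffmpeg arguments: drop video, resample to 16 kHz mono WAV on stdout."""
    return ["ffmpeg", "-i", input_path, "-vn", "-ar", "16000",
            "-ac", "1", "-f", "wav", "-"]

def vayu_stdin_cmd(output_name: str) -> list[str]:
    """vayu arguments: read WAV from stdin, name the transcript."""
    return ["vayu", "-", "--output-name", output_name]

def transcribe_video(input_path: str, output_name: str = "transcript") -> None:
    """Pipe ffmpeg's WAV output straight into vayu, with no temp file."""
    ffmpeg = subprocess.Popen(ffmpeg_decode_cmd(input_path),
                              stdout=subprocess.PIPE)
    subprocess.run(vayu_stdin_cmd(output_name), stdin=ffmpeg.stdout, check=True)
    ffmpeg.stdout.close()
    ffmpeg.wait()
```

Calling `transcribe_video("video.mkv")` is equivalent to the shell pipe above, but lets you add retries or logging around each step.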

Web API with FastAPI

import tempfile
from pathlib import Path

from fastapi import FastAPI, UploadFile
from whisper_mlx import LightningWhisperMLX

app = FastAPI()
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)

@app.post("/transcribe")
async def transcribe(file: UploadFile, language: str | None = None):
    suffix = Path(file.filename or "audio").suffix
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=True) as tmp:
        tmp.write(await file.read())
        tmp.flush()
        result = whisper.transcribe(tmp.name, language=language)
    return {"text": result["text"], "language": result["language"]}

Run with:

# Using uv
uv pip install fastapi uvicorn

# Using pip
pip install fastapi uvicorn

uvicorn server:app --host 0.0.0.0 --port 8000

Test:

curl -X POST http://localhost:8000/transcribe \
    -F "file=@audio.mp3" \
    -F "language=en"

Flask API

import tempfile
from pathlib import Path

from flask import Flask, request, jsonify
from whisper_mlx import LightningWhisperMLX

app = Flask(__name__)
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)

@app.route("/transcribe", methods=["POST"])
def transcribe():
    audio = request.files["audio"]
    language = request.form.get("language")

    # Preserve the upload's extension so the decoder sees the right format
    suffix = Path(audio.filename or "audio.mp3").suffix or ".mp3"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=True) as tmp:
        audio.save(tmp.name)
        result = whisper.transcribe(tmp.name, language=language)

    return jsonify(text=result["text"], language=result["language"])

if __name__ == "__main__":
    app.run(port=5000)

Batch Processing Script

Process a directory of audio files with progress tracking:

#!/usr/bin/env python3
"""Batch transcribe all audio files in a directory."""

import sys
import json
from pathlib import Path
from whisper_mlx import LightningWhisperMLX

AUDIO_EXTENSIONS = {".mp3", ".wav", ".flac", ".m4a", ".ogg", ".webm", ".aac", ".wma"}

def main(input_dir: str, output_dir: str, model: str = "distil-large-v3"):
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    files = sorted(
        f for f in input_path.iterdir()
        if f.suffix.lower() in AUDIO_EXTENSIONS
    )

    if not files:
        print(f"No audio files found in {input_dir}")
        return

    whisper = LightningWhisperMLX(model=model, batch_size=12)
    errors = []

    for i, audio_file in enumerate(files, 1):
        print(f"[{i}/{len(files)}] {audio_file.name}")
        try:
            result = whisper.transcribe(str(audio_file))
            # Save text
            (output_path / f"{audio_file.stem}.txt").write_text(result["text"], encoding="utf-8")
            # Save JSON with full metadata
            with open(output_path / f"{audio_file.stem}.json", "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"  ERROR: {e}")
            errors.append((audio_file.name, str(e)))

    print(f"\nDone: {len(files) - len(errors)}/{len(files)} succeeded")
    if errors:
        print("Failures:")
        for name, err in errors:
            print(f"  {name}: {err}")

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("Usage: batch_transcribe.py <input_dir> <output_dir>")
    main(sys.argv[1], sys.argv[2])

Usage:

python batch_transcribe.py ./recordings ./transcripts
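To make repeated runs incremental, the script can skip files that already have a transcript. A small helper sketch — `pending_files` is an illustrative name, and the extension set mirrors the script above:

```python
from pathlib import Path

AUDIO_EXTENSIONS = {".mp3", ".wav", ".flac", ".m4a", ".ogg", ".webm", ".aac", ".wma"}

def pending_files(input_dir: str, output_dir: str) -> list[Path]:
    """Return audio files in input_dir with no matching .txt in output_dir."""
    out = Path(output_dir)
    return sorted(
        f for f in Path(input_dir).iterdir()
        if f.suffix.lower() in AUDIO_EXTENSIONS
        and not (out / f"{f.stem}.txt").exists()
    )
```

Replacing the `files = sorted(...)` expression in the script with `pending_files(input_dir, output_dir)` makes interrupted batch runs resumable.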

YouTube Bulk Transcription

Transcribe an entire YouTube playlist:

#!/bin/bash
# Download and transcribe a YouTube playlist

PLAYLIST_URL="$1"
OUTPUT_DIR="./youtube_transcripts"
mkdir -p "$OUTPUT_DIR"

# Download audio only
yt-dlp -x --audio-format mp3 -o "$OUTPUT_DIR/%(title)s.%(ext)s" "$PLAYLIST_URL"

# Transcribe all
vayu "$OUTPUT_DIR"/*.mp3 --batch-size 12 -f all -o "$OUTPUT_DIR"

Search Index Builder

Build a searchable index from transcriptions:

import json
from whisper_mlx import LightningWhisperMLX

whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)
result = whisper.transcribe("lecture.mp3", word_timestamps=True)

# Build timestamp-indexed search data
index = []
for seg in result["segments"]:
    index.append({
        "text": seg["text"].strip(),
        "start": seg["start"],
        "end": seg["end"],
        "words": [w["word"].strip() for w in seg.get("words", [])],
    })

# Search function
def search(query, index):
    results = []
    for entry in index:
        if query.lower() in entry["text"].lower():
            mins = int(entry["start"] // 60)
            secs = int(entry["start"] % 60)
            results.append(f"[{mins:02d}:{secs:02d}] {entry['text']}")
    return results

# Example
for match in search("machine learning", index):
    print(match)
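The substring search above matches a single phrase; it can be extended to require every term of a multi-word query, with a reusable timestamp formatter. A sketch (the function names are illustrative, and the index structure is the one built above):

```python
def fmt_ts(seconds: float) -> str:
    """Format seconds as MM:SS for display."""
    mins, secs = divmod(int(seconds), 60)
    return f"{mins:02d}:{secs:02d}"

def search_all_terms(query: str, index: list[dict]) -> list[dict]:
    """Return entries whose text contains every whitespace-separated term."""
    terms = [t.lower() for t in query.split()]
    return [e for e in index if all(t in e["text"].lower() for t in terms)]
```

For example, `search_all_terms("gradient descent", index)` returns only segments mentioning both words, and `fmt_ts(entry["start"])` renders the jump-to timestamp.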

Cron Job for Automated Transcription

Watch a directory and auto-transcribe new files:

#!/bin/bash
# watch_and_transcribe.sh
# Run with: ./watch_and_transcribe.sh /path/to/watch /path/to/output

WATCH_DIR="$1"
OUTPUT_DIR="$2"

mkdir -p "$OUTPUT_DIR"

# Find files not yet transcribed
for audio in "$WATCH_DIR"/*.{mp3,wav,m4a,flac}; do
    [ -f "$audio" ] || continue
    base=$(basename "${audio%.*}")
    if [ ! -f "$OUTPUT_DIR/$base.txt" ]; then
        echo "Transcribing: $audio"
        vayu "$audio" --batch-size 12 -f all -o "$OUTPUT_DIR"
    fi
done

Add to crontab:

# Run every 15 minutes
*/15 * * * * /path/to/watch_and_transcribe.sh /incoming /transcripts
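If a transcription run can take longer than the cron interval, overlapping runs may process the same file twice. A portable guard is an atomic lock via `mkdir` at the top of the script — a minimal sketch (the lock path is illustrative):

```shell
#!/bin/bash
# Atomic lock: mkdir either creates the directory or fails, never both.
LOCK_DIR="${TMPDIR:-/tmp}/vayu_watch.lock"
if ! mkdir "$LOCK_DIR" 2>/dev/null; then
    echo "Another run is in progress; exiting."
    exit 0
fi

# ... the watch-and-transcribe loop goes here ...

rmdir "$LOCK_DIR"
```

A stale lock (e.g. after a crash) must be removed by hand, which is the usual trade-off of this pattern.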

Pre-loading Models

For applications that need fast startup, pre-load the model once:

from whisper_mlx import LightningWhisperMLX

# Load once at application startup
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)

# Reuse for all transcriptions (model stays cached)
result1 = whisper.transcribe("audio1.mp3")
result2 = whisper.transcribe("audio2.mp3")
result3 = whisper.transcribe("audio3.mp3")

The model is cached by ModelHolder — subsequent calls with the same model/dtype skip loading entirely.
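If the pre-loaded model is shared across threads (for example, behind a web server handling concurrent requests), serializing access is a safe default unless the backend is known to be thread-safe. A minimal wrapper sketch — the class name is illustrative and works for any object with a `transcribe` method:

```python
import threading

class SerializedTranscriber:
    """Serialize access to a shared transcriber across threads."""

    def __init__(self, backend):
        self._backend = backend
        self._lock = threading.Lock()

    def transcribe(self, path, **kwargs):
        # Only one thread may use the backend at a time.
        with self._lock:
            return self._backend.transcribe(path, **kwargs)
```

Wrapping the shared instance once, e.g. `whisper = SerializedTranscriber(LightningWhisperMLX(...))`, keeps the rest of the calling code unchanged.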
