-
Notifications
You must be signed in to change notification settings - Fork 0
Integration Recipes
Behnam Ebrahimi edited this page Mar 29, 2026
·
2 revisions
Recipes for integrating Vayu into larger workflows and applications.
# Convert to 16kHz mono WAV (optimal for Vayu)
ffmpeg -i input.mkv -vn -ar 16000 -ac 1 -f wav output.wav
vayu output.wav --batch-size 12

# Extract audio track from MP4
ffmpeg -i video.mp4 -vn -acodec libmp3lame audio.mp3
vayu audio.mp3 --batch-size 12 -f srt

# Extract 5 minutes starting at 1:30:00
ffmpeg -i long_recording.mp3 -ss 1:30:00 -t 5:00 -acodec copy segment.mp3
vayu segment.mp3 --batch-size 12

# Stream audio from ffmpeg straight into vayu via stdin
ffmpeg -i video.mkv -vn -ar 16000 -ac 1 -f wav - | vayu - --output-name transcript

import tempfile
from pathlib import Path
from fastapi import FastAPI, UploadFile
from whisper_mlx import LightningWhisperMLX

app = FastAPI()
# Load the model once at import time so every request reuses the cached weights.
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)
@app.post("/transcribe")
async def transcribe(file: UploadFile, language: str = None):
with tempfile.NamedTemporaryFile(suffix=Path(file.filename).suffix, delete=True) as tmp:
tmp.write(await file.read())
tmp.flush()
result = whisper.transcribe(tmp.name, language=language)
return {"text": result["text"], "language": result["language"]}Run with:
# Using uv
uv pip install fastapi uvicorn
# Using pip
pip install fastapi uvicorn
uvicorn server:app --host 0.0.0.0 --port 8000

Test:
curl -X POST http://localhost:8000/transcribe \
-F "file=@audio.mp3" \
-F "language=en"

import tempfile
from flask import Flask, request, jsonify
from whisper_mlx import LightningWhisperMLX

app = Flask(__name__)
# Single shared model instance, loaded once so requests don't pay startup cost.
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)
@app.route("/transcribe", methods=["POST"])
def transcribe():
    """Transcribe the uploaded ``audio`` form file.

    Returns:
        JSON with the transcribed ``text`` and detected ``language``.
    """
    audio = request.files["audio"]  # Flask answers 400 if the field is missing
    language = request.form.get("language")
    # NOTE(review): suffix is hard-coded to .mp3; pass the real upload
    # extension if clients may send other formats — TODO confirm the decoder
    # sniffs by content rather than extension.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as tmp:
        audio.save(tmp.name)
        result = whisper.transcribe(tmp.name, language=language)
    return jsonify(text=result["text"], language=result["language"])
if __name__ == "__main__":
    app.run(port=5000)

Process a directory of audio files with progress tracking:
#!/usr/bin/env python3
"""Batch transcribe all audio files in a directory."""
import sys
import json
from pathlib import Path
from whisper_mlx import LightningWhisperMLX
AUDIO_EXTENSIONS = {".mp3", ".wav", ".flac", ".m4a", ".ogg", ".webm", ".aac", ".wma"}


def main(input_dir: str, output_dir: str, model: str = "distil-large-v3") -> None:
    """Transcribe every audio file in *input_dir*, writing results to *output_dir*.

    For each audio file, a ``<stem>.txt`` (plain text) and ``<stem>.json``
    (full metadata) are produced. Failures are collected and reported at the
    end instead of aborting the batch.

    Args:
        input_dir: Directory scanned (non-recursively) for audio files.
        output_dir: Destination directory; created if missing.
        model: Whisper model name passed to LightningWhisperMLX.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    files = sorted(
        f for f in input_path.iterdir()
        if f.suffix.lower() in AUDIO_EXTENSIONS
    )
    if not files:
        print(f"No audio files found in {input_dir}")
        return

    # Load the model only after we know there is work to do.
    whisper = LightningWhisperMLX(model=model, batch_size=12)
    errors = []
    for i, audio_file in enumerate(files, 1):
        print(f"[{i}/{len(files)}] {audio_file.name}")
        try:
            result = whisper.transcribe(str(audio_file))
            # Save text (explicit encoding so output is portable).
            (output_path / f"{audio_file.stem}.txt").write_text(
                result["text"], encoding="utf-8"
            )
            # Save JSON with full metadata (segments, timestamps, ...).
            with open(output_path / f"{audio_file.stem}.json", "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
        except Exception as e:  # keep the batch going; report failures at the end
            print(f"  ERROR: {e}")
            errors.append((audio_file.name, str(e)))

    print(f"\nDone: {len(files) - len(errors)}/{len(files)} succeeded")
    if errors:
        print("Failures:")
        for name, err in errors:
            print(f"  {name}: {err}")
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])

Usage:
python batch_transcribe.py ./recordings ./transcripts

Transcribe an entire YouTube playlist:
#!/bin/bash
# Download and transcribe a YouTube playlist.
# Usage: ./script.sh <playlist-url>
set -euo pipefail  # stop on the first failed download rather than transcribing nothing

# Fail fast with a usage message if no URL was given.
PLAYLIST_URL="${1:?usage: $0 <playlist-url>}"
OUTPUT_DIR="./youtube_transcripts"
mkdir -p "$OUTPUT_DIR"

# Download audio only
yt-dlp -x --audio-format mp3 -o "$OUTPUT_DIR/%(title)s.%(ext)s" "$PLAYLIST_URL"

# Transcribe all
vayu "$OUTPUT_DIR"/*.mp3 --batch-size 12 -f all -o "$OUTPUT_DIR"

Build a searchable index from transcriptions:
import json
from whisper_mlx import LightningWhisperMLX

whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)
# word_timestamps=True is required for per-word entries inside each segment.
result = whisper.transcribe("lecture.mp3", word_timestamps=True)

# Build timestamp-indexed search data: one entry per segment with its
# time bounds and the (whitespace-stripped) words it contains.
index = [
    {
        "text": seg["text"].strip(),
        "start": seg["start"],
        "end": seg["end"],
        "words": [w["word"].strip() for w in seg.get("words", [])],
    }
    for seg in result["segments"]
]
# Search function
def search(query, index):
    """Return '[MM:SS] text' lines for entries containing *query* (case-insensitive).

    Args:
        query: Substring to look for in each entry's text.
        index: Iterable of dicts with at least "text" (str) and "start"
            (seconds, non-negative number) keys.

    Returns:
        List of formatted matches, in index order.
    """
    needle = query.lower()  # hoisted: lowercase the query once, not per entry
    results = []
    for entry in index:
        if needle in entry["text"].lower():
            mins, secs = divmod(int(entry["start"]), 60)
            results.append(f"[{mins:02d}:{secs:02d}] {entry['text']}")
    return results
# Example
for match in search("machine learning", index):
    print(match)

Watch a directory and auto-transcribe new files:
#!/bin/bash
# watch_and_transcribe.sh
# Poll a directory and transcribe any audio file that has no transcript yet.
# Run with: ./watch_and_transcribe.sh /path/to/watch /path/to/output
WATCH_DIR="$1"
OUTPUT_DIR="$2"
mkdir -p "$OUTPUT_DIR"
# Find files not yet transcribed
# NOTE(review): if nothing matches, the brace glob stays as literal patterns;
# the -f test below skips those non-existent paths, so the loop is safe.
for audio in "$WATCH_DIR"/*.{mp3,wav,m4a,flac}; do
[ -f "$audio" ] || continue
# Strip directory and extension to get the transcript's base name.
base=$(basename "${audio%.*}")
if [ ! -f "$OUTPUT_DIR/$base.txt" ]; then
echo "Transcribing: $audio"
vayu "$audio" --batch-size 12 -f all -o "$OUTPUT_DIR"
fi
done

Add to crontab:
# Run every 15 minutes
*/15 * * * * /path/to/watch_and_transcribe.sh /incoming /transcripts

For applications that need fast startup, pre-load the model once:
from whisper_mlx import LightningWhisperMLX

# Load once at application startup
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)

# Reuse for all transcriptions (model stays cached)
# Only the first call pays the load cost; later calls reuse the same instance.
result1 = whisper.transcribe("audio1.mp3")
result2 = whisper.transcribe("audio2.mp3")
result3 = whisper.transcribe("audio3.mp3")

The model is cached by ModelHolder — subsequent calls with the same model/dtype skip loading entirely.