Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions whisperx/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
SingleAlignedSegment,
SingleWordSegment,
SegmentData,
ProgressCallback,
)
import nltk
from nltk.data import load as nltk_load
Expand Down Expand Up @@ -122,6 +123,7 @@ def align(
return_char_alignments: bool = False,
print_progress: bool = False,
combined_progress: bool = False,
progress_callback: ProgressCallback = None,
) -> AlignedTranscriptionResult:
"""
Align phoneme recognition predictions to known transcription.
Expand Down Expand Up @@ -376,6 +378,9 @@ def align(
agg_dict["avg_logprob"] = "first"
aligned_subsegments= aligned_subsegments.groupby(["start", "end"], as_index=False).agg(agg_dict)
aligned_subsegments = aligned_subsegments.to_dict('records')
if progress_callback is not None:
progress_callback(((sdx + 1) / total_segments) * 100)

aligned_segments += aligned_subsegments

# create word_segments list
Expand Down
5 changes: 4 additions & 1 deletion whisperx/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from transformers.pipelines.pt_utils import PipelineIterator

from whisperx.audio import N_SAMPLES, SAMPLE_RATE, load_audio, log_mel_spectrogram
from whisperx.schema import SingleSegment, TranscriptionResult
from whisperx.schema import SingleSegment, TranscriptionResult, ProgressCallback
from whisperx.vads import Vad, Silero, Pyannote
from whisperx.log_utils import get_logger

Expand Down Expand Up @@ -205,6 +205,7 @@ def transcribe(
print_progress=False,
combined_progress=False,
verbose=False,
progress_callback: ProgressCallback = None,
) -> TranscriptionResult:
if isinstance(audio, str):
audio = load_audio(audio)
Expand Down Expand Up @@ -268,6 +269,8 @@ def data(audio, segments):
base_progress = ((idx + 1) / total_segments) * 100
percent_complete = base_progress / 2 if combined_progress else base_progress
print(f"Progress: {percent_complete:.2f}%...")
if progress_callback is not None:
progress_callback(((idx + 1) / total_segments) * 100)
text = out['text']
avg_logprob = out['avg_logprob']
if batch_size in [0, 1, None]:
Expand Down
26 changes: 25 additions & 1 deletion whisperx/diarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch

from whisperx.audio import load_audio, SAMPLE_RATE
from whisperx.schema import TranscriptionResult, AlignedTranscriptionResult
from whisperx.schema import TranscriptionResult, AlignedTranscriptionResult, ProgressCallback
from whisperx.log_utils import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -109,6 +109,7 @@ def __call__(
min_speakers: Optional[int] = None,
max_speakers: Optional[int] = None,
return_embeddings: bool = False,
progress_callback: ProgressCallback = None,
) -> Union[tuple[pd.DataFrame, Optional[dict[str, list[float]]]], pd.DataFrame]:
"""
Perform speaker diarization on audio.
Expand All @@ -119,6 +120,7 @@ def __call__(
min_speakers: Minimum number of speakers to detect
max_speakers: Maximum number of speakers to detect
return_embeddings: Whether to return speaker embeddings
progress_callback: Optional callable receiving a float (0-100) with progress percentage

Returns:
If return_embeddings is True:
Expand All @@ -133,13 +135,35 @@ def __call__(
'sample_rate': SAMPLE_RATE
}

hook = None
if progress_callback is not None:
# pyannote's diarization has two progress-trackable steps, each with
# its own completed/total counter that resets between steps. Map each
# step into a sub-range so progress is monotonic and meaningful.
_STEP_RANGES = {
"segmentation": (0.0, 50.0),
"embeddings": (50.0, 99.0),
}
last_pct = [0.0]
# Closure passed to pyannote's diarization pipeline as its progress hook.
# Maps each pipeline step's (completed, total) counter — which resets
# between steps — into the sub-range configured in _STEP_RANGES, so the
# reported percentage stays monotonic across steps, then forwards it to
# progress_callback.
# NOTE(review): the signature mirrors pyannote's hook protocol; the
# step_artifact and file arguments are accepted but unused here.
def hook(step_name, step_artifact, file=None, total=None, completed=None):
    # Presumably some hook invocations carry no counters (e.g. step
    # start/end notifications) — skip those rather than divide by zero.
    if total is not None and completed is not None and total > 0:
        # Unknown step names fall back to spanning the full 0–99 range.
        offset, end = _STEP_RANGES.get(step_name, (0.0, 99.0))
        # min(..., 1.0) clamps in case `completed` overshoots `total`.
        pct = offset + min(completed / total, 1.0) * (end - offset)
        # Only emit strictly increasing values; last_pct is a one-element
        # list so this closure can mutate the shared high-water mark.
        if pct > last_pct[0]:
            last_pct[0] = pct
            progress_callback(pct)

output = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
**({"hook": hook} if hook is not None else {}),
)

if progress_callback is not None:
progress_callback(100.0)

diarization = output.speaker_diarization
embeddings = output.speaker_embeddings if return_embeddings else None

Expand Down
4 changes: 3 additions & 1 deletion whisperx/schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import TypedDict, Optional, List, Tuple
from typing import Callable, TypedDict, Optional, List, Tuple

ProgressCallback = Optional[Callable[[float], None]]

try:
from typing import NotRequired
Expand Down
Loading