Skip to content
3 changes: 3 additions & 0 deletions backend/app/api/docs/stt_evaluation/update_sample.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Update an STT sample's language and/or ground truth transcription.

Only the provided fields will be updated. Fields set to `null` in the request will not modify the existing value.
17 changes: 16 additions & 1 deletion backend/app/api/docs/tts_evaluation/update_feedback.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
Update human feedback on a TTS synthesis result.
Update human feedback and score on a TTS synthesis result.

Only the provided fields will be updated. Fields omitted from the request will not modify the existing value. Sending a field as `null` will clear its value.

Fields:
- **is_correct**: Whether the synthesized audio quality is acceptable (null to clear)
- **comment**: Optional feedback comment
- **score**: Evaluation metrics for the synthesized audio

**Example request:**
```json
{
"is_correct": true,
"comment": "string",
"score": {
    "Speech Naturalness": "medium",
    "Pronunciation Accuracy": "high"
Comment on lines +16 to +17
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Use concrete enum values in the JSON example.

On Line 16 and Line 17, "low | medium | high" reads like a literal payload value and can lead to invalid requests when copied directly.

Suggested doc tweak
   "score": {
-    "Speech Naturalness": "low | medium | high",
-    "Pronunciation Accuracy": "low | medium | high"
+    "Speech Naturalness": "medium",
+    "Pronunciation Accuracy": "high"
   }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"Speech Naturalness": "low | medium | high",
"Pronunciation Accuracy": "low | medium | high"
"score": {
"Speech Naturalness": "medium",
"Pronunciation Accuracy": "high"
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/api/docs/tts_evaluation/update_feedback.md` around lines 16 - 17,
Replace the placeholder pipe-separated strings in the JSON example with concrete
enum values: change the values for "Speech Naturalness" and "Pronunciation
Accuracy" from "low | medium | high" to a single valid example such as "low" (or
"medium"/"high") so the example is a valid JSON payload and unambiguous when
copied.

}
}
```
1 change: 1 addition & 0 deletions backend/app/api/routes/evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def _dataset_to_response(
return DatasetUploadResponse(
dataset_id=dataset.id,
dataset_name=dataset.name,
description=dataset.description,
total_items=dataset.dataset_metadata.get("total_items_count", 0),
original_items=dataset.dataset_metadata.get("original_items_count", 0),
duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1),
Expand Down
12 changes: 7 additions & 5 deletions backend/app/api/routes/languages.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging

from fastapi import APIRouter, HTTPException
from fastapi import APIRouter

from app.api.deps import AuthContextDep, SessionDep
from app.crud.language import get_language_by_id, get_languages
Expand Down Expand Up @@ -37,8 +37,10 @@ def get_language(session: SessionDep, auth_context: AuthContextDep, language_id:
"""
Retrieve a language by ID.
"""
language = get_language_by_id(session=session, language_id=language_id)
if language is None:
logger.error(f"[get_language] Language not found | language_id={language_id}")
raise HTTPException(status_code=404, detail="Language not found")
language = get_language_by_id(
session=session,
language_id=language_id,
status_code=404,
detail="Language not found",
)
return APIResponse.success_response(language)
58 changes: 50 additions & 8 deletions backend/app/api/routes/stt_evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
get_samples_by_dataset_id,
get_stt_dataset_by_id,
list_stt_datasets,
update_stt_sample,
)
from app.models.stt_evaluation import (
STTDatasetCreate,
STTDatasetPublic,
STTDatasetWithSamples,
STTSamplePublic,
STTSampleUpdate,
)
from app.services.stt_evaluations.dataset import upload_stt_dataset
from app.utils import APIResponse, load_description
Expand All @@ -43,13 +45,7 @@ def create_dataset(
"""Create an STT evaluation dataset."""
# Validate language_id
if dataset_create.language_id is not None:
language = get_language_by_id(
session=session, language_id=dataset_create.language_id
)
if not language:
raise HTTPException(
status_code=400, detail="Invalid language_id: language not found"
)
get_language_by_id(session=session, language_id=dataset_create.language_id)

dataset, samples = upload_stt_dataset(
session=session,
Expand Down Expand Up @@ -165,7 +161,6 @@ def get_dataset(
session=session, project_id=auth_context.project_.id
)

samples = []
for s in sample_records:
signed_url = None
if storage and s.file_id in file_map:
Expand Down Expand Up @@ -209,3 +204,50 @@ def get_dataset(
),
metadata={"samples_total": samples_total},
)


@router.patch(
    "/samples/{sample_id}",
    response_model=APIResponse[STTSamplePublic],
    dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
    summary="Update STT sample",
    description=load_description("stt_evaluation/update_sample.md"),
)
def update_sample(
    session: SessionDep,
    auth_context: AuthContextDep,
    sample_id: int,
    sample_update: STTSampleUpdate = Body(...),
) -> APIResponse[STTSamplePublic]:
    """Partially update an STT sample's language and/or ground truth.

    Fields left as ``None`` in the request body are not modified; a 404 is
    returned when the sample does not exist within the caller's org/project.
    """
    logger.info(f"[update_sample] Updating sample | sample_id: {sample_id}")

    # Validate the language up front; the CRUD helper raises when it is unknown.
    if sample_update.language_id is not None:
        get_language_by_id(session=session, language_id=sample_update.language_id)

    updated = update_stt_sample(
        session=session,
        sample_id=sample_id,
        org_id=auth_context.organization_.id,
        project_id=auth_context.project_.id,
        language_id=sample_update.language_id,
        ground_truth=sample_update.ground_truth,
    )

    if not updated:
        raise HTTPException(status_code=404, detail="Sample not found")

    payload = STTSamplePublic(
        id=updated.id,
        file_id=updated.file_id,
        language_id=updated.language_id,
        ground_truth=updated.ground_truth,
        sample_metadata=updated.sample_metadata,
        dataset_id=updated.dataset_id,
        organization_id=updated.organization_id,
        project_id=updated.project_id,
        inserted_at=updated.inserted_at,
        updated_at=updated.updated_at,
    )
    return APIResponse.success_response(data=payload)
39 changes: 30 additions & 9 deletions backend/app/api/routes/tts_evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from app.api.deps import AuthContextDep, SessionDep
from app.api.permissions import Permission, require_permission
from app.core.cloud import get_cloud_storage
from app.crud.language import get_language_by_id
from app.crud.tts_evaluations import (
get_tts_dataset_by_id,
Expand Down Expand Up @@ -38,13 +39,7 @@ def create_dataset(
"""Create a TTS evaluation dataset."""
# Validate language_id if provided
if dataset_create.language_id is not None:
language = get_language_by_id(
session=session, language_id=dataset_create.language_id
)
if not language:
raise HTTPException(
status_code=400, detail="Invalid language_id: language not found"
)
get_language_by_id(session=session, language_id=dataset_create.language_id)

dataset = upload_tts_dataset(
session=session,
Expand All @@ -71,6 +66,9 @@ def list_datasets(
auth_context: AuthContextDep,
limit: int = Query(50, ge=1, le=100, description="Maximum results to return"),
offset: int = Query(0, ge=0, description="Number of results to skip"),
include_signed_url: bool = Query(
False, description="Include signed URL for dataset files"
),
) -> APIResponse[list[TTSDatasetPublic]]:
"""List TTS evaluation datasets."""
datasets, total = list_tts_datasets(
Expand All @@ -81,8 +79,21 @@ def list_datasets(
offset=offset,
)

storage = None
if include_signed_url:
storage = get_cloud_storage(
session=session, project_id=auth_context.project_.id
)

data = []
for dataset in datasets:
signed_url = None
if storage and dataset.object_store_url:
signed_url = storage.get_signed_url(dataset.object_store_url)
data.append(TTSDatasetPublic.from_model(dataset, signed_url=signed_url))
Comment on lines +89 to +93
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Handle signed URL generation failures explicitly.

storage.get_signed_url(...) is an external call that can fail (invalid URL/storage transient). Right now, that bubbles into 500s and can break full list/get responses.

Suggested hardening
@@
-    for dataset in datasets:
+    for dataset in datasets:
         signed_url = None
         if storage and dataset.object_store_url:
-            signed_url = storage.get_signed_url(dataset.object_store_url)
+            try:
+                signed_url = storage.get_signed_url(dataset.object_store_url)
+            except Exception as err:
+                logger.warning(
+                    f"[list_datasets] Signed URL generation failed | dataset_id={dataset.id} | error={err}"
+                )
         data.append(TTSDatasetPublic.from_model(dataset, signed_url=signed_url))
@@
     signed_url = None
     if include_signed_url and dataset.object_store_url:
         storage = get_cloud_storage(
             session=session, project_id=auth_context.project_.id
         )
-        signed_url = storage.get_signed_url(dataset.object_store_url)
+        try:
+            signed_url = storage.get_signed_url(dataset.object_store_url)
+        except Exception as err:
+            logger.error(
+                f"[get_dataset] Signed URL generation failed | dataset_id={dataset_id} | error={err}",
+                exc_info=True,
+            )
+            raise HTTPException(
+                status_code=502, detail="Failed to generate signed URL"
+            ) from err

Also applies to: 133-138

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/api/routes/tts_evaluations/dataset.py` around lines 95 - 99, The
loop that builds response items calls
storage.get_signed_url(dataset.object_store_url) directly inside the routes that
produce TTSDatasetPublic.from_model, which lets exceptions from get_signed_url
bubble up and return 500s; wrap the call in a try/except (or try/catch) to catch
errors from storage.get_signed_url and handle them by logging the error and
setting signed_url to None (or a safe fallback) so the route (the function that
iterates over datasets and calls TTSDatasetPublic.from_model) continues
returning the rest of the items; apply the same pattern to the other
loop/occurrence around the lines referenced (the second block that also calls
storage.get_signed_url) so both list/get handlers are hardened.


return APIResponse.success_response(
data=datasets,
data=data,
metadata={"total": total, "limit": limit, "offset": offset},
)

Expand All @@ -98,6 +109,9 @@ def get_dataset(
session: SessionDep,
auth_context: AuthContextDep,
dataset_id: int,
include_signed_url: bool = Query(
False, description="Include signed URL for dataset file"
),
) -> APIResponse[TTSDatasetPublic]:
"""Get a TTS evaluation dataset."""
dataset = get_tts_dataset_by_id(
Expand All @@ -110,8 +124,15 @@ def get_dataset(
if not dataset:
raise HTTPException(status_code=404, detail="Dataset not found")

signed_url = None
if include_signed_url and dataset.object_store_url:
storage = get_cloud_storage(
session=session, project_id=auth_context.project_.id
)
signed_url = storage.get_signed_url(dataset.object_store_url)

return APIResponse.success_response(
data=TTSDatasetPublic.from_model(dataset),
data=TTSDatasetPublic.from_model(dataset, signed_url=signed_url),
metadata={
"sample_count": (dataset.dataset_metadata or {}).get("sample_count", 0)
},
Expand Down
2 changes: 2 additions & 0 deletions backend/app/api/routes/tts_evaluations/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def update_result_feedback(
update_kwargs["is_correct"] = feedback.is_correct
if "comment" in feedback.model_fields_set:
update_kwargs["comment"] = feedback.comment
if "score" in feedback.model_fields_set:
update_kwargs["score"] = feedback.score

result = update_tts_human_feedback(
session=session,
Expand Down
2 changes: 1 addition & 1 deletion backend/app/crud/evaluations/langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def update_traces_with_cosine_scores(
try:
langfuse.score(
trace_id=trace_id,
name="cosine_similarity",
name="Cosine Similarity",
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use a machine-stable metric key when writing Langfuse scores.

Line 212 sets name="Cosine Similarity". This diverges from the existing canonical key "cosine_similarity" used by consumers and aggregation logic, and can lead to missing/duplicated metric series.

Suggested patch
-                name="Cosine Similarity",
+                name="cosine_similarity",
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
name="Cosine Similarity",
name="cosine_similarity",
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/crud/evaluations/langfuse.py` at line 212, The metric is being
written with a human-readable name ("Cosine Similarity") which breaks downstream
aggregation expecting the canonical machine key "cosine_similarity"; update the
Langfuse metric creation call (the place setting name="Cosine Similarity") to
use the stable key "cosine_similarity" (e.g., key="cosine_similarity" or
metric_key="cosine_similarity" depending on the API) and, if supported, keep the
human label in a separate display_name/name field so consumers continue to read
the friendly label while aggregation uses the machine-stable key.

value=cosine_score,
comment=(
"Cosine similarity between generated output and "
Expand Down
2 changes: 1 addition & 1 deletion backend/app/crud/evaluations/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ async def process_completed_embedding_batch(
eval_run.score = {
"summary_scores": [
{
"name": "cosine_similarity",
"name": "Cosine Similarity",
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Keep score identifiers stable to avoid breaking clients.

Line 395 changes the summary score name to "Cosine Similarity", but downstream code/tests match by exact key "cosine_similarity". This introduces a backward-incompatible API contract change for score filtering.

Suggested patch
-                    "name": "Cosine Similarity",
+                    "name": "cosine_similarity",
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"name": "Cosine Similarity",
"name": "cosine_similarity",
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/crud/evaluations/processing.py` at line 395, The change replaced
the summary score "name" with "Cosine Similarity", which breaks downstream code
that matches the exact key "cosine_similarity"; revert this to the stable
identifier or explicitly include a stable key field. Locate the score dictionary
created in processing.py (the summary score entry around where "name": "Cosine
Similarity" is set) and either restore the original name value that clients
expect or add a new unchanging field such as "key": "cosine_similarity" (keeping
the human-friendly "name" if desired) so downstream filters/tests can continue
to match by the stable identifier.

"avg": round(float(similarity_stats["cosine_similarity_avg"]), 2),
"std": round(float(similarity_stats["cosine_similarity_std"]), 2),
"total_pairs": similarity_stats["total_pairs"],
Expand Down
16 changes: 13 additions & 3 deletions backend/app/crud/language.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from typing import Optional

from fastapi import HTTPException
from sqlmodel import Session, select

from app.models import Language
Expand All @@ -16,10 +17,19 @@ def get_languages(session: Session, skip: int = 0, limit: int = 100) -> list[Lan
return list(session.exec(statement).all())


def get_language_by_id(
    session: Session,
    language_id: int,
    *,
    status_code: int = 400,
    detail: str = "Invalid language_id: language not found",
) -> Language:
    """Retrieve a language by its ID.

    Args:
        session: Database session.
        language_id: Primary key of the language to fetch.
        status_code: HTTP status used when the language is missing
            (keyword-only; defaults to 400 for request-validation callers).
        detail: Error detail used when the language is missing.

    Returns:
        Language: The matching language row.

    Raises:
        HTTPException: With ``status_code``/``detail`` when no language
            with ``language_id`` exists.
    """
    statement = select(Language).where(Language.id == language_id)
    language = session.exec(statement).first()
    if not language:
        raise HTTPException(status_code=status_code, detail=detail)
    return language


def get_language_by_locale(session: Session, locale: str) -> Optional[Language]:
Expand Down
4 changes: 4 additions & 0 deletions backend/app/crud/stt_evaluations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
create_stt_dataset,
create_stt_samples,
get_stt_dataset_by_id,
get_stt_sample_by_id,
list_stt_datasets,
get_samples_by_dataset_id,
update_stt_sample,
)
from .run import (
create_stt_run,
Expand All @@ -30,8 +32,10 @@
"create_stt_dataset",
"create_stt_samples",
"get_stt_dataset_by_id",
"get_stt_sample_by_id",
"list_stt_datasets",
"get_samples_by_dataset_id",
"update_stt_sample",
# Run
"create_stt_run",
"get_stt_run_by_id",
Expand Down
79 changes: 79 additions & 0 deletions backend/app/crud/stt_evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,85 @@ def create_stt_samples(
return created_samples


def get_stt_sample_by_id(
    *,
    session: Session,
    sample_id: int,
    org_id: int,
    project_id: int,
) -> STTSample | None:
    """Fetch a single STT sample scoped to an organization and project.

    Args:
        session: Database session
        sample_id: Sample ID
        org_id: Organization ID
        project_id: Project ID

    Returns:
        STTSample | None: The matching sample, or None when no row matches
        all three identifiers.
    """
    query = (
        select(STTSample)
        .where(STTSample.id == sample_id)
        .where(STTSample.organization_id == org_id)
        .where(STTSample.project_id == project_id)
    )
    return session.exec(query).one_or_none()


def update_stt_sample(
    *,
    session: Session,
    sample_id: int,
    org_id: int,
    project_id: int,
    language_id: int | None = None,
    ground_truth: str | None = None,
) -> STTSample | None:
    """Apply partial updates to an STT sample.

    Only non-None fields are written; ``None`` leaves the stored value
    untouched. The ``updated_at`` timestamp is refreshed on every call.

    Args:
        session: Database session
        sample_id: Sample ID
        org_id: Organization ID
        project_id: Project ID
        language_id: Optional new language ID
        ground_truth: Optional new ground truth transcription

    Returns:
        STTSample | None: The updated sample, or None when no sample
        matches the given scope.
    """
    record = get_stt_sample_by_id(
        session=session,
        sample_id=sample_id,
        org_id=org_id,
        project_id=project_id,
    )
    if not record:
        return None

    if language_id is not None:
        record.language_id = language_id
    if ground_truth is not None:
        record.ground_truth = ground_truth
    record.updated_at = now()

    session.add(record)
    session.flush()

    logger.info(
        f"[update_stt_sample] Sample updated | "
        f"sample_id: {sample_id}, language_id: {language_id}, "
        f"ground_truth_updated: {ground_truth is not None}"
    )

    return record


def get_stt_dataset_by_id(
*,
session: Session,
Expand Down
Loading
Loading