ProjectTech4DevAI · AkhileshNegi · Mar 9, 2026 · Feb 14, 2026 · Feb 23, 2026 · Feb 24, 2026
diff --git a/.github/issue-formatter.yml b/.github/issue-formatter.yml
diff --git a/backend/app/alembic/versions/049_add_tts_evaluation_tables.py b/backend/app/alembic/versions/049_add_tts_evaluation_tables.py
@@ -0,0 +1,157 @@
+"""add tts evaluation tables
+
+Revision ID: 049
+Revises: 048
+Create Date: 2026-02-14 12:00:00.000000
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "049"
+down_revision = "048"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # Create tts_result table
+    op.create_table(
+        "tts_result",
+        sa.Column(
+            "id",
+            sa.Integer(),
+            nullable=False,
+            comment="Unique identifier for the TTS result",
+        ),
+        sa.Column(
+            "sample_text",
+            sa.Text(),
+            nullable=False,
+            comment="Input text that will be synthesized to speech",
+        ),
+        sa.Column(
+            "object_store_url",
+            sa.String(),
+            nullable=True,
+            comment="S3 URL of the generated WAV audio file",
+        ),
+        sa.Column(
+            "metadata",
+            postgresql.JSONB(astext_type=sa.Text()),
+            nullable=True,
+            comment="Audio metadata: {duration_seconds, size_bytes}",
+        ),
+        sa.Column(
+            "provider",
+            sa.String(length=100),
+            nullable=False,
+            comment="TTS provider used (e.g., gemini-2.5-pro-preview-tts)",
+        ),
+        sa.Column(
+            "status",
+            sa.String(length=20),
+            nullable=False,
+            server_default="PENDING",
+            comment="Result status: PENDING, SUCCESS, FAILED",
+        ),
+        sa.Column(
+            "score",
+            postgresql.JSONB(astext_type=sa.Text()),
+            nullable=True,
+            comment="Extensible evaluation metrics",
+        ),
+        sa.Column(
+            "is_correct",
+            sa.Boolean(),
+            nullable=True,
+            comment="Human feedback flag on audio quality correctness",
+        ),
+        sa.Column(
+            "comment",
+            sa.Text(),
+            nullable=True,
+            comment="Human feedback comment on audio quality",
+        ),
+        sa.Column(
+            "error_message",
+            sa.Text(),
+            nullable=True,
+            comment="Error message if synthesis failed",
+        ),
+        sa.Column(
+            "evaluation_run_id",
+            sa.Integer(),
+            nullable=False,
+            comment="Reference to the evaluation run",
+        ),
+        sa.Column(
+            "organization_id",
+            sa.Integer(),
+            nullable=False,
+            comment="Reference to the organization",
+        ),
+        sa.Column(
+            "project_id",
+            sa.Integer(),
+            nullable=False,
+            comment="Reference to the project",
+        ),
+        sa.Column(
+            "inserted_at",
+            sa.DateTime(),
+            nullable=False,
+            comment="Timestamp when the result was created",
+        ),
+        sa.Column(
+            "updated_at",
+            sa.DateTime(),
+            nullable=False,
+            comment="Timestamp when the result was last updated",
+        ),
-        sa.Column(
-            "inserted_at",
-            sa.DateTime(),
-            nullable=False,
-            comment="Timestamp when the result was created",
-        ),
-        sa.Column(
-            "updated_at",
-            sa.DateTime(),
-            nullable=False,
-            comment="Timestamp when the result was last updated",
-        ),
+sa.Column(
+    "inserted_at",
+    sa.DateTime(),
+    nullable=False,
+    server_default=sa.text("now()"),
+    comment="Timestamp when the result was created",
+),
+sa.Column(
+    "updated_at",
+    sa.DateTime(),
+    nullable=False,
+    server_default=sa.text("now()"),
+    comment="Timestamp when the result was last updated",
+),
-        sa.Column(
-            "inserted_at",
-            sa.DateTime(),
-            nullable=False,
-            comment="Timestamp when the result was created",
-        ),
-        sa.Column(
-            "updated_at",
-            sa.DateTime(),
-            nullable=False,
-            comment="Timestamp when the result was last updated",
-        ),
+sa.Column(
+    "inserted_at",
+    sa.DateTime(),
+    nullable=False,
+    server_default=sa.text("now()"),
+    comment="Timestamp when the result was created",
+),
+sa.Column(
+    "updated_at",
+    sa.DateTime(),
+    nullable=False,
+    server_default=sa.text("now()"),
+    comment="Timestamp when the result was last updated",
+),
+        sa.ForeignKeyConstraint(
+            ["evaluation_run_id"],
+            ["evaluation_run.id"],
+            name="fk_tts_result_run_id",
+            ondelete="CASCADE",
+        ),
+        sa.ForeignKeyConstraint(
+            ["organization_id"],
+            ["organization.id"],
+            ondelete="CASCADE",
+        ),
+        sa.ForeignKeyConstraint(
+            ["project_id"],
+            ["project.id"],
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_index(
+        "ix_tts_result_run_id",
+        "tts_result",
+        ["evaluation_run_id"],
+        unique=False,
+    )
+    op.create_index(
+        "idx_tts_result_feedback",
+        "tts_result",
+        ["evaluation_run_id", "is_correct"],
+        unique=False,
+    )
+    op.create_index(
+        "idx_tts_result_status",
+        "tts_result",
+        ["evaluation_run_id", "status"],
+        unique=False,
+    )
+
+
+def downgrade():
+    op.drop_index("idx_tts_result_status", table_name="tts_result")
+    op.drop_index("idx_tts_result_feedback", table_name="tts_result")
+    op.drop_index("ix_tts_result_run_id", table_name="tts_result")
+    op.drop_table("tts_result")
diff --git a/backend/app/api/docs/tts_evaluation/create_dataset.md b/backend/app/api/docs/tts_evaluation/create_dataset.md
@@ -0,0 +1,9 @@
+Create a new TTS evaluation dataset with text samples.
+
+Required fields:
+- **name**: Dataset name
+- **samples**: List of text samples, each with a **text** field
+
+Optional fields:
+- **description**: Dataset description
+- **language_id**: ID of a language from the global languages table
diff --git a/backend/app/api/docs/tts_evaluation/get_dataset.md b/backend/app/api/docs/tts_evaluation/get_dataset.md
@@ -0,0 +1,3 @@
+Get a TTS evaluation dataset by ID.
+
+Returns the dataset, including its sample count.
diff --git a/backend/app/api/docs/tts_evaluation/get_result.md b/backend/app/api/docs/tts_evaluation/get_result.md
@@ -0,0 +1,3 @@
+Get a single TTS synthesis result by ID.
+
+Returns the result including audio URL, metadata, and human feedback status.
diff --git a/backend/app/api/docs/tts_evaluation/get_run.md b/backend/app/api/docs/tts_evaluation/get_run.md
@@ -0,0 +1,4 @@
+Get a TTS evaluation run by ID with optional results.
+
+Query parameters:
+- `include_results`: Include synthesis results (default: true)
diff --git a/backend/app/api/docs/tts_evaluation/list_datasets.md b/backend/app/api/docs/tts_evaluation/list_datasets.md
@@ -0,0 +1,3 @@
+List all TTS evaluation datasets for the current project.
+
+Supports pagination with `limit` and `offset` parameters.
diff --git a/backend/app/api/docs/tts_evaluation/list_runs.md b/backend/app/api/docs/tts_evaluation/list_runs.md
@@ -0,0 +1,3 @@
+List TTS evaluation runs for the current project.
+
+Supports filtering by `dataset_id` and `status`, with pagination via `limit` and `offset`.
diff --git a/backend/app/api/docs/tts_evaluation/start_evaluation.md b/backend/app/api/docs/tts_evaluation/start_evaluation.md
@@ -0,0 +1,15 @@
+Start a TTS evaluation run on a dataset.
+
+Required fields:
+- **run_name**: Name for this evaluation run
+- **dataset_id**: ID of the TTS dataset to evaluate
+
+Optional fields:
+- **models**: List of TTS models to use (default: `["gemini-2.5-pro-preview-tts"]`)
+
+The evaluation will:
+1. Process each text sample through the specified TTS models
+2. Generate speech audio using the Gemini Batch API
+3. Store WAV audio files in S3 for human review
+
+**Supported models:** `gemini-2.5-pro-preview-tts`
diff --git a/backend/app/api/docs/tts_evaluation/update_feedback.md b/backend/app/api/docs/tts_evaluation/update_feedback.md
@@ -0,0 +1,5 @@
+Update human feedback on a TTS synthesis result.
+
+Fields:
+- **is_correct**: Whether the synthesized audio quality is acceptable (null to clear)
+- **comment**: Optional feedback comment
diff --git a/backend/app/api/routes/evaluations/__init__.py b/backend/app/api/routes/evaluations/__init__.py
@@ -4,9 +4,11 @@
 
 from app.api.routes.evaluations import dataset, evaluation
 from app.api.routes.stt_evaluations.router import router as stt_router
+from app.api.routes.tts_evaluations.router import router as tts_router
 
 router = APIRouter()
 
 router.include_router(dataset.router)
 router.include_router(stt_router)
+router.include_router(tts_router)
 router.include_router(evaluation.router)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		Get a TTS evaluation dataset by ID.

		Returns the dataset, including its sample count.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		Get a single TTS synthesis result by ID.

		Returns the result including audio URL, metadata, and human feedback status.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		List all TTS evaluation datasets for the current project.

		Supports pagination with `limit` and `offset` parameters.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		List TTS evaluation runs for the current project.

		Supports filtering by `dataset_id` and `status`, with pagination via `limit` and `offset`.