From fdd37e08092048bc067859464573cf534fe1e52b Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Thu, 5 Mar 2026 16:01:12 +0530
Subject: [PATCH 1/4] Set TEXT type on evaluation datasets and filter dataset queries by it

---
 backend/app/crud/evaluations/dataset.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backend/app/crud/evaluations/dataset.py b/backend/app/crud/evaluations/dataset.py
index 1e8c7b306..806238a00 100644
--- a/backend/app/crud/evaluations/dataset.py
+++ b/backend/app/crud/evaluations/dataset.py
@@ -22,6 +22,7 @@
 )
 from app.core.util import now
 from app.models import EvaluationDataset, EvaluationRun
+from app.models.stt_evaluation import EvaluationType
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +61,7 @@ def create_evaluation_dataset(
         dataset = EvaluationDataset(
             name=name,
             description=description,
+            type=EvaluationType.TEXT.value,
             dataset_metadata=dataset_metadata,
             object_store_url=object_store_url,
             langfuse_dataset_id=langfuse_dataset_id,
@@ -122,6 +124,7 @@ def get_dataset_by_id(
         .where(EvaluationDataset.id == dataset_id)
         .where(EvaluationDataset.organization_id == organization_id)
         .where(EvaluationDataset.project_id == project_id)
+        .where(EvaluationDataset.type == EvaluationType.TEXT.value)
     )
 
     dataset = session.exec(statement).first()
@@ -158,6 +161,7 @@ def get_dataset_by_name(
         .where(EvaluationDataset.name == name)
         .where(EvaluationDataset.organization_id == organization_id)
         .where(EvaluationDataset.project_id == project_id)
+        .where(EvaluationDataset.type == EvaluationType.TEXT.value)
     )
 
     dataset = session.exec(statement).first()
@@ -194,6 +198,7 @@ def list_datasets(
         select(EvaluationDataset)
         .where(EvaluationDataset.organization_id == organization_id)
         .where(EvaluationDataset.project_id == project_id)
+        .where(EvaluationDataset.type == EvaluationType.TEXT.value)
         .order_by(EvaluationDataset.inserted_at.desc())
         .limit(limit)
         .offset(offset)

From 35a9e4e2df1bde91e8bd851255fdf68f9b14ea8b Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Thu, 5 Mar 2026 16:07:43 +0530
Subject: [PATCH 2/4] Add tests for TEXT-type handling in evaluation dataset CRUD

---
 .../tests/crud/evaluations/test_dataset.py    | 99 ++++++++++++++++++-
 1 file changed, 98 insertions(+), 1 deletion(-)

diff --git a/backend/app/tests/crud/evaluations/test_dataset.py b/backend/app/tests/crud/evaluations/test_dataset.py
index 42d41b10d..164e8b216 100644
--- a/backend/app/tests/crud/evaluations/test_dataset.py
+++ b/backend/app/tests/crud/evaluations/test_dataset.py
@@ -13,9 +13,10 @@
     update_dataset_langfuse_id,
     upload_csv_to_object_store,
 )
-from app.models import Organization, Project
+from app.models import EvaluationDataset, Organization, Project
 from app.core.util import now
 from app.models import EvaluationRun
+from app.models.stt_evaluation import EvaluationType
 from app.crud.evaluations.dataset import delete_dataset
 
 
@@ -39,6 +40,7 @@ def test_create_evaluation_dataset_minimal(self, db: Session) -> None:
 
         assert dataset.id is not None
         assert dataset.name == "test_dataset"
+        assert dataset.type == EvaluationType.TEXT.value
         assert dataset.dataset_metadata["original_items_count"] == 10
         assert dataset.dataset_metadata["total_items_count"] == 50
         assert dataset.organization_id == org.id
@@ -71,6 +73,7 @@ def test_create_evaluation_dataset_complete(self, db: Session) -> None:
 
         assert dataset.id is not None
         assert dataset.name == "complete_dataset"
+        assert dataset.type == EvaluationType.TEXT.value
         assert dataset.description == "A complete test dataset"
         assert dataset.dataset_metadata["duplication_factor"] == 5
         assert dataset.object_store_url == "s3://bucket/datasets/complete_dataset.csv"
@@ -124,6 +127,35 @@ def test_get_dataset_by_id_not_found(self, db: Session) -> None:
 
         assert fetched is None
 
+    def test_get_dataset_by_id_excludes_non_text_type(self, db: Session) -> None:
+        """get_dataset_by_id returns None once a dataset's type is set to STT."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        dataset = create_evaluation_dataset(
+            session=db,
+            name="stt_type_dataset",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        # create_evaluation_dataset always writes TEXT, so flip the type to STT by hand
+        dataset.type = EvaluationType.STT.value
+        db.add(dataset)
+        db.commit()
+
+        fetched = get_dataset_by_id(
+            session=db,
+            dataset_id=dataset.id,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        assert fetched is None
+
     def test_get_dataset_by_id_wrong_org(self, db: Session) -> None:
         """Test that datasets from other orgs can't be fetched."""
         org = db.exec(select(Organization)).first()
@@ -177,6 +209,35 @@ def test_get_dataset_by_name_success(self, db: Session) -> None:
         assert fetched is not None
         assert fetched.name == "unique_dataset"
 
+    def test_get_dataset_by_name_excludes_non_text_type(self, db: Session) -> None:
+        """get_dataset_by_name returns None once a dataset's type is set to STT."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        dataset = create_evaluation_dataset(
+            session=db,
+            name="stt_dataset_by_name",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        # Retype the stored row to STT so the TEXT filter in the lookup rejects it
+        dataset.type = EvaluationType.STT.value
+        db.add(dataset)
+        db.commit()
+
+        fetched = get_dataset_by_name(
+            session=db,
+            name="stt_dataset_by_name",
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        assert fetched is None
+
     def test_get_dataset_by_name_not_found(self, db: Session) -> None:
         """Test fetching a non-existent dataset by name."""
         org = db.exec(select(Organization)).first()
@@ -210,6 +271,42 @@ def test_list_datasets_empty(self, db: Session) -> None:
 
         assert len(datasets) == 0
 
+    def test_list_datasets_excludes_non_text_type(self, db: Session) -> None:
+        """list_datasets returns the three TEXT datasets and omits the STT one."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        # Three datasets left with the TEXT type that create_evaluation_dataset assigns
+        for i in range(3):
+            create_evaluation_dataset(
+                session=db,
+                name=f"text_dataset_{i}",
+                dataset_metadata={"original_items_count": i},
+                organization_id=org.id,
+                project_id=project.id,
+            )
+
+        # Fourth dataset, retyped to STT after creation; the listing should skip it
+        stt_dataset = create_evaluation_dataset(
+            session=db,
+            name="stt_dataset",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+        stt_dataset.type = EvaluationType.STT.value
+        db.add(stt_dataset)
+        db.commit()
+
+        datasets = list_datasets(
+            session=db, organization_id=org.id, project_id=project.id
+        )
+
+        assert len(datasets) == 3
+        assert all(d.type == EvaluationType.TEXT.value for d in datasets)
+
     def test_list_datasets_multiple(self, db: Session) -> None:
         """Test listing multiple datasets."""
         org = db.exec(select(Organization)).first()

From 9d5d663d3f75074d7b4993e195dda3ace0198ce4 Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Thu, 5 Mar 2026 16:16:23 +0530
Subject: [PATCH 3/4] Set TEXT type on evaluation runs, filter run queries by it, and add tests

---
 backend/app/crud/evaluations/core.py          |   4 +
 .../app/tests/crud/evaluations/test_core.py   | 248 ++++++++++++++++++
 2 files changed, 252 insertions(+)
 create mode 100644 backend/app/tests/crud/evaluations/test_core.py

diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py
index 86f9f0264..79a3c9d3f 100644
--- a/backend/app/crud/evaluations/core.py
+++ b/backend/app/crud/evaluations/core.py
@@ -11,6 +11,7 @@
 from app.crud.evaluations.score import EvaluationScore
 from app.models import EvaluationRun
 from app.models.llm.request import ConfigBlob, LLMCallConfig
+from app.models.stt_evaluation import EvaluationType
 from app.services.llm.jobs import resolve_config_blob
 
 from app.core.db import engine
@@ -80,6 +81,7 @@ def create_evaluation_run(
         run_name=run_name,
         dataset_name=dataset_name,
         dataset_id=dataset_id,
+        type=EvaluationType.TEXT.value,
         config_id=config_id,
         config_version=config_version,
         status="pending",
@@ -129,6 +131,7 @@ def list_evaluation_runs(
         select(EvaluationRun)
         .where(EvaluationRun.organization_id == organization_id)
         .where(EvaluationRun.project_id == project_id)
+        .where(EvaluationRun.type == EvaluationType.TEXT.value)
         .order_by(EvaluationRun.inserted_at.desc())
         .limit(limit)
         .offset(offset)
@@ -167,6 +170,7 @@ def get_evaluation_run_by_id(
         .where(EvaluationRun.id == evaluation_id)
         .where(EvaluationRun.organization_id == organization_id)
         .where(EvaluationRun.project_id == project_id)
+        .where(EvaluationRun.type == EvaluationType.TEXT.value)
     )
 
     eval_run = session.exec(statement).first()
diff --git a/backend/app/tests/crud/evaluations/test_core.py b/backend/app/tests/crud/evaluations/test_core.py
new file mode 100644
index 000000000..e7449c7ee
--- /dev/null
+++ b/backend/app/tests/crud/evaluations/test_core.py
@@ -0,0 +1,248 @@
+from uuid import uuid4
+
+from sqlmodel import Session, select
+
+from app.core.util import now
+from app.crud.evaluations.core import (
+    create_evaluation_run,
+    get_evaluation_run_by_id,
+    list_evaluation_runs,
+)
+from app.crud.evaluations.dataset import create_evaluation_dataset
+from app.models import EvaluationRun, Organization, Project
+from app.models.stt_evaluation import EvaluationType
+
+
+def _create_config(db: Session, project_id: int) -> tuple:
+    """Persist a Config and a version-1 ConfigVersion; return (config.id, version)."""
+    from app.models.config import Config, ConfigVersion
+
+    config = Config(
+        name="test_config",
+        project_id=project_id,
+        inserted_at=now(),
+        updated_at=now(),
+    )
+    db.add(config)
+    db.commit()
+    db.refresh(config)
+
+    config_version = ConfigVersion(
+        config_id=config.id,
+        version=1,
+        blob={"completion": {"params": {"model": "gpt-4o"}}},
+        inserted_at=now(),
+        updated_at=now(),
+    )
+    db.add(config_version)
+    db.commit()
+    db.refresh(config_version)
+
+    return config.id, config_version.version
+
+
+class TestCreateEvaluationRun:
+    """Tests for create_evaluation_run."""
+
+    def test_create_evaluation_run_sets_text_type(self, db: Session) -> None:
+        """create_evaluation_run yields a TEXT-typed run with status "pending"."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        dataset = create_evaluation_dataset(
+            session=db,
+            name="test_dataset_run_type",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        config_id, config_version = _create_config(db, project.id)
+
+        eval_run = create_evaluation_run(
+            session=db,
+            run_name="test_run",
+            dataset_name=dataset.name,
+            dataset_id=dataset.id,
+            config_id=config_id,
+            config_version=config_version,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        assert eval_run.id is not None
+        assert eval_run.type == EvaluationType.TEXT.value
+        assert eval_run.status == "pending"
+        assert eval_run.run_name == "test_run"
+
+
+class TestGetEvaluationRunById:
+    """Tests for get_evaluation_run_by_id."""
+
+    def test_get_evaluation_run_by_id_success(self, db: Session) -> None:
+        """A freshly created run is returned when looked up by its own id."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        dataset = create_evaluation_dataset(
+            session=db,
+            name="test_dataset_get_run",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        config_id, config_version = _create_config(db, project.id)
+
+        eval_run = create_evaluation_run(
+            session=db,
+            run_name="test_get_run",
+            dataset_name=dataset.name,
+            dataset_id=dataset.id,
+            config_id=config_id,
+            config_version=config_version,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        fetched = get_evaluation_run_by_id(
+            session=db,
+            evaluation_id=eval_run.id,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        assert fetched is not None
+        assert fetched.id == eval_run.id
+        assert fetched.run_name == "test_get_run"
+
+    def test_get_evaluation_run_by_id_not_found(self, db: Session) -> None:
+        """Looking up id 99999, which this test never creates, yields None."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        fetched = get_evaluation_run_by_id(
+            session=db,
+            evaluation_id=99999,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        assert fetched is None
+
+    def test_get_evaluation_run_by_id_excludes_non_text_type(self, db: Session) -> None:
+        """get_evaluation_run_by_id returns None once a run's type is set to STT."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        dataset = create_evaluation_dataset(
+            session=db,
+            name="test_dataset_exclude_run",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        config_id, config_version = _create_config(db, project.id)
+
+        eval_run = create_evaluation_run(
+            session=db,
+            run_name="test_stt_run",
+            dataset_name=dataset.name,
+            dataset_id=dataset.id,
+            config_id=config_id,
+            config_version=config_version,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        # create_evaluation_run always stamps TEXT, so flip the type to STT by hand
+        eval_run.type = EvaluationType.STT.value
+        db.add(eval_run)
+        db.commit()
+
+        fetched = get_evaluation_run_by_id(
+            session=db,
+            evaluation_id=eval_run.id,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        assert fetched is None
+
+
+class TestListEvaluationRuns:
+    """Tests for list_evaluation_runs."""
+
+    def test_list_evaluation_runs_empty(self, db: Session) -> None:
+        """Expect no runs; assumes the db fixture has none for this project."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        runs = list_evaluation_runs(
+            session=db, organization_id=org.id, project_id=project.id
+        )
+
+        assert len(runs) == 0
+
+    def test_list_evaluation_runs_excludes_non_text_type(self, db: Session) -> None:
+        """Only the three TEXT runs are listed; the run flipped to STT is left out."""
+        org = db.exec(select(Organization)).first()
+        project = db.exec(
+            select(Project).where(Project.organization_id == org.id)
+        ).first()
+
+        dataset = create_evaluation_dataset(
+            session=db,
+            name="test_dataset_list_runs",
+            dataset_metadata={"original_items_count": 10},
+            organization_id=org.id,
+            project_id=project.id,
+        )
+
+        config_id, config_version = _create_config(db, project.id)
+
+        # Three runs that keep the TEXT type assigned by create_evaluation_run
+        for i in range(3):
+            create_evaluation_run(
+                session=db,
+                run_name=f"text_run_{i}",
+                dataset_name=dataset.name,
+                dataset_id=dataset.id,
+                config_id=config_id,
+                config_version=config_version,
+                organization_id=org.id,
+                project_id=project.id,
+            )
+
+        # Fourth run, retyped to STT after creation so the filter has something to drop
+        stt_run = create_evaluation_run(
+            session=db,
+            run_name="stt_run",
+            dataset_name=dataset.name,
+            dataset_id=dataset.id,
+            config_id=config_id,
+            config_version=config_version,
+            organization_id=org.id,
+            project_id=project.id,
+        )
+        stt_run.type = EvaluationType.STT.value
+        db.add(stt_run)
+        db.commit()
+
+        runs = list_evaluation_runs(
+            session=db, organization_id=org.id, project_id=project.id
+        )
+
+        assert len(runs) == 3
+        assert all(r.type == EvaluationType.TEXT.value for r in runs)

From 176b769c5a8f33f75efd314646008ee400b65853 Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Thu, 5 Mar 2026 16:38:29 +0530
Subject: [PATCH 4/4] Fix test helper to pass config_blob (not blob) to ConfigVersion

---
 backend/app/tests/crud/evaluations/test_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/app/tests/crud/evaluations/test_core.py b/backend/app/tests/crud/evaluations/test_core.py
index e7449c7ee..6d982f75f 100644
--- a/backend/app/tests/crud/evaluations/test_core.py
+++ b/backend/app/tests/crud/evaluations/test_core.py
@@ -30,7 +30,7 @@ def _create_config(db: Session, project_id: int) -> tuple:
     config_version = ConfigVersion(
         config_id=config.id,
         version=1,
-        blob={"completion": {"params": {"model": "gpt-4o"}}},
+        config_blob={"completion": {"params": {"model": "gpt-4o"}}},
         inserted_at=now(),
         updated_at=now(),
     )