From fdd37e08092048bc067859464573cf534fe1e52b Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 5 Mar 2026 16:01:12 +0530 Subject: [PATCH 1/4] added type text for evaluation dataset --- backend/app/crud/evaluations/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/app/crud/evaluations/dataset.py b/backend/app/crud/evaluations/dataset.py index 1e8c7b306..806238a00 100644 --- a/backend/app/crud/evaluations/dataset.py +++ b/backend/app/crud/evaluations/dataset.py @@ -22,6 +22,7 @@ ) from app.core.util import now from app.models import EvaluationDataset, EvaluationRun +from app.models.stt_evaluation import EvaluationType logger = logging.getLogger(__name__) @@ -60,6 +61,7 @@ def create_evaluation_dataset( dataset = EvaluationDataset( name=name, description=description, + type=EvaluationType.TEXT.value, dataset_metadata=dataset_metadata, object_store_url=object_store_url, langfuse_dataset_id=langfuse_dataset_id, @@ -122,6 +124,7 @@ def get_dataset_by_id( .where(EvaluationDataset.id == dataset_id) .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) + .where(EvaluationDataset.type == EvaluationType.TEXT.value) ) dataset = session.exec(statement).first() @@ -158,6 +161,7 @@ def get_dataset_by_name( .where(EvaluationDataset.name == name) .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) + .where(EvaluationDataset.type == EvaluationType.TEXT.value) ) dataset = session.exec(statement).first() @@ -194,6 +198,7 @@ def list_datasets( select(EvaluationDataset) .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) + .where(EvaluationDataset.type == EvaluationType.TEXT.value) .order_by(EvaluationDataset.inserted_at.desc()) .limit(limit) .offset(offset) From 35a9e4e2df1bde91e8bd851255fdf68f9b14ea8b Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 5 Mar 2026 16:07:43 +0530 Subject: [PATCH 2/4] added testcases --- .../tests/crud/evaluations/test_dataset.py | 99 ++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/crud/evaluations/test_dataset.py b/backend/app/tests/crud/evaluations/test_dataset.py index 42d41b10d..164e8b216 100644 --- a/backend/app/tests/crud/evaluations/test_dataset.py +++ b/backend/app/tests/crud/evaluations/test_dataset.py @@ -13,9 +13,10 @@ update_dataset_langfuse_id, upload_csv_to_object_store, ) -from app.models import Organization, Project +from app.models import EvaluationDataset, Organization, Project from app.core.util import now from app.models import EvaluationRun +from app.models.stt_evaluation import EvaluationType from app.crud.evaluations.dataset import delete_dataset @@ -39,6 +40,7 @@ def test_create_evaluation_dataset_minimal(self, db: Session) -> None: assert dataset.id is not None assert dataset.name == "test_dataset" + assert dataset.type == EvaluationType.TEXT.value assert dataset.dataset_metadata["original_items_count"] == 10 assert dataset.dataset_metadata["total_items_count"] == 50 assert dataset.organization_id == org.id @@ -71,6 +73,7 @@ def test_create_evaluation_dataset_complete(self, db: Session) -> None: assert dataset.id is not None assert dataset.name == "complete_dataset" + assert dataset.type == EvaluationType.TEXT.value assert dataset.description == "A complete test dataset" assert dataset.dataset_metadata["duplication_factor"] == 5 assert dataset.object_store_url == "s3://bucket/datasets/complete_dataset.csv" @@ -124,6 +127,35 @@ def test_get_dataset_by_id_not_found(self, db: Session) -> None: assert fetched is None + def test_get_dataset_by_id_excludes_non_text_type(self, db: Session) -> None: + """Test that get_dataset_by_id excludes datasets with non-text type.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="stt_type_dataset", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + # Manually update type to STT to simulate a non-text dataset + dataset.type = EvaluationType.STT.value + db.add(dataset) + db.commit() + + fetched = get_dataset_by_id( + session=db, + dataset_id=dataset.id, + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is None + def test_get_dataset_by_id_wrong_org(self, db: Session) -> None: """Test that datasets from other orgs can't be fetched.""" org = db.exec(select(Organization)).first() @@ -177,6 +209,35 @@ def test_get_dataset_by_name_success(self, db: Session) -> None: assert fetched is not None assert fetched.name == "unique_dataset" + def test_get_dataset_by_name_excludes_non_text_type(self, db: Session) -> None: + """Test that get_dataset_by_name excludes datasets with non-text type.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="stt_dataset_by_name", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + # Manually update type to STT + dataset.type = EvaluationType.STT.value + db.add(dataset) + db.commit() + + fetched = get_dataset_by_name( + session=db, + name="stt_dataset_by_name", + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is None + def test_get_dataset_by_name_not_found(self, db: Session) -> None: """Test fetching a non-existent dataset by name.""" org = db.exec(select(Organization)).first() @@ -210,6 +271,42 @@ def test_list_datasets_empty(self, db: Session) -> None: assert len(datasets) == 0 + def test_list_datasets_excludes_non_text_type(self, db: Session) -> None: + """Test that list_datasets only returns text type datasets.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create text datasets + for i in range(3): + create_evaluation_dataset( + session=db, + name=f"text_dataset_{i}", + dataset_metadata={"original_items_count": i}, + organization_id=org.id, + project_id=project.id, + ) + + # Create a non-text dataset by updating type after creation + stt_dataset = create_evaluation_dataset( + session=db, + name="stt_dataset", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + stt_dataset.type = EvaluationType.STT.value + db.add(stt_dataset) + db.commit() + + datasets = list_datasets( + session=db, organization_id=org.id, project_id=project.id + ) + + assert len(datasets) == 3 + assert all(d.type == EvaluationType.TEXT.value for d in datasets) + def test_list_datasets_multiple(self, db: Session) -> None: """Test listing multiple datasets.""" org = db.exec(select(Organization)).first() From 9d5d663d3f75074d7b4993e195dda3ace0198ce4 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 5 Mar 2026 16:16:23 +0530 Subject: [PATCH 3/4] updated for evaluation run as well --- backend/app/crud/evaluations/core.py | 4 + .../app/tests/crud/evaluations/test_core.py | 248 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 backend/app/tests/crud/evaluations/test_core.py diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 86f9f0264..79a3c9d3f 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -11,6 +11,7 @@ from app.crud.evaluations.score import EvaluationScore from app.models import EvaluationRun from app.models.llm.request import ConfigBlob, LLMCallConfig +from app.models.stt_evaluation import EvaluationType from app.services.llm.jobs import resolve_config_blob from app.core.db import engine @@ -80,6 +81,7 @@ def create_evaluation_run( run_name=run_name, dataset_name=dataset_name, dataset_id=dataset_id, + type=EvaluationType.TEXT.value, config_id=config_id, config_version=config_version, status="pending", @@ -129,6 +131,7 @@ def list_evaluation_runs( select(EvaluationRun) .where(EvaluationRun.organization_id == organization_id) .where(EvaluationRun.project_id == project_id) + .where(EvaluationRun.type == EvaluationType.TEXT.value) .order_by(EvaluationRun.inserted_at.desc()) .limit(limit) .offset(offset) @@ -167,6 +170,7 @@ def get_evaluation_run_by_id( .where(EvaluationRun.id == evaluation_id) .where(EvaluationRun.organization_id == organization_id) .where(EvaluationRun.project_id == project_id) + .where(EvaluationRun.type == EvaluationType.TEXT.value) ) eval_run = session.exec(statement).first() diff --git a/backend/app/tests/crud/evaluations/test_core.py b/backend/app/tests/crud/evaluations/test_core.py new file mode 100644 index 000000000..e7449c7ee --- /dev/null +++ b/backend/app/tests/crud/evaluations/test_core.py @@ -0,0 +1,248 @@ +from uuid import uuid4 + +from sqlmodel import Session, select + +from app.core.util import now +from app.crud.evaluations.core import ( + create_evaluation_run, + get_evaluation_run_by_id, + list_evaluation_runs, +) +from app.crud.evaluations.dataset import create_evaluation_dataset +from app.models import EvaluationRun, Organization, Project +from app.models.stt_evaluation import EvaluationType + + +def _create_config(db: Session, project_id: int) -> tuple: + """Helper to create a config and config_version for evaluation runs.""" + from app.models.config import Config, ConfigVersion + + config = Config( + name="test_config", + project_id=project_id, + inserted_at=now(), + updated_at=now(), + ) + db.add(config) + db.commit() + db.refresh(config) + + config_version = ConfigVersion( + config_id=config.id, + version=1, + blob={"completion": {"params": {"model": "gpt-4o"}}}, + inserted_at=now(), + updated_at=now(), + ) + db.add(config_version) + db.commit() + db.refresh(config_version) + + return config.id, config_version.version + + +class TestCreateEvaluationRun: + """Test creating evaluation runs.""" + + def test_create_evaluation_run_sets_text_type(self, db: Session) -> None: + """Test that create_evaluation_run sets type to TEXT.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="test_dataset_run_type", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + config_id, config_version = _create_config(db, project.id) + + eval_run = create_evaluation_run( + session=db, + run_name="test_run", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config_id, + config_version=config_version, + organization_id=org.id, + project_id=project.id, + ) + + assert eval_run.id is not None + assert eval_run.type == EvaluationType.TEXT.value + assert eval_run.status == "pending" + assert eval_run.run_name == "test_run" + + +class TestGetEvaluationRunById: + """Test fetching evaluation runs by ID.""" + + def test_get_evaluation_run_by_id_success(self, db: Session) -> None: + """Test fetching an existing evaluation run by ID.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="test_dataset_get_run", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + config_id, config_version = _create_config(db, project.id) + + eval_run = create_evaluation_run( + session=db, + run_name="test_get_run", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config_id, + config_version=config_version, + organization_id=org.id, + project_id=project.id, + ) + + fetched = get_evaluation_run_by_id( + session=db, + evaluation_id=eval_run.id, + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is not None + assert fetched.id == eval_run.id + assert fetched.run_name == "test_get_run" + + def test_get_evaluation_run_by_id_not_found(self, db: Session) -> None: + """Test fetching a non-existent evaluation run.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + fetched = get_evaluation_run_by_id( + session=db, + evaluation_id=99999, + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is None + + def test_get_evaluation_run_by_id_excludes_non_text_type(self, db: Session) -> None: + """Test that get_evaluation_run_by_id excludes runs with non-text type.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="test_dataset_exclude_run", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + config_id, config_version = _create_config(db, project.id) + + eval_run = create_evaluation_run( + session=db, + run_name="test_stt_run", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config_id, + config_version=config_version, + organization_id=org.id, + project_id=project.id, + ) + + # Manually update type to STT to simulate a non-text run + eval_run.type = EvaluationType.STT.value + db.add(eval_run) + db.commit() + + fetched = get_evaluation_run_by_id( + session=db, + evaluation_id=eval_run.id, + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is None + + +class TestListEvaluationRuns: + """Test listing evaluation runs.""" + + def test_list_evaluation_runs_empty(self, db: Session) -> None: + """Test listing evaluation runs when none exist.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + runs = list_evaluation_runs( + session=db, organization_id=org.id, project_id=project.id + ) + + assert len(runs) == 0 + + def test_list_evaluation_runs_excludes_non_text_type(self, db: Session) -> None: + """Test that list_evaluation_runs only returns text type runs.""" + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="test_dataset_list_runs", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + config_id, config_version = _create_config(db, project.id) + + # Create text evaluation runs + for i in range(3): + create_evaluation_run( + session=db, + run_name=f"text_run_{i}", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config_id, + config_version=config_version, + organization_id=org.id, + project_id=project.id, + ) + + # Create a non-text evaluation run by updating type after creation + stt_run = create_evaluation_run( + session=db, + run_name="stt_run", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config_id, + config_version=config_version, + organization_id=org.id, + project_id=project.id, + ) + stt_run.type = EvaluationType.STT.value + db.add(stt_run) + db.commit() + + runs = list_evaluation_runs( + session=db, organization_id=org.id, project_id=project.id + ) + + assert len(runs) == 3 + assert all(r.type == EvaluationType.TEXT.value for r in runs) From 176b769c5a8f33f75efd314646008ee400b65853 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 5 Mar 2026 16:38:29 +0530 Subject: [PATCH 4/4] updating testcases --- backend/app/tests/crud/evaluations/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/tests/crud/evaluations/test_core.py b/backend/app/tests/crud/evaluations/test_core.py index e7449c7ee..6d982f75f 100644 --- a/backend/app/tests/crud/evaluations/test_core.py +++ b/backend/app/tests/crud/evaluations/test_core.py @@ -30,7 +30,7 @@ def _create_config(db: Session, project_id: int) -> tuple: config_version = ConfigVersion( config_id=config.id, version=1, - blob={"completion": {"params": {"model": "gpt-4o"}}}, + config_blob={"completion": {"params": {"model": "gpt-4o"}}}, inserted_at=now(), updated_at=now(), )