diff --git a/backend/app/alembic/versions/041_add_config_in_evals_run_table.py b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py new file mode 100644 index 000000000..1cb164942 --- /dev/null +++ b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py @@ -0,0 +1,59 @@ +"""add config in evals run table + +Revision ID: 041 +Revises: 040 +Create Date: 2025-12-15 14:03:22.082746 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "041" +down_revision = "040" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "evaluation_run", + sa.Column( + "config_id", + sa.Uuid(), + nullable=True, + comment="Reference to the stored config used", + ), + ) + op.add_column( + "evaluation_run", + sa.Column( + "config_version", + sa.Integer(), + nullable=True, + comment="Version of the config used", + ), + ) + op.create_foreign_key( + "fk_evaluation_run_config_id", "evaluation_run", "config", ["config_id"], ["id"] + ) + op.drop_column("evaluation_run", "config") + + +def downgrade(): + op.add_column( + "evaluation_run", + sa.Column( + "config", + postgresql.JSONB(astext_type=sa.Text()), + autoincrement=False, + nullable=True,  # relaxed from False: rows created after 041 have no config payload, so NOT NULL without a server_default would fail + comment="Evaluation configuration (model, instructions, etc.)", + ), + ) + op.drop_constraint( + "fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey" + ) + op.drop_column("evaluation_run", "config_version") + op.drop_column("evaluation_run", "config_id") diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py index d66ff71ce..1ce42742a 100644 --- a/backend/app/api/routes/evaluations/dataset.py +++ b/backend/app/api/routes/evaluations/dataset.py @@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse: @router.post( - "/", + "", description=load_description("evaluation/upload_dataset.md"), 
response_model=APIResponse[DatasetUploadResponse], dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))], @@ -87,7 +87,7 @@ async def upload_dataset( @router.get( - "/", + "", description=load_description("evaluation/list_datasets.md"), response_model=APIResponse[list[DatasetUploadResponse]], dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))], diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index d40a88a1a..fe09edcec 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -1,6 +1,7 @@ """Evaluation run API routes.""" import logging +from uuid import UUID from fastapi import ( APIRouter, @@ -29,7 +30,7 @@ @router.post( - "/", + "", description=load_description("evaluation/create_evaluation.md"), response_model=APIResponse[EvaluationRunPublic], dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))], @@ -41,19 +42,16 @@ def evaluate( experiment_name: str = Body( ..., description="Name for this evaluation experiment/run" ), - config: dict = Body(default_factory=dict, description="Evaluation configuration"), - assistant_id: str - | None = Body( - None, description="Optional assistant ID to fetch configuration from" - ), + config_id: UUID = Body(..., description="Stored config ID"), + config_version: int = Body(..., ge=1, description="Stored config version"), ) -> APIResponse[EvaluationRunPublic]: """Start an evaluation run.""" eval_run = start_evaluation( session=_session, dataset_id=dataset_id, experiment_name=experiment_name, - config=config, - assistant_id=assistant_id, + config_id=config_id, + config_version=config_version, organization_id=auth_context.organization_.id, project_id=auth_context.project_.id, ) @@ -68,7 +66,7 @@ def evaluate( @router.get( - "/", + "", description=load_description("evaluation/list_evaluations.md"), response_model=APIResponse[list[EvaluationRunPublic]], 
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))], diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index e667dcbb7..a5824c0a2 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -5,6 +5,8 @@ create_evaluation_run, get_evaluation_run_by_id, list_evaluation_runs, + resolve_evaluation_config, + resolve_model_from_config, save_score, ) from app.crud.evaluations.cron import ( @@ -43,3 +45,45 @@ TraceData, TraceScore, ) + +__all__ = [ + # Core + "create_evaluation_run", + "get_evaluation_run_by_id", + "list_evaluation_runs", + "resolve_evaluation_config", + "resolve_model_from_config", + "save_score", + # Cron + "process_all_pending_evaluations", + "process_all_pending_evaluations_sync", + # Dataset + "create_evaluation_dataset", + "delete_dataset", + "get_dataset_by_id", + "list_datasets", + "upload_csv_to_object_store", + # Batch + "start_evaluation_batch", + # Processing + "check_and_process_evaluation", + "poll_all_pending_evaluations", + "process_completed_embedding_batch", + "process_completed_evaluation", + # Embeddings + "calculate_average_similarity", + "calculate_cosine_similarity", + "start_embedding_batch", + # Langfuse + "create_langfuse_dataset_run", + "fetch_trace_scores_from_langfuse", + "update_traces_with_cosine_scores", + "upload_dataset_to_langfuse", + # Score types + "CategoricalSummaryScore", + "EvaluationScore", + "NumericSummaryScore", + "SummaryScore", + "TraceData", + "TraceScore", +] diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index e880d7d0c..db8349ebc 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -16,6 +16,7 @@ from app.core.batch import OpenAIBatchProvider, start_batch_job from app.models import EvaluationRun +from app.models.llm.request import KaapiLLMParams logger = logging.getLogger(__name__) @@ -59,7 +60,7 @@ def 
fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, def build_evaluation_jsonl( - dataset_items: list[dict[str, Any]], config: dict[str, Any] + dataset_items: list[dict[str, Any]], config: KaapiLLMParams ) -> list[dict[str, Any]]: """ Build JSONL data for evaluation batch using OpenAI Responses API. @@ -88,7 +89,6 @@ def build_evaluation_jsonl( List of dictionaries (JSONL data) """ jsonl_data = [] - for item in dataset_items: # Extract question from input question = item["input"].get("question", "") @@ -100,14 +100,34 @@ def build_evaluation_jsonl( # Build the batch request object for Responses API # Use config as-is and only add the input field + body: dict[str, Any] = { + "model": config.model, + "instructions": config.instructions, + "temperature": config.temperature + if config.temperature is not None + else 0.01, + "input": question, # Add input from dataset + } + + # Add reasoning only if provided + if config.reasoning: + body["reasoning"] = {"effort": config.reasoning} + + # Add tools only if knowledge_base_ids are provided + if config.knowledge_base_ids: + body["tools"] = [ + { + "type": "file_search", + "vector_store_ids": config.knowledge_base_ids, + "max_num_results": config.max_num_results or 20, + } + ] + batch_request = { "custom_id": item["id"], "method": "POST", "url": "/v1/responses", - "body": { - **config, # Use config as-is - "input": question, # Add input from dataset - }, + "body": body, } jsonl_data.append(batch_request) @@ -119,7 +139,7 @@ def start_evaluation_batch( openai_client: OpenAI, session: Session, eval_run: EvaluationRun, - config: dict[str, Any], + config: KaapiLLMParams, ) -> EvaluationRun: """ Fetch data, build JSONL, and start evaluation batch. 
@@ -132,7 +152,7 @@ def start_evaluation_batch( openai_client: Configured OpenAI client session: Database session eval_run: EvaluationRun database object (with run_name, dataset_name, config) - config: Evaluation configuration dict with llm, instructions, vector_store_ids + config: KaapiLLMParams with model, instructions, knowledge_base_ids, etc. Returns: Updated EvaluationRun with batch_job_id populated @@ -166,7 +186,7 @@ def start_evaluation_batch( "description": f"Evaluation: {eval_run.run_name}", "completion_window": "24h", # Store complete config for reference - "evaluation_config": config, + "evaluation_config": config.model_dump(exclude_none=True), } # Step 5: Start batch job using generic infrastructure diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 33b6777f3..6cca562f2 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -1,22 +1,57 @@ import logging +from uuid import UUID from langfuse import Langfuse from sqlmodel import Session, select from app.core.util import now +from app.crud.config.version import ConfigVersionCrud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.crud.evaluations.score import EvaluationScore from app.models import EvaluationRun +from app.models.llm.request import ConfigBlob, LLMCallConfig +from app.services.llm.jobs import resolve_config_blob logger = logging.getLogger(__name__) +def resolve_evaluation_config( + session: Session, + config_id: UUID, + config_version: int, + project_id: int, +) -> tuple[ConfigBlob | None, str | None]: + """ + Resolve config blob from stored config management. 
+ + Args: + session: Database session + config_id: UUID of the stored config + config_version: Version number of the config + project_id: Project ID for access control + + Returns: + Tuple of (ConfigBlob or None, error_message or None) + """ + config_version_crud = ConfigVersionCrud( + session=session, + config_id=config_id, + project_id=project_id, + ) + + return resolve_config_blob( + config_crud=config_version_crud, + config=LLMCallConfig(id=config_id, version=config_version), + ) + + def create_evaluation_run( session: Session, run_name: str, dataset_name: str, dataset_id: int, - config: dict, + config_id: UUID, + config_version: int, organization_id: int, project_id: int, ) -> EvaluationRun: @@ -28,7 +63,8 @@ def create_evaluation_run( run_name: Name of the evaluation run/experiment dataset_name: Name of the dataset being used dataset_id: ID of the dataset - config: Configuration dict for the evaluation + config_id: UUID of the stored config + config_version: Version number of the config organization_id: Organization ID project_id: Project ID @@ -39,7 +75,8 @@ def create_evaluation_run( run_name=run_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config, + config_id=config_id, + config_version=config_version, status="pending", organization_id=organization_id, project_id=project_id, @@ -56,8 +93,10 @@ def create_evaluation_run( logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True) raise - logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}") - + logger.info( + f"[create_evaluation_run] Created EvaluationRun record: id={eval_run.id}, " + f"run_name={run_name}, config_id={config_id}, config_version={config_version}" + ) return eval_run @@ -311,3 +350,43 @@ def save_score( f"traces={len(score.get('traces', []))}" ) return eval_run + + +def resolve_model_from_config( + session: Session, + eval_run: EvaluationRun, +) -> str: + """ + Resolve the model name from the evaluation run's config. 
+ + Args: + session: Database session + eval_run: EvaluationRun instance + + Returns: + Model name from config + + Raises: + ValueError: If config is missing, invalid, or has no model + """ + if not eval_run.config_id or not eval_run.config_version: + raise ValueError( + f"Evaluation run {eval_run.id} has no config reference " + f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})" + ) + + config, error = resolve_evaluation_config( + session=session, + config_id=eval_run.config_id, + config_version=eval_run.config_version, + project_id=eval_run.project_id, + ) + + if error or config is None: + raise ValueError( + f"Config resolution failed for evaluation {eval_run.id} " + f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}" + ) + + model = config.completion.params.model + return model diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py index 17ead39ab..bcf8b160b 100644 --- a/backend/app/crud/evaluations/embeddings.py +++ b/backend/app/crud/evaluations/embeddings.py @@ -21,6 +21,9 @@ logger = logging.getLogger(__name__) +# Default embedding model +EMBEDDING_MODEL = "text-embedding-3-large" + # Valid embedding models with their dimensions VALID_EMBEDDING_MODELS = { "text-embedding-3-small": 1536, @@ -49,7 +52,7 @@ def validate_embedding_model(model: str) -> None: def build_embedding_jsonl( results: list[dict[str, Any]], trace_id_mapping: dict[str, str], - embedding_model: str = "text-embedding-3-large", + embedding_model: str = EMBEDDING_MODEL, ) -> list[dict[str, Any]]: """ Build JSONL data for embedding batch using OpenAI Embeddings API. 
@@ -362,20 +365,8 @@ def start_embedding_batch( try: logger.info(f"Starting embedding batch for evaluation run {eval_run.id}") - # Get embedding model from config (default: text-embedding-3-large) - embedding_model = eval_run.config.get( - "embedding_model", "text-embedding-3-large" - ) - - # Validate and fallback to default if invalid - try: - validate_embedding_model(embedding_model) - except ValueError as e: - logger.warning( - f"Invalid embedding model '{embedding_model}' in config: {e}. " - f"Falling back to text-embedding-3-large" - ) - embedding_model = "text-embedding-3-large" + # Use default embedding model + embedding_model = EMBEDDING_MODEL # Step 1: Build embedding JSONL with trace_ids jsonl_data = build_embedding_jsonl( diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 076ac9f32..2c7f8648b 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -26,7 +26,7 @@ upload_batch_results_to_object_store, ) from app.crud.evaluations.batch import fetch_dataset_items -from app.crud.evaluations.core import update_evaluation_run +from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config from app.crud.evaluations.embeddings import ( calculate_average_similarity, parse_embedding_results, @@ -254,16 +254,16 @@ async def process_completed_evaluation( if not results: raise ValueError("No valid results found in batch output") - # Extract model from config for cost tracking - model = eval_run.config.get("model") if eval_run.config else None - # Step 5: Create Langfuse dataset run with traces + # Use model stored at creation time for cost tracking + model = resolve_model_from_config(session=session, eval_run=eval_run) + trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, + model=model, run_name=eval_run.run_name, results=results, - model=model, ) # Store object store URL in database diff 
--git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index f99fbb27e..6ae4542fb 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Optional +from uuid import UUID from pydantic import BaseModel, Field from sqlalchemy import Column, Index, Text, UniqueConstraint @@ -193,15 +194,17 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Name of the Langfuse dataset used"}, ) - # Config field - dict requires sa_column - config: dict[str, Any] = SQLField( - default_factory=dict, - sa_column=Column( - JSONB, - nullable=False, - comment="Evaluation configuration (model, instructions, etc.)", - ), - description="Evaluation configuration", + config_id: UUID | None = SQLField( + foreign_key="config.id", + nullable=True, + description="Reference to the stored config used for this evaluation", + sa_column_kwargs={"comment": "Reference to the stored config used"}, + ) + config_version: int | None = SQLField( + nullable=True, + ge=1, + description="Version of the config used for this evaluation", + sa_column_kwargs={"comment": "Version of the config used"}, ) # Dataset reference @@ -339,7 +342,8 @@ class EvaluationRunPublic(SQLModel): id: int run_name: str dataset_name: str - config: dict[str, Any] + config_id: UUID | None + config_version: int | None dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None diff --git a/backend/app/services/evaluations/__init__.py b/backend/app/services/evaluations/__init__.py index 62201b426..92d88fe0b 100644 --- a/backend/app/services/evaluations/__init__.py +++ b/backend/app/services/evaluations/__init__.py @@ -2,7 +2,6 @@ from app.services.evaluations.dataset import upload_dataset from app.services.evaluations.evaluation import ( - build_evaluation_config, get_evaluation_with_scores, start_evaluation, ) diff --git a/backend/app/services/evaluations/evaluation.py 
b/backend/app/services/evaluations/evaluation.py index 4c1a5de74..785eb02af 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -1,108 +1,33 @@ """Evaluation run orchestration service.""" import logging +from uuid import UUID from fastapi import HTTPException from sqlmodel import Session -from app.crud.assistants import get_assistant_by_id from app.crud.evaluations import ( create_evaluation_run, fetch_trace_scores_from_langfuse, get_dataset_by_id, get_evaluation_run_by_id, + resolve_evaluation_config, save_score, start_evaluation_batch, ) from app.models.evaluation import EvaluationRun +from app.services.llm.providers import LLMProvider from app.utils import get_langfuse_client, get_openai_client logger = logging.getLogger(__name__) -def build_evaluation_config( - session: Session, - config: dict, - assistant_id: str | None, - project_id: int, -) -> dict: - """ - Build evaluation configuration from assistant or provided config. - - If assistant_id is provided, fetch assistant and merge with config. - Config values take precedence over assistant values. - - Args: - session: Database session - config: Provided configuration dict - assistant_id: Optional assistant ID to fetch configuration from - project_id: Project ID for assistant lookup - - Returns: - Complete evaluation configuration dict - - Raises: - HTTPException: If assistant not found or model missing - """ - if assistant_id: - assistant = get_assistant_by_id( - session=session, - assistant_id=assistant_id, - project_id=project_id, - ) - - if not assistant: - raise HTTPException( - status_code=404, detail=f"Assistant {assistant_id} not found" - ) - - logger.info( - f"[build_evaluation_config] Found assistant in DB | id={assistant.id} | " - f"model={assistant.model} | instructions=" - f"{assistant.instructions[:50] if assistant.instructions else 'None'}..." 
- ) - - # Build config from assistant (use provided config values to override if present) - merged_config = { - "model": config.get("model", assistant.model), - "instructions": config.get("instructions", assistant.instructions), - "temperature": config.get("temperature", assistant.temperature), - } - - # Add tools if vector stores are available - vector_store_ids = config.get( - "vector_store_ids", assistant.vector_store_ids or [] - ) - if vector_store_ids and len(vector_store_ids) > 0: - merged_config["tools"] = [ - { - "type": "file_search", - "vector_store_ids": vector_store_ids, - } - ] - - logger.info("[build_evaluation_config] Using config from assistant") - return merged_config - - # Using provided config directly - logger.info("[build_evaluation_config] Using provided config directly") - - # Validate that config has minimum required fields - if not config.get("model"): - raise HTTPException( - status_code=400, - detail="Config must include 'model' when assistant_id is not provided", - ) - - return config - - def start_evaluation( session: Session, dataset_id: int, experiment_name: str, - config: dict, - assistant_id: str | None, + config_id: UUID, + config_version: int, organization_id: int, project_id: int, ) -> EvaluationRun: @@ -111,7 +36,7 @@ def start_evaluation( Steps: 1. Validate dataset exists and has Langfuse ID - 2. Build config (from assistant or direct) + 2. Resolve config from stored config management 3. Create evaluation run record 4. 
Start batch processing @@ -119,8 +44,8 @@ session: Database session dataset_id: ID of the evaluation dataset experiment_name: Name for this evaluation experiment/run - config: Evaluation configuration - assistant_id: Optional assistant ID to fetch configuration from + config_id: UUID of the stored config + config_version: Version number of the config organization_id: Organization ID project_id: Project ID @@ -128,16 +53,17 @@ EvaluationRun instance Raises: - HTTPException: If dataset not found or evaluation fails to start + HTTPException: If dataset not found, config invalid, or evaluation fails to start """ logger.info( f"[start_evaluation] Starting evaluation | experiment_name={experiment_name} | " f"dataset_id={dataset_id} | " f"org_id={organization_id} | " - f"assistant_id={assistant_id} | " - f"config_keys={list(config.keys())}" + f"config_id={config_id} | " + f"config_version={config_version}" ) + # Step 1: Fetch dataset from database dataset = get_dataset_by_id( session=session, dataset_id=dataset_id, @@ -165,13 +91,29 @@ "Please ensure Langfuse credentials were configured when the dataset was created.", ) - eval_config = build_evaluation_config( + # Step 2: Resolve config from stored config management + config, error = resolve_evaluation_config( session=session, - config=config, - assistant_id=assistant_id, + config_id=config_id, + config_version=config_version, project_id=project_id, ) + if error or config is None: + raise HTTPException( + status_code=400, + detail=f"Failed to resolve config from stored config: {error}", + ) + elif config.completion.provider != LLMProvider.OPENAI: + raise HTTPException( + status_code=422, + detail="Only 'openai' provider is supported for evaluation configs", + ) + + logger.info( + "[start_evaluation] Successfully resolved config from config management" + ) + # Get API clients openai_client = get_openai_client( session=session, org_id=organization_id, @@ -183,23 +125,26 @@ def 
start_evaluation( project_id=project_id, ) + # Step 3: Create EvaluationRun record with config references eval_run = create_evaluation_run( session=session, run_name=experiment_name, dataset_name=dataset.name, dataset_id=dataset_id, - config=eval_config, + config_id=config_id, + config_version=config_version, organization_id=organization_id, project_id=project_id, ) + # Step 4: Start the batch evaluation try: eval_run = start_evaluation_batch( langfuse=langfuse, openai_client=openai_client, session=session, eval_run=eval_run, - config=eval_config, + config=config.completion.params, ) logger.info( diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 773fe7fc3..f4700b51b 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -5,18 +5,17 @@ from fastapi import HTTPException from sqlmodel import Session +from app.celery.utils import start_high_priority_job from app.core.db import engine +from app.core.langfuse.langfuse import observe_llm_execution from app.crud.config import ConfigVersionCrud from app.crud.credentials import get_provider_credential from app.crud.jobs import JobCrud from app.models import JobStatus, JobType, JobUpdate, LLMCallRequest from app.models.llm.request import ConfigBlob, LLMCallConfig, KaapiCompletionConfig -from app.utils import APIResponse, send_callback -from app.celery.utils import start_high_priority_job -from app.core.langfuse.langfuse import observe_llm_execution from app.services.llm.providers.registry import get_llm_provider from app.services.llm.mappers import transform_kaapi_config_to_native - +from app.utils import APIResponse, send_callback logger = logging.getLogger(__name__) diff --git a/backend/app/services/llm/providers/registry.py b/backend/app/services/llm/providers/registry.py index a5cfb4bb8..f5d17971f 100644 --- a/backend/app/services/llm/providers/registry.py +++ b/backend/app/services/llm/providers/registry.py @@ -13,12 +13,14 @@ class LLMProvider: 
OPENAI_NATIVE = "openai-native" + OPENAI = "openai" # Future constants for native providers: # CLAUDE_NATIVE = "claude-native" # GEMINI_NATIVE = "gemini-native" _registry: dict[str, type[BaseProvider]] = { OPENAI_NATIVE: OpenAIProvider, + OPENAI: OpenAIProvider, # Future native providers: # CLAUDE_NATIVE: ClaudeProvider, # GEMINI_NATIVE: GeminiProvider, diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 813fd483f..ac1ac0c0e 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1,6 +1,7 @@ import io from typing import Any from unittest.mock import Mock, patch +from uuid import uuid4 import pytest from fastapi.testclient import TestClient @@ -8,8 +9,9 @@ from app.crud.evaluations.batch import build_evaluation_jsonl from app.models import EvaluationDataset, EvaluationRun +from app.models.llm.request import KaapiLLMParams from app.tests.utils.auth import TestAuthContext -from app.tests.utils.test_data import create_test_evaluation_dataset +from app.tests.utils.test_data import create_test_config, create_test_evaluation_dataset # Helper function to create CSV file-like object @@ -524,15 +526,21 @@ def test_start_batch_evaluation_invalid_dataset_id( self, client: TestClient, user_api_key_header: dict[str, str], - sample_evaluation_config: dict[str, Any], + db: Session, + user_api_key: TestAuthContext, ) -> None: - """Test batch evaluation fails with invalid/non-existent dataset_id.""" + """Test batch evaluation fails with invalid dataset_id.""" + # Create a valid config to use + config = create_test_config(db, project_id=user_api_key.project_id) + + # Try to start evaluation with non-existent dataset_id response = client.post( "/api/v1/evaluations/", json={ "experiment_name": "test_evaluation_run", - "dataset_id": 99999, - "config": sample_evaluation_config, + "dataset_id": 99999, # Non-existent + "config_id": str(config.id), + 
"config_version": 1, }, headers=user_api_key_header, ) @@ -544,35 +552,30 @@ def test_start_batch_evaluation_invalid_dataset_id( ) assert "not found" in error_str.lower() or "not accessible" in error_str.lower() - def test_start_batch_evaluation_missing_model( + def test_start_batch_evaluation_invalid_config_id( self, client: TestClient, user_api_key_header: dict[str, str] ) -> None: - """Test batch evaluation fails when model is missing from config.""" - # We don't need a real dataset for this test - the validation should happen - # before dataset lookup. Use any dataset_id and expect config validation error - invalid_config = { - "instructions": "You are a helpful assistant", - "temperature": 0.5, - } - + """Test batch evaluation fails with invalid config_id.""" + # Test with a non-existent config_id (random UUID) response = client.post( "/api/v1/evaluations/", json={ - "experiment_name": "test_no_model", - "dataset_id": 1, # Dummy ID, error should come before this is checked - "config": invalid_config, + "experiment_name": "test_no_config", + "dataset_id": 1, # Dummy ID, config validation happens first + "config_id": str(uuid4()), # Non-existent config + "config_version": 1, }, headers=user_api_key_header, ) - # Should fail with either 400 (model missing) or 404 (dataset not found) + # Should fail with either 400 (config not found) or 404 (dataset/config not found) assert response.status_code in [400, 404] response_data = response.json() error_str = response_data.get( "detail", response_data.get("message", str(response_data)) ) - # Should fail with either "model" missing or "dataset not found" (both acceptable) - assert "model" in error_str.lower() or "not found" in error_str.lower() + # Should mention config or not found + assert "config" in error_str.lower() or "not found" in error_str.lower() def test_start_batch_evaluation_without_authentication( self, client, sample_evaluation_config @@ -604,11 +607,11 @@ def test_build_batch_jsonl_basic(self) -> None: } ] 
- config = { - "model": "gpt-4o", - "temperature": 0.2, - "instructions": "You are a helpful assistant", - } + config = KaapiLLMParams( + model="gpt-4o", + temperature=0.2, + instructions="You are a helpful assistant", + ) jsonl_data = build_evaluation_jsonl(dataset_items, config) @@ -635,16 +638,11 @@ def test_build_batch_jsonl_with_tools(self) -> None: } ] - config = { - "model": "gpt-4o-mini", - "instructions": "Search documents", - "tools": [ - { - "type": "file_search", - "vector_store_ids": ["vs_abc123"], - } - ], - } + config = KaapiLLMParams( + model="gpt-4o-mini", + instructions="Search documents", + knowledge_base_ids=["vs_abc123"], + ) jsonl_data = build_evaluation_jsonl(dataset_items, config) @@ -664,7 +662,7 @@ def test_build_batch_jsonl_minimal_config(self) -> None: } ] - config = {"model": "gpt-4o"} # Only model provided + config = KaapiLLMParams(model="gpt-4o") # Only model provided jsonl_data = build_evaluation_jsonl(dataset_items, config) @@ -696,7 +694,7 @@ def test_build_batch_jsonl_skips_empty_questions(self) -> None: }, ] - config = {"model": "gpt-4o", "instructions": "Test"} + config = KaapiLLMParams(model="gpt-4o", instructions="Test") jsonl_data = build_evaluation_jsonl(dataset_items, config) @@ -716,10 +714,10 @@ def test_build_batch_jsonl_multiple_items(self) -> None: for i in range(5) ] - config = { - "model": "gpt-4o", - "instructions": "Answer questions", - } + config = KaapiLLMParams( + model="gpt-4o", + instructions="Answer questions", + ) jsonl_data = build_evaluation_jsonl(dataset_items, config) @@ -758,11 +756,15 @@ def test_get_evaluation_run_trace_info_not_completed( create_test_dataset: EvaluationDataset, ) -> None: """Test requesting trace info for incomplete evaluation returns error.""" + # Create a config for the evaluation run + config = create_test_config(db, project_id=user_api_key.project_id) + eval_run = EvaluationRun( run_name="test_pending_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, 
- config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, status="pending", total_items=3, organization_id=user_api_key.organization_id, @@ -794,11 +796,15 @@ def test_get_evaluation_run_trace_info_completed( create_test_dataset: EvaluationDataset, ) -> None: """Test requesting trace info for completed evaluation returns cached scores.""" + # Create a config for the evaluation run + config = create_test_config(db, project_id=user_api_key.project_id) + eval_run = EvaluationRun( run_name="test_completed_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, status="completed", total_items=3, score={ @@ -853,11 +859,13 @@ def test_get_evaluation_run_without_trace_info( create_test_dataset: EvaluationDataset, ) -> None: """Test getting evaluation run without requesting trace info.""" + config = create_test_config(db, project_id=user_api_key.project_id) eval_run = EvaluationRun( run_name="test_simple_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, status="completed", total_items=3, organization_id=user_api_key.organization_id, @@ -888,11 +896,13 @@ def test_get_evaluation_run_resync_without_trace_info_fails( create_test_dataset: EvaluationDataset, ) -> None: """Test that resync_score=true requires get_trace_info=true.""" + config = create_test_config(db, project_id=user_api_key.project_id) eval_run = EvaluationRun( run_name="test_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, status="completed", total_items=3, organization_id=user_api_key.organization_id, diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index afb0ac0ed..9e16d44de 100644 --- 
a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -13,7 +13,7 @@ poll_all_pending_evaluations, ) from app.models import BatchJob, Organization, Project, EvaluationDataset, EvaluationRun -from app.tests.utils.test_data import create_test_evaluation_dataset +from app.tests.utils.test_data import create_test_config, create_test_evaluation_dataset from app.crud.evaluations.core import create_evaluation_run from app.core.util import now @@ -259,6 +259,11 @@ def test_dataset(self, db: Session) -> EvaluationDataset: @pytest.fixture def eval_run_with_batch(self, db: Session, test_dataset) -> EvaluationRun: """Create evaluation run with batch job.""" + # Create a config for the evaluation run + config = create_test_config( + db, project_id=test_dataset.project_id, use_kaapi_schema=True + ) + # Create batch job batch_job = BatchJob( provider="openai", @@ -281,7 +286,8 @@ def eval_run_with_batch(self, db: Session, test_dataset) -> EvaluationRun: run_name="test_run", dataset_name=test_dataset.name, dataset_id=test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, organization_id=test_dataset.organization_id, project_id=test_dataset.project_id, ) @@ -397,12 +403,18 @@ async def test_process_completed_evaluation_no_batch_job_id( self, db: Session, test_dataset ): """Test processing without batch_job_id.""" + # Create a config for the evaluation run + config = create_test_config( + db, project_id=test_dataset.project_id, use_kaapi_schema=True + ) + eval_run = create_evaluation_run( session=db, run_name="test_run", dataset_name=test_dataset.name, dataset_id=test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, organization_id=test_dataset.organization_id, project_id=test_dataset.project_id, ) @@ -446,6 +458,11 @@ def test_dataset(self, db: Session) -> EvaluationDataset: @pytest.fixture def eval_run_with_embedding_batch(self, db: Session,
test_dataset) -> EvaluationRun: """Create evaluation run with embedding batch job.""" + # Create a config for the evaluation run + config = create_test_config( + db, project_id=test_dataset.project_id, use_kaapi_schema=True + ) + # Create embedding batch job embedding_batch = BatchJob( provider="openai", @@ -469,7 +486,8 @@ def eval_run_with_embedding_batch(self, db: Session, test_dataset) -> Evaluation run_name="test_run_embedding", dataset_name=test_dataset.name, dataset_id=test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, organization_id=test_dataset.organization_id, project_id=test_dataset.project_id, ) @@ -598,6 +616,11 @@ async def test_check_and_process_evaluation_completed( test_dataset, ): """Test checking evaluation with completed batch.""" + # Create a config for the evaluation run + config = create_test_config( + db, project_id=test_dataset.project_id, use_kaapi_schema=True + ) + # Create batch job batch_job = BatchJob( provider="openai", @@ -621,7 +644,8 @@ async def test_check_and_process_evaluation_completed( run_name="test_run", dataset_name=test_dataset.name, dataset_id=test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, organization_id=test_dataset.organization_id, project_id=test_dataset.project_id, ) @@ -659,6 +683,11 @@ async def test_check_and_process_evaluation_failed( test_dataset, ): """Test checking evaluation with failed batch.""" + # Create a config for the evaluation run + config = create_test_config( + db, project_id=test_dataset.project_id, use_kaapi_schema=True + ) + # Create failed batch job batch_job = BatchJob( provider="openai", @@ -683,7 +712,8 @@ async def test_check_and_process_evaluation_failed( run_name="test_run_fail", dataset_name=test_dataset.name, dataset_id=test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, organization_id=test_dataset.organization_id, project_id=test_dataset.project_id, ) @@ -759,6 
+789,11 @@ async def test_poll_all_pending_evaluations_with_runs( test_dataset, ): """Test polling with pending evaluations.""" + # Create a config for the evaluation run + config = create_test_config( + db, project_id=test_dataset.project_id, use_kaapi_schema=True + ) + # Create batch job batch_job = BatchJob( provider="openai", @@ -782,7 +817,8 @@ async def test_poll_all_pending_evaluations_with_runs( run_name="test_pending_run", dataset_name=test_dataset.name, dataset_id=test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, organization_id=test_dataset.organization_id, project_id=test_dataset.project_id, )