From 631f3f40e9ff8b97abb3d3dd8197eb561534b9fc Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 9 Apr 2026 22:36:45 +0530 Subject: [PATCH 1/9] first stab at costing --- .../050_add_cost_to_evaluation_run.py | 33 +++ backend/app/crud/evaluations/__init__.py | 13 ++ backend/app/crud/evaluations/core.py | 3 + backend/app/crud/evaluations/embeddings.py | 4 + backend/app/crud/evaluations/pricing.py | 200 ++++++++++++++++++ backend/app/crud/evaluations/processing.py | 36 +++- backend/app/models/evaluation.py | 12 ++ 7 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 backend/app/alembic/versions/050_add_cost_to_evaluation_run.py create mode 100644 backend/app/crud/evaluations/pricing.py diff --git a/backend/app/alembic/versions/050_add_cost_to_evaluation_run.py b/backend/app/alembic/versions/050_add_cost_to_evaluation_run.py new file mode 100644 index 000000000..6d63de3e8 --- /dev/null +++ b/backend/app/alembic/versions/050_add_cost_to_evaluation_run.py @@ -0,0 +1,33 @@ +"""add cost tracking to evaluation_run + +Revision ID: 050 +Revises: 049 +Create Date: 2026-04-09 12:00:00.000000 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "050" +down_revision = "049" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "evaluation_run", + sa.Column( + "cost", + postgresql.JSONB(astext_type=sa.Text()), + nullable=True, + comment="Cost tracking (response/embedding tokens and USD)", + ), + ) + + +def downgrade(): + op.drop_column("evaluation_run", "cost") diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index a5824c0a2..8515d81da 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -31,6 +31,13 @@ update_traces_with_cosine_scores, upload_dataset_to_langfuse, ) +from app.crud.evaluations.pricing import ( + build_cost_dict, + build_embedding_cost_entry, + build_response_cost_entry, + calculate_embedding_cost, + calculate_response_cost, +) from app.crud.evaluations.processing import ( check_and_process_evaluation, poll_all_pending_evaluations, @@ -74,6 +81,12 @@ "calculate_average_similarity", "calculate_cosine_similarity", "start_embedding_batch", + # Pricing + "build_cost_dict", + "build_embedding_cost_entry", + "build_response_cost_entry", + "calculate_embedding_cost", + "calculate_response_cost", # Langfuse "create_langfuse_dataset_run", "fetch_trace_scores_from_langfuse", diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 79a3c9d3f..5f1b22ee0 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -197,6 +197,7 @@ def update_evaluation_run( object_store_url: str | None = None, score_trace_url: str | None = None, score: dict | None = None, + cost: dict | None = None, embedding_batch_job_id: int | None = None, ) -> EvaluationRun: """ @@ -226,6 +227,8 @@ def update_evaluation_run( eval_run.object_store_url = object_store_url if score is not None: eval_run.score = score + if cost is not None: + eval_run.cost = cost if embedding_batch_job_id is not None: eval_run.embedding_batch_job_id = embedding_batch_job_id if score_trace_url is not None: diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py index d21f186cc..6c2456a04 100644 --- a/backend/app/crud/evaluations/embeddings.py +++ b/backend/app/crud/evaluations/embeddings.py @@ 
-204,11 +204,15 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, ) continue + # Extract usage for cost tracking + usage = response_body.get("usage") + embedding_pairs.append( { "trace_id": trace_id, "output_embedding": output_embedding, "ground_truth_embedding": ground_truth_embedding, + "usage": usage, } ) diff --git a/backend/app/crud/evaluations/pricing.py b/backend/app/crud/evaluations/pricing.py new file mode 100644 index 000000000..0702a9385 --- /dev/null +++ b/backend/app/crud/evaluations/pricing.py @@ -0,0 +1,200 @@ +""" +Pricing utilities for evaluation cost tracking. + +This module provides model pricing data and cost calculation functions +for both response generation and embedding stages of evaluation runs. + +Pricing uses OpenAI Batch API rates (50% cheaper than real-time). +Source: https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json +""" + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +# Batch API pricing in USD per token +MODEL_PRICING: dict[str, dict[str, Any]] = { + # Chat models (batch pricing) + "gpt-4o": { + "mode": "chat", + "input_cost_per_token": 1.25e-06, + "output_cost_per_token": 5e-06, + }, + "gpt-4o-2024-08-06": { + "mode": "chat", + "input_cost_per_token": 1.25e-06, + "output_cost_per_token": 5e-06, + }, + "gpt-4o-mini": { + "mode": "chat", + "input_cost_per_token": 7.5e-08, + "output_cost_per_token": 3e-07, + }, + "gpt-4o-mini-2024-07-18": { + "mode": "chat", + "input_cost_per_token": 7.5e-08, + "output_cost_per_token": 3e-07, + }, + # Embedding models (batch pricing) + "text-embedding-3-large": { + "mode": "embedding", + "input_cost_per_token": 6.5e-08, + }, + "text-embedding-3-small": { + "mode": "embedding", + "input_cost_per_token": 1e-08, + }, + "text-embedding-ada-002": { + "mode": "embedding", + "input_cost_per_token": 1e-07, + }, +} + + +def calculate_response_cost(model: str, input_tokens: int, output_tokens: int) -> float: + """ + Calculate USD cost for response generation. + + Args: + model: OpenAI model name (e.g., "gpt-4o") + input_tokens: Number of input tokens + output_tokens: Number of output tokens + + Returns: + Cost in USD. Returns 0.0 if model is unknown. + """ + pricing = MODEL_PRICING.get(model) + if not pricing: + logger.warning( + f"[calculate_response_cost] Unknown model '{model}', returning cost 0.0" + ) + return 0.0 + + input_cost = input_tokens * pricing.get("input_cost_per_token", 0) + output_cost = output_tokens * pricing.get("output_cost_per_token", 0) + return input_cost + output_cost + + +def calculate_embedding_cost(model: str, prompt_tokens: int) -> float: + """ + Calculate USD cost for embeddings. + + Args: + model: OpenAI embedding model name (e.g., "text-embedding-3-large") + prompt_tokens: Number of prompt tokens + + Returns: + Cost in USD. Returns 0.0 if model is unknown. + """ + pricing = MODEL_PRICING.get(model) + if not pricing: + logger.warning( + f"[calculate_embedding_cost] Unknown model '{model}', returning cost 0.0" + ) + return 0.0 + + return prompt_tokens * pricing.get("input_cost_per_token", 0) + + +def build_response_cost_entry( + model: str, results: list[dict[str, Any]] +) -> dict[str, Any]: + """ + Aggregate token usage from parsed evaluation results and calculate cost. 
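+
+    Illustrative example (hypothetical numbers): two results whose usage dicts
+    are {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} and
+    {"input_tokens": 200, "output_tokens": 75, "total_tokens": 275} aggregate
+    to 300/125/425 tokens; at the "gpt-4o-mini" batch rates above this prices
+    out to 300 * 7.5e-08 + 125 * 3e-07 = 0.00006 USD.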
+ + Args: + model: OpenAI model name used for response generation + results: Parsed evaluation results from parse_evaluation_output(), + each containing a "usage" dict with input_tokens/output_tokens/total_tokens + + Returns: + Response cost entry for the cost JSONB field + """ + total_input_tokens = 0 + total_output_tokens = 0 + total_tokens = 0 + + for result in results: + usage = result.get("usage") + if not usage: + continue + total_input_tokens += usage.get("input_tokens", 0) + total_output_tokens += usage.get("output_tokens", 0) + total_tokens += usage.get("total_tokens", 0) + + cost_usd = calculate_response_cost(model, total_input_tokens, total_output_tokens) + + return { + "model": model, + "input_tokens": total_input_tokens, + "output_tokens": total_output_tokens, + "total_tokens": total_tokens, + "cost_usd": round(cost_usd, 6), + } + + +def build_embedding_cost_entry( + model: str, raw_results: list[dict[str, Any]] +) -> dict[str, Any]: + """ + Aggregate token usage from raw embedding batch results and calculate cost. + + Args: + model: OpenAI embedding model name + raw_results: Raw JSONL lines from embedding batch output, + each containing response.body.usage with prompt_tokens/total_tokens + + Returns: + Embedding cost entry for the cost JSONB field + """ + total_prompt_tokens = 0 + total_tokens = 0 + + for response in raw_results: + usage = response.get("response", {}).get("body", {}).get("usage") + if not usage: + continue + total_prompt_tokens += usage.get("prompt_tokens", 0) + total_tokens += usage.get("total_tokens", 0) + + cost_usd = calculate_embedding_cost(model, total_prompt_tokens) + + return { + "model": model, + "prompt_tokens": total_prompt_tokens, + "total_tokens": total_tokens, + "cost_usd": round(cost_usd, 6), + } + + +def build_cost_dict( + response_entry: dict[str, Any] | None = None, + embedding_entry: dict[str, Any] | None = None, +) -> dict[str, Any]: + """ + Combine response and embedding cost entries into the final cost JSONB structure. 
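+
+    Illustrative shape of the returned dict (values are made up):
+
+        {
+            "response": {"model": "gpt-4o", ..., "cost_usd": 0.001},
+            "embedding": {"model": "text-embedding-3-large", ..., "cost_usd": 0.00002},
+            "total_cost_usd": 0.00102,
+        }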
+ + Args: + response_entry: Response cost entry from build_response_cost_entry() + embedding_entry: Embedding cost entry from build_embedding_cost_entry() + + Returns: + Combined cost dict with total_cost_usd + """ + cost: dict[str, Any] = {} + + response_cost = 0.0 + embedding_cost = 0.0 + + if response_entry: + cost["response"] = response_entry + response_cost = response_entry.get("cost_usd", 0.0) + + if embedding_entry: + cost["embedding"] = embedding_entry + embedding_cost = embedding_entry.get("cost_usd", 0.0) + + cost["total_cost_usd"] = round(response_cost + embedding_cost, 6) + + return cost diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 1fa82b39f..e91a13002 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -29,10 +29,16 @@ from app.crud.evaluations.batch import fetch_dataset_items from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config from app.crud.evaluations.embeddings import ( + EMBEDDING_MODEL, calculate_average_similarity, parse_embedding_results, start_embedding_batch, ) +from app.crud.evaluations.pricing import ( + build_cost_dict, + build_embedding_cost_entry, + build_response_cost_entry, +) from app.crud.evaluations.langfuse import ( create_langfuse_dataset_run, update_traces_with_cosine_scores, @@ -332,6 +338,18 @@ async def process_completed_evaluation( # Use model stored at creation time for cost tracking model = resolve_model_from_config(session=session, eval_run=eval_run) + # Aggregate response generation cost + try: + response_cost_entry = build_response_cost_entry( + model=model, results=results + ) + cost = build_cost_dict(response_entry=response_cost_entry) + update_evaluation_run(session=session, eval_run=eval_run, cost=cost) + except Exception as cost_err: + logger.warning( + f"[process_completed_evaluation] {log_prefix} Failed to calculate response cost | {cost_err}" + ) + trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, @@ -488,7 +506,23 @@ async def process_completed_embedding_batch( exc_info=True, ) - # Step 7: Mark evaluation as completed + # Step 7: Accumulate embedding cost onto existing response cost + try: + embedding_cost_entry = build_embedding_cost_entry( + model=EMBEDDING_MODEL, raw_results=raw_results + ) + existing_cost = eval_run.cost or {} + response_entry = existing_cost.get("response") + eval_run.cost = build_cost_dict( + response_entry=response_entry, + embedding_entry=embedding_cost_entry, + ) + except Exception as cost_err: + logger.warning( + f"[process_completed_embedding_batch] {log_prefix} Failed to calculate embedding cost | {cost_err}" + ) + + # Step 8: Mark evaluation as completed eval_run = update_evaluation_run( session=session, eval_run=eval_run, status="completed", score=eval_run.score ) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index d2d2beecc..fddc255cb 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -313,6 +313,17 @@ class EvaluationRun(SQLModel, table=True): description="Evaluation scores (e.g., correctness, cosine_similarity, etc.)", ) + # Cost tracking field + cost: dict[str, Any] | None = SQLField( + default=None, + sa_column=Column( + JSONB, + nullable=True, + comment="Cost tracking (response/embedding tokens and USD)", + ), + description="Cost breakdown by stage (response, embedding) with token counts and USD", + ) + # Error message field 
error_message: str | None = SQLField( default=None, @@ -397,6 +408,7 @@ class EvaluationRunPublic(SQLModel): object_store_url: str | None total_items: int score: dict[str, Any] | None + cost: dict[str, Any] | None error_message: str | None organization_id: int project_id: int From b6750f0509503646d145891fdc73e42935b67b43 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 9 Apr 2026 23:27:49 +0530 Subject: [PATCH 2/9] minor fixes --- backend/app/crud/evaluations/pricing.py | 59 ++++++++++++++++++------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/backend/app/crud/evaluations/pricing.py b/backend/app/crud/evaluations/pricing.py index 0702a9385..ffd14c8fa 100644 --- a/backend/app/crud/evaluations/pricing.py +++ b/backend/app/crud/evaluations/pricing.py @@ -15,40 +15,65 @@ # Batch API pricing in USD per token MODEL_PRICING: dict[str, dict[str, Any]] = { - # Chat models (batch pricing) + # GPT-4o (batch pricing) "gpt-4o": { "mode": "chat", "input_cost_per_token": 1.25e-06, "output_cost_per_token": 5e-06, }, - "gpt-4o-2024-08-06": { - "mode": "chat", - "input_cost_per_token": 1.25e-06, - "output_cost_per_token": 5e-06, - }, "gpt-4o-mini": { "mode": "chat", "input_cost_per_token": 7.5e-08, "output_cost_per_token": 3e-07, }, - "gpt-4o-mini-2024-07-18": { + # GPT-4.1 (batch pricing) + "gpt-4.1": { "mode": "chat", - "input_cost_per_token": 7.5e-08, - "output_cost_per_token": 3e-07, + "input_cost_per_token": 1e-06, + "output_cost_per_token": 4e-06, + }, + # GPT-5 (batch pricing) + "gpt-5": { + "mode": "chat", + "input_cost_per_token": 6.25e-07, + "output_cost_per_token": 5e-06, + }, + "gpt-5-mini": { + "mode": "chat", + "input_cost_per_token": 1.25e-07, + "output_cost_per_token": 1e-06, + }, + "gpt-5-nano": { + "mode": "chat", + "input_cost_per_token": 2.5e-08, + "output_cost_per_token": 2e-07, + }, + # GPT-5.4 (batch pricing) + "gpt-5.4": { + "mode": "chat", + "input_cost_per_token": 1.25e-06, + "output_cost_per_token": 7.5e-06, + }, + "gpt-5.4-pro": { + "mode": "chat", + "input_cost_per_token": 1.5e-05, + "output_cost_per_token": 9e-05, + }, + "gpt-5.4-mini": { + "mode": "chat", + "input_cost_per_token": 3.75e-07, + "output_cost_per_token": 2.25e-06, + }, + "gpt-5.4-nano": { + "mode": "chat", + "input_cost_per_token": 1e-07, + "output_cost_per_token": 6.25e-07, }, # Embedding models (batch pricing) "text-embedding-3-large": { "mode": "embedding", "input_cost_per_token": 6.5e-08, }, - "text-embedding-3-small": { - "mode": "embedding", - "input_cost_per_token": 1e-08, - }, - "text-embedding-ada-002": { - "mode": "embedding", - "input_cost_per_token": 1e-07, - }, } From 63eb9428a435244e844fc5f639d30063427cd99f Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Sat, 11 Apr 2026 11:40:46 +0530 Subject: [PATCH 3/9] cleanup --- backend/app/crud/evaluations/__init__.py | 6 +- backend/app/crud/evaluations/core.py | 2 + backend/app/crud/evaluations/embeddings.py | 4 - backend/app/crud/evaluations/pricing.py | 128 ++++---- backend/app/crud/evaluations/processing.py | 95 ++++-- .../tests/crud/evaluations/test_pricing.py | 287 ++++++++++++++++++ .../tests/crud/evaluations/test_processing.py | 61 +++- 7 files changed, 484 insertions(+), 99 deletions(-) create mode 100644 backend/app/tests/crud/evaluations/test_pricing.py diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index 8515d81da..64dfb8a3a 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -35,8 +35,7 @@ build_cost_dict, 
build_embedding_cost_entry, build_response_cost_entry, - calculate_embedding_cost, - calculate_response_cost, + calculate_token_cost, ) from app.crud.evaluations.processing import ( check_and_process_evaluation, @@ -85,8 +84,7 @@ "build_cost_dict", "build_embedding_cost_entry", "build_response_cost_entry", - "calculate_embedding_cost", - "calculate_response_cost", + "calculate_token_cost", # Langfuse "create_langfuse_dataset_run", "fetch_trace_scores_from_langfuse", diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 5f1b22ee0..e52d77cdc 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -212,7 +212,9 @@ def update_evaluation_run( status: New status value (optional) error_message: New error message (optional) object_store_url: New object store URL (optional) + score_trace_url: New per-trace score S3 URL (optional) score: New score dict (optional) + cost: New cost dict (optional) embedding_batch_job_id: New embedding batch job ID (optional) Returns: diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py index 6c2456a04..d21f186cc 100644 --- a/backend/app/crud/evaluations/embeddings.py +++ b/backend/app/crud/evaluations/embeddings.py @@ -204,15 +204,11 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, ) continue - # Extract usage for cost tracking - usage = response_body.get("usage") - embedding_pairs.append( { "trace_id": trace_id, "output_embedding": output_embedding, "ground_truth_embedding": ground_truth_embedding, - "usage": usage, } ) diff --git a/backend/app/crud/evaluations/pricing.py b/backend/app/crud/evaluations/pricing.py index ffd14c8fa..be3d98791 100644 --- a/backend/app/crud/evaluations/pricing.py +++ b/backend/app/crud/evaluations/pricing.py @@ -9,82 +9,82 @@ """ import logging +from collections.abc import Callable, Iterable from typing import Any +from app.crud.evaluations.embeddings import EMBEDDING_MODEL + logger = logging.getLogger(__name__) -# Batch API pricing in USD per token -MODEL_PRICING: dict[str, dict[str, Any]] = { +# Number of decimals to round USD cost values to. +COST_USD_DECIMALS = 6 + +# Batch API pricing in USD per token. 
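+# For instance, the 1.25e-06 input rate for "gpt-4o" below works out to
+# $1.25 per million input tokens at batch pricing.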
+MODEL_PRICING: dict[str, dict[str, float]] = { # GPT-4o (batch pricing) "gpt-4o": { - "mode": "chat", "input_cost_per_token": 1.25e-06, "output_cost_per_token": 5e-06, }, "gpt-4o-mini": { - "mode": "chat", "input_cost_per_token": 7.5e-08, "output_cost_per_token": 3e-07, }, # GPT-4.1 (batch pricing) "gpt-4.1": { - "mode": "chat", "input_cost_per_token": 1e-06, "output_cost_per_token": 4e-06, }, # GPT-5 (batch pricing) "gpt-5": { - "mode": "chat", "input_cost_per_token": 6.25e-07, "output_cost_per_token": 5e-06, }, "gpt-5-mini": { - "mode": "chat", "input_cost_per_token": 1.25e-07, "output_cost_per_token": 1e-06, }, "gpt-5-nano": { - "mode": "chat", "input_cost_per_token": 2.5e-08, "output_cost_per_token": 2e-07, }, # GPT-5.4 (batch pricing) "gpt-5.4": { - "mode": "chat", "input_cost_per_token": 1.25e-06, "output_cost_per_token": 7.5e-06, }, "gpt-5.4-pro": { - "mode": "chat", "input_cost_per_token": 1.5e-05, "output_cost_per_token": 9e-05, }, "gpt-5.4-mini": { - "mode": "chat", "input_cost_per_token": 3.75e-07, "output_cost_per_token": 2.25e-06, }, "gpt-5.4-nano": { - "mode": "chat", "input_cost_per_token": 1e-07, "output_cost_per_token": 6.25e-07, }, # Embedding models (batch pricing) - "text-embedding-3-large": { - "mode": "embedding", + EMBEDDING_MODEL: { "input_cost_per_token": 6.5e-08, }, } -def calculate_response_cost(model: str, input_tokens: int, output_tokens: int) -> float: +def calculate_token_cost( + model: str, input_tokens: int, output_tokens: int = 0 +) -> float: """ - Calculate USD cost for response generation. + Calculate USD cost for a model call given input and output token counts. + + Used for both response generation (input + output tokens) and embeddings + (input tokens only — pass output_tokens=0 or omit). Args: - model: OpenAI model name (e.g., "gpt-4o") - input_tokens: Number of input tokens - output_tokens: Number of output tokens + model: OpenAI model name (e.g., "gpt-4o", "text-embedding-3-large") + input_tokens: Number of input/prompt tokens + output_tokens: Number of output tokens (default 0 for embeddings) Returns: Cost in USD. Returns 0.0 if model is unknown. @@ -92,7 +92,7 @@ def calculate_response_cost(model: str, input_tokens: int, output_tokens: int) - pricing = MODEL_PRICING.get(model) if not pricing: logger.warning( - f"[calculate_response_cost] Unknown model '{model}', returning cost 0.0" + f"[calculate_token_cost] Unknown model '{model}', returning cost 0.0" ) return 0.0 @@ -101,25 +101,31 @@ def calculate_response_cost(model: str, input_tokens: int, output_tokens: int) - return input_cost + output_cost -def calculate_embedding_cost(model: str, prompt_tokens: int) -> float: +def _sum_usage( + items: Iterable[dict[str, Any]], + usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None], + fields: tuple[str, ...], +) -> dict[str, int]: """ - Calculate USD cost for embeddings. + Sum named token fields across items, using a caller-supplied extractor + to locate the per-item usage dict. Args: - model: OpenAI embedding model name (e.g., "text-embedding-3-large") - prompt_tokens: Number of prompt tokens + items: Iterable of items to aggregate + usage_extractor: Function returning the usage dict for an item, or None + fields: Token field names to sum (e.g., "input_tokens", "total_tokens") Returns: - Cost in USD. Returns 0.0 if model is unknown. 
+ Mapping of field name to summed value """ - pricing = MODEL_PRICING.get(model) - if not pricing: - logger.warning( - f"[calculate_embedding_cost] Unknown model '{model}', returning cost 0.0" - ) - return 0.0 - - return prompt_tokens * pricing.get("input_cost_per_token", 0) + totals: dict[str, int] = {field: 0 for field in fields} + for item in items: + usage = usage_extractor(item) + if not usage: + continue + for field in fields: + totals[field] += usage.get(field, 0) + return totals def build_response_cost_entry( @@ -136,26 +142,24 @@ def build_response_cost_entry( Returns: Response cost entry for the cost JSONB field """ - total_input_tokens = 0 - total_output_tokens = 0 - total_tokens = 0 - - for result in results: - usage = result.get("usage") - if not usage: - continue - total_input_tokens += usage.get("input_tokens", 0) - total_output_tokens += usage.get("output_tokens", 0) - total_tokens += usage.get("total_tokens", 0) - - cost_usd = calculate_response_cost(model, total_input_tokens, total_output_tokens) + totals = _sum_usage( + items=results, + usage_extractor=lambda r: r.get("usage"), + fields=("input_tokens", "output_tokens", "total_tokens"), + ) + + cost_usd = calculate_token_cost( + model=model, + input_tokens=totals["input_tokens"], + output_tokens=totals["output_tokens"], + ) return { "model": model, - "input_tokens": total_input_tokens, - "output_tokens": total_output_tokens, - "total_tokens": total_tokens, - "cost_usd": round(cost_usd, 6), + "input_tokens": totals["input_tokens"], + "output_tokens": totals["output_tokens"], + "total_tokens": totals["total_tokens"], + "cost_usd": round(cost_usd, COST_USD_DECIMALS), } @@ -173,23 +177,19 @@ def build_embedding_cost_entry( Returns: Embedding cost entry for the cost JSONB field """ - total_prompt_tokens = 0 - total_tokens = 0 - - for response in raw_results: - usage = response.get("response", {}).get("body", {}).get("usage") - if not usage: - continue - total_prompt_tokens += usage.get("prompt_tokens", 0) - total_tokens += usage.get("total_tokens", 0) + totals = _sum_usage( + items=raw_results, + usage_extractor=lambda r: r.get("response", {}).get("body", {}).get("usage"), + fields=("prompt_tokens", "total_tokens"), + ) - cost_usd = calculate_embedding_cost(model, total_prompt_tokens) + cost_usd = calculate_token_cost(model=model, input_tokens=totals["prompt_tokens"]) return { "model": model, - "prompt_tokens": total_prompt_tokens, - "total_tokens": total_tokens, - "cost_usd": round(cost_usd, 6), + "prompt_tokens": totals["prompt_tokens"], + "total_tokens": totals["total_tokens"], + "cost_usd": round(cost_usd, COST_USD_DECIMALS), } @@ -220,6 +220,6 @@ def build_cost_dict( cost["embedding"] = embedding_entry embedding_cost = embedding_entry.get("cost_usd", 0.0) - cost["total_cost_usd"] = round(response_cost + embedding_cost, 6) + cost["total_cost_usd"] = round(response_cost + embedding_cost, COST_USD_DECIMALS) return cost diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index e91a13002..77502fe8a 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -51,6 +51,58 @@ logger = logging.getLogger(__name__) +def _safe_attach_cost( + eval_run: EvaluationRun, + log_prefix: str, + *, + response_model: str | None = None, + response_results: list[dict[str, Any]] | None = None, + embedding_model: str | None = None, + embedding_raw_results: list[dict[str, Any]] | None = None, +) -> None: + """ + Compute and attach a cost dict to 
eval_run.cost without raising. + + Cost-tracking failures must never block evaluation completion, so any + exception is logged and swallowed. The caller is responsible for + persisting eval_run via update_evaluation_run. + + When called for the embedding stage only, any previously-computed + response entry on eval_run.cost is preserved. + + Args: + eval_run: EvaluationRun whose cost field will be set + log_prefix: Caller-provided log prefix (org/project/eval ids) + response_model: Model name for response cost (response stage only) + response_results: Parsed evaluation results (response stage only) + embedding_model: Model name for embedding cost (embedding stage only) + embedding_raw_results: Raw embedding batch results (embedding stage only) + """ + try: + if response_model is not None and response_results is not None: + response_entry = build_response_cost_entry( + model=response_model, results=response_results + ) + else: + # Preserve any response entry computed during an earlier stage. + response_entry = (eval_run.cost or {}).get("response") + + embedding_entry: dict[str, Any] | None = None + if embedding_model is not None and embedding_raw_results is not None: + embedding_entry = build_embedding_cost_entry( + model=embedding_model, raw_results=embedding_raw_results + ) + + eval_run.cost = build_cost_dict( + response_entry=response_entry, + embedding_entry=embedding_entry, + ) + except Exception as cost_err: + logger.warning( + f"[_safe_attach_cost] {log_prefix} Failed to compute cost | {cost_err}" + ) + + def _extract_batch_error_message( provider: OpenAIBatchProvider, error_file_id: str, @@ -339,16 +391,13 @@ async def process_completed_evaluation( model = resolve_model_from_config(session=session, eval_run=eval_run) # Aggregate response generation cost - try: - response_cost_entry = build_response_cost_entry( - model=model, results=results - ) - cost = build_cost_dict(response_entry=response_cost_entry) - update_evaluation_run(session=session, eval_run=eval_run, cost=cost) - except Exception as cost_err: - logger.warning( - f"[process_completed_evaluation] {log_prefix} Failed to calculate response cost | {cost_err}" - ) + _safe_attach_cost( + eval_run=eval_run, + log_prefix=log_prefix, + response_model=model, + response_results=results, + ) + update_evaluation_run(session=session, eval_run=eval_run, cost=eval_run.cost) trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, @@ -507,24 +556,20 @@ async def process_completed_embedding_batch( ) # Step 7: Accumulate embedding cost onto existing response cost - try: - embedding_cost_entry = build_embedding_cost_entry( - model=EMBEDDING_MODEL, raw_results=raw_results - ) - existing_cost = eval_run.cost or {} - response_entry = existing_cost.get("response") - eval_run.cost = build_cost_dict( - response_entry=response_entry, - embedding_entry=embedding_cost_entry, - ) - except Exception as cost_err: - logger.warning( - f"[process_completed_embedding_batch] {log_prefix} Failed to calculate embedding cost | {cost_err}" - ) + _safe_attach_cost( + eval_run=eval_run, + log_prefix=log_prefix, + embedding_model=EMBEDDING_MODEL, + embedding_raw_results=raw_results, + ) # Step 8: Mark evaluation as completed eval_run = update_evaluation_run( - session=session, eval_run=eval_run, status="completed", score=eval_run.score + session=session, + eval_run=eval_run, + status="completed", + score=eval_run.score, + cost=eval_run.cost, ) logger.info( diff --git a/backend/app/tests/crud/evaluations/test_pricing.py 
b/backend/app/tests/crud/evaluations/test_pricing.py new file mode 100644 index 000000000..b938cb7dc --- /dev/null +++ b/backend/app/tests/crud/evaluations/test_pricing.py @@ -0,0 +1,287 @@ +import pytest + +from app.crud.evaluations.pricing import ( + COST_USD_DECIMALS, + MODEL_PRICING, + build_cost_dict, + build_embedding_cost_entry, + build_response_cost_entry, + calculate_token_cost, +) + + +class TestCalculateTokenCost: + """Tests for calculate_token_cost function.""" + + def test_known_chat_model_input_and_output(self) -> None: + """Cost is sum of input and output token costs for a known chat model.""" + pricing = MODEL_PRICING["gpt-4o"] + expected = ( + 1000 * pricing["input_cost_per_token"] + + 500 * pricing["output_cost_per_token"] + ) + + cost = calculate_token_cost( + model="gpt-4o", input_tokens=1000, output_tokens=500 + ) + + assert cost == pytest.approx(expected) + + def test_known_embedding_model_defaults_output_tokens_to_zero(self) -> None: + """Embedding models charge only for input tokens; output_tokens defaults to 0.""" + pricing = MODEL_PRICING["text-embedding-3-large"] + expected = 2000 * pricing["input_cost_per_token"] + + cost = calculate_token_cost(model="text-embedding-3-large", input_tokens=2000) + + assert cost == pytest.approx(expected) + + def test_unknown_model_returns_zero(self) -> None: + """Unknown models return 0.0 instead of raising.""" + cost = calculate_token_cost( + model="not-a-real-model", input_tokens=100, output_tokens=50 + ) + + assert cost == 0.0 + + def test_zero_tokens_returns_zero(self) -> None: + """Zero tokens for a known model returns zero cost.""" + cost = calculate_token_cost(model="gpt-4o", input_tokens=0, output_tokens=0) + + assert cost == 0.0 + + def test_embedding_model_with_explicit_output_tokens(self) -> None: + """Passing output_tokens to an embedding model adds 0 cost (no output rate).""" + pricing = MODEL_PRICING["text-embedding-3-large"] + expected = 100 * pricing["input_cost_per_token"] + + cost = calculate_token_cost( + model="text-embedding-3-large", input_tokens=100, output_tokens=999 + ) + + assert cost == pytest.approx(expected) + + +class TestBuildResponseCostEntry: + """Tests for build_response_cost_entry function.""" + + def test_basic_aggregation(self) -> None: + """Sums input/output/total tokens across results and computes USD cost.""" + results = [ + { + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + } + }, + { + "usage": { + "input_tokens": 200, + "output_tokens": 75, + "total_tokens": 275, + } + }, + ] + + entry = build_response_cost_entry(model="gpt-4o", results=results) + + assert entry["model"] == "gpt-4o" + assert entry["input_tokens"] == 300 + assert entry["output_tokens"] == 125 + assert entry["total_tokens"] == 425 + pricing = MODEL_PRICING["gpt-4o"] + expected_cost = round( + 300 * pricing["input_cost_per_token"] + + 125 * pricing["output_cost_per_token"], + COST_USD_DECIMALS, + ) + assert entry["cost_usd"] == expected_cost + + def test_empty_results(self) -> None: + """Empty results yields zero tokens and zero cost.""" + entry = build_response_cost_entry(model="gpt-4o", results=[]) + + assert entry["input_tokens"] == 0 + assert entry["output_tokens"] == 0 + assert entry["total_tokens"] == 0 + assert entry["cost_usd"] == 0.0 + + def test_results_missing_usage_are_skipped(self) -> None: + """Items without a usage dict are skipped without raising.""" + results = [ + {"usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}}, + {}, # No usage key + {"usage": 
None}, # Explicit None + ] + + entry = build_response_cost_entry(model="gpt-4o", results=results) + + assert entry["input_tokens"] == 10 + assert entry["output_tokens"] == 5 + assert entry["total_tokens"] == 15 + + def test_unknown_model_yields_zero_cost(self) -> None: + """Unknown model still aggregates token counts but reports zero cost.""" + results = [ + {"usage": {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}} + ] + + entry = build_response_cost_entry(model="mystery-model", results=results) + + assert entry["input_tokens"] == 100 + assert entry["output_tokens"] == 50 + assert entry["cost_usd"] == 0.0 + + +class TestBuildEmbeddingCostEntry: + """Tests for build_embedding_cost_entry function.""" + + def test_basic_aggregation(self) -> None: + """Sums prompt/total tokens from raw batch results and computes USD cost.""" + raw_results = [ + { + "response": { + "body": {"usage": {"prompt_tokens": 100, "total_tokens": 100}} + } + }, + { + "response": { + "body": {"usage": {"prompt_tokens": 250, "total_tokens": 250}} + } + }, + ] + + entry = build_embedding_cost_entry( + model="text-embedding-3-large", raw_results=raw_results + ) + + assert entry["model"] == "text-embedding-3-large" + assert entry["prompt_tokens"] == 350 + assert entry["total_tokens"] == 350 + pricing = MODEL_PRICING["text-embedding-3-large"] + expected_cost = round(350 * pricing["input_cost_per_token"], COST_USD_DECIMALS) + assert entry["cost_usd"] == expected_cost + + def test_empty_raw_results(self) -> None: + """Empty raw_results yields zero tokens and zero cost.""" + entry = build_embedding_cost_entry( + model="text-embedding-3-large", raw_results=[] + ) + + assert entry["prompt_tokens"] == 0 + assert entry["total_tokens"] == 0 + assert entry["cost_usd"] == 0.0 + + def test_results_missing_usage_are_skipped(self) -> None: + """Items without nested usage are skipped (e.g., error rows).""" + raw_results = [ + { + "response": { + "body": {"usage": {"prompt_tokens": 50, "total_tokens": 50}} + } + }, + {"error": {"message": "Rate limited"}}, # No response.body.usage + {"response": {"body": {}}}, # body present, usage missing + ] + + entry = build_embedding_cost_entry( + model="text-embedding-3-large", raw_results=raw_results + ) + + assert entry["prompt_tokens"] == 50 + assert entry["total_tokens"] == 50 + + def test_unknown_model_yields_zero_cost(self) -> None: + """Unknown embedding model still aggregates tokens but reports zero cost.""" + raw_results = [ + { + "response": { + "body": {"usage": {"prompt_tokens": 100, "total_tokens": 100}} + } + } + ] + + entry = build_embedding_cost_entry( + model="mystery-embed", raw_results=raw_results + ) + + assert entry["prompt_tokens"] == 100 + assert entry["cost_usd"] == 0.0 + + +class TestBuildCostDict: + """Tests for build_cost_dict function.""" + + def test_response_only(self) -> None: + """Only response entry → embedding key absent, total = response cost.""" + response_entry = { + "model": "gpt-4o", + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "cost_usd": 0.001234, + } + + cost = build_cost_dict(response_entry=response_entry) + + assert cost["response"] == response_entry + assert "embedding" not in cost + assert cost["total_cost_usd"] == 0.001234 + + def test_embedding_only(self) -> None: + """Only embedding entry → response key absent, total = embedding cost.""" + embedding_entry = { + "model": "text-embedding-3-large", + "prompt_tokens": 200, + "total_tokens": 200, + "cost_usd": 0.000013, + } + + cost = 
build_cost_dict(embedding_entry=embedding_entry) + + assert cost["embedding"] == embedding_entry + assert "response" not in cost + assert cost["total_cost_usd"] == 0.000013 + + def test_both_entries(self) -> None: + """Both entries → both keys present, total = sum of both costs.""" + response_entry = {"cost_usd": 0.001234} + embedding_entry = {"cost_usd": 0.000013} + + cost = build_cost_dict( + response_entry=response_entry, embedding_entry=embedding_entry + ) + + assert cost["response"] == response_entry + assert cost["embedding"] == embedding_entry + assert cost["total_cost_usd"] == round(0.001234 + 0.000013, COST_USD_DECIMALS) + + def test_neither_entry(self) -> None: + """No entries → only total_cost_usd present, equal to 0.0.""" + cost = build_cost_dict() + + assert cost == {"total_cost_usd": 0.0} + + def test_total_is_rounded(self) -> None: + """total_cost_usd is rounded to COST_USD_DECIMALS.""" + response_entry = {"cost_usd": 0.0000001} + embedding_entry = {"cost_usd": 0.0000002} + + cost = build_cost_dict( + response_entry=response_entry, embedding_entry=embedding_entry + ) + + # 0.0000003 rounded to 6 decimals → 0.0 + assert cost["total_cost_usd"] == 0.0 + + def test_entry_missing_cost_usd_treated_as_zero(self) -> None: + """Entries without a cost_usd key default to 0 in the total.""" + response_entry = {"model": "gpt-4o"} # No cost_usd + embedding_entry = {"cost_usd": 0.000050} + + cost = build_cost_dict( + response_entry=response_entry, embedding_entry=embedding_entry + ) + + assert cost["total_cost_usd"] == 0.000050 diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index 52162654d..51e2321db 100644 --- a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -357,7 +357,11 @@ async def test_process_completed_evaluation_success( "body": { "id": "resp_123", "output": "Answer 1", - "usage": {"total_tokens": 10}, + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + }, } }, } @@ -397,6 +401,20 @@ async def test_process_completed_evaluation_success( mock_create_langfuse.assert_called_once() mock_start_embedding.assert_called_once() + # Cost tracking: response cost should be aggregated and persisted. + db.refresh(result) + assert result.cost is not None + assert "response" in result.cost + response_cost = result.cost["response"] + assert response_cost["model"] == "gpt-4o" + assert response_cost["input_tokens"] == 100 + assert response_cost["output_tokens"] == 50 + assert response_cost["total_tokens"] == 150 + assert response_cost["cost_usd"] > 0 + assert result.cost["total_cost_usd"] == response_cost["cost_usd"] + # Embedding cost is added later by process_completed_embedding_batch. + assert "embedding" not in result.cost + @pytest.mark.asyncio @patch("app.crud.evaluations.processing.download_batch_results") @patch("app.crud.evaluations.processing.fetch_dataset_items") @@ -547,7 +565,31 @@ async def test_process_completed_embedding_batch_success( eval_run_with_embedding_batch, ): """Test successfully processing completed embedding batch.""" - mock_download.return_value = [] + # Pre-populate eval_run.cost with a response entry to verify that the + # embedding stage merges (not overwrites) existing cost data. 
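+    # The figures themselves are arbitrary; the assertions below only check
+    # that this response entry is preserved and summed into total_cost_usd.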
+ eval_run_with_embedding_batch.cost = { + "response": { + "model": "gpt-4o", + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "cost_usd": 0.000375, + }, + "total_cost_usd": 0.000375, + } + db.add(eval_run_with_embedding_batch) + db.commit() + db.refresh(eval_run_with_embedding_batch) + + # Raw results carry the usage payload that build_embedding_cost_entry reads. + mock_download.return_value = [ + { + "custom_id": "trace_123", + "response": { + "body": {"usage": {"prompt_tokens": 200, "total_tokens": 200}} + }, + } + ] mock_parse.return_value = [ { "item_id": "item1", @@ -586,6 +628,21 @@ async def test_process_completed_embedding_batch_success( assert cosine_score is not None assert cosine_score["avg"] == 0.95 + # Cost tracking: embedding entry is added, response entry is preserved, + # and total_cost_usd is the sum of both. + assert result.cost is not None + assert "response" in result.cost + assert "embedding" in result.cost + assert result.cost["response"]["cost_usd"] == 0.000375 + embedding_cost = result.cost["embedding"] + assert embedding_cost["model"] == "text-embedding-3-large" + assert embedding_cost["prompt_tokens"] == 200 + assert embedding_cost["total_tokens"] == 200 + assert embedding_cost["cost_usd"] > 0 + assert result.cost["total_cost_usd"] == pytest.approx( + 0.000375 + embedding_cost["cost_usd"] + ) + @pytest.mark.asyncio @patch("app.crud.evaluations.processing.download_batch_results") @patch("app.crud.evaluations.processing.parse_embedding_results") From b75a043ee9ddbe89767801335d2e66fd012b62c6 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 16 Apr 2026 12:23:43 +0530 Subject: [PATCH 4/9] first stab of using config table --- .../versions/052_create_model_config_table.py | 5 +- backend/app/crud/evaluations/__init__.py | 11 - backend/app/crud/evaluations/pricing.py | 225 -------------- backend/app/crud/evaluations/processing.py | 130 ++++++-- backend/app/crud/model_config.py | 15 + backend/app/models/llm/constants.py | 3 - backend/app/services/llm/jobs.py | 3 +- backend/app/services/llm/mappers.py | 21 +- .../tests/crud/evaluations/test_pricing.py | 287 ------------------ .../tests/crud/evaluations/test_processing.py | 13 +- backend/app/tests/services/llm/test_jobs.py | 2 +- .../app/tests/services/llm/test_mappers.py | 99 +++--- 12 files changed, 208 insertions(+), 606 deletions(-) delete mode 100644 backend/app/crud/evaluations/pricing.py delete mode 100644 backend/app/tests/crud/evaluations/test_pricing.py diff --git a/backend/app/alembic/versions/052_create_model_config_table.py b/backend/app/alembic/versions/052_create_model_config_table.py index e74b94641..c72a7ab32 100644 --- a/backend/app/alembic/versions/052_create_model_config_table.py +++ b/backend/app/alembic/versions/052_create_model_config_table.py @@ -115,7 +115,10 @@ def upgrade(): (16, 'openai', 'gpt-5.4', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. 
Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 2.5, "output_token_cost": 15}, "batch": {"input_token_cost": 1.25, "output_token_cost": 7.5}}', true, NOW(), NOW()), (17, 'openai', 'gpt-5.4-mini', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 0.75, "output_token_cost": 4.5}, "batch": {"input_token_cost": 0.375, "output_token_cost": 2.25}}', true, NOW(), NOW()), (18, 'openai', 'gpt-5.4-nano', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 0.2, "output_token_cost": 1.25}, "batch": {"input_token_cost": 0.1, "output_token_cost": 0.625}}', true, NOW(), NOW()), - (19, 'openai', 'gpt-5.4-pro', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 30, "output_token_cost": 180}, "batch": {"input_token_cost": 15, "output_token_cost": 90}}', true, NOW(), NOW()) + (19, 'openai', 'gpt-5.4-pro', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. 
Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 30, "output_token_cost": 180}, "batch": {"input_token_cost": 15, "output_token_cost": 90}}', true, NOW(), NOW()), + (20, 'openai', 'text-embedding-3-large', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.13, "output_token_cost": 0}, "batch": {"input_token_cost": 0.065, "output_token_cost": 0}}', true, NOW(), NOW()), + (21, 'openai', 'text-embedding-3-small', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.02, "output_token_cost": 0}, "batch": {"input_token_cost": 0.01, "output_token_cost": 0}}', true, NOW(), NOW()), + (22, 'openai', 'text-embedding-ada-002', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.1, "output_token_cost": 0}, "batch": {"input_token_cost": 0.05, "output_token_cost": 0}}', true, NOW(), NOW()) """ ) diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index 64dfb8a3a..a5824c0a2 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -31,12 +31,6 @@ update_traces_with_cosine_scores, upload_dataset_to_langfuse, ) -from app.crud.evaluations.pricing import ( - build_cost_dict, - build_embedding_cost_entry, - build_response_cost_entry, - calculate_token_cost, -) from app.crud.evaluations.processing import ( check_and_process_evaluation, poll_all_pending_evaluations, @@ -80,11 +74,6 @@ "calculate_average_similarity", "calculate_cosine_similarity", "start_embedding_batch", - # Pricing - "build_cost_dict", - "build_embedding_cost_entry", - "build_response_cost_entry", - "calculate_token_cost", # Langfuse "create_langfuse_dataset_run", "fetch_trace_scores_from_langfuse", diff --git a/backend/app/crud/evaluations/pricing.py b/backend/app/crud/evaluations/pricing.py deleted file mode 100644 index be3d98791..000000000 --- a/backend/app/crud/evaluations/pricing.py +++ /dev/null @@ -1,225 +0,0 @@ -""" -Pricing utilities for evaluation cost tracking. - -This module provides model pricing data and cost calculation functions -for both response generation and embedding stages of evaluation runs. - -Pricing uses OpenAI Batch API rates (50% cheaper than real-time). -Source: https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json -""" - -import logging -from collections.abc import Callable, Iterable -from typing import Any - -from app.crud.evaluations.embeddings import EMBEDDING_MODEL - -logger = logging.getLogger(__name__) - -# Number of decimals to round USD cost values to. -COST_USD_DECIMALS = 6 - -# Batch API pricing in USD per token. 
-MODEL_PRICING: dict[str, dict[str, float]] = { - # GPT-4o (batch pricing) - "gpt-4o": { - "input_cost_per_token": 1.25e-06, - "output_cost_per_token": 5e-06, - }, - "gpt-4o-mini": { - "input_cost_per_token": 7.5e-08, - "output_cost_per_token": 3e-07, - }, - # GPT-4.1 (batch pricing) - "gpt-4.1": { - "input_cost_per_token": 1e-06, - "output_cost_per_token": 4e-06, - }, - # GPT-5 (batch pricing) - "gpt-5": { - "input_cost_per_token": 6.25e-07, - "output_cost_per_token": 5e-06, - }, - "gpt-5-mini": { - "input_cost_per_token": 1.25e-07, - "output_cost_per_token": 1e-06, - }, - "gpt-5-nano": { - "input_cost_per_token": 2.5e-08, - "output_cost_per_token": 2e-07, - }, - # GPT-5.4 (batch pricing) - "gpt-5.4": { - "input_cost_per_token": 1.25e-06, - "output_cost_per_token": 7.5e-06, - }, - "gpt-5.4-pro": { - "input_cost_per_token": 1.5e-05, - "output_cost_per_token": 9e-05, - }, - "gpt-5.4-mini": { - "input_cost_per_token": 3.75e-07, - "output_cost_per_token": 2.25e-06, - }, - "gpt-5.4-nano": { - "input_cost_per_token": 1e-07, - "output_cost_per_token": 6.25e-07, - }, - # Embedding models (batch pricing) - EMBEDDING_MODEL: { - "input_cost_per_token": 6.5e-08, - }, -} - - -def calculate_token_cost( - model: str, input_tokens: int, output_tokens: int = 0 -) -> float: - """ - Calculate USD cost for a model call given input and output token counts. - - Used for both response generation (input + output tokens) and embeddings - (input tokens only — pass output_tokens=0 or omit). - - Args: - model: OpenAI model name (e.g., "gpt-4o", "text-embedding-3-large") - input_tokens: Number of input/prompt tokens - output_tokens: Number of output tokens (default 0 for embeddings) - - Returns: - Cost in USD. Returns 0.0 if model is unknown. - """ - pricing = MODEL_PRICING.get(model) - if not pricing: - logger.warning( - f"[calculate_token_cost] Unknown model '{model}', returning cost 0.0" - ) - return 0.0 - - input_cost = input_tokens * pricing.get("input_cost_per_token", 0) - output_cost = output_tokens * pricing.get("output_cost_per_token", 0) - return input_cost + output_cost - - -def _sum_usage( - items: Iterable[dict[str, Any]], - usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None], - fields: tuple[str, ...], -) -> dict[str, int]: - """ - Sum named token fields across items, using a caller-supplied extractor - to locate the per-item usage dict. - - Args: - items: Iterable of items to aggregate - usage_extractor: Function returning the usage dict for an item, or None - fields: Token field names to sum (e.g., "input_tokens", "total_tokens") - - Returns: - Mapping of field name to summed value - """ - totals: dict[str, int] = {field: 0 for field in fields} - for item in items: - usage = usage_extractor(item) - if not usage: - continue - for field in fields: - totals[field] += usage.get(field, 0) - return totals - - -def build_response_cost_entry( - model: str, results: list[dict[str, Any]] -) -> dict[str, Any]: - """ - Aggregate token usage from parsed evaluation results and calculate cost. 
- - Args: - model: OpenAI model name used for response generation - results: Parsed evaluation results from parse_evaluation_output(), - each containing a "usage" dict with input_tokens/output_tokens/total_tokens - - Returns: - Response cost entry for the cost JSONB field - """ - totals = _sum_usage( - items=results, - usage_extractor=lambda r: r.get("usage"), - fields=("input_tokens", "output_tokens", "total_tokens"), - ) - - cost_usd = calculate_token_cost( - model=model, - input_tokens=totals["input_tokens"], - output_tokens=totals["output_tokens"], - ) - - return { - "model": model, - "input_tokens": totals["input_tokens"], - "output_tokens": totals["output_tokens"], - "total_tokens": totals["total_tokens"], - "cost_usd": round(cost_usd, COST_USD_DECIMALS), - } - - -def build_embedding_cost_entry( - model: str, raw_results: list[dict[str, Any]] -) -> dict[str, Any]: - """ - Aggregate token usage from raw embedding batch results and calculate cost. - - Args: - model: OpenAI embedding model name - raw_results: Raw JSONL lines from embedding batch output, - each containing response.body.usage with prompt_tokens/total_tokens - - Returns: - Embedding cost entry for the cost JSONB field - """ - totals = _sum_usage( - items=raw_results, - usage_extractor=lambda r: r.get("response", {}).get("body", {}).get("usage"), - fields=("prompt_tokens", "total_tokens"), - ) - - cost_usd = calculate_token_cost(model=model, input_tokens=totals["prompt_tokens"]) - - return { - "model": model, - "prompt_tokens": totals["prompt_tokens"], - "total_tokens": totals["total_tokens"], - "cost_usd": round(cost_usd, COST_USD_DECIMALS), - } - - -def build_cost_dict( - response_entry: dict[str, Any] | None = None, - embedding_entry: dict[str, Any] | None = None, -) -> dict[str, Any]: - """ - Combine response and embedding cost entries into the final cost JSONB structure. 
- - Args: - response_entry: Response cost entry from build_response_cost_entry() - embedding_entry: Embedding cost entry from build_embedding_cost_entry() - - Returns: - Combined cost dict with total_cost_usd - """ - cost: dict[str, Any] = {} - - response_cost = 0.0 - embedding_cost = 0.0 - - if response_entry: - cost["response"] = response_entry - response_cost = response_entry.get("cost_usd", 0.0) - - if embedding_entry: - cost["embedding"] = embedding_entry - embedding_cost = embedding_entry.get("cost_usd", 0.0) - - cost["total_cost_usd"] = round(response_cost + embedding_cost, COST_USD_DECIMALS) - - return cost diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 77502fe8a..16d5b1152 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -27,31 +27,124 @@ ) from app.core.batch.base import BATCH_KEY from app.crud.evaluations.batch import fetch_dataset_items -from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config +from app.crud.evaluations.core import resolve_model_from_config, update_evaluation_run from app.crud.evaluations.embeddings import ( EMBEDDING_MODEL, calculate_average_similarity, parse_embedding_results, start_embedding_batch, ) -from app.crud.evaluations.pricing import ( - build_cost_dict, - build_embedding_cost_entry, - build_response_cost_entry, -) from app.crud.evaluations.langfuse import ( create_langfuse_dataset_run, update_traces_with_cosine_scores, ) from app.crud.job import get_batch_job, update_batch_job +from app.crud.model_config import estimate_model_cost from app.models import EvaluationRun from app.models.batch_job import BatchJob, BatchJobUpdate from app.utils import get_langfuse_client, get_openai_client logger = logging.getLogger(__name__) +# Number of decimals to round USD cost values to. +COST_USD_DECIMALS = 6 + + +def _cost_usd_from_estimate(estimate: dict[str, Any] | None) -> float: + """Sum the unrounded per-direction costs and round to our USD precision. + + `estimate_model_cost` returns `total_cost` already rounded to 4 decimals, + which drops sub-cent precision we want to retain here. 
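+
+    For example (hypothetical figures), an estimate of
+    {"input_cost": 0.0001234, "output_cost": 0.0000005} sums to 0.0001239
+    and is returned as 0.000124 after rounding to COST_USD_DECIMALS (6) places.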
+ """ + if not estimate: + return 0.0 + total = float(estimate.get("input_cost", 0.0)) + float( + estimate.get("output_cost", 0.0) + ) + return round(total, COST_USD_DECIMALS) + + +def _build_response_cost_entry( + session: Session, model: str, results: list[dict[str, Any]] +) -> dict[str, Any]: + """Aggregate token usage from parsed results and compute batch-pricing cost.""" + totals = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + for item in results: + usage = item.get("usage") + if not usage: + continue + for field in totals: + totals[field] += usage.get(field, 0) + + estimate = estimate_model_cost( + session=session, + provider="openai", + model_name=model, + input_tokens=totals["input_tokens"], + output_tokens=totals["output_tokens"], + usage_type="batch", + ) + + return { + "model": model, + "input_tokens": totals["input_tokens"], + "output_tokens": totals["output_tokens"], + "total_tokens": totals["total_tokens"], + "cost_usd": _cost_usd_from_estimate(estimate), + } + + +def _build_embedding_cost_entry( + session: Session, model: str, raw_results: list[dict[str, Any]] +) -> dict[str, Any]: + """Aggregate token usage from raw embedding results and compute batch-pricing cost.""" + totals = {"prompt_tokens": 0, "total_tokens": 0} + for item in raw_results: + usage = item.get("response", {}).get("body", {}).get("usage") + if not usage: + continue + for field in totals: + totals[field] += usage.get(field, 0) + + estimate = estimate_model_cost( + session=session, + provider="openai", + model_name=model, + input_tokens=totals["prompt_tokens"], + output_tokens=0, + usage_type="batch", + ) + + return { + "model": model, + "prompt_tokens": totals["prompt_tokens"], + "total_tokens": totals["total_tokens"], + "cost_usd": _cost_usd_from_estimate(estimate), + } + + +def _build_cost_dict( + response_entry: dict[str, Any] | None, + embedding_entry: dict[str, Any] | None, +) -> dict[str, Any]: + cost: dict[str, Any] = {} + response_cost = 0.0 + embedding_cost = 0.0 + + if response_entry: + cost["response"] = response_entry + response_cost = response_entry.get("cost_usd", 0.0) + + if embedding_entry: + cost["embedding"] = embedding_entry + embedding_cost = embedding_entry.get("cost_usd", 0.0) + + cost["total_cost_usd"] = round(response_cost + embedding_cost, COST_USD_DECIMALS) + return cost + def _safe_attach_cost( + session: Session, eval_run: EvaluationRun, log_prefix: str, *, @@ -67,21 +160,16 @@ def _safe_attach_cost( exception is logged and swallowed. The caller is responsible for persisting eval_run via update_evaluation_run. + Pricing is sourced from the `global.model_config` table using the batch + usage type (evaluations run through the OpenAI Batch API). + When called for the embedding stage only, any previously-computed response entry on eval_run.cost is preserved. 
- - Args: - eval_run: EvaluationRun whose cost field will be set - log_prefix: Caller-provided log prefix (org/project/eval ids) - response_model: Model name for response cost (response stage only) - response_results: Parsed evaluation results (response stage only) - embedding_model: Model name for embedding cost (embedding stage only) - embedding_raw_results: Raw embedding batch results (embedding stage only) """ try: if response_model is not None and response_results is not None: - response_entry = build_response_cost_entry( - model=response_model, results=response_results + response_entry = _build_response_cost_entry( + session=session, model=response_model, results=response_results ) else: # Preserve any response entry computed during an earlier stage. @@ -89,11 +177,13 @@ def _safe_attach_cost( embedding_entry: dict[str, Any] | None = None if embedding_model is not None and embedding_raw_results is not None: - embedding_entry = build_embedding_cost_entry( - model=embedding_model, raw_results=embedding_raw_results + embedding_entry = _build_embedding_cost_entry( + session=session, + model=embedding_model, + raw_results=embedding_raw_results, ) - eval_run.cost = build_cost_dict( + eval_run.cost = _build_cost_dict( response_entry=response_entry, embedding_entry=embedding_entry, ) @@ -392,6 +482,7 @@ async def process_completed_evaluation( # Aggregate response generation cost _safe_attach_cost( + session=session, eval_run=eval_run, log_prefix=log_prefix, response_model=model, @@ -557,6 +648,7 @@ async def process_completed_embedding_batch( # Step 7: Accumulate embedding cost onto existing response cost _safe_attach_cost( + session=session, eval_run=eval_run, log_prefix=log_prefix, embedding_model=EMBEDDING_MODEL, diff --git a/backend/app/crud/model_config.py b/backend/app/crud/model_config.py index fed1f71c7..16099e7e0 100644 --- a/backend/app/crud/model_config.py +++ b/backend/app/crud/model_config.py @@ -52,6 +52,21 @@ def get_model_config( return session.exec(statement).first() +def is_reasoning_model( + session: Session, provider: Literal["openai", "google"], model_name: str +) -> bool: + """Return True if the model is configured with a reasoning `effort` control. + + A model is considered reasoning-capable if its `config` JSON contains an + `effort` key; models that instead expose a `temperature` key are treated + as standard chat models. 
+ """ + model = get_model_config(session=session, provider=provider, model_name=model_name) + if model is None or not isinstance(model.config, dict): + return False + return "effort" in model.config + + def estimate_model_cost( session: Session, provider: Literal["openai", "google"], diff --git a/backend/app/models/llm/constants.py b/backend/app/models/llm/constants.py index 02b1823ed..399748843 100644 --- a/backend/app/models/llm/constants.py +++ b/backend/app/models/llm/constants.py @@ -37,9 +37,6 @@ ], } -# OpenAI models that support reasoning (effort parameter) -OPENAI_REASONING_MODELS: set[str] = {"o1", "o1-preview", "o1-mini"} - SUPPORTED_VOICES = { ("google", "tts"): ["Kore", "Orus", "Leda", "Charon"], ("sarvamai", "tts"): ["simran", "shubh", "roopa"], diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 0ffbfd95c..484141376 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -1,6 +1,5 @@ import logging from contextlib import contextmanager -from typing import Any from uuid import UUID from asgi_correlation_id import correlation_id @@ -369,7 +368,7 @@ def execute_llm_call( if isinstance(completion_config, KaapiCompletionConfig): completion_config, warnings = transform_kaapi_config_to_native( - completion_config + session=session, kaapi_config=completion_config ) if request_metadata is None: request_metadata = {} diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py index 19cbd7d04..3bd049f05 100644 --- a/backend/app/services/llm/mappers.py +++ b/backend/app/services/llm/mappers.py @@ -1,12 +1,14 @@ import logging +from sqlmodel import Session + +from app.crud.model_config import is_reasoning_model from app.models.llm import KaapiCompletionConfig, NativeCompletionConfig from app.models.llm.constants import ( BCP47_LOCALE_TO_GEMINI_LANG, BCP47_TO_ELEVENLABS_LANG, - ELEVENLABS_VOICE_TO_ID, DEFAULT_TTS_VOICE, - OPENAI_REASONING_MODELS, + ELEVENLABS_VOICE_TO_ID, ) logger = logging.getLogger(__name__) @@ -35,13 +37,16 @@ def bcp47_to_elevenlabs_lang(bcp47_code: str) -> str | None: return BCP47_TO_ELEVENLABS_LANG.get(bcp47_code) -def map_kaapi_to_openai_params(kaapi_params: dict) -> tuple[dict, list[str]]: +def map_kaapi_to_openai_params( + session: Session, kaapi_params: dict +) -> tuple[dict, list[str]]: """Map Kaapi-abstracted parameters to OpenAI API parameters. This mapper transforms standardized Kaapi parameters into OpenAI-specific parameter format, enabling provider-agnostic interface design. Args: + session: Database session used to look up the model's config kaapi_params: Dictionary with standardized Kaapi parameters Supported Mapping: @@ -67,7 +72,9 @@ def map_kaapi_to_openai_params(kaapi_params: dict) -> tuple[dict, list[str]]: knowledge_base_ids = kaapi_params.get("knowledge_base_ids") max_num_results = kaapi_params.get("max_num_results") - support_reasoning = model in OPENAI_REASONING_MODELS + support_reasoning = bool(model) and is_reasoning_model( + session=session, provider="openai", model_name=model + ) # Handle reasoning vs temperature mutual exclusivity if support_reasoning: @@ -422,6 +429,7 @@ def map_kaapi_to_elevenlabs_params( def transform_kaapi_config_to_native( + session: Session, kaapi_config: KaapiCompletionConfig, ) -> tuple[NativeCompletionConfig, list[str]]: """Transform Kaapi completion config to native provider config with mapped parameters. @@ -429,6 +437,7 @@ def transform_kaapi_config_to_native( Supports OpenAI,Google AI and Sarvam AI providers. 
Args: + session: Database session used to look up model-specific config (e.g. reasoning support) kaapi_config: KaapiCompletionConfig with abstracted parameters Returns: @@ -438,7 +447,9 @@ def transform_kaapi_config_to_native( """ # TODO change from magic string to enums if kaapi_config.provider == "openai": - mapped_params, warnings = map_kaapi_to_openai_params(kaapi_config.params) + mapped_params, warnings = map_kaapi_to_openai_params( + session=session, kaapi_params=kaapi_config.params + ) return ( NativeCompletionConfig( provider="openai-native", params=mapped_params, type=kaapi_config.type diff --git a/backend/app/tests/crud/evaluations/test_pricing.py b/backend/app/tests/crud/evaluations/test_pricing.py deleted file mode 100644 index b938cb7dc..000000000 --- a/backend/app/tests/crud/evaluations/test_pricing.py +++ /dev/null @@ -1,287 +0,0 @@ -import pytest - -from app.crud.evaluations.pricing import ( - COST_USD_DECIMALS, - MODEL_PRICING, - build_cost_dict, - build_embedding_cost_entry, - build_response_cost_entry, - calculate_token_cost, -) - - -class TestCalculateTokenCost: - """Tests for calculate_token_cost function.""" - - def test_known_chat_model_input_and_output(self) -> None: - """Cost is sum of input and output token costs for a known chat model.""" - pricing = MODEL_PRICING["gpt-4o"] - expected = ( - 1000 * pricing["input_cost_per_token"] - + 500 * pricing["output_cost_per_token"] - ) - - cost = calculate_token_cost( - model="gpt-4o", input_tokens=1000, output_tokens=500 - ) - - assert cost == pytest.approx(expected) - - def test_known_embedding_model_defaults_output_tokens_to_zero(self) -> None: - """Embedding models charge only for input tokens; output_tokens defaults to 0.""" - pricing = MODEL_PRICING["text-embedding-3-large"] - expected = 2000 * pricing["input_cost_per_token"] - - cost = calculate_token_cost(model="text-embedding-3-large", input_tokens=2000) - - assert cost == pytest.approx(expected) - - def test_unknown_model_returns_zero(self) -> None: - """Unknown models return 0.0 instead of raising.""" - cost = calculate_token_cost( - model="not-a-real-model", input_tokens=100, output_tokens=50 - ) - - assert cost == 0.0 - - def test_zero_tokens_returns_zero(self) -> None: - """Zero tokens for a known model returns zero cost.""" - cost = calculate_token_cost(model="gpt-4o", input_tokens=0, output_tokens=0) - - assert cost == 0.0 - - def test_embedding_model_with_explicit_output_tokens(self) -> None: - """Passing output_tokens to an embedding model adds 0 cost (no output rate).""" - pricing = MODEL_PRICING["text-embedding-3-large"] - expected = 100 * pricing["input_cost_per_token"] - - cost = calculate_token_cost( - model="text-embedding-3-large", input_tokens=100, output_tokens=999 - ) - - assert cost == pytest.approx(expected) - - -class TestBuildResponseCostEntry: - """Tests for build_response_cost_entry function.""" - - def test_basic_aggregation(self) -> None: - """Sums input/output/total tokens across results and computes USD cost.""" - results = [ - { - "usage": { - "input_tokens": 100, - "output_tokens": 50, - "total_tokens": 150, - } - }, - { - "usage": { - "input_tokens": 200, - "output_tokens": 75, - "total_tokens": 275, - } - }, - ] - - entry = build_response_cost_entry(model="gpt-4o", results=results) - - assert entry["model"] == "gpt-4o" - assert entry["input_tokens"] == 300 - assert entry["output_tokens"] == 125 - assert entry["total_tokens"] == 425 - pricing = MODEL_PRICING["gpt-4o"] - expected_cost = round( - 300 * pricing["input_cost_per_token"] - 
+ 125 * pricing["output_cost_per_token"], - COST_USD_DECIMALS, - ) - assert entry["cost_usd"] == expected_cost - - def test_empty_results(self) -> None: - """Empty results yields zero tokens and zero cost.""" - entry = build_response_cost_entry(model="gpt-4o", results=[]) - - assert entry["input_tokens"] == 0 - assert entry["output_tokens"] == 0 - assert entry["total_tokens"] == 0 - assert entry["cost_usd"] == 0.0 - - def test_results_missing_usage_are_skipped(self) -> None: - """Items without a usage dict are skipped without raising.""" - results = [ - {"usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}}, - {}, # No usage key - {"usage": None}, # Explicit None - ] - - entry = build_response_cost_entry(model="gpt-4o", results=results) - - assert entry["input_tokens"] == 10 - assert entry["output_tokens"] == 5 - assert entry["total_tokens"] == 15 - - def test_unknown_model_yields_zero_cost(self) -> None: - """Unknown model still aggregates token counts but reports zero cost.""" - results = [ - {"usage": {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}} - ] - - entry = build_response_cost_entry(model="mystery-model", results=results) - - assert entry["input_tokens"] == 100 - assert entry["output_tokens"] == 50 - assert entry["cost_usd"] == 0.0 - - -class TestBuildEmbeddingCostEntry: - """Tests for build_embedding_cost_entry function.""" - - def test_basic_aggregation(self) -> None: - """Sums prompt/total tokens from raw batch results and computes USD cost.""" - raw_results = [ - { - "response": { - "body": {"usage": {"prompt_tokens": 100, "total_tokens": 100}} - } - }, - { - "response": { - "body": {"usage": {"prompt_tokens": 250, "total_tokens": 250}} - } - }, - ] - - entry = build_embedding_cost_entry( - model="text-embedding-3-large", raw_results=raw_results - ) - - assert entry["model"] == "text-embedding-3-large" - assert entry["prompt_tokens"] == 350 - assert entry["total_tokens"] == 350 - pricing = MODEL_PRICING["text-embedding-3-large"] - expected_cost = round(350 * pricing["input_cost_per_token"], COST_USD_DECIMALS) - assert entry["cost_usd"] == expected_cost - - def test_empty_raw_results(self) -> None: - """Empty raw_results yields zero tokens and zero cost.""" - entry = build_embedding_cost_entry( - model="text-embedding-3-large", raw_results=[] - ) - - assert entry["prompt_tokens"] == 0 - assert entry["total_tokens"] == 0 - assert entry["cost_usd"] == 0.0 - - def test_results_missing_usage_are_skipped(self) -> None: - """Items without nested usage are skipped (e.g., error rows).""" - raw_results = [ - { - "response": { - "body": {"usage": {"prompt_tokens": 50, "total_tokens": 50}} - } - }, - {"error": {"message": "Rate limited"}}, # No response.body.usage - {"response": {"body": {}}}, # body present, usage missing - ] - - entry = build_embedding_cost_entry( - model="text-embedding-3-large", raw_results=raw_results - ) - - assert entry["prompt_tokens"] == 50 - assert entry["total_tokens"] == 50 - - def test_unknown_model_yields_zero_cost(self) -> None: - """Unknown embedding model still aggregates tokens but reports zero cost.""" - raw_results = [ - { - "response": { - "body": {"usage": {"prompt_tokens": 100, "total_tokens": 100}} - } - } - ] - - entry = build_embedding_cost_entry( - model="mystery-embed", raw_results=raw_results - ) - - assert entry["prompt_tokens"] == 100 - assert entry["cost_usd"] == 0.0 - - -class TestBuildCostDict: - """Tests for build_cost_dict function.""" - - def test_response_only(self) -> None: - """Only response entry 
→ embedding key absent, total = response cost.""" - response_entry = { - "model": "gpt-4o", - "input_tokens": 100, - "output_tokens": 50, - "total_tokens": 150, - "cost_usd": 0.001234, - } - - cost = build_cost_dict(response_entry=response_entry) - - assert cost["response"] == response_entry - assert "embedding" not in cost - assert cost["total_cost_usd"] == 0.001234 - - def test_embedding_only(self) -> None: - """Only embedding entry → response key absent, total = embedding cost.""" - embedding_entry = { - "model": "text-embedding-3-large", - "prompt_tokens": 200, - "total_tokens": 200, - "cost_usd": 0.000013, - } - - cost = build_cost_dict(embedding_entry=embedding_entry) - - assert cost["embedding"] == embedding_entry - assert "response" not in cost - assert cost["total_cost_usd"] == 0.000013 - - def test_both_entries(self) -> None: - """Both entries → both keys present, total = sum of both costs.""" - response_entry = {"cost_usd": 0.001234} - embedding_entry = {"cost_usd": 0.000013} - - cost = build_cost_dict( - response_entry=response_entry, embedding_entry=embedding_entry - ) - - assert cost["response"] == response_entry - assert cost["embedding"] == embedding_entry - assert cost["total_cost_usd"] == round(0.001234 + 0.000013, COST_USD_DECIMALS) - - def test_neither_entry(self) -> None: - """No entries → only total_cost_usd present, equal to 0.0.""" - cost = build_cost_dict() - - assert cost == {"total_cost_usd": 0.0} - - def test_total_is_rounded(self) -> None: - """total_cost_usd is rounded to COST_USD_DECIMALS.""" - response_entry = {"cost_usd": 0.0000001} - embedding_entry = {"cost_usd": 0.0000002} - - cost = build_cost_dict( - response_entry=response_entry, embedding_entry=embedding_entry - ) - - # 0.0000003 rounded to 6 decimals → 0.0 - assert cost["total_cost_usd"] == 0.0 - - def test_entry_missing_cost_usd_treated_as_zero(self) -> None: - """Entries without a cost_usd key default to 0 in the total.""" - response_entry = {"model": "gpt-4o"} # No cost_usd - embedding_entry = {"cost_usd": 0.000050} - - cost = build_cost_dict( - response_entry=response_entry, embedding_entry=embedding_entry - ) - - assert cost["total_cost_usd"] == 0.000050 diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index 51e2321db..0f0a65bbd 100644 --- a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -1,23 +1,22 @@ -from typing import Any import json from unittest.mock import MagicMock, patch import pytest from sqlmodel import Session, select +from app.core.util import now +from app.crud.evaluations.core import create_evaluation_run from app.crud.evaluations.processing import ( _extract_batch_error_message, check_and_process_evaluation, parse_evaluation_output, + poll_all_pending_evaluations, process_completed_embedding_batch, process_completed_evaluation, - poll_all_pending_evaluations, ) -from app.models import BatchJob, Organization, Project, EvaluationDataset, EvaluationRun +from app.models import BatchJob, EvaluationDataset, EvaluationRun, Organization, Project from app.models.batch_job import BatchJobType -from app.tests.utils.test_data import create_test_evaluation_dataset, create_test_config -from app.crud.evaluations.core import create_evaluation_run -from app.core.util import now +from app.tests.utils.test_data import create_test_config, create_test_evaluation_dataset class TestParseEvaluationOutput: @@ -581,7 +580,7 @@ async def 
test_process_completed_embedding_batch_success( db.commit() db.refresh(eval_run_with_embedding_batch) - # Raw results carry the usage payload that build_embedding_cost_entry reads. + # Raw results carry the usage payload that _build_embedding_cost_entry reads. mock_download.return_value = [ { "custom_id": "trace_123", diff --git a/backend/app/tests/services/llm/test_jobs.py b/backend/app/tests/services/llm/test_jobs.py index 4a7f7e265..c1b6acf25 100644 --- a/backend/app/tests/services/llm/test_jobs.py +++ b/backend/app/tests/services/llm/test_jobs.py @@ -659,7 +659,7 @@ def test_kaapi_config_warnings_passed_through_metadata( provider="openai", type="text", params={ - "model": "o1", # Reasoning model + "model": "gpt-5", # Reasoning model "temperature": 0.7, # This will be suppressed with warning }, ) diff --git a/backend/app/tests/services/llm/test_mappers.py b/backend/app/tests/services/llm/test_mappers.py index 22e488b37..7f1a7f036 100644 --- a/backend/app/tests/services/llm/test_mappers.py +++ b/backend/app/tests/services/llm/test_mappers.py @@ -5,77 +5,72 @@ Covers real-world scenarios, edge cases, and provider-specific requirements. """ -import pytest +from sqlmodel import Session from app.models.llm.request import ( - TextLLMParams, - STTLLMParams, - TTSLLMParams, KaapiCompletionConfig, NativeCompletionConfig, + STTLLMParams, + TextLLMParams, + TTSLLMParams, ) from app.services.llm.mappers import ( - map_kaapi_to_openai_params, + bcp47_to_elevenlabs_lang, + map_kaapi_to_elevenlabs_params, map_kaapi_to_google_params, + map_kaapi_to_openai_params, map_kaapi_to_sarvam_params, - map_kaapi_to_elevenlabs_params, - bcp47_to_elevenlabs_lang, - voice_to_id, transform_kaapi_config_to_native, -) -from app.models.llm.constants import ( - DEFAULT_STT_MODEL, - DEFAULT_TTS_MODEL, - DEFAULT_TTS_VOICE, + voice_to_id, ) class TestMapKaapiToOpenAIParams: """Test cases for map_kaapi_to_openai_params function.""" - def test_basic_model_mapping(self): + def test_basic_model_mapping(self, db: Session): """Test basic model parameter mapping.""" kaapi_params = TextLLMParams(model="gpt-4o") result, warnings = map_kaapi_to_openai_params( - kaapi_params.model_dump(exclude_none=True) + session=db, kaapi_params=kaapi_params.model_dump(exclude_none=True) ) # TextLLMParams has default temperature=0.1 assert result == {"model": "gpt-4o", "temperature": 0.1} assert warnings == [] - def test_reasoning_mapping_for_reasoning_models(self): + def test_reasoning_mapping_for_reasoning_models(self, db: Session): """Test reasoning parameter mapping to OpenAI format for reasoning-capable models.""" kaapi_params = TextLLMParams( - model="o1", + model="gpt-5", reasoning="high", ) result, warnings = map_kaapi_to_openai_params( - kaapi_params.model_dump(exclude_none=True) + session=db, kaapi_params=kaapi_params.model_dump(exclude_none=True) ) - assert result["model"] == "o1" + assert result["model"] == "gpt-5" assert result["reasoning"] == {"effort": "high"} # Temperature is suppressed for reasoning models (even default value) assert "temperature" not in result assert len(warnings) == 1 assert "temperature" in warnings[0].lower() - def test_knowledge_base_ids_mapping(self): + def test_knowledge_base_ids_mapping(self, db: Session): """Test knowledge_base_ids mapping to OpenAI tools format.""" kaapi_params = TextLLMParams( - model="gpt-4", + model="gpt-4o", knowledge_base_ids=["vs_abc123", "vs_def456"], max_num_results=50, ) result, warnings = map_kaapi_to_openai_params( - kaapi_params.model_dump(exclude_none=True) + session=db, 
kaapi_params=kaapi_params.model_dump(exclude_none=True) ) - assert result["model"] == "gpt-4" + assert result["model"] == "gpt-4o" assert "tools" in result assert len(result["tools"]) == 1 assert result["tools"][0]["type"] == "file_search" @@ -83,37 +78,37 @@ def test_knowledge_base_ids_mapping(self): assert result["tools"][0]["max_num_results"] == 50 assert warnings == [] - def test_temperature_suppressed_for_reasoning_models(self): + def test_temperature_suppressed_for_reasoning_models(self, db: Session): """Test that temperature is suppressed with warning for reasoning models when reasoning is set.""" kaapi_params = TextLLMParams( - model="o1", + model="gpt-5", temperature=0.7, reasoning="high", ) result, warnings = map_kaapi_to_openai_params( - kaapi_params.model_dump(exclude_none=True) + session=db, kaapi_params=kaapi_params.model_dump(exclude_none=True) ) - assert result["model"] == "o1" + assert result["model"] == "gpt-5" assert result["reasoning"] == {"effort": "high"} assert "temperature" not in result assert len(warnings) == 1 assert "temperature" in warnings[0].lower() assert "suppressed" in warnings[0] - def test_reasoning_suppressed_for_non_reasoning_models(self): + def test_reasoning_suppressed_for_non_reasoning_models(self, db: Session): """Test that reasoning is suppressed with warning for non-reasoning models.""" kaapi_params = TextLLMParams( - model="gpt-4", + model="gpt-4o", reasoning="high", ) result, warnings = map_kaapi_to_openai_params( - kaapi_params.model_dump(exclude_none=True) + session=db, kaapi_params=kaapi_params.model_dump(exclude_none=True) ) - assert result["model"] == "gpt-4" + assert result["model"] == "gpt-4o" assert "reasoning" not in result assert len(warnings) == 1 assert "reasoning" in warnings[0].lower() @@ -826,7 +821,7 @@ def test_unsupported_voice_returns_none(self): class TestTransformKaapiConfigToNative: """Test end-to-end transformation with completion_type parameter.""" - def test_transform_elevenlabs_tts_config(self): + def test_transform_elevenlabs_tts_config(self, db: Session): """Test transformation of ElevenLabs TTS config.""" kaapi_config = KaapiCompletionConfig( provider="elevenlabs", @@ -839,7 +834,9 @@ def test_transform_elevenlabs_tts_config(self): }, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "elevenlabs-native" @@ -850,7 +847,7 @@ def test_transform_elevenlabs_tts_config(self): assert result.params["output_format"] == "mp3_44100_128" assert warnings == [] - def test_transform_elevenlabs_stt_config(self): + def test_transform_elevenlabs_stt_config(self, db: Session): """Test transformation of ElevenLabs STT config.""" kaapi_config = KaapiCompletionConfig( provider="elevenlabs", @@ -862,7 +859,9 @@ def test_transform_elevenlabs_stt_config(self): }, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "elevenlabs-native" @@ -872,7 +871,7 @@ def test_transform_elevenlabs_stt_config(self): assert result.params["temperature"] == 0.3 assert warnings == [] - def test_transform_sarvamai_stt_with_saaras_model(self): + def test_transform_sarvamai_stt_with_saaras_model(self, db: Session): """Test transformation of SarvamAI STT with saaras:v3 model.""" 
kaapi_config = KaapiCompletionConfig( provider="sarvamai", @@ -884,7 +883,9 @@ def test_transform_sarvamai_stt_with_saaras_model(self): }, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "sarvamai-native" @@ -898,7 +899,7 @@ def test_transform_sarvamai_stt_with_saaras_model(self): # Removed test_transform_sarvamai_stt_with_saarika_model - model no longer in SUPPORTED_MODELS # The mapper logic for saarika (no mode parameter) is already tested in unit tests - def test_transform_sarvamai_tts_with_voice(self): + def test_transform_sarvamai_tts_with_voice(self, db: Session): """Test transformation of SarvamAI TTS with explicit voice.""" kaapi_config = KaapiCompletionConfig( provider="sarvamai", @@ -910,7 +911,9 @@ def test_transform_sarvamai_tts_with_voice(self): }, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "sarvamai-native" @@ -920,7 +923,7 @@ def test_transform_sarvamai_tts_with_voice(self): assert result.params["speaker"] == "simran" assert warnings == [] - def test_transform_google_text_completion(self): + def test_transform_google_text_completion(self, db: Session): """Test transformation of Google text completion.""" kaapi_config = KaapiCompletionConfig( provider="google", @@ -932,7 +935,9 @@ def test_transform_google_text_completion(self): }, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "google-native" @@ -942,7 +947,7 @@ def test_transform_google_text_completion(self): assert result.params["reasoning"] == "high" assert warnings == [] - def test_transform_google_stt_completion(self): + def test_transform_google_stt_completion(self, db: Session): """Test transformation of Google STT completion.""" kaapi_config = KaapiCompletionConfig( provider="google", @@ -950,7 +955,9 @@ def test_transform_google_stt_completion(self): params={"model": "gemini-2.5-pro", "instructions": "Transcribe accurately"}, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "google-native" @@ -959,7 +966,7 @@ def test_transform_google_stt_completion(self): assert result.params["instructions"] == "Transcribe accurately" assert warnings == [] - def test_transform_google_tts_completion(self): + def test_transform_google_tts_completion(self, db: Session): """Test transformation of Google TTS completion.""" kaapi_config = KaapiCompletionConfig( provider="google", @@ -971,7 +978,9 @@ def test_transform_google_tts_completion(self): }, ) - result, warnings = transform_kaapi_config_to_native(kaapi_config) + result, warnings = transform_kaapi_config_to_native( + session=db, kaapi_config=kaapi_config + ) assert isinstance(result, NativeCompletionConfig) assert result.provider == "google-native" From be8f60e6a8ba5acde73b531cd0cc3ec9265de87a Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 16 Apr 2026 12:32:24 +0530 Subject: [PATCH 5/9] cleanup 
--- ...valuation_run.py => 053_add_cost_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{050_add_cost_to_evaluation_run.py => 053_add_cost_to_evaluation_run.py} (89%) diff --git a/backend/app/alembic/versions/050_add_cost_to_evaluation_run.py b/backend/app/alembic/versions/053_add_cost_to_evaluation_run.py similarity index 89% rename from backend/app/alembic/versions/050_add_cost_to_evaluation_run.py rename to backend/app/alembic/versions/053_add_cost_to_evaluation_run.py index 6d63de3e8..63110668d 100644 --- a/backend/app/alembic/versions/050_add_cost_to_evaluation_run.py +++ b/backend/app/alembic/versions/053_add_cost_to_evaluation_run.py @@ -1,7 +1,7 @@ """add cost tracking to evaluation_run -Revision ID: 050 -Revises: 049 +Revision ID: 053 +Revises: 052 Create Date: 2026-04-09 12:00:00.000000 """ @@ -11,8 +11,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "050" -down_revision = "049" +revision = "053" +down_revision = "052" branch_labels = None depends_on = None From 85e249ad76202a8c3db4040fb4753b25ccdafb90 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 16 Apr 2026 13:36:15 +0530 Subject: [PATCH 6/9] few fixes from suggestions --- backend/app/crud/evaluations/core.py | 70 ++---- backend/app/crud/evaluations/cost.py | 174 +++++++++++++ backend/app/crud/evaluations/processing.py | 228 +++++------------- backend/app/crud/model_config.py | 3 +- backend/app/models/__init__.py | 110 ++++----- backend/app/models/evaluation.py | 16 ++ .../tests/crud/evaluations/test_processing.py | 5 +- .../crud/evaluations/test_score_storage.py | 14 +- 8 files changed, 320 insertions(+), 300 deletions(-) create mode 100644 backend/app/crud/evaluations/cost.py diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index e52d77cdc..e5577322a 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -5,19 +5,18 @@ from langfuse import Langfuse from sqlmodel import Session, select +from app.core.cloud.storage import get_cloud_storage +from app.core.db import engine +from app.core.storage_utils import upload_jsonl_to_object_store from app.core.util import now from app.crud.config.version import ConfigVersionCrud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.crud.evaluations.score import EvaluationScore -from app.models import EvaluationRun +from app.models import EvaluationRun, EvaluationRunUpdate from app.models.llm.request import ConfigBlob, LLMCallConfig from app.models.stt_evaluation import EvaluationType from app.services.llm.jobs import resolve_config_blob -from app.core.db import engine -from app.core.cloud.storage import get_cloud_storage -from app.core.storage_utils import upload_jsonl_to_object_store - logger = logging.getLogger(__name__) @@ -192,51 +191,18 @@ def get_evaluation_run_by_id( def update_evaluation_run( session: Session, eval_run: EvaluationRun, - status: str | None = None, - error_message: str | None = None, - object_store_url: str | None = None, - score_trace_url: str | None = None, - score: dict | None = None, - cost: dict | None = None, - embedding_batch_job_id: int | None = None, + update: EvaluationRunUpdate, ) -> EvaluationRun: """ - Update an evaluation run with new values and persist to database. - - This helper function ensures consistency when updating evaluation runs - by always updating the timestamp and properly committing changes. 
+ Apply a partial update to an evaluation run and persist it. - Args: - session: Database session - eval_run: EvaluationRun instance to update - status: New status value (optional) - error_message: New error message (optional) - object_store_url: New object store URL (optional) - score_trace_url: New per-trace score S3 URL (optional) - score: New score dict (optional) - cost: New cost dict (optional) - embedding_batch_job_id: New embedding batch job ID (optional) - - Returns: - Updated and refreshed EvaluationRun instance + Only fields explicitly set on `update` are applied (`exclude_unset=True` + semantics), so callers don't accidentally clear unrelated columns. + `updated_at` is always bumped. """ - # Update provided fields - if status is not None: - eval_run.status = status - if error_message is not None: - eval_run.error_message = error_message - if object_store_url is not None: - eval_run.object_store_url = object_store_url - if score is not None: - eval_run.score = score - if cost is not None: - eval_run.cost = cost - if embedding_batch_job_id is not None: - eval_run.embedding_batch_job_id = embedding_batch_job_id - if score_trace_url is not None: - eval_run.score_trace_url = score_trace_url or None - - # Always update timestamp + for key, value in update.model_dump(exclude_unset=True).items(): + setattr(eval_run, key, value) + eval_run.updated_at = now() # Persist to database @@ -319,7 +285,11 @@ def get_or_fetch_score( } # Update score column using existing helper - update_evaluation_run(session=session, eval_run=eval_run, score=score) + update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(score=score), + ) total_traces = len(score.get("traces", [])) logger.info( @@ -405,8 +375,10 @@ def save_score( update_evaluation_run( session=session, eval_run=eval_run, - score=db_score, - score_trace_url=score_trace_url, + update=EvaluationRunUpdate( + score=db_score, + score_trace_url=score_trace_url or None, + ), ) logger.info( diff --git a/backend/app/crud/evaluations/cost.py b/backend/app/crud/evaluations/cost.py new file mode 100644 index 000000000..76f2407f5 --- /dev/null +++ b/backend/app/crud/evaluations/cost.py @@ -0,0 +1,174 @@ +""" +Cost tracking for evaluation runs. + +Token usage is aggregated per stage (response generation, embedding) and +priced against `global.model_config` using OpenAI Batch rates. Failures +here must never block evaluation completion — `attach_cost` swallows +exceptions and logs a warning. + +Persisted shape on `eval_run.cost`: + + { + "response": {model, input_tokens, output_tokens, total_tokens, cost_usd}, + "embedding": {model, input_tokens, output_tokens, total_tokens, cost_usd}, + "total_cost_usd": float, + } + +Either stage entry is optional. Embedding entries use output_tokens=0. +""" + +import logging +from collections.abc import Callable, Iterable +from typing import Any + +from sqlmodel import Session + +from app.crud.model_config import estimate_model_cost +from app.models import EvaluationRun + +logger = logging.getLogger(__name__) + +# USD rounding precision for persisted cost values. 
+COST_USD_DECIMALS = 6 + + +def _cost_usd(estimate: dict[str, Any] | None) -> float: + """Sum the per-direction costs from an estimate and round to our USD precision.""" + if not estimate: + return 0.0 + total = float(estimate.get("input_cost", 0.0)) + float( + estimate.get("output_cost", 0.0) + ) + return round(total, COST_USD_DECIMALS) + + +def _sum_tokens( + items: Iterable[dict[str, Any]], + usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None], + input_key: str, +) -> dict[str, int]: + """Sum (input, output, total) tokens across items using a per-item usage extractor. + + The OpenAI Embeddings API reports input tokens as ``prompt_tokens`` and has + no output tokens; chat/responses APIs use ``input_tokens`` and ``output_tokens``. + Missing keys default to 0, so the embedding case naturally produces + output_tokens=0. + """ + totals = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + for item in items: + usage = usage_extractor(item) + if not usage: + continue + totals["input_tokens"] += usage.get(input_key, 0) + totals["output_tokens"] += usage.get("output_tokens", 0) + totals["total_tokens"] += usage.get("total_tokens", 0) + return totals + + +def _build_cost_entry( + session: Session, + model: str, + totals: dict[str, int], +) -> dict[str, Any]: + """Price aggregated token usage against the model's batch pricing row.""" + estimate = estimate_model_cost( + session=session, + provider="openai", + model_name=model, + input_tokens=totals["input_tokens"], + output_tokens=totals["output_tokens"], + usage_type="batch", + ) + return { + "model": model, + "input_tokens": totals["input_tokens"], + "output_tokens": totals["output_tokens"], + "total_tokens": totals["total_tokens"], + "cost_usd": _cost_usd(estimate), + } + + +def build_response_cost_entry( + session: Session, model: str, results: list[dict[str, Any]] +) -> dict[str, Any]: + """Build a response-stage cost entry from parsed evaluation results.""" + totals = _sum_tokens( + items=results, + usage_extractor=lambda r: r.get("usage"), + input_key="input_tokens", + ) + return _build_cost_entry(session=session, model=model, totals=totals) + + +def build_embedding_cost_entry( + session: Session, model: str, raw_results: list[dict[str, Any]] +) -> dict[str, Any]: + """Build an embedding-stage cost entry from raw embedding batch output.""" + totals = _sum_tokens( + items=raw_results, + usage_extractor=lambda r: r.get("response", {}).get("body", {}).get("usage"), + input_key="prompt_tokens", + ) + return _build_cost_entry(session=session, model=model, totals=totals) + + +def build_cost_dict( + response_entry: dict[str, Any] | None, + embedding_entry: dict[str, Any] | None, +) -> dict[str, Any]: + """Combine per-stage entries into the `eval_run.cost` payload with a grand total.""" + cost: dict[str, Any] = {} + total = 0.0 + + if response_entry: + cost["response"] = response_entry + total += response_entry.get("cost_usd", 0.0) + + if embedding_entry: + cost["embedding"] = embedding_entry + total += embedding_entry.get("cost_usd", 0.0) + + cost["total_cost_usd"] = round(total, COST_USD_DECIMALS) + return cost + + +def attach_cost( + session: Session, + eval_run: EvaluationRun, + log_prefix: str, + *, + response_model: str | None = None, + response_results: list[dict[str, Any]] | None = None, + embedding_model: str | None = None, + embedding_raw_results: list[dict[str, Any]] | None = None, +) -> None: + """Compute cost for the given stage(s) and attach to `eval_run.cost`, never raising. 
+ + Caller is responsible for persisting `eval_run` afterwards. When only the + embedding stage is provided, a previously-computed response entry on + `eval_run.cost` is preserved. + """ + try: + if response_model is not None and response_results is not None: + response_entry = build_response_cost_entry( + session=session, model=response_model, results=response_results + ) + else: + response_entry = (eval_run.cost or {}).get("response") + + embedding_entry: dict[str, Any] | None = None + if embedding_model is not None and embedding_raw_results is not None: + embedding_entry = build_embedding_cost_entry( + session=session, + model=embedding_model, + raw_results=embedding_raw_results, + ) + + eval_run.cost = build_cost_dict( + response_entry=response_entry, + embedding_entry=embedding_entry, + ) + except Exception as cost_err: + logger.warning( + f"[attach_cost] {log_prefix} Failed to compute cost | {cost_err}" + ) diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 16d5b1152..472390801 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -28,6 +28,7 @@ from app.core.batch.base import BATCH_KEY from app.crud.evaluations.batch import fetch_dataset_items from app.crud.evaluations.core import resolve_model_from_config, update_evaluation_run +from app.crud.evaluations.cost import attach_cost from app.crud.evaluations.embeddings import ( EMBEDDING_MODEL, calculate_average_similarity, @@ -39,159 +40,12 @@ update_traces_with_cosine_scores, ) from app.crud.job import get_batch_job, update_batch_job -from app.crud.model_config import estimate_model_cost -from app.models import EvaluationRun +from app.models import EvaluationRun, EvaluationRunUpdate from app.models.batch_job import BatchJob, BatchJobUpdate from app.utils import get_langfuse_client, get_openai_client logger = logging.getLogger(__name__) -# Number of decimals to round USD cost values to. -COST_USD_DECIMALS = 6 - - -def _cost_usd_from_estimate(estimate: dict[str, Any] | None) -> float: - """Sum the unrounded per-direction costs and round to our USD precision. - - `estimate_model_cost` returns `total_cost` already rounded to 4 decimals, - which drops sub-cent precision we want to retain here. 
- """ - if not estimate: - return 0.0 - total = float(estimate.get("input_cost", 0.0)) + float( - estimate.get("output_cost", 0.0) - ) - return round(total, COST_USD_DECIMALS) - - -def _build_response_cost_entry( - session: Session, model: str, results: list[dict[str, Any]] -) -> dict[str, Any]: - """Aggregate token usage from parsed results and compute batch-pricing cost.""" - totals = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} - for item in results: - usage = item.get("usage") - if not usage: - continue - for field in totals: - totals[field] += usage.get(field, 0) - - estimate = estimate_model_cost( - session=session, - provider="openai", - model_name=model, - input_tokens=totals["input_tokens"], - output_tokens=totals["output_tokens"], - usage_type="batch", - ) - - return { - "model": model, - "input_tokens": totals["input_tokens"], - "output_tokens": totals["output_tokens"], - "total_tokens": totals["total_tokens"], - "cost_usd": _cost_usd_from_estimate(estimate), - } - - -def _build_embedding_cost_entry( - session: Session, model: str, raw_results: list[dict[str, Any]] -) -> dict[str, Any]: - """Aggregate token usage from raw embedding results and compute batch-pricing cost.""" - totals = {"prompt_tokens": 0, "total_tokens": 0} - for item in raw_results: - usage = item.get("response", {}).get("body", {}).get("usage") - if not usage: - continue - for field in totals: - totals[field] += usage.get(field, 0) - - estimate = estimate_model_cost( - session=session, - provider="openai", - model_name=model, - input_tokens=totals["prompt_tokens"], - output_tokens=0, - usage_type="batch", - ) - - return { - "model": model, - "prompt_tokens": totals["prompt_tokens"], - "total_tokens": totals["total_tokens"], - "cost_usd": _cost_usd_from_estimate(estimate), - } - - -def _build_cost_dict( - response_entry: dict[str, Any] | None, - embedding_entry: dict[str, Any] | None, -) -> dict[str, Any]: - cost: dict[str, Any] = {} - response_cost = 0.0 - embedding_cost = 0.0 - - if response_entry: - cost["response"] = response_entry - response_cost = response_entry.get("cost_usd", 0.0) - - if embedding_entry: - cost["embedding"] = embedding_entry - embedding_cost = embedding_entry.get("cost_usd", 0.0) - - cost["total_cost_usd"] = round(response_cost + embedding_cost, COST_USD_DECIMALS) - return cost - - -def _safe_attach_cost( - session: Session, - eval_run: EvaluationRun, - log_prefix: str, - *, - response_model: str | None = None, - response_results: list[dict[str, Any]] | None = None, - embedding_model: str | None = None, - embedding_raw_results: list[dict[str, Any]] | None = None, -) -> None: - """ - Compute and attach a cost dict to eval_run.cost without raising. - - Cost-tracking failures must never block evaluation completion, so any - exception is logged and swallowed. The caller is responsible for - persisting eval_run via update_evaluation_run. - - Pricing is sourced from the `global.model_config` table using the batch - usage type (evaluations run through the OpenAI Batch API). - - When called for the embedding stage only, any previously-computed - response entry on eval_run.cost is preserved. - """ - try: - if response_model is not None and response_results is not None: - response_entry = _build_response_cost_entry( - session=session, model=response_model, results=response_results - ) - else: - # Preserve any response entry computed during an earlier stage. 
- response_entry = (eval_run.cost or {}).get("response") - - embedding_entry: dict[str, Any] | None = None - if embedding_model is not None and embedding_raw_results is not None: - embedding_entry = _build_embedding_cost_entry( - session=session, - model=embedding_model, - raw_results=embedding_raw_results, - ) - - eval_run.cost = _build_cost_dict( - response_entry=response_entry, - embedding_entry=embedding_entry, - ) - except Exception as cost_err: - logger.warning( - f"[_safe_attach_cost] {log_prefix} Failed to compute cost | {cost_err}" - ) - def _extract_batch_error_message( provider: OpenAIBatchProvider, @@ -481,14 +335,18 @@ async def process_completed_evaluation( model = resolve_model_from_config(session=session, eval_run=eval_run) # Aggregate response generation cost - _safe_attach_cost( + attach_cost( session=session, eval_run=eval_run, log_prefix=log_prefix, response_model=model, response_results=results, ) - update_evaluation_run(session=session, eval_run=eval_run, cost=eval_run.cost) + update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(cost=eval_run.cost), + ) trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, @@ -525,8 +383,10 @@ async def process_completed_evaluation( eval_run = update_evaluation_run( session=session, eval_run=eval_run, - status="completed", - error_message=f"Embeddings failed: {str(e)}", + update=EvaluationRunUpdate( + status="completed", + error_message=f"Embeddings failed: {str(e)}", + ), ) logger.info( @@ -544,8 +404,10 @@ async def process_completed_evaluation( return update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=f"Processing failed: {str(e)}", + update=EvaluationRunUpdate( + status="failed", + error_message=f"Processing failed: {str(e)}", + ), ) @@ -647,7 +509,7 @@ async def process_completed_embedding_batch( ) # Step 7: Accumulate embedding cost onto existing response cost - _safe_attach_cost( + attach_cost( session=session, eval_run=eval_run, log_prefix=log_prefix, @@ -659,9 +521,11 @@ async def process_completed_embedding_batch( eval_run = update_evaluation_run( session=session, eval_run=eval_run, - status="completed", - score=eval_run.score, - cost=eval_run.cost, + update=EvaluationRunUpdate( + status="completed", + score=eval_run.score, + cost=eval_run.cost, + ), ) logger.info( @@ -679,8 +543,10 @@ async def process_completed_embedding_batch( return update_evaluation_run( session=session, eval_run=eval_run, - status="completed", - error_message=f"Embedding processing failed: {str(e)}", + update=EvaluationRunUpdate( + status="completed", + error_message=f"Embedding processing failed: {str(e)}", + ), ) @@ -764,8 +630,10 @@ async def check_and_process_evaluation( eval_run = update_evaluation_run( session=session, eval_run=eval_run, - status="completed", - error_message=f"Embedding batch failed: {embedding_batch_job.error_message}", + update=EvaluationRunUpdate( + status="completed", + error_message=f"Embedding batch failed: {embedding_batch_job.error_message}", + ), ) return { @@ -825,8 +693,10 @@ async def check_and_process_evaluation( eval_run = update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=error_msg, + update=EvaluationRunUpdate( + status="failed", + error_message=error_msg, + ), ) logger.error( @@ -867,8 +737,10 @@ async def check_and_process_evaluation( eval_run = update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=error_msg, + 
update=EvaluationRunUpdate( + status="failed", + error_message=error_msg, + ), ) logger.error( @@ -906,8 +778,10 @@ async def check_and_process_evaluation( update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=f"Checking failed: {str(e)}", + update=EvaluationRunUpdate( + status="failed", + error_message=f"Checking failed: {str(e)}", + ), ) return { @@ -999,8 +873,10 @@ async def poll_all_pending_evaluations(session: Session) -> dict[str, Any]: update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=http_exc.detail, + update=EvaluationRunUpdate( + status="failed", + error_message=http_exc.detail, + ), ) all_results.append( @@ -1040,8 +916,10 @@ async def poll_all_pending_evaluations(session: Session) -> dict[str, Any]: update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=f"Check failed: {str(e)}", + update=EvaluationRunUpdate( + status="failed", + error_message=f"Check failed: {str(e)}", + ), ) all_results.append( @@ -1063,8 +941,10 @@ async def poll_all_pending_evaluations(session: Session) -> dict[str, Any]: update_evaluation_run( session=session, eval_run=eval_run, - status="failed", - error_message=f"Project processing failed: {str(e)}", + update=EvaluationRunUpdate( + status="failed", + error_message=f"Project processing failed: {str(e)}", + ), ) all_results.append( diff --git a/backend/app/crud/model_config.py b/backend/app/crud/model_config.py index 16099e7e0..6d535240a 100644 --- a/backend/app/crud/model_config.py +++ b/backend/app/crud/model_config.py @@ -97,7 +97,6 @@ def estimate_model_cost( input_cost = (input_tokens / 1_000_000) * float(input_price) output_cost = (output_tokens / 1_000_000) * float(output_price) - total_cost = round(input_cost + output_cost, 4) return { "provider": provider, @@ -107,6 +106,6 @@ def estimate_model_cost( "output_tokens": output_tokens, "input_cost": input_cost, "output_cost": output_cost, - "total_cost": total_cost, + "total_cost": input_cost + output_cost, "currency": "USD", } diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index d3fc55a7f..98c4f7d24 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -1,5 +1,13 @@ from sqlmodel import SQLModel +from .api_key import ( + APIKey, + APIKeyBase, + APIKeyCreateResponse, + APIKeyPublic, + APIKeyVerifyResponse, +) +from .assistants import Assistant, AssistantBase, AssistantCreate, AssistantUpdate from .auth import ( AuthContext, GoogleAuthRequest, @@ -8,48 +16,44 @@ Token, TokenPayload, ) - -from .api_key import ( - APIKey, - APIKeyBase, - APIKeyPublic, - APIKeyCreateResponse, - APIKeyVerifyResponse, +from .batch_job import ( + BatchJob, + BatchJobCreate, + BatchJobPublic, + BatchJobType, + BatchJobUpdate, ) - -from .assistants import Assistant, AssistantBase, AssistantCreate, AssistantUpdate - from .collection import ( Collection, - CreationRequest, - CollectionPublic, CollectionIDPublic, + CollectionPublic, CollectionWithDocsPublic, + CreationRequest, DeletionRequest, ProviderType, ) from .collection_job import ( CollectionActionType, CollectionJob, - CollectionJobStatus, - CollectionJobUpdate, - CollectionJobPublic, CollectionJobCreate, CollectionJobImmediatePublic, + CollectionJobPublic, + CollectionJobStatus, + CollectionJobUpdate, ) from .config import ( Config, ConfigBase, ConfigCreate, - ConfigUpdate, ConfigPublic, - ConfigWithVersion, + ConfigUpdate, ConfigVersion, ConfigVersionBase, ConfigVersionCreate, - 
ConfigVersionUpdate, - ConfigVersionPublic, ConfigVersionItems, + ConfigVersionPublic, + ConfigVersionUpdate, + ConfigWithVersion, ) from .credentials import ( Credential, @@ -58,32 +62,22 @@ CredsPublic, CredsUpdate, ) - +from .doc_transformation_job import ( + DocTransformationJob, + DocTransformJobCreate, + DocTransformJobUpdate, + TransformationStatus, +) from .document import ( - Document, - DocumentPublic, DocTransformationJobPublic, DocTransformationJobsPublic, - TransformedDocumentPublic, + Document, + DocumentPublic, DocumentUploadResponse, TransformationJobInfo, -) -from .doc_transformation_job import ( - DocTransformationJob, - TransformationStatus, - DocTransformJobCreate, - DocTransformJobUpdate, + TransformedDocumentPublic, ) from .document_collection import DocumentCollection - -from .batch_job import ( - BatchJob, - BatchJobCreate, - BatchJobPublic, - BatchJobType, - BatchJobUpdate, -) - from .evaluation import ( EvaluationDataset, EvaluationDatasetCreate, @@ -91,50 +85,43 @@ EvaluationRun, EvaluationRunCreate, EvaluationRunPublic, + EvaluationRunUpdate, ) - -from .file import File, FilePublic, FileType, AudioUploadResponse - +from .file import AudioUploadResponse, File, FilePublic, FileType from .fine_tuning import ( - FineTuningJobBase, Fine_Tuning, + FineTuningJobBase, FineTuningJobCreate, FineTuningJobPublic, - FineTuningUpdate, FineTuningStatus, + FineTuningUpdate, ) - -from .job import Job, JobType, JobStatus, JobUpdate - +from .job import Job, JobStatus, JobType, JobUpdate from .language import ( Language, LanguageBase, LanguagePublic, LanguagesPublic, ) - from .llm import ( - ConfigBlob, CompletionConfig, + ConfigBlob, + LlmCall, LLMCallRequest, LLMCallResponse, - LlmCall, + LlmChain, LLMChainRequest, LLMChainResponse, - LlmChain, LLMJobImmediatePublic, LLMJobPublic, ) - from .message import Message - from .model_config import ( ModelConfig, ModelConfigBase, ModelConfigListPublic, ModelConfigPublic, ) - from .model_evaluation import ( ModelEvaluation, ModelEvaluationBase, @@ -143,14 +130,12 @@ ModelEvaluationStatus, ModelEvaluationUpdate, ) - - from .onboarding import OnboardingRequest, OnboardingResponse from .openai_conversation import ( - OpenAIConversationPublic, OpenAIConversation, OpenAIConversationBase, OpenAIConversationCreate, + OpenAIConversationPublic, ) from .organization import ( Organization, @@ -159,7 +144,6 @@ OrganizationsPublic, OrganizationUpdate, ) - from .project import ( Project, ProjectCreate, @@ -167,33 +151,29 @@ ProjectsPublic, ProjectUpdate, ) - from .response import ( CallbackResponse, Diagnostics, FileResultChunk, - ResponsesAPIRequest, ResponseJobStatus, + ResponsesAPIRequest, ResponsesSyncAPIRequest, ) - from .threads import OpenAI_Thread, OpenAIThreadBase, OpenAIThreadCreate - from .user import ( NewPassword, + UpdatePassword, User, UserCreate, UserPublic, UserRegister, + UsersPublic, UserUpdate, UserUpdateMe, - UsersPublic, - UpdatePassword, ) - from .user_project import ( - UserProject, AddUsersToProjectRequest, UserEntry, + UserProject, UserProjectPublic, ) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index fddc255cb..00881e3bd 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -393,6 +393,22 @@ class EvaluationRunCreate(SQLModel): ) +class EvaluationRunUpdate(SQLModel): + """Partial update payload for an evaluation run. + + Any field left unset is untouched. Used by `update_evaluation_run` with + `model_dump(exclude_unset=True)` semantics. 
+ """ + + status: str | None = None + error_message: str | None = None + object_store_url: str | None = None + score_trace_url: str | None = None + score: dict[str, Any] | None = None + cost: dict[str, Any] | None = None + embedding_batch_job_id: int | None = None + + class EvaluationRunPublic(SQLModel): """Public model for evaluation runs.""" diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index 0f0a65bbd..171572e34 100644 --- a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -580,7 +580,7 @@ async def test_process_completed_embedding_batch_success( db.commit() db.refresh(eval_run_with_embedding_batch) - # Raw results carry the usage payload that _build_embedding_cost_entry reads. + # Raw results carry the usage payload that build_embedding_cost_entry reads. mock_download.return_value = [ { "custom_id": "trace_123", @@ -635,7 +635,8 @@ async def test_process_completed_embedding_batch_success( assert result.cost["response"]["cost_usd"] == 0.000375 embedding_cost = result.cost["embedding"] assert embedding_cost["model"] == "text-embedding-3-large" - assert embedding_cost["prompt_tokens"] == 200 + assert embedding_cost["input_tokens"] == 200 + assert embedding_cost["output_tokens"] == 0 assert embedding_cost["total_tokens"] == 200 assert embedding_cost["cost_usd"] > 0 assert result.cost["total_cost_usd"] == pytest.approx( diff --git a/backend/app/tests/crud/evaluations/test_score_storage.py b/backend/app/tests/crud/evaluations/test_score_storage.py index 87a82845b..0d029b21c 100644 --- a/backend/app/tests/crud/evaluations/test_score_storage.py +++ b/backend/app/tests/crud/evaluations/test_score_storage.py @@ -49,11 +49,9 @@ def test_uploads_traces_to_s3_and_stores_summary_only( assert mock_upload.call_args.kwargs["results"] == [{"trace_id": "t1"}] # Verify DB gets summary only, not traces - call_kwargs = mock_update.call_args.kwargs - assert call_kwargs["score"] == { - "summary_scores": [{"name": "accuracy", "avg": 0.9}] - } - assert call_kwargs["score_trace_url"] == "s3://bucket/traces.json" + update = mock_update.call_args.kwargs["update"] + assert update.score == {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + assert update.score_trace_url == "s3://bucket/traces.json" @patch("app.crud.evaluations.core.update_evaluation_run") @patch("app.crud.evaluations.core.get_evaluation_run_by_id") @@ -82,9 +80,9 @@ def test_fallback_to_db_when_s3_fails( save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) # Full score stored in DB as fallback - call_kwargs = mock_update.call_args.kwargs - assert call_kwargs["score"] == score - assert call_kwargs["score_trace_url"] is None + update = mock_update.call_args.kwargs["update"] + assert update.score == score + assert update.score_trace_url is None @patch("app.crud.evaluations.core.update_evaluation_run") @patch("app.crud.evaluations.core.get_evaluation_run_by_id") From 1af4148996e42b0486b1844f727e693708b20e93 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 16 Apr 2026 13:43:52 +0530 Subject: [PATCH 7/9] cleanups --- backend/app/crud/evaluations/cost.py | 12 ++++++------ backend/app/models/evaluation.py | 1 + .../app/tests/crud/evaluations/test_processing.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/backend/app/crud/evaluations/cost.py b/backend/app/crud/evaluations/cost.py index 76f2407f5..5a24f0d3f 100644 --- a/backend/app/crud/evaluations/cost.py +++ 
b/backend/app/crud/evaluations/cost.py @@ -88,7 +88,7 @@ def _build_cost_entry( } -def build_response_cost_entry( +def _build_response_cost_entry( session: Session, model: str, results: list[dict[str, Any]] ) -> dict[str, Any]: """Build a response-stage cost entry from parsed evaluation results.""" @@ -100,7 +100,7 @@ def build_response_cost_entry( return _build_cost_entry(session=session, model=model, totals=totals) -def build_embedding_cost_entry( +def _build_embedding_cost_entry( session: Session, model: str, raw_results: list[dict[str, Any]] ) -> dict[str, Any]: """Build an embedding-stage cost entry from raw embedding batch output.""" @@ -112,7 +112,7 @@ def build_embedding_cost_entry( return _build_cost_entry(session=session, model=model, totals=totals) -def build_cost_dict( +def _build_cost_dict( response_entry: dict[str, Any] | None, embedding_entry: dict[str, Any] | None, ) -> dict[str, Any]: @@ -150,7 +150,7 @@ def attach_cost( """ try: if response_model is not None and response_results is not None: - response_entry = build_response_cost_entry( + response_entry = _build_response_cost_entry( session=session, model=response_model, results=response_results ) else: @@ -158,13 +158,13 @@ def attach_cost( embedding_entry: dict[str, Any] | None = None if embedding_model is not None and embedding_raw_results is not None: - embedding_entry = build_embedding_cost_entry( + embedding_entry = _build_embedding_cost_entry( session=session, model=embedding_model, raw_results=embedding_raw_results, ) - eval_run.cost = build_cost_dict( + eval_run.cost = _build_cost_dict( response_entry=response_entry, embedding_entry=embedding_entry, ) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 00881e3bd..c9130d3c3 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -422,6 +422,7 @@ class EvaluationRunPublic(SQLModel): embedding_batch_job_id: int | None status: str object_store_url: str | None + score_trace_url: str | None total_items: int score: dict[str, Any] | None cost: dict[str, Any] | None diff --git a/backend/app/tests/crud/evaluations/test_processing.py b/backend/app/tests/crud/evaluations/test_processing.py index 171572e34..527ff86f4 100644 --- a/backend/app/tests/crud/evaluations/test_processing.py +++ b/backend/app/tests/crud/evaluations/test_processing.py @@ -580,7 +580,7 @@ async def test_process_completed_embedding_batch_success( db.commit() db.refresh(eval_run_with_embedding_batch) - # Raw results carry the usage payload that build_embedding_cost_entry reads. + # Raw results carry the usage payload that _build_embedding_cost_entry reads. mock_download.return_value = [ { "custom_id": "trace_123", From 77451b24f79c8cb4ad7be05825de61a032fef1a9 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 16 Apr 2026 13:49:22 +0530 Subject: [PATCH 8/9] coderabbit suggestions --- backend/app/crud/evaluations/core.py | 4 ++-- backend/app/crud/evaluations/cost.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index e5577322a..6374dca76 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -200,8 +200,8 @@ def update_evaluation_run( semantics), so callers don't accidentally clear unrelated columns. `updated_at` is always bumped. 
""" - for key, value in update.model_dump(exclude_unset=True).items(): - setattr(eval_run, key, value) + for field_name, new_value in update.model_dump(exclude_unset=True).items(): + setattr(eval_run, field_name, new_value) eval_run.updated_at = now() diff --git a/backend/app/crud/evaluations/cost.py b/backend/app/crud/evaluations/cost.py index 5a24f0d3f..653232dd1 100644 --- a/backend/app/crud/evaluations/cost.py +++ b/backend/app/crud/evaluations/cost.py @@ -144,25 +144,28 @@ def attach_cost( ) -> None: """Compute cost for the given stage(s) and attach to `eval_run.cost`, never raising. - Caller is responsible for persisting `eval_run` afterwards. When only the - embedding stage is provided, a previously-computed response entry on - `eval_run.cost` is preserved. + Caller is responsible for persisting `eval_run` afterwards. Either stage's + previously-computed entry on `eval_run.cost` is preserved when that stage's + inputs are not supplied, so partial updates never clobber prior data. """ try: + existing_cost = eval_run.cost or {} + if response_model is not None and response_results is not None: response_entry = _build_response_cost_entry( session=session, model=response_model, results=response_results ) else: - response_entry = (eval_run.cost or {}).get("response") + response_entry = existing_cost.get("response") - embedding_entry: dict[str, Any] | None = None if embedding_model is not None and embedding_raw_results is not None: embedding_entry = _build_embedding_cost_entry( session=session, model=embedding_model, raw_results=embedding_raw_results, ) + else: + embedding_entry = existing_cost.get("embedding") eval_run.cost = _build_cost_dict( response_entry=response_entry, From 5cbea94ae5da452af1a718ac933204c350a348c2 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 17 Apr 2026 11:32:26 +0530 Subject: [PATCH 9/9] update to main --- ...valuation_run.py => 054_add_cost_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{053_add_cost_to_evaluation_run.py => 054_add_cost_to_evaluation_run.py} (89%) diff --git a/backend/app/alembic/versions/053_add_cost_to_evaluation_run.py b/backend/app/alembic/versions/054_add_cost_to_evaluation_run.py similarity index 89% rename from backend/app/alembic/versions/053_add_cost_to_evaluation_run.py rename to backend/app/alembic/versions/054_add_cost_to_evaluation_run.py index 63110668d..4bacb4e0c 100644 --- a/backend/app/alembic/versions/053_add_cost_to_evaluation_run.py +++ b/backend/app/alembic/versions/054_add_cost_to_evaluation_run.py @@ -1,7 +1,7 @@ """add cost tracking to evaluation_run -Revision ID: 053 -Revises: 052 +Revision ID: 054 +Revises: 053 Create Date: 2026-04-09 12:00:00.000000 """ @@ -11,8 +11,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "053" -down_revision = "052" +revision = "054" +down_revision = "053" branch_labels = None depends_on = None