@@ -115,7 +115,10 @@ def upgrade():
(16, 'openai', 'gpt-5.4', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 2.5, "output_token_cost": 15}, "batch": {"input_token_cost": 1.25, "output_token_cost": 7.5}}', true, NOW(), NOW()),
(17, 'openai', 'gpt-5.4-mini', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 0.75, "output_token_cost": 4.5}, "batch": {"input_token_cost": 0.375, "output_token_cost": 2.25}}', true, NOW(), NOW()),
(18, 'openai', 'gpt-5.4-nano', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 0.2, "output_token_cost": 1.25}, "batch": {"input_token_cost": 0.1, "output_token_cost": 0.625}}', true, NOW(), NOW()),
(19, 'openai', 'gpt-5.4-pro', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 30, "output_token_cost": 180}, "batch": {"input_token_cost": 15, "output_token_cost": 90}}', true, NOW(), NOW())
(19, 'openai', 'gpt-5.4-pro', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 30, "output_token_cost": 180}, "batch": {"input_token_cost": 15, "output_token_cost": 90}}', true, NOW(), NOW()),
(20, 'openai', 'text-embedding-3-large', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.13, "output_token_cost": 0}, "batch": {"input_token_cost": 0.065, "output_token_cost": 0}}', true, NOW(), NOW()),
(21, 'openai', 'text-embedding-3-small', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.02, "output_token_cost": 0}, "batch": {"input_token_cost": 0.01, "output_token_cost": 0}}', true, NOW(), NOW()),
(22, 'openai', 'text-embedding-ada-002', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.1, "output_token_cost": 0}, "batch": {"input_token_cost": 0.05, "output_token_cost": 0}}', true, NOW(), NOW())
"""
)
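
Worth noting: every row above prices batch at exactly half the response rate, with costs expressed in USD per million tokens (the embedding rows are consistent with OpenAI's published per-1M pricing). A minimal sketch of that invariant, assuming the cost column has been parsed into a dict:

def batch_is_half_of_response(pricing: dict) -> bool:
    # Each direction's batch rate should be exactly half the response rate.
    response, batch = pricing["response"], pricing["batch"]
    return all(
        batch[key] == response[key] / 2
        for key in ("input_token_cost", "output_token_cost")
    )

# Check against the text-embedding-3-small row above.
assert batch_is_half_of_response({
    "response": {"input_token_cost": 0.02, "output_token_cost": 0},
    "batch": {"input_token_cost": 0.01, "output_token_cost": 0},
})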

33 changes: 33 additions & 0 deletions backend/app/alembic/versions/054_add_cost_to_evaluation_run.py
@@ -0,0 +1,33 @@
"""add cost tracking to evaluation_run

Revision ID: 054
Revises: 053
Create Date: 2026-04-09 12:00:00.000000

"""

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "054"
down_revision = "053"
branch_labels = None
depends_on = None


def upgrade():
op.add_column(
"evaluation_run",
sa.Column(
"cost",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
comment="Cost tracking (response/embedding tokens and USD)",
),
)


def downgrade():
op.drop_column("evaluation_run", "cost")
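
For reference, a hypothetical example of what this column holds once both stages have run (field names follow the shape documented in `cost.py` later in this diff; the token counts are illustrative, priced at the batch rates from the model_config rows above):

# Illustrative eval_run.cost payload (hypothetical token counts).
{
    "response": {
        "model": "gpt-5.4-mini",
        "input_tokens": 120000,
        "output_tokens": 40000,
        "total_tokens": 160000,
        "cost_usd": 0.135,  # 0.12M * $0.375 + 0.04M * $2.25
    },
    "embedding": {
        "model": "text-embedding-3-small",
        "input_tokens": 50000,
        "output_tokens": 0,
        "total_tokens": 50000,
        "cost_usd": 0.0005,  # 0.05M * $0.01
    },
    "total_cost_usd": 0.1355,
}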
65 changes: 21 additions & 44 deletions backend/app/crud/evaluations/core.py
@@ -5,19 +5,18 @@
from langfuse import Langfuse
from sqlmodel import Session, select

from app.core.cloud.storage import get_cloud_storage
from app.core.db import engine
from app.core.storage_utils import upload_jsonl_to_object_store
from app.core.util import now
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
from app.crud.evaluations.score import EvaluationScore
from app.models import EvaluationRun
from app.models import EvaluationRun, EvaluationRunUpdate
from app.models.llm.request import ConfigBlob, LLMCallConfig
from app.models.stt_evaluation import EvaluationType
from app.services.llm.jobs import resolve_config_blob

from app.core.db import engine
from app.core.cloud.storage import get_cloud_storage
from app.core.storage_utils import upload_jsonl_to_object_store

logger = logging.getLogger(__name__)


@@ -192,46 +191,18 @@ def get_evaluation_run_by_id(
def update_evaluation_run(
session: Session,
eval_run: EvaluationRun,
status: str | None = None,
error_message: str | None = None,
object_store_url: str | None = None,
score_trace_url: str | None = None,
score: dict | None = None,
embedding_batch_job_id: int | None = None,
update: EvaluationRunUpdate,
) -> EvaluationRun:
"""
Update an evaluation run with new values and persist to database.

This helper function ensures consistency when updating evaluation runs
by always updating the timestamp and properly committing changes.
Apply a partial update to an evaluation run and persist it.

Args:
session: Database session
eval_run: EvaluationRun instance to update
status: New status value (optional)
error_message: New error message (optional)
object_store_url: New object store URL (optional)
score: New score dict (optional)
embedding_batch_job_id: New embedding batch job ID (optional)

Returns:
Updated and refreshed EvaluationRun instance
Only fields explicitly set on `update` are applied (`exclude_unset=True`
semantics), so callers don't accidentally clear unrelated columns.
`updated_at` is always bumped.
"""
# Update provided fields
if status is not None:
eval_run.status = status
if error_message is not None:
eval_run.error_message = error_message
if object_store_url is not None:
eval_run.object_store_url = object_store_url
if score is not None:
eval_run.score = score
if embedding_batch_job_id is not None:
eval_run.embedding_batch_job_id = embedding_batch_job_id
if score_trace_url is not None:
eval_run.score_trace_url = score_trace_url or None

# Always update timestamp
for field_name, new_value in update.model_dump(exclude_unset=True).items():
setattr(eval_run, field_name, new_value)

eval_run.updated_at = now()

# Persist to database
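
A minimal usage sketch of the new call shape (the field value is hypothetical; `EvaluationRunUpdate` is assumed to be a SQLModel with all-optional fields, which is what gives `model_dump(exclude_unset=True)` its partial-update semantics):

from app.crud.evaluations.core import update_evaluation_run
from app.models import EvaluationRunUpdate

# `session` and `eval_run` come from the surrounding context.
# Only `status` is set on the update, so only `status` (plus updated_at)
# is written; score, object_store_url, etc. stay untouched.
update_evaluation_run(
    session=session,
    eval_run=eval_run,
    update=EvaluationRunUpdate(status="completed"),
)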
@@ -314,7 +285,11 @@ def get_or_fetch_score(
}

# Update score column using existing helper
update_evaluation_run(session=session, eval_run=eval_run, score=score)
update_evaluation_run(
session=session,
eval_run=eval_run,
update=EvaluationRunUpdate(score=score),
)

total_traces = len(score.get("traces", []))
logger.info(
@@ -400,8 +375,10 @@ def save_score(
update_evaluation_run(
session=session,
eval_run=eval_run,
score=db_score,
score_trace_url=score_trace_url,
update=EvaluationRunUpdate(
score=db_score,
score_trace_url=score_trace_url or None,
),
)

logger.info(
177 changes: 177 additions & 0 deletions backend/app/crud/evaluations/cost.py
@@ -0,0 +1,177 @@
"""
Cost tracking for evaluation runs.

Token usage is aggregated per stage (response generation, embedding) and
priced against `global.model_config` using OpenAI Batch rates. Failures
here must never block evaluation completion — `attach_cost` swallows
exceptions and logs a warning.

Persisted shape on `eval_run.cost`:

{
"response": {model, input_tokens, output_tokens, total_tokens, cost_usd},
"embedding": {model, input_tokens, output_tokens, total_tokens, cost_usd},
"total_cost_usd": float,
}

Either stage entry is optional. Embedding entries use output_tokens=0.
"""

import logging
from collections.abc import Callable, Iterable
from typing import Any

from sqlmodel import Session

from app.crud.model_config import estimate_model_cost
from app.models import EvaluationRun

logger = logging.getLogger(__name__)

# USD rounding precision for persisted cost values.
COST_USD_DECIMALS = 6


def _cost_usd(estimate: dict[str, Any] | None) -> float:
"""Sum the per-direction costs from an estimate and round to our USD precision."""
if not estimate:
return 0.0
total = float(estimate.get("input_cost", 0.0)) + float(
estimate.get("output_cost", 0.0)
)
return round(total, COST_USD_DECIMALS)


def _sum_tokens(
items: Iterable[dict[str, Any]],
usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None],
input_key: str,
) -> dict[str, int]:
"""Sum (input, output, total) tokens across items using a per-item usage extractor.

The OpenAI Embeddings API reports input tokens as ``prompt_tokens`` and has
no output tokens; chat/responses APIs use ``input_tokens`` and ``output_tokens``.
Missing keys default to 0, so the embedding case naturally produces
output_tokens=0.
"""
totals = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
for item in items:
usage = usage_extractor(item)
if not usage:
continue
totals["input_tokens"] += usage.get(input_key, 0)
totals["output_tokens"] += usage.get("output_tokens", 0)
totals["total_tokens"] += usage.get("total_tokens", 0)
return totals
Comment on lines +45 to +65
⚠️ Potential issue | 🟠 Major

Harden token aggregation against malformed usage fields.

At lines 62–64, the direct `+= usage.get(...)` can raise if a provider returns non-numeric usage values, in which case the outer catch in `attach_cost` skips cost calculation entirely.

Proposed fix
+def _to_int_token(value: Any) -> int:
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return 0
+
 def _sum_tokens(
     items: Iterable[dict[str, Any]],
     usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None],
     input_key: str,
 ) -> dict[str, int]:
@@
         usage = usage_extractor(item)
         if not usage:
             continue
-        totals["input_tokens"] += usage.get(input_key, 0)
-        totals["output_tokens"] += usage.get("output_tokens", 0)
-        totals["total_tokens"] += usage.get("total_tokens", 0)
+        totals["input_tokens"] += _to_int_token(usage.get(input_key, 0))
+        totals["output_tokens"] += _to_int_token(usage.get("output_tokens", 0))
+        totals["total_tokens"] += _to_int_token(usage.get("total_tokens", 0))



def _build_cost_entry(
session: Session,
model: str,
totals: dict[str, int],
) -> dict[str, Any]:
"""Price aggregated token usage against the model's batch pricing row."""
estimate = estimate_model_cost(
session=session,
provider="openai",
model_name=model,
input_tokens=totals["input_tokens"],
output_tokens=totals["output_tokens"],
usage_type="batch",
)
return {
"model": model,
"input_tokens": totals["input_tokens"],
"output_tokens": totals["output_tokens"],
"total_tokens": totals["total_tokens"],
"cost_usd": _cost_usd(estimate),
}


def _build_response_cost_entry(
session: Session, model: str, results: list[dict[str, Any]]
) -> dict[str, Any]:
"""Build a response-stage cost entry from parsed evaluation results."""
totals = _sum_tokens(
items=results,
usage_extractor=lambda r: r.get("usage"),
input_key="input_tokens",
)
return _build_cost_entry(session=session, model=model, totals=totals)


def _build_embedding_cost_entry(
session: Session, model: str, raw_results: list[dict[str, Any]]
) -> dict[str, Any]:
"""Build an embedding-stage cost entry from raw embedding batch output."""
totals = _sum_tokens(
items=raw_results,
usage_extractor=lambda r: r.get("response", {}).get("body", {}).get("usage"),
input_key="prompt_tokens",
)
return _build_cost_entry(session=session, model=model, totals=totals)


def _build_cost_dict(
response_entry: dict[str, Any] | None,
embedding_entry: dict[str, Any] | None,
) -> dict[str, Any]:
"""Combine per-stage entries into the `eval_run.cost` payload with a grand total."""
cost: dict[str, Any] = {}
total = 0.0

if response_entry:
cost["response"] = response_entry
total += response_entry.get("cost_usd", 0.0)

if embedding_entry:
cost["embedding"] = embedding_entry
total += embedding_entry.get("cost_usd", 0.0)

cost["total_cost_usd"] = round(total, COST_USD_DECIMALS)
return cost


def attach_cost(
session: Session,
eval_run: EvaluationRun,
log_prefix: str,
*,
response_model: str | None = None,
response_results: list[dict[str, Any]] | None = None,
embedding_model: str | None = None,
embedding_raw_results: list[dict[str, Any]] | None = None,
) -> None:
"""Compute cost for the given stage(s) and attach to `eval_run.cost`, never raising.

Caller is responsible for persisting `eval_run` afterwards. Either stage's
previously-computed entry on `eval_run.cost` is preserved when that stage's
inputs are not supplied, so partial updates never clobber prior data.
"""
try:
existing_cost = eval_run.cost or {}

if response_model is not None and response_results is not None:
response_entry = _build_response_cost_entry(
session=session, model=response_model, results=response_results
)
else:
response_entry = existing_cost.get("response")

if embedding_model is not None and embedding_raw_results is not None:
embedding_entry = _build_embedding_cost_entry(
session=session,
model=embedding_model,
raw_results=embedding_raw_results,
)
else:
embedding_entry = existing_cost.get("embedding")

eval_run.cost = _build_cost_dict(
response_entry=response_entry,
embedding_entry=embedding_entry,
)
except Exception as cost_err:
logger.warning(
f"[attach_cost] {log_prefix} Failed to compute cost | {cost_err}"
)
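
A sketch of how a caller might wire this up after parsing batch output (`parsed_results` and the model name are hypothetical; per the docstring, `attach_cost` only mutates `eval_run.cost`, so the caller persists afterwards):

from app.crud.evaluations.cost import attach_cost

# Response stage only: any embedding entry already on eval_run.cost
# is preserved. Each parsed result is expected to carry
# {"usage": {"input_tokens": ..., "output_tokens": ..., "total_tokens": ...}}.
attach_cost(
    session=session,
    eval_run=eval_run,
    log_prefix=f"run_id={eval_run.id}",
    response_model="gpt-5.4-mini",  # hypothetical
    response_results=parsed_results,
)
session.add(eval_run)
session.commit()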