@@ -115,7 +115,10 @@ def upgrade():
(16, 'openai', 'gpt-5.4', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 2.5, "output_token_cost": 15}, "batch": {"input_token_cost": 1.25, "output_token_cost": 7.5}}', true, NOW(), NOW()),
(17, 'openai', 'gpt-5.4-mini', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 0.75, "output_token_cost": 4.5}, "batch": {"input_token_cost": 0.375, "output_token_cost": 2.25}}', true, NOW(), NOW()),
(18, 'openai', 'gpt-5.4-nano', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 0.2, "output_token_cost": 1.25}, "batch": {"input_token_cost": 0.1, "output_token_cost": 0.625}}', true, NOW(), NOW()),
(19, 'openai', 'gpt-5.4-pro', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 30, "output_token_cost": 180}, "batch": {"input_token_cost": 15, "output_token_cost": 90}}', true, NOW(), NOW())
(19, 'openai', 'gpt-5.4-pro', '{"effort": {"type": "enum", "default": "medium", "options": ["none", "low", "medium", "high", "xhigh"], "description": "How long the model spends reasoning. Higher = better but slower."}, "summary": {"type": "enum", "default": "auto", "options": ["auto", "detailed", "concise", "null"], "description": "Summarize the reasoning result."}}', '{TEXT,IMAGE}', '{TEXT}', '{"response": {"input_token_cost": 30, "output_token_cost": 180}, "batch": {"input_token_cost": 15, "output_token_cost": 90}}', true, NOW(), NOW()),
(20, 'openai', 'text-embedding-3-large', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.13, "output_token_cost": 0}, "batch": {"input_token_cost": 0.065, "output_token_cost": 0}}', true, NOW(), NOW()),
(21, 'openai', 'text-embedding-3-small', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.02, "output_token_cost": 0}, "batch": {"input_token_cost": 0.01, "output_token_cost": 0}}', true, NOW(), NOW()),
(22, 'openai', 'text-embedding-ada-002', '{}', '{TEXT}', '{}', '{"response": {"input_token_cost": 0.1, "output_token_cost": 0}, "batch": {"input_token_cost": 0.05, "output_token_cost": 0}}', true, NOW(), NOW())
"""
)
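
Worth noting: every row above prices batch at exactly half the response rate, with costs expressed in USD per million tokens (the embedding rows are consistent with OpenAI's published per-1M pricing). A minimal sketch of that invariant, assuming the cost column has been parsed into a dict:

def batch_is_half_of_response(pricing: dict) -> bool:
    # Each direction's batch rate should be exactly half the response rate.
    response, batch = pricing["response"], pricing["batch"]
    return all(
        batch[key] == response[key] / 2
        for key in ("input_token_cost", "output_token_cost")
    )

# Check against the text-embedding-3-small row above.
assert batch_is_half_of_response({
    "response": {"input_token_cost": 0.02, "output_token_cost": 0},
    "batch": {"input_token_cost": 0.01, "output_token_cost": 0},
})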

33 changes: 33 additions & 0 deletions backend/app/alembic/versions/054_add_cost_to_evaluation_run.py
@@ -0,0 +1,33 @@
"""add cost tracking to evaluation_run

Revision ID: 054
Revises: 053
Create Date: 2026-04-09 12:00:00.000000

"""

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "054"
down_revision = "053"
branch_labels = None
depends_on = None


def upgrade():
op.add_column(
"evaluation_run",
sa.Column(
"cost",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
comment="Cost tracking (response/embedding tokens and USD)",
),
)


def downgrade():
op.drop_column("evaluation_run", "cost")
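
For reference, a hypothetical example of what this column holds once both stages have run (field names follow the shape documented in `cost.py` later in this diff; the token counts are illustrative, priced at the batch rates from the model_config rows above):

# Illustrative eval_run.cost payload (hypothetical token counts).
{
    "response": {
        "model": "gpt-5.4-mini",
        "input_tokens": 120000,
        "output_tokens": 40000,
        "total_tokens": 160000,
        "cost_usd": 0.135,  # 0.12M * $0.375 + 0.04M * $2.25
    },
    "embedding": {
        "model": "text-embedding-3-small",
        "input_tokens": 50000,
        "output_tokens": 0,
        "total_tokens": 50000,
        "cost_usd": 0.0005,  # 0.05M * $0.01
    },
    "total_cost_usd": 0.1355,
}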
65 changes: 21 additions & 44 deletions backend/app/crud/evaluations/core.py
@@ -5,19 +5,18 @@
from langfuse import Langfuse
from sqlmodel import Session, select

from app.core.cloud.storage import get_cloud_storage
from app.core.db import engine
from app.core.storage_utils import upload_jsonl_to_object_store
from app.core.util import now
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
from app.crud.evaluations.score import EvaluationScore
from app.models import EvaluationRun
from app.models import EvaluationRun, EvaluationRunUpdate
from app.models.llm.request import ConfigBlob, LLMCallConfig
from app.models.stt_evaluation import EvaluationType
from app.services.llm.jobs import resolve_config_blob

from app.core.db import engine
from app.core.cloud.storage import get_cloud_storage
from app.core.storage_utils import upload_jsonl_to_object_store

logger = logging.getLogger(__name__)


@@ -192,46 +191,18 @@ def get_evaluation_run_by_id(
def update_evaluation_run(
session: Session,
eval_run: EvaluationRun,
status: str | None = None,
error_message: str | None = None,
object_store_url: str | None = None,
score_trace_url: str | None = None,
score: dict | None = None,
embedding_batch_job_id: int | None = None,
update: EvaluationRunUpdate,
) -> EvaluationRun:
"""
Update an evaluation run with new values and persist to database.

This helper function ensures consistency when updating evaluation runs
by always updating the timestamp and properly committing changes.
Apply a partial update to an evaluation run and persist it.

Args:
session: Database session
eval_run: EvaluationRun instance to update
status: New status value (optional)
error_message: New error message (optional)
object_store_url: New object store URL (optional)
score: New score dict (optional)
embedding_batch_job_id: New embedding batch job ID (optional)

Returns:
Updated and refreshed EvaluationRun instance
Only fields explicitly set on `update` are applied (`exclude_unset=True`
semantics), so callers don't accidentally clear unrelated columns.
`updated_at` is always bumped.
"""
# Update provided fields
if status is not None:
eval_run.status = status
if error_message is not None:
eval_run.error_message = error_message
if object_store_url is not None:
eval_run.object_store_url = object_store_url
if score is not None:
eval_run.score = score
if embedding_batch_job_id is not None:
eval_run.embedding_batch_job_id = embedding_batch_job_id
if score_trace_url is not None:
eval_run.score_trace_url = score_trace_url or None

# Always update timestamp
for field_name, new_value in update.model_dump(exclude_unset=True).items():
setattr(eval_run, field_name, new_value)

eval_run.updated_at = now()

# Persist to database
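
A minimal usage sketch of the new call shape (the field value is hypothetical; `EvaluationRunUpdate` is assumed to be a SQLModel with all-optional fields, which is what gives `model_dump(exclude_unset=True)` its partial-update semantics):

from app.crud.evaluations.core import update_evaluation_run
from app.models import EvaluationRunUpdate

# `session` and `eval_run` come from the surrounding context.
# Only `status` is set on the update, so only `status` (plus updated_at)
# is written; score, object_store_url, etc. stay untouched.
update_evaluation_run(
    session=session,
    eval_run=eval_run,
    update=EvaluationRunUpdate(status="completed"),
)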
@@ -314,7 +285,11 @@ def get_or_fetch_score(
}

# Update score column using existing helper
update_evaluation_run(session=session, eval_run=eval_run, score=score)
update_evaluation_run(
session=session,
eval_run=eval_run,
update=EvaluationRunUpdate(score=score),
)

total_traces = len(score.get("traces", []))
logger.info(
@@ -400,8 +375,10 @@ def save_score(
update_evaluation_run(
session=session,
eval_run=eval_run,
score=db_score,
score_trace_url=score_trace_url,
update=EvaluationRunUpdate(
score=db_score,
score_trace_url=score_trace_url or None,
),
)

logger.info(
177 changes: 177 additions & 0 deletions backend/app/crud/evaluations/cost.py
@@ -0,0 +1,177 @@
"""
Cost tracking for evaluation runs.

Token usage is aggregated per stage (response generation, embedding) and
priced against `global.model_config` using OpenAI Batch rates. Failures
here must never block evaluation completion — `attach_cost` swallows
exceptions and logs a warning.

Persisted shape on `eval_run.cost`:

{
"response": {model, input_tokens, output_tokens, total_tokens, cost_usd},
"embedding": {model, input_tokens, output_tokens, total_tokens, cost_usd},
"total_cost_usd": float,
}

Either stage entry is optional. Embedding entries use output_tokens=0.
"""

import logging
from collections.abc import Callable, Iterable
from typing import Any

from sqlmodel import Session

from app.crud.model_config import estimate_model_cost
from app.models import EvaluationRun

logger = logging.getLogger(__name__)

# USD rounding precision for persisted cost values.
COST_USD_DECIMALS = 6


def _cost_usd(estimate: dict[str, Any] | None) -> float:
"""Sum the per-direction costs from an estimate and round to our USD precision."""
if not estimate:
return 0.0
total = float(estimate.get("input_cost", 0.0)) + float(
estimate.get("output_cost", 0.0)
)
return round(total, COST_USD_DECIMALS)


def _sum_tokens(
items: Iterable[dict[str, Any]],
usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None],
input_key: str,
) -> dict[str, int]:
"""Sum (input, output, total) tokens across items using a per-item usage extractor.

The OpenAI Embeddings API reports input tokens as ``prompt_tokens`` and has
no output tokens; chat/responses APIs use ``input_tokens`` and ``output_tokens``.
Missing keys default to 0, so the embedding case naturally produces
output_tokens=0.
"""
totals = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
for item in items:
usage = usage_extractor(item)
if not usage:
continue
totals["input_tokens"] += usage.get(input_key, 0)
totals["output_tokens"] += usage.get("output_tokens", 0)
totals["total_tokens"] += usage.get("total_tokens", 0)
return totals
Comment on lines +45 to +65
⚠️ Potential issue | 🟠 Major

Harden token aggregation against malformed usage fields.

At lines 62–64, the direct `+= usage.get(...)` can raise if a provider returns non-numeric usage values, in which case the outer catch in `attach_cost` skips cost calculation entirely.

Proposed fix
+def _to_int_token(value: Any) -> int:
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return 0
+
 def _sum_tokens(
     items: Iterable[dict[str, Any]],
     usage_extractor: Callable[[dict[str, Any]], dict[str, Any] | None],
     input_key: str,
 ) -> dict[str, int]:
@@
         usage = usage_extractor(item)
         if not usage:
             continue
-        totals["input_tokens"] += usage.get(input_key, 0)
-        totals["output_tokens"] += usage.get("output_tokens", 0)
-        totals["total_tokens"] += usage.get("total_tokens", 0)
+        totals["input_tokens"] += _to_int_token(usage.get(input_key, 0))
+        totals["output_tokens"] += _to_int_token(usage.get("output_tokens", 0))
+        totals["total_tokens"] += _to_int_token(usage.get("total_tokens", 0))



def _build_cost_entry(
session: Session,
model: str,
totals: dict[str, int],
) -> dict[str, Any]:
"""Price aggregated token usage against the model's batch pricing row."""
estimate = estimate_model_cost(
session=session,
provider="openai",
model_name=model,
input_tokens=totals["input_tokens"],
output_tokens=totals["output_tokens"],
usage_type="batch",
)
return {
"model": model,
"input_tokens": totals["input_tokens"],
"output_tokens": totals["output_tokens"],
"total_tokens": totals["total_tokens"],
"cost_usd": _cost_usd(estimate),
}


def _build_response_cost_entry(
session: Session, model: str, results: list[dict[str, Any]]
) -> dict[str, Any]:
"""Build a response-stage cost entry from parsed evaluation results."""
totals = _sum_tokens(
items=results,
usage_extractor=lambda r: r.get("usage"),
input_key="input_tokens",
)
return _build_cost_entry(session=session, model=model, totals=totals)


def _build_embedding_cost_entry(
session: Session, model: str, raw_results: list[dict[str, Any]]
) -> dict[str, Any]:
"""Build an embedding-stage cost entry from raw embedding batch output."""
totals = _sum_tokens(
items=raw_results,
usage_extractor=lambda r: r.get("response", {}).get("body", {}).get("usage"),
input_key="prompt_tokens",
)
return _build_cost_entry(session=session, model=model, totals=totals)


def _build_cost_dict(
response_entry: dict[str, Any] | None,
embedding_entry: dict[str, Any] | None,
) -> dict[str, Any]:
"""Combine per-stage entries into the `eval_run.cost` payload with a grand total."""
cost: dict[str, Any] = {}
total = 0.0

if response_entry:
cost["response"] = response_entry
total += response_entry.get("cost_usd", 0.0)

if embedding_entry:
cost["embedding"] = embedding_entry
total += embedding_entry.get("cost_usd", 0.0)

cost["total_cost_usd"] = round(total, COST_USD_DECIMALS)
return cost


def attach_cost(
session: Session,
eval_run: EvaluationRun,
log_prefix: str,
*,
response_model: str | None = None,
response_results: list[dict[str, Any]] | None = None,
embedding_model: str | None = None,
embedding_raw_results: list[dict[str, Any]] | None = None,
) -> None:
"""Compute cost for the given stage(s) and attach to `eval_run.cost`, never raising.

Caller is responsible for persisting `eval_run` afterwards. Either stage's
previously-computed entry on `eval_run.cost` is preserved when that stage's
inputs are not supplied, so partial updates never clobber prior data.
"""
try:
existing_cost = eval_run.cost or {}

if response_model is not None and response_results is not None:
response_entry = _build_response_cost_entry(
session=session, model=response_model, results=response_results
)
else:
response_entry = existing_cost.get("response")

if embedding_model is not None and embedding_raw_results is not None:
embedding_entry = _build_embedding_cost_entry(
session=session,
model=embedding_model,
raw_results=embedding_raw_results,
)
else:
embedding_entry = existing_cost.get("embedding")

eval_run.cost = _build_cost_dict(
response_entry=response_entry,
embedding_entry=embedding_entry,
)
except Exception as cost_err:
logger.warning(
f"[attach_cost] {log_prefix} Failed to compute cost | {cost_err}"
)
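
A sketch of how a caller might wire this up after parsing batch output (`parsed_results` and the model name are hypothetical; per the docstring, `attach_cost` only mutates `eval_run.cost`, so the caller persists afterwards):

from app.crud.evaluations.cost import attach_cost

# Response stage only: any embedding entry already on eval_run.cost
# is preserved. Each parsed result is expected to carry
# {"usage": {"input_tokens": ..., "output_tokens": ..., "total_tokens": ...}}.
attach_cost(
    session=session,
    eval_run=eval_run,
    log_prefix=f"run_id={eval_run.id}",
    response_model="gpt-5.4-mini",  # hypothetical
    response_results=parsed_results,
)
session.add(eval_run)
session.commit()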