
Commit e0838da

Evaluation: add export_format query param for grouped trace export (#562)
* Evaluation: add export_format query param with grouped traces support
* Fix formatting and update docstring for group_traces_by_question_id
* Fix trailing whitespace
* Refactor: Move import of group_traces_by_question_id to the top and update docstring for clarity
* Docs: Improve export_format parameter description for clarity
1 parent 8a2317f commit e0838da

4 files changed: 245 additions & 1 deletion


backend/app/api/docs/evaluation/get_evaluation.md

Lines changed: 29 additions & 1 deletion
@@ -5,8 +5,9 @@ Returns comprehensive evaluation information including processing status, config
 **Query Parameters:**
 * `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations.
 * `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. Requires `get_trace_info=true`.
+* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". Allowed values: `row`, `grouped`.
 
-**Score Format** (`get_trace_info=true`):
+**Score Format** (`get_trace_info=true`, `export_format=row`):
 
 ```json
 {
@@ -49,6 +50,33 @@ Returns comprehensive evaluation information including processing status, config
 }
 ```
 
+**Score Format** (`get_trace_info=true`, `export_format=grouped`):
+```json
+{
+  "summary_scores": [...],
+  "traces": [...],
+  "grouped_traces": [
+    {
+      "question_id": 1,
+      "question": "What is Python?",
+      "ground_truth_answer": "Python is a high-level programming language.",
+      "llm_answers": [
+        "Answer from evaluation run 1...",
+        "Answer from evaluation run 2..."
+      ],
+      "trace_ids": [
+        "uuid-123",
+        "uuid-456"
+      ],
+      "scores": [
+        [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}],
+        [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}]
+      ]
+    }
+  ]
+}
+```
+
 **Score Details:**
 * NUMERIC scores include average (`avg`) and standard deviation (`std`) in summary
 * CATEGORICAL scores include distribution counts in summary
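For reference, a minimal sketch of a request exercising the new parameter. The endpoint path and query parameters come from this commit; the base URL, run id, and auth header name are placeholders for your deployment.

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder deployment URL
EVAL_RUN_ID = 123  # placeholder evaluation run id

resp = requests.get(
    f"{BASE_URL}/api/v1/evaluations/{EVAL_RUN_ID}",
    params={"get_trace_info": True, "export_format": "grouped"},
    headers={"X-API-Key": "your-api-key"},  # placeholder auth header
)
resp.raise_for_status()

# Per the tests added in this commit, the grouped entries are returned
# under data.score.traces, one entry per question.
for group in resp.json()["data"]["score"]["traces"]:
    print(group["question_id"], group["question"], len(group["llm_answers"]))
```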

backend/app/api/routes/evaluations/evaluation.py

Lines changed: 20 additions & 0 deletions
@@ -13,6 +13,7 @@
 
 from app.api.deps import AuthContextDep, SessionDep
 from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud
+from app.crud.evaluations.core import group_traces_by_question_id
 from app.models.evaluation import EvaluationRunPublic
 from app.api.permissions import Permission, require_permission
 from app.services.evaluations import (
@@ -121,13 +122,25 @@ def get_evaluation_run_status(
             "Requires get_trace_info=true."
         ),
     ),
+    export_format: str = Query(
+        "row",
+        description=(
+            "Controls the Traces structure. "
+            "'grouped' collates repeated questions horizontally using Parent Question ID."
+        ),
+        enum=["row", "grouped"],
+    ),
 ) -> APIResponse[EvaluationRunPublic]:
     """Get evaluation run status with optional trace info."""
     if resync_score and not get_trace_info:
         raise HTTPException(
             status_code=400,
             detail="resync_score=true requires get_trace_info=true",
         )
+    if export_format == "grouped" and not get_trace_info:
+        raise HTTPException(
+            status_code=400, detail="export_format=grouped requires get_trace_info=true"
+        )
 
     eval_run, error = get_evaluation_with_scores(
         session=_session,
@@ -146,6 +159,13 @@ def get_evaluation_run_status(
                 "to this organization"
             ),
         )
+    # Group repeated questions when the grouped export format is requested
+    if export_format == "grouped" and eval_run.score and "traces" in eval_run.score:
+        try:
+            grouped_traces = group_traces_by_question_id(eval_run.score["traces"])
+            eval_run.score["traces"] = grouped_traces
+        except ValueError as e:
+            return APIResponse.failure_response(error=str(e), data=eval_run)
 
     if error:
         return APIResponse.failure_response(error=error, data=eval_run)
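One design note on this handler, with a hedged client-side sketch of the guard clause below: the grouped transform runs only after scores are loaded and overwrites `score["traces"]` in place, and a `ValueError` from the grouping helper (traces without a usable `question_id`) is surfaced as a failure response rather than a 500. The base URL, run id, and auth header in the sketch are placeholders.

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder deployment URL

# The new guard clause mirrors the existing resync_score check:
# grouped export without trace info is rejected with HTTP 400.
resp = requests.get(
    f"{BASE_URL}/api/v1/evaluations/123",  # placeholder run id
    params={"export_format": "grouped"},  # get_trace_info omitted
    headers={"X-API-Key": "your-api-key"},  # placeholder auth header
)
assert resp.status_code == 400
assert "get_trace_info" in resp.json()["detail"]
```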

backend/app/crud/evaluations/core.py

Lines changed: 54 additions & 0 deletions
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 from uuid import UUID
 
 from langfuse import Langfuse
@@ -352,6 +353,59 @@ def save_score(
     return eval_run
 
 
+def group_traces_by_question_id(
+    traces: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """
+    Group evaluation traces by question_id for horizontal comparison.
+
+    Returns:
+        List of grouped traces sorted by question_id:
+        [
+            {
+                "question_id": 1,
+                "question": "What is Python?",
+                "ground_truth_answer": "...",
+                "llm_answers": ["Answer 1", "Answer 2"],
+                "trace_ids": ["trace-1", "trace-2"],
+                "scores": [[...], [...]]
+            }
+        ]
+    """
+
+    # Check whether question_id exists in the traces
+    if traces and (
+        traces[0].get("question_id") is None or traces[0].get("question_id") == ""
+    ):
+        raise ValueError("Grouped export format is not available for this evaluation.")
+
+    groups: dict[int, list[dict[str, Any]]] = {}
+
+    for trace in traces:
+        question_id = trace.get("question_id")
+        if question_id not in groups:
+            groups[question_id] = []
+        groups[question_id].append(trace)
+
+    result: list[dict[str, Any]] = []
+    for question_id in sorted(groups.keys()):
+        group_traces = groups[question_id]
+        first = group_traces[0]
+        result.append(
+            {
+                "question_id": question_id,
+                "question": first.get("question", ""),
+                "ground_truth_answer": first.get("ground_truth_answer", ""),
+                "llm_answers": [t.get("llm_answer", "") for t in group_traces],
+                "trace_ids": [t.get("trace_id", "") for t in group_traces],
+                "scores": [t.get("scores", []) for t in group_traces],
+            }
+        )
+
+    logger.info(f"[group_traces_by_question_id] Created {len(result)} groups")
+    return result
+
+
 def resolve_model_from_config(
     session: Session,
     eval_run: EvaluationRun,
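Since `group_traces_by_question_id` is a plain function over trace dicts, its behavior is easy to check in isolation. A small usage sketch, assuming the backend package is importable (e.g. run from `backend/` with the app on `PYTHONPATH`):

```python
from app.crud.evaluations.core import group_traces_by_question_id

# Two traces for the same question, as produced by two evaluation runs.
traces = [
    {"trace_id": "t1", "question_id": 1, "question": "What is Python?",
     "ground_truth_answer": "A language", "llm_answer": "Run 1 answer", "scores": []},
    {"trace_id": "t2", "question_id": 1, "question": "What is Python?",
     "ground_truth_answer": "A language", "llm_answer": "Run 2 answer", "scores": []},
]

grouped = group_traces_by_question_id(traces)

# Both runs of question 1 are collated into a single entry.
assert len(grouped) == 1
assert grouped[0]["llm_answers"] == ["Run 1 answer", "Run 2 answer"]
assert grouped[0]["trace_ids"] == ["t1", "t2"]
```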

backend/app/tests/api/routes/test_evaluation.py

Lines changed: 142 additions & 0 deletions
@@ -928,6 +928,148 @@ def test_get_evaluation_run_resync_without_trace_info_fails(
             and "get_trace_info" in error_str.lower()
         )
 
+    def test_get_evaluation_run_grouped_format_without_trace_info_fails(
+        self,
+        client: TestClient,
+        user_api_key_header: dict[str, str],
+        db: Session,
+        user_api_key: TestAuthContext,
+        create_test_dataset: EvaluationDataset,
+    ) -> None:
+        eval_run = EvaluationRun(
+            run_name="test_run",
+            dataset_name=create_test_dataset.name,
+            dataset_id=create_test_dataset.id,
+            config={"model": "gpt-4o"},
+            status="completed",
+            total_items=3,
+            organization_id=user_api_key.organization_id,
+            project_id=user_api_key.project_id,
+        )
+        db.add(eval_run)
+        db.commit()
+        db.refresh(eval_run)
+
+        response = client.get(
+            f"/api/v1/evaluations/{eval_run.id}",
+            params={"export_format": "grouped"},  # Missing get_trace_info=true
+            headers=user_api_key_header,
+        )
+
+        assert response.status_code == 400
+        response_data = response.json()
+        error_str = response_data.get(
+            "detail", response_data.get("error", str(response_data))
+        )
+        assert (
+            "export_format" in error_str.lower()
+            and "get_trace_info" in error_str.lower()
+        )
+
+    def test_get_evaluation_run_grouped_format_success(
+        self,
+        client: TestClient,
+        user_api_key_header: dict[str, str],
+        db: Session,
+        user_api_key: TestAuthContext,
+        create_test_dataset: EvaluationDataset,
+    ) -> None:
+        eval_run = EvaluationRun(
+            run_name="test_run",
+            dataset_name=create_test_dataset.name,
+            dataset_id=create_test_dataset.id,
+            config={"model": "gpt-4o"},
+            status="completed",
+            total_items=4,
+            score={
+                "traces": [
+                    {
+                        "trace_id": "trace-1a",
+                        "question_id": 1,
+                        "question": "What is Python?",
+                        "ground_truth_answer": "A programming language",
+                        "llm_answer": "Python is a high-level programming language",
+                        "scores": [
+                            {
+                                "name": "cosine_similarity",
+                                "value": 0.82,
+                                "data_type": "NUMERIC",
+                            }
+                        ],
+                    },
+                    {
+                        "trace_id": "trace-1b",
+                        "question_id": 1,
+                        "question": "What is Python?",
+                        "ground_truth_answer": "A programming language",
+                        "llm_answer": "Python is an interpreted language",
+                        "scores": [
+                            {
+                                "name": "cosine_similarity",
+                                "value": 0.75,
+                                "data_type": "NUMERIC",
+                            }
+                        ],
+                    },
+                    # Row format - 1 trace for question_id=2
+                    {
+                        "trace_id": "trace-2a",
+                        "question_id": 2,
+                        "question": "What is Java?",
+                        "ground_truth_answer": "An OOP language",
+                        "llm_answer": "Java is a statically typed language",
+                        "scores": [
+                            {
+                                "name": "cosine_similarity",
+                                "value": 0.80,
+                                "data_type": "NUMERIC",
+                            }
+                        ],
+                    },
+                ],
+                "summary_scores": [
+                    {
+                        "avg": 0.79,
+                        "std": 0.03,
+                        "name": "cosine_similarity",
+                        "data_type": "NUMERIC",
+                        "total_pairs": 3,
+                    }
+                ],
+            },
+            organization_id=user_api_key.organization_id,
+            project_id=user_api_key.project_id,
+        )
+        db.add(eval_run)
+        db.commit()
+        db.refresh(eval_run)
+
+        response = client.get(
+            f"/api/v1/evaluations/{eval_run.id}",
+            params={
+                "export_format": "grouped",
+                "get_trace_info": True,
+            },
+            headers=user_api_key_header,
+        )
+
+        assert response.status_code == 200
+        response_data = response.json()
+        assert response_data["success"] is True
+        data = response_data["data"]
+        assert data["id"] == eval_run.id
+        assert data["status"] == "completed"
+
+        traces = data["score"]["traces"]
+        assert (
+            isinstance(traces, list)
+            and len(traces) > 0
+            and "llm_answers" in traces[0]
+            and isinstance(traces[0]["llm_answers"], list)
+            and "trace_ids" in traces[0]
+            and isinstance(traces[0]["trace_ids"], list)
+        )
+
 
 class TestGetDataset:
     """Test GET /evaluations/datasets/{dataset_id} endpoint."""
