
Commit e0838da

Evaluation: add export_format query param for grouped trace export (#562)
* Evaluation: add export_format query param with grouped traces support
* Fix formatting and update docstring for group_traces_by_question_id
* Fix trailing whitespace
* Refactor: Move import of group_traces_by_question_id to the top and update docstring for clarity
* Docs: Improve export_format parameter description for clarity
1 parent 8a2317f commit e0838da

4 files changed: 245 additions & 1 deletion


backend/app/api/docs/evaluation/get_evaluation.md

Lines changed: 29 additions & 1 deletion
@@ -5,8 +5,9 @@ Returns comprehensive evaluation information including processing status, config
 **Query Parameters:**
 * `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations.
 * `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. Requires `get_trace_info=true`.
+* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". Allowed values: `row`, `grouped`.
 
-**Score Format** (`get_trace_info=true`):
+**Score Format** (`get_trace_info=true`, `export_format=row`):
 
 ```json
 {
@@ -49,6 +50,33 @@ Returns comprehensive evaluation information including processing status, config
 }
 ```
 
+**Score Format** (`get_trace_info=true`, `export_format=grouped`):
+```json
+{
+  "summary_scores": [...],
+  "traces": [...],
+  "grouped_traces": [
+    {
+      "question_id": 1,
+      "question": "What is Python?",
+      "ground_truth_answer": "Python is a high-level programming language.",
+      "llm_answers": [
+        "Answer from evaluation run 1...",
+        "Answer from evaluation run 2..."
+      ],
+      "trace_ids": [
+        "uuid-123",
+        "uuid-456"
+      ],
+      "scores": [
+        [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}],
+        [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}]
+      ]
+    }
+  ]
+}
+```
+
 **Score Details:**
 * NUMERIC scores include average (`avg`) and standard deviation (`std`) in summary
 * CATEGORICAL scores include distribution counts in summary
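For reference, a minimal sketch of a request exercising the new parameter. The endpoint path and query parameters come from this commit; the base URL, run id, and auth header name are placeholders for your deployment.

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder deployment URL
EVAL_RUN_ID = 123  # placeholder evaluation run id

resp = requests.get(
    f"{BASE_URL}/api/v1/evaluations/{EVAL_RUN_ID}",
    params={"get_trace_info": True, "export_format": "grouped"},
    headers={"X-API-Key": "your-api-key"},  # placeholder auth header
)
resp.raise_for_status()

# Per the tests added in this commit, the grouped entries are returned
# under data.score.traces, one entry per question.
for group in resp.json()["data"]["score"]["traces"]:
    print(group["question_id"], group["question"], len(group["llm_answers"]))
```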

backend/app/api/routes/evaluations/evaluation.py

Lines changed: 20 additions & 0 deletions
@@ -13,6 +13,7 @@
 
 from app.api.deps import AuthContextDep, SessionDep
 from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud
+from app.crud.evaluations.core import group_traces_by_question_id
 from app.models.evaluation import EvaluationRunPublic
 from app.api.permissions import Permission, require_permission
 from app.services.evaluations import (
@@ -121,13 +122,25 @@ def get_evaluation_run_status(
             "Requires get_trace_info=true."
         ),
     ),
+    export_format: str = Query(
+        "row",
+        description=(
+            "Controls the Traces structure. "
+            "'grouped' collates repeated questions horizontally using Parent Question ID."
+        ),
+        enum=["row", "grouped"],
+    ),
 ) -> APIResponse[EvaluationRunPublic]:
     """Get evaluation run status with optional trace info."""
     if resync_score and not get_trace_info:
         raise HTTPException(
             status_code=400,
             detail="resync_score=true requires get_trace_info=true",
         )
+    if export_format == "grouped" and not get_trace_info:
+        raise HTTPException(
+            status_code=400, detail="export_format=grouped requires get_trace_info=true"
+        )
 
     eval_run, error = get_evaluation_with_scores(
         session=_session,
@@ -146,6 +159,13 @@ def get_evaluation_run_status(
                 "to this organization"
             ),
         )
+    # Group repeated questions when the grouped export format is requested
+    if export_format == "grouped" and eval_run.score and "traces" in eval_run.score:
+        try:
+            grouped_traces = group_traces_by_question_id(eval_run.score["traces"])
+            eval_run.score["traces"] = grouped_traces
+        except ValueError as e:
+            return APIResponse.failure_response(error=str(e), data=eval_run)
 
     if error:
         return APIResponse.failure_response(error=error, data=eval_run)
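One design note on this handler, with a hedged client-side sketch of the guard clause below: the grouped transform runs only after scores are loaded and overwrites `score["traces"]` in place, and a `ValueError` from the grouping helper (traces without a usable `question_id`) is surfaced as a failure response rather than a 500. The base URL, run id, and auth header in the sketch are placeholders.

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder deployment URL

# The new guard clause mirrors the existing resync_score check:
# grouped export without trace info is rejected with HTTP 400.
resp = requests.get(
    f"{BASE_URL}/api/v1/evaluations/123",  # placeholder run id
    params={"export_format": "grouped"},  # get_trace_info omitted
    headers={"X-API-Key": "your-api-key"},  # placeholder auth header
)
assert resp.status_code == 400
assert "get_trace_info" in resp.json()["detail"]
```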

backend/app/crud/evaluations/core.py

Lines changed: 54 additions & 0 deletions
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 from uuid import UUID
 
 from langfuse import Langfuse
@@ -352,6 +353,59 @@ def save_score(
     return eval_run
 
 
+def group_traces_by_question_id(
+    traces: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """
+    Group evaluation traces by question_id for horizontal comparison.
+
+    Returns:
+        List of grouped traces sorted by question_id:
+        [
+            {
+                "question_id": 1,
+                "question": "What is Python?",
+                "ground_truth_answer": "...",
+                "llm_answers": ["Answer 1", "Answer 2"],
+                "trace_ids": ["trace-1", "trace-2"],
+                "scores": [[...], [...]]
+            }
+        ]
+    """
+
+    # Check whether question_id exists in the traces
+    if traces and (
+        traces[0].get("question_id") is None or traces[0].get("question_id") == ""
+    ):
+        raise ValueError("Grouped export format is not available for this evaluation.")
+
+    groups: dict[int, list[dict[str, Any]]] = {}
+
+    for trace in traces:
+        question_id = trace.get("question_id")
+        if question_id not in groups:
+            groups[question_id] = []
+        groups[question_id].append(trace)
+
+    result: list[dict[str, Any]] = []
+    for question_id in sorted(groups.keys()):
+        group_traces = groups[question_id]
+        first = group_traces[0]
+        result.append(
+            {
+                "question_id": question_id,
+                "question": first.get("question", ""),
+                "ground_truth_answer": first.get("ground_truth_answer", ""),
+                "llm_answers": [t.get("llm_answer", "") for t in group_traces],
+                "trace_ids": [t.get("trace_id", "") for t in group_traces],
+                "scores": [t.get("scores", []) for t in group_traces],
+            }
+        )
+
+    logger.info(f"[group_traces_by_question_id] Created {len(result)} groups")
+    return result
+
+
 def resolve_model_from_config(
     session: Session,
     eval_run: EvaluationRun,
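Since `group_traces_by_question_id` is a plain function over trace dicts, its behavior is easy to check in isolation. A small usage sketch, assuming the backend package is importable (e.g. run from `backend/` with the app on `PYTHONPATH`):

```python
from app.crud.evaluations.core import group_traces_by_question_id

# Two traces for the same question, as produced by two evaluation runs.
traces = [
    {"trace_id": "t1", "question_id": 1, "question": "What is Python?",
     "ground_truth_answer": "A language", "llm_answer": "Run 1 answer", "scores": []},
    {"trace_id": "t2", "question_id": 1, "question": "What is Python?",
     "ground_truth_answer": "A language", "llm_answer": "Run 2 answer", "scores": []},
]

grouped = group_traces_by_question_id(traces)

# Both runs of question 1 are collated into a single entry.
assert len(grouped) == 1
assert grouped[0]["llm_answers"] == ["Run 1 answer", "Run 2 answer"]
assert grouped[0]["trace_ids"] == ["t1", "t2"]
```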

backend/app/tests/api/routes/test_evaluation.py

Lines changed: 142 additions & 0 deletions
@@ -928,6 +928,148 @@ def test_get_evaluation_run_resync_without_trace_info_fails(
             and "get_trace_info" in error_str.lower()
         )
 
+    def test_get_evaluation_run_grouped_format_without_trace_info_fails(
+        self,
+        client: TestClient,
+        user_api_key_header: dict[str, str],
+        db: Session,
+        user_api_key: TestAuthContext,
+        create_test_dataset: EvaluationDataset,
+    ) -> None:
+        eval_run = EvaluationRun(
+            run_name="test_run",
+            dataset_name=create_test_dataset.name,
+            dataset_id=create_test_dataset.id,
+            config={"model": "gpt-4o"},
+            status="completed",
+            total_items=3,
+            organization_id=user_api_key.organization_id,
+            project_id=user_api_key.project_id,
+        )
+        db.add(eval_run)
+        db.commit()
+        db.refresh(eval_run)
+
+        response = client.get(
+            f"/api/v1/evaluations/{eval_run.id}",
+            params={"export_format": "grouped"},  # Missing get_trace_info=true
+            headers=user_api_key_header,
+        )
+
+        assert response.status_code == 400
+        response_data = response.json()
+        error_str = response_data.get(
+            "detail", response_data.get("error", str(response_data))
+        )
+        assert (
+            "export_format" in error_str.lower()
+            and "get_trace_info" in error_str.lower()
+        )
+
+    def test_get_evaluation_run_grouped_format_success(
+        self,
+        client: TestClient,
+        user_api_key_header: dict[str, str],
+        db: Session,
+        user_api_key: TestAuthContext,
+        create_test_dataset: EvaluationDataset,
+    ) -> None:
+        eval_run = EvaluationRun(
+            run_name="test_run",
+            dataset_name=create_test_dataset.name,
+            dataset_id=create_test_dataset.id,
+            config={"model": "gpt-4o"},
+            status="completed",
+            total_items=4,
+            score={
+                "traces": [
+                    {
+                        "trace_id": "trace-1a",
+                        "question_id": 1,
+                        "question": "What is Python?",
+                        "ground_truth_answer": "A programming language",
+                        "llm_answer": "Python is a high-level programming language",
+                        "scores": [
+                            {
+                                "name": "cosine_similarity",
+                                "value": 0.82,
+                                "data_type": "NUMERIC",
+                            }
+                        ],
+                    },
+                    {
+                        "trace_id": "trace-1b",
+                        "question_id": 1,
+                        "question": "What is Python?",
+                        "ground_truth_answer": "A programming language",
+                        "llm_answer": "Python is an interpreted language",
+                        "scores": [
+                            {
+                                "name": "cosine_similarity",
+                                "value": 0.75,
+                                "data_type": "NUMERIC",
+                            }
+                        ],
+                    },
+                    # Row format - 1 trace for question_id=2
+                    {
+                        "trace_id": "trace-2a",
+                        "question_id": 2,
+                        "question": "What is Java?",
+                        "ground_truth_answer": "An OOP language",
+                        "llm_answer": "Java is a statically typed language",
+                        "scores": [
+                            {
+                                "name": "cosine_similarity",
+                                "value": 0.80,
+                                "data_type": "NUMERIC",
+                            }
+                        ],
+                    },
+                ],
+                "summary_scores": [
+                    {
+                        "avg": 0.79,
+                        "std": 0.03,
+                        "name": "cosine_similarity",
+                        "data_type": "NUMERIC",
+                        "total_pairs": 3,
+                    }
+                ],
+            },
+            organization_id=user_api_key.organization_id,
+            project_id=user_api_key.project_id,
+        )
+        db.add(eval_run)
+        db.commit()
+        db.refresh(eval_run)
+
+        response = client.get(
+            f"/api/v1/evaluations/{eval_run.id}",
+            params={
+                "export_format": "grouped",
+                "get_trace_info": True,
+            },
+            headers=user_api_key_header,
+        )
+
+        assert response.status_code == 200
+        response_data = response.json()
+        assert response_data["success"] is True
+        data = response_data["data"]
+        assert data["id"] == eval_run.id
+        assert data["status"] == "completed"
+
+        traces = data["score"]["traces"]
+        assert (
+            isinstance(traces, list)
+            and len(traces) > 0
+            and "llm_answers" in traces[0]
+            and isinstance(traces[0]["llm_answers"], list)
+            and "trace_ids" in traces[0]
+            and isinstance(traces[0]["trace_ids"], list)
+        )
+
 
 class TestGetDataset:
     """Test GET /evaluations/datasets/{dataset_id} endpoint."""
