15 changes: 10 additions & 5 deletions src/judgeval/__init__.py
@@ -83,15 +83,17 @@ def upload_custom_scorer(
requirements_file_path: Optional[str] = None,
unique_name: Optional[str] = None,
overwrite: bool = False,
is_trace: bool = False,
) -> bool:
"""
Upload custom ExampleScorer from files to backend.
Upload custom ExampleScorer or TraceScorer from files to backend.

Args:
scorer_file_path: Path to Python file containing CustomScorer class
requirements_file_path: Optional path to requirements.txt
unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
overwrite: Whether to overwrite existing scorer if it already exists
is_trace: Whether the scorer is a TraceScorer (default: False for ExampleScorer)

Returns:
bool: True if upload successful
@@ -121,21 +123,23 @@ def upload_custom_scorer(
judgeval_logger.error(error_msg)
raise ValueError(error_msg)

base_class_name = "TraceScorer" if is_trace else "ExampleScorer"

scorer_classes = []
for node in ast.walk(tree):
Comment on lines +127 to 130

[BestPractice]

The logic to find the scorer class by walking the AST has been made more complex by the introduction of the is_trace flag. This section is becoming difficult to read and maintain. Consider extracting the AST parsing and validation logic into a dedicated helper function to improve clarity and separation of concerns.

File: src/judgeval/__init__.py
Line: 130

if isinstance(node, ast.ClassDef):
for base in node.bases:
if (isinstance(base, ast.Name) and base.id == "ExampleScorer") or (
isinstance(base, ast.Attribute) and base.attr == "ExampleScorer"
if (isinstance(base, ast.Name) and base.id == base_class_name) or (
isinstance(base, ast.Attribute) and base.attr == base_class_name
):
scorer_classes.append(node.name)

if len(scorer_classes) > 1:
error_msg = f"Multiple ExampleScorer classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
error_msg = f"Multiple {base_class_name} classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
judgeval_logger.error(error_msg)
raise ValueError(error_msg)
elif len(scorer_classes) == 0:
error_msg = f"No ExampleScorer class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from ExampleScorer."
error_msg = f"No {base_class_name} class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from {base_class_name}."
judgeval_logger.error(error_msg)
raise ValueError(error_msg)
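
A minimal sketch of the helper suggested in the [BestPractice] comment above; the name `_find_scorer_class` and its exact signature are assumptions, not part of this PR:

import ast


def _find_scorer_class(tree: ast.Module, base_class_name: str, scorer_file_path: str) -> str:
    """Return the single class in the parsed file that inherits from base_class_name."""
    scorer_classes = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, ast.ClassDef)
        and any(
            (isinstance(base, ast.Name) and base.id == base_class_name)
            or (isinstance(base, ast.Attribute) and base.attr == base_class_name)
            for base in node.bases
        )
    ]
    if len(scorer_classes) > 1:
        raise ValueError(
            f"Multiple {base_class_name} classes found in {scorer_file_path}: {scorer_classes}. "
            "Please only upload one scorer class per file."
        )
    if not scorer_classes:
        raise ValueError(
            f"No {base_class_name} class was found in {scorer_file_path}. Please ensure the file "
            f"contains a valid scorer class that inherits from {base_class_name}."
        )
    return scorer_classes[0]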

@@ -158,6 +162,7 @@ def upload_custom_scorer(
"scorer_code": scorer_code,
"requirements_text": requirements_text,
"overwrite": overwrite,
"is_trace": is_trace,
}
)
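
For context, a call that exercises the new flag might look like the following; the file paths are placeholders, and the CLI gains a matching --trace/-t option in src/judgeval/cli.py below:

from judgeval import upload_custom_scorer

upload_custom_scorer(
    scorer_file_path="my_trace_scorer.py",       # placeholder path
    requirements_file_path="requirements.txt",   # optional; placeholder
    is_trace=True,  # treat the uploaded class as a TraceScorer
)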

3 changes: 2 additions & 1 deletion src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-25T22:30:20+00:00
# timestamp: 2025-11-07T21:40:15+00:00

from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -68,6 +68,7 @@ class CustomScorerUploadPayload(TypedDict):
scorer_code: str
requirements_text: str
overwrite: NotRequired[bool]
is_trace: NotRequired[bool]


class CustomScorerTemplateResponse(TypedDict):
7 changes: 7 additions & 0 deletions src/judgeval/cli.py
@@ -33,6 +33,12 @@ def upload_scorer(
"-o",
help="Overwrite existing scorer if it already exists",
),
is_trace: bool = typer.Option(
False,
"--trace",
"-t",
help="Specify that the scorer is a trace scorer",
),
):
# Validate file paths
if not Path(scorer_file_path).exists():
@@ -51,6 +57,7 @@ def upload_scorer(
requirements_file_path=requirements_file_path,
unique_name=unique_name,
overwrite=overwrite,
is_trace=is_trace,
)

if not result:
3 changes: 3 additions & 0 deletions src/judgeval/data/__init__.py
@@ -2,6 +2,7 @@
from judgeval.data.scorer_data import ScorerData, create_scorer_data
from judgeval.data.result import ScoringResult, generate_scoring_result
from judgeval.data.trace import TraceUsage
from judgeval.data.otel_trace import TraceData, TraceSpanData


__all__ = [
@@ -12,4 +13,6 @@
"ScoringResult",
"generate_scoring_result",
"TraceUsage",
"TraceData",
"TraceSpanData",
]
3 changes: 2 additions & 1 deletion src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-25T22:30:19+00:00
# timestamp: 2025-11-07T21:40:14+00:00

from __future__ import annotations
from typing import Annotated, Any, Dict, List, Optional, Union
@@ -70,6 +70,7 @@ class CustomScorerUploadPayload(BaseModel):
scorer_code: Annotated[str, Field(title="Scorer Code")]
requirements_text: Annotated[str, Field(title="Requirements Text")]
overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False


class CustomScorerTemplateResponse(BaseModel):
11 changes: 11 additions & 0 deletions src/judgeval/data/otel_trace.py
@@ -0,0 +1,11 @@
from typing import List
from pydantic import BaseModel
from judgeval.data.judgment_types import OtelTraceSpan


class TraceSpanData(OtelTraceSpan):
pass


class TraceData(BaseModel):
trace_spans: List[TraceSpanData]
5 changes: 3 additions & 2 deletions src/judgeval/data/result.py
@@ -1,6 +1,7 @@
from typing import List, Union
from judgeval.data import ScorerData, Example
from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
from judgeval.data.otel_trace import TraceSpanData


class ScoringResult(JudgmentScoringResult):
@@ -17,7 +18,7 @@ class ScoringResult(JudgmentScoringResult):
"""

# Need to override this so that it uses this repo's Example class
data_object: Example
data_object: Union[Example, TraceSpanData]
scorers_data: List[ScorerData]

def model_dump(self, **kwargs):
@@ -34,7 +35,7 @@ def __str__(self) -> str:


def generate_scoring_result(
data_object: Union[Example],
data_object: Union[Example, TraceSpanData],
scorers_data: List[ScorerData],
run_duration: float,
success: bool,
17 changes: 0 additions & 17 deletions src/judgeval/scorers/agent_scorer.py

This file was deleted.

193 changes: 193 additions & 0 deletions src/judgeval/scorers/score_trace.py
@@ -0,0 +1,193 @@
"""
Infrastructure for executing evaluations of `Trace`s using one or more `TraceScorer`s.
"""

import asyncio
import time
from tqdm.asyncio import tqdm_asyncio
from typing import List, Optional, Callable, Union

from judgeval.data import (
ScoringResult,
generate_scoring_result,
create_scorer_data,
)

from judgeval.data.otel_trace import TraceData
from judgeval.scorers.trace_scorer import TraceScorer
from judgeval.scorers.utils import clone_scorers
from judgeval.logger import judgeval_logger
from judgeval.judges import JudgevalJudge
from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


async def safe_a_score_trace(scorer: TraceScorer, trace: TraceData):
"""
Scoring task function when not using a progress indicator!
"Safely" scores an `Trace` using a `TraceScorer` by gracefully handling any exceptions that may occur.
Args:
scorer (TraceScorer): The `TraceScorer` to use for scoring the trace.
trace (Trace): The `Trace` to be scored.
Comment on lines +30 to +31

medium

The type hint for the trace parameter in the docstring is Trace, but the function signature specifies TraceData. To avoid confusion and ensure documentation accuracy, the docstring should be updated to match the signature.

Suggested change
scorer (TraceScorer): The `TraceScorer` to use for scoring the trace.
trace (Trace): The `Trace` to be scored.
scorer (TraceScorer): The `TraceScorer` to use for scoring the trace.
trace (TraceData): The `TraceData` to be scored.

"""
try:
score = await scorer.a_score_trace(trace)
if score is None:
raise Exception("a_score_trace need to return a score")
elif score < 0:
judgeval_logger.warning("score cannot be less than 0 , setting to 0")
score = 0
elif score > 1:
judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
score = 1
else:
scorer.score = score
scorer.success = scorer.success_check()
except Exception as e:
judgeval_logger.error(f"Error during scoring: {str(e)}")
scorer.error = str(e)
scorer.success = False
scorer.score = 0
return


async def a_execute_trace_scoring(

[BestPractice]

[CodeDuplication] The logic in this file for scoring traces is almost a complete duplicate of the logic for scoring examples in src/judgeval/scorers/score.py. The functions safe_a_score_trace, a_execute_trace_scoring, and a_eval_traces_helper are structurally identical to their counterparts in score.py (safe_a_score_example, a_execute_scoring, a_eval_examples_helper).

To avoid code duplication and improve maintainability, consider creating a generic scoring execution utility that can be reused for both ExampleScorer with Examples and TraceScorer with TraceData.

File: src/judgeval/scorers/score_trace.py
Line: 55

traces: List[TraceData],
scorers: List[TraceScorer],
model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
ignore_errors: bool = False,
throttle_value: int = 0,
max_concurrent: int = 100,
show_progress: bool = True,
) -> List[ScoringResult]:
"""
Executes evaluations of `Trace`s asynchronously using one or more `TraceScorer`s.
Each `Trace` will be evaluated by all of the `TraceScorer`s in the `scorers` list.
Args:
traces (List[List[TraceSpan]]): A list of `TraceSpan` objects to be evaluated.

medium

The type hint for the traces parameter in the docstring is List[List[TraceSpan]], but the function signature uses List[TraceData]. This is inconsistent and could be misleading for developers using this function. Please update the docstring to match the function signature.

Suggested change
traces (List[List[TraceSpan]]): A list of `TraceSpan` objects to be evaluated.
traces (List[TraceData]): A list of `TraceData` objects to be evaluated.

scorers (List[TraceScorer]): A list of `TraceScorer` objects to evaluate the traces.
ignore_errors (bool): Whether to ignore errors during evaluation.
throttle_value (int): The amount of time to wait between starting each task.
max_concurrent (int): The maximum number of concurrent tasks.
Returns:
List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
"""
semaphore = asyncio.Semaphore(max_concurrent)

async def execute_with_semaphore(func: Callable, *args, **kwargs):
async with semaphore:
try:
return await func(*args, **kwargs)
except Exception as e:
judgeval_logger.error(f"Error executing function: {e}")
if kwargs.get("ignore_errors", False):
return None
raise

for scorer in scorers:
if not scorer.model and isinstance(model, str):
scorer._add_model(model)

scoring_results: List[Optional[ScoringResult]] = [None for _ in traces]
tasks = []

if show_progress:
with tqdm_asyncio(
desc=f"Evaluating {len(traces)} trace(s) in parallel",
unit="TraceData",
total=len(traces),
bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
) as pbar:
for i, trace in enumerate(traces):
if isinstance(trace, TraceData):
if len(scorers) == 0:
pbar.update(1)
continue

cloned_scorers = clone_scorers(scorers) # type: ignore
task = execute_with_semaphore(
func=a_eval_traces_helper,
scorers=cloned_scorers,
trace=trace,
scoring_results=scoring_results,
score_index=i,
ignore_errors=ignore_errors,
pbar=pbar,
)
tasks.append(asyncio.create_task(task))

await asyncio.sleep(throttle_value)
await asyncio.gather(*tasks)
else:
for i, trace in enumerate(traces):
if isinstance(trace, TraceData):
if len(scorers) == 0:
continue

cloned_scorers = clone_scorers(scorers) # type: ignore
task = execute_with_semaphore(
func=a_eval_traces_helper,
scorers=cloned_scorers,
trace=trace,
scoring_results=scoring_results,
score_index=i,
ignore_errors=ignore_errors,
pbar=None,
)
tasks.append(asyncio.create_task(task))

await asyncio.sleep(throttle_value)
await asyncio.gather(*tasks)
Comment on lines +97 to +143

medium

There is significant code duplication between the if show_progress: and else: blocks. The core logic for iterating through traces, cloning scorers, and creating asynchronous tasks is identical in both branches. This duplication makes the code harder to maintain, as any change needs to be applied in two places.

To improve maintainability, consider refactoring this to a single loop. You could use a context manager for the progress bar that does nothing when show_progress is False, allowing you to unify the logic. For example:

import contextlib

# ...

progress_context = tqdm_asyncio(...) if show_progress else contextlib.nullcontext()
with progress_context as pbar:
    for i, trace in enumerate(traces):
        # ... common logic for creating tasks ...
        # pass pbar to helper, which can handle if it's None

return [result for result in scoring_results if result is not None]


async def a_eval_traces_helper(
scorers: List[TraceScorer],
trace: TraceData,
scoring_results: List[ScoringResult],
score_index: int,
ignore_errors: bool,
pbar: Optional[tqdm_asyncio] = None,
) -> None:
"""
Evaluate a single trace asynchronously using a list of scorers.
Args:
scorers (List[TraceScorer]): List of TraceScorer objects to evaluate the trace.
trace (Trace): The trace to be evaluated.
scoring_results (List[TestResult]): List to store the scoring results.

medium

The type hint for the scoring_results parameter in the docstring is List[TestResult]. This appears to be a copy-paste error, as the function signature correctly types it as List[ScoringResult]. Please correct the docstring to maintain consistency and avoid confusion.

Suggested change
scoring_results (List[TestResult]): List to store the scoring results.
scoring_results (List[ScoringResult]): List to store the scoring results.

score_index (int): Index at which the result should be stored in scoring_results.
ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
Returns:
None
"""
# scoring the Trace
scoring_start_time = time.perf_counter()

tasks = [safe_a_score_trace(scorer, trace) for scorer in scorers]
await asyncio.gather(*tasks)

success = True
scorer_data_list = []
for scorer in scorers:
if getattr(scorer, "skipped", False):
continue
scorer_data = create_scorer_data(scorer)
for s in scorer_data:
success = success and s.success
scorer_data_list.extend(scorer_data)

scoring_end_time = time.perf_counter()
run_duration = scoring_end_time - scoring_start_time

scoring_result = generate_scoring_result(
trace.trace_spans[0], scorer_data_list, run_duration, success
)
scoring_results[score_index] = scoring_result
Comment on lines +187 to +190

high

Accessing trace.trace_spans[0] assumes that the trace_spans list is never empty. If a TraceData object is passed with an empty trace_spans list, this will raise an IndexError and crash the scoring process for that trace. It's crucial to add a check to handle this edge case gracefully to prevent runtime errors.

Suggested change
scoring_result = generate_scoring_result(
trace.trace_spans[0], scorer_data_list, run_duration, success
)
scoring_results[score_index] = scoring_result
if not trace.trace_spans:
judgeval_logger.warning("Trace contains no spans. Skipping result generation for this trace.")
return
scoring_result = generate_scoring_result(
trace.trace_spans[0], scorer_data_list, run_duration, success
)
scoring_results[score_index] = scoring_result


if pbar is not None:
pbar.update(1)
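
A rough shape for the shared utility the [CodeDuplication] comment above asks for; the name `score_items`, its parameters, and its return value are illustrative assumptions (scorer cloning and ScoringResult construction are omitted for brevity):

import asyncio
import time
from typing import Awaitable, Callable, List, Optional, Sequence, Tuple, TypeVar

T = TypeVar("T")  # Example or TraceData


async def score_items(
    items: Sequence[T],
    scorers: List,
    score_one: Callable[[object, T], Awaitable[None]],  # e.g. safe_a_score_example or safe_a_score_trace
    max_concurrent: int = 100,
    throttle_value: float = 0.0,
) -> List[Tuple[T, float]]:
    """Run every scorer against every item with bounded concurrency."""
    semaphore = asyncio.Semaphore(max_concurrent)
    results: List[Optional[Tuple[T, float]]] = [None] * len(items)

    async def run_one(index: int, item: T) -> None:
        async with semaphore:
            start = time.perf_counter()
            # Each scorer records score/success/error on itself, as the
            # existing safe_a_score_* helpers do.
            await asyncio.gather(*(score_one(scorer, item) for scorer in scorers))
            results[index] = (item, time.perf_counter() - start)

    tasks = []
    for i, item in enumerate(items):
        tasks.append(asyncio.create_task(run_one(i, item)))
        await asyncio.sleep(throttle_value)
    await asyncio.gather(*tasks)
    return [r for r in results if r is not None]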
14 changes: 14 additions & 0 deletions src/judgeval/scorers/trace_scorer.py
@@ -0,0 +1,14 @@
from judgeval.scorers.base_scorer import BaseScorer
from judgeval.data import TraceData


class TraceScorer(BaseScorer):
score_type: str = "Custom Trace"

async def a_score_trace(self, trace: TraceData, *args, **kwargs) -> float:
"""
Asynchronously measures the score on a single trace
"""
raise NotImplementedError(
"You must implement the `a_score_trace` method in your custom scorer"
)
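
For reference, a minimal scorer file a user might upload with the new flag could look like this; the class name, the name field, and the span-count heuristic are assumptions, not part of this PR:

from judgeval.data import TraceData
from judgeval.scorers.trace_scorer import TraceScorer


class SpanCountScorer(TraceScorer):
    # `name` is assumed to be how unique_name is auto-detected, per the
    # upload_custom_scorer docstring above.
    name: str = "span_count"

    async def a_score_trace(self, trace: TraceData, *args, **kwargs) -> float:
        # score_trace.py clamps results to [0, 1], so return a value in that range.
        return 1.0 if len(trace.trace_spans) <= 20 else 0.0

Such a file would then be uploaded with is_trace=True (or the CLI's --trace flag).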