15 changes: 10 additions & 5 deletions src/judgeval/__init__.py
@@ -83,15 +83,17 @@ def upload_custom_scorer(
requirements_file_path: Optional[str] = None,
unique_name: Optional[str] = None,
overwrite: bool = False,
is_trace: bool = False,
) -> bool:
"""
Upload custom ExampleScorer from files to backend.
Upload custom ExampleScorer or TraceScorer from files to backend.

Args:
scorer_file_path: Path to Python file containing CustomScorer class
requirements_file_path: Optional path to requirements.txt
unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
overwrite: Whether to overwrite existing scorer if it already exists
is_trace: Whether the scorer is a TraceScorer (default: False for ExampleScorer)

Returns:
bool: True if upload successful
@@ -121,21 +123,23 @@ def upload_custom_scorer(
judgeval_logger.error(error_msg)
raise ValueError(error_msg)

base_class_name = "TraceScorer" if is_trace else "ExampleScorer"

scorer_classes = []
for node in ast.walk(tree):
Comment on lines +127 to 130

[BestPractice]

The logic to find the scorer class by walking the AST has been made more complex by the introduction of the is_trace flag. This section is becoming difficult to read and maintain. Consider extracting the AST parsing and validation logic into a dedicated helper function to improve clarity and separation of concerns.

File: src/judgeval/__init__.py
Line: 130

if isinstance(node, ast.ClassDef):
for base in node.bases:
if (isinstance(base, ast.Name) and base.id == "ExampleScorer") or (
isinstance(base, ast.Attribute) and base.attr == "ExampleScorer"
if (isinstance(base, ast.Name) and base.id == base_class_name) or (
isinstance(base, ast.Attribute) and base.attr == base_class_name
):
scorer_classes.append(node.name)

if len(scorer_classes) > 1:
error_msg = f"Multiple ExampleScorer classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
error_msg = f"Multiple {base_class_name} classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
judgeval_logger.error(error_msg)
raise ValueError(error_msg)
elif len(scorer_classes) == 0:
error_msg = f"No ExampleScorer class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from ExampleScorer."
error_msg = f"No {base_class_name} class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from {base_class_name}."
judgeval_logger.error(error_msg)
raise ValueError(error_msg)
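
A minimal sketch of the helper suggested in the [BestPractice] comment above; the name `_find_scorer_class` and its exact signature are assumptions, not part of this PR:

import ast


def _find_scorer_class(tree: ast.Module, base_class_name: str, scorer_file_path: str) -> str:
    """Return the single class in the parsed file that inherits from base_class_name."""
    scorer_classes = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, ast.ClassDef)
        and any(
            (isinstance(base, ast.Name) and base.id == base_class_name)
            or (isinstance(base, ast.Attribute) and base.attr == base_class_name)
            for base in node.bases
        )
    ]
    if len(scorer_classes) > 1:
        raise ValueError(
            f"Multiple {base_class_name} classes found in {scorer_file_path}: {scorer_classes}. "
            "Please only upload one scorer class per file."
        )
    if not scorer_classes:
        raise ValueError(
            f"No {base_class_name} class was found in {scorer_file_path}. Please ensure the file "
            f"contains a valid scorer class that inherits from {base_class_name}."
        )
    return scorer_classes[0]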

@@ -158,6 +162,7 @@ def upload_custom_scorer(
"scorer_code": scorer_code,
"requirements_text": requirements_text,
"overwrite": overwrite,
"is_trace": is_trace,
}
)
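
For context, a call that exercises the new flag might look like the following; the file paths are placeholders, and the CLI gains a matching --trace/-t option in src/judgeval/cli.py below:

from judgeval import upload_custom_scorer

upload_custom_scorer(
    scorer_file_path="my_trace_scorer.py",       # placeholder path
    requirements_file_path="requirements.txt",   # optional; placeholder
    is_trace=True,  # treat the uploaded class as a TraceScorer
)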

3 changes: 2 additions & 1 deletion src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-25T22:30:20+00:00
# timestamp: 2025-11-07T21:40:15+00:00

from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -68,6 +68,7 @@ class CustomScorerUploadPayload(TypedDict):
scorer_code: str
requirements_text: str
overwrite: NotRequired[bool]
is_trace: NotRequired[bool]


class CustomScorerTemplateResponse(TypedDict):
7 changes: 7 additions & 0 deletions src/judgeval/cli.py
@@ -33,6 +33,12 @@ def upload_scorer(
"-o",
help="Overwrite existing scorer if it already exists",
),
is_trace: bool = typer.Option(
False,
"--trace",
"-t",
help="Specify that the scorer is a trace scorer",
),
):
# Validate file paths
if not Path(scorer_file_path).exists():
@@ -51,6 +57,7 @@ def upload_scorer(
requirements_file_path=requirements_file_path,
unique_name=unique_name,
overwrite=overwrite,
is_trace=is_trace,
)

if not result:
3 changes: 3 additions & 0 deletions src/judgeval/data/__init__.py
@@ -2,6 +2,7 @@
from judgeval.data.scorer_data import ScorerData, create_scorer_data
from judgeval.data.result import ScoringResult, generate_scoring_result
from judgeval.data.trace import TraceUsage
from judgeval.data.otel_trace import TraceData, TraceSpanData


__all__ = [
@@ -12,4 +13,6 @@
"ScoringResult",
"generate_scoring_result",
"TraceUsage",
"TraceData",
"TraceSpanData",
]
3 changes: 2 additions & 1 deletion src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-25T22:30:19+00:00
# timestamp: 2025-11-07T21:40:14+00:00

from __future__ import annotations
from typing import Annotated, Any, Dict, List, Optional, Union
@@ -70,6 +70,7 @@ class CustomScorerUploadPayload(BaseModel):
scorer_code: Annotated[str, Field(title="Scorer Code")]
requirements_text: Annotated[str, Field(title="Requirements Text")]
overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False


class CustomScorerTemplateResponse(BaseModel):
11 changes: 11 additions & 0 deletions src/judgeval/data/otel_trace.py
@@ -0,0 +1,11 @@
from typing import List
from pydantic import BaseModel
from judgeval.data.judgment_types import OtelTraceSpan


class TraceSpanData(OtelTraceSpan):
pass


class TraceData(BaseModel):
trace_spans: List[TraceSpanData]
5 changes: 3 additions & 2 deletions src/judgeval/data/result.py
@@ -1,6 +1,7 @@
from typing import List, Union
from judgeval.data import ScorerData, Example
from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
from judgeval.data.otel_trace import TraceSpanData


class ScoringResult(JudgmentScoringResult):
@@ -17,7 +18,7 @@ class ScoringResult(JudgmentScoringResult):
"""

# Need to override this so that it uses this repo's Example class
data_object: Example
data_object: Union[Example, TraceSpanData]
scorers_data: List[ScorerData]

def model_dump(self, **kwargs):
@@ -34,7 +35,7 @@ def __str__(self) -> str:


def generate_scoring_result(
data_object: Union[Example],
data_object: Union[Example, TraceSpanData],
scorers_data: List[ScorerData],
run_duration: float,
success: bool,
17 changes: 0 additions & 17 deletions src/judgeval/scorers/agent_scorer.py

This file was deleted.

193 changes: 193 additions & 0 deletions src/judgeval/scorers/score_trace.py
@@ -0,0 +1,193 @@
"""
Infrastructure for executing evaluations of `Trace`s using one or more `TraceScorer`s.
"""

import asyncio
import time
from tqdm.asyncio import tqdm_asyncio
from typing import List, Optional, Callable, Union

from judgeval.data import (
ScoringResult,
generate_scoring_result,
create_scorer_data,
)

from judgeval.data.otel_trace import TraceData
from judgeval.scorers.trace_scorer import TraceScorer
from judgeval.scorers.utils import clone_scorers
from judgeval.logger import judgeval_logger
from judgeval.judges import JudgevalJudge
from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


async def safe_a_score_trace(scorer: TraceScorer, trace: TraceData):
"""
Scoring task function when not using a progress indicator!
"Safely" scores an `Trace` using a `TraceScorer` by gracefully handling any exceptions that may occur.
Args:
scorer (TraceScorer): The `TraceScorer` to use for scoring the trace.
trace (Trace): The `Trace` to be scored.
Comment on lines +30 to +31

medium

The type hint for the trace parameter in the docstring is Trace, but the function signature specifies TraceData. To avoid confusion and ensure documentation accuracy, the docstring should be updated to match the signature.

Suggested change
scorer (TraceScorer): The `TraceScorer` to use for scoring the trace.
trace (Trace): The `Trace` to be scored.
scorer (TraceScorer): The `TraceScorer` to use for scoring the trace.
trace (TraceData): The `TraceData` to be scored.

"""
try:
score = await scorer.a_score_trace(trace)
if score is None:
raise Exception("a_score_trace need to return a score")
elif score < 0:
judgeval_logger.warning("score cannot be less than 0 , setting to 0")
score = 0
elif score > 1:
judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
score = 1
else:
scorer.score = score
scorer.success = scorer.success_check()
except Exception as e:
judgeval_logger.error(f"Error during scoring: {str(e)}")
scorer.error = str(e)
scorer.success = False
scorer.score = 0
return


async def a_execute_trace_scoring(

[BestPractice]

[CodeDuplication] The logic in this file for scoring traces is almost a complete duplicate of the logic for scoring examples in src/judgeval/scorers/score.py. The functions safe_a_score_trace, a_execute_trace_scoring, and a_eval_traces_helper are structurally identical to their counterparts in score.py (safe_a_score_example, a_execute_scoring, a_eval_examples_helper).

To avoid code duplication and improve maintainability, consider creating a generic scoring execution utility that can be reused for both ExampleScorer with Examples and TraceScorer with TraceData.

File: src/judgeval/scorers/score_trace.py
Line: 55

traces: List[TraceData],
scorers: List[TraceScorer],
model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
ignore_errors: bool = False,
throttle_value: int = 0,
max_concurrent: int = 100,
show_progress: bool = True,
) -> List[ScoringResult]:
"""
Executes evaluations of `Trace`s asynchronously using one or more `TraceScorer`s.
Each `Trace` will be evaluated by all of the `TraceScorer`s in the `scorers` list.
Args:
traces (List[List[TraceSpan]]): A list of `TraceSpan` objects to be evaluated.

medium

The type hint for the traces parameter in the docstring is List[List[TraceSpan]], but the function signature uses List[TraceData]. This is inconsistent and could be misleading for developers using this function. Please update the docstring to match the function signature.

Suggested change
traces (List[List[TraceSpan]]): A list of `TraceSpan` objects to be evaluated.
traces (List[TraceData]): A list of `TraceData` objects to be evaluated.

scorers (List[TraceScorer]): A list of `TraceScorer` objects to evaluate the traces.
ignore_errors (bool): Whether to ignore errors during evaluation.
throttle_value (int): The amount of time to wait between starting each task.
max_concurrent (int): The maximum number of concurrent tasks.
Returns:
List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
"""
semaphore = asyncio.Semaphore(max_concurrent)

async def execute_with_semaphore(func: Callable, *args, **kwargs):
async with semaphore:
try:
return await func(*args, **kwargs)
except Exception as e:
judgeval_logger.error(f"Error executing function: {e}")
if kwargs.get("ignore_errors", False):
return None
raise

for scorer in scorers:
if not scorer.model and isinstance(model, str):
scorer._add_model(model)

scoring_results: List[Optional[ScoringResult]] = [None for _ in traces]
tasks = []

if show_progress:
with tqdm_asyncio(
desc=f"Evaluating {len(traces)} trace(s) in parallel",
unit="TraceData",
total=len(traces),
bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
) as pbar:
for i, trace in enumerate(traces):
if isinstance(trace, TraceData):
if len(scorers) == 0:
pbar.update(1)
continue

cloned_scorers = clone_scorers(scorers) # type: ignore
task = execute_with_semaphore(
func=a_eval_traces_helper,
scorers=cloned_scorers,
trace=trace,
scoring_results=scoring_results,
score_index=i,
ignore_errors=ignore_errors,
pbar=pbar,
)
tasks.append(asyncio.create_task(task))

await asyncio.sleep(throttle_value)
await asyncio.gather(*tasks)
else:
for i, trace in enumerate(traces):
if isinstance(trace, TraceData):
if len(scorers) == 0:
continue

cloned_scorers = clone_scorers(scorers) # type: ignore
task = execute_with_semaphore(
func=a_eval_traces_helper,
scorers=cloned_scorers,
trace=trace,
scoring_results=scoring_results,
score_index=i,
ignore_errors=ignore_errors,
pbar=None,
)
tasks.append(asyncio.create_task(task))

await asyncio.sleep(throttle_value)
await asyncio.gather(*tasks)
Comment on lines +97 to +143

medium

There is significant code duplication between the if show_progress: and else: blocks. The core logic for iterating through traces, cloning scorers, and creating asynchronous tasks is identical in both branches. This duplication makes the code harder to maintain, as any change needs to be applied in two places.

To improve maintainability, consider refactoring this to a single loop. You could use a context manager for the progress bar that does nothing when show_progress is False, allowing you to unify the logic. For example:

import contextlib

# ...

progress_context = tqdm_asyncio(...) if show_progress else contextlib.nullcontext()
with progress_context as pbar:
    for i, trace in enumerate(traces):
        # ... common logic for creating tasks ...
        # pass pbar to helper, which can handle if it's None

return [result for result in scoring_results if result is not None]


async def a_eval_traces_helper(
scorers: List[TraceScorer],
trace: TraceData,
scoring_results: List[ScoringResult],
score_index: int,
ignore_errors: bool,
pbar: Optional[tqdm_asyncio] = None,
) -> None:
"""
Evaluate a single trace asynchronously using a list of scorers.
Args:
scorers (List[TraceScorer]): List of TraceScorer objects to evaluate the trace.
trace (Trace): The trace to be evaluated.
scoring_results (List[TestResult]): List to store the scoring results.

medium

The type hint for the scoring_results parameter in the docstring is List[TestResult]. This appears to be a copy-paste error, as the function signature correctly types it as List[ScoringResult]. Please correct the docstring to maintain consistency and avoid confusion.

Suggested change
scoring_results (List[TestResult]): List to store the scoring results.
scoring_results (List[ScoringResult]): List to store the scoring results.

score_index (int): Index at which the result should be stored in scoring_results.
ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
Returns:
None
"""
# scoring the Trace
scoring_start_time = time.perf_counter()

tasks = [safe_a_score_trace(scorer, trace) for scorer in scorers]
await asyncio.gather(*tasks)

success = True
scorer_data_list = []
for scorer in scorers:
if getattr(scorer, "skipped", False):
continue
scorer_data = create_scorer_data(scorer)
for s in scorer_data:
success = success and s.success
scorer_data_list.extend(scorer_data)

scoring_end_time = time.perf_counter()
run_duration = scoring_end_time - scoring_start_time

scoring_result = generate_scoring_result(
trace.trace_spans[0], scorer_data_list, run_duration, success
)
scoring_results[score_index] = scoring_result
Comment on lines +187 to +190

high

Accessing trace.trace_spans[0] assumes that the trace_spans list is never empty. If a TraceData object is passed with an empty trace_spans list, this will raise an IndexError and crash the scoring process for that trace. It's crucial to add a check to handle this edge case gracefully to prevent runtime errors.

Suggested change
scoring_result = generate_scoring_result(
trace.trace_spans[0], scorer_data_list, run_duration, success
)
scoring_results[score_index] = scoring_result
if not trace.trace_spans:
judgeval_logger.warning("Trace contains no spans. Skipping result generation for this trace.")
return
scoring_result = generate_scoring_result(
trace.trace_spans[0], scorer_data_list, run_duration, success
)
scoring_results[score_index] = scoring_result


if pbar is not None:
pbar.update(1)
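
A rough shape for the shared utility the [CodeDuplication] comment above asks for; the name `score_items`, its parameters, and its return value are illustrative assumptions (scorer cloning and ScoringResult construction are omitted for brevity):

import asyncio
import time
from typing import Awaitable, Callable, List, Optional, Sequence, Tuple, TypeVar

T = TypeVar("T")  # Example or TraceData


async def score_items(
    items: Sequence[T],
    scorers: List,
    score_one: Callable[[object, T], Awaitable[None]],  # e.g. safe_a_score_example or safe_a_score_trace
    max_concurrent: int = 100,
    throttle_value: float = 0.0,
) -> List[Tuple[T, float]]:
    """Run every scorer against every item with bounded concurrency."""
    semaphore = asyncio.Semaphore(max_concurrent)
    results: List[Optional[Tuple[T, float]]] = [None] * len(items)

    async def run_one(index: int, item: T) -> None:
        async with semaphore:
            start = time.perf_counter()
            # Each scorer records score/success/error on itself, as the
            # existing safe_a_score_* helpers do.
            await asyncio.gather(*(score_one(scorer, item) for scorer in scorers))
            results[index] = (item, time.perf_counter() - start)

    tasks = []
    for i, item in enumerate(items):
        tasks.append(asyncio.create_task(run_one(i, item)))
        await asyncio.sleep(throttle_value)
    await asyncio.gather(*tasks)
    return [r for r in results if r is not None]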
14 changes: 14 additions & 0 deletions src/judgeval/scorers/trace_scorer.py
@@ -0,0 +1,14 @@
from judgeval.scorers.base_scorer import BaseScorer
from judgeval.data import TraceData


class TraceScorer(BaseScorer):
score_type: str = "Custom Trace"

async def a_score_trace(self, trace: TraceData, *args, **kwargs) -> float:
"""
Asynchronously measures the score on a single trace
"""
raise NotImplementedError(
"You must implement the `a_score_trace` method in your custom scorer"
)
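
For reference, a minimal scorer file a user might upload with the new flag could look like this; the class name, the name field, and the span-count heuristic are assumptions, not part of this PR:

from judgeval.data import TraceData
from judgeval.scorers.trace_scorer import TraceScorer


class SpanCountScorer(TraceScorer):
    # `name` is assumed to be how unique_name is auto-detected, per the
    # upload_custom_scorer docstring above.
    name: str = "span_count"

    async def a_score_trace(self, trace: TraceData, *args, **kwargs) -> float:
        # score_trace.py clamps results to [0, 1], so return a value in that range.
        return 1.0 if len(trace.trace_spans) <= 20 else 0.0

Such a file would then be uploaded with is_trace=True (or the CLI's --trace flag).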