Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion INSTRUCTIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,15 @@ uv run evaluate \
--trajectories traces/trajectories \
--scenarios groundtruth/101.json \
--scorer-default llm_judge \
--judge-model litellm_proxy/aws/claude-opus-4-6
--judge-model litellm_proxy/azure/gpt-5.4
```

Output lands under `reports/` — one `<run_id>.json` per trajectory plus `_aggregate.json` for the rollup.

> [!NOTE]
> If `llm_judge` is used, `--judge-model` must not match the trajectory's `model`
> for any evaluated run. The evaluator now rejects self-judging rows with a clear error.

Scorer families follow MLflow's evaluator/scorer split: `llm_judge` is wired up; `exact_string_match`, `numeric_match`, and `semantic_similarity` ship as skeletons (raise `NotImplementedError`).

Full reference — scenario schema, report layout, custom scorers, looping over ground-truth: **[docs/evaluation.md](docs/evaluation.md)**.
Expand Down
29 changes: 27 additions & 2 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ uv run evaluate \
--trajectories traces/trajectories \
--scenarios groundtruth/101.json \
--scorer-default llm_judge \
--judge-model litellm_proxy/aws/claude-opus-4-6
--judge-model litellm_proxy/azure/gpt-5.4
```

Output:
Expand Down Expand Up @@ -230,6 +230,19 @@ free-form `suggestions` (or legacy `reason`) lands in
To customise: edit `_PROMPT_TEMPLATE` in
`src/evaluation/scorers/llm_judge.py`.

## Self-judging guard

When `llm_judge` is active and `--judge-model` is provided, evaluation
aborts any row where the trajectory `model` matches the judge model
(after normalizing the `litellm_proxy/` prefix). This avoids
out-of-the-box self-evaluation bias.

Example error:

```
self-judging is not allowed for llm_judge: trajectory model 'litellm_proxy/aws/claude-opus-4-6' matches judge model 'litellm_proxy/aws/claude-opus-4-6'
```

## Programmatic use

```python
Expand All @@ -249,6 +262,18 @@ for r in report.results:
print(r.run_id, r.score.passed, r.score.score)
```

To enforce the self-judging guard in programmatic usage, pass `judge_model`:

```python
report = Evaluator(
default_scorer="llm_judge",
judge_model="litellm_proxy/azure/gpt-5.4",
).evaluate(
trajectories_path=Path("traces/trajectories"),
scenarios_paths=[Path("groundtruth/101.json")],
)
```

## Plug in a custom scorer

```python
Expand Down Expand Up @@ -285,5 +310,5 @@ uv run evaluate \
--trajectories traces/trajectories \
--scenarios groundtruth/*.json \
--scorer-default llm_judge \
--judge-model litellm_proxy/aws/claude-opus-4-6
--judge-model litellm_proxy/azure/gpt-5.4
```
5 changes: 4 additions & 1 deletion src/evaluation/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,10 @@ def main(argv: list[str] | None = None) -> int:
_maybe_install_judge(args.judge_model)
_validate_scorer_default(args.scorer_default)

report = Evaluator(default_scorer=args.scorer_default).evaluate(
report = Evaluator(
default_scorer=args.scorer_default,
judge_model=args.judge_model,
).evaluate(
trajectories_path=args.trajectories,
scenarios_paths=list(args.scenarios),
)
Expand Down
33 changes: 31 additions & 2 deletions src/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
PersistedTrajectory,
Scenario,
ScenarioResult,
ScorerResult,
)
from .report import build_report
from .scorers import Scorer
Expand All @@ -35,8 +34,13 @@ class Evaluator:
take precedence.
"""

def __init__(self, default_scorer: str = "llm_judge") -> None:
def __init__(
self,
default_scorer: str = "llm_judge",
judge_model: str | None = None,
) -> None:
self.default_scorer = default_scorer
self.judge_model = judge_model

def evaluate(
self,
Expand All @@ -58,6 +62,7 @@ def _score_one(
) -> ScenarioResult:
name = scenario.scoring_method or self.default_scorer
scorer = self._resolve(name)
self._validate_judge_model(name, traj)
trajectory_text = _trajectory_to_text(traj)
score = scorer(scenario, traj.answer, trajectory_text)

Expand All @@ -77,6 +82,21 @@ def _score_one(
def _resolve(name: str) -> Scorer:
return scorer_registry.get(name)

def _validate_judge_model(self, scorer_name: str, traj: PersistedTrajectory) -> None:
if scorer_name != "llm_judge" or not self.judge_model:
return

trajectory_model = _normalize_model_id(traj.model)
judge_model = _normalize_model_id(self.judge_model)
if not trajectory_model or not judge_model:
return

if trajectory_model == judge_model:
raise ValueError(
"self-judging is not allowed for llm_judge: "
f"trajectory model '{traj.model}' matches judge model '{self.judge_model}'"
)


def _trajectory_to_text(traj: PersistedTrajectory) -> str:
"""Flatten a trajectory to a text blob for the LLM-As-Judge prompt."""
Expand All @@ -86,3 +106,12 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str:
return json.dumps(traj.trajectory, indent=2, default=str)
except (TypeError, ValueError):
return str(traj.trajectory)


def _normalize_model_id(model_id: str | None) -> str:
if not model_id:
return ""
normalized = model_id.strip()
if normalized.startswith("litellm_proxy/"):
normalized = normalized[len("litellm_proxy/") :]
return normalized
6 changes: 5 additions & 1 deletion src/evaluation/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@ def evaluate(
trajectories_path: Path,
scenarios_paths: list[Path],
default_scoring_method: str = "llm_judge",
judge_model: str | None = None,
) -> EvalReport:
"""Load, score, and aggregate.

Per-scenario scorer is picked from ``scenario.scoring_method`` when
set, falling back to ``default_scoring_method``.
"""
return Evaluator(default_scorer=default_scoring_method).evaluate(
return Evaluator(
default_scorer=default_scoring_method,
judge_model=judge_model,
).evaluate(
trajectories_path=trajectories_path,
scenarios_paths=scenarios_paths,
)
97 changes: 97 additions & 0 deletions src/evaluation/tests/test_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,100 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec

assert report.totals["passed"] == 1
assert report.results[0].score.scorer == "stub-evaluator"


def test_evaluator_rejects_self_judging_model(tmp_path: Path, make_persisted_record):
trajectories_dir = tmp_path / "trajectories"
trajectories_dir.mkdir()

rec = make_persisted_record(
run_id="run-1",
scenario_id=1,
model="litellm_proxy/aws/claude-opus-4-6",
)
(trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")

scenarios_path = tmp_path / "scenarios.json"
scenarios_path.write_text(
json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
encoding="utf-8",
)

registry.register("llm_judge", _stub_scorer)

try:
Evaluator(
default_scorer="llm_judge",
judge_model="litellm_proxy/aws/claude-opus-4-6",
).evaluate(
trajectories_path=trajectories_dir,
scenarios_paths=[scenarios_path],
)
except ValueError as exc:
assert "self-judging is not allowed" in str(exc)
else:
raise AssertionError("expected ValueError for self-judging")


def test_evaluator_rejects_self_judging_with_normalized_model_ids(
tmp_path: Path, make_persisted_record
):
trajectories_dir = tmp_path / "trajectories"
trajectories_dir.mkdir()

rec = make_persisted_record(
run_id="run-1",
scenario_id=1,
model="litellm_proxy/aws/claude-opus-4-6",
)
(trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")

scenarios_path = tmp_path / "scenarios.json"
scenarios_path.write_text(
json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
encoding="utf-8",
)

registry.register("llm_judge", _stub_scorer)

try:
Evaluator(
default_scorer="llm_judge",
judge_model="aws/claude-opus-4-6",
).evaluate(
trajectories_path=trajectories_dir,
scenarios_paths=[scenarios_path],
)
except ValueError as exc:
assert "self-judging is not allowed" in str(exc)
else:
raise AssertionError("expected ValueError for self-judging")


def test_evaluator_allows_non_llm_judge_even_with_matching_model(
tmp_path: Path, make_persisted_record
):
rec = make_persisted_record(
run_id="run-1",
scenario_id=1,
model="litellm_proxy/aws/claude-opus-4-6",
)
(tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")

scenarios_path = tmp_path / "scenarios.json"
scenarios_path.write_text(
json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
encoding="utf-8",
)

registry.register("stub-evaluator", _stub_scorer)

report = Evaluator(
default_scorer="stub-evaluator",
judge_model="aws/claude-opus-4-6",
).evaluate(
trajectories_path=tmp_path,
scenarios_paths=[scenarios_path],
)

assert report.totals["passed"] == 1