diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md index 22b9e60d..afc30f19 100644 --- a/INSTRUCTIONS.md +++ b/INSTRUCTIONS.md @@ -241,11 +241,15 @@ uv run evaluate \ --trajectories traces/trajectories \ --scenarios groundtruth/101.json \ --scorer-default llm_judge \ - --judge-model litellm_proxy/aws/claude-opus-4-6 + --judge-model litellm_proxy/azure/gpt-5.4 ``` Output lands under `reports/` — one `.json` per trajectory plus `_aggregate.json` for the rollup. +> [!NOTE] +> If `llm_judge` is used, `--judge-model` must not match the trajectory's `model` +> for any evaluated run. The evaluator now rejects self-judging rows with a clear error. + Scorer families follow MLflow's evaluator/scorer split: `llm_judge` is wired up; `exact_string_match`, `numeric_match`, and `semantic_similarity` ship as skeletons (raise `NotImplementedError`). Full reference — scenario schema, report layout, custom scorers, looping over ground-truth: **[docs/evaluation.md](docs/evaluation.md)**. diff --git a/docs/evaluation.md b/docs/evaluation.md index 12ce4e1e..f7ba3d77 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -87,7 +87,7 @@ uv run evaluate \ --trajectories traces/trajectories \ --scenarios groundtruth/101.json \ --scorer-default llm_judge \ - --judge-model litellm_proxy/aws/claude-opus-4-6 + --judge-model litellm_proxy/azure/gpt-5.4 ``` Output: @@ -230,6 +230,19 @@ free-form `suggestions` (or legacy `reason`) lands in To customise: edit `_PROMPT_TEMPLATE` in `src/evaluation/scorers/llm_judge.py`. +## Self-judging guard + +When `llm_judge` is active and `--judge-model` is provided, evaluation +aborts any row where the trajectory `model` matches the judge model +(after normalizing the `litellm_proxy/` prefix). This avoids +out-of-the-box self-evaluation bias. + +Example error: + +``` +self-judging is not allowed for llm_judge: trajectory model 'litellm_proxy/aws/claude-opus-4-6' matches judge model 'litellm_proxy/aws/claude-opus-4-6' +``` + ## Programmatic use ```python @@ -249,6 +262,18 @@ for r in report.results: print(r.run_id, r.score.passed, r.score.score) ``` +To enforce the self-judging guard in programmatic usage, pass `judge_model`: + +```python +report = Evaluator( + default_scorer="llm_judge", + judge_model="litellm_proxy/azure/gpt-5.4", +).evaluate( + trajectories_path=Path("traces/trajectories"), + scenarios_paths=[Path("groundtruth/101.json")], +) +``` + ## Plug in a custom scorer ```python @@ -285,5 +310,5 @@ uv run evaluate \ --trajectories traces/trajectories \ --scenarios groundtruth/*.json \ --scorer-default llm_judge \ - --judge-model litellm_proxy/aws/claude-opus-4-6 + --judge-model litellm_proxy/azure/gpt-5.4 ``` diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index faf36965..122628a1 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -95,7 +95,10 @@ def main(argv: list[str] | None = None) -> int: _maybe_install_judge(args.judge_model) _validate_scorer_default(args.scorer_default) - report = Evaluator(default_scorer=args.scorer_default).evaluate( + report = Evaluator( + default_scorer=args.scorer_default, + judge_model=args.judge_model, + ).evaluate( trajectories_path=args.trajectories, scenarios_paths=list(args.scenarios), ) diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index dedc8288..27845f08 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -19,7 +19,6 @@ PersistedTrajectory, Scenario, ScenarioResult, - ScorerResult, ) from .report import build_report from .scorers import Scorer @@ -35,8 +34,13 @@ class Evaluator: take precedence. """ - def __init__(self, default_scorer: str = "llm_judge") -> None: + def __init__( + self, + default_scorer: str = "llm_judge", + judge_model: str | None = None, + ) -> None: self.default_scorer = default_scorer + self.judge_model = judge_model def evaluate( self, @@ -58,6 +62,7 @@ def _score_one( ) -> ScenarioResult: name = scenario.scoring_method or self.default_scorer scorer = self._resolve(name) + self._validate_judge_model(name, traj) trajectory_text = _trajectory_to_text(traj) score = scorer(scenario, traj.answer, trajectory_text) @@ -77,6 +82,21 @@ def _score_one( def _resolve(name: str) -> Scorer: return scorer_registry.get(name) + def _validate_judge_model(self, scorer_name: str, traj: PersistedTrajectory) -> None: + if scorer_name != "llm_judge" or not self.judge_model: + return + + trajectory_model = _normalize_model_id(traj.model) + judge_model = _normalize_model_id(self.judge_model) + if not trajectory_model or not judge_model: + return + + if trajectory_model == judge_model: + raise ValueError( + "self-judging is not allowed for llm_judge: " + f"trajectory model '{traj.model}' matches judge model '{self.judge_model}'" + ) + def _trajectory_to_text(traj: PersistedTrajectory) -> str: """Flatten a trajectory to a text blob for the LLM-As-Judge prompt.""" @@ -86,3 +106,12 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str: return json.dumps(traj.trajectory, indent=2, default=str) except (TypeError, ValueError): return str(traj.trajectory) + + +def _normalize_model_id(model_id: str | None) -> str: + if not model_id: + return "" + normalized = model_id.strip() + if normalized.startswith("litellm_proxy/"): + normalized = normalized[len("litellm_proxy/") :] + return normalized diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py index 507cdaa2..c86889b3 100644 --- a/src/evaluation/runner.py +++ b/src/evaluation/runner.py @@ -13,13 +13,17 @@ def evaluate( trajectories_path: Path, scenarios_paths: list[Path], default_scoring_method: str = "llm_judge", + judge_model: str | None = None, ) -> EvalReport: """Load, score, and aggregate. Per-scenario scorer is picked from ``scenario.scoring_method`` when set, falling back to ``default_scoring_method``. """ - return Evaluator(default_scorer=default_scoring_method).evaluate( + return Evaluator( + default_scorer=default_scoring_method, + judge_model=judge_model, + ).evaluate( trajectories_path=trajectories_path, scenarios_paths=scenarios_paths, ) diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index 6d91e633..5cab8665 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -70,3 +70,100 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec assert report.totals["passed"] == 1 assert report.results[0].score.scorer == "stub-evaluator" + + +def test_evaluator_rejects_self_judging_model(tmp_path: Path, make_persisted_record): + trajectories_dir = tmp_path / "trajectories" + trajectories_dir.mkdir() + + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + model="litellm_proxy/aws/claude-opus-4-6", + ) + (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("llm_judge", _stub_scorer) + + try: + Evaluator( + default_scorer="llm_judge", + judge_model="litellm_proxy/aws/claude-opus-4-6", + ).evaluate( + trajectories_path=trajectories_dir, + scenarios_paths=[scenarios_path], + ) + except ValueError as exc: + assert "self-judging is not allowed" in str(exc) + else: + raise AssertionError("expected ValueError for self-judging") + + +def test_evaluator_rejects_self_judging_with_normalized_model_ids( + tmp_path: Path, make_persisted_record +): + trajectories_dir = tmp_path / "trajectories" + trajectories_dir.mkdir() + + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + model="litellm_proxy/aws/claude-opus-4-6", + ) + (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("llm_judge", _stub_scorer) + + try: + Evaluator( + default_scorer="llm_judge", + judge_model="aws/claude-opus-4-6", + ).evaluate( + trajectories_path=trajectories_dir, + scenarios_paths=[scenarios_path], + ) + except ValueError as exc: + assert "self-judging is not allowed" in str(exc) + else: + raise AssertionError("expected ValueError for self-judging") + + +def test_evaluator_allows_non_llm_judge_even_with_matching_model( + tmp_path: Path, make_persisted_record +): + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + model="litellm_proxy/aws/claude-opus-4-6", + ) + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("stub-evaluator", _stub_scorer) + + report = Evaluator( + default_scorer="stub-evaluator", + judge_model="aws/claude-opus-4-6", + ).evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1