diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py
index 9ce8ca24..21ed45ed 100644
--- a/src/layerlens/_version.py
+++ b/src/layerlens/_version.py
@@ -1,4 +1,4 @@
-__version__ = "1.7.0"
+__version__ = "1.8.0"
 
 # Will be templated during the build
 __git_commit__ = "__GIT_COMMIT__"
diff --git a/src/layerlens/resources/comparisons/comparisons.py b/src/layerlens/resources/comparisons/comparisons.py
index ce620778..537171df 100644
--- a/src/layerlens/resources/comparisons/comparisons.py
+++ b/src/layerlens/resources/comparisons/comparisons.py
@@ -18,6 +18,17 @@ def _find_evaluation_id(response: Optional[EvaluationsResponse], model_id: str,
     return str(response.evaluations[0].id)
 
 
+def _require_one_of(id_value: Optional[str], key_value: Optional[str], id_name: str, key_name: str) -> None:
+    """Raise ``ValueError`` unless exactly one of an ID / key pair is usable.
+
+    Empty strings count as missing, matching the ``id or resolve(key)``
+    fallback in ``compare_models``, so a key resolver is never called with a
+    missing key.
+    """
+    if bool(id_value) == bool(key_value):
+        raise ValueError(f"Exactly one of '{id_name}' or '{key_name}' must be provided.")
+
+
 class Comparisons(SyncPublicAPIResource):
     def compare(
         self,
@@ -58,9 +69,12 @@ def compare(
     def compare_models(
         self,
         *,
-        benchmark_id: str,
-        model_id_1: str,
-        model_id_2: str,
+        benchmark_id: Optional[str] = None,
+        model_id_1: Optional[str] = None,
+        model_id_2: Optional[str] = None,
+        benchmark_key: Optional[str] = None,
+        model_key_1: Optional[str] = None,
+        model_key_2: Optional[str] = None,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
         outcome_filter: Optional[_OUTCOME_FILTER] = None,
@@ -69,15 +83,29 @@ def compare_models(
     ) -> Optional[ComparisonResponse]:
         """Compare two models on a benchmark by automatically finding their evaluations.
 
+        Each of the benchmark and the two models can be addressed by either its
+        ID or its unique key — provide one of ``benchmark_id``/``benchmark_key``,
+        one of ``model_id_1``/``model_key_1``, and one of ``model_id_2``/``model_key_2``.
+
         Finds the most recent successful evaluation for each model on the given
         benchmark, then compares the results side-by-side.
 
         Raises:
-            ValueError: If no successful evaluation is found for either model.
+            ValueError: If both ID and key are provided for the same entity,
+                neither is provided, a key cannot be resolved, or no successful
+                evaluation is found for either model.
         """
+        _require_one_of(benchmark_id, benchmark_key, "benchmark_id", "benchmark_key")
+        _require_one_of(model_id_1, model_key_1, "model_id_1", "model_key_1")
+        _require_one_of(model_id_2, model_key_2, "model_id_2", "model_key_2")
+
+        resolved_benchmark_id = benchmark_id or self._resolve_benchmark_key(benchmark_key, timeout=timeout)
+        resolved_model_id_1 = model_id_1 or self._resolve_model_key(model_key_1, timeout=timeout)
+        resolved_model_id_2 = model_id_2 or self._resolve_model_key(model_key_2, timeout=timeout)
+
         resp1 = self._client.evaluations.get_many(
-            model_ids=[model_id_1],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_1],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
@@ -85,11 +113,11 @@ def compare_models(
             unique=True,
             timeout=timeout,
         )
-        eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
+        eval_id_1 = _find_evaluation_id(resp1, resolved_model_id_1, resolved_benchmark_id)
 
         resp2 = self._client.evaluations.get_many(
-            model_ids=[model_id_2],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_2],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
@@ -97,7 +125,7 @@ def compare_models(
             unique=True,
             timeout=timeout,
         )
-        eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
+        eval_id_2 = _find_evaluation_id(resp2, resolved_model_id_2, resolved_benchmark_id)
 
         return self.compare(
             evaluation_id_1=eval_id_1,
@@ -109,6 +137,22 @@ def compare_models(
             timeout=timeout,
         )
 
+    def _resolve_model_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        resp = self._client.models.get(key=key, timeout=timeout)
+        if resp is not None:
+            for model in resp.models:
+                if model.key == key:
+                    return str(model.id)
+        raise ValueError(f"No model found for key '{key}'")
+
+    def _resolve_benchmark_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        resp = self._client.benchmarks.get(key=key, timeout=timeout)
+        if resp is not None:
+            for benchmark in resp.datasets:
+                if benchmark.key == key:
+                    return str(benchmark.id)
+        raise ValueError(f"No benchmark found for key '{key}'")
+
 
 class AsyncComparisons(AsyncPublicAPIResource):
     async def compare(
@@ -150,9 +194,12 @@ async def compare(
     async def compare_models(
         self,
         *,
-        benchmark_id: str,
-        model_id_1: str,
-        model_id_2: str,
+        benchmark_id: Optional[str] = None,
+        model_id_1: Optional[str] = None,
+        model_id_2: Optional[str] = None,
+        benchmark_key: Optional[str] = None,
+        model_key_1: Optional[str] = None,
+        model_key_2: Optional[str] = None,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
         outcome_filter: Optional[_OUTCOME_FILTER] = None,
@@ -161,15 +208,29 @@ async def compare_models(
     ) -> Optional[ComparisonResponse]:
         """Compare two models on a benchmark by automatically finding their evaluations.
 
+        Each of the benchmark and the two models can be addressed by either its
+        ID or its unique key — provide one of ``benchmark_id``/``benchmark_key``,
+        one of ``model_id_1``/``model_key_1``, and one of ``model_id_2``/``model_key_2``.
+
         Finds the most recent successful evaluation for each model on the given
         benchmark, then compares the results side-by-side.
 
         Raises:
-            ValueError: If no successful evaluation is found for either model.
+            ValueError: If both ID and key are provided for the same entity,
+                neither is provided, a key cannot be resolved, or no successful
+                evaluation is found for either model.
         """
+        _require_one_of(benchmark_id, benchmark_key, "benchmark_id", "benchmark_key")
+        _require_one_of(model_id_1, model_key_1, "model_id_1", "model_key_1")
+        _require_one_of(model_id_2, model_key_2, "model_id_2", "model_key_2")
+
+        resolved_benchmark_id = benchmark_id or await self._resolve_benchmark_key(benchmark_key, timeout=timeout)
+        resolved_model_id_1 = model_id_1 or await self._resolve_model_key(model_key_1, timeout=timeout)
+        resolved_model_id_2 = model_id_2 or await self._resolve_model_key(model_key_2, timeout=timeout)
+
         resp1 = await self._client.evaluations.get_many(
-            model_ids=[model_id_1],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_1],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
@@ -177,11 +238,11 @@ async def compare_models(
             unique=True,
             timeout=timeout,
         )
-        eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
+        eval_id_1 = _find_evaluation_id(resp1, resolved_model_id_1, resolved_benchmark_id)
 
         resp2 = await self._client.evaluations.get_many(
-            model_ids=[model_id_2],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_2],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
@@ -189,7 +250,7 @@ async def compare_models(
             unique=True,
             timeout=timeout,
         )
-        eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
+        eval_id_2 = _find_evaluation_id(resp2, resolved_model_id_2, resolved_benchmark_id)
 
         return await self.compare(
             evaluation_id_1=eval_id_1,
@@ -200,3 +261,19 @@ async def compare_models(
             search=search,
             timeout=timeout,
         )
+
+    async def _resolve_model_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        resp = await self._client.models.get(key=key, timeout=timeout)
+        if resp is not None:
+            for model in resp.models:
+                if model.key == key:
+                    return str(model.id)
+        raise ValueError(f"No model found for key '{key}'")
+
+    async def _resolve_benchmark_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        resp = await self._client.benchmarks.get(key=key, timeout=timeout)
+        if resp is not None:
+            for benchmark in resp.datasets:
+                if benchmark.key == key:
+                    return str(benchmark.id)
+        raise ValueError(f"No benchmark found for key '{key}'")
diff --git a/tests/resources/test_comparisons.py b/tests/resources/test_comparisons.py
index b9552945..8df54479 100644
--- a/tests/resources/test_comparisons.py
+++ b/tests/resources/test_comparisons.py
@@ -6,8 +6,12 @@
     Evaluation,
     Pagination,
     EvaluationStatus,
+    PublicModelDetail,
     ComparisonResponse,
     EvaluationsResponse,
+    PublicBenchmarkDetail,
+    PublicModelsListResponse,
+    PublicBenchmarksListResponse,
 )
 from layerlens.resources.comparisons.comparisons import Comparisons
 
@@ -37,6 +41,22 @@ def _make_eval_response(evaluations: list[Evaluation]) -> EvaluationsResponse:
     )
 
 
+def _make_models_response(models: list[PublicModelDetail]) -> PublicModelsListResponse:
+    return PublicModelsListResponse(
+        models=models,
+        count=len(models),
+        total_count=len(models),
+    )
+
+
+def _make_benchmarks_response(benchmarks: list[PublicBenchmarkDetail]) -> PublicBenchmarksListResponse:
+    return PublicBenchmarksListResponse(
+        datasets=benchmarks,
+        count=len(benchmarks),
+        total_count=len(benchmarks),
+    )
+
+
 class TestCompareModels:
     """Test Comparisons.compare_models convenience method."""
 
@@ -45,6 +65,8 @@ def mock_public_client(self):
         client = Mock()
         client.get_cast = Mock()
         client.evaluations = Mock()
+        client.models = Mock()
+        client.benchmarks = Mock()
         return client
 
     @pytest.fixture
@@ -201,3 +223,123 @@ def test_compare_models_picks_most_recent(self, comparisons, mock_public_client)
         assert call.kwargs["page_size"] == 1
         assert call.kwargs["status"] == EvaluationStatus.SUCCESS
         assert call.kwargs["unique"] is True
+
+    def test_compare_models_resolves_keys(self, comparisons, mock_public_client):
+        """compare_models resolves benchmark_key/model_key_* into IDs before lookup."""
+        mock_public_client.benchmarks.get.return_value = _make_benchmarks_response(
+            [PublicBenchmarkDetail(id="bench-1", key="mmlu_pro", name="MMLU Pro")]
+        )
+        mock_public_client.models.get.side_effect = [
+            _make_models_response([PublicModelDetail(id="model-a", key="gpt-4", name="GPT-4")]),
+            _make_models_response([PublicModelDetail(id="model-b", key="claude-opus", name="Claude Opus")]),
+        ]
+
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+        mock_public_client.evaluations.get_many.side_effect = [
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+        comparisons._get.return_value = {
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 0,
+            "total_results_1": 0,
+            "correct_count_2": 0,
+            "total_results_2": 0,
+        }
+
+        result = comparisons.compare_models(
+            benchmark_key="mmlu_pro",
+            model_key_1="gpt-4",
+            model_key_2="claude-opus",
+        )
+
+        assert isinstance(result, ComparisonResponse)
+
+        # Benchmark + model keys were each looked up
+        mock_public_client.benchmarks.get.assert_called_once()
+        assert mock_public_client.benchmarks.get.call_args.kwargs["key"] == "mmlu_pro"
+
+        model_get_keys = [c.kwargs["key"] for c in mock_public_client.models.get.call_args_list]
+        assert model_get_keys == ["gpt-4", "claude-opus"]
+
+        # Resolved IDs are forwarded to evaluations.get_many
+        eval_calls = mock_public_client.evaluations.get_many.call_args_list
+        assert eval_calls[0].kwargs["model_ids"] == ["model-a"]
+        assert eval_calls[0].kwargs["benchmark_ids"] == ["bench-1"]
+        assert eval_calls[1].kwargs["model_ids"] == ["model-b"]
+
+    def test_compare_models_mixed_id_and_key(self, comparisons, mock_public_client):
+        """compare_models accepts mixing IDs for some entities and keys for others."""
+        mock_public_client.models.get.return_value = _make_models_response(
+            [PublicModelDetail(id="model-b", key="claude-opus", name="Claude Opus")]
+        )
+
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+        mock_public_client.evaluations.get_many.side_effect = [
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+        comparisons._get.return_value = {
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 0,
+            "total_results_1": 0,
+            "correct_count_2": 0,
+            "total_results_2": 0,
+        }
+
+        comparisons.compare_models(
+            benchmark_id="bench-1",
+            model_id_1="model-a",
+            model_key_2="claude-opus",
+        )
+
+        mock_public_client.benchmarks.get.assert_not_called()
+        mock_public_client.models.get.assert_called_once()
+        assert mock_public_client.models.get.call_args.kwargs["key"] == "claude-opus"
+
+        eval_calls = mock_public_client.evaluations.get_many.call_args_list
+        assert eval_calls[1].kwargs["model_ids"] == ["model-b"]
+
+    def test_compare_models_rejects_both_id_and_key(self, comparisons):
+        """Supplying both ID and key for the same entity is an error."""
+        with pytest.raises(ValueError, match="benchmark_id"):
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                benchmark_key="mmlu_pro",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_rejects_neither_id_nor_key(self, comparisons):
+        """Supplying neither ID nor key for an entity is an error."""
+        with pytest.raises(ValueError, match="model_id_1"):
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_unknown_key_raises(self, comparisons, mock_public_client):
+        """An unresolvable key raises ValueError with the key in the message."""
+        mock_public_client.benchmarks.get.return_value = _make_benchmarks_response([])
+
+        with pytest.raises(ValueError, match="missing-bench"):
+            comparisons.compare_models(
+                benchmark_key="missing-bench",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_rejects_empty_id(self, comparisons, mock_public_client):
+        """An empty-string ID counts as missing and never triggers a key lookup."""
+        with pytest.raises(ValueError, match="model_id_1"):
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                model_id_1="",
+                model_id_2="model-b",
+            )
+
+        mock_public_client.models.get.assert_not_called()