LayerLens · m-peko · May 26, 2026 · May 26, 2026
diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py
@@ -1,4 +1,4 @@
-__version__ = "1.7.0"
+__version__ = "1.8.0"
 
 # Will be templated during the build
 __git_commit__ = "__GIT_COMMIT__"
diff --git a/src/layerlens/resources/comparisons/comparisons.py b/src/layerlens/resources/comparisons/comparisons.py
@@ -18,6 +18,27 @@ def _find_evaluation_id(response: Optional[EvaluationsResponse], model_id: str,
     return str(response.evaluations[0].id)
 
 
+def _require_one_of(id_value: Optional[str], key_value: Optional[str], id_name: str, key_name: str) -> None:
+    """Validate that exactly one of an ID/key argument pair was supplied.
+
+    An empty string counts as "not supplied": ``compare_models`` selects the
+    value with ``<id> or <resolved key>``, so an empty ID would otherwise pass
+    validation and then trigger a key lookup with ``key=None``.
+
+    Args:
+        id_value: The ID argument, or ``None`` when omitted.
+        key_value: The key argument, or ``None`` when omitted.
+        id_name: Name of the ID parameter, quoted in the error message.
+        key_name: Name of the key parameter, quoted in the error message.
+
+    Raises:
+        ValueError: If both or neither are given, or the one given is empty.
+    """
+    exactly_one_given = (id_value is None) != (key_value is None)
+    if not exactly_one_given or not (id_value or key_value):
+        raise ValueError(f"Exactly one of '{id_name}' or '{key_name}' must be provided.")
+
+
 class Comparisons(SyncPublicAPIResource):
     def compare(
         self,
@@ -58,9 +63,12 @@ def compare(
     def compare_models(
         self,
         *,
-        benchmark_id: str,
-        model_id_1: str,
-        model_id_2: str,
+        benchmark_id: Optional[str] = None,
+        model_id_1: Optional[str] = None,
+        model_id_2: Optional[str] = None,
+        benchmark_key: Optional[str] = None,
+        model_key_1: Optional[str] = None,
+        model_key_2: Optional[str] = None,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
         outcome_filter: Optional[_OUTCOME_FILTER] = None,
@@ -69,35 +77,49 @@ def compare_models(
     ) -> Optional[ComparisonResponse]:
         """Compare two models on a benchmark by automatically finding their evaluations.
 
+        Each of the benchmark and the two models can be addressed by either its
+        ID or its unique key — provide one of ``benchmark_id``/``benchmark_key``,
+        one of ``model_id_1``/``model_key_1``, and one of ``model_id_2``/``model_key_2``.
+
         Finds the most recent successful evaluation for each model on the given
         benchmark, then compares the results side-by-side.
 
         Raises:
-            ValueError: If no successful evaluation is found for either model.
+            ValueError: If both ID and key are provided for the same entity,
+                neither is provided, a key cannot be resolved, or no successful
+                evaluation is found for either model.
         """
+        _require_one_of(benchmark_id, benchmark_key, "benchmark_id", "benchmark_key")
+        _require_one_of(model_id_1, model_key_1, "model_id_1", "model_key_1")
+        _require_one_of(model_id_2, model_key_2, "model_id_2", "model_key_2")
+
+        resolved_benchmark_id = benchmark_id or self._resolve_benchmark_key(benchmark_key, timeout=timeout)
+        resolved_model_id_1 = model_id_1 or self._resolve_model_key(model_key_1, timeout=timeout)
+        resolved_model_id_2 = model_id_2 or self._resolve_model_key(model_key_2, timeout=timeout)
+
         resp1 = self._client.evaluations.get_many(
-            model_ids=[model_id_1],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_1],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
             page_size=1,
             unique=True,
             timeout=timeout,
         )
-        eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
+        eval_id_1 = _find_evaluation_id(resp1, resolved_model_id_1, resolved_benchmark_id)
 
         resp2 = self._client.evaluations.get_many(
-            model_ids=[model_id_2],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_2],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
             page_size=1,
             unique=True,
             timeout=timeout,
         )
-        eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
+        eval_id_2 = _find_evaluation_id(resp2, resolved_model_id_2, resolved_benchmark_id)
 
         return self.compare(
             evaluation_id_1=eval_id_1,
@@ -109,6 +131,22 @@ def compare_models(
             timeout=timeout,
         )
 
+    def _resolve_model_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        """Return the string ID of the model whose key equals ``key``; raise ValueError when none matches."""
+        listing = self._client.models.get(key=key, timeout=timeout)
+        for candidate in listing.models if listing is not None else ():
+            if candidate.key == key:
+                return str(candidate.id)
+        raise ValueError(f"No model found for key '{key}'")
+
+    def _resolve_benchmark_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        """Return the string ID of the benchmark whose key equals ``key``; raise ValueError when none matches."""
+        listing = self._client.benchmarks.get(key=key, timeout=timeout)
+        for candidate in listing.datasets if listing is not None else ():
+            if candidate.key == key:
+                return str(candidate.id)
+        raise ValueError(f"No benchmark found for key '{key}'")
+
 
 class AsyncComparisons(AsyncPublicAPIResource):
     async def compare(
@@ -150,9 +188,12 @@ async def compare(
     async def compare_models(
         self,
         *,
-        benchmark_id: str,
-        model_id_1: str,
-        model_id_2: str,
+        benchmark_id: Optional[str] = None,
+        model_id_1: Optional[str] = None,
+        model_id_2: Optional[str] = None,
+        benchmark_key: Optional[str] = None,
+        model_key_1: Optional[str] = None,
+        model_key_2: Optional[str] = None,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
         outcome_filter: Optional[_OUTCOME_FILTER] = None,
@@ -161,35 +202,49 @@ async def compare_models(
     ) -> Optional[ComparisonResponse]:
         """Compare two models on a benchmark by automatically finding their evaluations.
 
+        Each of the benchmark and the two models can be addressed by either its
+        ID or its unique key — provide one of ``benchmark_id``/``benchmark_key``,
+        one of ``model_id_1``/``model_key_1``, and one of ``model_id_2``/``model_key_2``.
+
         Finds the most recent successful evaluation for each model on the given
         benchmark, then compares the results side-by-side.
 
         Raises:
-            ValueError: If no successful evaluation is found for either model.
+            ValueError: If both ID and key are provided for the same entity,
+                neither is provided, a key cannot be resolved, or no successful
+                evaluation is found for either model.
         """
+        _require_one_of(benchmark_id, benchmark_key, "benchmark_id", "benchmark_key")
+        _require_one_of(model_id_1, model_key_1, "model_id_1", "model_key_1")
+        _require_one_of(model_id_2, model_key_2, "model_id_2", "model_key_2")
+
+        resolved_benchmark_id = benchmark_id or await self._resolve_benchmark_key(benchmark_key, timeout=timeout)
+        resolved_model_id_1 = model_id_1 or await self._resolve_model_key(model_key_1, timeout=timeout)
+        resolved_model_id_2 = model_id_2 or await self._resolve_model_key(model_key_2, timeout=timeout)
+
         resp1 = await self._client.evaluations.get_many(
-            model_ids=[model_id_1],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_1],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
             page_size=1,
             unique=True,
             timeout=timeout,
         )
-        eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
+        eval_id_1 = _find_evaluation_id(resp1, resolved_model_id_1, resolved_benchmark_id)
 
         resp2 = await self._client.evaluations.get_many(
-            model_ids=[model_id_2],
-            benchmark_ids=[benchmark_id],
+            model_ids=[resolved_model_id_2],
+            benchmark_ids=[resolved_benchmark_id],
             status=EvaluationStatus.SUCCESS,
             sort_by="submitted_at",
             order="desc",
             page_size=1,
             unique=True,
             timeout=timeout,
         )
-        eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
+        eval_id_2 = _find_evaluation_id(resp2, resolved_model_id_2, resolved_benchmark_id)
 
         return await self.compare(
             evaluation_id_1=eval_id_1,
@@ -200,3 +255,19 @@ async def compare_models(
             search=search,
             timeout=timeout,
         )
+
+    async def _resolve_model_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        """Return the string ID of the model whose key equals ``key``; raise ValueError when none matches."""
+        listing = await self._client.models.get(key=key, timeout=timeout)
+        for candidate in listing.models if listing is not None else ():
+            if candidate.key == key:
+                return str(candidate.id)
+        raise ValueError(f"No model found for key '{key}'")
+
+    async def _resolve_benchmark_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
+        """Return the string ID of the benchmark whose key equals ``key``; raise ValueError when none matches."""
+        listing = await self._client.benchmarks.get(key=key, timeout=timeout)
+        for candidate in listing.datasets if listing is not None else ():
+            if candidate.key == key:
+                return str(candidate.id)
+        raise ValueError(f"No benchmark found for key '{key}'")
diff --git a/tests/resources/test_comparisons.py b/tests/resources/test_comparisons.py
@@ -6,8 +6,12 @@
     Evaluation,
     Pagination,
     EvaluationStatus,
+    PublicModelDetail,
     ComparisonResponse,
     EvaluationsResponse,
+    PublicBenchmarkDetail,
+    PublicModelsListResponse,
+    PublicBenchmarksListResponse,
 )
 from layerlens.resources.comparisons.comparisons import Comparisons
 
@@ -37,6 +41,22 @@ def _make_eval_response(evaluations: list[Evaluation]) -> EvaluationsResponse:
     )
 
 
+def _make_models_response(models: list[PublicModelDetail]) -> PublicModelsListResponse:
+    return PublicModelsListResponse(  # test helper: wrap the given models in a list response
+        models=models,
+        count=len(models),  # count and total_count are both set to the list length
+        total_count=len(models),
+    )
+
+
+def _make_benchmarks_response(benchmarks: list[PublicBenchmarkDetail]) -> PublicBenchmarksListResponse:
+    return PublicBenchmarksListResponse(  # test helper: wrap the given benchmarks in a list response
+        datasets=benchmarks,  # the response field holding benchmarks is named "datasets"
+        count=len(benchmarks),  # count and total_count are both set to the list length
+        total_count=len(benchmarks),
+    )
+
+
 class TestCompareModels:
     """Test Comparisons.compare_models convenience method."""
 
@@ -45,6 +65,8 @@ def mock_public_client(self):
         client = Mock()
         client.get_cast = Mock()
         client.evaluations = Mock()
+        client.models = Mock()
+        client.benchmarks = Mock()
         return client
 
     @pytest.fixture
@@ -201,3 +223,112 @@ def test_compare_models_picks_most_recent(self, comparisons, mock_public_client)
             assert call.kwargs["page_size"] == 1
             assert call.kwargs["status"] == EvaluationStatus.SUCCESS
             assert call.kwargs["unique"] is True
+
+    def test_compare_models_resolves_keys(self, comparisons, mock_public_client):
+        """All three keys are looked up and the resolved IDs are forwarded to evaluations.get_many."""
+        mock_public_client.benchmarks.get.return_value = _make_benchmarks_response(
+            [PublicBenchmarkDetail(id="bench-1", key="mmlu_pro", name="MMLU Pro")]
+        )
+        mock_public_client.models.get.side_effect = [  # one response per lookup, in call order
+            _make_models_response([PublicModelDetail(id="model-a", key="gpt-4", name="GPT-4")]),
+            _make_models_response([PublicModelDetail(id="model-b", key="claude-opus", name="Claude Opus")]),
+        ]
+
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+        mock_public_client.evaluations.get_many.side_effect = [  # one response per call, in order
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+        comparisons._get.return_value = {  # stubbed comparison payload with every count at zero
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 0,
+            "total_results_1": 0,
+            "correct_count_2": 0,
+            "total_results_2": 0,
+        }
+
+        result = comparisons.compare_models(
+            benchmark_key="mmlu_pro",
+            model_key_1="gpt-4",
+            model_key_2="claude-opus",
+        )
+
+        assert isinstance(result, ComparisonResponse)
+
+        # The benchmark key triggered exactly one lookup, made with that key
+        mock_public_client.benchmarks.get.assert_called_once()
+        assert mock_public_client.benchmarks.get.call_args.kwargs["key"] == "mmlu_pro"
+
+        model_get_keys = [c.kwargs["key"] for c in mock_public_client.models.get.call_args_list]
+        assert model_get_keys == ["gpt-4", "claude-opus"]  # model_key_1 is resolved before model_key_2
+
+        # Resolved IDs (not the keys) are forwarded to evaluations.get_many
+        eval_calls = mock_public_client.evaluations.get_many.call_args_list
+        assert eval_calls[0].kwargs["model_ids"] == ["model-a"]
+        assert eval_calls[0].kwargs["benchmark_ids"] == ["bench-1"]
+        assert eval_calls[1].kwargs["model_ids"] == ["model-b"]
+
+    def test_compare_models_mixed_id_and_key(self, comparisons, mock_public_client):
+        """IDs and keys can be mixed; only the entity given by key triggers a lookup."""
+        mock_public_client.models.get.return_value = _make_models_response(
+            [PublicModelDetail(id="model-b", key="claude-opus", name="Claude Opus")]
+        )
+
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+        mock_public_client.evaluations.get_many.side_effect = [  # one response per call, in order
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+        comparisons._get.return_value = {  # stubbed comparison payload with every count at zero
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 0,
+            "total_results_1": 0,
+            "correct_count_2": 0,
+            "total_results_2": 0,
+        }
+
+        comparisons.compare_models(
+            benchmark_id="bench-1",
+            model_id_1="model-a",
+            model_key_2="claude-opus",
+        )
+
+        mock_public_client.benchmarks.get.assert_not_called()  # benchmark was given by ID, so no lookup
+        mock_public_client.models.get.assert_called_once()  # only model 2 was given by key
+        assert mock_public_client.models.get.call_args.kwargs["key"] == "claude-opus"
+
+        eval_calls = mock_public_client.evaluations.get_many.call_args_list
+        assert eval_calls[1].kwargs["model_ids"] == ["model-b"]  # the resolved ID, not the key
+
+    def test_compare_models_rejects_both_id_and_key(self, comparisons):
+        """Supplying both ID and key for the same entity raises ValueError."""
+        with pytest.raises(ValueError, match="benchmark_id"):  # the message names the offending parameters
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                benchmark_key="mmlu_pro",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_rejects_neither_id_nor_key(self, comparisons):
+        """Supplying neither ID nor key for an entity raises ValueError."""
+        with pytest.raises(ValueError, match="model_id_1"):  # model 1 is the entity left unspecified
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_unknown_key_raises(self, comparisons, mock_public_client):
+        """A key that matches no benchmark raises ValueError with the key in the message."""
+        mock_public_client.benchmarks.get.return_value = _make_benchmarks_response([])  # lookup finds nothing
+
+        with pytest.raises(ValueError, match="missing-bench"):
+            comparisons.compare_models(
+                benchmark_key="missing-bench",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )