Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/layerlens/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.7.0"
__version__ = "1.8.0"

# Will be templated during the build
__git_commit__ = "__GIT_COMMIT__"
111 changes: 91 additions & 20 deletions src/layerlens/resources/comparisons/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ def _find_evaluation_id(response: Optional[EvaluationsResponse], model_id: str,
return str(response.evaluations[0].id)


def _require_one_of(id_value: Optional[str], key_value: Optional[str], id_name: str, key_name: str) -> None:
if (id_value is None) == (key_value is None):
raise ValueError(f"Exactly one of '{id_name}' or '{key_name}' must be provided.")


class Comparisons(SyncPublicAPIResource):
def compare(
self,
Expand Down Expand Up @@ -58,9 +63,12 @@ def compare(
def compare_models(
self,
*,
benchmark_id: str,
model_id_1: str,
model_id_2: str,
benchmark_id: Optional[str] = None,
model_id_1: Optional[str] = None,
model_id_2: Optional[str] = None,
benchmark_key: Optional[str] = None,
model_key_1: Optional[str] = None,
model_key_2: Optional[str] = None,
page: Optional[int] = None,
page_size: Optional[int] = None,
outcome_filter: Optional[_OUTCOME_FILTER] = None,
Expand All @@ -69,35 +77,49 @@ def compare_models(
) -> Optional[ComparisonResponse]:
"""Compare two models on a benchmark by automatically finding their evaluations.

Each of the benchmark and the two models can be addressed by either its
ID or its unique key — provide one of ``benchmark_id``/``benchmark_key``,
one of ``model_id_1``/``model_key_1``, and one of ``model_id_2``/``model_key_2``.

Finds the most recent successful evaluation for each model on the given
benchmark, then compares the results side-by-side.

Raises:
ValueError: If no successful evaluation is found for either model.
ValueError: If both ID and key are provided for the same entity,
neither is provided, a key cannot be resolved, or no successful
evaluation is found for either model.
"""
_require_one_of(benchmark_id, benchmark_key, "benchmark_id", "benchmark_key")
_require_one_of(model_id_1, model_key_1, "model_id_1", "model_key_1")
_require_one_of(model_id_2, model_key_2, "model_id_2", "model_key_2")

resolved_benchmark_id = benchmark_id or self._resolve_benchmark_key(benchmark_key, timeout=timeout)
resolved_model_id_1 = model_id_1 or self._resolve_model_key(model_key_1, timeout=timeout)
resolved_model_id_2 = model_id_2 or self._resolve_model_key(model_key_2, timeout=timeout)

resp1 = self._client.evaluations.get_many(
model_ids=[model_id_1],
benchmark_ids=[benchmark_id],
model_ids=[resolved_model_id_1],
benchmark_ids=[resolved_benchmark_id],
status=EvaluationStatus.SUCCESS,
sort_by="submitted_at",
order="desc",
page_size=1,
unique=True,
timeout=timeout,
)
eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
eval_id_1 = _find_evaluation_id(resp1, resolved_model_id_1, resolved_benchmark_id)

resp2 = self._client.evaluations.get_many(
model_ids=[model_id_2],
benchmark_ids=[benchmark_id],
model_ids=[resolved_model_id_2],
benchmark_ids=[resolved_benchmark_id],
status=EvaluationStatus.SUCCESS,
sort_by="submitted_at",
order="desc",
page_size=1,
unique=True,
timeout=timeout,
)
eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
eval_id_2 = _find_evaluation_id(resp2, resolved_model_id_2, resolved_benchmark_id)

return self.compare(
evaluation_id_1=eval_id_1,
Expand All @@ -109,6 +131,22 @@ def compare_models(
timeout=timeout,
)

def _resolve_model_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
resp = self._client.models.get(key=key, timeout=timeout)
if resp is not None:
for model in resp.models:
if model.key == key:
return str(model.id)
raise ValueError(f"No model found for key '{key}'")

def _resolve_benchmark_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
resp = self._client.benchmarks.get(key=key, timeout=timeout)
if resp is not None:
for benchmark in resp.datasets:
if benchmark.key == key:
return str(benchmark.id)
raise ValueError(f"No benchmark found for key '{key}'")


class AsyncComparisons(AsyncPublicAPIResource):
async def compare(
Expand Down Expand Up @@ -150,9 +188,12 @@ async def compare(
async def compare_models(
self,
*,
benchmark_id: str,
model_id_1: str,
model_id_2: str,
benchmark_id: Optional[str] = None,
model_id_1: Optional[str] = None,
model_id_2: Optional[str] = None,
benchmark_key: Optional[str] = None,
model_key_1: Optional[str] = None,
model_key_2: Optional[str] = None,
page: Optional[int] = None,
page_size: Optional[int] = None,
outcome_filter: Optional[_OUTCOME_FILTER] = None,
Expand All @@ -161,35 +202,49 @@ async def compare_models(
) -> Optional[ComparisonResponse]:
"""Compare two models on a benchmark by automatically finding their evaluations.

Each of the benchmark and the two models can be addressed by either its
ID or its unique key — provide one of ``benchmark_id``/``benchmark_key``,
one of ``model_id_1``/``model_key_1``, and one of ``model_id_2``/``model_key_2``.

Finds the most recent successful evaluation for each model on the given
benchmark, then compares the results side-by-side.

Raises:
ValueError: If no successful evaluation is found for either model.
ValueError: If both ID and key are provided for the same entity,
neither is provided, a key cannot be resolved, or no successful
evaluation is found for either model.
"""
_require_one_of(benchmark_id, benchmark_key, "benchmark_id", "benchmark_key")
_require_one_of(model_id_1, model_key_1, "model_id_1", "model_key_1")
_require_one_of(model_id_2, model_key_2, "model_id_2", "model_key_2")

resolved_benchmark_id = benchmark_id or await self._resolve_benchmark_key(benchmark_key, timeout=timeout)
resolved_model_id_1 = model_id_1 or await self._resolve_model_key(model_key_1, timeout=timeout)
resolved_model_id_2 = model_id_2 or await self._resolve_model_key(model_key_2, timeout=timeout)

resp1 = await self._client.evaluations.get_many(
model_ids=[model_id_1],
benchmark_ids=[benchmark_id],
model_ids=[resolved_model_id_1],
benchmark_ids=[resolved_benchmark_id],
status=EvaluationStatus.SUCCESS,
sort_by="submitted_at",
order="desc",
page_size=1,
unique=True,
timeout=timeout,
)
eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
eval_id_1 = _find_evaluation_id(resp1, resolved_model_id_1, resolved_benchmark_id)

resp2 = await self._client.evaluations.get_many(
model_ids=[model_id_2],
benchmark_ids=[benchmark_id],
model_ids=[resolved_model_id_2],
benchmark_ids=[resolved_benchmark_id],
status=EvaluationStatus.SUCCESS,
sort_by="submitted_at",
order="desc",
page_size=1,
unique=True,
timeout=timeout,
)
eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
eval_id_2 = _find_evaluation_id(resp2, resolved_model_id_2, resolved_benchmark_id)

return await self.compare(
evaluation_id_1=eval_id_1,
Expand All @@ -200,3 +255,19 @@ async def compare_models(
search=search,
timeout=timeout,
)

async def _resolve_model_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
resp = await self._client.models.get(key=key, timeout=timeout)
if resp is not None:
for model in resp.models:
if model.key == key:
return str(model.id)
raise ValueError(f"No model found for key '{key}'")

async def _resolve_benchmark_key(self, key: Optional[str], *, timeout: float | httpx.Timeout | None) -> str:
resp = await self._client.benchmarks.get(key=key, timeout=timeout)
if resp is not None:
for benchmark in resp.datasets:
if benchmark.key == key:
return str(benchmark.id)
raise ValueError(f"No benchmark found for key '{key}'")
131 changes: 131 additions & 0 deletions tests/resources/test_comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
Evaluation,
Pagination,
EvaluationStatus,
PublicModelDetail,
ComparisonResponse,
EvaluationsResponse,
PublicBenchmarkDetail,
PublicModelsListResponse,
PublicBenchmarksListResponse,
)
from layerlens.resources.comparisons.comparisons import Comparisons

Expand Down Expand Up @@ -37,6 +41,22 @@ def _make_eval_response(evaluations: list[Evaluation]) -> EvaluationsResponse:
)


def _make_models_response(models: list[PublicModelDetail]) -> PublicModelsListResponse:
    """Wrap ``models`` in a list response whose ``count`` and ``total_count`` both equal ``len(models)``."""
    n_models = len(models)
    return PublicModelsListResponse(models=models, count=n_models, total_count=n_models)


def _make_benchmarks_response(benchmarks: list[PublicBenchmarkDetail]) -> PublicBenchmarksListResponse:
    """Wrap ``benchmarks`` as the ``datasets`` of a list response whose counts both equal ``len(benchmarks)``."""
    n_benchmarks = len(benchmarks)
    return PublicBenchmarksListResponse(datasets=benchmarks, count=n_benchmarks, total_count=n_benchmarks)


class TestCompareModels:
"""Test Comparisons.compare_models convenience method."""

Expand All @@ -45,6 +65,8 @@ def mock_public_client(self):
client = Mock()
client.get_cast = Mock()
client.evaluations = Mock()
client.models = Mock()
client.benchmarks = Mock()
return client

@pytest.fixture
Expand Down Expand Up @@ -201,3 +223,112 @@ def test_compare_models_picks_most_recent(self, comparisons, mock_public_client)
assert call.kwargs["page_size"] == 1
assert call.kwargs["status"] == EvaluationStatus.SUCCESS
assert call.kwargs["unique"] is True

def test_compare_models_resolves_keys(self, comparisons, mock_public_client):
    """Benchmark and model keys are turned into IDs ahead of the evaluation lookups."""
    # Key lookups: one benchmark, then the two models in call order.
    bench = PublicBenchmarkDetail(id="bench-1", key="mmlu_pro", name="MMLU Pro")
    mock_public_client.benchmarks.get.return_value = _make_benchmarks_response([bench])
    first_model = PublicModelDetail(id="model-a", key="gpt-4", name="GPT-4")
    second_model = PublicModelDetail(id="model-b", key="claude-opus", name="Claude Opus")
    mock_public_client.models.get.side_effect = [
        _make_models_response([first_model]),
        _make_models_response([second_model]),
    ]

    # One successful evaluation per model on the resolved benchmark.
    first_eval = _make_eval("eval-1", "model-a", "bench-1")
    second_eval = _make_eval("eval-2", "model-b", "bench-1")
    mock_public_client.evaluations.get_many.side_effect = [
        _make_eval_response([first_eval]),
        _make_eval_response([second_eval]),
    ]
    empty_comparison = {
        "results": [],
        "total_count": 0,
        "correct_count_1": 0,
        "total_results_1": 0,
        "correct_count_2": 0,
        "total_results_2": 0,
    }
    comparisons._get.return_value = empty_comparison

    outcome = comparisons.compare_models(
        benchmark_key="mmlu_pro",
        model_key_1="gpt-4",
        model_key_2="claude-opus",
    )

    assert isinstance(outcome, ComparisonResponse)

    # The benchmark key was looked up exactly once.
    benchmark_lookup = mock_public_client.benchmarks.get
    benchmark_lookup.assert_called_once()
    assert benchmark_lookup.call_args.kwargs["key"] == "mmlu_pro"

    # Model keys were looked up in argument order.
    requested_keys = [call.kwargs["key"] for call in mock_public_client.models.get.call_args_list]
    assert requested_keys == ["gpt-4", "claude-opus"]

    # The evaluation queries received the resolved IDs, not the keys.
    lookups = mock_public_client.evaluations.get_many.call_args_list
    assert lookups[0].kwargs["model_ids"] == ["model-a"]
    assert lookups[0].kwargs["benchmark_ids"] == ["bench-1"]
    assert lookups[1].kwargs["model_ids"] == ["model-b"]

def test_compare_models_mixed_id_and_key(self, comparisons, mock_public_client):
    """IDs and keys may be mixed: only the entity given by key triggers a lookup."""
    keyed_model = PublicModelDetail(id="model-b", key="claude-opus", name="Claude Opus")
    mock_public_client.models.get.return_value = _make_models_response([keyed_model])

    first_eval = _make_eval("eval-1", "model-a", "bench-1")
    second_eval = _make_eval("eval-2", "model-b", "bench-1")
    mock_public_client.evaluations.get_many.side_effect = [
        _make_eval_response([first_eval]),
        _make_eval_response([second_eval]),
    ]
    empty_comparison = {
        "results": [],
        "total_count": 0,
        "correct_count_1": 0,
        "total_results_1": 0,
        "correct_count_2": 0,
        "total_results_2": 0,
    }
    comparisons._get.return_value = empty_comparison

    comparisons.compare_models(
        benchmark_id="bench-1",
        model_id_1="model-a",
        model_key_2="claude-opus",
    )

    # Entities passed by ID need no resolution; the keyed model needs exactly one.
    mock_public_client.benchmarks.get.assert_not_called()
    model_lookup = mock_public_client.models.get
    model_lookup.assert_called_once()
    assert model_lookup.call_args.kwargs["key"] == "claude-opus"

    lookups = mock_public_client.evaluations.get_many.call_args_list
    assert lookups[1].kwargs["model_ids"] == ["model-b"]

def test_compare_models_rejects_both_id_and_key(self, comparisons):
    """Passing an ID together with a key for one entity raises ValueError."""
    conflicting_args = {
        "benchmark_id": "bench-1",
        "benchmark_key": "mmlu_pro",
        "model_id_1": "model-a",
        "model_id_2": "model-b",
    }
    with pytest.raises(ValueError, match="benchmark_id"):
        comparisons.compare_models(**conflicting_args)

def test_compare_models_rejects_neither_id_nor_key(self, comparisons):
    """Leaving out both the ID and the key of an entity raises ValueError."""
    # The first model is deliberately not addressed at all.
    incomplete_args = {"benchmark_id": "bench-1", "model_id_2": "model-b"}
    with pytest.raises(ValueError, match="model_id_1"):
        comparisons.compare_models(**incomplete_args)

def test_compare_models_unknown_key_raises(self, comparisons, mock_public_client):
    """A key that matches nothing raises ValueError whose message names the key."""
    # The benchmark lookup succeeds but returns no entries.
    no_benchmarks = _make_benchmarks_response([])
    mock_public_client.benchmarks.get.return_value = no_benchmarks

    request_args = {
        "benchmark_key": "missing-bench",
        "model_id_1": "model-a",
        "model_id_2": "model-b",
    }
    with pytest.raises(ValueError, match="missing-bench"):
        comparisons.compare_models(**request_args)
Loading