From 2f30eec83ed5e470758b8c85e170737e1a8a0bc5 Mon Sep 17 00:00:00 2001
From: Marin Peko <26385728+m-peko@users.noreply.github.com>
Date: Fri, 27 Feb 2026 07:56:30 +0100
Subject: [PATCH 1/3] PublicClient, model & benchmarks management (#50)

* Add public resources to SDK

* Add model and benchmark managing functionality to the SDK

* Bump version

* Return evaluation summary

* Fetch public evaluations
---
 docs/SUMMARY.md                               |   1 +
 docs/api-reference/client.md                  |  19 +
 docs/api-reference/evaluations.md             |  87 ++-
 docs/api-reference/models-benchmarks.md       | 220 +++++++-
 docs/api-reference/public-client.md           | 419 ++++++++++++++
 examples/compare_evaluations.py               |  92 +++
 examples/create_custom_benchmark.py           |  50 ++
 examples/create_custom_model.py               |  41 ++
 examples/create_smart_benchmark.py            |  51 ++
 examples/evaluation_sorting.py                |  92 +++
 examples/get_evaluation.py                    |   2 +-
 examples/manage_project_models_benchmarks.py  |  41 ++
 examples/public_benchmarks.py                 |  60 ++
 examples/public_evaluations.py                |  66 +++
 examples/public_models.py                     |  49 ++
 pyproject.toml                                |   2 +-
 src/layerlens/__init__.py                     |  19 +
 src/layerlens/_client.py                      |  13 +
 src/layerlens/_public_client.py               | 219 +++++++
 src/layerlens/_resource.py                    |  23 +
 src/layerlens/_version.py                     |   2 +-
 src/layerlens/models/__init__.py              |  46 +-
 src/layerlens/models/api.py                   |  12 +
 src/layerlens/models/evaluation.py            |  73 ++-
 src/layerlens/models/public.py                |  94 +++
 .../resources/benchmarks/benchmarks.py        | 375 +++++++++++-
 .../resources/comparisons/__init__.py         |   3 +
 .../resources/comparisons/comparisons.py      |  89 +++
 .../resources/evaluations/evaluations.py      |  49 +-
 src/layerlens/resources/models/models.py      | 172 +++++-
 .../resources/public_benchmarks/__init__.py   |   3 +
 .../public_benchmarks/public_benchmarks.py    | 263 +++++++++
 .../resources/public_evaluations/__init__.py  |   3 +
 .../public_evaluations/public_evaluations.py  | 218 +++++++
 .../resources/public_models/__init__.py       |   3 +
 .../resources/public_models/public_models.py  | 139 +++++
 tests/resources/test_benchmarks.py            | 533 +++++++++++++++++-
 tests/resources/test_evaluations.py           | 426 ++++++++++++++
 tests/resources/test_models_resource.py       | 290 +++++++++-
 39 files changed, 4327 insertions(+), 32 deletions(-)
 create mode 100644 docs/api-reference/public-client.md
 create mode 100644 examples/compare_evaluations.py
 create mode 100644 examples/create_custom_benchmark.py
 create mode 100644 examples/create_custom_model.py
 create mode 100644 examples/create_smart_benchmark.py
 create mode 100644 examples/evaluation_sorting.py
 create mode 100644 examples/manage_project_models_benchmarks.py
 create mode 100644 examples/public_benchmarks.py
 create mode 100644 examples/public_evaluations.py
 create mode 100644 examples/public_models.py
 create mode 100644 src/layerlens/_public_client.py
 create mode 100644 src/layerlens/models/public.py
 create mode 100644 src/layerlens/resources/comparisons/__init__.py
 create mode 100644 src/layerlens/resources/comparisons/comparisons.py
 create mode 100644 src/layerlens/resources/public_benchmarks/__init__.py
 create mode 100644 src/layerlens/resources/public_benchmarks/public_benchmarks.py
 create mode 100644 src/layerlens/resources/public_evaluations/__init__.py
 create mode 100644 src/layerlens/resources/public_evaluations/public_evaluations.py
 create mode 100644 src/layerlens/resources/public_models/__init__.py
 create mode 100644 src/layerlens/resources/public_models/public_models.py

diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 3002655..a89a4b7 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -9,6 +9,7 @@
 
 ## API Reference
 * [Client Configuration](api-reference/client.md)
+* [Public Client](api-reference/public-client.md)
 * [Evaluations](api-reference/evaluations.md)
 * [Results](api-reference/results.md)
 * [Models & Benchmarks](api-reference/models-benchmarks.md)
diff --git a/docs/api-reference/client.md b/docs/api-reference/client.md
index 97f13bc..a7e534e 100644
--- a/docs/api-reference/client.md
+++ b/docs/api-reference/client.md
@@ -51,6 +51,25 @@ The client automatically loads configuration from these environment variables:
 LAYERLENS_STRATIX_API_KEY="your_api_key_here"
 ```
 
+## Public Client
+
+For accessing public endpoints (models, benchmarks, evaluations, comparisons), use `PublicClient` or `AsyncPublicClient`. See the [Public Client](public-client.md) reference for full details.
+
+```python
+from layerlens import PublicClient
+
+# Loads API key from the "LAYERLENS_STRATIX_API_KEY" environment variable
+public = PublicClient()
+models = public.models.get(companies=["OpenAI"])
+```
+
+You can also access public endpoints from an authenticated client via the `.public` property:
+
+```python
+client = Stratix()
+public_models = client.public.models.get(query="claude")
+```
+
 ## Timeout Configuration
 
 ### Simple Timeout
diff --git a/docs/api-reference/evaluations.md b/docs/api-reference/evaluations.md
index d2b0972..a9039e0 100644
--- a/docs/api-reference/evaluations.md
+++ b/docs/api-reference/evaluations.md
@@ -177,17 +177,22 @@ async def get_evaluation():
 asyncio.run(get_evaluation())
 ```
 
-### `get_many(page=None, page_size=None, timeout=None)`
+### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, timeout=None)`
 
-Retrieves multiple evaluations with optional pagination support.
+Retrieves multiple evaluations with optional pagination, sorting, and filtering.
 
 #### Parameters
 
-| Parameter   | Type                             | Required | Description                                             |
-| ----------- | -------------------------------- | -------- | ------------------------------------------------------- |
-| `page`      | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)     |
-| `page_size` | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500) |
-| `timeout`   | `float \| httpx.Timeout \| None` | No       | Override request timeout                                |
+| Parameter       | Type                             | Required | Description                                             |
+| --------------- | -------------------------------- | -------- | ------------------------------------------------------- |
+| `page`          | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)     |
+| `page_size`     | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500) |
+| `sort_by`       | `str \| None`                    | No       | Field to sort by: `submittedAt`, `accuracy`, or `averageDuration` |
+| `order`         | `str \| None`                    | No       | Sort order: `asc` or `desc`                             |
+| `model_ids`     | `List[str] \| None`              | No       | Filter by model IDs                                     |
+| `benchmark_ids` | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                         |
+| `status`        | `EvaluationStatus \| None`       | No       | Filter by evaluation status                             |
+| `timeout`       | `float \| httpx.Timeout \| None` | No       | Override request timeout                                |
 
 #### Returns
 
@@ -198,6 +203,27 @@ Returns an `EvaluationsResponse` object containing:
 
 Returns `None` if the request fails.
 
+#### Example
+
+```python
+from layerlens import Stratix
+from layerlens.models import EvaluationStatus
+
+client = Stratix()
+
+# Get top evaluations by accuracy
+response = client.evaluations.get_many(
+    sort_by="accuracy",
+    order="desc",
+    status=EvaluationStatus.SUCCESS,
+    page_size=10,
+)
+
+if response:
+    for evaluation in response.evaluations:
+        print(f"{evaluation.id}: accuracy={evaluation.accuracy:.2f}%")
+```
+
 ### `get_results(page=None, page_size=None, timeout=None)`
 
 Fetches results for this evaluation with pagination support. This is a synchronous method.
@@ -378,16 +404,43 @@ The `create`, `get_by_id` and `get_many` method returns an `Evaluation` objects
 
 ### Evaluation Object Properties
 
-| Property           | Type               | Description                                               |
-| ------------------ | ------------------ | --------------------------------------------------------- |
-| `id`               | `str`              | Unique evaluation identifier                              |
-| `status`           | `EvaluationStatus` | Current evaluation status (enum)                          |
-| `submitted_at`     | `int`              | Unix timestamp when evaluation was submitted              |
-| `finished_at`      | `int`              | Unix timestamp when evaluation finished                   |
-| `model_id`         | `str`              | ID of the model used in the evaluation                    |
-| `benchmark_id`     | `str`              | ID of the benchmark used (aliased as "dataset_id" in API) |
-| `average_duration` | `int`              | Average response time in milliseconds                     |
-| `accuracy`         | `float`            | Overall accuracy score (0.0 to 1.0)                       |
+| Property             | Type                          | Description                                               |
+| -------------------- | ----------------------------- | --------------------------------------------------------- |
+| `id`                 | `str`                         | Unique evaluation identifier                              |
+| `status`             | `EvaluationStatus`            | Current evaluation status (enum)                          |
+| `status_description` | `str`                         | Human-readable status description (default: `""`)         |
+| `submitted_at`       | `int`                         | Unix timestamp when evaluation was submitted              |
+| `finished_at`        | `int`                         | Unix timestamp when evaluation finished                   |
+| `model_id`           | `str`                         | ID of the model used in the evaluation                    |
+| `model_name`         | `str`                         | Name of the model (default: `""`)                         |
+| `model_key`          | `str`                         | Key identifier of the model (default: `""`)               |
+| `model_company`      | `str`                         | Company/provider of the model (default: `""`)             |
+| `benchmark_id`       | `str`                         | ID of the benchmark used (aliased as "dataset_id" in API) |
+| `benchmark_name`     | `str`                         | Name of the benchmark (aliased as "dataset_name" in API, default: `""`) |
+| `average_duration`   | `int`                         | Average response time in milliseconds                     |
+| `accuracy`           | `float`                       | Overall accuracy score (0.0 to 1.0)                       |
+| `readability_score`  | `float`                       | Readability score (default: `0.0`)                        |
+| `toxicity_score`     | `float`                       | Toxicity score (default: `0.0`)                           |
+| `ethics_score`       | `float`                       | Ethics score (default: `0.0`)                             |
+| `failed_prompt_count`| `int`                         | Number of failed prompts (default: `0`)                   |
+| `queue_id`           | `int`                         | Queue identifier (default: `0`)                           |
+| `summary`            | `EvaluationSummary \| None`   | Rich evaluation summary (see below, default: `None`)      |
+
+### EvaluationSummary Object
+
+The `summary` field contains a rich analysis of the evaluation when available.
+
+| Property              | Type                            | Description                              |
+| --------------------- | ------------------------------- | ---------------------------------------- |
+| `name`                | `str`                           | Summary title                            |
+| `goal`                | `str`                           | Goal of the evaluation                   |
+| `metrics`             | `List[EvaluationMetric]`        | Metrics used (each has `name`, `description`) |
+| `task_types`          | `List[EvaluationTaskType]`      | Task types (each has `name`, `description`)   |
+| `dataset`             | `EvaluationDataset \| None`     | Dataset info (`total_size`, `training_size`, `test_size`, `characteristics`) |
+| `model`               | `EvaluationModelInfo \| None`   | Model info (`model_name`, `performance`)  |
+| `performance_details` | `PerformanceDetails \| None`    | Strengths and challenges lists            |
+| `error_analysis`      | `ErrorAnalysis \| None`         | Common failure modes and example          |
+| `analysis_summary`    | `AnalysisSummary \| None`       | Key takeaways list                        |
 
 #### Evaluation Status
 
diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md
index 1681b75..935cbbc 100644
--- a/docs/api-reference/models-benchmarks.md
+++ b/docs/api-reference/models-benchmarks.md
@@ -95,6 +95,87 @@ Retrieves a specific model by its unique key. Both the `Stratix` and `AsyncStrat
 
 Returns an `Optional[Model]` - a single `Model` object if found, or `None` if the model doesn't exist or there's an error.
 
+### `add(*model_ids, timeout=None)`
+
+Adds public models to the project by their IDs.
+
+#### Parameters
+
+| Parameter    | Type                             | Required | Description                    |
+| ------------ | -------------------------------- | -------- | ------------------------------ |
+| `*model_ids` | `str`                            | Yes      | One or more model IDs to add   |
+| `timeout`    | `float \| httpx.Timeout \| None` | No       | Override request timeout       |
+
+#### Returns
+
+Returns `bool` - `True` if the operation succeeded, `False` otherwise.
+
+#### Example
+
+```python
+client = Stratix()
+success = client.models.add("model-id-1", "model-id-2")
+```
+
+### `remove(*model_ids, timeout=None)`
+
+Removes models from the project by their IDs.
+
+#### Parameters
+
+| Parameter    | Type                             | Required | Description                       |
+| ------------ | -------------------------------- | -------- | --------------------------------- |
+| `*model_ids` | `str`                            | Yes      | One or more model IDs to remove   |
+| `timeout`    | `float \| httpx.Timeout \| None` | No       | Override request timeout          |
+
+#### Returns
+
+Returns `bool` - `True` if the operation succeeded, `False` otherwise.
+
+### `create_custom(name, key, description, api_url, max_tokens, api_key=None, timeout=None)`
+
+Creates a custom model backed by an OpenAI-compatible API endpoint. This allows you to evaluate any model accessible via a chat completions endpoint.
+
+#### Parameters
+
+| Parameter     | Type                             | Required | Description                                                                       |
+| ------------- | -------------------------------- | -------- | --------------------------------------------------------------------------------- |
+| `name`        | `str`                            | Yes      | Model name (max 256 characters)                                                   |
+| `key`         | `str`                            | Yes      | Unique model key, lowercase alphanumeric with dots/hyphens/slashes (max 256 chars)|
+| `description` | `str`                            | Yes      | Model description (max 500 characters)                                            |
+| `api_url`     | `str`                            | Yes      | Base URL of the OpenAI-compatible API endpoint                                    |
+| `max_tokens`  | `int`                            | Yes      | Maximum number of tokens the model supports                                       |
+| `api_key`     | `str \| None`                    | No       | API key for the model provider                                                    |
+| `timeout`     | `float \| httpx.Timeout \| None` | No       | Override request timeout                                                          |
+
+#### Returns
+
+Returns an `Optional[CreateModelResponse]` containing:
+
+- `organization_id`: Organization identifier
+- `project_id`: Project identifier
+- `model_id`: The newly created model's identifier
+
+Returns `None` if the request fails.
+
+#### Example
+
+```python
+client = Stratix()
+
+result = client.models.create_custom(
+    name="My Custom Model",
+    key="my-org/custom-model-v1",
+    description="Custom fine-tuned model served via vLLM",
+    api_url="https://my-model-endpoint.example.com/v1",
+    api_key="my-provider-api-key",
+    max_tokens=4096,
+)
+
+if result:
+    print(f"Created model: {result.model_id}")
+```
+
 ## Benchmarks
 
 ### `get(type=None, name=None, timeout=None)`
@@ -111,8 +192,6 @@ Retrieves a list of available benchmarks with optional filtering parameters. Bot
 
 #### Returns
 
-Returns a `List[Benchmark]` containing available benchmarks that match the filter criteria. Returns `None` if no benchmarks are found or if there's an error.
-
 Returns `Optional[List[Benchmark]]` - a list of `Benchmark` objects that match the filter criteria. Returns an empty list `[]` if no benchmarks match the criteria, or `None` if there's an error.
 
 #### Benchmark Object Properties
@@ -154,3 +233,140 @@ Retrieves a specific benchmark by its unique key. Both the `Stratix` and `AsyncS
 #### Returns
 
 Returns an `Optional[Benchmark]` - a single `Benchmark` object if found, or `None` if the benchmark doesn't exist or there's an error.
+
+### `add(*benchmark_ids, timeout=None)`
+
+Adds benchmarks to the project by their IDs.
+
+#### Parameters
+
+| Parameter        | Type                             | Required | Description                        |
+| ---------------- | -------------------------------- | -------- | ---------------------------------- |
+| `*benchmark_ids` | `str`                            | Yes      | One or more benchmark IDs to add   |
+| `timeout`        | `float \| httpx.Timeout \| None` | No       | Override request timeout           |
+
+#### Returns
+
+Returns `bool` - `True` if the operation succeeded, `False` otherwise.
+
+#### Example
+
+```python
+client = Stratix()
+success = client.benchmarks.add("benchmark-id-1", "benchmark-id-2")
+```
+
+### `remove(*benchmark_ids, timeout=None)`
+
+Removes benchmarks from the project by their IDs.
+
+#### Parameters
+
+| Parameter        | Type                             | Required | Description                           |
+| ---------------- | -------------------------------- | -------- | ------------------------------------- |
+| `*benchmark_ids` | `str`                            | Yes      | One or more benchmark IDs to remove   |
+| `timeout`        | `float \| httpx.Timeout \| None` | No       | Override request timeout              |
+
+#### Returns
+
+Returns `bool` - `True` if the operation succeeded, `False` otherwise.
+
+### `create_custom(name, description, file_path, additional_metrics=None, custom_scorer_ids=None, input_type=None, timeout=None)`
+
+Creates a custom benchmark by uploading a JSONL file. The file should contain one JSON object per line with `input` and `truth` fields.
+
+#### Parameters
+
+| Parameter            | Type                             | Required | Description                                                          |
+| -------------------- | -------------------------------- | -------- | -------------------------------------------------------------------- |
+| `name`               | `str`                            | Yes      | Benchmark name (max 64 characters)                                   |
+| `description`        | `str`                            | Yes      | Benchmark description (max 280 characters)                           |
+| `file_path`          | `str`                            | Yes      | Path to a JSONL file with benchmark prompts                          |
+| `additional_metrics` | `List[str] \| None`              | No       | Additional metrics: `readability`, `toxicity`, `hallucination`       |
+| `custom_scorer_ids`  | `List[str] \| None`              | No       | List of custom scorer IDs to use                                     |
+| `input_type`         | `str \| None`                    | No       | Input type: `messages` or `json_payload`                             |
+| `timeout`            | `float \| httpx.Timeout \| None` | No       | Override request timeout                                             |
+
+#### JSONL File Format
+
+Each line should be a JSON object:
+
+```json
+{"input": "What is 2+2?", "truth": "4"}
+{"input": "Capital of France?", "truth": "Paris"}
+```
+
+Optional field: `subset` (for grouping prompts into categories).
+
+#### Returns
+
+Returns an `Optional[CreateBenchmarkResponse]` containing:
+
+- `organization_id`: Organization identifier
+- `project_id`: Project identifier
+- `benchmark_id`: The newly created benchmark's identifier
+
+Returns `None` if the request fails.
+
+#### Example
+
+```python
+client = Stratix()
+
+result = client.benchmarks.create_custom(
+    name="QA Benchmark",
+    description="Tests model factual accuracy",
+    file_path="benchmark_data.jsonl",
+    additional_metrics=["hallucination"],
+)
+
+if result:
+    print(f"Created benchmark: {result.benchmark_id}")
+```
+
+### `create_smart(name, description, system_prompt, file_paths, metrics=None, timeout=None)`
+
+Creates a smart benchmark from uploaded files. The platform uses AI to automatically generate benchmark prompts from the provided documents. The benchmark is generated asynchronously.
+
+#### Parameters
+
+| Parameter       | Type                             | Required | Description                                                         |
+| --------------- | -------------------------------- | -------- | ------------------------------------------------------------------- |
+| `name`          | `str`                            | Yes      | Benchmark name (max 256 characters)                                 |
+| `description`   | `str`                            | Yes      | Benchmark description (max 500 characters)                          |
+| `system_prompt` | `str`                            | Yes      | System prompt guiding benchmark generation (max 4000 characters)    |
+| `file_paths`    | `List[str]`                      | Yes      | List of file paths to upload (1-20 files, max 50 MB each)           |
+| `metrics`       | `List[str] \| None`              | No       | Additional metrics: `readability`, `toxicity`, `hallucination`      |
+| `timeout`       | `float \| httpx.Timeout \| None` | No       | Override request timeout                                            |
+
+#### Supported File Types
+
+`.txt`, `.pdf`, `.html`, `.docx`, `.csv`, `.json`, `.jsonl`, `.parquet`
+
+#### Returns
+
+Returns an `Optional[CreateBenchmarkResponse]` containing:
+
+- `organization_id`: Organization identifier
+- `project_id`: Project identifier
+- `benchmark_id`: The newly created benchmark's identifier
+
+Returns `None` if the request fails.
+
+#### Example
+
+```python
+client = Stratix()
+
+result = client.benchmarks.create_smart(
+    name="Product Knowledge Benchmark",
+    description="Evaluates model knowledge of product docs",
+    system_prompt="Generate QA pairs testing understanding of product features.",
+    file_paths=["product_docs.pdf", "faq.txt"],
+    metrics=["hallucination"],
+)
+
+if result:
+    print(f"Smart benchmark created: {result.benchmark_id}")
+    print("Check the dashboard for generation progress.")
+```
diff --git a/docs/api-reference/public-client.md b/docs/api-reference/public-client.md
new file mode 100644
index 0000000..9a79d1e
--- /dev/null
+++ b/docs/api-reference/public-client.md
@@ -0,0 +1,419 @@
+# Public Client
+
+The `PublicClient` (synchronous) and `AsyncPublicClient` (asynchronous) classes provide access to the public LayerLens API endpoints. Use them to browse public models and benchmarks, read benchmark prompts, fetch evaluations, and compare evaluation results.
+
+## Basic Usage
+
+### Synchronous Client
+
+```python
+from layerlens import PublicClient
+
+# Loads API key from the "LAYERLENS_STRATIX_API_KEY" environment variable
+client = PublicClient()
+
+# Browse public models
+models = client.models.get(companies=["OpenAI"])
+
+# Browse public benchmarks
+benchmarks = client.benchmarks.get(languages=["English"])
+```
+
+### Asynchronous Client
+
+```python
+import asyncio
+from layerlens import AsyncPublicClient
+
+async def main():
+    client = AsyncPublicClient()
+
+    models = await client.models.get(companies=["OpenAI"])
+    benchmarks = await client.benchmarks.get(languages=["English"])
+
+asyncio.run(main())
+```
+
+### Accessing from an Authenticated Client
+
+If you already have an authenticated `Stratix` or `AsyncStratix` client, you can access public endpoints through the `.public` property:
+
+```python
+from layerlens import Stratix
+
+client = Stratix()  # requires API key
+
+# Access public endpoints through the authenticated client
+public_models = client.public.models.get(query="claude")
+```
+
+## Constructor Parameters
+
+### `PublicClient(api_key, base_url, timeout)` and `AsyncPublicClient(api_key, base_url, timeout)`
+
+| Parameter  | Type                             | Required | Default         | Description                   |
+| ---------- | -------------------------------- | -------- | --------------- | ----------------------------- |
+| `api_key`  | `str \| None`                    | Yes\*    | `None`          | Your LayerLens Stratix API key  |
+| `base_url` | `str \| httpx.URL \| None`       | No       | Stratix API URL | Custom API base URL           |
+| `timeout`  | `float \| httpx.Timeout \| None` | No       | 10 minutes      | Request timeout configuration |
+
+\*Required unless set via the `LAYERLENS_STRATIX_API_KEY` environment variable
+
+## Public Models
+
+### `models.get(...)`
+
+Retrieves a list of public models with optional filtering, sorting, and pagination.
+
+#### Parameters
+
+| Parameter            | Type                   | Required | Description                                                                                  |
+| -------------------- | ---------------------- | -------- | -------------------------------------------------------------------------------------------- |
+| `query`              | `str \| None`          | No       | Full-text search on model name                                                               |
+| `name`               | `str \| None`          | No       | Filter by model name                                                                         |
+| `key`                | `str \| None`          | No       | Filter by model key                                                                          |
+| `ids`                | `List[str] \| None`    | No       | Filter by specific model IDs                                                                 |
+| `categories`         | `List[str] \| None`    | No       | Filter by categories (e.g. `transformer`, `moe`, `open-source`, `closed-source`, `usa`, `china`, `size-sm`, `size-md`, `size-lg`, `size-xl`) |
+| `companies`          | `List[str] \| None`    | No       | Filter by company names                                                                      |
+| `regions`            | `List[str] \| None`    | No       | Filter by regions                                                                            |
+| `licenses`           | `List[str] \| None`    | No       | Filter by license types                                                                      |
+| `sizes`              | `List[str] \| None`    | No       | Filter by size (Small, Medium, Large, Extra Large)                                           |
+| `sort_by`            | `str \| None`          | No       | Sort column: `name`, `createdAt`, `releasedAt`, `architectureType`, `contextLength`, `license`, `region` |
+| `order`              | `str \| None`          | No       | Sort order: `asc` or `desc`                                                                  |
+| `page`               | `int \| None`          | No       | Page number (1-based)                                                                        |
+| `page_size`          | `int \| None`          | No       | Results per page                                                                             |
+| `include_deprecated` | `bool \| None`         | No       | Include deprecated models (default: false)                                                   |
+| `timeout`            | `float \| httpx.Timeout \| None` | No | Override request timeout                                                                     |
+
+#### Returns
+
+Returns a `PublicModelsListResponse` containing:
+
+- `models`: List of `PublicModelDetail` objects
+- `categories`: List of available category strings
+- `count`: Number of results in current page
+- `total_count`: Total number of matching results
+
+Returns `None` if the request fails.
+
+#### PublicModelDetail Properties
+
+| Property               | Type             | Description                        |
+| ---------------------- | ---------------- | ---------------------------------- |
+| `id`                   | `str`            | Unique model identifier            |
+| `key`                  | `str`            | Unique model key                   |
+| `name`                 | `str`            | Human-readable model name          |
+| `description`          | `str \| None`    | Text description                   |
+| `company`              | `str \| None`    | Model provider company             |
+| `released_at`          | `int \| None`    | Release timestamp                  |
+| `parameters`           | `float \| None`  | Number of parameters               |
+| `modality`             | `str \| None`    | Model modality                     |
+| `context_length`       | `int \| None`    | Maximum context length             |
+| `architecture_type`    | `str \| None`    | Architecture type                  |
+| `license`              | `str \| None`    | License type                       |
+| `open_weights`         | `bool \| None`   | Whether weights are open           |
+| `region`               | `str \| None`    | Region                             |
+| `key_takeaways`        | `List[str] \| None` | Key takeaways                   |
+| `deprecated`           | `bool \| None`   | Whether the model is deprecated    |
+| `cost_per_input_token` | `str \| None`    | Cost per input token               |
+| `cost_per_output_token`| `str \| None`    | Cost per output token              |
+
+#### Example
+
+```python
+from layerlens import PublicClient
+
+client = PublicClient()
+
+# Get newest OpenAI models
+response = client.models.get(
+    companies=["OpenAI"],
+    sort_by="releasedAt",
+    order="desc",
+    page_size=5,
+)
+
+for model in response.models:
+    print(f"{model.name} - {model.context_length} context length")
+```
+
+## Public Benchmarks
+
+### `benchmarks.get(...)`
+
+Retrieves a list of public benchmarks with optional filtering, sorting, and pagination.
+
+#### Parameters
+
+| Parameter            | Type                   | Required | Description                                |
+| -------------------- | ---------------------- | -------- | ------------------------------------------ |
+| `query`              | `str \| None`          | No       | Full-text search                           |
+| `name`               | `str \| None`          | No       | Filter by name                             |
+| `key`                | `str \| None`          | No       | Filter by key                              |
+| `ids`                | `List[str] \| None`    | No       | Filter by specific IDs                     |
+| `categories`         | `List[str] \| None`    | No       | Filter by categories                       |
+| `languages`          | `List[str] \| None`    | No       | Filter by languages                        |
+| `sort_by`            | `str \| None`          | No       | Sort column (currently: `name`)            |
+| `order`              | `str \| None`          | No       | Sort order: `asc` or `desc`                |
+| `page`               | `int \| None`          | No       | Page number (1-based)                      |
+| `page_size`          | `int \| None`          | No       | Results per page                           |
+| `include_deprecated` | `bool \| None`         | No       | Include deprecated benchmarks              |
+| `timeout`            | `float \| httpx.Timeout \| None` | No | Override request timeout               |
+
+#### Returns
+
+Returns a `PublicBenchmarksListResponse` containing:
+
+- `datasets`: List of `PublicBenchmarkDetail` objects
+- `categories`: List of available category strings
+- `count`: Number of results in current page
+- `total_count`: Total number of matching results
+
+Returns `None` if the request fails.
+
+#### PublicBenchmarkDetail Properties
+
+| Property          | Type               | Description                           |
+| ----------------- | ------------------ | ------------------------------------- |
+| `id`              | `str`              | Unique benchmark identifier           |
+| `key`             | `str`              | Unique benchmark key                  |
+| `name`            | `str`              | Human-readable name                   |
+| `description`     | `str \| None`      | Text description                      |
+| `prompt_count`    | `int \| None`      | Number of prompts in the benchmark    |
+| `language`        | `str \| None`      | Language of the benchmark             |
+| `categories`      | `List[str] \| None`| Categories                            |
+| `characteristics` | `List[str] \| None`| Characteristics                       |
+| `deprecated`      | `bool \| None`     | Whether the benchmark is deprecated   |
+| `is_public`       | `bool \| None`     | Whether the benchmark is public       |
+
+### `benchmarks.get_prompts(benchmark_id, ...)`
+
+Fetches prompts/content from a public benchmark with optional search and pagination.
+
+#### Parameters
+
+| Parameter      | Type                   | Required | Description                                    |
+| -------------- | ---------------------- | -------- | ---------------------------------------------- |
+| `benchmark_id` | `str`                  | Yes      | The benchmark ID to fetch prompts from         |
+| `page`         | `int \| None`          | No       | Page number (1-based)                          |
+| `page_size`    | `int \| None`          | No       | Results per page                               |
+| `search_field` | `str \| None`          | No       | Search field: `id`, `input`, or `truth`        |
+| `search_value` | `str \| None`          | No       | Search value                                   |
+| `sort_by`      | `str \| None`          | No       | Sort field: `id`, `input`, or `truth`          |
+| `sort_order`   | `str \| None`          | No       | Sort order: `asc` or `desc`                    |
+| `timeout`      | `float \| httpx.Timeout \| None` | No | Override request timeout                   |
+
+#### Returns
+
+Returns a `BenchmarkPromptsResponse` containing:
+
+- `status`: Response status string
+- `data.prompts`: List of `BenchmarkPrompt` objects
+- `data.count`: Total number of prompts
+
+Returns `None` if the request fails.
+
+#### BenchmarkPrompt Properties
+
+| Property | Type  | Description                            |
+| -------- | ----- | -------------------------------------- |
+| `id`     | `str` | Unique prompt identifier               |
+| `input`  | `str \| List \| Dict` | The prompt input          |
+| `truth`  | `str` | The expected/ground truth answer       |
+
+### `benchmarks.get_all_prompts(benchmark_id, timeout=None)`
+
+Fetches all prompts from a benchmark by automatically handling pagination.
+
+#### Parameters
+
+| Parameter      | Type                   | Required | Description                             |
+| -------------- | ---------------------- | -------- | --------------------------------------- |
+| `benchmark_id` | `str`                  | Yes      | The benchmark ID to fetch prompts from  |
+| `timeout`      | `float \| httpx.Timeout \| None` | No | Override request timeout            |
+
+#### Returns
+
+Returns a `List[BenchmarkPrompt]` containing all prompts in the benchmark.
+
+#### Example
+
+```python
+from layerlens import PublicClient
+
+client = PublicClient()
+
+# List benchmarks
+benchmarks = client.benchmarks.get(query="mmlu")
+
+if benchmarks and benchmarks.datasets:
+    benchmark = benchmarks.datasets[0]
+
+    # Get first page of prompts
+    prompts = client.benchmarks.get_prompts(benchmark.id, page=1, page_size=10)
+
+    if prompts:
+        print(f"Total prompts: {prompts.data.count}")
+        for prompt in prompts.data.prompts:
+            print(f"  Input: {str(prompt.input)[:80]}...")
+            print(f"  Truth: {prompt.truth[:50]}")
+
+    # Or fetch all prompts at once
+    all_prompts = client.benchmarks.get_all_prompts(benchmark.id)
+    print(f"All prompts: {len(all_prompts)}")
+```
+
+## Evaluations
+
+### `evaluations.get_by_id(id, ...)`
+
+Retrieves a single evaluation by its unique identifier, including the full evaluation summary.
+
+#### Parameters
+
+| Parameter | Type                             | Required | Description                      |
+| --------- | -------------------------------- | -------- | -------------------------------- |
+| `id`      | `str`                            | Yes      | The unique evaluation identifier |
+| `timeout` | `float \| httpx.Timeout \| None` | No       | Override request timeout         |
+
+#### Returns
+
+Returns an `Evaluation` object if found, `None` otherwise. See [Evaluations](evaluations.md) for the full `Evaluation` object properties.
+
+### `evaluations.get_many(...)`
+
+Retrieves evaluations for a given organization and project with optional pagination, sorting, and filtering.
+
+#### Parameters
+
+| Parameter         | Type                             | Required | Description                                                        |
+| ----------------- | -------------------------------- | -------- | ------------------------------------------------------------------ |
+| `organization_id` | `str`                            | Yes      | Organization ID (MongoDB ObjectID format)                          |
+| `project_id`      | `str`                            | Yes      | Project ID (MongoDB ObjectID format)                               |
+| `page`            | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)                |
+| `page_size`       | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500)            |
+| `sort_by`         | `str \| None`                    | No       | Sort by field: `submittedAt`, `accuracy`, or `averageDuration`     |
+| `order`           | `str \| None`                    | No       | Sort order: `asc` or `desc`                                       |
+| `model_ids`       | `List[str] \| None`              | No       | Filter by model IDs                                                |
+| `benchmark_ids`   | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                                    |
+| `status`          | `EvaluationStatus \| None`       | No       | Filter by evaluation status                                        |
+| `timeout`         | `float \| httpx.Timeout \| None` | No       | Override request timeout                                           |
+
+#### Returns
+
+Returns an `EvaluationsResponse` object containing:
+
+- `evaluations`: List of `Evaluation` objects
+- `pagination`: Pagination metadata with `page`, `page_size`, `total_pages`, and `total_count`
+
+Returns `None` if the request fails.
+
+#### Example
+
+```python
+from layerlens import PublicClient
+from layerlens.models import EvaluationStatus
+
+client = PublicClient()
+
+# Get a specific evaluation by ID (with full summary)
+evaluation = client.evaluations.get_by_id("eval_abc123")
+if evaluation:
+    print(f"{evaluation.model_name} on {evaluation.benchmark_name}: {evaluation.accuracy:.2f}%")
+    if evaluation.summary:
+        print(f"Goal: {evaluation.summary.goal}")
+        for takeaway in evaluation.summary.analysis_summary.key_takeaways:
+            print(f"  - {takeaway}")
+
+# List evaluations for an organization/project
+response = client.evaluations.get_many(
+    organization_id="683e63925ef7e1c53c1f4b28",
+    project_id="683e63925ef7e1c53c1f4b29",
+    status=EvaluationStatus.SUCCESS,
+    sort_by="accuracy",
+    order="desc",
+    page_size=10,
+)
+if response:
+    print(f"Top evaluations ({response.pagination.total_count} total):")
+    for e in response.evaluations:
+        print(f"  {e.model_name}: {e.accuracy:.2f}%")
+```
+
+## Comparisons
+
+### `comparisons.compare(...)`
+
+Compares results between two evaluations side-by-side.
+
+#### Parameters
+
+| Parameter          | Type                   | Required | Description                                                                |
+| ------------------ | ---------------------- | -------- | -------------------------------------------------------------------------- |
+| `evaluation_id_1`  | `str`                  | Yes      | First evaluation ID                                                        |
+| `evaluation_id_2`  | `str`                  | Yes      | Second evaluation ID                                                       |
+| `page`             | `int \| None`          | No       | Page number (1-based)                                                      |
+| `page_size`        | `int \| None`          | No       | Results per page                                                           |
+| `outcome_filter`   | `str \| None`          | No       | Filter by outcome (see below)                                              |
+| `search`           | `str \| None`          | No       | Search within results                                                      |
+| `timeout`          | `float \| httpx.Timeout \| None` | No | Override request timeout                                               |
+
+#### Outcome Filter Options
+
+| Value                | Description                                                          |
+| -------------------- | -------------------------------------------------------------------- |
+| `"all"`              | All results (default)                                                |
+| `"both_succeed"`     | Both models answered correctly                                       |
+| `"both_fail"`        | Both models answered incorrectly                                     |
+| `"reference_fails"`  | `evaluation_id_1` (the reference) fails, `evaluation_id_2` succeeds  |
+| `"comparison_fails"` | `evaluation_id_2` (the comparison) fails, `evaluation_id_1` succeeds |
+
+#### Returns
+
+Returns a `ComparisonResponse` containing:
+
+- `results`: List of `ComparisonResult` objects
+- `total_count`: Total number of comparable results
+- `correct_count_1`: Number of correct answers for evaluation 1
+- `total_results_1`: Total results for evaluation 1
+- `correct_count_2`: Number of correct answers for evaluation 2
+- `total_results_2`: Total results for evaluation 2
+
+Returns `None` if the request fails.
+
+#### ComparisonResult Properties
+
+| Property      | Type            | Description                           |
+| ------------- | --------------- | ------------------------------------- |
+| `result_id_1` | `int \| None`   | Result ID from evaluation 1           |
+| `result_id_2` | `int \| None`   | Result ID from evaluation 2           |
+| `prompt`      | `str`           | The prompt text                       |
+| `truth`       | `str`           | The ground truth answer               |
+| `result1`     | `str \| None`   | Model 1's response                    |
+| `score1`      | `float \| None` | Model 1's score                       |
+| `result2`     | `str \| None`   | Model 2's response                    |
+| `score2`      | `float \| None` | Model 2's score                       |
+
+#### Example
+
+```python
+from layerlens import PublicClient
+
+client = PublicClient()
+
+comparison = client.comparisons.compare(
+    evaluation_id_1="eval-abc",
+    evaluation_id_2="eval-def",
+    outcome_filter="reference_fails",
+    page=1,
+    page_size=20,
+)
+
+if comparison:
+    print(f"Eval 1: {comparison.correct_count_1}/{comparison.total_results_1}")
+    print(f"Eval 2: {comparison.correct_count_2}/{comparison.total_results_2}")
+
+    for result in comparison.results:
+        print(f"  Prompt: {result.prompt[:80]}...")
+        print(f"  Model 1 score: {result.score1}, Model 2 score: {result.score2}")
+```
diff --git a/examples/compare_evaluations.py b/examples/compare_evaluations.py
new file mode 100644
index 0000000..eb292b4
--- /dev/null
+++ b/examples/compare_evaluations.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import Stratix
+from layerlens.models import EvaluationStatus
+
+
+def main():
+    # Construct client (API key from env or inline)
+    client = Stratix()
+
+    # --- Get successful evaluations to find a comparable pair
+    response = client.evaluations.get_many(
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+        page_size=100,
+    )
+
+    if not response or len(response.evaluations) < 2:
+        print("Need at least 2 successful evaluations to compare, exiting")
+        return
+
+    # Find two evaluations on the same benchmark
+    eval_1 = None
+    eval_2 = None
+    for i, e1 in enumerate(response.evaluations):
+        for e2 in response.evaluations[i + 1 :]:
+            if e1.benchmark_id == e2.benchmark_id and e1.id != e2.id:
+                eval_1 = e1
+                eval_2 = e2
+                break
+        if eval_1:
+            break
+
+    if not eval_1 or not eval_2:
+        print("No two evaluations share the same benchmark, exiting")
+        return
+
+    print(f"Comparing evaluations on the same benchmark ({eval_1.benchmark_id}):")
+    print(f"  Evaluation 1: {eval_1.id} (accuracy={eval_1.accuracy:.2f}%)")
+    print(f"  Evaluation 2: {eval_2.id} (accuracy={eval_2.accuracy:.2f}%)")
+
+    # --- Get comparison results
+    comparison = client.public.comparisons.compare(
+        evaluation_id_1=eval_1.id,
+        evaluation_id_2=eval_2.id,
+        page=1,
+        page_size=10,
+    )
+
+    if comparison:
+        print(f"\n=== Comparison Summary ===")
+        print(f"Evaluation 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+        print(f"Evaluation 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
+        print(f"Total compared: {comparison.total_count}")
+
+        # --- Show individual results
+        if comparison.results:
+            print(f"\nFirst {len(comparison.results)} results:")
+            for result in comparison.results:
+                score_indicator_1 = "✓" if result.score1 and result.score1 > 0.5 else "✗"
+                score_indicator_2 = "✓" if result.score2 and result.score2 > 0.5 else "✗"
+                print(f"  Prompt: {result.prompt[:80]}...")
+                print(f"    Model 1: {score_indicator_1} (score={result.score1})")
+                print(f"    Model 2: {score_indicator_2} (score={result.score2})")
+                print()
+
+    # --- Filter by outcome: where only model 1 fails
+    comparison = client.public.comparisons.compare(
+        evaluation_id_1=eval_1.id,
+        evaluation_id_2=eval_2.id,
+        outcome_filter="reference_fails",
+    )
+
+    if comparison:
+        print(f"\n=== Where Model 1 Fails but Model 2 Succeeds ===")
+        print(f"Found {comparison.total_count} such cases")
+
+    # --- Filter by outcome: where both models fail
+    comparison = client.public.comparisons.compare(
+        evaluation_id_1=eval_1.id,
+        evaluation_id_2=eval_2.id,
+        outcome_filter="both_fail",
+    )
+
+    if comparison:
+        print(f"\n=== Where Both Models Fail ===")
+        print(f"Found {comparison.total_count} such cases")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/create_custom_benchmark.py b/examples/create_custom_benchmark.py
new file mode 100644
index 0000000..4b263de
--- /dev/null
+++ b/examples/create_custom_benchmark.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import Stratix
+
+
+def main():
+    # Construct client (API key from env or inline)
+    client = Stratix()
+
+    # --- Create a custom benchmark from a JSONL file
+    #
+    # The JSONL file should have one JSON object per line with these fields:
+    #   {"input": "What is 2+2?", "truth": "4"}
+    #   {"input": "Capital of France?", "truth": "Paris"}
+    #
+    # Optional fields: "subset" (for grouping prompts)
+
+    result = client.benchmarks.create_custom(
+        name="My Custom Benchmark",
+        description="A simple test benchmark for QA evaluation",
+        file_path="path/to/benchmark.jsonl",
+    )
+
+    if result:
+        print(f"Custom benchmark created: {result.benchmark_id}")
+    else:
+        print("Failed to create custom benchmark")
+
+    # --- Create with additional metrics and input type
+    result = client.benchmarks.create_custom(
+        name="Advanced Benchmark",
+        description="Benchmark with toxicity and readability scoring",
+        file_path="path/to/benchmark.jsonl",
+        additional_metrics=["toxicity", "readability"],
+        input_type="messages",
+    )
+
+    if result:
+        print(f"Advanced benchmark created: {result.benchmark_id}")
+
+    # --- Verify the benchmark was added to the project
+    benchmarks = client.benchmarks.get(type="custom")
+    if benchmarks:
+        print(f"\nCustom benchmarks in project ({len(benchmarks)}):")
+        for b in benchmarks:
+            print(f"  - {b.name} (id={b.id})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/create_custom_model.py b/examples/create_custom_model.py
new file mode 100644
index 0000000..8d375f6
--- /dev/null
+++ b/examples/create_custom_model.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import Stratix
+
+
+def main():
+    # Construct client (API key from env or inline)
+    client = Stratix()
+
+    # --- Create a custom model backed by an OpenAI-compatible API
+    #
+    # Custom models let you evaluate any model accessible via an
+    # OpenAI-compatible chat completions endpoint.
+    #
+    # Key format: lowercase alphanumeric with dots, hyphens, slashes
+    #   e.g. "my-org/custom-llama-3.1-70b"
+
+    result = client.models.create_custom(
+        name="My Custom Model",
+        key="my-org/custom-model-v1",
+        description="Custom fine-tuned model served via vLLM",
+        api_url="https://my-model-endpoint.example.com/v1",
+        api_key="my-provider-api-key",
+        max_tokens=4096,
+    )
+
+    if result:
+        print(f"Custom model created: {result.model_id}")
+    else:
+        print("Failed to create custom model")
+
+    # --- Verify the model was added to the project
+    models = client.models.get(type="custom")
+    if models:
+        print(f"\nCustom models in project ({len(models)}):")
+        for m in models:
+            print(f"  - {m.name} (id={m.id}, key={m.key})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/create_smart_benchmark.py b/examples/create_smart_benchmark.py
new file mode 100644
index 0000000..9c628d3
--- /dev/null
+++ b/examples/create_smart_benchmark.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import Stratix
+
+
+def main():
+    # Construct client (API key from env or inline)
+    client = Stratix()
+
+    # --- Create a smart benchmark from source files
+    #
+    # Smart benchmarks use AI to automatically generate benchmark prompts
+    # from your uploaded documents. Supported file types include:
+    #   .txt, .pdf, .html, .docx, .csv, .json, .jsonl, .parquet
+    #
+    # You provide a system prompt that guides how the AI generates
+    # evaluation questions from the source material.
+
+    result = client.benchmarks.create_smart(
+        name="Product Knowledge Benchmark",
+        description="Evaluates model knowledge of our product documentation",
+        system_prompt=(
+            "Generate question-answer pairs that test understanding of the "
+            "product features, capabilities, and limitations described in "
+            "the provided documents. Each question should have a clear, "
+            "factual answer derived from the source material."
+        ),
+        file_paths=[
+            "path/to/product_docs.pdf",
+            "path/to/faq.txt",
+        ],
+        metrics=["hallucination"],
+    )
+
+    if result:
+        print(f"Smart benchmark created: {result.benchmark_id}")
+        print("The benchmark is being generated asynchronously.")
+        print("Check the dashboard for progress.")
+    else:
+        print("Failed to create smart benchmark")
+
+    # --- Verify the benchmark was added to the project
+    benchmarks = client.benchmarks.get(type="custom")
+    if benchmarks:
+        print(f"\nCustom benchmarks in project ({len(benchmarks)}):")
+        for b in benchmarks:
+            print(f"  - {b.name} (id={b.id})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/evaluation_sorting.py b/examples/evaluation_sorting.py
new file mode 100644
index 0000000..cb1906f
--- /dev/null
+++ b/examples/evaluation_sorting.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env -S poetry run python
+
+import asyncio
+
+from layerlens import AsyncStratix
+from layerlens.models import EvaluationStatus
+
+
+async def main():
+    # Construct async client (requires API key)
+    client = AsyncStratix()
+
+    # --- Get evaluations sorted by accuracy (highest first)
+    response = await client.evaluations.get_many(
+        sort_by="accuracy",
+        order="desc",
+        page_size=10,
+    )
+    if response:
+        print(f"Top {len(response.evaluations)} evaluations by accuracy:")
+        for evaluation in response.evaluations:
+            print(f"  - {evaluation.id}: accuracy={evaluation.accuracy:.2f}%, status={evaluation.status.value}")
+
+    # --- Get evaluations sorted by submission date (newest first)
+    response = await client.evaluations.get_many(
+        sort_by="submittedAt",
+        order="desc",
+        page_size=5,
+    )
+    if response:
+        print(f"\nLatest {len(response.evaluations)} evaluations:")
+        for evaluation in response.evaluations:
+            print(f"  - {evaluation.id}: submitted_at={evaluation.submitted_at}")
+
+    # --- Get evaluations sorted by average duration (fastest first)
+    response = await client.evaluations.get_many(
+        sort_by="averageDuration",
+        order="asc",
+        page_size=5,
+    )
+    if response:
+        print(f"\nFastest {len(response.evaluations)} evaluations:")
+        for evaluation in response.evaluations:
+            print(f"  - {evaluation.id}: avg_duration={evaluation.average_duration}ms")
+
+    # --- Filter by status (only successful evaluations)
+    response = await client.evaluations.get_many(
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+    )
+    if response:
+        print(f"\nSuccessful evaluations: {response.pagination.total_count}")
+
+    # --- Filter by specific model IDs
+    # Replace with actual model IDs from your organization
+    response = await client.evaluations.get_many(
+        model_ids=["your-model-id"],
+        sort_by="accuracy",
+        order="desc",
+    )
+    if response:
+        print(f"\nEvaluations for specified model: {response.pagination.total_count}")
+
+    # --- Filter by specific benchmark IDs
+    # Replace with actual benchmark IDs from your organization
+    response = await client.evaluations.get_many(
+        benchmark_ids=["your-benchmark-id"],
+        sort_by="submittedAt",
+        order="desc",
+    )
+    if response:
+        print(f"\nEvaluations for specified benchmark: {response.pagination.total_count}")
+
+    # --- Combine sorting, filtering, and pagination
+    response = await client.evaluations.get_many(
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+        page=1,
+        page_size=20,
+    )
+    if response:
+        print(f"\nPage 1 of successful evaluations (sorted by accuracy):")
+        print(f"  Total: {response.pagination.total_count}")
+        print(f"  Pages: {response.pagination.total_pages}")
+        for evaluation in response.evaluations:
+            print(f"  - {evaluation.id}: accuracy={evaluation.accuracy:.2f}%")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/examples/get_evaluation.py b/examples/get_evaluation.py
index 45d667c..a6d8fe6 100644
--- a/examples/get_evaluation.py
+++ b/examples/get_evaluation.py
@@ -10,7 +10,7 @@ async def main():
     client = AsyncStratix()
 
     # --- Get evaluation by id
-    evaluation_id = "eval_123"
+    evaluation_id = "699f1426c1212b2d9c78e947"
     evaluation = await client.evaluations.get_by_id(evaluation_id)
     print(f"Found evaluation {evaluation.id}")
     print(evaluation)
diff --git a/examples/manage_project_models_benchmarks.py b/examples/manage_project_models_benchmarks.py
new file mode 100644
index 0000000..0067051
--- /dev/null
+++ b/examples/manage_project_models_benchmarks.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import Stratix
+
+
+def main():
+    # Build the client; the API key comes from the environment unless passed inline
+    client = Stratix()
+
+    # --- Attach public models to the project
+    added = client.models.add("model-id-1", "model-id-2")
+    print(f"Add models: {'success' if added else 'failed'}")
+
+    # --- Detach one of those models again
+    removed = client.models.remove("model-id-1")
+    print(f"Remove model: {'success' if removed else 'failed'}")
+
+    # --- Attach a public benchmark to the project
+    added = client.benchmarks.add("benchmark-id-1")
+    print(f"Add benchmark: {'success' if added else 'failed'}")
+
+    # --- Detach that benchmark again
+    removed = client.benchmarks.remove("benchmark-id-1")
+    print(f"Remove benchmark: {'success' if removed else 'failed'}")
+
+    # --- Show what the project contains now
+    project_models = client.models.get()
+    if project_models:
+        print(f"\nModels in project ({len(project_models)}):")
+        for model in project_models:
+            print(f"  - {model.name} (id={model.id})")
+
+    project_benchmarks = client.benchmarks.get()
+    if project_benchmarks:
+        print(f"\nBenchmarks in project ({len(project_benchmarks)}):")
+        for benchmark in project_benchmarks:
+            print(f"  - {benchmark.name} (id={benchmark.id})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/public_benchmarks.py b/examples/public_benchmarks.py
new file mode 100644
index 0000000..396c0d9
--- /dev/null
+++ b/examples/public_benchmarks.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import PublicClient
+
+
+def main():
+    # Construct public client (API key from LAYERLENS_STRATIX_API_KEY env var or inline)
+    client = PublicClient()
+
+    # --- Browse all public benchmarks
+    response = client.benchmarks.get(page=1, page_size=10)
+    print(f"Found {response.total_count} public benchmarks (showing first {len(response.datasets)})")
+    for benchmark in response.datasets:
+        print(f"  - {benchmark.name} (prompts={benchmark.prompt_count}, language={benchmark.language})")
+
+    # --- Filter by language
+    response = client.benchmarks.get(languages=["English"])
+    print(f"\nFound {response.total_count} English benchmarks")
+
+    # --- Show the categories returned with the language-filtered response above
+    if response.categories:
+        print(f"\nAvailable categories: {response.categories}")
+
+    # --- Search by name
+    response = client.benchmarks.get(query="mmlu")
+    print(f"\nFound {response.total_count} benchmarks matching 'mmlu'")
+    for benchmark in response.datasets:
+        print(f"  - {benchmark.name}: {benchmark.description[:80] if benchmark.description else 'N/A'}...")
+
+    # --- Get one page of prompts (5) for the first search hit
+    if response.datasets:
+        benchmark = response.datasets[0]
+        print(f"\nFetching prompts for '{benchmark.name}' (id={benchmark.id})...")
+
+        prompts_response = client.benchmarks.get_prompts(
+            benchmark.id,
+            page=1,
+            page_size=5,
+        )
+
+        if prompts_response:
+            print(f"Total prompts: {prompts_response.data.count}")
+            print(f"Showing first {len(prompts_response.data.prompts)} prompts:")
+            for prompt in prompts_response.data.prompts:
+                input_preview = str(prompt.input)[:80]  # input may be a str, list or dict, hence str()
+                truth_preview = prompt.truth[:50] if prompt.truth else "N/A"
+                print(f"  - Input: {input_preview}...")
+                print(f"    Truth: {truth_preview}")
+                print()
+
+    # --- Get all prompts (auto-paginates)
+    if response.datasets:
+        benchmark = response.datasets[0]  # same first search hit as above
+        print(f"Fetching ALL prompts for '{benchmark.name}'...")
+        all_prompts = client.benchmarks.get_all_prompts(benchmark.id)
+        print(f"Retrieved {len(all_prompts)} total prompts")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/public_evaluations.py b/examples/public_evaluations.py
new file mode 100644
index 0000000..f28236f
--- /dev/null
+++ b/examples/public_evaluations.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import PublicClient
+from layerlens.models import EvaluationStatus
+
+
+def main():
+    # Construct public client (API key from LAYERLENS_STRATIX_API_KEY env var or inline)
+    client = PublicClient()
+
+    # --- Get a specific evaluation by ID
+    evaluation_id = "699f1426c1212b2d9c78e947"  # example ID; substitute the evaluation you want to inspect
+    evaluation = client.evaluations.get_by_id(evaluation_id)
+    if evaluation:
+        print(f"Evaluation: {evaluation.id}")
+        print(f"  Model: {evaluation.model_name} ({evaluation.model_company})")
+        print(f"  Benchmark: {evaluation.benchmark_name}")
+        print(f"  Status: {evaluation.status.value}")
+        print(f"  Accuracy: {evaluation.accuracy:.2f}%")
+
+        if evaluation.summary:  # summary is optional on Evaluation, as are its nested sections
+            print(f"  Summary: {evaluation.summary.name}")
+            print(f"  Goal: {evaluation.summary.goal}")
+            if evaluation.summary.metrics:
+                print(f"  Metrics: {', '.join(m.name for m in evaluation.summary.metrics)}")
+            if evaluation.summary.performance_details:
+                print(f"  Strengths: {evaluation.summary.performance_details.strengths}")
+            if evaluation.summary.analysis_summary:
+                print(f"  Key takeaways: {evaluation.summary.analysis_summary.key_takeaways}")
+    else:
+        print(f"Evaluation {evaluation_id} not found")
+
+    # --- List evaluations for a specific organization/project
+    organization_id = "683e63925ef7e1c53c1f4b28"  # example ID; replace with your organization
+    project_id = "683e63925ef7e1c53c1f4b29"  # example ID; replace with your project
+
+    response = client.evaluations.get_many(
+        organization_id=organization_id,
+        project_id=project_id,
+        page=1,
+        page_size=5,
+        sort_by="submittedAt",
+        order="desc",
+    )
+    if response:
+        print(f"\nLatest evaluations ({response.pagination.total_count} total):")
+        for e in response.evaluations:
+            print(f"  - {e.id}: {e.model_name} on {e.benchmark_name} -> {e.accuracy:.2f}% ({e.status.value})")
+
+    # --- Filter by status (only successful), best accuracy first
+    response = client.evaluations.get_many(
+        organization_id=organization_id,
+        project_id=project_id,
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+        page_size=5,
+    )
+    if response:
+        print(f"\nTop successful evaluations ({response.pagination.total_count} total):")
+        for e in response.evaluations:
+            print(f"  - {e.model_name}: {e.accuracy:.2f}%")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/public_models.py b/examples/public_models.py
new file mode 100644
index 0000000..7d86550
--- /dev/null
+++ b/examples/public_models.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env -S poetry run python
+
+from layerlens import PublicClient
+
+
+def main():
+    # Construct public client (API key from LAYERLENS_STRATIX_API_KEY env var or inline)
+    client = PublicClient()
+
+    # --- Browse all public models (first page)
+    response = client.models.get(page=1, page_size=10)
+    print(f"Found {response.total_count} public models (showing first {len(response.models)})")
+    for model in response.models:
+        print(f"  - {model.name} ({model.company})")
+
+    # --- Search models by query
+    response = client.models.get(query="gpt")
+    print(f"\nFound {response.total_count} models matching 'gpt'")
+    for model in response.models:
+        print(f"  - {model.name}")
+
+    # --- Filter by company
+    companies = ["OpenAI", "Anthropic"]
+    response = client.models.get(companies=companies)
+    print(f"\nFound {response.total_count} models from {companies}")
+    for model in response.models:
+        print(f"  - {model.name} ({model.company})")
+
+    # --- Filter by region
+    response = client.models.get(regions=["usa"])
+    print(f"\nFound {response.total_count} models in region 'usa'")
+
+    # --- Filter by category
+    response = client.models.get(categories=["open-source"])
+    print(f"\nFound {response.total_count} open-source models")
+
+    # --- Sort by release date (newest first)
+    response = client.models.get(sort_by="releasedAt", order="desc", page_size=5)
+    print(f"\nNewest {len(response.models)} models:")  # report the number returned; it can be below page_size
+    for model in response.models:
+        print(f"  - {model.name} (released_at={model.released_at})")
+
+    # --- Include deprecated models
+    response = client.models.get(include_deprecated=True)
+    print(f"\nTotal models (including deprecated): {response.total_count}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 1d08dfa..ef86a87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "layerlens"
-version = "1.1.1"
+version = "1.2.0"
 description = "The official Python library for the LayerLens Stratix API"
 license = "Apache-2.0"
 authors = [{ name = "LayerLens", email = "support@layerlens.ai" }]
diff --git a/src/layerlens/__init__.py b/src/layerlens/__init__.py
index a5fcc80..78a69f9 100644
--- a/src/layerlens/__init__.py
+++ b/src/layerlens/__init__.py
@@ -3,15 +3,23 @@
     Trace,
     JudgeVersion,
     JudgeSnapshot,
+    BenchmarkPrompt,
     TraceEvaluation,
+    ComparisonResult,
+    PublicModelDetail,
+    ComparisonResponse,
     OptimizationBudget,
     TraceEvaluationStep,
     JudgeOptimizationRun,
     TraceWithEvaluations,
     OptimizationRunStatus,
+    PublicBenchmarkDetail,
     TraceEvaluationResult,
     TraceEvaluationStatus,
     TraceEvaluationSummary,
+    BenchmarkPromptsResponse,
+    PublicModelsListResponse,
+    PublicBenchmarksListResponse,
 )
 from ._client import Atlas, Client, Stratix, AsyncAtlas, AsyncClient, AsyncStratix
 from ._exceptions import (
@@ -22,17 +30,23 @@
     BadRequestError,
     AuthenticationError,
 )
+from ._public_client import PublicClient, AsyncPublicClient
 
 __all__ = [
     "APIError",
     "AsyncAtlas",
     "AsyncClient",
+    "AsyncPublicClient",
     "AsyncStratix",
     "Atlas",
     "AtlasError",
     "AuthenticationError",
     "BadRequestError",
+    "BenchmarkPrompt",
+    "BenchmarkPromptsResponse",
     "Client",
+    "ComparisonResult",
+    "ComparisonResponse",
     "Judge",
     "JudgeOptimizationRun",
     "JudgeSnapshot",
@@ -40,6 +54,11 @@
     "NotFoundError",
     "OptimizationBudget",
     "OptimizationRunStatus",
+    "PublicBenchmarkDetail",
+    "PublicBenchmarksListResponse",
+    "PublicClient",
+    "PublicModelDetail",
+    "PublicModelsListResponse",
     "Stratix",
     "StratixError",
     "Trace",
diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py
index 9599ab9..e146f29 100644
--- a/src/layerlens/_client.py
+++ b/src/layerlens/_client.py
@@ -17,6 +17,7 @@
 from ._base_client import BaseClient, BaseAsyncClient
 
 if TYPE_CHECKING:
+    from ._public_client import PublicClient, AsyncPublicClient
     from .resources.judges import Judges, AsyncJudges
     from .resources.models import Models, AsyncModels
     from .resources.traces import Traces, AsyncTraces
@@ -124,6 +125,12 @@ def trace_evaluations(self) -> TraceEvaluations:
 
         return TraceEvaluations(self)
 
+    @cached_property
+    def public(self) -> PublicClient:  # built on first access, then cached; reuses this client's key/URL/timeout
+        from ._public_client import PublicClient
+
+        return PublicClient(api_key=self.api_key, base_url=str(self.base_url), timeout=self.timeout)
+
     @property
     @override
     def auth_headers(self) -> dict[str, str]:
@@ -287,6 +294,12 @@ def trace_evaluations(self) -> AsyncTraceEvaluations:
 
         return AsyncTraceEvaluations(self)
 
+    @cached_property
+    def public(self) -> AsyncPublicClient:  # built on first access, then cached; reuses this client's key/URL/timeout
+        from ._public_client import AsyncPublicClient
+
+        return AsyncPublicClient(api_key=self.api_key, base_url=str(self.base_url), timeout=self.timeout)
+
     @property
     @override
     def auth_headers(self) -> dict[str, str]:
diff --git a/src/layerlens/_public_client.py b/src/layerlens/_public_client.py
new file mode 100644
index 0000000..fb04979
--- /dev/null
+++ b/src/layerlens/_public_client.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+import os
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, Union, Mapping
+from functools import cached_property
+from typing_extensions import Self, override
+
+import httpx
+
+from . import _exceptions
+from ._utils import is_mapping
+from ._constants import DEFAULT_TIMEOUT
+from ._exceptions import APIStatusError
+from ._base_client import BaseClient, BaseAsyncClient
+
+if TYPE_CHECKING:
+    from .resources.comparisons import Comparisons, AsyncComparisons
+    from .resources.public_models import PublicModelsResource, AsyncPublicModelsResource
+    from .resources.public_benchmarks import PublicBenchmarksResource, AsyncPublicBenchmarksResource
+    from .resources.public_evaluations import PublicEvaluationsResource, AsyncPublicEvaluationsResource
+
+
+__all__ = ["PublicClient", "AsyncPublicClient"]
+
+
+def _make_status_error(
+    err_msg: str,
+    *,
+    body: object,
+    response: httpx.Response,
+) -> APIStatusError:
+    """Return the APIStatusError subclass matching ``response.status_code``.
+
+    A mapping ``body`` is narrowed to its ``"error"`` entry when present; an
+    unmapped status below 500 yields a plain APIStatusError.
+    """
+    payload = body.get("error", body) if is_mapping(body) else body
+    code = response.status_code
+
+    error_types: dict[int, type[APIStatusError]] = {
+        HTTPStatus.BAD_REQUEST: _exceptions.BadRequestError,
+        HTTPStatus.UNAUTHORIZED: _exceptions.AuthenticationError,
+        HTTPStatus.FORBIDDEN: _exceptions.PermissionDeniedError,
+        HTTPStatus.NOT_FOUND: _exceptions.NotFoundError,
+        HTTPStatus.CONFLICT: _exceptions.ConflictError,
+        HTTPStatus.UNPROCESSABLE_ENTITY: _exceptions.UnprocessableEntityError,
+        HTTPStatus.TOO_MANY_REQUESTS: _exceptions.RateLimitError,
+    }
+    # Every 5xx status shares InternalServerError; anything else unmapped stays generic.
+    fallback = _exceptions.InternalServerError if code >= HTTPStatus.INTERNAL_SERVER_ERROR else APIStatusError
+    return error_types.get(code, fallback)(err_msg, response=response, body=payload)
+
+
+class PublicClient(BaseClient):
+    """Synchronous client for the public LayerLens API endpoints (models, benchmarks, comparisons, evaluations)."""
+
+    api_key: str
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        base_url: str | httpx.URL | None = None,
+        timeout: Union[float, httpx.Timeout, None] = DEFAULT_TIMEOUT,
+    ) -> None:
+        if api_key is None:
+            api_key = os.environ.get("LAYERLENS_STRATIX_API_KEY")
+        if api_key is None or api_key == "":  # an empty key is rejected like a missing one
+            raise _exceptions.StratixError(
+                "The api_key client option must be set either by passing api_key to the client or by setting the LAYERLENS_STRATIX_API_KEY environment variable",
+            )
+        self.api_key = api_key
+
+        if base_url is None:  # the STRATIX variable is consulted before the ATLAS one
+            base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
+        if base_url is None:
+            base_url = "https://api.layerlens.ai/api/v1"
+
+        super().__init__(base_url=base_url, timeout=timeout)
+
+    @cached_property
+    def models(self) -> PublicModelsResource:
+        from .resources.public_models import PublicModelsResource
+
+        return PublicModelsResource(self)
+
+    @cached_property
+    def benchmarks(self) -> PublicBenchmarksResource:
+        from .resources.public_benchmarks import PublicBenchmarksResource
+
+        return PublicBenchmarksResource(self)
+
+    @cached_property
+    def comparisons(self) -> Comparisons:
+        from .resources.comparisons import Comparisons
+
+        return Comparisons(self)
+
+    @cached_property
+    def evaluations(self) -> PublicEvaluationsResource:
+        from .resources.public_evaluations import PublicEvaluationsResource
+
+        return PublicEvaluationsResource(self)
+
+    @property
+    @override
+    def auth_headers(self) -> dict[str, str]:
+        return {"x-api-key": self.api_key}
+
+    def copy(
+        self,
+        *,
+        api_key: str | None = None,
+        base_url: str | httpx.URL | None = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+        _extra_kwargs: Mapping[str, Any] = {},
+    ) -> Self:
+        """Return a new client of the same class; options left unset are inherited from this one."""
+        # The untouched DEFAULT_TIMEOUT means "not passed"; any explicit value, including None, takes effect.
+        new_timeout = self.timeout if timeout is DEFAULT_TIMEOUT else timeout
+        return self.__class__(
+            api_key=api_key or self.api_key, base_url=base_url or self.base_url, timeout=new_timeout, **_extra_kwargs
+        )
+
+    with_options = copy
+
+    @override
+    def _make_status_error(
+        self,
+        err_msg: str,
+        *,
+        body: object,
+        response: httpx.Response,
+    ) -> APIStatusError:
+        return _make_status_error(err_msg, body=body, response=response)
+
+
+class AsyncPublicClient(BaseAsyncClient):
+    """Async client for the public LayerLens API endpoints (models, benchmarks, comparisons, evaluations)."""
+
+    api_key: str
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        base_url: str | httpx.URL | None = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> None:
+        if api_key is None:
+            api_key = os.environ.get("LAYERLENS_STRATIX_API_KEY")
+        if api_key is None or api_key == "":  # an empty key is rejected like a missing one
+            raise _exceptions.StratixError(
+                "The api_key client option must be set either by passing api_key to the client or by setting the LAYERLENS_STRATIX_API_KEY environment variable",
+            )
+        self.api_key = api_key
+
+        if base_url is None:  # the STRATIX variable is consulted before the ATLAS one
+            base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
+        if base_url is None:
+            base_url = "https://api.layerlens.ai/api/v1"
+
+        super().__init__(base_url=base_url, timeout=timeout)
+
+    @cached_property
+    def models(self) -> AsyncPublicModelsResource:
+        from .resources.public_models import AsyncPublicModelsResource
+
+        return AsyncPublicModelsResource(self)
+
+    @cached_property
+    def benchmarks(self) -> AsyncPublicBenchmarksResource:
+        from .resources.public_benchmarks import AsyncPublicBenchmarksResource
+
+        return AsyncPublicBenchmarksResource(self)
+
+    @cached_property
+    def comparisons(self) -> AsyncComparisons:
+        from .resources.comparisons import AsyncComparisons
+
+        return AsyncComparisons(self)
+
+    @cached_property
+    def evaluations(self) -> AsyncPublicEvaluationsResource:
+        from .resources.public_evaluations import AsyncPublicEvaluationsResource
+
+        return AsyncPublicEvaluationsResource(self)
+
+    @property
+    @override
+    def auth_headers(self) -> dict[str, str]:
+        return {"x-api-key": self.api_key}
+
+    def copy(
+        self,
+        *,
+        api_key: str | None = None,
+        base_url: str | httpx.URL | None = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+        _extra_kwargs: Mapping[str, Any] = {},
+    ) -> Self:
+        """Return a new client of the same class; options left unset are inherited from this one."""
+        # The untouched DEFAULT_TIMEOUT means "not passed"; any explicit value, including None, takes effect.
+        new_timeout = self.timeout if timeout is DEFAULT_TIMEOUT else timeout
+        return self.__class__(
+            api_key=api_key or self.api_key, base_url=base_url or self.base_url, timeout=new_timeout, **_extra_kwargs
+        )
+
+    with_options = copy
+
+    def _make_status_error(
+        self,
+        err_msg: str,
+        *,
+        body: object,
+        response: httpx.Response,
+    ) -> APIStatusError:
+        return _make_status_error(err_msg, body=body, response=response)
diff --git a/src/layerlens/_resource.py b/src/layerlens/_resource.py
index 973ae62..3606c97 100644
--- a/src/layerlens/_resource.py
+++ b/src/layerlens/_resource.py
@@ -6,6 +6,7 @@
 
 if TYPE_CHECKING:
     from ._client import Stratix, AsyncStratix
+    from ._public_client import PublicClient, AsyncPublicClient
 
 
 class SyncAPIResource:
@@ -34,3 +35,25 @@ def __init__(self, client: AsyncStratix) -> None:
 
     async def _sleep(self, seconds: float) -> None:
         await asyncio.sleep(seconds)
+
+
+class SyncPublicAPIResource:  # base class for resources attached to a synchronous PublicClient
+    _client: PublicClient
+
+    def __init__(self, client: PublicClient) -> None:
+        self._client = client
+        self._get = client.get_cast  # the only request helper bound here; no write helpers are exposed
+
+    def _sleep(self, seconds: float) -> None:
+        time.sleep(seconds)  # blocks the calling thread
+
+
+class AsyncPublicAPIResource:  # base class for resources attached to an AsyncPublicClient
+    _client: AsyncPublicClient
+
+    def __init__(self, client: AsyncPublicClient) -> None:
+        self._client = client
+        self._get = client.get_cast  # the only request helper bound here; no write helpers are exposed
+
+    async def _sleep(self, seconds: float) -> None:
+        await asyncio.sleep(seconds)  # suspends this coroutine without blocking the event loop
diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py
index a82b376..c68196d 100644
--- a/src/layerlens/_version.py
+++ b/src/layerlens/_version.py
@@ -1 +1 @@
-__version__ = "1.1.1"
+__version__ = "1.2.0"
diff --git a/src/layerlens/models/__init__.py b/src/layerlens/models/__init__.py
index 85c1da0..26bcfb9 100644
--- a/src/layerlens/models/__init__.py
+++ b/src/layerlens/models/__init__.py
@@ -8,12 +8,14 @@
     UploadURLResponse,
     BenchmarksResponse,
     CreateJudgeResponse,
+    CreateModelResponse,
     DeleteJudgeResponse,
     EvaluationsResponse,
     UpdateJudgeResponse,
     CostEstimateResponse,
     CreateTracesResponse,
     OrganizationResponse,
+    CreateBenchmarkResponse,
     TraceEvaluationsResponse,
     CreateEvaluationsResponse,
     JudgeOptimizationRunsResponse,
@@ -25,8 +27,31 @@
 from .judge import Judge, JudgeVersion
 from .model import Model, CustomModel, PublicModel
 from .trace import Trace, TraceWithEvaluations, TraceEvaluationSummary
+from .public import (
+    BenchmarkPrompt,
+    ComparisonResult,
+    PublicModelDetail,
+    ComparisonResponse,
+    BenchmarkPromptsData,
+    PublicBenchmarkDetail,
+    BenchmarkPromptsResponse,
+    PublicModelsListResponse,
+    PublicBenchmarksListResponse,
+)
 from .benchmark import Benchmark, CustomBenchmark, PublicBenchmark
-from .evaluation import Result, Evaluation, EvaluationStatus
+from .evaluation import (
+    Result,
+    Evaluation,
+    ErrorAnalysis,
+    AnalysisSummary,
+    EvaluationMetric,
+    EvaluationStatus,
+    EvaluationDataset,
+    EvaluationSummary,
+    EvaluationTaskType,
+    PerformanceDetails,
+    EvaluationModelInfo,
+)
 from .organization import Project, Organization
 from .trace_evaluation import (
     JudgeSnapshot,
@@ -44,9 +69,16 @@
 __all__ = [
     "ApplyJudgeOptimizationResultResponse",
     "Benchmark",
+    "BenchmarkPrompt",
+    "BenchmarkPromptsData",
+    "BenchmarkPromptsResponse",
     "BenchmarksResponse",
+    "ComparisonResult",
+    "ComparisonResponse",
     "CostEstimateResponse",
+    "CreateBenchmarkResponse",
     "CreateEvaluationsResponse",
+    "CreateModelResponse",
     "CreateJudgeOptimizationRunResponse",
     "CreateJudgeResponse",
     "CreateTracesResponse",
@@ -54,8 +86,15 @@
     "CustomModel",
     "DeleteJudgeResponse",
     "EstimateJudgeOptimizationCostResponse",
+    "AnalysisSummary",
+    "ErrorAnalysis",
     "Evaluation",
+    "EvaluationDataset",
+    "EvaluationMetric",
+    "EvaluationModelInfo",
     "EvaluationStatus",
+    "EvaluationSummary",
+    "EvaluationTaskType",
     "EvaluationsResponse",
     "Judge",
     "JudgeOptimizationRun",
@@ -70,9 +109,14 @@
     "Organization",
     "OrganizationResponse",
     "Pagination",
+    "PerformanceDetails",
     "Project",
     "PublicBenchmark",
+    "PublicBenchmarkDetail",
+    "PublicBenchmarksListResponse",
     "PublicModel",
+    "PublicModelDetail",
+    "PublicModelsListResponse",
     "Result",
     "ResultMetrics",
     "ResultsResponse",
diff --git a/src/layerlens/models/api.py b/src/layerlens/models/api.py
index 398ebe4..bb36468 100644
--- a/src/layerlens/models/api.py
+++ b/src/layerlens/models/api.py
@@ -95,6 +95,18 @@ class UploadURLResponse(BaseModel):
     url: str
 
 
+class CreateBenchmarkResponse(BaseModel):  # presumably the payload of the benchmark-creation call — confirm
+    organization_id: str
+    project_id: str
+    benchmark_id: str
+
+
+class CreateModelResponse(BaseModel):  # presumably the payload of the model-creation call — confirm
+    organization_id: str
+    project_id: str
+    model_id: str
+
+
 class CreateTracesResponse(BaseModel):
     trace_ids: List[str]
 
diff --git a/src/layerlens/models/evaluation.py b/src/layerlens/models/evaluation.py
index 651c60b..859d09f 100644
--- a/src/layerlens/models/evaluation.py
+++ b/src/layerlens/models/evaluation.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 from datetime import timedelta
 
 import httpx
@@ -22,17 +22,76 @@ class EvaluationStatus(str, Enum):
     CANCELLED = "cancelled"
 
 
+class EvaluationMetric(BaseModel):  # element type of EvaluationSummary.metrics
+    name: str
+    description: str = ""  # optional in the payload; empty when omitted
+
+
+class EvaluationTaskType(BaseModel):  # element type of EvaluationSummary.task_types
+    name: str
+    description: str = ""  # optional in the payload; empty when omitted
+
+
+class EvaluationDataset(BaseModel):  # type of EvaluationSummary.dataset; every field has a default
+    total_size: int = 0
+    training_size: int = 0
+    test_size: int = 0
+    characteristics: List[str] = []
+
+
+class EvaluationModelInfo(BaseModel):  # type of EvaluationSummary.model
+    model_name: str = ""
+    performance: Any = None  # deliberately untyped: any value is accepted as-is
+
+
+class PerformanceDetails(BaseModel):  # type of EvaluationSummary.performance_details
+    strengths: List[str] = []
+    challenges: List[str] = []
+
+
+class ErrorAnalysis(BaseModel):  # type of EvaluationSummary.error_analysis
+    common_failure_modes: List[str] = []
+    example: str = ""
+
+
+class AnalysisSummary(BaseModel):  # type of EvaluationSummary.analysis_summary
+    key_takeaways: List[str] = []
+
+
+class EvaluationSummary(BaseModel):  # type of the optional Evaluation.summary field
+    name: str = ""
+    goal: str = ""
+    metrics: List[EvaluationMetric] = []
+    task_types: List[EvaluationTaskType] = []
+    dataset: Optional[EvaluationDataset] = None  # nested sections are None when absent
+    model: Optional[EvaluationModelInfo] = None
+    performance_details: Optional[PerformanceDetails] = None
+    error_analysis: Optional[ErrorAnalysis] = None
+    analysis_summary: Optional[AnalysisSummary] = None
+
+
 class Evaluation(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     id: str
     status: EvaluationStatus
+    status_description: str = ""
     submitted_at: int
     finished_at: int
     model_id: str
+    model_name: str = ""
+    model_key: str = ""
+    model_company: str = ""
     benchmark_id: str = Field(..., alias="dataset_id")
+    benchmark_name: str = Field("", alias="dataset_name")
     average_duration: int
     accuracy: float
+    readability_score: float = 0.0
+    toxicity_score: float = 0.0
+    ethics_score: float = 0.0
+    failed_prompt_count: int = 0
+    queue_id: int = 0
+    summary: Optional[EvaluationSummary] = None
 
     _client: "Optional[Stratix | AsyncStratix]" = None
 
@@ -134,9 +193,15 @@ def wait_for_completion(
         )
         if evaluation:
             self.status = evaluation.status
+            self.status_description = evaluation.status_description
             self.finished_at = evaluation.finished_at
             self.average_duration = evaluation.average_duration
             self.accuracy = evaluation.accuracy
+            self.readability_score = evaluation.readability_score
+            self.toxicity_score = evaluation.toxicity_score
+            self.ethics_score = evaluation.ethics_score
+            self.failed_prompt_count = evaluation.failed_prompt_count
+            self.summary = evaluation.summary
 
         return self
 
@@ -156,9 +221,15 @@ async def wait_for_completion_async(
         )
         if evaluation:
             self.status = evaluation.status
+            self.status_description = evaluation.status_description
             self.finished_at = evaluation.finished_at
             self.average_duration = evaluation.average_duration
             self.accuracy = evaluation.accuracy
+            self.readability_score = evaluation.readability_score
+            self.toxicity_score = evaluation.toxicity_score
+            self.ethics_score = evaluation.ethics_score
+            self.failed_prompt_count = evaluation.failed_prompt_count
+            self.summary = evaluation.summary
 
         return self
 
diff --git a/src/layerlens/models/public.py b/src/layerlens/models/public.py
new file mode 100644
index 0000000..5905a09
--- /dev/null
+++ b/src/layerlens/models/public.py
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Union, Optional
+
+from pydantic import BaseModel, ConfigDict
+
+
+class PublicModelDetail(BaseModel):  # element type of PublicModelsListResponse.models
+    model_config = ConfigDict(populate_by_name=True)  # no field below declares an alias
+
+    id: str
+    key: str
+    name: str
+    description: Optional[str] = None
+    company: Optional[str] = None
+    created_at: Optional[Union[int, str]] = None  # accepted as either an int or a str
+    released_at: Optional[Union[int, str]] = None  # accepted as either an int or a str
+    parameters: Optional[float] = None  # NOTE(review): unit not visible here — confirm
+    modality: Optional[str] = None
+    context_length: Optional[int] = None
+    architecture_type: Optional[str] = None
+    license: Optional[str] = None
+    open_weights: Optional[bool] = None
+    region: Optional[str] = None
+    key_takeaways: Optional[List[str]] = None
+    deprecated: Optional[bool] = None
+    cost_per_input_token: Optional[str] = None  # costs are carried as strings, not numbers
+    cost_per_output_token: Optional[str] = None
+
+
+class PublicModelsListResponse(BaseModel):  # shape the public_models example reads from client.models.get()
+    models: List[PublicModelDetail]
+    categories: List[str] = []
+    count: int  # presumably the number of entries in this page — confirm
+    total_count: int  # printed by the examples as the overall number of matches
+
+
+class PublicBenchmarkDetail(BaseModel):  # element type of PublicBenchmarksListResponse.datasets
+    model_config = ConfigDict(populate_by_name=True)  # no field below declares an alias
+
+    id: str
+    key: str
+    name: str
+    description: Optional[str] = None
+    created_at: Optional[Union[int, str]] = None  # accepted as either an int or a str
+    prompt_count: Optional[int] = None
+    language: Optional[str] = None
+    categories: Optional[List[str]] = None
+    characteristics: Optional[List[str]] = None
+    deprecated: Optional[bool] = None
+    is_public: Optional[bool] = None
+
+
+class PublicBenchmarksListResponse(BaseModel):  # shape the public_benchmarks example reads from benchmarks.get()
+    datasets: List[PublicBenchmarkDetail]  # benchmarks are listed under the name "datasets"
+    categories: List[str] = []
+    count: int  # presumably the number of entries in this page — confirm
+    total_count: int  # printed by the examples as the overall number of matches
+
+
+class BenchmarkPrompt(BaseModel):  # element type of BenchmarkPromptsData.prompts
+    id: str
+    input: Union[str, List[Dict[str, Any]], Dict[str, Any]]  # plain text, a list of dicts, or a single dict
+    truth: str  # required; a payload without it fails validation
+
+
+class BenchmarkPromptsData(BaseModel):  # type of BenchmarkPromptsResponse.data
+    prompts: List[BenchmarkPrompt]
+    count: int  # printed by the public_benchmarks example as "Total prompts"
+
+
+class BenchmarkPromptsResponse(BaseModel):  # envelope: a status string plus the prompt page under "data"
+    status: str
+    data: BenchmarkPromptsData
+
+
+class ComparisonResult(BaseModel):  # element type of ComparisonResponse.results
+    result_id_1: Optional[int] = None  # NOTE(review): the 1/2 suffixes presumably name the two compared sides — confirm
+    result_id_2: Optional[int] = None
+    prompt: str
+    truth: str
+    result1: Optional[str] = None  # either side's result and score may be absent
+    score1: Optional[float] = None
+    result2: Optional[str] = None
+    score2: Optional[float] = None
+
+
+class ComparisonResponse(BaseModel):  # per-prompt comparison rows plus aggregate counts for sides 1 and 2
+    results: Optional[List[ComparisonResult]] = None  # may be None rather than an empty list
+    total_count: int
+    correct_count_1: int
+    total_results_1: int
+    correct_count_2: int
+    total_results_2: int
diff --git a/src/layerlens/resources/benchmarks/benchmarks.py b/src/layerlens/resources/benchmarks/benchmarks.py
index c694f16..f8e4bf3 100644
--- a/src/layerlens/resources/benchmarks/benchmarks.py
+++ b/src/layerlens/resources/benchmarks/benchmarks.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
-from typing import List, Literal, Optional
+import os
+import mimetypes
+from typing import Any, Dict, List, Literal, Optional
 
 import httpx
 
@@ -9,10 +11,26 @@
     CustomBenchmark,
     PublicBenchmark,
     BenchmarksResponse,
+    CreateBenchmarkResponse,
 )
 from ..._resource import SyncAPIResource, AsyncAPIResource
 from ..._constants import DEFAULT_TIMEOUT
 
+MAX_UPLOAD_SIZE = 50 * 1024 * 1024  # 50 MB
+
+
+def _get_content_type(filename: str) -> str:
+    """Return the MIME type to send for *filename*; the table below wins over ``mimetypes``."""
+    known = {
+        ".jsonl": "application/jsonl",
+        ".json": "application/json",
+        ".csv": "text/csv",
+        ".parquet": "application/x-parquet",
+    }
+    ext = os.path.splitext(filename)[1].lower()
+    # mimetypes reads the host's MIME database, so consult it only for extensions not pinned above.
+    return known.get(ext) or mimetypes.guess_type(filename)[0] or "application/octet-stream"
+
 
 class Benchmarks(SyncAPIResource):
     def get(
@@ -100,6 +118,183 @@ def get_by_key(
                 return benchmark
         return None
 
+    def add(
+        self,
+        *benchmark_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Merge the given IDs with those returned by ``get()`` and PATCH the project with the result."""
+        existing = self.get(timeout=timeout) or []
+        # dict.fromkeys drops duplicates while keeping first-seen order (existing IDs first).
+        merged = dict.fromkeys([*(item.id for item in existing), *benchmark_ids])
+        return self._patch_project_benchmarks(list(merged), timeout)
+
+    def remove(
+        self,
+        *benchmark_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """PATCH the project with the IDs returned by ``get()`` minus the given ones."""
+        existing = self.get(timeout=timeout) or []
+        unwanted = frozenset(benchmark_ids)
+        kept = [entry.id for entry in existing if entry.id not in unwanted]
+        return self._patch_project_benchmarks(kept, timeout)
+
+    def _patch_project_benchmarks(
+        self,
+        dataset_ids: List[str],
+        timeout: float | httpx.Timeout | None,
+    ) -> bool:
+        """PATCH the project's "datasets" list; True when the reply is a dict containing "id"."""
+        client = self._client
+        project_url = f"/organizations/{client.organization_id}/projects/{client.project_id}"
+        payload = {"datasets": dataset_ids}
+        reply = self._patch(project_url, body=payload, timeout=timeout, cast_to=dict)
+        if not isinstance(reply, dict):
+            return False
+        return "id" in reply
+
+    def _upload_file(
+        self,
+        file_path: str,
+        benchmark_name: str,
+        timeout: float | httpx.Timeout | None,
+    ) -> str:
+        """Request an upload URL for the file, PUT its bytes there, and return its base name."""
+        file_path = os.path.abspath(file_path)
+        filename = os.path.basename(file_path)  # only the base name is sent in the request below
+        file_size = os.path.getsize(file_path)  # raises OSError when the path cannot be stat'ed
+
+        if file_size > MAX_UPLOAD_SIZE:  # rejected locally, before any request is made
+            raise ValueError(f"File size {file_size} exceeds maximum of {MAX_UPLOAD_SIZE} bytes (50 MB)")
+
+        content_type = _get_content_type(filename)
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+
+        raw_resp = self._post(  # step 1: announce the file; the reply is expected to carry a "url"
+            f"{base}/upload",
+            body={"key": benchmark_name, "filename": filename, "type": content_type, "size": file_size},
+            timeout=timeout,
+            cast_to=dict,
+        )
+        # Unwrap {"status": ..., "data": {...}} envelope if present
+        resp = raw_resp
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:
+            resp = resp["data"]
+        if not isinstance(resp, dict) or "url" not in resp:
+            raise ValueError("Failed to get upload URL")
+
+        with open(file_path, "rb") as f:
+            put_resp = httpx.put(  # step 2: direct module-level httpx call to that URL, not via self._client
+                resp["url"],
+                content=f.read(),  # the whole file is read into memory (its size was checked above)
+                headers={"Content-Type": content_type},  # same type that was declared in step 1
+                timeout=timeout if isinstance(timeout, httpx.Timeout) else httpx.Timeout(timeout),
+            )
+            put_resp.raise_for_status()  # an error status surfaces as httpx.HTTPStatusError
+
+        return filename
+
+    def create_custom(
+        self,
+        *,
+        name: str,
+        description: str,
+        file_path: str,
+        additional_metrics: Optional[List[str]] = None,
+        custom_scorer_ids: Optional[List[str]] = None,
+        input_type: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[CreateBenchmarkResponse]:
+        """Create a custom benchmark by uploading a JSONL file.
+
+        Args:
+            name: Benchmark name (max 64 characters; not checked client-side).
+            description: Benchmark description (max 280 characters; not checked client-side).
+            file_path: Path to a JSONL file with benchmark prompts; uploaded via ``_upload_file`` first.
+            additional_metrics: Optional metrics: "readability", "toxicity", "hallucination".
+            custom_scorer_ids: Optional list of custom scorer IDs (sent as "custom_scorers").
+            input_type: Optional input type: "messages" or "json_payload".
+            timeout: Timeout passed to both the upload and the creation request.
+
+        Returns:
+            CreateBenchmarkResponse built from the reply, or None when it has no "benchmark_id".
+        """
+        filename = self._upload_file(file_path, name, timeout)  # may raise; then nothing below runs
+
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+        body: Dict[str, Any] = {"name": name, "description": description, "file": filename}
+        if additional_metrics:  # None and empty values are left out of the request body
+            body["additional_metrics"] = additional_metrics
+        if custom_scorer_ids:
+            body["custom_scorers"] = custom_scorer_ids
+        if input_type:
+            body["input_type"] = input_type
+
+        resp = self._post(
+            f"{base}/custom-benchmarks",
+            body=body,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:  # unwrap status/data envelope
+            resp = resp["data"]
+        if isinstance(resp, dict) and "benchmark_id" in resp:
+            return CreateBenchmarkResponse(**resp)
+        return None
+
+    def create_smart(
+        self,
+        *,
+        name: str,
+        description: str,
+        system_prompt: str,
+        file_paths: List[str],
+        metrics: Optional[List[str]] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[CreateBenchmarkResponse]:
+        """Create a smart benchmark from uploaded files.
+
+        The platform will use AI to generate benchmark prompts from the provided files.
+
+        Args:
+            name: Benchmark name (max 256 characters; not checked client-side).
+            description: Benchmark description (max 500 characters; not checked client-side).
+            system_prompt: System prompt for benchmark generation (max 4000 characters).
+            file_paths: List of file paths to upload (1-20 files; the count is not checked client-side).
+            metrics: Optional metrics: "readability", "toxicity", "hallucination".
+            timeout: Timeout passed to every upload and to the creation request.
+
+        Returns:
+            CreateBenchmarkResponse built from the reply, or None when it has no "benchmark_id".
+        """
+        filenames = []  # NOTE(review): holds base names, so two paths sharing one collide - confirm handling
+        for fp in file_paths:  # uploaded one at a time, in order; an exception here stops everything below
+            filename = self._upload_file(fp, name, timeout)  # each file is announced with key=name
+            filenames.append(filename)
+
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+        body: Dict[str, Any] = {
+            "name": name,
+            "description": description,
+            "system_prompt": system_prompt,
+            "files": filenames,
+        }
+        if metrics:  # None or an empty list is left out of the request body
+            body["metrics"] = metrics
+
+        resp = self._post(
+            f"{base}/smart-benchmarks",
+            body=body,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:  # unwrap status/data envelope
+            resp = resp["data"]
+        if isinstance(resp, dict) and "benchmark_id" in resp:
+            return CreateBenchmarkResponse(**resp)
+        return None
+
 
 class AsyncBenchmarks(AsyncAPIResource):
     async def get(
@@ -188,3 +383,181 @@ async def get_by_key(
             if benchmark.key == key:
                 return benchmark
         return None
+
+    async def add(
+        self,
+        *benchmark_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Merge the given IDs with those returned by ``get()`` and PATCH the project with the result."""
+        existing = await self.get(timeout=timeout) or []
+        # dict.fromkeys drops duplicates while keeping first-seen order (existing IDs first).
+        merged = dict.fromkeys([*(item.id for item in existing), *benchmark_ids])
+        return await self._patch_project_benchmarks(list(merged), timeout)
+
+    async def remove(
+        self,
+        *benchmark_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """PATCH the project with the IDs returned by ``get()`` minus the given ones."""
+        existing = await self.get(timeout=timeout) or []
+        unwanted = frozenset(benchmark_ids)
+        kept = [entry.id for entry in existing if entry.id not in unwanted]
+        return await self._patch_project_benchmarks(kept, timeout)
+
+    async def _patch_project_benchmarks(
+        self,
+        dataset_ids: List[str],
+        timeout: float | httpx.Timeout | None,
+    ) -> bool:
+        """PATCH the project's "datasets" list; True when the reply is a dict containing "id"."""
+        client = self._client
+        project_url = f"/organizations/{client.organization_id}/projects/{client.project_id}"
+        payload = {"datasets": dataset_ids}
+        reply = await self._patch(project_url, body=payload, timeout=timeout, cast_to=dict)
+        if not isinstance(reply, dict):
+            return False
+        return "id" in reply
+
+    async def _upload_file(
+        self,
+        file_path: str,
+        benchmark_name: str,
+        timeout: float | httpx.Timeout | None,
+    ) -> str:
+        """Request an upload URL for the file, PUT its bytes there, and return its base name."""
+        file_path = os.path.abspath(file_path)
+        filename = os.path.basename(file_path)  # only the base name is sent in the request below
+        file_size = os.path.getsize(file_path)  # raises OSError when the path cannot be stat'ed
+
+        if file_size > MAX_UPLOAD_SIZE:  # rejected locally, before any request is made
+            raise ValueError(f"File size {file_size} exceeds maximum of {MAX_UPLOAD_SIZE} bytes (50 MB)")
+
+        content_type = _get_content_type(filename)
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+
+        raw_resp = await self._post(  # step 1: announce the file; the reply is expected to carry a "url"
+            f"{base}/upload",
+            body={"key": benchmark_name, "filename": filename, "type": content_type, "size": file_size},
+            timeout=timeout,
+            cast_to=dict,
+        )
+        # Unwrap {"status": ..., "data": {...}} envelope if present
+        resp = raw_resp
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:
+            resp = resp["data"]
+        if not isinstance(resp, dict) or "url" not in resp:
+            raise ValueError("Failed to get upload URL")
+
+        async with httpx.AsyncClient() as http:  # step 2: a fresh throwaway client, not self._client
+            with open(file_path, "rb") as f:  # NOTE(review): open()/read() are blocking calls in a coroutine
+                put_resp = await http.put(
+                    resp["url"],
+                    content=f.read(),  # the whole file is read into memory (its size was checked above)
+                    headers={"Content-Type": content_type},  # same type that was declared in step 1
+                    timeout=timeout if isinstance(timeout, httpx.Timeout) else httpx.Timeout(timeout),
+                )
+                put_resp.raise_for_status()  # an error status surfaces as httpx.HTTPStatusError
+
+        return filename
+
+    async def create_custom(
+        self,
+        *,
+        name: str,
+        description: str,
+        file_path: str,
+        additional_metrics: Optional[List[str]] = None,
+        custom_scorer_ids: Optional[List[str]] = None,
+        input_type: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[CreateBenchmarkResponse]:
+        """Create a custom benchmark by uploading a JSONL file.
+
+        Args:
+            name: Benchmark name (max 64 characters; not checked client-side).
+            description: Benchmark description (max 280 characters; not checked client-side).
+            file_path: Path to a JSONL file with benchmark prompts; uploaded via ``_upload_file`` first.
+            additional_metrics: Optional metrics: "readability", "toxicity", "hallucination".
+            custom_scorer_ids: Optional list of custom scorer IDs (sent as "custom_scorers").
+            input_type: Optional input type: "messages" or "json_payload".
+            timeout: Timeout passed to both the upload and the creation request.
+
+        Returns:
+            CreateBenchmarkResponse built from the reply, or None when it has no "benchmark_id".
+        """
+        filename = await self._upload_file(file_path, name, timeout)  # may raise; then nothing below runs
+
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+        body: Dict[str, Any] = {"name": name, "description": description, "file": filename}
+        if additional_metrics:  # None and empty values are left out of the request body
+            body["additional_metrics"] = additional_metrics
+        if custom_scorer_ids:
+            body["custom_scorers"] = custom_scorer_ids
+        if input_type:
+            body["input_type"] = input_type
+
+        resp = await self._post(
+            f"{base}/custom-benchmarks",
+            body=body,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:  # unwrap status/data envelope
+            resp = resp["data"]
+        if isinstance(resp, dict) and "benchmark_id" in resp:
+            return CreateBenchmarkResponse(**resp)
+        return None
+
+    async def create_smart(
+        self,
+        *,
+        name: str,
+        description: str,
+        system_prompt: str,
+        file_paths: List[str],
+        metrics: Optional[List[str]] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[CreateBenchmarkResponse]:
+        """Create a smart benchmark from uploaded files.
+
+        The platform will use AI to generate benchmark prompts from the provided files.
+
+        Args:
+            name: Benchmark name (max 256 characters; not checked client-side).
+            description: Benchmark description (max 500 characters; not checked client-side).
+            system_prompt: System prompt for benchmark generation (max 4000 characters).
+            file_paths: List of file paths to upload (1-20 files; the count is not checked client-side).
+            metrics: Optional metrics: "readability", "toxicity", "hallucination".
+            timeout: Timeout passed to every upload and to the creation request.
+
+        Returns:
+            CreateBenchmarkResponse built from the reply, or None when it has no "benchmark_id".
+        """
+        filenames = []  # NOTE(review): holds base names, so two paths sharing one collide - confirm handling
+        for fp in file_paths:  # awaited one at a time, in order; an exception here stops everything below
+            filename = await self._upload_file(fp, name, timeout)  # each file is announced with key=name
+            filenames.append(filename)
+
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+        body: Dict[str, Any] = {
+            "name": name,
+            "description": description,
+            "system_prompt": system_prompt,
+            "files": filenames,
+        }
+        if metrics:  # None or an empty list is left out of the request body
+            body["metrics"] = metrics
+
+        resp = await self._post(
+            f"{base}/smart-benchmarks",
+            body=body,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:  # unwrap status/data envelope
+            resp = resp["data"]
+        if isinstance(resp, dict) and "benchmark_id" in resp:
+            return CreateBenchmarkResponse(**resp)
+        return None
diff --git a/src/layerlens/resources/comparisons/__init__.py b/src/layerlens/resources/comparisons/__init__.py
new file mode 100644
index 0000000..b0a514e
--- /dev/null
+++ b/src/layerlens/resources/comparisons/__init__.py
@@ -0,0 +1,3 @@
+from .comparisons import Comparisons, AsyncComparisons
+
+__all__ = ["Comparisons", "AsyncComparisons"]
diff --git a/src/layerlens/resources/comparisons/comparisons.py b/src/layerlens/resources/comparisons/comparisons.py
new file mode 100644
index 0000000..eef469a
--- /dev/null
+++ b/src/layerlens/resources/comparisons/comparisons.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+from typing import Literal, Optional
+
+import httpx
+
+from ...models import ComparisonResponse
+from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+
+class Comparisons(SyncPublicAPIResource):
+    def compare(
+        self,
+        *,
+        evaluation_id_1: str,
+        evaluation_id_2: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        outcome_filter: Optional[
+            Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
+        ] = None,
+        search: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[ComparisonResponse]:
+        """Request "/results/comparison" for the two given evaluation IDs.
+
+        Returns None when the reply is not a dict; otherwise the reply validated as ComparisonResponse.
+        """
+        query = {"evaluation_id_1": evaluation_id_1, "evaluation_id_2": evaluation_id_2}
+        # Paging values are sent whenever given (even 0); the text filters only when non-empty.
+        optional = {
+            "page": None if page is None else str(page),
+            "pageSize": None if page_size is None else str(page_size),
+            "outcomeFilter": outcome_filter or None,
+            "search": search or None,
+        }
+        query.update({name: value for name, value in optional.items() if value is not None})
+
+        payload = self._get(
+            "/results/comparison",
+            params=query,
+            timeout=timeout,
+            cast_to=dict,
+        )
+
+        if isinstance(payload, dict):
+            return ComparisonResponse.model_validate(payload)
+        return None
+
+
+class AsyncComparisons(AsyncPublicAPIResource):
+    async def compare(
+        self,
+        *,
+        evaluation_id_1: str,
+        evaluation_id_2: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        outcome_filter: Optional[
+            Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
+        ] = None,
+        search: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[ComparisonResponse]:
+        """Request "/results/comparison" for the two given evaluation IDs.
+
+        Returns None when the reply is not a dict; otherwise the reply validated as ComparisonResponse.
+        """
+        query = {"evaluation_id_1": evaluation_id_1, "evaluation_id_2": evaluation_id_2}
+        # Paging values are sent whenever given (even 0); the text filters only when non-empty.
+        optional = {
+            "page": None if page is None else str(page),
+            "pageSize": None if page_size is None else str(page_size),
+            "outcomeFilter": outcome_filter or None,
+            "search": search or None,
+        }
+        query.update({name: value for name, value in optional.items() if value is not None})
+
+        payload = await self._get(
+            "/results/comparison",
+            params=query,
+            timeout=timeout,
+            cast_to=dict,
+        )
+
+        if isinstance(payload, dict):
+            return ComparisonResponse.model_validate(payload)
+        return None
diff --git a/src/layerlens/resources/evaluations/evaluations.py b/src/layerlens/resources/evaluations/evaluations.py
index d1ea851..c5cc040 100644
--- a/src/layerlens/resources/evaluations/evaluations.py
+++ b/src/layerlens/resources/evaluations/evaluations.py
@@ -3,7 +3,7 @@
 import math
 import time
 import asyncio
-from typing import Optional
+from typing import List, Literal, Optional
 
 import httpx
 
@@ -13,6 +13,7 @@
     Evaluation,
     CustomModel,
     CustomBenchmark,
+    EvaluationStatus,
     EvaluationsResponse,
     CreateEvaluationsResponse,
 )
@@ -80,14 +81,24 @@ def get_many(
         *,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
+        sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        model_ids: Optional[List[str]] = None,
+        benchmark_ids: Optional[List[str]] = None,
+        status: Optional[EvaluationStatus] = None,
         timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
     ) -> Optional[EvaluationsResponse]:
         """
-        Get evaluations with optional pagination.
+        Get evaluations with optional pagination, sorting, and filtering.
 
         Args:
             page: Page number for pagination (1-based, defaults to 1 if not provided)
             page_size: Number of evaluations per page (default: 100, optional)
+            sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration)
+            order: Sort order (asc or desc)
+            model_ids: Filter by model IDs
+            benchmark_ids: Filter by benchmark/dataset IDs
+            status: Filter by evaluation status
             timeout: Request timeout
 
         Returns:
@@ -104,6 +115,17 @@ def get_many(
         params["page"] = str(effective_page)
         params["pageSize"] = str(effective_page_size)
 
+        if sort_by:
+            params["sortBy"] = sort_by
+        if order:
+            params["order"] = order
+        if model_ids:
+            params["models"] = ",".join(model_ids)
+        if benchmark_ids:
+            params["datasets"] = ",".join(benchmark_ids)
+        if status:
+            params["status"] = status.value
+
         resp = self._get(
             f"/evaluations",
             params=params,
@@ -214,14 +236,24 @@ async def get_many(
         *,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
+        sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        model_ids: Optional[List[str]] = None,
+        benchmark_ids: Optional[List[str]] = None,
+        status: Optional[EvaluationStatus] = None,
         timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
     ) -> Optional[EvaluationsResponse]:
         """
-        Get evaluations with optional pagination.
+        Get evaluations with optional pagination, sorting, and filtering.
 
         Args:
             page: Page number for pagination (1-based, defaults to 1 if not provided)
             page_size: Number of evaluations per page (default: 100, optional)
+            sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration)
+            order: Sort order (asc or desc)
+            model_ids: Filter by model IDs
+            benchmark_ids: Filter by benchmark/dataset IDs
+            status: Filter by evaluation status
             timeout: Request timeout
 
         Returns:
@@ -238,6 +270,17 @@ async def get_many(
         params["page"] = str(effective_page)
         params["pageSize"] = str(effective_page_size)
 
+        if sort_by:
+            params["sortBy"] = sort_by
+        if order:
+            params["order"] = order
+        if model_ids:
+            params["models"] = ",".join(model_ids)
+        if benchmark_ids:
+            params["datasets"] = ",".join(benchmark_ids)
+        if status:
+            params["status"] = status.value
+
         resp = await self._get(
             f"/evaluations",
             params=params,
diff --git a/src/layerlens/resources/models/models.py b/src/layerlens/resources/models/models.py
index 7e0cd70..122a3ae 100644
--- a/src/layerlens/resources/models/models.py
+++ b/src/layerlens/resources/models/models.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
-from typing import List, Literal, Optional
+from typing import Any, Dict, List, Literal, Optional
 
 import httpx
 
-from ...models import Model, CustomModel, PublicModel, ModelsResponse
+from ...models import Model, CustomModel, PublicModel, ModelsResponse, CreateModelResponse
 from ..._resource import SyncAPIResource, AsyncAPIResource
 from ..._constants import DEFAULT_TIMEOUT
 
@@ -104,6 +104,90 @@ def get_by_key(
                 return model
         return None
 
+    def add(
+        self,
+        *model_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Merge the given IDs with those returned by ``get()`` and PATCH the project with the result."""
+        existing = self.get(timeout=timeout) or []
+        # dict.fromkeys drops duplicates while keeping first-seen order (existing IDs first).
+        merged = dict.fromkeys([*(item.id for item in existing), *model_ids])
+        return self._patch_project_models(list(merged), timeout)
+
+    def remove(
+        self,
+        *model_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """PATCH the project with the IDs returned by ``get()`` minus the given ones."""
+        existing = self.get(timeout=timeout) or []
+        unwanted = frozenset(model_ids)
+        kept = [entry.id for entry in existing if entry.id not in unwanted]
+        return self._patch_project_models(kept, timeout)
+
+    def _patch_project_models(
+        self,
+        model_ids: List[str],
+        timeout: float | httpx.Timeout | None,
+    ) -> bool:
+        """PATCH the project's "models" list; True when the reply is a dict containing "id"."""
+        client = self._client
+        project_url = f"/organizations/{client.organization_id}/projects/{client.project_id}"
+        payload = {"models": model_ids}
+        reply = self._patch(project_url, body=payload, timeout=timeout, cast_to=dict)
+        if not isinstance(reply, dict):
+            return False
+        return "id" in reply
+
+    def create_custom(
+        self,
+        *,
+        name: str,
+        key: str,
+        description: str,
+        api_url: str,
+        max_tokens: int,
+        api_key: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[CreateModelResponse]:
+        """Create a custom model backed by an OpenAI-compatible API.
+
+        Args:
+            name: Model name (max 256 characters; not checked client-side).
+            key: Unique model key, lowercase alphanumeric with dots/hyphens/slashes (max 256 characters).
+            description: Model description (max 500 characters; not checked client-side).
+            api_url: Base URL of the OpenAI-compatible API endpoint.
+            max_tokens: Maximum number of tokens the model supports.
+            api_key: Optional API key for the model provider; omitted from the request only when None.
+            timeout: Request timeout override.
+
+        Returns:
+            CreateModelResponse built from the reply, or None when it has no "model_id".
+        """
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+        body: Dict[str, Any] = {
+            "name": name,
+            "key": key,
+            "description": description,
+            "api_url": api_url,
+            "max_tokens": max_tokens,
+        }
+        if api_key is not None:  # an empty string is still sent; only None drops the field
+            body["api_key"] = api_key
+
+        resp = self._post(
+            f"{base}/custom-models",
+            body=body,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:  # unwrap status/data envelope
+            resp = resp["data"]
+        if isinstance(resp, dict) and "model_id" in resp:
+            return CreateModelResponse(**resp)
+        return None
+
 
 class AsyncModels(AsyncAPIResource):
     async def get(
@@ -199,3 +283,87 @@ async def get_by_key(
             if model.key == key:
                 return model
         return None
+
+    async def add(
+        self,
+        *model_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Merge the given IDs with those returned by ``get()`` and PATCH the project with the result."""
+        existing = await self.get(timeout=timeout) or []
+        # dict.fromkeys drops duplicates while keeping first-seen order (existing IDs first).
+        merged = dict.fromkeys([*(item.id for item in existing), *model_ids])
+        return await self._patch_project_models(list(merged), timeout)
+
+    async def remove(
+        self,
+        *model_ids: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """PATCH the project with the IDs returned by ``get()`` minus the given ones."""
+        existing = await self.get(timeout=timeout) or []
+        unwanted = frozenset(model_ids)
+        kept = [entry.id for entry in existing if entry.id not in unwanted]
+        return await self._patch_project_models(kept, timeout)
+
+    async def _patch_project_models(
+        self,
+        model_ids: List[str],
+        timeout: float | httpx.Timeout | None,
+    ) -> bool:
+        """PATCH the project's "models" list; True when the reply is a dict containing "id"."""
+        client = self._client
+        project_url = f"/organizations/{client.organization_id}/projects/{client.project_id}"
+        payload = {"models": model_ids}
+        reply = await self._patch(project_url, body=payload, timeout=timeout, cast_to=dict)
+        if not isinstance(reply, dict):
+            return False
+        return "id" in reply
+
+    async def create_custom(
+        self,
+        *,
+        name: str,
+        key: str,
+        description: str,
+        api_url: str,
+        max_tokens: int,
+        api_key: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[CreateModelResponse]:
+        """Create a custom model backed by an OpenAI-compatible API.
+
+        Args:
+            name: Model name (max 256 characters; not checked client-side).
+            key: Unique model key, lowercase alphanumeric with dots/hyphens/slashes (max 256 characters).
+            description: Model description (max 500 characters; not checked client-side).
+            api_url: Base URL of the OpenAI-compatible API endpoint.
+            max_tokens: Maximum number of tokens the model supports.
+            api_key: Optional API key for the model provider; omitted from the request only when None.
+            timeout: Request timeout override.
+
+        Returns:
+            CreateModelResponse built from the reply, or None when it has no "model_id".
+        """
+        base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}"
+        body: Dict[str, Any] = {
+            "name": name,
+            "key": key,
+            "description": description,
+            "api_url": api_url,
+            "max_tokens": max_tokens,
+        }
+        if api_key is not None:  # an empty string is still sent; only None drops the field
+            body["api_key"] = api_key
+
+        resp = await self._post(
+            f"{base}/custom-models",
+            body=body,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(resp, dict) and "data" in resp and "status" in resp:  # unwrap status/data envelope
+            resp = resp["data"]
+        if isinstance(resp, dict) and "model_id" in resp:
+            return CreateModelResponse(**resp)
+        return None
diff --git a/src/layerlens/resources/public_benchmarks/__init__.py b/src/layerlens/resources/public_benchmarks/__init__.py
new file mode 100644
index 0000000..52ea1d4
--- /dev/null
+++ b/src/layerlens/resources/public_benchmarks/__init__.py
@@ -0,0 +1,3 @@
+from .public_benchmarks import PublicBenchmarksResource, AsyncPublicBenchmarksResource
+
+__all__ = ["PublicBenchmarksResource", "AsyncPublicBenchmarksResource"]
diff --git a/src/layerlens/resources/public_benchmarks/public_benchmarks.py b/src/layerlens/resources/public_benchmarks/public_benchmarks.py
new file mode 100644
index 0000000..f73d75e
--- /dev/null
+++ b/src/layerlens/resources/public_benchmarks/public_benchmarks.py
@@ -0,0 +1,263 @@
+from __future__ import annotations
+
+import math
+from typing import List, Literal, Optional
+
+import httpx
+
+from ...models import (
+    BenchmarkPrompt,
+    BenchmarkPromptsResponse,
+    PublicBenchmarksListResponse,
+)
+from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+DEFAULT_PROMPTS_PAGE_SIZE = 100  # page size get_all_prompts() requests for every page
+MAX_PROMPTS_PAGE_SIZE = 500  # NOTE(review): not referenced in this module; presumably the server-side cap - confirm
+
+
+class PublicBenchmarksResource(SyncPublicAPIResource):
+    """Blocking client for the public benchmark catalogue (the ``/datasets`` routes)."""
+
+    def get(
+        self,
+        *,
+        query: Optional[str] = None,
+        name: Optional[str] = None,
+        key: Optional[str] = None,
+        ids: Optional[List[str]] = None,
+        categories: Optional[List[str]] = None,
+        languages: Optional[List[str]] = None,
+        sort_by: Optional[Literal["name"]] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        include_deprecated: Optional[bool] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[PublicBenchmarksListResponse]:
+        """List public benchmarks via ``GET /datasets``.
+
+        Falsy string/list filters are omitted from the query; list filters are sent
+        comma-joined. ``page``, ``page_size`` and ``include_deprecated`` are sent
+        whenever they are not ``None``.
+
+        Returns:
+            The validated listing, or ``None`` when the response is not a dict.
+        """
+        params: dict[str, str] = {}
+        for field, text in (("query", query), ("name", name), ("key", key)):
+            if text:
+                params[field] = text
+        for field, values in (("ids", ids), ("categories", categories), ("languages", languages)):
+            if values:
+                params[field] = ",".join(values)
+        for field, choice in (("sortBy", sort_by), ("order", order)):
+            if choice:
+                params[field] = choice
+        for field, number in (("page", page), ("pageSize", page_size)):
+            if number is not None:
+                params[field] = str(number)
+        if include_deprecated is not None:
+            params["include_deprecated"] = str(include_deprecated).lower()
+
+        payload = self._get("/datasets", params=params, timeout=timeout, cast_to=dict)
+        if isinstance(payload, dict):
+            return PublicBenchmarksListResponse.model_validate(payload)
+        return None
+
+    def get_prompts(
+        self,
+        benchmark_id: str,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        search_field: Optional[Literal["id", "input", "truth"]] = None,
+        search_value: Optional[str] = None,
+        sort_by: Optional[Literal["id", "input", "truth"]] = None,
+        sort_order: Optional[Literal["asc", "desc"]] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[BenchmarkPromptsResponse]:
+        """Fetch one page of prompts via ``GET /datasets/{benchmark_id}/prompts``.
+
+        ``search_field`` is sent as ``search`` and ``search_value`` as ``searchValue``;
+        falsy search/sort arguments are omitted.
+
+        Returns:
+            The validated page, or ``None`` when the response is not a dict.
+        """
+        params: dict[str, str] = {}
+        for field, number in (("page", page), ("pageSize", page_size)):
+            if number is not None:
+                params[field] = str(number)
+        for field, text in (
+            ("search", search_field),
+            ("searchValue", search_value),
+            ("sortBy", sort_by),
+            ("sortOrder", sort_order),
+        ):
+            if text:
+                params[field] = text
+
+        payload = self._get(
+            f"/datasets/{benchmark_id}/prompts",
+            params=params,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(payload, dict):
+            return BenchmarkPromptsResponse.model_validate(payload)
+        return None
+
+    def get_all_prompts(
+        self,
+        benchmark_id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> List[BenchmarkPrompt]:
+        """Collect every prompt of a benchmark by walking ``get_prompts`` page by page.
+
+        Pages of ``DEFAULT_PROMPTS_PAGE_SIZE`` are requested starting at page 1. The
+        walk ends on a ``None`` or empty page, or once the page number reaches the
+        page total derived from the reported ``count``.
+        """
+        collected: List[BenchmarkPrompt] = []
+        per_page = DEFAULT_PROMPTS_PAGE_SIZE
+        current = 1
+
+        while True:
+            batch = self.get_prompts(benchmark_id, page=current, page_size=per_page, timeout=timeout)
+            if batch is None or not batch.data.prompts:
+                return collected
+
+            collected.extend(batch.data.prompts)
+
+            reported = batch.data.count
+            last_page = math.ceil(reported / per_page) if reported > 0 else 0
+            if current >= last_page:
+                return collected
+
+            current += 1
+
+
+class AsyncPublicBenchmarksResource(AsyncPublicAPIResource):
+    """Async client for the public benchmark catalogue (the ``/datasets`` routes)."""
+
+    async def get(
+        self,
+        *,
+        query: Optional[str] = None,
+        name: Optional[str] = None,
+        key: Optional[str] = None,
+        ids: Optional[List[str]] = None,
+        categories: Optional[List[str]] = None,
+        languages: Optional[List[str]] = None,
+        sort_by: Optional[Literal["name"]] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        include_deprecated: Optional[bool] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[PublicBenchmarksListResponse]:
+        """List public benchmarks via ``GET /datasets``.
+
+        Falsy string/list filters are omitted from the query; list filters are sent
+        comma-joined. ``page``, ``page_size`` and ``include_deprecated`` are sent
+        whenever they are not ``None``.
+
+        Returns:
+            The validated listing, or ``None`` when the response is not a dict.
+        """
+        params: dict[str, str] = {}
+        for field, text in (("query", query), ("name", name), ("key", key)):
+            if text:
+                params[field] = text
+        for field, values in (("ids", ids), ("categories", categories), ("languages", languages)):
+            if values:
+                params[field] = ",".join(values)
+        for field, choice in (("sortBy", sort_by), ("order", order)):
+            if choice:
+                params[field] = choice
+        for field, number in (("page", page), ("pageSize", page_size)):
+            if number is not None:
+                params[field] = str(number)
+        if include_deprecated is not None:
+            params["include_deprecated"] = str(include_deprecated).lower()
+
+        payload = await self._get("/datasets", params=params, timeout=timeout, cast_to=dict)
+        if isinstance(payload, dict):
+            return PublicBenchmarksListResponse.model_validate(payload)
+        return None
+
+    async def get_prompts(
+        self,
+        benchmark_id: str,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        search_field: Optional[Literal["id", "input", "truth"]] = None,
+        search_value: Optional[str] = None,
+        sort_by: Optional[Literal["id", "input", "truth"]] = None,
+        sort_order: Optional[Literal["asc", "desc"]] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[BenchmarkPromptsResponse]:
+        """Fetch one page of prompts via ``GET /datasets/{benchmark_id}/prompts``.
+
+        ``search_field`` is sent as ``search`` and ``search_value`` as ``searchValue``;
+        falsy search/sort arguments are omitted.
+
+        Returns:
+            The validated page, or ``None`` when the response is not a dict.
+        """
+        params: dict[str, str] = {}
+        for field, number in (("page", page), ("pageSize", page_size)):
+            if number is not None:
+                params[field] = str(number)
+        for field, text in (
+            ("search", search_field),
+            ("searchValue", search_value),
+            ("sortBy", sort_by),
+            ("sortOrder", sort_order),
+        ):
+            if text:
+                params[field] = text
+
+        payload = await self._get(
+            f"/datasets/{benchmark_id}/prompts",
+            params=params,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if isinstance(payload, dict):
+            return BenchmarkPromptsResponse.model_validate(payload)
+        return None
+
+    async def get_all_prompts(
+        self,
+        benchmark_id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> List[BenchmarkPrompt]:
+        """Collect every prompt of a benchmark by awaiting ``get_prompts`` page by page.
+
+        Pages of ``DEFAULT_PROMPTS_PAGE_SIZE`` are requested starting at page 1. The
+        walk ends on a ``None`` or empty page, or once the page number reaches the
+        page total derived from the reported ``count``.
+        """
+        collected: List[BenchmarkPrompt] = []
+        per_page = DEFAULT_PROMPTS_PAGE_SIZE
+        current = 1
+
+        while True:
+            batch = await self.get_prompts(benchmark_id, page=current, page_size=per_page, timeout=timeout)
+            if batch is None or not batch.data.prompts:
+                return collected
+
+            collected.extend(batch.data.prompts)
+
+            reported = batch.data.count
+            last_page = math.ceil(reported / per_page) if reported > 0 else 0
+            if current >= last_page:
+                return collected
+
+            current += 1
diff --git a/src/layerlens/resources/public_evaluations/__init__.py b/src/layerlens/resources/public_evaluations/__init__.py
new file mode 100644
index 0000000..e1b1781
--- /dev/null
+++ b/src/layerlens/resources/public_evaluations/__init__.py
@@ -0,0 +1,3 @@
+from .public_evaluations import PublicEvaluationsResource, AsyncPublicEvaluationsResource
+
+__all__ = ["PublicEvaluationsResource", "AsyncPublicEvaluationsResource"]
diff --git a/src/layerlens/resources/public_evaluations/public_evaluations.py b/src/layerlens/resources/public_evaluations/public_evaluations.py
new file mode 100644
index 0000000..ddd1cdf
--- /dev/null
+++ b/src/layerlens/resources/public_evaluations/public_evaluations.py
@@ -0,0 +1,218 @@
+from __future__ import annotations
+
+import math
+from typing import List, Literal, Optional
+
+import httpx
+
+from ...models import (
+    Evaluation,
+    EvaluationStatus,
+    EvaluationsResponse,
+)
+from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+DEFAULT_PAGE = 1  # page get_many() sends when the caller passes none
+DEFAULT_PAGE_SIZE = 100  # page size get_many() sends when the caller passes none
+MAX_PAGE_SIZE = 500  # upper bound get_many() clamps a caller-supplied page_size to
+
+
+class PublicEvaluationsResource(SyncPublicAPIResource):
+    """Blocking client for the public ``/evaluations`` routes."""
+
+    def get_by_id(
+        self,
+        id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[Evaluation]:
+        """Fetch one evaluation via ``GET /evaluations/{id}``.
+
+        Returns ``None`` when the response is not an ``Evaluation``.
+        """
+        evaluation = self._get(f"/evaluations/{id}", timeout=timeout, cast_to=Evaluation)
+        return evaluation if isinstance(evaluation, Evaluation) else None
+
+    def get_many(
+        self,
+        *,
+        organization_id: str,
+        project_id: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        model_ids: Optional[List[str]] = None,
+        benchmark_ids: Optional[List[str]] = None,
+        status: Optional[EvaluationStatus] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationsResponse]:
+        """
+        Get evaluations with optional pagination, sorting, and filtering.
+
+        Args:
+            organization_id: Organization ID (required)
+            project_id: Project ID (required)
+            page: Page number (1-based, defaults to 1; values below 1 are sent as 1)
+            page_size: Evaluations per page (default: 100, clamped to 1..MAX_PAGE_SIZE)
+            sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration)
+            order: Sort order (asc or desc)
+            model_ids: Filter by model IDs
+            benchmark_ids: Filter by benchmark/dataset IDs
+            status: Filter by evaluation status
+            timeout: Request timeout
+
+        Returns:
+            EvaluationsResponse with computed pagination, or None when the response
+            is empty, not a dict, or fails validation
+        """
+        # Clamp both paging values so the request matches the pagination reported back.
+        effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE
+        effective_page = max(page, 1) if page is not None else DEFAULT_PAGE
+
+        params = {
+            "organizationID": organization_id,
+            "projectID": project_id,
+            "page": str(effective_page),
+            "pageSize": str(effective_page_size),
+        }
+        if sort_by:
+            params["sortBy"] = sort_by
+        if order:
+            params["order"] = order
+        if model_ids:
+            params["models"] = ",".join(model_ids)
+        if benchmark_ids:
+            params["datasets"] = ",".join(benchmark_ids)
+        if status:
+            params["status"] = status.value
+
+        resp = self._get(
+            "/evaluations",
+            params=params,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if not resp or not isinstance(resp, dict):
+            return None
+
+        # ``or`` (not a ``.get`` default) so keys that are present but null are handled too.
+        raw_evaluations = resp.get("evaluations") or []
+        total_count = resp.get("total_count") or 0
+        evaluations = [e if isinstance(e, Evaluation) else Evaluation(**e) for e in raw_evaluations]
+
+        resp_with_pagination = {
+            "evaluations": evaluations,
+            "pagination": {
+                "page": effective_page,
+                "page_size": effective_page_size,
+                "total_pages": math.ceil(total_count / effective_page_size) if total_count > 0 else 0,
+                "total_count": total_count,
+            },
+        }
+
+        try:
+            return EvaluationsResponse.model_validate(resp_with_pagination)
+        except Exception:  # a payload that fails validation is reported as None, not raised
+            return None
+
+
+class AsyncPublicEvaluationsResource(AsyncPublicAPIResource):
+    """Async client for the public ``/evaluations`` routes."""
+
+    async def get_by_id(
+        self,
+        id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[Evaluation]:
+        """Fetch one evaluation via ``GET /evaluations/{id}``.
+
+        Returns ``None`` when the response is not an ``Evaluation``.
+        """
+        evaluation = await self._get(f"/evaluations/{id}", timeout=timeout, cast_to=Evaluation)
+        return evaluation if isinstance(evaluation, Evaluation) else None
+
+    async def get_many(
+        self,
+        *,
+        organization_id: str,
+        project_id: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        model_ids: Optional[List[str]] = None,
+        benchmark_ids: Optional[List[str]] = None,
+        status: Optional[EvaluationStatus] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationsResponse]:
+        """
+        Get evaluations with optional pagination, sorting, and filtering.
+
+        Args:
+            organization_id: Organization ID (required)
+            project_id: Project ID (required)
+            page: Page number (1-based, defaults to 1; values below 1 are sent as 1)
+            page_size: Evaluations per page (default: 100, clamped to 1..MAX_PAGE_SIZE)
+            sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration)
+            order: Sort order (asc or desc)
+            model_ids: Filter by model IDs
+            benchmark_ids: Filter by benchmark/dataset IDs
+            status: Filter by evaluation status
+            timeout: Request timeout
+
+        Returns:
+            EvaluationsResponse with computed pagination, or None when the response
+            is empty, not a dict, or fails validation
+        """
+        # Clamp both paging values so the request matches the pagination reported back.
+        effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE
+        effective_page = max(page, 1) if page is not None else DEFAULT_PAGE
+
+        params = {
+            "organizationID": organization_id,
+            "projectID": project_id,
+            "page": str(effective_page),
+            "pageSize": str(effective_page_size),
+        }
+        if sort_by:
+            params["sortBy"] = sort_by
+        if order:
+            params["order"] = order
+        if model_ids:
+            params["models"] = ",".join(model_ids)
+        if benchmark_ids:
+            params["datasets"] = ",".join(benchmark_ids)
+        if status:
+            params["status"] = status.value
+
+        resp = await self._get(
+            "/evaluations",
+            params=params,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        if not resp or not isinstance(resp, dict):
+            return None
+
+        # ``or`` (not a ``.get`` default) so keys that are present but null are handled too.
+        raw_evaluations = resp.get("evaluations") or []
+        total_count = resp.get("total_count") or 0
+        evaluations = [e if isinstance(e, Evaluation) else Evaluation(**e) for e in raw_evaluations]
+
+        resp_with_pagination = {
+            "evaluations": evaluations,
+            "pagination": {
+                "page": effective_page,
+                "page_size": effective_page_size,
+                "total_pages": math.ceil(total_count / effective_page_size) if total_count > 0 else 0,
+                "total_count": total_count,
+            },
+        }
+
+        try:
+            return EvaluationsResponse.model_validate(resp_with_pagination)
+        except Exception:  # a payload that fails validation is reported as None, not raised
+            return None
diff --git a/src/layerlens/resources/public_models/__init__.py b/src/layerlens/resources/public_models/__init__.py
new file mode 100644
index 0000000..cc73f53
--- /dev/null
+++ b/src/layerlens/resources/public_models/__init__.py
@@ -0,0 +1,3 @@
+from .public_models import PublicModelsResource, AsyncPublicModelsResource
+
+__all__ = ["PublicModelsResource", "AsyncPublicModelsResource"]
diff --git a/src/layerlens/resources/public_models/public_models.py b/src/layerlens/resources/public_models/public_models.py
new file mode 100644
index 0000000..3b23b41
--- /dev/null
+++ b/src/layerlens/resources/public_models/public_models.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+from typing import List, Literal, Optional
+
+import httpx
+
+from ...models import PublicModelsListResponse
+from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+
+class PublicModelsResource(SyncPublicAPIResource):
+    """Blocking client for the public model catalogue (``GET /models``)."""
+
+    def get(
+        self,
+        *,
+        query: Optional[str] = None,
+        name: Optional[str] = None,
+        key: Optional[str] = None,
+        ids: Optional[List[str]] = None,
+        categories: Optional[List[str]] = None,
+        companies: Optional[List[str]] = None,
+        regions: Optional[List[str]] = None,
+        licenses: Optional[List[str]] = None,
+        sizes: Optional[List[str]] = None,
+        sort_by: Optional[
+            Literal["name", "createdAt", "releasedAt", "architectureType", "contextLength", "license", "region"]
+        ] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        include_deprecated: Optional[bool] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[PublicModelsListResponse]:
+        """List public models via ``GET /models``.
+
+        Falsy string/list filters are omitted from the query; list filters are sent
+        comma-joined. ``page``, ``page_size`` and ``include_deprecated`` are sent
+        whenever they are not ``None``.
+
+        ``sort_by`` is sent as ``sortBy`` and ``page_size`` as ``pageSize``.
+
+        Returns:
+            The validated listing, or ``None`` when the response is not a dict.
+        """
+        params: dict[str, str] = {}
+        for field, text in (("query", query), ("name", name), ("key", key)):
+            if text:
+                params[field] = text
+        list_filters = (
+            ("ids", ids),
+            ("categories", categories),
+            ("companies", companies),
+            ("regions", regions),
+            ("licenses", licenses),
+            ("sizes", sizes),
+        )
+        for field, values in list_filters:
+            if values:
+                params[field] = ",".join(values)
+        for field, choice in (("sortBy", sort_by), ("order", order)):
+            if choice:
+                params[field] = choice
+        for field, number in (("page", page), ("pageSize", page_size)):
+            if number is not None:
+                params[field] = str(number)
+        if include_deprecated is not None:
+            params["include_deprecated"] = str(include_deprecated).lower()
+
+        payload = self._get("/models", params=params, timeout=timeout, cast_to=dict)
+        if isinstance(payload, dict):
+            return PublicModelsListResponse.model_validate(payload)
+        return None
+
+
+class AsyncPublicModelsResource(AsyncPublicAPIResource):
+    """Async client for the public model catalogue (``GET /models``)."""
+
+    async def get(
+        self,
+        *,
+        query: Optional[str] = None,
+        name: Optional[str] = None,
+        key: Optional[str] = None,
+        ids: Optional[List[str]] = None,
+        categories: Optional[List[str]] = None,
+        companies: Optional[List[str]] = None,
+        regions: Optional[List[str]] = None,
+        licenses: Optional[List[str]] = None,
+        sizes: Optional[List[str]] = None,
+        sort_by: Optional[
+            Literal["name", "createdAt", "releasedAt", "architectureType", "contextLength", "license", "region"]
+        ] = None,
+        order: Optional[Literal["asc", "desc"]] = None,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        include_deprecated: Optional[bool] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[PublicModelsListResponse]:
+        """List public models via ``GET /models``.
+
+        Falsy string/list filters are omitted from the query; list filters are sent
+        comma-joined. ``page``, ``page_size`` and ``include_deprecated`` are sent
+        whenever they are not ``None``.
+
+        ``sort_by`` is sent as ``sortBy`` and ``page_size`` as ``pageSize``.
+
+        Returns:
+            The validated listing, or ``None`` when the response is not a dict.
+        """
+        params: dict[str, str] = {}
+        for field, text in (("query", query), ("name", name), ("key", key)):
+            if text:
+                params[field] = text
+        list_filters = (
+            ("ids", ids),
+            ("categories", categories),
+            ("companies", companies),
+            ("regions", regions),
+            ("licenses", licenses),
+            ("sizes", sizes),
+        )
+        for field, values in list_filters:
+            if values:
+                params[field] = ",".join(values)
+        for field, choice in (("sortBy", sort_by), ("order", order)):
+            if choice:
+                params[field] = choice
+        for field, number in (("page", page), ("pageSize", page_size)):
+            if number is not None:
+                params[field] = str(number)
+        if include_deprecated is not None:
+            params["include_deprecated"] = str(include_deprecated).lower()
+
+        payload = await self._get("/models", params=params, timeout=timeout, cast_to=dict)
+        if isinstance(payload, dict):
+            return PublicModelsListResponse.model_validate(payload)
+        return None
diff --git a/tests/resources/test_benchmarks.py b/tests/resources/test_benchmarks.py
index 2155105..9e03e22 100644
--- a/tests/resources/test_benchmarks.py
+++ b/tests/resources/test_benchmarks.py
@@ -1,4 +1,4 @@
-from unittest.mock import Mock, call
+from unittest.mock import Mock, call, patch
 
 import httpx
 import pytest
@@ -8,6 +8,7 @@
     CustomBenchmark,
     PublicBenchmark,
     BenchmarksResponse,
+    CreateBenchmarkResponse,
 )
 from layerlens._constants import DEFAULT_TIMEOUT
 from layerlens.resources.benchmarks.benchmarks import Benchmarks
@@ -445,3 +446,533 @@ def test_get_benchmarks_mixed_benchmark_types(self, benchmarks_resource):
         assert isinstance(result[1], PublicBenchmark)
         assert result[0].key == "my-bench"
         assert result[1].key == "mmlu"
+
+
+class TestBenchmarksAdd:
+    """Exercise Benchmarks.add() against a mocked client."""
+
+    @pytest.fixture
+    def mock_client(self):
+        return Mock(
+            organization_id="org-123",
+            project_id="proj-456",
+            get_cast=Mock(),
+            patch_cast=Mock(),
+        )
+
+    @pytest.fixture
+    def benchmarks_resource(self, mock_client):
+        return Benchmarks(mock_client)
+
+    def test_add_single_benchmark(self, benchmarks_resource):
+        """A new ID is appended to the project's current benchmarks and sent in one PATCH."""
+        resource = benchmarks_resource
+        resource.get = Mock(return_value=[PublicBenchmark(id="b1", key="b1", name="B1")])
+        resource._patch.return_value = {"id": "proj-456"}
+
+        added = resource.add("b2")
+
+        resource._patch.assert_called_once_with(
+            "/organizations/org-123/projects/proj-456",
+            body={"datasets": ["b1", "b2"]},
+            timeout=DEFAULT_TIMEOUT,
+            cast_to=dict,
+        )
+        assert added is True
+
+    def test_add_multiple_benchmarks(self, benchmarks_resource):
+        """Several IDs passed in one call are all sent, in call order."""
+        benchmarks_resource.get = Mock(return_value=[])
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        new_ids = ["b1", "b2", "b3"]
+        outcome = benchmarks_resource.add(*new_ids)
+
+        assert outcome is True
+        assert benchmarks_resource._patch.call_args.kwargs["body"] == {"datasets": new_ids}
+
+    def test_add_deduplicates(self, benchmarks_resource):
+        """An ID the project already has is not repeated in the PATCH body."""
+        already_present = PublicBenchmark(id="b1", key="b1", name="B1")
+        benchmarks_resource.get = Mock(return_value=[already_present])
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        benchmarks_resource.add("b1", "b2")
+
+        sent = benchmarks_resource._patch.call_args.kwargs
+        assert sent["body"] == {"datasets": ["b1", "b2"]}
+
+    def test_add_returns_false_on_failure(self, benchmarks_resource):
+        """add() reports False when the PATCH call yields the string "error"."""
+        benchmarks_resource._patch.return_value = "error"
+        benchmarks_resource.get = Mock(return_value=[])
+
+        outcome = benchmarks_resource.add("b1")
+
+        assert outcome is False
+
+    def test_add_with_none_get_response(self, benchmarks_resource):
+        """With get() returning None, only the newly added ID is sent."""
+        benchmarks_resource.get = Mock(return_value=None)
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        outcome = benchmarks_resource.add("b1")
+
+        sent = benchmarks_resource._patch.call_args.kwargs
+        assert sent["body"] == {"datasets": ["b1"]}
+        assert outcome is True
+
+    def test_add_uses_datasets_field(self, benchmarks_resource):
+        """The PATCH body carries the IDs under 'datasets', never 'benchmarks'."""
+        benchmarks_resource.get = Mock(return_value=[])
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        benchmarks_resource.add("b1")
+
+        sent_keys = set(benchmarks_resource._patch.call_args.kwargs["body"])
+        assert "datasets" in sent_keys
+        assert "benchmarks" not in sent_keys
+
+
+class TestBenchmarksRemove:
+    """Exercise Benchmarks.remove() against a mocked client."""
+
+    @pytest.fixture
+    def mock_client(self):
+        return Mock(
+            organization_id="org-123",
+            project_id="proj-456",
+            get_cast=Mock(),
+            patch_cast=Mock(),
+        )
+
+    @pytest.fixture
+    def benchmarks_resource(self, mock_client):
+        return Benchmarks(mock_client)
+
+    def test_remove_single_benchmark(self, benchmarks_resource):
+        """Dropping one ID PATCHes the project with the IDs that are left."""
+        ids = ("b1", "b2")
+        current = [PublicBenchmark(id=i, key=i, name=i.upper()) for i in ids]
+        benchmarks_resource.get = Mock(return_value=current)
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        outcome = benchmarks_resource.remove("b1")
+
+        assert outcome is True
+        sent = benchmarks_resource._patch.call_args.kwargs
+        assert sent["body"] == {"datasets": ["b2"]}
+
+    def test_remove_multiple_benchmarks(self, benchmarks_resource):
+        """Several IDs can be dropped in a single call."""
+        ids = ("b1", "b2", "b3")
+        current = [PublicBenchmark(id=i, key=i, name=i.upper()) for i in ids]
+        benchmarks_resource.get = Mock(return_value=current)
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        benchmarks_resource.remove("b1", "b3")
+
+        expected = {"datasets": ["b2"]}
+        sent = benchmarks_resource._patch.call_args.kwargs
+        assert sent["body"] == expected
+
+    def test_remove_nonexistent_id(self, benchmarks_resource):
+        """An ID the project does not have leaves the PATCHed list unchanged."""
+        only = PublicBenchmark(id="b1", key="b1", name="B1")
+        benchmarks_resource.get = Mock(return_value=[only])
+        benchmarks_resource._patch.return_value = {"id": "proj-456"}
+
+        benchmarks_resource.remove("nonexistent")
+
+        sent = benchmarks_resource._patch.call_args.kwargs
+        assert sent["body"] == {"datasets": ["b1"]}
+
+    def test_remove_returns_false_on_failure(self, benchmarks_resource):
+        """remove() reports False when the PATCH call yields None."""
+        benchmarks_resource._patch.return_value = None
+        benchmarks_resource.get = Mock(return_value=[])
+
+        outcome = benchmarks_resource.remove("b1")
+
+        assert outcome is False
+
+
+class TestBenchmarksCreateCustom:
+    """Test Benchmarks.create_custom() method."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        client.post_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def benchmarks_resource(self, mock_client):
+        return Benchmarks(mock_client)
+
+    @pytest.fixture
+    def tmp_jsonl(self, tmp_path):
+        """Create a temporary JSONL file."""
+        f = tmp_path / "test.jsonl"
+        f.write_text('{"input": "What is 2+2?", "truth": "4"}\n')
+        return str(f)
+
+    def test_create_custom_success_with_envelope(self, benchmarks_resource, tmp_jsonl):
+        """create_custom() unwraps envelope and returns CreateBenchmarkResponse."""
+        # Mock _upload_file to skip actual upload
+        benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {
+                "benchmark_id": "bench-123",
+                "organization_id": "org-123",
+                "project_id": "proj-456",
+            },
+        }
+
+        result = benchmarks_resource.create_custom(
+            name="Test Benchmark",
+            description="Test description",
+            file_path=tmp_jsonl,
+        )
+
+        assert isinstance(result, CreateBenchmarkResponse)
+        assert result.benchmark_id == "bench-123"
+        assert result.organization_id == "org-123"
+
+    def test_create_custom_success_without_envelope(self, benchmarks_resource, tmp_jsonl):
+        """create_custom() works when response has no envelope."""
+        benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+        benchmarks_resource._post.return_value = {
+            "benchmark_id": "bench-123",
+            "organization_id": "org-123",
+            "project_id": "proj-456",
+        }
+
+        result = benchmarks_resource.create_custom(
+            name="Test",
+            description="Test",
+            file_path=tmp_jsonl,
+        )
+
+        assert isinstance(result, CreateBenchmarkResponse)
+        assert result.benchmark_id == "bench-123"
+
+    def test_create_custom_sends_correct_body(self, benchmarks_resource, tmp_jsonl):
+        """Every supplied argument, plus the uploaded file name, ends up in the POST body."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+
+        benchmarks_resource.create_custom(
+            name="My Bench",
+            description="A benchmark",
+            file_path=tmp_jsonl,
+            additional_metrics=["toxicity", "readability"],
+            custom_scorer_ids=["scorer-1"],
+            input_type="messages",
+        )
+
+        sent_body = benchmarks_resource._post.call_args.kwargs["body"]
+        assert sent_body == {
+            "name": "My Bench",
+            "description": "A benchmark",
+            "file": "test.jsonl",
+            "additional_metrics": ["toxicity", "readability"],
+            "custom_scorers": ["scorer-1"],
+            "input_type": "messages",
+        }
+
+    def test_create_custom_omits_optional_fields(self, benchmarks_resource, tmp_jsonl):
+        """Optional keys are left out of the POST body when the caller does not pass them."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+
+        benchmarks_resource.create_custom(
+            name="Bench",
+            description="Desc",
+            file_path=tmp_jsonl,
+        )
+
+        sent_body = benchmarks_resource._post.call_args.kwargs["body"]
+        assert "additional_metrics" not in sent_body
+        assert "custom_scorers" not in sent_body
+        assert "input_type" not in sent_body
+
+    def test_create_custom_correct_url(self, benchmarks_resource, tmp_jsonl):
+        """create_custom() POSTs to the org/project-scoped custom-benchmarks path."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+
+        benchmarks_resource.create_custom(
+            name="B",
+            description="D",
+            file_path=tmp_jsonl,
+        )
+
+        posted_path = benchmarks_resource._post.call_args.args[0]
+        assert posted_path == "/organizations/org-123/projects/proj-456/custom-benchmarks"
+
+    def test_create_custom_returns_none_on_failure(self, benchmarks_resource, tmp_jsonl):
+        """A non-dict reply from _post makes create_custom() return None."""
+        benchmarks_resource._post.return_value = "not-a-dict"
+        benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+
+        outcome = benchmarks_resource.create_custom(
+            name="B",
+            description="D",
+            file_path=tmp_jsonl,
+        )
+
+        assert outcome is None
+
+    def test_create_custom_calls_upload_file(self, benchmarks_resource, tmp_jsonl):
+        """The file is uploaded exactly once, with the path, the benchmark name and DEFAULT_TIMEOUT."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        upload_stub = benchmarks_resource._upload_file = Mock(return_value="test.jsonl")
+
+        benchmarks_resource.create_custom(
+            name="My Bench",
+            description="Desc",
+            file_path=tmp_jsonl,
+        )
+
+        upload_stub.assert_called_once_with(tmp_jsonl, "My Bench", DEFAULT_TIMEOUT)
+
+
+class TestBenchmarksCreateSmart:
+    """Tests for Benchmarks.create_smart() with the upload step and the POST call mocked out."""
+
+    @pytest.fixture
+    def mock_client(self):
+        """Client double exposing the org/project IDs used in the expected URLs."""
+        stub = Mock(organization_id="org-123", project_id="proj-456")
+        stub.get_cast = Mock()
+        stub.post_cast = Mock()
+        return stub
+
+    @pytest.fixture
+    def benchmarks_resource(self, mock_client):
+        """Benchmarks resource constructed on top of the mock client."""
+        return Benchmarks(mock_client)
+
+    def test_create_smart_success_with_envelope(self, benchmarks_resource):
+        """The "data" member of an enveloped reply is returned as a CreateBenchmarkResponse."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {
+                "benchmark_id": "smart-123",
+                "organization_id": "org-123",
+                "project_id": "proj-456",
+            },
+        }
+        benchmarks_resource._upload_file = Mock(return_value="doc.txt")
+
+        created = benchmarks_resource.create_smart(
+            name="Smart Bench",
+            description="Smart benchmark",
+            system_prompt="Generate QA pairs",
+            file_paths=["/tmp/doc.txt"],
+        )
+
+        assert isinstance(created, CreateBenchmarkResponse)
+        assert created.benchmark_id == "smart-123"
+
+    def test_create_smart_sends_correct_body(self, benchmarks_resource):
+        """The POST body carries the arguments plus the names returned by each file upload."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(side_effect=["doc1.txt", "doc2.pdf"])
+
+        benchmarks_resource.create_smart(
+            name="Smart",
+            description="Desc",
+            system_prompt="Generate pairs",
+            file_paths=["/tmp/doc1.txt", "/tmp/doc2.pdf"],
+            metrics=["hallucination"],
+        )
+
+        sent_body = benchmarks_resource._post.call_args.kwargs["body"]
+        assert sent_body == {
+            "name": "Smart",
+            "description": "Desc",
+            "system_prompt": "Generate pairs",
+            "files": ["doc1.txt", "doc2.pdf"],
+            "metrics": ["hallucination"],
+        }
+
+    def test_create_smart_uploads_all_files(self, benchmarks_resource):
+        """_upload_file is invoked once per entry in file_paths."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(side_effect=["a.txt", "b.pdf", "c.csv"])
+
+        benchmarks_resource.create_smart(
+            name="S",
+            description="D",
+            system_prompt="P",
+            file_paths=["/tmp/a.txt", "/tmp/b.pdf", "/tmp/c.csv"],
+        )
+
+        assert benchmarks_resource._upload_file.call_count == 3
+
+    def test_create_smart_correct_url(self, benchmarks_resource):
+        """create_smart() POSTs to the org/project-scoped smart-benchmarks path."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(return_value="doc.txt")
+
+        benchmarks_resource.create_smart(
+            name="S",
+            description="D",
+            system_prompt="P",
+            file_paths=["/tmp/doc.txt"],
+        )
+
+        posted_path = benchmarks_resource._post.call_args.args[0]
+        assert posted_path == "/organizations/org-123/projects/proj-456/smart-benchmarks"
+
+    def test_create_smart_omits_metrics_when_none(self, benchmarks_resource):
+        """No "metrics" key is sent when the caller does not pass metrics."""
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+        benchmarks_resource._upload_file = Mock(return_value="doc.txt")
+
+        benchmarks_resource.create_smart(
+            name="S",
+            description="D",
+            system_prompt="P",
+            file_paths=["/tmp/doc.txt"],
+        )
+
+        sent_body = benchmarks_resource._post.call_args.kwargs["body"]
+        assert "metrics" not in sent_body
+
+    def test_create_smart_returns_none_on_failure(self, benchmarks_resource):
+        """A None reply from _post makes create_smart() return None."""
+        benchmarks_resource._post.return_value = None
+        benchmarks_resource._upload_file = Mock(return_value="doc.txt")
+
+        outcome = benchmarks_resource.create_smart(
+            name="S",
+            description="D",
+            system_prompt="P",
+            file_paths=["/tmp/doc.txt"],
+        )
+
+        assert outcome is None
+
+
+class TestBenchmarksUploadFile:
+    """Tests for Benchmarks._upload_file() with the URL request and the httpx PUT mocked out."""
+
+    @pytest.fixture
+    def mock_client(self):
+        """Client double with fixed organization and project IDs."""
+        stub = Mock(organization_id="org-123", project_id="proj-456")
+        stub.get_cast = Mock()
+        stub.post_cast = Mock()
+        return stub
+
+    @pytest.fixture
+    def benchmarks_resource(self, mock_client):
+        """Benchmarks resource constructed on top of the mock client."""
+        return Benchmarks(mock_client)
+
+    @pytest.fixture
+    def tmp_jsonl(self, tmp_path):
+        """Path (as str) of a one-record JSONL file named data.jsonl under pytest's tmp_path."""
+        jsonl_path = tmp_path / "data.jsonl"
+        jsonl_path.write_text('{"input": "test", "truth": "answer"}\n')
+        return str(jsonl_path)
+
+    @patch("layerlens.resources.benchmarks.benchmarks.httpx.put")
+    def test_upload_file_success_with_envelope(self, mock_put, benchmarks_resource, tmp_jsonl):
+        """The presigned URL is read from the envelope's "data" and the file is PUT to it once."""
+        mock_put.return_value = Mock(status_code=200, raise_for_status=Mock())
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"url": "https://s3.example.com/upload?signed=1"},
+        }
+
+        uploaded_name = benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT)
+
+        assert uploaded_name == "data.jsonl"
+        mock_put.assert_called_once()
+        assert mock_put.call_args[0][0] == "https://s3.example.com/upload?signed=1"
+
+    @patch("layerlens.resources.benchmarks.benchmarks.httpx.put")
+    def test_upload_file_success_without_envelope(self, mock_put, benchmarks_resource, tmp_jsonl):
+        """A reply carrying "url" at the top level (no envelope) is accepted as well."""
+        mock_put.return_value = Mock(status_code=200, raise_for_status=Mock())
+        benchmarks_resource._post.return_value = {
+            "url": "https://s3.example.com/upload?signed=1",
+        }
+
+        uploaded_name = benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT)
+
+        assert uploaded_name == "data.jsonl"
+
+    def test_upload_file_raises_on_missing_url(self, benchmarks_resource, tmp_jsonl):
+        """An enveloped reply without a "url" entry is rejected with ValueError."""
+        benchmarks_resource._post.return_value = {"status": "success", "data": {"no_url": True}}
+
+        with pytest.raises(ValueError, match="Failed to get upload URL"):
+            benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT)
+
+    def test_upload_file_raises_on_invalid_response(self, benchmarks_resource, tmp_jsonl):
+        """A non-dict reply to the upload-URL request is rejected with ValueError."""
+        benchmarks_resource._post.return_value = "not-a-dict"
+
+        with pytest.raises(ValueError, match="Failed to get upload URL"):
+            benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT)
+
+    def test_upload_file_raises_on_oversized_file(self, benchmarks_resource, tmp_path):
+        """A file whose reported size is 51 MiB is rejected with an "exceeds maximum" ValueError."""
+        tiny_file = tmp_path / "big.jsonl"
+        tiny_file.write_text("x")
+
+        # The file on disk is one byte; os.path.getsize is patched to report the oversized value.
+        reported_size = 51 * 1024 * 1024
+        with patch("os.path.getsize", return_value=reported_size), pytest.raises(ValueError, match="exceeds maximum"):
+            benchmarks_resource._upload_file(str(tiny_file), "my-bench", DEFAULT_TIMEOUT)
+
+    @patch("layerlens.resources.benchmarks.benchmarks.httpx.put")
+    def test_upload_file_sends_correct_upload_request(self, mock_put, benchmarks_resource, tmp_jsonl):
+        """The upload-URL request body names the key and file and includes type and size entries."""
+        mock_put.return_value = Mock(status_code=200, raise_for_status=Mock())
+        benchmarks_resource._post.return_value = {
+            "status": "success",
+            "data": {"url": "https://s3.example.com/upload"},
+        }
+
+        benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT)
+
+        url_request = benchmarks_resource._post.call_args
+        request_body = url_request.kwargs["body"]
+        assert request_body["key"] == "my-bench"
+        assert request_body["filename"] == "data.jsonl"
+        assert "type" in request_body
+        assert "size" in request_body
diff --git a/tests/resources/test_evaluations.py b/tests/resources/test_evaluations.py
index 02480a3..0d40f08 100644
--- a/tests/resources/test_evaluations.py
+++ b/tests/resources/test_evaluations.py
@@ -5,12 +5,22 @@
 
 from layerlens.models import (
     Evaluation,
+    ErrorAnalysis,
+    AnalysisSummary,
+    EvaluationMetric,
     EvaluationStatus,
+    EvaluationDataset,
+    EvaluationSummary,
+    EvaluationTaskType,
+    PerformanceDetails,
     EvaluationsResponse,
     CreateEvaluationsResponse,
 )
 from layerlens._constants import DEFAULT_TIMEOUT
 from layerlens.resources.evaluations.evaluations import Evaluations
+from layerlens.resources.public_evaluations.public_evaluations import (
+    PublicEvaluationsResource,
+)
 
 
 class TestEvaluations:
@@ -446,3 +456,419 @@ def test_create_evaluation_end_to_end_flow(self):
         assert "/organizations/test-org/projects/test-project/evaluations" in call_args[0][0]
         assert call_args.kwargs["body"][0]["model_id"] == mock_model.id
         assert call_args.kwargs["body"][0]["dataset_id"] == mock_benchmark.id
+
+
+class TestEvaluationModelFields:
+    """Checks that the Evaluation model parses every backend field, including the nested summary."""
+
+    @pytest.fixture
+    def full_evaluation_data(self):
+        return {
+            "id": "eval-full",
+            "status": "success",
+            "status_description": "Evaluation completed successfully",
+            "submitted_at": 1640995200,
+            "finished_at": 1640995800,
+            "model_id": "model-456",
+            "model_name": "GPT-4",
+            "model_key": "gpt-4",
+            "model_company": "OpenAI",
+            "dataset_id": "benchmark-789",
+            "dataset_name": "MMLU",
+            "average_duration": 2500,
+            "accuracy": 0.89,
+            "readability_score": 0.75,
+            "toxicity_score": 0.02,
+            "ethics_score": 0.95,
+            "failed_prompt_count": 3,
+            "queue_id": 42,
+            "summary": {
+                "name": "GPT-4 on MMLU",
+                "goal": "Evaluate general knowledge",
+                "metrics": [
+                    {"name": "accuracy", "description": "Correctness of responses"},
+                    {"name": "toxicity", "description": "Harmful content detection"},
+                ],
+                "task_types": [
+                    {"name": "multiple_choice", "description": "Select correct answer"},
+                ],
+                "dataset": {
+                    "total_size": 15908,
+                    "training_size": 0,
+                    "test_size": 15908,
+                    "characteristics": ["multi-domain", "multiple-choice"],
+                },
+                "model": {
+                    "model_name": "GPT-4",
+                    "performance": {"overall": 0.89},
+                },
+                "performance_details": {
+                    "strengths": ["Strong reasoning", "Good factual recall"],
+                    "challenges": ["Abstract math", "Ambiguous questions"],
+                },
+                "error_analysis": {
+                    "common_failure_modes": ["Off-by-one errors", "Misinterpreting negation"],
+                    "example": "Q: Which is NOT true? A: Selected a true statement.",
+                },
+                "analysis_summary": {
+                    "key_takeaways": [
+                        "Strong overall performance at 89%",
+                        "Struggles with negation-based questions",
+                    ],
+                },
+            },
+        }
+
+    def test_parse_all_fields(self, full_evaluation_data):
+        """Top-level fields parse; dataset_id/dataset_name read back as benchmark_id/benchmark_name."""
+        parsed = Evaluation(**full_evaluation_data)
+
+        assert parsed.id == "eval-full"
+        assert parsed.status == EvaluationStatus.SUCCESS
+        assert parsed.status_description == "Evaluation completed successfully"
+        assert parsed.model_id == "model-456"
+        assert parsed.model_name == "GPT-4"
+        assert parsed.model_key == "gpt-4"
+        assert parsed.model_company == "OpenAI"
+        assert parsed.benchmark_id == "benchmark-789"
+        assert parsed.benchmark_name == "MMLU"
+        assert parsed.accuracy == 0.89
+        assert parsed.readability_score == 0.75
+        assert parsed.toxicity_score == 0.02
+        assert parsed.ethics_score == 0.95
+        assert parsed.failed_prompt_count == 3
+        assert parsed.queue_id == 42
+
+    def test_parse_summary(self, full_evaluation_data):
+        """The nested "summary" dict becomes an EvaluationSummary with its name and goal."""
+        parsed = Evaluation(**full_evaluation_data)
+
+        parsed_summary = parsed.summary
+        assert parsed_summary is not None
+        assert isinstance(parsed_summary, EvaluationSummary)
+        assert parsed_summary.name == "GPT-4 on MMLU"
+        assert parsed_summary.goal == "Evaluate general knowledge"
+
+    def test_parse_summary_metrics(self, full_evaluation_data):
+        """Both summary metrics are parsed, in order, as EvaluationMetric objects."""
+        parsed = Evaluation(**full_evaluation_data)
+        parsed_metrics = parsed.summary.metrics
+
+        assert len(parsed_metrics) == 2
+        assert isinstance(parsed_metrics[0], EvaluationMetric)
+        assert parsed_metrics[0].name == "accuracy"
+        assert parsed_metrics[1].name == "toxicity"
+
+    def test_parse_summary_task_types(self, full_evaluation_data):
+        """The single summary task type is parsed as an EvaluationTaskType."""
+        parsed = Evaluation(**full_evaluation_data)
+        parsed_task_types = parsed.summary.task_types
+
+        assert len(parsed_task_types) == 1
+        assert isinstance(parsed_task_types[0], EvaluationTaskType)
+        assert parsed_task_types[0].name == "multiple_choice"
+
+    def test_parse_summary_dataset(self, full_evaluation_data):
+        """The summary's dataset section is parsed as an EvaluationDataset with sizes and traits."""
+        parsed = Evaluation(**full_evaluation_data)
+        dataset_info = parsed.summary.dataset
+
+        assert isinstance(dataset_info, EvaluationDataset)
+        assert dataset_info.total_size == 15908
+        assert dataset_info.test_size == 15908
+        assert "multi-domain" in dataset_info.characteristics
+
+    def test_parse_summary_performance_details(self, full_evaluation_data):
+        """Strengths and challenges are parsed into a PerformanceDetails object."""
+        parsed = Evaluation(**full_evaluation_data)
+        details = parsed.summary.performance_details
+
+        assert isinstance(details, PerformanceDetails)
+        assert len(details.strengths) == 2
+        assert "Strong reasoning" in details.strengths
+        assert len(details.challenges) == 2
+
+    def test_parse_summary_error_analysis(self, full_evaluation_data):
+        """Failure modes and the example text are parsed into an ErrorAnalysis object."""
+        parsed = Evaluation(**full_evaluation_data)
+        error_info = parsed.summary.error_analysis
+
+        assert isinstance(error_info, ErrorAnalysis)
+        assert len(error_info.common_failure_modes) == 2
+        assert "Off-by-one errors" in error_info.common_failure_modes
+        assert "NOT true" in error_info.example
+
+    def test_parse_summary_analysis_summary(self, full_evaluation_data):
+        """Key takeaways are parsed into an AnalysisSummary object."""
+        parsed = Evaluation(**full_evaluation_data)
+        takeaways = parsed.summary.analysis_summary
+
+        assert isinstance(takeaways, AnalysisSummary)
+        assert len(takeaways.key_takeaways) == 2
+
+    def test_missing_optional_fields_default(self):
+        """Fields absent from the payload fall back to empty strings, zeros, or None for summary."""
+        required_only = {
+            "id": "eval-min",
+            "status": "pending",
+            "submitted_at": 1640995200,
+            "finished_at": 0,
+            "model_id": "m1",
+            "dataset_id": "b1",
+            "average_duration": 0,
+            "accuracy": 0.0,
+        }
+        parsed = Evaluation(**required_only)
+
+        assert parsed.status_description == ""
+        assert parsed.model_name == ""
+        assert parsed.model_key == ""
+        assert parsed.model_company == ""
+        assert parsed.benchmark_name == ""
+        assert parsed.readability_score == 0.0
+        assert parsed.toxicity_score == 0.0
+        assert parsed.ethics_score == 0.0
+        assert parsed.failed_prompt_count == 0
+        assert parsed.queue_id == 0
+        assert parsed.summary is None
+
+    def test_null_summary_field(self):
+        """An explicit "summary": None in the payload is accepted and stays None."""
+        payload = {
+            "id": "eval-no-summary",
+            "status": "in-progress",
+            "submitted_at": 1640995200,
+            "finished_at": 0,
+            "model_id": "m1",
+            "dataset_id": "b1",
+            "average_duration": 0,
+            "accuracy": 0.0,
+            "summary": None,
+        }
+        parsed = Evaluation(**payload)
+        assert parsed.summary is None
+
+    def test_get_by_id_returns_full_evaluation(self):
+        """Evaluations.get_by_id hands back the fully populated Evaluation supplied by get_cast."""
+        expected = Evaluation(
+            id="eval-123",
+            status=EvaluationStatus.SUCCESS,
+            status_description="Done",
+            submitted_at=1640995200,
+            finished_at=1640995800,
+            model_id="m1",
+            model_name="GPT-4",
+            model_key="gpt-4",
+            model_company="OpenAI",
+            dataset_id="b1",
+            dataset_name="MMLU",
+            average_duration=2500,
+            accuracy=0.89,
+            readability_score=0.75,
+            toxicity_score=0.02,
+            ethics_score=0.95,
+            failed_prompt_count=3,
+            queue_id=42,
+            summary=EvaluationSummary(
+                name="Test",
+                goal="Evaluate",
+                metrics=[EvaluationMetric(name="accuracy", description="Correctness")],
+            ),
+        )
+
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        client.get_cast.return_value = expected
+
+        evaluations = Evaluations(client)
+        fetched = evaluations.get_by_id("eval-123")
+
+        assert fetched is not None
+        assert fetched.model_name == "GPT-4"
+        assert fetched.benchmark_name == "MMLU"
+        assert fetched.readability_score == 0.75
+        assert fetched.summary is not None
+        assert fetched.summary.goal == "Evaluate"
+
+
+class TestPublicEvaluationsResource:
+    """Tests for PublicEvaluationsResource.get_by_id() and get_many() against a mocked public client."""
+
+    @pytest.fixture
+    def mock_public_client(self):
+        """Mock PublicClient."""
+        client = Mock()
+        client.get_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def public_evaluations(self, mock_public_client):
+        """PublicEvaluationsResource instance."""
+        return PublicEvaluationsResource(mock_public_client)
+
+    @pytest.fixture
+    def sample_evaluation_data(self):
+        return {
+            "id": "eval-pub-123",
+            "status": "success",
+            "status_description": "Done",
+            "submitted_at": 1640995200,
+            "finished_at": 1640995800,
+            "model_id": "model-456",
+            "model_name": "GPT-4",
+            "dataset_id": "benchmark-789",
+            "dataset_name": "MMLU",
+            "average_duration": 2500,
+            "accuracy": 0.89,
+            "summary": {
+                "name": "GPT-4 on MMLU",
+                "goal": "Evaluate general knowledge",
+                "metrics": [{"name": "accuracy", "description": "Correctness"}],
+            },
+        }
+
+    def test_get_by_id_success(self, public_evaluations, sample_evaluation_data):
+        """get_by_id returns Evaluation on success."""
+        evaluation = Evaluation(**sample_evaluation_data)
+        public_evaluations._get.return_value = evaluation
+
+        result = public_evaluations.get_by_id("eval-pub-123")
+
+        assert isinstance(result, Evaluation)
+        assert result.id == "eval-pub-123"
+        assert result.model_name == "GPT-4"
+        assert result.summary is not None
+        assert result.summary.name == "GPT-4 on MMLU"
+
+    def test_get_by_id_correct_url(self, public_evaluations, sample_evaluation_data):
+        """get_by_id calls correct endpoint."""
+        evaluation = Evaluation(**sample_evaluation_data)
+        public_evaluations._get.return_value = evaluation
+
+        public_evaluations.get_by_id("eval-pub-123")
+
+        public_evaluations._get.assert_called_once_with(
+            "/evaluations/eval-pub-123",
+            timeout=DEFAULT_TIMEOUT,
+            cast_to=Evaluation,
+        )
+
+    def test_get_by_id_returns_none_on_invalid(self, public_evaluations):
+        """get_by_id returns None when response is not Evaluation."""
+        public_evaluations._get.return_value = None
+
+        result = public_evaluations.get_by_id("nonexistent")
+
+        assert result is None
+
+    def test_get_by_id_no_client_attached(self, public_evaluations, sample_evaluation_data):
+        """get_by_id does not attach client (public client has no org/project)."""
+        evaluation = Evaluation(**sample_evaluation_data)
+        public_evaluations._get.return_value = evaluation
+
+        result = public_evaluations.get_by_id("eval-pub-123")
+
+        assert result._client is None
+
+    def test_get_many_success(self, public_evaluations, sample_evaluation_data):
+        """get_many returns EvaluationsResponse with evaluations."""
+        resp = {
+            "evaluations": [sample_evaluation_data],
+            "total_count": 1,
+        }
+        public_evaluations._get.return_value = resp
+
+        result = public_evaluations.get_many(
+            organization_id="org-123",
+            project_id="proj-456",
+        )
+
+        assert isinstance(result, EvaluationsResponse)
+        assert len(result.evaluations) == 1
+        assert result.evaluations[0].id == "eval-pub-123"
+
+    def test_get_many_sends_org_and_project(self, public_evaluations, sample_evaluation_data):
+        """get_many sends organizationID and projectID in the "params" keyword argument."""
+        resp = {"evaluations": [sample_evaluation_data], "total_count": 1}
+        public_evaluations._get.return_value = resp
+
+        public_evaluations.get_many(
+            organization_id="org-abc",
+            project_id="proj-xyz",
+        )
+
+        # call_args[1] is the same mapping as call_args.kwargs, so one keyword lookup is enough.
+        params = public_evaluations._get.call_args.kwargs["params"]
+        assert params["organizationID"] == "org-abc"
+        assert params["projectID"] == "proj-xyz"
+
+    def test_get_many_with_filters(self, public_evaluations, sample_evaluation_data):
+        """get_many converts paging, sorting and filter arguments into string query params."""
+        resp = {"evaluations": [sample_evaluation_data], "total_count": 1}
+        public_evaluations._get.return_value = resp
+
+        public_evaluations.get_many(
+            organization_id="org-123",
+            project_id="proj-456",
+            page=2,
+            page_size=50,
+            sort_by="accuracy",
+            order="desc",
+            model_ids=["m1", "m2"],
+            benchmark_ids=["b1"],
+            status=EvaluationStatus.SUCCESS,
+        )
+
+        # call_args[1] is the same mapping as call_args.kwargs, so one keyword lookup is enough.
+        params = public_evaluations._get.call_args.kwargs["params"]
+        assert params["page"] == "2"
+        assert params["pageSize"] == "50"
+        assert params["sortBy"] == "accuracy"
+        assert params["order"] == "desc"
+        assert params["models"] == "m1,m2"
+        assert params["datasets"] == "b1"
+        assert params["status"] == "success"
+
+    def test_get_many_pagination(self, public_evaluations, sample_evaluation_data):
+        """get_many computes pagination correctly."""
+        resp = {"evaluations": [sample_evaluation_data] * 3, "total_count": 25}
+        public_evaluations._get.return_value = resp
+
+        result = public_evaluations.get_many(
+            organization_id="org-123",
+            project_id="proj-456",
+            page=1,
+            page_size=10,
+        )
+
+        assert result.pagination.page == 1
+        assert result.pagination.page_size == 10
+        assert result.pagination.total_count == 25
+        assert result.pagination.total_pages == 3  # ceil(25/10)
+
+    def test_get_many_returns_none_on_invalid(self, public_evaluations):
+        """get_many returns None when response is invalid."""
+        public_evaluations._get.return_value = "not-a-dict"
+
+        result = public_evaluations.get_many(
+            organization_id="org-123",
+            project_id="proj-456",
+        )
+
+        assert result is None
+
+    def test_get_many_empty_results(self, public_evaluations):
+        """get_many handles empty evaluations list."""
+        resp = {"evaluations": [], "total_count": 0}
+        public_evaluations._get.return_value = resp
+
+        result = public_evaluations.get_many(
+            organization_id="org-123",
+            project_id="proj-456",
+        )
+
+        assert isinstance(result, EvaluationsResponse)
+        assert len(result.evaluations) == 0
+        assert result.pagination.total_count == 0
diff --git a/tests/resources/test_models_resource.py b/tests/resources/test_models_resource.py
index 92eb7b4..4852a00 100644
--- a/tests/resources/test_models_resource.py
+++ b/tests/resources/test_models_resource.py
@@ -3,7 +3,7 @@
 import httpx
 import pytest
 
-from layerlens.models import CustomModel, PublicModel, ModelsResponse
+from layerlens.models import CustomModel, PublicModel, ModelsResponse, CreateModelResponse
 from layerlens._constants import DEFAULT_TIMEOUT
 from layerlens.resources.models.models import Models
 
@@ -555,3 +555,291 @@ def test_get_models_large_parameters_handling(self, models_resource):
         assert result[0].context_length == 200000
         assert isinstance(result[0].parameters, float)
         assert isinstance(result[0].context_length, int)
+
+
+class TestModelsAdd:
+    """Test Models.add() method."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        client.patch_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def models_resource(self, mock_client):
+        return Models(mock_client)
+
+    def test_add_single_model(self, models_resource):
+        """add() merges new ID with current models and PATCHes."""
+        existing = PublicModel(id="m1", key="m1", name="M1", description="")
+        models_resource.get = Mock(return_value=[existing])
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        result = models_resource.add("m2")
+
+        assert result is True
+        models_resource._patch.assert_called_once_with(
+            "/organizations/org-123/projects/proj-456",
+            body={"models": ["m1", "m2"]},
+            timeout=DEFAULT_TIMEOUT,
+            cast_to=dict,
+        )
+
+    def test_add_multiple_models(self, models_resource):
+        """add() handles multiple model IDs."""
+        models_resource.get = Mock(return_value=[])
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        result = models_resource.add("m1", "m2", "m3")
+
+        assert result is True
+        call_body = models_resource._patch.call_args.kwargs["body"]
+        assert call_body == {"models": ["m1", "m2", "m3"]}
+
+    def test_add_deduplicates(self, models_resource):
+        """add() deduplicates IDs already in the project."""
+        existing = PublicModel(id="m1", key="m1", name="M1", description="")
+        models_resource.get = Mock(return_value=[existing])
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        models_resource.add("m1", "m2")
+
+        call_body = models_resource._patch.call_args.kwargs["body"]
+        assert call_body == {"models": ["m1", "m2"]}
+
+    def test_add_returns_false_on_failure(self, models_resource):
+        """add() returns False when PATCH returns an unexpected (non-dict) response."""
+        models_resource.get = Mock(return_value=[])
+        models_resource._patch.return_value = "error"
+
+        result = models_resource.add("m1")
+
+        assert result is False
+
+    def test_add_with_none_get_response(self, models_resource):
+        """add() handles None from get() gracefully."""
+        models_resource.get = Mock(return_value=None)
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        result = models_resource.add("m1")
+
+        assert result is True
+        call_body = models_resource._patch.call_args.kwargs["body"]
+        assert call_body == {"models": ["m1"]}
+
+
+class TestModelsRemove:
+    """Test Models.remove() method."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        client.patch_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def models_resource(self, mock_client):
+        return Models(mock_client)
+
+    def test_remove_single_model(self, models_resource):
+        """remove() removes specified ID and PATCHes remaining."""
+        m1 = PublicModel(id="m1", key="m1", name="M1", description="")
+        m2 = PublicModel(id="m2", key="m2", name="M2", description="")
+        models_resource.get = Mock(return_value=[m1, m2])
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        result = models_resource.remove("m1")
+
+        assert result is True
+        call_body = models_resource._patch.call_args.kwargs["body"]
+        assert call_body == {"models": ["m2"]}
+
+    def test_remove_multiple_models(self, models_resource):
+        """remove() handles removing multiple IDs."""
+        m1 = PublicModel(id="m1", key="m1", name="M1", description="")
+        m2 = PublicModel(id="m2", key="m2", name="M2", description="")
+        m3 = PublicModel(id="m3", key="m3", name="M3", description="")
+        models_resource.get = Mock(return_value=[m1, m2, m3])
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        models_resource.remove("m1", "m3")
+
+        call_body = models_resource._patch.call_args.kwargs["body"]
+        assert call_body == {"models": ["m2"]}
+
+    def test_remove_nonexistent_id(self, models_resource):
+        """remove() ignores IDs that aren't in the project."""
+        m1 = PublicModel(id="m1", key="m1", name="M1", description="")
+        models_resource.get = Mock(return_value=[m1])
+        models_resource._patch.return_value = {"id": "proj-456"}
+
+        models_resource.remove("nonexistent")
+
+        call_body = models_resource._patch.call_args.kwargs["body"]
+        assert call_body == {"models": ["m1"]}
+
+    def test_remove_returns_false_on_failure(self, models_resource):
+        """remove() returns False when PATCH returns None."""
+        models_resource.get = Mock(return_value=[])
+        models_resource._patch.return_value = None
+
+        result = models_resource.remove("m1")
+
+        assert result is False
+
+
+class TestModelsCreateCustom:
+    """Test Models.create_custom() method."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        client.post_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def models_resource(self, mock_client):
+        return Models(mock_client)
+
+    def test_create_custom_success_with_envelope(self, models_resource):
+        """create_custom() unwraps envelope and returns CreateModelResponse."""
+        models_resource._post.return_value = {
+            "status": "success",
+            "data": {
+                "model_id": "new-model-123",
+                "organization_id": "org-123",
+                "project_id": "proj-456",
+            },
+        }
+
+        result = models_resource.create_custom(
+            name="Test Model",
+            key="test/model-v1",
+            description="A test model",
+            api_url="https://api.example.com/v1",
+            max_tokens=4096,
+        )
+
+        assert isinstance(result, CreateModelResponse)
+        assert result.model_id == "new-model-123"
+        assert result.organization_id == "org-123"
+        assert result.project_id == "proj-456"
+
+    def test_create_custom_success_without_envelope(self, models_resource):
+        """create_custom() works when response has no envelope."""
+        models_resource._post.return_value = {
+            "model_id": "new-model-123",
+            "organization_id": "org-123",
+            "project_id": "proj-456",
+        }
+
+        result = models_resource.create_custom(
+            name="Test Model",
+            key="test/model-v1",
+            description="A test model",
+            api_url="https://api.example.com/v1",
+            max_tokens=4096,
+        )
+
+        assert isinstance(result, CreateModelResponse)
+        assert result.model_id == "new-model-123"
+
+    def test_create_custom_sends_correct_body(self, models_resource):
+        """create_custom() sends all required fields in the request body."""
+        models_resource._post.return_value = {
+            "status": "success",
+            "data": {"model_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+
+        models_resource.create_custom(
+            name="My Model",
+            key="my/model",
+            description="desc",
+            api_url="https://example.com/v1",
+            max_tokens=8192,
+            api_key="sk-secret",
+        )
+
+        call_kwargs = models_resource._post.call_args.kwargs
+        assert call_kwargs["body"] == {
+            "name": "My Model",
+            "key": "my/model",
+            "description": "desc",
+            "api_url": "https://example.com/v1",
+            "max_tokens": 8192,
+            "api_key": "sk-secret",
+        }
+
+    def test_create_custom_omits_api_key_when_none(self, models_resource):
+        """create_custom() does not include api_key when not provided."""
+        models_resource._post.return_value = {
+            "status": "success",
+            "data": {"model_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+
+        models_resource.create_custom(
+            name="My Model",
+            key="my/model",
+            description="desc",
+            api_url="https://example.com/v1",
+            max_tokens=4096,
+        )
+
+        call_body = models_resource._post.call_args.kwargs["body"]
+        assert "api_key" not in call_body
+
+    def test_create_custom_correct_url(self, models_resource):
+        """create_custom() posts to the correct endpoint."""
+        models_resource._post.return_value = {
+            "status": "success",
+            "data": {"model_id": "x", "organization_id": "o", "project_id": "p"},
+        }
+
+        models_resource.create_custom(
+            name="M",
+            key="k",
+            description="d",
+            api_url="https://x.com",
+            max_tokens=1,
+        )
+
+        call_args = models_resource._post.call_args
+        assert call_args[0][0] == "/organizations/org-123/projects/proj-456/custom-models"
+
+    def test_create_custom_returns_none_on_failure(self, models_resource):
+        """create_custom() returns None when response is unexpected."""
+        models_resource._post.return_value = "not-a-dict"
+
+        result = models_resource.create_custom(
+            name="M",
+            key="k",
+            description="d",
+            api_url="https://x.com",
+            max_tokens=1,
+        )
+
+        assert result is None
+
+    def test_create_custom_returns_none_on_error_envelope(self, models_resource):
+        """create_custom() returns None for an error envelope whose data has no model_id."""
+        models_resource._post.return_value = {"status": "error", "data": {"message": "failed"}}
+
+        result = models_resource.create_custom(
+            name="M",
+            key="k",
+            description="d",
+            api_url="https://x.com",
+            max_tokens=1,
+        )
+
+        assert result is None

From dc1c2427f09d7322bd3ace31fc747550446370cc Mon Sep 17 00:00:00 2001
From: Marin Peko <26385728+m-peko@users.noreply.github.com>
Date: Fri, 27 Feb 2026 08:25:44 +0100
Subject: [PATCH 2/3] Improve comparisons resource (#51)

---
 docs/api-reference/public-client.md           |  52 ++++-
 examples/compare_evaluations.py               |  95 ++++-----
 examples/public_evaluations.py                |   9 +-
 .../resources/comparisons/comparisons.py      | 123 ++++++++++-
 .../public_evaluations/public_evaluations.py  |  18 +-
 tests/resources/test_comparisons.py           | 201 ++++++++++++++++++
 tests/resources/test_evaluations.py           |  36 +---
 7 files changed, 407 insertions(+), 127 deletions(-)
 create mode 100644 tests/resources/test_comparisons.py

diff --git a/docs/api-reference/public-client.md b/docs/api-reference/public-client.md
index 9a79d1e..31afc82 100644
--- a/docs/api-reference/public-client.md
+++ b/docs/api-reference/public-client.md
@@ -282,14 +282,12 @@ Returns an `Evaluation` object if found, `None` otherwise. See [Evaluations](eva
 
 ### `evaluations.get_many(...)`
 
-Retrieves evaluations for a given organization and project with optional pagination, sorting, and filtering.
+Retrieves evaluations with optional pagination, sorting, and filtering.
 
 #### Parameters
 
 | Parameter         | Type                             | Required | Description                                                        |
 | ----------------- | -------------------------------- | -------- | ------------------------------------------------------------------ |
-| `organization_id` | `str`                            | Yes      | Organization ID (MongoDB ObjectID format)                          |
-| `project_id`      | `str`                            | Yes      | Project ID (MongoDB ObjectID format)                               |
 | `page`            | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)                |
 | `page_size`       | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500)            |
 | `sort_by`         | `str \| None`                    | No       | Sort by field: `submittedAt`, `accuracy`, or `averageDuration`     |
@@ -325,10 +323,8 @@ if evaluation:
         for takeaway in evaluation.summary.analysis_summary.key_takeaways:
             print(f"  - {takeaway}")
 
-# List evaluations for an organization/project
+# List successful evaluations sorted by accuracy
 response = client.evaluations.get_many(
-    organization_id="683e63925ef7e1c53c1f4b28",
-    project_id="683e63925ef7e1c53c1f4b29",
     status=EvaluationStatus.SUCCESS,
     sort_by="accuracy",
     order="desc",
@@ -417,3 +413,47 @@ if comparison:
         print(f"  Prompt: {result.prompt[:80]}...")
         print(f"  Model 1 score: {result.score1}, Model 2 score: {result.score2}")
 ```
+
+### `comparisons.compare_models(...)`
+
+Compares two models on a benchmark by automatically finding their most recent successful evaluations. This is a convenience method that wraps `compare()`.
+
+#### Parameters
+
+| Parameter        | Type                   | Required | Description                                |
+| ---------------- | ---------------------- | -------- | ------------------------------------------ |
+| `benchmark_id`   | `str`                  | Yes      | Benchmark ID to compare on                 |
+| `model_id_1`     | `str`                  | Yes      | First model ID                             |
+| `model_id_2`     | `str`                  | Yes      | Second model ID                            |
+| `page`           | `int \| None`          | No       | Page number (1-based)                      |
+| `page_size`      | `int \| None`          | No       | Results per page                           |
+| `outcome_filter` | `str \| None`          | No       | Filter by outcome (same options as `compare`) |
+| `search`         | `str \| None`          | No       | Search within results                      |
+| `timeout`        | `float \| httpx.Timeout \| None` | No | Override request timeout               |
+
+#### Returns
+
+Returns a `ComparisonResponse` (same as `compare()`), or `None` if the comparison request fails.
+
+Raises `ValueError` if either model has no successful evaluation on the given benchmark.
+
+#### Example
+
+```python
+from layerlens import PublicClient
+
+client = PublicClient()
+
+# Compare two models on AIME 2025 - no need to look up evaluation IDs
+comparison = client.comparisons.compare_models(
+    benchmark_id="682bddc1e014f9fa440f8a91",
+    model_id_1="699f9761e014f9c3072b0513",
+    model_id_2="699f9761e014f9c3072b0512",
+    page=1,
+    page_size=10,
+)
+
+if comparison:
+    print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+    print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
+```
diff --git a/examples/compare_evaluations.py b/examples/compare_evaluations.py
index eb292b4..2293e8d 100644
--- a/examples/compare_evaluations.py
+++ b/examples/compare_evaluations.py
@@ -1,74 +1,49 @@
 #!/usr/bin/env -S poetry run python
 
-from layerlens import Stratix
-from layerlens.models import EvaluationStatus
+from layerlens import PublicClient
 
 
 def main():
-    # Construct client (API key from env or inline)
-    client = Stratix()
-
-    # --- Get successful evaluations to find a comparable pair
-    response = client.evaluations.get_many(
-        status=EvaluationStatus.SUCCESS,
-        sort_by="accuracy",
-        order="desc",
-        page_size=100,
-    )
-
-    if not response or len(response.evaluations) < 2:
-        print("Need at least 2 successful evaluations to compare, exiting")
-        return
-
-    # Find two evaluations on the same benchmark
-    eval_1 = None
-    eval_2 = None
-    for i, e1 in enumerate(response.evaluations):
-        for e2 in response.evaluations[i + 1 :]:
-            if e1.benchmark_id == e2.benchmark_id and e1.id != e2.id:
-                eval_1 = e1
-                eval_2 = e2
-                break
-        if eval_1:
-            break
-
-    if not eval_1 or not eval_2:
-        print("No two evaluations share the same benchmark, exiting")
-        return
-
-    print(f"Comparing evaluations on the same benchmark ({eval_1.benchmark_id}):")
-    print(f"  Evaluation 1: {eval_1.id} (accuracy={eval_1.accuracy:.2f}%)")
-    print(f"  Evaluation 2: {eval_2.id} (accuracy={eval_2.accuracy:.2f}%)")
-
-    # --- Get comparison results
-    comparison = client.public.comparisons.compare(
-        evaluation_id_1=eval_1.id,
-        evaluation_id_2=eval_2.id,
+    # Construct public client (API key from LAYERLENS_STRATIX_API_KEY env var or inline)
+    client = PublicClient()
+
+    # --- Compare two models on a benchmark using compare_models
+    # Just provide the benchmark and two model IDs - the SDK automatically
+    # finds the most recent successful evaluation for each model.
+    benchmark_id = "682bddc1e014f9fa440f8a91"  # AIME 2025
+    model_id_1 = "699f9761e014f9c3072b0513"  # Qwen3.5 27B
+    model_id_2 = "699f9761e014f9c3072b0512"  # Qwen3.5 122B A10B
+
+    print(f"Comparing models on benchmark {benchmark_id}...")
+    comparison = client.comparisons.compare_models(
+        benchmark_id=benchmark_id,
+        model_id_1=model_id_1,
+        model_id_2=model_id_2,
         page=1,
         page_size=10,
     )
 
     if comparison:
         print(f"\n=== Comparison Summary ===")
-        print(f"Evaluation 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
-        print(f"Evaluation 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
+        print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+        print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
         print(f"Total compared: {comparison.total_count}")
 
-        # --- Show individual results
         if comparison.results:
             print(f"\nFirst {len(comparison.results)} results:")
             for result in comparison.results:
-                score_indicator_1 = "✓" if result.score1 and result.score1 > 0.5 else "✗"
-                score_indicator_2 = "✓" if result.score2 and result.score2 > 0.5 else "✗"
+                s1 = "Y" if result.score1 and result.score1 > 0.5 else "N"
+                s2 = "Y" if result.score2 and result.score2 > 0.5 else "N"
                 print(f"  Prompt: {result.prompt[:80]}...")
-                print(f"    Model 1: {score_indicator_1} (score={result.score1})")
-                print(f"    Model 2: {score_indicator_2} (score={result.score2})")
+                print(f"    Model 1: {s1} (score={result.score1})")
+                print(f"    Model 2: {s2} (score={result.score2})")
                 print()
 
-    # --- Filter by outcome: where only model 1 fails
-    comparison = client.public.comparisons.compare(
-        evaluation_id_1=eval_1.id,
-        evaluation_id_2=eval_2.id,
+    # --- Filter: where model 1 fails but model 2 succeeds
+    comparison = client.comparisons.compare_models(
+        benchmark_id=benchmark_id,
+        model_id_1=model_id_1,
+        model_id_2=model_id_2,
         outcome_filter="reference_fails",
     )
 
@@ -76,16 +51,18 @@ def main():
         print(f"\n=== Where Model 1 Fails but Model 2 Succeeds ===")
         print(f"Found {comparison.total_count} such cases")
 
-    # --- Filter by outcome: where both models fail
-    comparison = client.public.comparisons.compare(
-        evaluation_id_1=eval_1.id,
-        evaluation_id_2=eval_2.id,
-        outcome_filter="both_fail",
+    # --- You can also compare using evaluation IDs directly
+    comparison = client.comparisons.compare(
+        evaluation_id_1="699f9938a03d70bf6607081f",  # Qwen3.5 27B on AIME 2025
+        evaluation_id_2="699f991ca782d00ebd666ba1",  # Qwen3.5 122B A10B on AIME 2025
+        page=1,
+        page_size=5,
     )
 
     if comparison:
-        print(f"\n=== Where Both Models Fail ===")
-        print(f"Found {comparison.total_count} such cases")
+        print(f"\n=== Direct Comparison by Evaluation IDs ===")
+        print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+        print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
 
 
 if __name__ == "__main__":
diff --git a/examples/public_evaluations.py b/examples/public_evaluations.py
index f28236f..a8eb588 100644
--- a/examples/public_evaluations.py
+++ b/examples/public_evaluations.py
@@ -30,13 +30,8 @@ def main():
     else:
         print(f"Evaluation {evaluation_id} not found")
 
-    # --- List evaluations for a specific organization/project
-    organization_id = "683e63925ef7e1c53c1f4b28"
-    project_id = "683e63925ef7e1c53c1f4b29"
-
+    # --- List latest evaluations
     response = client.evaluations.get_many(
-        organization_id=organization_id,
-        project_id=project_id,
         page=1,
         page_size=5,
         sort_by="submittedAt",
@@ -49,8 +44,6 @@ def main():
 
     # --- Filter by status (only successful)
     response = client.evaluations.get_many(
-        organization_id=organization_id,
-        project_id=project_id,
         status=EvaluationStatus.SUCCESS,
         sort_by="accuracy",
         order="desc",
diff --git a/src/layerlens/resources/comparisons/comparisons.py b/src/layerlens/resources/comparisons/comparisons.py
index eef469a..ed4851c 100644
--- a/src/layerlens/resources/comparisons/comparisons.py
+++ b/src/layerlens/resources/comparisons/comparisons.py
@@ -4,10 +4,19 @@
 
 import httpx
 
-from ...models import ComparisonResponse
+from ...models import EvaluationStatus, ComparisonResponse, EvaluationsResponse
 from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource
 from ..._constants import DEFAULT_TIMEOUT
 
+_OUTCOME_FILTER = Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
+
+
+def _find_evaluation_id(response: Optional[EvaluationsResponse], model_id: str, benchmark_id: str) -> str:
+    """Extract the first evaluation ID from a response, or raise ValueError."""
+    if not response or not response.evaluations:
+        raise ValueError(f"No successful evaluation found for model '{model_id}' on benchmark '{benchmark_id}'")
+    return str(response.evaluations[0].id)
+
 
 class Comparisons(SyncPublicAPIResource):
     def compare(
@@ -17,9 +26,7 @@ def compare(
         evaluation_id_2: str,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
-        outcome_filter: Optional[
-            Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
-        ] = None,
+        outcome_filter: Optional[_OUTCOME_FILTER] = None,
         search: Optional[str] = None,
         timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
     ) -> Optional[ComparisonResponse]:
@@ -48,6 +55,58 @@ def compare(
 
         return ComparisonResponse.model_validate(resp)
 
+    def compare_models(
+        self,
+        *,
+        benchmark_id: str,
+        model_id_1: str,
+        model_id_2: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        outcome_filter: Optional[_OUTCOME_FILTER] = None,
+        search: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[ComparisonResponse]:
+        """Compare two models on a benchmark by automatically finding their evaluations.
+
+        Finds the most recent successful evaluation for each model on the given
+        benchmark, then compares the results side-by-side.
+
+        Raises:
+            ValueError: If no successful evaluation is found for either model.
+        """
+        resp1 = self._client.evaluations.get_many(
+            model_ids=[model_id_1],
+            benchmark_ids=[benchmark_id],
+            status=EvaluationStatus.SUCCESS,
+            sort_by="submittedAt",
+            order="desc",
+            page_size=1,
+            timeout=timeout,
+        )
+        eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
+
+        resp2 = self._client.evaluations.get_many(
+            model_ids=[model_id_2],
+            benchmark_ids=[benchmark_id],
+            status=EvaluationStatus.SUCCESS,
+            sort_by="submittedAt",
+            order="desc",
+            page_size=1,
+            timeout=timeout,
+        )
+        eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
+
+        return self.compare(
+            evaluation_id_1=eval_id_1,
+            evaluation_id_2=eval_id_2,
+            page=page,
+            page_size=page_size,
+            outcome_filter=outcome_filter,
+            search=search,
+            timeout=timeout,
+        )
+
 
 class AsyncComparisons(AsyncPublicAPIResource):
     async def compare(
@@ -57,9 +116,7 @@ async def compare(
         evaluation_id_2: str,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
-        outcome_filter: Optional[
-            Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
-        ] = None,
+        outcome_filter: Optional[_OUTCOME_FILTER] = None,
         search: Optional[str] = None,
         timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
     ) -> Optional[ComparisonResponse]:
@@ -87,3 +144,55 @@ async def compare(
             return None
 
         return ComparisonResponse.model_validate(resp)
+
+    async def compare_models(
+        self,
+        *,
+        benchmark_id: str,
+        model_id_1: str,
+        model_id_2: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        outcome_filter: Optional[_OUTCOME_FILTER] = None,
+        search: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[ComparisonResponse]:
+        """Compare two models on a benchmark by automatically finding their evaluations.
+
+        Finds the most recent successful evaluation for each model on the given
+        benchmark, then compares the results side-by-side.
+
+        Raises:
+            ValueError: If no successful evaluation is found for either model.
+        """
+        resp1 = await self._client.evaluations.get_many(
+            model_ids=[model_id_1],
+            benchmark_ids=[benchmark_id],
+            status=EvaluationStatus.SUCCESS,
+            sort_by="submittedAt",
+            order="desc",
+            page_size=1,
+            timeout=timeout,
+        )
+        eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id)
+
+        resp2 = await self._client.evaluations.get_many(
+            model_ids=[model_id_2],
+            benchmark_ids=[benchmark_id],
+            status=EvaluationStatus.SUCCESS,
+            sort_by="submittedAt",
+            order="desc",
+            page_size=1,
+            timeout=timeout,
+        )
+        eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id)
+
+        return await self.compare(
+            evaluation_id_1=eval_id_1,
+            evaluation_id_2=eval_id_2,
+            page=page,
+            page_size=page_size,
+            outcome_filter=outcome_filter,
+            search=search,
+            timeout=timeout,
+        )
diff --git a/src/layerlens/resources/public_evaluations/public_evaluations.py b/src/layerlens/resources/public_evaluations/public_evaluations.py
index ddd1cdf..71f736d 100644
--- a/src/layerlens/resources/public_evaluations/public_evaluations.py
+++ b/src/layerlens/resources/public_evaluations/public_evaluations.py
@@ -37,8 +37,6 @@ def get_by_id(
     def get_many(
         self,
         *,
-        organization_id: str,
-        project_id: str,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
         sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None,
@@ -52,8 +50,6 @@ def get_many(
         Get evaluations with optional pagination, sorting, and filtering.
 
         Args:
-            organization_id: Organization ID (required)
-            project_id: Project ID (required)
             page: Page number for pagination (1-based, defaults to 1 if not provided)
             page_size: Number of evaluations per page (default: 100, optional)
             sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration)
@@ -66,10 +62,7 @@ def get_many(
         Returns:
             EvaluationsResponse object or None
         """
-        params = {
-            "organizationID": organization_id,
-            "projectID": project_id,
-        }
+        params: dict[str, str] = {}
 
         effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE
         effective_page = page if page is not None else DEFAULT_PAGE
@@ -137,8 +130,6 @@ async def get_by_id(
     async def get_many(
         self,
         *,
-        organization_id: str,
-        project_id: str,
         page: Optional[int] = None,
         page_size: Optional[int] = None,
         sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None,
@@ -152,8 +143,6 @@ async def get_many(
         Get evaluations with optional pagination, sorting, and filtering.
 
         Args:
-            organization_id: Organization ID (required)
-            project_id: Project ID (required)
             page: Page number for pagination (1-based, defaults to 1 if not provided)
             page_size: Number of evaluations per page (default: 100, optional)
             sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration)
@@ -166,10 +155,7 @@ async def get_many(
         Returns:
             EvaluationsResponse object or None
         """
-        params = {
-            "organizationID": organization_id,
-            "projectID": project_id,
-        }
+        params: dict[str, str] = {}
 
         effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE
         effective_page = page if page is not None else DEFAULT_PAGE
diff --git a/tests/resources/test_comparisons.py b/tests/resources/test_comparisons.py
new file mode 100644
index 0000000..85dfa2f
--- /dev/null
+++ b/tests/resources/test_comparisons.py
@@ -0,0 +1,201 @@
+from unittest.mock import Mock
+
+import pytest
+
+from layerlens.models import (
+    Evaluation,
+    Pagination,
+    EvaluationStatus,
+    ComparisonResponse,
+    EvaluationsResponse,
+)
+from layerlens.resources.comparisons.comparisons import Comparisons
+
+
+def _make_eval(eval_id: str, model_id: str, benchmark_id: str) -> Evaluation:
+    return Evaluation(
+        id=eval_id,
+        status=EvaluationStatus.SUCCESS,
+        submitted_at=1640995200,
+        finished_at=1640995800,
+        model_id=model_id,
+        dataset_id=benchmark_id,
+        average_duration=2500,
+        accuracy=0.89,
+    )
+
+
+def _make_eval_response(evaluations: list[Evaluation]) -> EvaluationsResponse:
+    return EvaluationsResponse(
+        evaluations=evaluations,
+        pagination=Pagination(
+            page=1,
+            page_size=1,
+            total_pages=1,
+            total_count=len(evaluations),
+        ),
+    )
+
+
+class TestCompareModels:
+    """Test Comparisons.compare_models convenience method."""
+
+    @pytest.fixture
+    def mock_public_client(self):
+        client = Mock()
+        client.get_cast = Mock()
+        client.evaluations = Mock()
+        return client
+
+    @pytest.fixture
+    def comparisons(self, mock_public_client):
+        return Comparisons(mock_public_client)
+
+    def test_compare_models_success(self, comparisons, mock_public_client):
+        """compare_models finds evaluations for both models and calls compare."""
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+
+        mock_public_client.evaluations.get_many.side_effect = [
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+
+        comparisons._get.return_value = {
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 5,
+            "total_results_1": 10,
+            "correct_count_2": 7,
+            "total_results_2": 10,
+        }
+
+        result = comparisons.compare_models(
+            benchmark_id="bench-1",
+            model_id_1="model-a",
+            model_id_2="model-b",
+        )
+
+        assert isinstance(result, ComparisonResponse)
+
+        # Verify get_many was called correctly for both models
+        calls = mock_public_client.evaluations.get_many.call_args_list
+        assert len(calls) == 2
+
+        assert calls[0].kwargs["model_ids"] == ["model-a"]
+        assert calls[0].kwargs["benchmark_ids"] == ["bench-1"]
+        assert calls[0].kwargs["status"] == EvaluationStatus.SUCCESS
+        assert calls[0].kwargs["sort_by"] == "submittedAt"
+        assert calls[0].kwargs["order"] == "desc"
+        assert calls[0].kwargs["page_size"] == 1
+
+        assert calls[1].kwargs["model_ids"] == ["model-b"]
+
+        # Verify compare was called with the found evaluation IDs
+        compare_call = comparisons._get.call_args
+        params = compare_call.kwargs.get("params") or compare_call[1].get("params")
+        assert params["evaluation_id_1"] == "eval-1"
+        assert params["evaluation_id_2"] == "eval-2"
+
+    def test_compare_models_model_1_not_found(self, comparisons, mock_public_client):
+        """compare_models raises ValueError when model 1 has no evaluation."""
+        mock_public_client.evaluations.get_many.return_value = _make_eval_response([])
+
+        with pytest.raises(ValueError, match="model-a"):
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_model_2_not_found(self, comparisons, mock_public_client):
+        """compare_models raises ValueError when model 2 has no evaluation."""
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+
+        mock_public_client.evaluations.get_many.side_effect = [
+            _make_eval_response([eval1]),
+            _make_eval_response([]),
+        ]
+
+        with pytest.raises(ValueError, match="model-b"):
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_none_response(self, comparisons, mock_public_client):
+        """compare_models raises ValueError when get_many returns None."""
+        mock_public_client.evaluations.get_many.return_value = None
+
+        with pytest.raises(ValueError, match="model-a"):
+            comparisons.compare_models(
+                benchmark_id="bench-1",
+                model_id_1="model-a",
+                model_id_2="model-b",
+            )
+
+    def test_compare_models_passes_through_params(self, comparisons, mock_public_client):
+        """compare_models forwards pagination, filter, and search to compare."""
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+
+        mock_public_client.evaluations.get_many.side_effect = [
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+        comparisons._get.return_value = {
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 0,
+            "total_results_1": 0,
+            "correct_count_2": 0,
+            "total_results_2": 0,
+        }
+
+        comparisons.compare_models(
+            benchmark_id="bench-1",
+            model_id_1="model-a",
+            model_id_2="model-b",
+            page=2,
+            page_size=50,
+            outcome_filter="both_succeed",
+            search="test query",
+        )
+
+        compare_call = comparisons._get.call_args
+        params = compare_call.kwargs.get("params") or compare_call[1].get("params")
+        assert params["page"] == "2"
+        assert params["pageSize"] == "50"
+        assert params["outcomeFilter"] == "both_succeed"
+        assert params["search"] == "test query"
+
+    def test_compare_models_picks_most_recent(self, comparisons, mock_public_client):
+        """compare_models requests sort by submittedAt desc to get the most recent."""
+        eval1 = _make_eval("eval-1", "model-a", "bench-1")
+        eval2 = _make_eval("eval-2", "model-b", "bench-1")
+
+        mock_public_client.evaluations.get_many.side_effect = [
+            _make_eval_response([eval1]),
+            _make_eval_response([eval2]),
+        ]
+        comparisons._get.return_value = {
+            "results": [],
+            "total_count": 0,
+            "correct_count_1": 0,
+            "total_results_1": 0,
+            "correct_count_2": 0,
+            "total_results_2": 0,
+        }
+
+        comparisons.compare_models(
+            benchmark_id="bench-1",
+            model_id_1="model-a",
+            model_id_2="model-b",
+        )
+
+        for call in mock_public_client.evaluations.get_many.call_args_list:
+            assert call.kwargs["sort_by"] == "submittedAt"
+            assert call.kwargs["order"] == "desc"
+            assert call.kwargs["page_size"] == 1
+            assert call.kwargs["status"] == EvaluationStatus.SUCCESS
diff --git a/tests/resources/test_evaluations.py b/tests/resources/test_evaluations.py
index 0d40f08..8a337eb 100644
--- a/tests/resources/test_evaluations.py
+++ b/tests/resources/test_evaluations.py
@@ -780,38 +780,18 @@ def test_get_many_success(self, public_evaluations, sample_evaluation_data):
         }
         public_evaluations._get.return_value = resp
 
-        result = public_evaluations.get_many(
-            organization_id="org-123",
-            project_id="proj-456",
-        )
+        result = public_evaluations.get_many()
 
         assert isinstance(result, EvaluationsResponse)
         assert len(result.evaluations) == 1
         assert result.evaluations[0].id == "eval-pub-123"
 
-    def test_get_many_sends_org_and_project(self, public_evaluations, sample_evaluation_data):
-        """get_many sends organizationID and projectID as params."""
-        resp = {"evaluations": [sample_evaluation_data], "total_count": 1}
-        public_evaluations._get.return_value = resp
-
-        public_evaluations.get_many(
-            organization_id="org-abc",
-            project_id="proj-xyz",
-        )
-
-        call_args = public_evaluations._get.call_args
-        params = call_args.kwargs.get("params") or call_args[1].get("params")
-        assert params["organizationID"] == "org-abc"
-        assert params["projectID"] == "proj-xyz"
-
     def test_get_many_with_filters(self, public_evaluations, sample_evaluation_data):
         """get_many passes filter parameters correctly."""
         resp = {"evaluations": [sample_evaluation_data], "total_count": 1}
         public_evaluations._get.return_value = resp
 
         public_evaluations.get_many(
-            organization_id="org-123",
-            project_id="proj-456",
             page=2,
             page_size=50,
             sort_by="accuracy",
@@ -830,6 +810,8 @@ def test_get_many_with_filters(self, public_evaluations, sample_evaluation_data)
         assert params["models"] == "m1,m2"
         assert params["datasets"] == "b1"
         assert params["status"] == "success"
+        assert "organizationID" not in params
+        assert "projectID" not in params
 
     def test_get_many_pagination(self, public_evaluations, sample_evaluation_data):
         """get_many computes pagination correctly."""
@@ -837,8 +819,6 @@ def test_get_many_pagination(self, public_evaluations, sample_evaluation_data):
         public_evaluations._get.return_value = resp
 
         result = public_evaluations.get_many(
-            organization_id="org-123",
-            project_id="proj-456",
             page=1,
             page_size=10,
         )
@@ -852,10 +832,7 @@ def test_get_many_returns_none_on_invalid(self, public_evaluations):
         """get_many returns None when response is invalid."""
         public_evaluations._get.return_value = "not-a-dict"
 
-        result = public_evaluations.get_many(
-            organization_id="org-123",
-            project_id="proj-456",
-        )
+        result = public_evaluations.get_many()
 
         assert result is None
 
@@ -864,10 +841,7 @@ def test_get_many_empty_results(self, public_evaluations):
         resp = {"evaluations": [], "total_count": 0}
         public_evaluations._get.return_value = resp
 
-        result = public_evaluations.get_many(
-            organization_id="org-123",
-            project_id="proj-456",
-        )
+        result = public_evaluations.get_many()
 
         assert isinstance(result, EvaluationsResponse)
         assert len(result.evaluations) == 0

From 99a6a152f6eda06d34026c671fce6577e084b147 Mon Sep 17 00:00:00 2001
From: Leandro Echevarria <leandro.echevarria@layerlens.ai>
Date: Fri, 27 Feb 2026 04:38:10 -0300
Subject: [PATCH 3/3] Feat | LAY-885 cicd publish (#7)

* feat | LAY-885 Added ground files for our publishing CICD

* feat | LAY-885 Added twine to publish the package. Updated scripts.

* feat | LAY-885 Addressing PR feedback

* Couple of fixes to release process

---------

Co-authored-by: m-peko <marinpeko5@gmail.com>
---
 .github/workflows/publish-sdk.yaml      | 63 ++++++++++++++++++++
 .github/workflows/release-tag.yaml      | 78 +++++++++++++++++++++++++
 .github/workflows/test-publish-sdk.yaml | 41 +++++++++++++
 Makefile                                | 48 +++++++++++++++
 pyproject.toml                          | 13 ++++-
 requirements-dev.lock                   | 58 ++++++++++++++++--
 requirements.lock                       |  6 +-
 scripts/get_version.sh                  | 10 +---
 scripts/publish.sh                      | 24 ++++++++
 scripts/push-release-tag.sh             | 72 +++++++++++++++++++++++
 scripts/template-version.sh             | 28 +++++++++
 scripts/validate-release-tag.sh         | 62 ++++++++++++++++++++
 src/layerlens/_version.py               |  3 +
 13 files changed, 488 insertions(+), 18 deletions(-)
 create mode 100644 .github/workflows/publish-sdk.yaml
 create mode 100644 .github/workflows/release-tag.yaml
 create mode 100644 .github/workflows/test-publish-sdk.yaml
 create mode 100644 Makefile
 create mode 100755 scripts/publish.sh
 create mode 100755 scripts/push-release-tag.sh
 create mode 100644 scripts/template-version.sh
 create mode 100755 scripts/validate-release-tag.sh

diff --git a/.github/workflows/publish-sdk.yaml b/.github/workflows/publish-sdk.yaml
new file mode 100644
index 0000000..bc27b0d
--- /dev/null
+++ b/.github/workflows/publish-sdk.yaml
@@ -0,0 +1,63 @@
+# This workflow is used to publish the Python SDK to the actual PyPI.
+# It is triggered by a tag push, and will only publish if the tag is valid.
+# The tag must match the format sdk-v*.*.*
+
+name: Publish Python SDK
+
+on:
+  push:
+    tags:
+      - "sdk-v*.*.*" # Trigger on version tags like sdk-v0.1.0 etc.
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    environment: production
+    outputs:
+      release_tag: ${{ steps.set_release_tag.outputs.release_tag }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Fetch all history for checking branch
+      - name: Set release tag
+        id: set_release_tag
+        # Ensure the tag is valid (matches the package version, is on the release branch, etc.)
+        run: |
+          RELEASE_TAG=${GITHUB_REF#refs/tags/}
+          echo "Using tag: $RELEASE_TAG"
+          ./scripts/validate-release-tag.sh "$RELEASE_TAG"
+          echo "RELEASE_TAG=$RELEASE_TAG" >> $GITHUB_ENV
+          echo "release_tag=$RELEASE_TAG" >> $GITHUB_OUTPUT
+
+  build-and-publish:
+    needs: validate
+    runs-on: ubuntu-latest
+    environment: production
+
+    env:
+      TWINE_USERNAME: __token__
+      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+      RELEASE_TAG: ${{ needs.validate.outputs.release_tag }}
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - name: Install build dependencies
+        run: make install-build-deps
+      - name: Build
+        run: make build
+      - name: Test wheel
+        run: make test-wheel
+      - name: Upload build artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: sdk-dist
+          path: dist/
+          retention-days: 5
+      - name: Publish to PyPI
+        run: make _publish
+        env:
+          PYPI_REPO: pypi
diff --git a/.github/workflows/release-tag.yaml b/.github/workflows/release-tag.yaml
new file mode 100644
index 0000000..ae495c3
--- /dev/null
+++ b/.github/workflows/release-tag.yaml
@@ -0,0 +1,78 @@
+# This workflow creates and pushes a release tag using the push-release-tag.sh script.
+# It is triggered manually; unless dry_run is enabled, 'YES' must be entered in the confirm_release input before the tag is created.
+
+name: Create Release Tag
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Run in dry-run mode (show what would be done without actually creating/pushing the tag)"
+        required: false
+        type: boolean
+        default: true
+      confirm_release:
+        description: "Type 'YES' to confirm you want to create and push the release tag"
+        required: true
+        type: string
+
+jobs:
+  check-branch:
+    runs-on: ubuntu-latest
+    environment: production
+    steps:
+      - name: Check if running on release branch
+        run: |
+          if [ "${{ github.ref }}" != "refs/heads/release" ]; then
+            echo "Error: This workflow can only be run from the 'release' branch."
+            echo "Current branch: ${{ github.ref }}"
+            echo "Please switch to the 'release' branch and try again."
+            exit 1
+          fi
+          echo "Running on release branch - proceeding with workflow."
+
+  create-release-tag:
+    runs-on: ubuntu-latest
+    needs: check-branch
+    environment: production
+    if: github.ref == 'refs/heads/release'
+
+    permissions:
+      contents: write # Required to create and push tags
+
+    steps:
+      - name: Validate confirmation
+        if: github.event.inputs.confirm_release != 'YES' && github.event.inputs.dry_run != 'true'
+        run: |
+          echo "Error: You must type 'YES' in the confirm_release input to proceed with creating a release tag."
+          echo "Received: '${{ github.event.inputs.confirm_release }}'"
+          exit 1
+
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Fetch all history and tags
+
+      - name: Make scripts executable
+        run: |
+          chmod +x scripts/push-release-tag.sh
+          chmod +x scripts/get_version.sh
+
+      - name: Configure Git
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Run push-release-tag script (dry-run)
+        if: github.event.inputs.dry_run == 'true'
+        run: |
+          echo "Running in dry-run mode..."
+          make push-release-tag DRY_RUN=--dry-run
+
+      - name: Run push-release-tag script
+        if: github.event.inputs.dry_run != 'true'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "Creating and pushing release tag..."
+          # Override the interactive confirmation since we already confirmed via workflow input
+          echo "YES" | make push-release-tag
diff --git a/.github/workflows/test-publish-sdk.yaml b/.github/workflows/test-publish-sdk.yaml
new file mode 100644
index 0000000..9703970
--- /dev/null
+++ b/.github/workflows/test-publish-sdk.yaml
@@ -0,0 +1,41 @@
+# This workflow is used to publish the Python SDK to TestPyPI. You do not need to bump the
+# version number to use this workflow.
+# Only bump the version number when you are ready to publish to PyPI.
+# The build (scripts/template-version.sh) automatically appends an "rc<run number>" suffix to the version for test.pypi.org releases.
+
+name: Publish Python SDK to TestPyPI
+
+on:
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: "Publish the given Git ref to test.pypi.org (branch, tag, or commit SHA)"
+        required: true
+        type: string
+        default: "main"
+
+jobs:
+  build-and-publish-test:
+    runs-on: ubuntu-latest
+
+    env:
+      TWINE_USERNAME: __token__
+      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+      PYPI_REPO: testpypi
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.ref }}
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - name: Install build dependencies
+        run: make install-build-deps
+      - name: Build
+        run: make build
+      - name: Test wheel
+        run: make test-wheel
+      - name: Publish to TestPyPI
+        run: make _publish
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e511695
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,48 @@
+install-build-deps:
+	pip install build twine
+
+build: clean _template-version
+	python -m build
+	# Restore the original version file after the build
+	git checkout src/layerlens/_version.py
+
+test-wheel:
+	pip install dist/*.whl
+	python -c "import layerlens; print('Package imported successfully')"
+
+clean:
+	rm -rf build dist
+
+_publish:
+	./scripts/publish.sh
+
+_template-version:
+	@bash scripts/template-version.sh
+
+_check-git-clean:
+	@if [ -n "$$(git status --porcelain)" ]; then \
+		echo "Error: Git working directory is not clean. Won't run publish."; \
+		exit 1; \
+	fi
+
+_verify-build-publish: _check-git-clean build test-wheel _publish
+
+publish-to-testpypi: export PYPI_REPO := testpypi
+publish-to-testpypi: _verify-build-publish
+
+publish-to-pypi: export PYPI_REPO := pypi
+publish-to-pypi: _verify-build-publish
+
+push-release-tag:
+	@bash scripts/push-release-tag.sh $(DRY_RUN)
+
+help:
+	@echo "Available targets:"
+	@echo "  build               - Build Python package"
+	@echo "  clean               - Remove build artifacts"
+	@echo "  help                - Show this help message"
+	@echo "  install-build-deps  - Install build dependencies for CI"
+	@echo "  test-wheel          - Run tests against built wheel"
+	@echo "  publish-to-pypi     - Publish to PyPI"
+	@echo "  publish-to-testpypi - Publish to TestPyPI"
+	@echo "  push-release-tag    - Create and push a release tag"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index ef86a87..f5efccc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,14 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.version]
+path = "src/layerlens/_version.py"
+pattern = '__version__ = "(?P<version>[^"]+)"'
+
 [project]
 name = "layerlens"
-version = "1.2.0"
+dynamic = ["version"]
 description = "The official Python library for the LayerLens Stratix API"
 license = "Apache-2.0"
 authors = [{ name = "LayerLens", email = "support@layerlens.ai" }]
@@ -30,7 +38,6 @@ Repository = "https://github.com/LayerLens/stratix-python"
 [project.scripts]
 layerlens = "layerlens.cli:main"
 
-
 [tool.rye]
 managed = true
 # version pins are in requirements-dev.lock
@@ -41,6 +48,8 @@ dev-dependencies = [
   "pytest-cov>=6.2.1",
   "ruff",
   "types-requests",
+  "build",
+  "twine==6.1.0",
 ]
 
 [tool.rye.scripts]
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 9c0f730..2aaa85b 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -4,7 +4,7 @@
 # last locked with the following flags:
 #   pre: false
 #   features: []
-#   all-features: false
+#   all-features: true
 #   with-sources: false
 #   generate-hashes: false
 #   universal: false
@@ -14,6 +14,9 @@ annotated-types==0.7.0
     # via pydantic
 anyio==4.9.0
     # via httpx
+backports-tarfile==1.2.0
+    # via jaraco-context
+build==1.3.0
 certifi==2025.7.14
     # via httpcore
     # via httpx
@@ -22,6 +25,8 @@ charset-normalizer==3.4.3
     # via requests
 coverage==7.10.2
     # via pytest-cov
+docutils==0.22
+    # via readme-renderer
 exceptiongroup==1.3.0
     # via anyio
     # via pytest
@@ -30,44 +35,86 @@ h11==0.16.0
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
-    # via atlas
+    # via test-atlas-lzok
+id==1.5.0
+    # via twine
 idna==3.10
     # via anyio
     # via httpx
     # via requests
+importlib-metadata==8.7.0
+    # via build
+    # via keyring
+    # via twine
 iniconfig==2.1.0
     # via pytest
+jaraco-classes==3.4.0
+    # via keyring
+jaraco-context==6.0.1
+    # via keyring
+jaraco-functools==4.2.1
+    # via keyring
+keyring==25.6.0
+    # via twine
+markdown-it-py==3.0.0
+    # via rich
+mdurl==0.1.2
+    # via markdown-it-py
+more-itertools==10.7.0
+    # via jaraco-classes
+    # via jaraco-functools
 mypy==1.17.0
 mypy-extensions==1.1.0
     # via mypy
+nh3==0.3.0
+    # via readme-renderer
 nodeenv==1.9.1
     # via pyright
 packaging==25.0
+    # via build
     # via pytest
+    # via twine
 pathspec==0.12.1
     # via mypy
 pluggy==1.6.0
     # via pytest
     # via pytest-cov
 pydantic==2.11.7
-    # via atlas
+    # via test-atlas-lzok
 pydantic-core==2.33.2
     # via pydantic
 pygments==2.19.2
     # via pytest
+    # via readme-renderer
+    # via rich
+pyproject-hooks==1.2.0
+    # via build
 pyright==1.1.399
 pytest==8.4.1
     # via pytest-cov
 pytest-cov==6.2.1
+readme-renderer==44.0
+    # via twine
 requests==2.32.5
-    # via atlas
+    # via id
+    # via layerlens
+    # via requests-toolbelt
+    # via twine
+requests-toolbelt==1.0.0
+    # via twine
+rfc3986==2.0.0
+    # via twine
+rich==14.1.0
+    # via twine
 ruff==0.12.7
 sniffio==1.3.1
     # via anyio
 tomli==2.2.1
+    # via build
     # via coverage
     # via mypy
     # via pytest
+twine==6.1.0
 types-requests==2.32.4.20250809
 typing-extensions==4.14.1
     # via anyio
@@ -81,4 +128,7 @@ typing-inspection==0.4.1
     # via pydantic
 urllib3==2.5.0
     # via requests
+    # via twine
     # via types-requests
+zipp==3.23.0
+    # via importlib-metadata
diff --git a/requirements.lock b/requirements.lock
index 887d3cc..540f4d6 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -4,7 +4,7 @@
 # last locked with the following flags:
 #   pre: false
 #   features: []
-#   all-features: false
+#   all-features: true
 #   with-sources: false
 #   generate-hashes: false
 #   universal: false
@@ -27,13 +27,13 @@ h11==0.16.0
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
-    # via atlas
+    # via test-atlas-lzok
 idna==3.10
     # via anyio
     # via httpx
     # via requests
 pydantic==2.11.7
-    # via atlas
+    # via test-atlas-lzok
 pydantic-core==2.33.2
     # via pydantic
 requests==2.32.5
diff --git a/scripts/get_version.sh b/scripts/get_version.sh
index 42caa27..04ab2da 100755
--- a/scripts/get_version.sh
+++ b/scripts/get_version.sh
@@ -6,24 +6,16 @@ set -e
 ROOT_DIR=$(git rev-parse --show-toplevel)
 VERSION_FILE="$ROOT_DIR/src/layerlens/_version.py"
 
-echo "Debug: ROOT_DIR=$ROOT_DIR" >&2
-echo "Debug: VERSION_FILE=$VERSION_FILE" >&2
-
 if [ ! -f "$VERSION_FILE" ]; then
     echo "Error: Version file not found at $VERSION_FILE" >&2
     exit 1
 fi
 
-echo "Debug: File exists, content:" >&2
-cat "$VERSION_FILE" >&2
-
 VERSION=$(grep -E '^__version__\s*=' "$VERSION_FILE" | grep -o '".*"' | tr -d '"')
 
-echo "Debug: Extracted version='$VERSION'" >&2
-
 if [ -z "$VERSION" ]; then
     echo "Error: Could not extract version from $VERSION_FILE" >&2
     exit 1
 fi
 
-echo "$VERSION"
\ No newline at end of file
+echo "$VERSION"
diff --git a/scripts/publish.sh b/scripts/publish.sh
new file mode 100755
index 0000000..e6ac0f7
--- /dev/null
+++ b/scripts/publish.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Publish the package to PyPI or TestPyPI depending
+# on the PYPI_REPO (pypi | testpypi) environment variable
+
+if [ -z "$PYPI_REPO" ]; then
+    echo "Error: PYPI_REPO environment variable must be set"
+    exit 1
+fi
+
+if [ "$PYPI_REPO" != "pypi" ] && [ "$PYPI_REPO" != "testpypi" ]; then
+    echo "Error: PYPI_REPO must be either 'pypi' or 'testpypi'"
+    exit 1
+fi
+
+VERSION=$(bash scripts/get_version.sh)
+
+if [ -z "$VERSION" ]; then
+    echo "Error: Could not determine version"
+    exit 1
+fi
+
+echo "Publishing version $VERSION to $PYPI_REPO"
+
+twine upload --repository "$PYPI_REPO" dist/*
\ No newline at end of file
diff --git a/scripts/push-release-tag.sh b/scripts/push-release-tag.sh
new file mode 100755
index 0000000..64ba944
--- /dev/null
+++ b/scripts/push-release-tag.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+set -euo pipefail
+
+ROOT_DIR=$(git rev-parse --show-toplevel)
+
+# Parse command line arguments
+DRY_RUN=false
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --dry-run)
+      DRY_RUN=true
+      shift
+      ;;
+    *)
+      echo "Unknown option: $1"
+      echo "Usage: $0 [--dry-run]"
+      exit 1
+      ;;
+  esac
+done
+
+git fetch --tags --prune
+
+REPO_URL="https://github.com/LayerLens/atlas-python"
+TAG_PREFIX="sdk-v"
+COMMIT=$(git rev-parse --short HEAD)
+VERSION=$(bash "$ROOT_DIR/scripts/get_version.sh")
+TAG="${TAG_PREFIX}${VERSION}"
+
+if git rev-parse "$TAG" >/dev/null 2>&1; then
+  echo "Error: Tag $TAG already exists"
+  exit 1
+fi
+
+# Find the most recent version tag
+LAST_RELEASE=$(git tag -l "${TAG_PREFIX}*" --sort=-v:refname | head -n 1)
+
+echo "================================================"
+echo "  Atlas Python SDK Release"
+echo "================================================"
+echo "version:      ${TAG}"
+echo "commit:       ${COMMIT}"
+echo "code:         ${REPO_URL}/commit/${COMMIT}"
+echo "changeset:    ${REPO_URL}/compare/${LAST_RELEASE}...${COMMIT}"
+
+if [ "$DRY_RUN" = true ]; then
+  exit 0
+fi
+
+echo ""
+echo ""
+echo "Are you ready to release version ${VERSION}? Type 'YES' to continue:"
+read -r CONFIRMATION
+
+if [ "$CONFIRMATION" != "YES" ]; then
+  echo "Release cancelled."
+  exit 1
+fi
+
+# Create and push the tag
+echo ""
+echo "Creating and pushing tag ${TAG}"
+echo ""
+
+git tag "$TAG" "$COMMIT"
+git push origin "$TAG"
+
+echo ""
+echo "Tag ${TAG} has been created and pushed to origin. Check GitHub Actions for build progress:"
+echo "https://github.com/LayerLens/atlas-python/actions/workflows/publish-sdk.yaml"
+echo ""
\ No newline at end of file
diff --git a/scripts/template-version.sh b/scripts/template-version.sh
new file mode 100644
index 0000000..d3d8b84
--- /dev/null
+++ b/scripts/template-version.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -e
+
+VERSION_FILE="src/layerlens/_version.py"
+
+GIT_COMMIT=$(git rev-parse HEAD)
+
+sed_inplace() {
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        sed -i '' "$@"
+    else
+        sed -i "$@"
+    fi
+}
+
+# Update git commit hash
+sed_inplace "s/__GIT_COMMIT__/$GIT_COMMIT/g" "$VERSION_FILE"
+
+# Get current version
+CURRENT_VERSION=$(grep '__version__ = ' "$VERSION_FILE" | cut -d'"' -f2)
+
+# If we're uploading to testpypi, add a run number to the version so we can
+# test multiple times.
+if [[ "$PYPI_REPO" == "testpypi" ]] && [[ -n "$GITHUB_RUN_NUMBER" ]]; then
+    NEW_VERSION="${CURRENT_VERSION}rc${GITHUB_RUN_NUMBER}"
+    sed_inplace "s/__version__ = \".*\"/__version__ = \"$NEW_VERSION\"/" "$VERSION_FILE"
+fi
diff --git a/scripts/validate-release-tag.sh b/scripts/validate-release-tag.sh
new file mode 100755
index 0000000..175464e
--- /dev/null
+++ b/scripts/validate-release-tag.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Validate release requirements
+# - Checks that the tag starts with the 'sdk-v' prefix (e.g. sdk-v1.2.0)
+# - Checks if the tag matches the version in the package
+# - Ensures we're releasing from the release branch
+
+set -e
+
+# Get the tag from the first command line argument
+if [ $# -eq 0 ]; then
+  echo "ERROR: Release tag argument not provided"
+  echo "Usage: $0 <release-tag>"
+  exit 1
+fi
+
+ROOT_DIR=$(git rev-parse --show-toplevel)
+
+# Fetch the latest tags to ensure we're up to date
+git fetch --tags --prune --force
+
+TAG=$1
+
+# Check if tag starts with sdk-v
+if [[ ! "$TAG" =~ ^sdk-v ]]; then
+  echo "ERROR: Tag must start with 'sdk-v'"
+  exit 1
+fi
+
+# Extract version without the 'sdk-v' prefix
+VERSION=${TAG#sdk-v}
+
+PACKAGE_VERSION=$(bash "$ROOT_DIR/scripts/get_version.sh")
+
+# Check if the tag version matches the package version
+if [ "$VERSION" != "$PACKAGE_VERSION" ]; then
+  echo "ERROR: Tag version ($VERSION) does not match package version ($PACKAGE_VERSION)"
+  exit 1
+fi
+
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+if [ "$CURRENT_BRANCH" != "release" ]; then
+  # If we're in detached HEAD state (which is likely in GitHub Actions with a tag),
+  # we need to check if the tag is on the release branch
+  if ! git rev-parse "$TAG" &>/dev/null; then
+    echo "ERROR: Tag $TAG does not exist in the repository"
+    exit 1
+  fi
+
+  TAG_COMMIT=$(git rev-parse "$TAG")
+
+  # Ensure we have release branch history
+  git fetch origin release --depth=1000
+
+  # Check if tag is on release branch
+  if ! git merge-base --is-ancestor "$TAG_COMMIT" origin/release; then
+    echo "ERROR: Tag $TAG is not on the release branch"
+    exit 1
+  fi
+fi
+
+# All checks passed
+exit 0
\ No newline at end of file
diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py
index c68196d..8fb65ee 100644
--- a/src/layerlens/_version.py
+++ b/src/layerlens/_version.py
@@ -1 +1,4 @@
 __version__ = "1.2.0"
+
+# Will be templated during the build
+__git_commit__ = "__GIT_COMMIT__"