From bc9b0825ebfa4f37f4e5060aba1a3870f1f29058 Mon Sep 17 00:00:00 2001
From: Marin Peko <26385728+m-peko@users.noreply.github.com>
Date: Wed, 11 Mar 2026 12:17:20 +0100
Subject: [PATCH] Merge pull request #57 from LayerLens/bugfix/minor-issues

Bugfix/minor issues
---
 README.md                                    | 266 +++++++++++++++
 docs/README.md                               | 282 ++++++++++++----
 docs/SUMMARY.md                              |   6 +-
 docs/api-reference/models-benchmarks.md      |  14 +
 docs/api-reference/traces.md                 |   8 +
 docs/examples/README.md                      |  39 +++
 docs/examples/creating-evaluations.md        | 337 ++++++++++++++-----
 docs/examples/judges-and-traces.md           | 301 ++++++++++-------
 docs/examples/models-and-benchmarks.md       | 256 ++++++++++++++
 docs/examples/public-api.md                  | 279 +++++++++++++++
 docs/examples/retrieving-results.md          | 194 +++++++++++
 examples/all_results_no_pagination.py        |   2 +-
 examples/async_client.py                     |   2 +-
 examples/async_client_simple.py              |   2 +-
 examples/async_judges_and_traces.py          |   2 +-
 examples/async_run_evaluations.py            |   2 +-
 examples/client.py                           |   2 +-
 examples/client_simple.py                    |   2 +-
 examples/compare_evaluations.py              |   2 +-
 examples/create_custom_benchmark.py          |   2 +-
 examples/create_custom_model.py              |   6 +-
 examples/create_smart_benchmark.py           |   2 +-
 examples/evaluation_sorting.py               |   2 +-
 examples/fetch_results_async.py              |   2 +-
 examples/get_benchmarks.py                   |   2 +-
 examples/get_evaluation.py                   |   2 +-
 examples/get_models.py                       |   2 +-
 examples/judge_optimizations.py              |   2 +-
 examples/judges.py                           |   2 +-
 examples/manage_project_models_benchmarks.py |   2 +-
 examples/paginated_results.py                |   2 +-
 examples/public_benchmarks.py                |   2 +-
 examples/public_evaluations.py               |   2 +-
 examples/public_models.py                    |   2 +-
 examples/trace_evaluations.py                |   2 +-
 examples/traces.py                           |   2 +-
 pyproject.toml                               |   4 +-
 src/layerlens/_base_client.py                | 128 ++++---
 src/layerlens/_client.py                     |  12 +-
 src/layerlens/_constants.py                  |   2 +
 src/layerlens/_public_client.py              |   6 +-
 src/layerlens/_version.py                    |   2 +-
 src/layerlens/cli.py                         |  16 +
 43 files changed, 1861 insertions(+), 345 deletions(-)
 create mode 100644 README.md
 create mode 100644 docs/examples/models-and-benchmarks.md
 create mode 100644 docs/examples/public-api.md
 create mode 100644 docs/examples/retrieving-results.md
 create mode 100644 src/layerlens/cli.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a6de2ce
--- /dev/null
+++ b/README.md
@@ -0,0 +1,266 @@
+# LayerLens Stratix Python SDK
+
+The official Python library for the [LayerLens Stratix](https://layerlens.ai) evaluation API.
+
+[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+
+## Installation
+
+```bash
+pip install layerlens --extra-index-url https://sdk.layerlens.ai/package
+```
+
+## Authentication
+
+Set your API key as an environment variable:
+
+```bash
+export LAYERLENS_STRATIX_API_KEY="your-api-key"
+```
+
+Or pass it directly when creating a client:
+
+```python
+from layerlens import Stratix
+
+client = Stratix(api_key="your-api-key")
+```
+
+## Quick Start
+
+### Run an evaluation
+
+```python
+import os
+from layerlens import Stratix
+
+client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+
+# Get a model and benchmark by key
+model = client.models.get_by_key("openai/gpt-4o")
+benchmark = client.benchmarks.get_by_key("arc-agi-2")
+
+# Create an evaluation (pass the full model and benchmark objects)
+evaluation = client.evaluations.create(
+    model=model,
+    benchmark=benchmark,
+)
+
+# Wait for results (pass the evaluation object, not just the ID)
+result = client.evaluations.wait_for_completion(evaluation)
+print(f"Accuracy: {result.accuracy}")
+```
+
+### Async usage
+
+```python
+import os
+import asyncio
+from layerlens import AsyncStratix
+
+async def main():
+    client = AsyncStratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+
+    model = await client.models.get_by_key("openai/gpt-4o")
+    benchmark = await client.benchmarks.get_by_key("arc-agi-2")
+
+    evaluation = await client.evaluations.create(
+        model=model,
+        benchmark=benchmark,
+    )
+
+    result = await client.evaluations.wait_for_completion(evaluation)
+    print(f"Accuracy: {result.accuracy}")
+
+asyncio.run(main())
+```
+
+### Public endpoints
+
+Public models, benchmarks, and evaluations are accessible through `client.public`. Note: the public client still requires an API key.
+
+```python
+import os
+from layerlens import Stratix
+
+client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+
+# Browse public models
+models = client.public.models.get()
+for model in models.models:
+    print(f"{model.key}: {model.name}")
+```
+
+Or instantiate the public client directly:
+
+```python
+import os
+from layerlens import PublicClient
+
+public = PublicClient(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+models = public.models.get()
+```
+
+## Resources
+
+The SDK provides access to these resource types:
+
+| Resource                     | Description                                                                   |
+| ---------------------------- | ----------------------------------------------------------------------------- |
+| `client.models`              | Manage models (get, get_by_key, add, remove, create_custom)                   |
+| `client.benchmarks`          | Manage benchmarks (get, get_by_key, add, remove, create_custom, create_smart) |
+| `client.evaluations`         | Create evaluations and wait for results                                       |
+| `client.judges`              | CRUD operations for evaluation judges                                         |
+| `client.traces`              | Upload trace files and manage traces                                          |
+| `client.trace_evaluations`   | Run trace-level evaluations with judges                                       |
+| `client.judge_optimizations` | Optimize judge configurations                                                 |
+| `client.results`             | Retrieve evaluation results                                                   |
+| `client.public`              | Public models, benchmarks, evaluations, and comparisons                       |
+
+Every resource is available in both sync (`Stratix`) and async (`AsyncStratix`) clients.
+
+## Examples
+
+### Working with judges
+
+```python
+# Create a judge (name and evaluation_goal are required)
+judge = client.judges.create(
+    name="Response Quality Judge",
+    evaluation_goal="Rate whether the response is accurate, complete, and well-structured",
+)
+
+# List judges (returns a JudgesResponse with .judges list)
+response = client.judges.get_many()
+for j in response.judges:
+    print(f"{j.name} (id: {j.id})")
+
+# Update a judge
+client.judges.update(judge.id, name="Updated Judge Name")
+
+# Delete a judge
+client.judges.delete(judge.id)
+```
+
+### Uploading and evaluating traces
+
+Trace upload works with JSON or JSONL files (up to 50 MB). The SDK handles presigned S3 uploads automatically.
+
+```python
+# Upload a trace file (pass a file path, not raw data)
+result = client.traces.upload("./my_traces.json")
+print(f"Uploaded trace IDs: {result.trace_ids}")
+
+# List traces
+traces = client.traces.get_many()
+for t in traces.traces:
+    print(f"Trace {t.id}")
+
+# Create a trace evaluation (`t` is the last trace listed above; `judge` must be an existing, non-deleted judge)
+trace_eval = client.trace_evaluations.create(
+    trace_id=t.id,
+    judge_id=judge.id,
+)
+
+# Get results
+results = client.trace_evaluations.get_results(trace_eval.id)
+```
+
+### Custom models
+
+Custom models require an OpenAI-compatible API endpoint.
+
+```python
+response = client.models.create_custom(
+    name="My Fine-tuned Model",
+    key="my-org/custom-model-v1",
+    description="Fine-tuned GPT for medical Q&A",
+    api_url="https://my-api.example.com/v1",
+    max_tokens=4096,
+    api_key=os.environ.get("MY_PROVIDER_API_KEY"),  # optional
+)
+print(f"Created model: {response.model_id}")
+```
+
+## Client aliases
+
+For backward compatibility, multiple import names are available:
+
+```python
+from layerlens import Stratix          # Primary
+from layerlens import AsyncStratix     # Async primary
+from layerlens import Client           # Alias for Stratix
+from layerlens import AsyncClient      # Alias for AsyncStratix
+from layerlens import Atlas            # Legacy alias
+from layerlens import AsyncAtlas       # Legacy alias
+from layerlens import PublicClient     # Public endpoints
+from layerlens import AsyncPublicClient  # Async public endpoints
+```
+
+## Configuration
+
+| Environment Variable         | Description               | Default                           |
+| ---------------------------- | ------------------------- | --------------------------------- |
+| `LAYERLENS_STRATIX_API_KEY`  | Your API key              | (required)                        |
+| `LAYERLENS_STRATIX_BASE_URL` | Override the API base URL | `https://api.layerlens.ai/api/v1` |
+
+Legacy env vars (`LAYERLENS_ATLAS_API_KEY`, `LAYERLENS_ATLAS_BASE_URL`) are also supported.
+
+## Error handling
+
+The SDK raises typed exceptions for API errors:
+
+```python
+import os
+from layerlens import Stratix, StratixError, APIError, BadRequestError, NotFoundError
+
+client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+
+try:
+    result = client.models.get_by_id("nonexistent-id")
+except NotFoundError as e:
+    print(f"Not found (HTTP {e.status_code}): {e.message}")
+except BadRequestError as e:
+    print(f"Bad request: {e.message}")
+except APIError as e:
+    print(f"API error: {e.message}")
+except StratixError as e:
+    print(f"Client error: {e}")
+```
+
+Catch the most specific exception first. The hierarchy:
+
+- `StratixError` (base for all SDK errors)
+  - `APIError` (base for all API-related errors)
+    - `APIConnectionError` (network issues)
+      - `APITimeoutError` (request timed out)
+    - `APIResponseValidationError` (response didn't match expected schema)
+    - `APIStatusError` (HTTP 4xx/5xx)
+      - `BadRequestError` (400)
+      - `AuthenticationError` (401)
+      - `PermissionDeniedError` (403)
+      - `NotFoundError` (404)
+      - `ConflictError` (409)
+      - `UnprocessableEntityError` (422)
+      - `RateLimitError` (429)
+      - `InternalServerError` (500+)
+
+Note: Only `StratixError`, `APIError`, `BadRequestError`, `AuthenticationError`, and `NotFoundError` are exported from the top-level package. For other exception types, import from `layerlens._exceptions`.
+
+## Requirements
+
+- Python 3.8+
+- Dependencies: `httpx`, `pydantic`, `requests`
+
+## Documentation
+
+Full API reference and examples are available in the [docs/](docs/) directory:
+
+- [API Reference](docs/api-reference/) (client config, all resource methods, error handling)
+- [Code Examples](docs/examples/) (evaluations, judges, traces)
+- [Troubleshooting](docs/troubleshooting/) (auth issues, error codes)
+
+## License
+
+Apache 2.0. See [LICENSE](LICENSE) for details.
diff --git a/docs/README.md b/docs/README.md
index d1dbfa4..a6de2ce 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,98 +1,266 @@
-# Layerlens Python SDK Documentation
+# LayerLens Stratix Python SDK
 
-Welcome to the official documentation for the Layerlens Python SDK for the Stratix platform. This library provides convenient programmatic to the Stratix platform from any Python 3.8+ application.
+The official Python library for the [LayerLens Stratix](https://layerlens.ai) evaluation API.
 
-## What is Stratix?
+[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
 
-Stratix is an evaluation platform that allows you to benchmark AI models against various datasets and metrics. The Python SDK provides two HTTP clients (syncronous and asynchronous) powered by [httpx](https://github.com/encode/httpx) and [Pydantic](https://pydantic.dev/) models for type-safe API interactions.
+## Installation
 
-## Quick Start
+```bash
+pip install layerlens --extra-index-url https://sdk.layerlens.ai/package
+```
 
-### Install LayerLens python sdk
+## Authentication
 
-Install the layerlens python sdk using the following command
+Set your API key as an environment variable:
 
 ```bash
-pip install layerlens --index-url https://sdk.layerlens.ai/package
+export LAYERLENS_STRATIX_API_KEY="your-api-key"
 ```
 
-### Generate an api key on the Stratix platform
+Or pass it directly when creating a client:
 
-Login to your organization at [app.layerlens.ai](https://app.layerlens.ai) to generate an api key. Admin users of organizations can generate a keys in the settings page.
-
-Run this command to add your API key to your environment:
+```python
+from layerlens import Stratix
 
-```bash
-export LAYERLENS_STRATIX_API_KEY="YOUR_API_KEY"
+client = Stratix(api_key="your-api-key")
 ```
 
-### Running an evaluation on the Stratix platform
-
-Before triggering an evaluation using the sdk, login to your organization at [app.layerlens.ai](https://app.layerlens.ai) to ensure that the model and benchmark you are trying to evaluate has been added to your organizations dashboard.
+## Quick Start
 
-#### Using synchronous client
+### Run an evaluation
 
 ```python
+import os
 from layerlens import Stratix
 
-    # Construct sync client
-    client = Stratix()
+client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
 
-    # --- Models replace with the model key you want to run
-    model = client.models.get_by_key("openai/gpt-4o")
+# Get a model and benchmark by key
+model = client.models.get_by_key("openai/gpt-4o")
+benchmark = client.benchmarks.get_by_key("arc-agi-2")
 
-    if not model:
-      print("Model not found")
+# Create an evaluation (pass the full model and benchmark objects)
+evaluation = client.evaluations.create(
+    model=model,
+    benchmark=benchmark,
+)
 
-    # --- Benchmarks replace with the benchmark name you want to run
-    benchmark = client.benchmarks.get_by_key("aime2024")
+# Wait for results (pass the evaluation object, not just the ID)
+result = client.evaluations.wait_for_completion(evaluation)
+print(f"Accuracy: {result.accuracy}")
+```
+
+### Async usage
 
-    if not benchmark:
-      print("benchmark not found")
+```python
+import os
+import asyncio
+from layerlens import AsyncStratix
 
-    # --- Create evaluation
-    evaluation = client.evaluations.create(
+async def main():
+    client = AsyncStratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+
+    model = await client.models.get_by_key("openai/gpt-4o")
+    benchmark = await client.benchmarks.get_by_key("arc-agi-2")
+
+    evaluation = await client.evaluations.create(
         model=model,
         benchmark=benchmark,
     )
+
+    result = await client.evaluations.wait_for_completion(evaluation)
+    print(f"Accuracy: {result.accuracy}")
+
+asyncio.run(main())
 ```
 
-#### Using Async Client
+### Public endpoints
+
+Public models, benchmarks, and evaluations are accessible through `client.public`. Note: the public client still requires an API key.
 
 ```python
-import asyncio
-from layerlens import AsyncStratix
+import os
+from layerlens import Stratix
 
-async def run_evaluation_async():
-    # Construct async client
-    client = AsyncStratix()
+client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
 
-    # --- Model to use
-    model = await client.models.get_by_key("openai/gpt-4o")
+# Browse public models
+models = client.public.models.get()
+for model in models.models:
+    print(f"{model.key}: {model.name}")
+```
 
-    if not model:
-        print("Model not found")
-        return
+Or instantiate the public client directly:
 
-    # --- Benchmark to use
-    benchmark = await client.benchmarks.get_by_key("aime2024")
+```python
+import os
+from layerlens import PublicClient
 
-    if not benchmark:
-        print("benchmark not found")
-        return
+public = PublicClient(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+models = public.models.get()
+```
 
-    # --- Create evaluation
-    evaluation = await client.evaluations.create(
-        model=model,
-        benchmark=benchmark,
-    )
+## Resources
+
+The SDK provides access to these resource types:
+
+| Resource                     | Description                                                                   |
+| ---------------------------- | ----------------------------------------------------------------------------- |
+| `client.models`              | Manage models (get, get_by_key, add, remove, create_custom)                   |
+| `client.benchmarks`          | Manage benchmarks (get, get_by_key, add, remove, create_custom, create_smart) |
+| `client.evaluations`         | Create evaluations and wait for results                                       |
+| `client.judges`              | CRUD operations for evaluation judges                                         |
+| `client.traces`              | Upload trace files and manage traces                                          |
+| `client.trace_evaluations`   | Run trace-level evaluations with judges                                       |
+| `client.judge_optimizations` | Optimize judge configurations                                                 |
+| `client.results`             | Retrieve evaluation results                                                   |
+| `client.public`              | Public models, benchmarks, evaluations, and comparisons                       |
+
+Every resource is available in both sync (`Stratix`) and async (`AsyncStratix`) clients.
 
-if __name__ == "__main__":
-    asyncio.run(main())
+## Examples
+
+### Working with judges
+
+```python
+# Create a judge (name and evaluation_goal are required)
+judge = client.judges.create(
+    name="Response Quality Judge",
+    evaluation_goal="Rate whether the response is accurate, complete, and well-structured",
+)
+
+# List judges (returns a JudgesResponse with .judges list)
+response = client.judges.get_many()
+for j in response.judges:
+    print(f"{j.name} (id: {j.id})")
+
+# Update a judge
+client.judges.update(judge.id, name="Updated Judge Name")
+
+# Delete a judge
+client.judges.delete(judge.id)
+```
+
+### Uploading and evaluating traces
+
+Trace upload works with JSON or JSONL files (up to 50 MB). The SDK handles presigned S3 uploads automatically.
+
+```python
+# Upload a trace file (pass a file path, not raw data)
+result = client.traces.upload("./my_traces.json")
+print(f"Uploaded trace IDs: {result.trace_ids}")
+
+# List traces
+traces = client.traces.get_many()
+for t in traces.traces:
+    print(f"Trace {t.id}")
+
+# Create a trace evaluation (`t` is the last trace listed above; `judge` must be an existing, non-deleted judge)
+trace_eval = client.trace_evaluations.create(
+    trace_id=t.id,
+    judge_id=judge.id,
+)
+
+# Get results
+results = client.trace_evaluations.get_results(trace_eval.id)
+```
+
+### Custom models
+
+Custom models require an OpenAI-compatible API endpoint.
+
+```python
+response = client.models.create_custom(
+    name="My Fine-tuned Model",
+    key="my-org/custom-model-v1",
+    description="Fine-tuned GPT for medical Q&A",
+    api_url="https://my-api.example.com/v1",
+    max_tokens=4096,
+    api_key=os.environ.get("MY_PROVIDER_API_KEY"),  # optional
+)
+print(f"Created model: {response.model_id}")
+```
+
+## Client aliases
+
+For backward compatibility, multiple import names are available:
+
+```python
+from layerlens import Stratix          # Primary
+from layerlens import AsyncStratix     # Async primary
+from layerlens import Client           # Alias for Stratix
+from layerlens import AsyncClient      # Alias for AsyncStratix
+from layerlens import Atlas            # Legacy alias
+from layerlens import AsyncAtlas       # Legacy alias
+from layerlens import PublicClient     # Public endpoints
+from layerlens import AsyncPublicClient  # Async public endpoints
+```
+
+## Configuration
+
+| Environment Variable         | Description               | Default                           |
+| ---------------------------- | ------------------------- | --------------------------------- |
+| `LAYERLENS_STRATIX_API_KEY`  | Your API key              | (required)                        |
+| `LAYERLENS_STRATIX_BASE_URL` | Override the API base URL | `https://api.layerlens.ai/api/v1` |
+
+Legacy env vars (`LAYERLENS_ATLAS_API_KEY`, `LAYERLENS_ATLAS_BASE_URL`) are also supported.
+
+## Error handling
+
+The SDK raises typed exceptions for API errors:
+
+```python
+import os
+from layerlens import Stratix, StratixError, APIError, BadRequestError, NotFoundError
+
+client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY"))
+
+try:
+    result = client.models.get_by_id("nonexistent-id")
+except NotFoundError as e:
+    print(f"Not found (HTTP {e.status_code}): {e.message}")
+except BadRequestError as e:
+    print(f"Bad request: {e.message}")
+except APIError as e:
+    print(f"API error: {e.message}")
+except StratixError as e:
+    print(f"Client error: {e}")
 ```
 
-## Next steps
+Catch the most specific exception first. The hierarchy:
+
+- `StratixError` (base for all SDK errors)
+  - `APIError` (base for all API-related errors)
+    - `APIConnectionError` (network issues)
+      - `APITimeoutError` (request timed out)
+    - `APIResponseValidationError` (response didn't match expected schema)
+    - `APIStatusError` (HTTP 4xx/5xx)
+      - `BadRequestError` (400)
+      - `AuthenticationError` (401)
+      - `PermissionDeniedError` (403)
+      - `NotFoundError` (404)
+      - `ConflictError` (409)
+      - `UnprocessableEntityError` (422)
+      - `RateLimitError` (429)
+      - `InternalServerError` (500+)
+
+Note: Only `StratixError`, `APIError`, `BadRequestError`, `AuthenticationError`, and `NotFoundError` are exported from the top-level package. For other exception types, import from `layerlens._exceptions`.
+
+## Requirements
+
+- Python 3.8+
+- Dependencies: `httpx`, `pydantic`, `requests`
+
+## Documentation
+
+Full API reference and examples are available in this directory:
+
+- [API Reference](api-reference/) (client config, all resource methods, error handling)
+- [Code Examples](examples/) (evaluations, judges, traces)
+- [Troubleshooting](troubleshooting/) (auth issues, error codes)
+
+## License
 
-- **[API Reference](api-reference/)** - Complete documentation of all available methods
-- **[Code Examples](examples/)** - Practical examples for common use cases
-- **[Troubleshooting](troubleshooting/)** - Solutions to common issues
+Apache 2.0. See [LICENSE](../LICENSE) for details.
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index a89a4b7..43bc303 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -21,10 +21,10 @@
 
 ## Code Examples
 * [Creating Evaluations](examples/creating-evaluations.md)
-* [Judges and Traces](examples/judges-and-traces.md)
 * [Retrieving Results](examples/retrieving-results.md)
-* [Working with Timeouts](examples/timeouts.md)
-* [Advanced Usage Patterns](examples/advanced-usage.md)
+* [Models and Benchmarks](examples/models-and-benchmarks.md)
+* [Judges and Traces](examples/judges-and-traces.md)
+* [Public API](examples/public-api.md)
 
 ## Troubleshooting
 * [Common Issues](troubleshooting/common-issues.md)
diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md
index 935cbbc..550190c 100644
--- a/docs/api-reference/models-benchmarks.md
+++ b/docs/api-reference/models-benchmarks.md
@@ -132,6 +132,13 @@ Removes models from the project by their IDs.
 
 Returns `bool` - `True` if the operation succeeded, `False` otherwise.
 
+#### Example
+
+```python
+client = Stratix()
+success = client.models.remove("model-id-1", "model-id-2")
+```
+
 ### `create_custom(name, key, description, api_url, max_tokens, api_key=None, timeout=None)`
 
 Creates a custom model backed by an OpenAI-compatible API endpoint. This allows you to evaluate any model accessible via a chat completions endpoint.
@@ -271,6 +278,13 @@ Removes benchmarks from the project by their IDs.
 
 Returns `bool` - `True` if the operation succeeded, `False` otherwise.
 
+#### Example
+
+```python
+client = Stratix()
+success = client.benchmarks.remove("benchmark-id-1", "benchmark-id-2")
+```
+
 ### `create_custom(name, description, file_path, additional_metrics=None, custom_scorer_ids=None, input_type=None, timeout=None)`
 
 Creates a custom benchmark by uploading a JSONL file. The file should contain one JSON object per line with `input` and `truth` fields.
diff --git a/docs/api-reference/traces.md b/docs/api-reference/traces.md
index ce4bfd8..9114fef 100644
--- a/docs/api-reference/traces.md
+++ b/docs/api-reference/traces.md
@@ -168,6 +168,14 @@ Deletes a trace by its unique identifier.
 
 Returns `True` if the trace was deleted, `False` otherwise.
 
+#### Example
+
+```python
+deleted = client.traces.delete("trace-abc123")
+if deleted:
+    print("Trace deleted successfully")
+```
+
 ### `get_sources(timeout=None)`
 
 Retrieves the list of available trace sources for the current project.
diff --git a/docs/examples/README.md b/docs/examples/README.md
index 65afe60..35f1cc7 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -1,2 +1,41 @@
 # Examples
 
+This section provides practical code examples for common SDK use cases. All examples are available as runnable scripts in the [`examples/`](../../examples/) directory.
+
+## Quick Reference
+
+| Example | Description |
+| ------- | ----------- |
+| [`client_simple.py`](../../examples/client_simple.py) | Minimal sync client usage |
+| [`client.py`](../../examples/client.py) | Full sync evaluation workflow |
+| [`async_client_simple.py`](../../examples/async_client_simple.py) | Minimal async client usage |
+| [`async_client.py`](../../examples/async_client.py) | Full async evaluation workflow |
+| [`async_run_evaluations.py`](../../examples/async_run_evaluations.py) | Run multiple evaluations in parallel |
+| [`get_models.py`](../../examples/get_models.py) | Filter models by name, company, region, type |
+| [`get_benchmarks.py`](../../examples/get_benchmarks.py) | Filter benchmarks by name and type |
+| [`get_evaluation.py`](../../examples/get_evaluation.py) | Fetch an evaluation by ID |
+| [`evaluation_sorting.py`](../../examples/evaluation_sorting.py) | Sort and filter evaluations |
+| [`compare_evaluations.py`](../../examples/compare_evaluations.py) | Compare two models on a benchmark |
+| [`paginated_results.py`](../../examples/paginated_results.py) | Paginate through evaluation results |
+| [`all_results_no_pagination.py`](../../examples/all_results_no_pagination.py) | Fetch all results at once |
+| [`fetch_results_async.py`](../../examples/fetch_results_async.py) | Fetch results for multiple evaluations concurrently |
+| [`create_custom_model.py`](../../examples/create_custom_model.py) | Create a custom model with an OpenAI-compatible API |
+| [`create_custom_benchmark.py`](../../examples/create_custom_benchmark.py) | Create a custom benchmark from a JSONL file |
+| [`create_smart_benchmark.py`](../../examples/create_smart_benchmark.py) | Create an AI-generated benchmark from documents |
+| [`manage_project_models_benchmarks.py`](../../examples/manage_project_models_benchmarks.py) | Add/remove models and benchmarks from a project |
+| [`judges.py`](../../examples/judges.py) | Create, list, update, and delete judges |
+| [`traces.py`](../../examples/traces.py) | Upload, list, get, and delete traces |
+| [`trace_evaluations.py`](../../examples/trace_evaluations.py) | Run judges on traces, estimate cost, get results |
+| [`async_judges_and_traces.py`](../../examples/async_judges_and_traces.py) | Async judge and trace evaluation workflow |
+| [`judge_optimizations.py`](../../examples/judge_optimizations.py) | Estimate, run, and apply judge optimizations |
+| [`public_models.py`](../../examples/public_models.py) | Browse, search, and filter public models |
+| [`public_benchmarks.py`](../../examples/public_benchmarks.py) | Browse public benchmarks and download prompts |
+| [`public_evaluations.py`](../../examples/public_evaluations.py) | Get public evaluation details and results |
+
+## Guides
+
+- [Creating Evaluations](creating-evaluations.md) - Sync, async, and parallel evaluations
+- [Retrieving Results](retrieving-results.md) - Paginated, bulk, and concurrent result fetching
+- [Models and Benchmarks](models-and-benchmarks.md) - Filtering, custom models, custom/smart benchmarks, project management
+- [Judges and Traces](judges-and-traces.md) - Judge CRUD, trace uploads, trace evaluations, and optimizations
+- [Public API](public-api.md) - Public models, benchmarks, evaluations, and comparisons
diff --git a/docs/examples/creating-evaluations.md b/docs/examples/creating-evaluations.md
index ffec8cf..47f5adc 100644
--- a/docs/examples/creating-evaluations.md
+++ b/docs/examples/creating-evaluations.md
@@ -1,14 +1,14 @@
 # Creating Evaluations
 
-Examples for creating evaluations on the Stratix platform using the Layerlens python sdk.
+Examples for creating evaluations on the Stratix platform using the LayerLens Python SDK.
 
-> Before running the below examples ensure the model and benchmark being run are present on your organiztion.
+> Before running the below examples ensure the model and benchmark being run are present on your organization.
 
 ## Basic Evaluation
 
 ### Using Synchronous Client
 
-Below is an example showing how to trigger an evaluation, waiting for it to complete and finally fetching the evaluations results.
+> Source: [`examples/client.py`](../../examples/client.py)
 
 ```python
 from layerlens import Stratix
@@ -16,23 +16,20 @@ from layerlens import Stratix
 # Construct sync client (API key from env or inline)
 client = Stratix()
 
-# --- Models replace with the model key you want to run
-model = client.models.get_by_key("openai/gpt-4o")
+# --- Models
+models = client.models.get()
+print(f"Found {len(models)} models")
 
-if not model:
-    print("Model not found")
-
-# --- Benchmarks replace with the benchmark name you want to run
-benchmark = client.benchmarks.get_by_key("aime2024")
-
-if not benchmark:
-    print("benchmark not found")
+# --- Benchmarks
+benchmarks = client.benchmarks.get()
+print(f"Found {len(benchmarks)} benchmarks")
 
 # --- Create evaluation
 evaluation = client.evaluations.create(
-    model=model,
-    benchmark=benchmark,
+    model=models[0],
+    benchmark=benchmarks[0],
 )
+print(f"Created evaluation {evaluation.id}, status={evaluation.status}")
 
 # --- Wait for completion
 evaluation = client.evaluations.wait_for_completion(
@@ -40,17 +37,41 @@ evaluation = client.evaluations.wait_for_completion(
     interval_seconds=10,
     timeout_seconds=600,  # 10 minutes
 )
+print(f"Evaluation {evaluation.id} finished with status={evaluation.status}")
 
 # --- Results
 if evaluation.is_success:
-    # Loads the first page of results
     results = client.results.get(evaluation=evaluation)
     print("Results:", results)
+else:
+    print("Evaluation did not succeed, no results to show.")
+```
+
+### Minimal Sync Example
+
+> Source: [`examples/client_simple.py`](../../examples/client_simple.py)
+
+```python
+from layerlens import Stratix
+
+client = Stratix()
+
+models = client.models.get(type="public", name="gpt-4o")
+model = models[0]
+
+benchmarks = client.benchmarks.get(type="public", name="simpleQA")
+benchmark = benchmarks[0]
 
+evaluation = client.evaluations.create(
+    model=model,
+    benchmark=benchmark,
+)
 ```
 
 ### Using Async Client
 
+> Source: [`examples/async_client_simple.py`](../../examples/async_client_simple.py)
+
 ```python
 import asyncio
 
@@ -58,65 +79,128 @@ from layerlens import AsyncStratix
 
 
 async def main():
-    # Construct async client
     client = AsyncStratix()
 
-    # --- Model to use
-    model = await client.models.get_by_key("openai/gpt-4o")
+    models = await client.models.get()
+    print(f"Found {len(models)} models")
 
-    if not model:
-        print("Model not found")
-        return
+    benchmarks = await client.benchmarks.get()
+    print(f"Found {len(benchmarks)} benchmarks")
 
-    # --- Benchmark to use
-    benchmark = await client.benchmarks.get_by_key("aime2024")
+    evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0])
+    print(f"Created evaluation {evaluation.id}, status={evaluation.status}")
 
-    if not benchmark:
-        print("benchmark not found")
-        return
+    await evaluation.wait_for_completion_async(interval_seconds=10, timeout_seconds=600)
+    print(f"Evaluation {evaluation.id} finished with status={evaluation.status}")
 
+    if evaluation.is_success:
+        results = await evaluation.get_results_async()
+        print("Results:", results)
+    else:
+        print("Evaluation did not succeed, no results to show.")
 
-    # --- Create evaluation
-    evaluation = await client.evaluations.create(model=model, benchmark=benchmark)
 
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 
-    await evaluation.wait_for_completion_async(interval_seconds=10)
+## Sorting and Filtering Evaluations
 
-    # --- Results
-    if evaluation.is_success:
-        results = await evaluation.get_results_async()
+> Source: [`examples/evaluation_sorting.py`](../../examples/evaluation_sorting.py)
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+from layerlens.models import EvaluationStatus
+
+
+async def main():
+    client = AsyncStratix()
+
+    # --- Sort by accuracy (highest first)
+    response = await client.evaluations.get_many(
+        sort_by="accuracy",
+        order="desc",
+        page_size=10,
+    )
+    if response:
+        print(f"Top {len(response.evaluations)} evaluations by accuracy:")
+        for evaluation in response.evaluations:
+            print(f"  - {evaluation.id}: accuracy={evaluation.accuracy:.2f}%")
+
+    # --- Filter by status (only successful)
+    response = await client.evaluations.get_many(
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+    )
+    if response:
+        print(f"Successful evaluations: {response.pagination.total_count}")
+
+    # --- Filter by model or benchmark IDs
+    response = await client.evaluations.get_many(
+        model_ids=["your-model-id"],
+        sort_by="accuracy",
+        order="desc",
+    )
+
+    # --- Combine sorting, filtering, and pagination
+    response = await client.evaluations.get_many(
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+        page=1,
+        page_size=20,
+    )
+    if response:
+        print(f"Page 1: {response.pagination.total_count} total, {response.pagination.total_pages} pages")
 
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-## Error Handling
+## Comparing Evaluations
+
+> Source: [`examples/compare_evaluations.py`](../../examples/compare_evaluations.py)
 
 ```python
-from layerlens import Stratix
-import layerlens
+from layerlens import PublicClient
 
-client = Stratix()
+client = PublicClient()
 
-try:
-    models = client.models.get()
-    benchmarks = client.benchmarks.get()
+# Compare two models on a benchmark
+comparison = client.comparisons.compare_models(
+    benchmark_id="682bddc1e014f9fa440f8a91",
+    model_id_1="699f9761e014f9c3072b0513",
+    model_id_2="699f9761e014f9c3072b0512",
+    page=1,
+    page_size=10,
+)
 
-    evaluation = client.evaluations.create(
-        model=models[0],
-        benchmark=benchmarks[0]
-    )
+if comparison:
+    print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+    print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
 
-except layerlens.AuthenticationError:
-    print("Check your API key")
-except layerlens.NotFoundError:
-    print("Model or benchmark not found")
-except layerlens.APIError as e:
-    print(f"API error: {e}")
+# Filter: where model 1 fails but model 2 succeeds
+comparison = client.comparisons.compare_models(
+    benchmark_id="682bddc1e014f9fa440f8a91",
+    model_id_1="699f9761e014f9c3072b0513",
+    model_id_2="699f9761e014f9c3072b0512",
+    outcome_filter="reference_fails",
+)
+
+# Or compare using evaluation IDs directly
+comparison = client.comparisons.compare(
+    evaluation_id_1="699f9938a03d70bf6607081f",
+    evaluation_id_2="699f991ca782d00ebd666ba1",
+)
 ```
 
-## Triggering Multiple Evaluations
+## Running Multiple Evaluations in Parallel
+
+> Source: [`examples/async_run_evaluations.py`](../../examples/async_run_evaluations.py)
 
 ```python
 import asyncio
@@ -125,81 +209,136 @@ from layerlens import AsyncStratix
 
 
 async def create_and_run_evaluation(client, model, benchmark, eval_number):
-    """Create and run a single evaluation, tracking progress."""
     try:
-        print(f"Starting evaluation #{eval_number}...")
-
-        # Create evaluation
         evaluation = await client.evaluations.create(model=model, benchmark=benchmark)
 
-        # Wait for completion
         evaluation = await client.evaluations.wait_for_completion(
             evaluation,
             interval_seconds=10,
-            timeout_seconds=600,  # 10 minutes
+            timeout_seconds=600,
         )
 
-        # Get results if successful
         if evaluation.is_success:
             results = await client.results.get_all(evaluation=evaluation)
-            return results
+            print(f"Evaluation #{eval_number} completed with {len(results)} results")
+            return eval_number, evaluation.id, len(results), True
         else:
-            return None
+            return eval_number, evaluation.id, 0, False
 
     except Exception as e:
-        print(f"✗ Error in evaluation #{eval_number}: {e}")
+        print(f"Error in evaluation #{eval_number}: {e}")
         return eval_number, None, 0, False
 
 
 async def main():
-    # Construct async client
     client = AsyncStratix()
 
-    # --- Models
     models = await client.models.get()
+    benchmarks = await client.benchmarks.get()
+
+    num_evaluations = 3
+    tasks = [
+        create_and_run_evaluation(client, models[0], benchmarks[0], i + 1)
+        for i in range(num_evaluations)
+    ]
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-    # --- Benchmarks
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Fetching Results
+
+### Paginated Results
+
+> Source: [`examples/paginated_results.py`](../../examples/paginated_results.py)
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def main():
+    client = AsyncStratix()
+
+    models = await client.models.get()
     benchmarks = await client.benchmarks.get()
 
-    # Use first model and benchmark for all evaluations
-    target_model = models[0]
-    target_benchmark = benchmarks[0]
+    evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0])
+    evaluation = await client.evaluations.wait_for_completion(evaluation, interval_seconds=10, timeout_seconds=600)
 
-    print(f"Using model: {target_model}")
-    print(f"Using benchmark: {target_benchmark}")
+    if evaluation.is_success:
+        all_results = []
+        page = 1
+        page_size = 50
 
-    # Create 3 evaluation tasks
-    num_evaluations = 3
-    print(f"Starting {num_evaluations} evaluations in parallel...")
+        while True:
+            results_data = await client.results.get_by_id(
+                evaluation_id=evaluation.id, page=page, page_size=page_size
+            )
+
+            if not results_data or not results_data.results:
+                break
 
-    tasks = [create_and_run_evaluation(client, target_model, target_benchmark, i + 1) for i in range(num_evaluations)]
+            all_results.extend(results_data.results)
 
-    # Execute all evaluations concurrently
-    await asyncio.gather(*tasks, return_exceptions=True)
+            if page >= results_data.pagination.total_pages:
+                break
+            page += 1
+
+        print(f"Total results collected: {len(all_results)}")
 
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
-## Fetching Results of Multiple Evaluations Async
+### All Results Without Pagination
+
+> Source: [`examples/all_results_no_pagination.py`](../../examples/all_results_no_pagination.py)
 
 ```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def main():
+    client = AsyncStratix()
+
+    models = await client.models.get()
+    benchmarks = await client.benchmarks.get()
+
+    evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0])
+    evaluation = await client.evaluations.wait_for_completion(evaluation, interval_seconds=10, timeout_seconds=600)
+
+    # Fetch all results at once
+    results = await client.results.get_all(evaluation=evaluation)
+    print(f"Found {len(results)} results")
+
 
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### Fetch Results for Multiple Evaluations Concurrently
+
+> Source: [`examples/fetch_results_async.py`](../../examples/fetch_results_async.py)
+
+```python
 import asyncio
 
 from layerlens import AsyncStratix
 
 
 async def fetch_evaluation_results(client, evaluation_id):
-    """Fetch results for a single evaluation and print when loaded."""
     try:
-        print(f"Fetching evaluation {evaluation_id}...")
         evaluation = await client.evaluations.get_by_id(evaluation_id)
-        # Get all results for this evaluation
         results = await client.results.get_all(evaluation=evaluation)
         print(f"Loaded {len(results)} results for evaluation {evaluation_id}")
-
         return evaluation_id, results
     except Exception as e:
         print(f"Error fetching evaluation {evaluation_id}: {e}")
@@ -207,26 +346,42 @@ async def fetch_evaluation_results(client, evaluation_id):
 
 
 async def main():
-    # Construct async client
     client = AsyncStratix()
 
-    # List of example evaluation IDs to fetch
-
-    evaluation_ids = ["68a65a3de7ad047fbd8e7d4", "688a54c673f6b2835cc7278"]
+    evaluation_ids = ["68a65a3de7ad047fb5d8e7d4", "688a254c673f6b2835cc7278"]
 
-    print(f"Starting async fetch for {len(evaluation_ids)} evaluations...")
-
-    # Create tasks for concurrent execution
     tasks = [fetch_evaluation_results(client, eval_id) for eval_id in evaluation_ids]
-
-    # Execute all tasks concurrently and print results as they complete
     results = await asyncio.gather(*tasks, return_exceptions=True)
 
-    print("=" * 80)
-    print("Summary:")
     successful = sum(1 for _, result in results if result is not None and not isinstance(result, Exception))
+    print(f"Successfully fetched {successful}/{len(evaluation_ids)} evaluations")
+
 
 if __name__ == "__main__":
     asyncio.run(main())
+```
 
+## Error Handling
+
+```python
+from layerlens import Stratix
+import layerlens
+
+client = Stratix()
+
+try:
+    models = client.models.get()
+    benchmarks = client.benchmarks.get()
+
+    evaluation = client.evaluations.create(
+        model=models[0],
+        benchmark=benchmarks[0],
+    )
+
+except layerlens.AuthenticationError:
+    print("Check your API key")
+except layerlens.NotFoundError:
+    print("Model or benchmark not found")
+except layerlens.APIError as e:
+    print(f"API error: {e}")
 ```
diff --git a/docs/examples/judges-and-traces.md b/docs/examples/judges-and-traces.md
index 8ac976e..9d5e80b 100644
--- a/docs/examples/judges-and-traces.md
+++ b/docs/examples/judges-and-traces.md
@@ -1,245 +1,319 @@
 # Judges and Traces
 
-Examples for working with judges, traces, and trace evaluations on the Stratix platform using the Layerlens Python SDK.
+Examples for working with judges, traces, and trace evaluations on the Stratix platform using the LayerLens Python SDK.
 
 ## Creating and Managing Judges
 
-### Basic Judge CRUD
+> Source: [`examples/judges.py`](../../examples/judges.py)
 
 ```python
+import time
+
 from layerlens import Stratix
 
 client = Stratix()
 
-# Fetch a model to use for the judge
+# Fetch a model to use as the judge's LLM
 models = client.models.get(type="public", name="gpt-4o")
 model = models[0]
+print(f"Using model: {model.name} ({model.id})")
 
-# Create a judge (model_id is required)
+# --- Create a judge
 judge = client.judges.create(
-    name="Code Quality Judge",
+    name=f"Code Quality Judge {int(time.time())}",
     evaluation_goal="Evaluate the quality of code output including correctness, readability, and style",
     model_id=model.id,
 )
-print(f"Created judge: {judge.name} (v{judge.version})")
+print(f"Created judge {judge.id}: {judge.name}")
 
-# Get a judge by ID
+# --- Get a judge by ID
 judge = client.judges.get(judge.id)
+print(f"Judge: {judge.name}, version: {judge.version}")
 
-# List all judges with pagination
-response = client.judges.get_many(page=1, page_size=50)
+# --- List all judges
+response = client.judges.get_many()
+print(f"Found {response.total_count} judges")
 for j in response.judges:
-    print(f"  {j.name}: v{j.version}, {j.run_count} runs")
+    print(f"  - {j.name} (v{j.version}, {j.run_count} runs)")
 
-# Update a judge (creates a new version)
-client.judges.update(
+# --- Update a judge (creates a new version)
+updated = client.judges.update(
     judge.id,
-    evaluation_goal="Evaluate code for correctness, readability, style, and security",
+    name="Updated Code Quality Judge",
+    evaluation_goal="Evaluate code output for correctness, readability, style, and security",
 )
+print(f"Updated judge {updated.id}")
 
-# Delete a judge
-client.judges.delete(judge.id)
+# --- Delete a judge
+deleted = client.judges.delete(judge.id)
+print(f"Deleted judge {deleted.id}")
 ```
 
 ## Uploading and Managing Traces
 
-### Upload Trace Files
+> Source: [`examples/traces.py`](../../examples/traces.py)
 
 ```python
+import os
+
 from layerlens import Stratix
 
 client = Stratix()
 
-# Upload a JSONL file containing multiple traces
-result = client.traces.upload("./traces.jsonl")
+# --- Upload traces from a file
+traces_file = os.path.join(os.path.dirname(__file__), "traces.jsonl")
+result = client.traces.upload(traces_file)
 print(f"Uploaded {len(result.trace_ids)} traces")
 
-# Upload a single JSON trace
-result = client.traces.upload("./single-trace.json")
-```
-
-### Browse and Filter Traces
-
-```python
-from layerlens import Stratix
-
-client = Stratix()
-
-# List all traces
+# --- List traces
 response = client.traces.get_many()
-print(f"Total traces: {response.total_count}")
+print(f"Found {response.total_count} traces")
+for trace in response.traces[:5]:
+    print(f"  - {trace.id}: {trace.filename}")
 
-# Filter by time range and sort
-response = client.traces.get_many(
-    time_range="7d",
+# --- List traces with filters
+filtered = client.traces.get_many(
     sort_by="created_at",
     sort_order="desc",
+    page_size=10,
 )
+print(f"Filtered traces: {filtered.count}")
 
-# Search traces
-response = client.traces.get_many(search="authentication")
+# --- Get a single trace
+trace = client.traces.get(result.trace_ids[0])
+print(f"Trace {trace.id}: {len(trace.data)} data keys")
 
-# Get available sources
+# --- Get available sources
 sources = client.traces.get_sources()
 print(f"Sources: {sources}")
-```
-
-### Get Trace Details
-
-```python
-from layerlens import Stratix
-
-client = Stratix()
 
-trace = client.traces.get("trace-abc123")
-if trace:
-    print(f"Filename: {trace.filename}")
-    print(f"Created: {trace.created_at}")
-    print(f"Data keys: {list(trace.data.keys())}")
+# --- Delete a trace
+deleted = client.traces.delete(trace.id)
+print(f"Deleted: {deleted}")
 ```
 
 ## Running Trace Evaluations
 
-### Estimate Cost Before Running
+> Source: [`examples/trace_evaluations.py`](../../examples/trace_evaluations.py)
 
 ```python
+import time
+
 from layerlens import Stratix
 
 client = Stratix()
 
-# Get trace IDs to evaluate
-traces_response = client.traces.get_many(page_size=10)
+# Fetch a model and create a judge
+models = client.models.get(type="public", name="gpt-4o")
+model = models[0]
+
+judge = client.judges.create(
+    name=f"Trace Eval Demo Judge {int(time.time())}",
+    evaluation_goal="Evaluate whether the response is accurate, complete, and well-structured",
+    model_id=model.id,
+)
+print(f"Created judge {judge.id}: {judge.name}")
+
+# --- Get existing traces to evaluate
+traces_response = client.traces.get_many(page_size=3)
 trace_ids = [t.id for t in traces_response.traces]
+print(f"Found {len(trace_ids)} traces to evaluate")
 
-# Estimate cost
+# --- Estimate cost before running
 estimate = client.trace_evaluations.estimate_cost(
     trace_ids=trace_ids,
-    judge_id="judge-123",
+    judge_id=judge.id,
 )
-print(f"Cost for {estimate.trace_count} traces: ${estimate.estimated_cost:.4f}")
-print(f"Model: {estimate.model}")
-```
-
-### Run a Judge on a Trace
+print(f"Estimated cost: ${estimate.estimated_cost:.4f} for {estimate.trace_count} traces")
 
-```python
-import time
-from layerlens import Stratix
-
-client = Stratix()
-
-# Create an evaluation
+# --- Run a judge on the first trace
 evaluation = client.trace_evaluations.create(
-    trace_id="trace-abc",
-    judge_id="judge-xyz",
+    trace_id=trace_ids[0],
+    judge_id=judge.id,
 )
-print(f"Evaluation {evaluation.id}: {evaluation.status}")
+print(f"Created evaluation {evaluation.id}, status: {evaluation.status}")
 
-# Wait for evaluation to complete (evaluations run asynchronously on the server)
+# --- Wait for evaluation to complete
 for _ in range(30):
     evaluation = client.trace_evaluations.get(evaluation.id)
+    print(f"Evaluation status: {evaluation.status}")
     if evaluation.status.value in ("success", "failure"):
         break
     time.sleep(2)
 
-# Get results (only available after evaluation completes)
+# --- Get evaluation results
 try:
     results_response = client.trace_evaluations.get_results(evaluation.id)
-    if results_response:
+    if results_response and results_response.results:
         for result in results_response.results:
-            print(f"Score: {result.score}, Passed: {result.passed}")
-            print(f"Reasoning: {result.reasoning}")
-            print(f"Latency: {result.latency_ms}ms, Cost: ${result.total_cost:.4f}")
-            for step in result.steps:
-                print(f"  Step {step.step}: {step.reasoning}")
+            print(f"  Score: {result.score}, Passed: {result.passed}")
+            print(f"  Reasoning: {result.reasoning}")
+            if result.steps:
+                for step in result.steps:
+                    print(f"    Step {step.step}: {step.reasoning}")
+    else:
+        print("  No results returned")
 except Exception:
-    print("Results not available yet")
+    print("  No results yet (evaluation may still be in progress)")
+
+# --- List all trace evaluations
+response = client.trace_evaluations.get_many()
+print(f"Found {response.total} trace evaluations")
+
+# --- Clean up
+client.judges.delete(judge.id)
 ```
 
-### Browse Evaluation Results
+## Judge Optimizations
+
+> Source: [`examples/judge_optimizations.py`](../../examples/judge_optimizations.py)
+
+Optimization requires that the judge has at least 10 annotations (trace evaluation results). Run trace evaluations first to build up annotation data.
 
 ```python
+import time
+
+import layerlens
 from layerlens import Stratix
 
 client = Stratix()
 
-# List all evaluations
-response = client.trace_evaluations.get_many()
-print(f"Total evaluations: {response.total}")
+models = client.models.get(type="public", name="gpt-4o")
+model = models[0]
 
-# Filter by judge and outcome
-response = client.trace_evaluations.get_many(
-    judge_id="judge-123",
-    outcome="pass",
-    sort_by="created_at",
-    sort_order="desc",
+judge = client.judges.create(
+    name=f"Optimization Demo Judge {int(time.time())}",
+    evaluation_goal="Evaluate whether the response is accurate, complete, and well-structured",
+    model_id=model.id,
 )
 
-# Filter by trace
-response = client.trace_evaluations.get_many(
-    trace_id="trace-abc",
+# --- Estimate cost
+estimate = client.judge_optimizations.estimate(
+    judge_id=judge.id,
+    budget="medium",
 )
+if estimate:
+    print(f"Estimated cost: ${estimate.estimated_cost:.4f}")
+    print(f"  Annotations: {estimate.annotation_count}, Budget: {estimate.budget}")
+
+# --- Create an optimization run
+try:
+    run = client.judge_optimizations.create(
+        judge_id=judge.id,
+        budget="medium",
+    )
+except layerlens.BadRequestError as e:
+    print(f"Cannot start optimization: {e}")
+    print("Tip: Run trace evaluations with this judge first to build up annotations.")
+    client.judges.delete(judge.id)
+    exit(0)
+
+# --- Poll for completion
+optimization = None
+for i in range(60):
+    optimization = client.judge_optimizations.get(run.id)
+    if not optimization:
+        break
+    print(f"  [{i * 5}s] Status: {optimization.status}")
+    if optimization.status.value in ("success", "failure"):
+        print(f"  Baseline accuracy: {optimization.baseline_accuracy}")
+        print(f"  Optimized accuracy: {optimization.optimized_accuracy}")
+        break
+    time.sleep(5)
+
+# --- List optimization runs
+response = client.judge_optimizations.get_many(judge_id=judge.id)
+if response:
+    print(f"Found {response.total} optimization runs")
+
+# --- Apply optimization results
+if optimization and optimization.status.value == "success":
+    result = client.judge_optimizations.apply(run.id)
+    if result:
+        print(f"Applied optimization: new version v{result.new_version}")
+
+client.judges.delete(judge.id)
 ```
 
-## Async Workflows
+## Async Judges and Traces
 
-### Run Evaluations Concurrently
+> Source: [`examples/async_judges_and_traces.py`](../../examples/async_judges_and_traces.py)
 
 ```python
+import os
+import time
 import asyncio
+
 from layerlens import Stratix, AsyncStratix
 
+
 async def main():
-    # Fetch a model for judge creation
+    # Fetch a model using sync client
     sync_client = Stratix()
     models = sync_client.models.get(type="public", name="gpt-4o")
     model = models[0]
 
     client = AsyncStratix()
 
-    # Create a judge (model_id is required)
+    # --- Create a judge
     judge = await client.judges.create(
-        name="Response Quality Judge",
-        evaluation_goal="Evaluate whether the response is accurate and well-structured",
+        name=f"Response Quality Judge {int(time.time())}",
+        evaluation_goal="Evaluate whether the response is accurate, helpful, and well-structured",
         model_id=model.id,
     )
+    print(f"Created judge {judge.id}: {judge.name}")
 
-    # Upload traces
-    result = await client.traces.upload("./traces.jsonl")
+    # --- Upload traces
+    traces_file = os.path.join(os.path.dirname(__file__), "traces.jsonl")
+    result = await client.traces.upload(traces_file)
     print(f"Uploaded {len(result.trace_ids)} traces")
 
-    # Get traces to evaluate
-    traces_response = await client.traces.get_many(page_size=5)
-    trace_ids = [t.id for t in traces_response.traces]
+    # --- List traces
+    traces_response = await client.traces.get_many(page_size=10)
+    trace_ids = [t.id for t in traces_response.traces[:5]]
+
+    # --- Estimate cost
+    estimate = await client.trace_evaluations.estimate_cost(
+        trace_ids=trace_ids,
+        judge_id=judge.id,
+    )
+    print(f"Estimated cost: ${estimate.estimated_cost:.4f}")
 
-    # Run evaluations concurrently
-    tasks = [
-        client.trace_evaluations.create(trace_id=tid, judge_id=judge.id)
-        for tid in trace_ids
-    ]
+    # --- Run evaluations concurrently
+    tasks = [client.trace_evaluations.create(trace_id=tid, judge_id=judge.id) for tid in trace_ids]
     evaluations = await asyncio.gather(*tasks)
 
     for evaluation in evaluations:
         if evaluation:
-            print(f"Evaluation {evaluation.id}: {evaluation.status}")
+            print(f"  Evaluation {evaluation.id}: {evaluation.status}")
 
-    # Wait for evaluations to complete, then fetch results
+    # --- Wait and fetch results
     await asyncio.sleep(10)
     for evaluation in evaluations:
         if not evaluation:
             continue
         try:
             results_response = await client.trace_evaluations.get_results(evaluation.id)
-            if results_response:
+            if results_response and results_response.results:
                 for result in results_response.results:
-                    print(f"Score: {result.score}, Passed: {result.passed}")
+                    print(f"  Score: {result.score}, Passed: {result.passed}")
+            else:
+                print(f"  Evaluation {evaluation.id}: no results yet")
         except Exception:
-            print(f"Evaluation {evaluation.id}: results not available yet")
+            print(f"  Evaluation {evaluation.id}: results not available yet")
+
+    await client.judges.delete(judge.id)
+
 
 if __name__ == "__main__":
     asyncio.run(main())
 ```
 
+## See Also
+
+- [Models and Benchmarks](models-and-benchmarks.md) - Custom models, custom/smart benchmarks, project management
+- [Public API](public-api.md) - Public models, benchmarks, evaluations, and comparisons
+
 ## Error Handling
 
 ```python
@@ -249,7 +323,6 @@ import layerlens
 client = Stratix()
 
 try:
-    # Fetch a model for the judge
     models = client.models.get(type="public", name="gpt-4o")
     model = models[0]
 
diff --git a/docs/examples/models-and-benchmarks.md b/docs/examples/models-and-benchmarks.md
new file mode 100644
index 0000000..fc21c68
--- /dev/null
+++ b/docs/examples/models-and-benchmarks.md
@@ -0,0 +1,256 @@
+# Models and Benchmarks
+
+Examples for browsing, filtering, creating, and managing models and benchmarks using the LayerLens Python SDK.
+
+## Filtering Models
+
+> Source: [`examples/get_models.py`](../../examples/get_models.py)
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def main():
+    client = AsyncStratix()
+
+    # --- Filter by name
+    model_name = "gpt-4o"
+    models = await client.models.get(name=model_name)
+    print(f"Found {len(models)} models with name {model_name}")
+
+    # --- Filter by company
+    company_names = ["openai", "anthropic"]
+    models = await client.models.get(companies=company_names)
+    print(f"Found {len(models)} models with companies {company_names}")
+
+    # --- Filter by region
+    region_names = ["usa"]
+    models = await client.models.get(regions=region_names)
+    print(f"Found {len(models)} models with regions {region_names}")
+
+    # --- Filter by type
+    model_type = "public"
+    models = await client.models.get(type=model_type)
+    print(f"Found {len(models)} models with type {model_type}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Filtering Benchmarks
+
+> Source: [`examples/get_benchmarks.py`](../../examples/get_benchmarks.py)
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def main():
+    client = AsyncStratix()
+
+    # --- Filter by name
+    benchmark_name = "mmlu"
+    benchmarks = await client.benchmarks.get(name=benchmark_name)
+    print(f"Found {len(benchmarks)} benchmarks with name {benchmark_name}")
+
+    # --- Filter by type
+    benchmark_type = "public"
+    benchmarks = await client.benchmarks.get(type=benchmark_type)
+    print(f"Found {len(benchmarks)} benchmarks with type {benchmark_type}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Creating a Custom Model
+
+> Source: [`examples/create_custom_model.py`](../../examples/create_custom_model.py)
+
+Custom models let you evaluate any model accessible via an OpenAI-compatible chat completions endpoint.
+
+```python
+import os
+
+from layerlens import Stratix
+
+
+def main():
+    client = Stratix()
+
+    result = client.models.create_custom(
+        name="My Custom Model",
+        key="my-org/custom-model-v1",
+        description="Custom fine-tuned model served via vLLM",
+        api_url="https://my-model-endpoint.example.com/v1",
+        api_key=os.environ["MY_PROVIDER_API_KEY"],
+        max_tokens=4096,
+    )
+
+    if result:
+        print(f"Custom model created: {result.model_id}")
+    else:
+        print("Failed to create custom model")
+
+    # Verify the model was added
+    models = client.models.get(type="custom")
+    if models:
+        print(f"\nCustom models in project ({len(models)}):")
+        for m in models:
+            print(f"  - {m.name} (id={m.id}, key={m.key})")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## Creating a Custom Benchmark
+
+> Source: [`examples/create_custom_benchmark.py`](../../examples/create_custom_benchmark.py)
+
+Custom benchmarks are created from JSONL files with `input` and `truth` fields.
+
+```python
+from layerlens import Stratix
+
+
+def main():
+    client = Stratix()
+
+    # Basic custom benchmark
+    result = client.benchmarks.create_custom(
+        name="My Custom Benchmark",
+        description="A simple test benchmark for QA evaluation",
+        file_path="path/to/benchmark.jsonl",
+    )
+
+    if result:
+        print(f"Custom benchmark created: {result.benchmark_id}")
+
+    # With additional metrics and input type
+    result = client.benchmarks.create_custom(
+        name="Advanced Benchmark",
+        description="Benchmark with toxicity and readability scoring",
+        file_path="path/to/benchmark.jsonl",
+        additional_metrics=["toxicity", "readability"],
+        input_type="messages",
+    )
+
+    if result:
+        print(f"Advanced benchmark created: {result.benchmark_id}")
+
+    # Verify
+    benchmarks = client.benchmarks.get(type="custom")
+    if benchmarks:
+        print(f"\nCustom benchmarks in project ({len(benchmarks)}):")
+        for b in benchmarks:
+            print(f"  - {b.name} (id={b.id})")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+### JSONL File Format
+
+Each line should be a JSON object:
+
+```json
+{"input": "What is 2+2?", "truth": "4"}
+{"input": "Capital of France?", "truth": "Paris"}
+```
+
+Optional field: `subset` (for grouping prompts into categories).
+
+## Creating a Smart Benchmark
+
+> Source: [`examples/create_smart_benchmark.py`](../../examples/create_smart_benchmark.py)
+
+Smart benchmarks use AI to automatically generate benchmark prompts from uploaded documents. Supported file types: `.txt`, `.pdf`, `.html`, `.docx`, `.csv`, `.json`, `.jsonl`, `.parquet`.
+
+```python
+from layerlens import Stratix
+
+
+def main():
+    client = Stratix()
+
+    result = client.benchmarks.create_smart(
+        name="Product Knowledge Benchmark",
+        description="Evaluates model knowledge of our product documentation",
+        system_prompt=(
+            "Generate question-answer pairs that test understanding of the "
+            "product features, capabilities, and limitations described in "
+            "the provided documents. Each question should have a clear, "
+            "factual answer derived from the source material."
+        ),
+        file_paths=[
+            "path/to/product_docs.pdf",
+            "path/to/faq.txt",
+        ],
+        metrics=["hallucination"],
+    )
+
+    if result:
+        print(f"Smart benchmark created: {result.benchmark_id}")
+        print("The benchmark is being generated asynchronously.")
+        print("Check the dashboard for progress.")
+    else:
+        print("Failed to create smart benchmark")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## Managing Project Models and Benchmarks
+
+> Source: [`examples/manage_project_models_benchmarks.py`](../../examples/manage_project_models_benchmarks.py)
+
+Add public models and benchmarks to your project, or remove them from it.
+
+```python
+from layerlens import Stratix
+
+
+def main():
+    client = Stratix()
+
+    # --- Add public models to the project
+    success = client.models.add("model-id-1", "model-id-2")
+    print(f"Add models: {'success' if success else 'failed'}")
+
+    # --- Remove a model from the project
+    success = client.models.remove("model-id-1")
+    print(f"Remove model: {'success' if success else 'failed'}")
+
+    # --- Add public benchmarks to the project
+    success = client.benchmarks.add("benchmark-id-1")
+    print(f"Add benchmark: {'success' if success else 'failed'}")
+
+    # --- Remove a benchmark from the project
+    success = client.benchmarks.remove("benchmark-id-1")
+    print(f"Remove benchmark: {'success' if success else 'failed'}")
+
+    # --- List current models and benchmarks
+    models = client.models.get()
+    if models:
+        print(f"\nModels in project ({len(models)}):")
+        for m in models:
+            print(f"  - {m.name} (id={m.id})")
+
+    benchmarks = client.benchmarks.get()
+    if benchmarks:
+        print(f"\nBenchmarks in project ({len(benchmarks)}):")
+        for b in benchmarks:
+            print(f"  - {b.name} (id={b.id})")
+
+
+if __name__ == "__main__":
+    main()
+```
diff --git a/docs/examples/public-api.md b/docs/examples/public-api.md
new file mode 100644
index 0000000..5ace255
--- /dev/null
+++ b/docs/examples/public-api.md
@@ -0,0 +1,279 @@
+# Public API
+
+Examples for browsing public models, benchmarks, evaluations, and comparing results using the LayerLens Python SDK.
+
+The public API is accessed through `client.public` on a `Stratix` client, or by instantiating `PublicClient` directly. An API key is still required.
+
+```python
+from layerlens import Stratix, PublicClient
+
+# Via the main client
+client = Stratix()
+public = client.public
+
+# Or directly
+public = PublicClient()
+```
+
+## Public Models
+
+> Source: [`examples/public_models.py`](../../examples/public_models.py)
+
+```python
+from layerlens import PublicClient
+
+
+def main():
+    client = PublicClient()
+
+    # --- Browse all public models (first page)
+    response = client.models.get(page=1, page_size=10)
+    print(f"Found {response.total_count} public models (showing first {len(response.models)})")
+    for model in response.models:
+        print(f"  - {model.name} ({model.company})")
+
+    # --- Search models by query
+    response = client.models.get(query="gpt")
+    print(f"\nFound {response.total_count} models matching 'gpt'")
+    for model in response.models:
+        print(f"  - {model.name}")
+
+    # --- Filter by company
+    companies = ["OpenAI", "Anthropic"]
+    response = client.models.get(companies=companies)
+    print(f"\nFound {response.total_count} models from {companies}")
+    for model in response.models:
+        print(f"  - {model.name} ({model.company})")
+
+    # --- Filter by region
+    response = client.models.get(regions=["usa"])
+    print(f"\nFound {response.total_count} models in region 'usa'")
+
+    # --- Filter by category
+    response = client.models.get(categories=["open-source"])
+    print(f"\nFound {response.total_count} open-source models")
+
+    # --- Sort by release date (newest first)
+    response = client.models.get(sort_by="releasedAt", order="desc", page_size=5)
+    print(f"\nNewest 5 models:")
+    for model in response.models:
+        print(f"  - {model.name} (released_at={model.released_at})")
+
+    # --- Include deprecated models
+    response = client.models.get(include_deprecated=True)
+    print(f"\nTotal models (including deprecated): {response.total_count}")
+
+    # --- Discover available filter values
+    response = client.models.get(page=1, page_size=1)
+    print(f"\nAvailable filter values:")
+    print(f"  Categories: {response.categories}")
+    print(f"  Companies:  {response.companies}")
+    print(f"  Regions:    {response.regions}")
+    print(f"  Licenses:   {response.licenses}")
+    print(f"  Sizes:      {response.sizes}")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## Public Benchmarks
+
+> Source: [`examples/public_benchmarks.py`](../../examples/public_benchmarks.py)
+
+```python
+from layerlens import PublicClient
+
+
+def main():
+    client = PublicClient()
+
+    # --- Browse all public benchmarks
+    response = client.benchmarks.get(page=1, page_size=10)
+    print(f"Found {response.total_count} public benchmarks (showing first {len(response.datasets)})")
+    for benchmark in response.datasets:
+        print(f"  - {benchmark.name} (prompts={benchmark.prompt_count}, language={benchmark.language})")
+
+    # --- Filter by language
+    response = client.benchmarks.get(languages=["English"])
+    print(f"\nFound {response.total_count} English benchmarks")
+
+    # --- Discover available filter values
+    if response.categories:
+        print(f"\nAvailable categories: {response.categories}")
+    if response.languages:
+        print(f"Available languages: {response.languages}")
+
+    # --- Search by name
+    response = client.benchmarks.get(query="mmlu")
+    print(f"\nFound {response.total_count} benchmarks matching 'mmlu'")
+    for benchmark in response.datasets:
+        print(f"  - {benchmark.name}: {benchmark.description[:80] if benchmark.description else 'N/A'}...")
+
+    # --- Get benchmark prompts (paginated)
+    if response.datasets:
+        benchmark = response.datasets[0]
+        print(f"\nFetching prompts for '{benchmark.name}' (id={benchmark.id})...")
+
+        prompts_response = client.benchmarks.get_prompts(
+            benchmark.id,
+            page=1,
+            page_size=5,
+        )
+
+        if prompts_response:
+            print(f"Total prompts: {prompts_response.data.count}")
+            print(f"Showing first {len(prompts_response.data.prompts)} prompts:")
+            for prompt in prompts_response.data.prompts:
+                input_preview = str(prompt.input)[:80]
+                truth_preview = prompt.truth[:50] if prompt.truth else "N/A"
+                print(f"  - Input: {input_preview}...")
+                print(f"    Truth: {truth_preview}")
+
+    # --- Get all prompts (auto-paginates)
+    if response.datasets:
+        benchmark = response.datasets[0]
+        print(f"\nFetching ALL prompts for '{benchmark.name}'...")
+        all_prompts = client.benchmarks.get_all_prompts(benchmark.id)
+        print(f"Retrieved {len(all_prompts)} total prompts")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## Public Evaluations
+
+> Source: [`examples/public_evaluations.py`](../../examples/public_evaluations.py)
+
+```python
+from layerlens import PublicClient
+from layerlens.models import EvaluationStatus
+
+
+def main():
+    client = PublicClient()
+
+    # --- Get a specific evaluation by ID
+    evaluation_id = "699f1426c1212b2d9c78e947"
+    evaluation = client.evaluations.get_by_id(evaluation_id)
+    if evaluation:
+        print(f"Evaluation: {evaluation.id}")
+        print(f"  Model: {evaluation.model_name} ({evaluation.model_company})")
+        print(f"  Benchmark: {evaluation.benchmark_name}")
+        print(f"  Status: {evaluation.status.value}")
+        print(f"  Accuracy: {evaluation.accuracy:.2f}%")
+
+        if evaluation.summary:
+            print(f"  Summary: {evaluation.summary.name}")
+            print(f"  Goal: {evaluation.summary.goal}")
+            if evaluation.summary.metrics:
+                print(f"  Metrics: {', '.join(m.name for m in evaluation.summary.metrics)}")
+            if evaluation.summary.performance_details:
+                print(f"  Strengths: {evaluation.summary.performance_details.strengths}")
+            if evaluation.summary.analysis_summary:
+                print(f"  Key takeaways: {evaluation.summary.analysis_summary.key_takeaways}")
+    else:
+        print(f"Evaluation {evaluation_id} not found")
+
+    # --- List latest evaluations
+    response = client.evaluations.get_many(
+        page=1,
+        page_size=5,
+        sort_by="submittedAt",
+        order="desc",
+    )
+    if response:
+        print(f"\nLatest evaluations ({response.pagination.total_count} total):")
+        for e in response.evaluations:
+            print(f"  - {e.id}: {e.model_name} on {e.benchmark_name} -> {e.accuracy:.2f}% ({e.status.value})")
+
+    # --- Filter by status (only successful)
+    response = client.evaluations.get_many(
+        status=EvaluationStatus.SUCCESS,
+        sort_by="accuracy",
+        order="desc",
+        page_size=5,
+    )
+    if response:
+        print(f"\nTop successful evaluations ({response.pagination.total_count} total):")
+        for e in response.evaluations:
+            print(f"  - {e.model_name}: {e.accuracy:.2f}%")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## Comparing Evaluations
+
+> Source: [`examples/compare_evaluations.py`](../../examples/compare_evaluations.py)
+
+Compare how two models perform on the same benchmark, prompt by prompt.
+
+```python
+from layerlens import PublicClient
+
+
+def main():
+    client = PublicClient()
+
+    # --- Compare two models on a benchmark
+    # The SDK automatically finds the most recent successful evaluation for each model.
+    benchmark_id = "682bddc1e014f9fa440f8a91"  # AIME 2025
+    model_id_1 = "699f9761e014f9c3072b0513"    # Qwen3.5 27B
+    model_id_2 = "699f9761e014f9c3072b0512"    # Qwen3.5 122B A10B
+
+    print(f"Comparing models on benchmark {benchmark_id}...")
+    comparison = client.comparisons.compare_models(
+        benchmark_id=benchmark_id,
+        model_id_1=model_id_1,
+        model_id_2=model_id_2,
+        page=1,
+        page_size=10,
+    )
+
+    if comparison:
+        print(f"\n=== Comparison Summary ===")
+        print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+        print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
+        print(f"Total compared: {comparison.total_count}")
+
+        if comparison.results:
+            print(f"\nFirst {len(comparison.results)} results:")
+            for result in comparison.results:
+                s1 = "Y" if result.score1 and result.score1 > 0.5 else "N"
+                s2 = "Y" if result.score2 and result.score2 > 0.5 else "N"
+                print(f"  Prompt: {result.prompt[:80]}...")
+                print(f"    Model 1: {s1} (score={result.score1})")
+                print(f"    Model 2: {s2} (score={result.score2})")
+
+    # --- Filter: where model 1 fails but model 2 succeeds
+    comparison = client.comparisons.compare_models(
+        benchmark_id=benchmark_id,
+        model_id_1=model_id_1,
+        model_id_2=model_id_2,
+        outcome_filter="reference_fails",
+    )
+
+    if comparison:
+        print(f"\n=== Where Model 1 Fails but Model 2 Succeeds ===")
+        print(f"Found {comparison.total_count} such cases")
+
+    # --- Compare using evaluation IDs directly
+    comparison = client.comparisons.compare(
+        evaluation_id_1="699f9938a03d70bf6607081f",
+        evaluation_id_2="699f991ca782d00ebd666ba1",
+        page=1,
+        page_size=5,
+    )
+
+    if comparison:
+        print(f"\n=== Direct Comparison by Evaluation IDs ===")
+        print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct")
+        print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct")
+
+
+if __name__ == "__main__":
+    main()
+```
diff --git a/docs/examples/retrieving-results.md b/docs/examples/retrieving-results.md
new file mode 100644
index 0000000..d54da79
--- /dev/null
+++ b/docs/examples/retrieving-results.md
@@ -0,0 +1,194 @@
+# Retrieving Results
+
+Examples for fetching evaluation results using the LayerLens Python SDK, including pagination, bulk fetching, and concurrent retrieval.
+
+## Paginated Results
+
+> Source: [`examples/paginated_results.py`](../../examples/paginated_results.py)
+
+Walk through results page by page with full control over page size.
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def main():
+    client = AsyncStratix()
+
+    models = await client.models.get()
+    benchmarks = await client.benchmarks.get()
+
+    evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0])
+    evaluation = await client.evaluations.wait_for_completion(
+        evaluation, interval_seconds=10, timeout_seconds=600
+    )
+
+    if evaluation.is_success:
+        print("Fetching all results with pagination...")
+
+        all_results = []
+        page = 1
+        page_size = 50
+
+        while True:
+            print(f"Fetching page {page} (page size: {page_size})...")
+
+            results_data = await client.results.get_by_id(
+                evaluation_id=evaluation.id, page=page, page_size=page_size
+            )
+
+            if not results_data or not results_data.results:
+                print("No more results to fetch")
+                break
+
+            all_results.extend(results_data.results)
+
+            if page == 1:
+                total_count = results_data.pagination.total_count
+                total_pages = results_data.pagination.total_pages
+                print(f"Total results: {total_count:,}")
+                print(f"Total pages: {total_pages}")
+
+            print(f"Page {page}: Retrieved {len(results_data.results)} results")
+            print(f"Running total: {len(all_results):,} results")
+
+            if page >= results_data.pagination.total_pages:
+                print("Reached last page")
+                break
+
+            page += 1
+
+        print(f"\nTotal results collected: {len(all_results):,}")
+
+        if all_results:
+            correct_answers = sum(1 for r in all_results if r.score > 0.5)
+            accuracy = correct_answers / len(all_results)
+            avg_score = sum(r.score for r in all_results) / len(all_results)
+
+            print(f"Overall accuracy: {accuracy:.1%} ({correct_answers:,}/{len(all_results):,})")
+            print(f"Average score: {avg_score:.3f}")
+
+            print(f"\nFirst 3 results:")
+            for i, result in enumerate(all_results[:3], 1):
+                print(f"  {i}. Score: {result.score:.3f}, Subset: {result.subset}")
+                print(f"     Prompt: {result.prompt[:100]}...")
+                print(f"     Response: {result.result[:100]}...")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## All Results Without Pagination
+
+> Source: [`examples/all_results_no_pagination.py`](../../examples/all_results_no_pagination.py)
+
+Use `get_all()` to fetch every result in a single call. This is simpler than paginating manually, but it loads all results into memory at once.
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def main():
+    client = AsyncStratix()
+
+    models = await client.models.get()
+    benchmarks = await client.benchmarks.get()
+
+    evaluation = await client.evaluations.create(
+        model=models[0],
+        benchmark=benchmarks[0],
+    )
+
+    evaluation = await client.evaluations.wait_for_completion(
+        evaluation,
+        interval_seconds=10,
+        timeout_seconds=600,
+    )
+
+    # Fetch all results at once
+    results = await client.results.get_all(evaluation=evaluation)
+    print(f"Found {len(results)} results")
+    print(results)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Fetch Results for Multiple Evaluations Concurrently
+
+> Source: [`examples/fetch_results_async.py`](../../examples/fetch_results_async.py)
+
+Use `asyncio.gather` to load results for several evaluations concurrently.
+
+```python
+import asyncio
+
+from layerlens import AsyncStratix
+
+
+async def fetch_evaluation_results(client, evaluation_id):
+    """Fetch results for a single evaluation and print when loaded."""
+    try:
+        print(f"Fetching evaluation {evaluation_id}...")
+        evaluation = await client.evaluations.get_by_id(evaluation_id)
+        print(f"Found evaluation {evaluation.id}, status={evaluation.status}")
+
+        results = await client.results.get_all(evaluation=evaluation)
+        print(f"Loaded {len(results)} results for evaluation {evaluation_id}")
+        print(f"Results for {evaluation_id}: {results}")
+        print("-" * 80)
+
+        return evaluation_id, results
+    except Exception as e:
+        print(f"Error fetching evaluation {evaluation_id}: {e}")
+        return evaluation_id, None
+
+
+async def main():
+    client = AsyncStratix()
+
+    # Replace with your own evaluation IDs
+    evaluation_ids = ["68a65a3de7ad047fb5d8e7d4", "688a254c673f6b2835cc7278"]
+
+    print(f"Starting async fetch for {len(evaluation_ids)} evaluations...")
+    print("=" * 80)
+
+    tasks = [fetch_evaluation_results(client, eval_id) for eval_id in evaluation_ids]
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    print("=" * 80)
+    print("Summary:")
+    successful = sum(1 for _, result in results if result is not None and not isinstance(result, Exception))
+    print(f"Successfully fetched results for {successful}/{len(evaluation_ids)} evaluations")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Using the Evaluation Object Helpers
+
+Results can also be fetched directly from an `Evaluation` object when a client is attached (in the snippet below, `evaluation` is an `Evaluation` you have already retrieved):
+
+```python
+from layerlens import Stratix
+
+client = Stratix()
+
+# Get results via the client
+results_response = client.results.get(evaluation=evaluation, page=1, page_size=50)
+
+# Or via the evaluation object (client must be attached)
+results_response = evaluation.get_results(page=1, page_size=50)
+all_results = evaluation.get_all_results()
+
+# Async equivalents (these must be awaited from inside an async function)
+results_response = await evaluation.get_results_async(page=1, page_size=50)
+all_results = await evaluation.get_all_results_async()
+```
diff --git a/examples/all_results_no_pagination.py b/examples/all_results_no_pagination.py
index 173828a..ea0390b 100644
--- a/examples/all_results_no_pagination.py
+++ b/examples/all_results_no_pagination.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/async_client.py b/examples/async_client.py
index c64c2de..596878f 100644
--- a/examples/async_client.py
+++ b/examples/async_client.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/async_client_simple.py b/examples/async_client_simple.py
index 1c4623e..8ebfafc 100644
--- a/examples/async_client_simple.py
+++ b/examples/async_client_simple.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/async_judges_and_traces.py b/examples/async_judges_and_traces.py
index de78e0e..90ca657 100644
--- a/examples/async_judges_and_traces.py
+++ b/examples/async_judges_and_traces.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import os
 import time
diff --git a/examples/async_run_evaluations.py b/examples/async_run_evaluations.py
index 3764e17..8469f76 100644
--- a/examples/async_run_evaluations.py
+++ b/examples/async_run_evaluations.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/client.py b/examples/client.py
index 8047cea..eead9b1 100644
--- a/examples/client.py
+++ b/examples/client.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import Stratix
 
diff --git a/examples/client_simple.py b/examples/client_simple.py
index 7f1c7c6..52dad3b 100644
--- a/examples/client_simple.py
+++ b/examples/client_simple.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import Stratix
 
diff --git a/examples/compare_evaluations.py b/examples/compare_evaluations.py
index 2293e8d..1f71704 100644
--- a/examples/compare_evaluations.py
+++ b/examples/compare_evaluations.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import PublicClient
 
diff --git a/examples/create_custom_benchmark.py b/examples/create_custom_benchmark.py
index 4b263de..d7f1aba 100644
--- a/examples/create_custom_benchmark.py
+++ b/examples/create_custom_benchmark.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import Stratix
 
diff --git a/examples/create_custom_model.py b/examples/create_custom_model.py
index 8d375f6..6325922 100644
--- a/examples/create_custom_model.py
+++ b/examples/create_custom_model.py
@@ -1,4 +1,6 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
+
+import os
 
 from layerlens import Stratix
 
@@ -20,7 +22,7 @@ def main():
         key="my-org/custom-model-v1",
         description="Custom fine-tuned model served via vLLM",
         api_url="https://my-model-endpoint.example.com/v1",
-        api_key="my-provider-api-key",
+        api_key=os.environ["MY_PROVIDER_API_KEY"],
         max_tokens=4096,
     )
 
diff --git a/examples/create_smart_benchmark.py b/examples/create_smart_benchmark.py
index 9c628d3..af16a7f 100644
--- a/examples/create_smart_benchmark.py
+++ b/examples/create_smart_benchmark.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import Stratix
 
diff --git a/examples/evaluation_sorting.py b/examples/evaluation_sorting.py
index cb1906f..ff48e44 100644
--- a/examples/evaluation_sorting.py
+++ b/examples/evaluation_sorting.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/fetch_results_async.py b/examples/fetch_results_async.py
index d9ef929..91c9350 100644
--- a/examples/fetch_results_async.py
+++ b/examples/fetch_results_async.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/get_benchmarks.py b/examples/get_benchmarks.py
index 169ffa9..6699e8e 100644
--- a/examples/get_benchmarks.py
+++ b/examples/get_benchmarks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/get_evaluation.py b/examples/get_evaluation.py
index a6d8fe6..2f99379 100644
--- a/examples/get_evaluation.py
+++ b/examples/get_evaluation.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/get_models.py b/examples/get_models.py
index c8d14de..c1fb9bb 100644
--- a/examples/get_models.py
+++ b/examples/get_models.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/judge_optimizations.py b/examples/judge_optimizations.py
index 9681ec9..2cb59ff 100644
--- a/examples/judge_optimizations.py
+++ b/examples/judge_optimizations.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 """
 Judge Optimizations example.
diff --git a/examples/judges.py b/examples/judges.py
index b6a3e3e..ab3e940 100644
--- a/examples/judges.py
+++ b/examples/judges.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import time
 
diff --git a/examples/manage_project_models_benchmarks.py b/examples/manage_project_models_benchmarks.py
index 0067051..07bbab9 100644
--- a/examples/manage_project_models_benchmarks.py
+++ b/examples/manage_project_models_benchmarks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import Stratix
 
diff --git a/examples/paginated_results.py b/examples/paginated_results.py
index 2f135e7..3e69cf7 100644
--- a/examples/paginated_results.py
+++ b/examples/paginated_results.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import asyncio
 
diff --git a/examples/public_benchmarks.py b/examples/public_benchmarks.py
index 491c51b..ecfd23e 100644
--- a/examples/public_benchmarks.py
+++ b/examples/public_benchmarks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import PublicClient
 
diff --git a/examples/public_evaluations.py b/examples/public_evaluations.py
index a8eb588..824063a 100644
--- a/examples/public_evaluations.py
+++ b/examples/public_evaluations.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import PublicClient
 from layerlens.models import EvaluationStatus
diff --git a/examples/public_models.py b/examples/public_models.py
index be22040..122ba2d 100644
--- a/examples/public_models.py
+++ b/examples/public_models.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 from layerlens import PublicClient
 
diff --git a/examples/trace_evaluations.py b/examples/trace_evaluations.py
index e956598..5100d72 100644
--- a/examples/trace_evaluations.py
+++ b/examples/trace_evaluations.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import time
 
diff --git a/examples/traces.py b/examples/traces.py
index af99970..af03792 100644
--- a/examples/traces.py
+++ b/examples/traces.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env -S poetry run python
+#!/usr/bin/env python3
 
 import os
 
diff --git a/pyproject.toml b/pyproject.toml
index f5efccc..783372f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ dynamic = ["version"]
 description = "The official Python library for the LayerLens Stratix API"
 license = "Apache-2.0"
 authors = [{ name = "LayerLens", email = "support@layerlens.ai" }]
-dependencies = ["httpx>=0.23.0, <1", "pydantic>=1.9.0, <3", "requests"]
+dependencies = ["httpx>=0.23.0, <1", "pydantic>=1.9.0, <3"]
 requires-python = ">= 3.8"
 classifiers = [
   "Typing :: Typed",
@@ -47,7 +47,6 @@ dev-dependencies = [
   "pyright==1.1.399",
   "pytest-cov>=6.2.1",
   "ruff",
-  "types-requests",
   "build",
   "twine==6.1.0",
 ]
@@ -133,6 +132,7 @@ known-first-party = ["openai", "tests"]
 "scripts/**.py" = ["T201", "T203"]
 "tests/**.py" = ["T201", "T203"]
 "examples/**.py" = ["T201", "T203"]
+"src/layerlens/cli.py" = ["T201", "T203"]
 
 [tool.pyright]
 include = ["src", "tests"]
diff --git a/src/layerlens/_base_client.py b/src/layerlens/_base_client.py
index 6fbf3d1..8cd7650 100644
--- a/src/layerlens/_base_client.py
+++ b/src/layerlens/_base_client.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import time
 import logging
 from typing import Any, Dict, Type, Union, TypeVar, Optional
 
@@ -12,6 +13,11 @@
 
 ResponseT = TypeVar("ResponseT")
 
+MAX_RETRIES = 2
+RETRY_STATUS_CODES = {429, 500, 502, 503, 504}
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+
 
 log: logging.Logger = logging.getLogger(__name__)
 log.addFilter(SensitiveHeadersFilter())
@@ -52,27 +58,45 @@ def _request_cast(
         **kwargs: Any,
     ) -> Union[ResponseT, httpx.Response]:
         combined_headers = {**self.default_headers, **(headers or {})}
-
-        response = super().request(
-            method=method,
-            url=url,
-            json=body,
-            params=params,
-            headers=combined_headers,
-            **kwargs,
-        )
-
-        try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as err:
-            log.debug("Encountered httpx.HTTPStatusError", exc_info=True)
-            log.debug("Re-raising status error")
-            raise self._make_status_error_from_response(err.response) from None
-
-        if cast_to:
-            data = response.json()
-            return cast_to(**data)
-        return response
+        retries_left = MAX_RETRIES
+        delay = INITIAL_RETRY_DELAY
+
+        while True:
+            response = super().request(
+                method=method,
+                url=url,
+                json=body,
+                params=params,
+                headers=combined_headers,
+                **kwargs,
+            )
+
+            if response.status_code in RETRY_STATUS_CODES and retries_left > 0:
+                retry_after = response.headers.get("retry-after")
+                sleep_time = float(retry_after) if retry_after else delay
+                sleep_time = min(sleep_time, MAX_RETRY_DELAY)
+                log.debug(
+                    "Retrying request after %.1fs (status %d, %d retries left)",
+                    sleep_time,
+                    response.status_code,
+                    retries_left,
+                )
+                time.sleep(sleep_time)
+                delay = min(delay * 2, MAX_RETRY_DELAY)
+                retries_left -= 1
+                continue
+
+            try:
+                response.raise_for_status()
+            except httpx.HTTPStatusError as err:
+                log.debug("Encountered httpx.HTTPStatusError", exc_info=True)
+                log.debug("Re-raising status error")
+                raise self._make_status_error_from_response(err.response) from None
+
+            if cast_to:
+                data = response.json()
+                return cast_to(**data)
+            return response
 
     def get_cast(
         self,
@@ -177,28 +201,48 @@ async def _request_cast(
         headers: Optional[Dict[str, str]] = None,
         **kwargs: Any,
     ) -> Union[ResponseT, httpx.Response]:
-        combined_headers = {**self.default_headers, **(headers or {})}
-
-        response = await super().request(
-            method=method,
-            url=url,
-            json=body,
-            params=params,
-            headers=combined_headers,
-            **kwargs,
-        )
+        import asyncio
 
-        try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as err:
-            log.debug("Encountered httpx.HTTPStatusError", exc_info=True)
-            log.debug("Re-raising status error")
-            raise self._make_status_error_from_response(err.response) from None
-
-        if cast_to:
-            data = response.json()
-            return cast_to(**data)
-        return response
+        combined_headers = {**self.default_headers, **(headers or {})}
+        retries_left = MAX_RETRIES
+        delay = INITIAL_RETRY_DELAY
+
+        while True:
+            response = await super().request(
+                method=method,
+                url=url,
+                json=body,
+                params=params,
+                headers=combined_headers,
+                **kwargs,
+            )
+
+            if response.status_code in RETRY_STATUS_CODES and retries_left > 0:
+                retry_after = response.headers.get("retry-after")
+                sleep_time = float(retry_after) if retry_after else delay
+                sleep_time = min(sleep_time, MAX_RETRY_DELAY)
+                log.debug(
+                    "Retrying request after %.1fs (status %d, %d retries left)",
+                    sleep_time,
+                    response.status_code,
+                    retries_left,
+                )
+                await asyncio.sleep(sleep_time)
+                delay = min(delay * 2, MAX_RETRY_DELAY)
+                retries_left -= 1
+                continue
+
+            try:
+                response.raise_for_status()
+            except httpx.HTTPStatusError as err:
+                log.debug("Encountered httpx.HTTPStatusError", exc_info=True)
+                log.debug("Re-raising status error")
+                raise self._make_status_error_from_response(err.response) from None
+
+            if cast_to:
+                data = response.json()
+                return cast_to(**data)
+            return response
 
     async def get_cast(
         self,
diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py
index e146f29..e7688e0 100644
--- a/src/layerlens/_client.py
+++ b/src/layerlens/_client.py
@@ -7,12 +7,11 @@
 from typing_extensions import Self, override
 
 import httpx
-import requests
 
 from . import _exceptions
 from ._utils import is_mapping
 from .models import Organization, OrganizationResponse
-from ._constants import DEFAULT_TIMEOUT
+from ._constants import DEFAULT_TIMEOUT, DEFAULT_BASE_URL
 from ._exceptions import StratixError, APIStatusError
 from ._base_client import BaseClient, BaseAsyncClient
 
@@ -59,7 +58,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
         if base_url is None:
-            base_url = "https://api.layerlens.ai/api/v1"
+            base_url = DEFAULT_BASE_URL
 
         super().__init__(
             base_url=base_url,
@@ -231,7 +230,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
         if base_url is None:
-            base_url = "https://api.layerlens.ai/api/v1"
+            base_url = DEFAULT_BASE_URL
 
         super().__init__(base_url=base_url, timeout=timeout)
 
@@ -354,8 +353,9 @@ def _make_status_error(
     def _get_organization(self) -> Optional[Organization]:
         url = f"{self.base_url}organizations"
 
-        response = requests.get(url, headers=self.default_headers, timeout=30)
-        response.raise_for_status()
+        with httpx.Client(timeout=30) as http:
+            response = http.get(url, headers=self.default_headers)
+            response.raise_for_status()
 
         data = response.json()
 
diff --git a/src/layerlens/_constants.py b/src/layerlens/_constants.py
index 625df7c..2440945 100644
--- a/src/layerlens/_constants.py
+++ b/src/layerlens/_constants.py
@@ -2,3 +2,5 @@
 
 # default timeout is 10 minutes
 DEFAULT_TIMEOUT = httpx.Timeout(timeout=600, connect=5.0)
+
+DEFAULT_BASE_URL = "https://api.layerlens.ai/api/v1"  # client fallback when base_url is None after env lookup
diff --git a/src/layerlens/_public_client.py b/src/layerlens/_public_client.py
index fb04979..057793e 100644
--- a/src/layerlens/_public_client.py
+++ b/src/layerlens/_public_client.py
@@ -10,7 +10,7 @@
 
 from . import _exceptions
 from ._utils import is_mapping
-from ._constants import DEFAULT_TIMEOUT
+from ._constants import DEFAULT_TIMEOUT, DEFAULT_BASE_URL
 from ._exceptions import APIStatusError
 from ._base_client import BaseClient, BaseAsyncClient
 
@@ -75,7 +75,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
         if base_url is None:
-            base_url = "https://api.layerlens.ai/api/v1"
+            base_url = DEFAULT_BASE_URL
 
         super().__init__(base_url=base_url, timeout=timeout)
 
@@ -159,7 +159,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
         if base_url is None:
-            base_url = "https://api.layerlens.ai/api/v1"
+            base_url = DEFAULT_BASE_URL
 
         super().__init__(base_url=base_url, timeout=timeout)
 
diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py
index df29efa..41607c3 100644
--- a/src/layerlens/_version.py
+++ b/src/layerlens/_version.py
@@ -1,4 +1,4 @@
-__version__ = "1.3.0"
+__version__ = "1.3.1"
 
 # Will be templated during the build
 __git_commit__ = "__GIT_COMMIT__"
diff --git a/src/layerlens/cli.py b/src/layerlens/cli.py
new file mode 100644
index 0000000..5c899c7
--- /dev/null
+++ b/src/layerlens/cli.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+import sys
+
+from ._version import __version__
+
+
+def main() -> None:
+    """Print the version; unless a version flag was passed, also print the docs link and usage."""
+    first_arg = sys.argv[1] if len(sys.argv) > 1 else None
+    print(f"layerlens {__version__}")
+    if first_arg in ("--version", "-v"):
+        sys.exit(0)
+    print("See https://layerlens.gitbook.io/stratix-python-sdk for documentation.")
+    print("\nUsage:")
+    print("  layerlens --version   Show version")