diff --git a/README.md b/README.md
index 2b9871a..536e80b 100644
--- a/README.md
+++ b/README.md
@@ -248,15 +248,62 @@ Catch the most specific exception first. The hierarchy:
 
 Note: Only `StratixError`, `APIError`, `BadRequestError`, `AuthenticationError`, and `NotFoundError` are exported from the top-level package. For other exception types, import from `layerlens._exceptions`.
 
+## CLI
+
+The LayerLens CLI lets you manage traces, judges, evaluations, integrations, and more from the terminal.
+
+### Install
+
+```bash
+pip install "layerlens[cli]" --extra-index-url https://sdk.layerlens.ai/package
+```
+
+### Configure
+
+```bash
+export LAYERLENS_STRATIX_API_KEY="your-api-key"
+```
+
+### Usage
+
+```bash
+stratix --help                   # Show all commands
+stratix trace list               # List traces
+stratix evaluate run \
+  --model openai/gpt-4o \
+  --benchmark arc-agi-2 --wait     # Run an evaluation and wait for results
+stratix judge create \
+  --name "Quality" \
+  --goal "Rate response quality" \
+  --model-id <MODEL_ID>            # Create a judge
+stratix ci report -o summary.md  # Generate CI report
+```
+
+Shell completions are available for bash, zsh, fish, and powershell:
+
+```bash
+stratix completion bash          # Print setup instructions
+```
+
+Full CLI docs: [docs/cli/](docs/cli/)
+
+| Guide | Description |
+| --- | --- |
+| [Getting Started](docs/cli/getting-started.md) | Installation, configuration, first commands |
+| [Command Reference](docs/cli/commands.md) | All commands and options |
+| [Examples](docs/cli/examples.md) | 15 common workflows as copy-paste shell sessions |
+
 ## Requirements
 
 - Python 3.8+
 - Dependencies: `httpx`, `pydantic`, `requests`
+- CLI extra: `click>=8.0.0`
 
 ## Documentation
 
 Full API reference and examples are available in the [docs/](docs/) directory:
 
+- [CLI Guide](docs/cli/) (getting started, command reference, workflow examples)
 - [API Reference](docs/api-reference/) (client config, all resource methods, error handling)
 - [Code Examples](docs/examples/) (evaluations, judges, traces)
 - [Troubleshooting](docs/troubleshooting/) (auth issues, error codes)
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 619c641..7d2c039 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -15,11 +15,17 @@
   * [Results](api-reference/results.md)
   * [Models & Benchmarks](api-reference/models-benchmarks.md)
   * [Judges](api-reference/judges.md)
+  * [Scorers](api-reference/scorers.md)
   * [Traces](api-reference/traces.md)
   * [Trace Evaluations](api-reference/trace-evaluations.md)
   * [Judge Optimizations](api-reference/judge-optimizations.md)
   * [Error Handling](api-reference/errors.md)
 
+## CLI
+* [Getting Started](cli/getting-started.md)
+* [Command Reference](cli/commands.md)
+* [Workflow Examples](cli/examples.md)
+
 ## Code Examples
 * [Overview](examples/README.md)
   * [Creating Evaluations](examples/creating-evaluations.md)
diff --git a/docs/api-reference/evaluations.md b/docs/api-reference/evaluations.md
index a9039e0..5f4d59e 100644
--- a/docs/api-reference/evaluations.md
+++ b/docs/api-reference/evaluations.md
@@ -177,22 +177,23 @@ async def get_evaluation():
 asyncio.run(get_evaluation())
 ```
 
-### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, timeout=None)`
+### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, unique=False, timeout=None)`
 
 Retrieves multiple evaluations with optional pagination, sorting, and filtering.
 
 #### Parameters
 
-| Parameter       | Type                             | Required | Description                                             |
-| --------------- | -------------------------------- | -------- | ------------------------------------------------------- |
-| `page`          | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)     |
-| `page_size`     | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500) |
-| `sort_by`       | `str \| None`                    | No       | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` |
-| `order`         | `str \| None`                    | No       | Sort order: `asc` or `desc`                             |
-| `model_ids`     | `List[str] \| None`              | No       | Filter by model IDs                                     |
-| `benchmark_ids` | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                         |
-| `status`        | `EvaluationStatus \| None`       | No       | Filter by evaluation status                             |
-| `timeout`       | `float \| httpx.Timeout \| None` | No       | Override request timeout                                |
+| Parameter       | Type                             | Required | Description                                                                         |
+| --------------- | -------------------------------- | -------- | ----------------------------------------------------------------------------------- |
+| `page`          | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)                                 |
+| `page_size`     | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500)                             |
+| `sort_by`       | `str \| None`                    | No       | Sort by field: `submitted_at`, `accuracy`, or `average_duration`                    |
+| `order`         | `str \| None`                    | No       | Sort order: `asc` or `desc`                                                         |
+| `model_ids`     | `List[str] \| None`              | No       | Filter by model IDs                                                                 |
+| `benchmark_ids` | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                                                     |
+| `status`        | `EvaluationStatus \| None`       | No       | Filter by evaluation status                                                         |
+| `unique`        | `bool`                           | No       | If `True`, deduplicate by model+benchmark pair, keeping only the latest evaluation  |
+| `timeout`       | `float \| httpx.Timeout \| None` | No       | Override request timeout                                                            |
 
 #### Returns
 
@@ -222,6 +223,13 @@ response = client.evaluations.get_many(
 if response:
     for evaluation in response.evaluations:
         print(f"{evaluation.id}: accuracy={evaluation.accuracy:.2f}%")
+
+# Get only the latest evaluation per model+benchmark pair
+response = client.evaluations.get_many(
+    unique=True,
+    sort_by="accuracy",
+    order="desc",
+)
 ```
 
 ### `get_results(page=None, page_size=None, timeout=None)`
diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md
index 550190c..bef109a 100644
--- a/docs/api-reference/models-benchmarks.md
+++ b/docs/api-reference/models-benchmarks.md
@@ -35,20 +35,24 @@ benchmarks = client.benchmarks.get()
 
 ## Models
 
-### `get(type=None, name=None, companies=None, regions=None, licenses=None, timeout=None)`
+### `get(type=None, name=None, key=None, categories=None, companies=None, regions=None, licenses=None, timeout=None)`
 
 Retrieves a list of available models with optional filtering parameters. Both the `Stratix` and `AsyncStratix` clients have this method.
 
 #### Parameters
 
-| Parameter   | Type                                  | Required | Description                                                            |
-| ----------- | ------------------------------------- | -------- | ---------------------------------------------------------------------- |
-| `type`      | `Literal["custom", "public"] \| None` | No       | Filter by model type. If `None`, returns both custom and public models |
-| `name`      | `str \| None`                         | No       | Filter models by name (partial match search)                           |
-| `companies` | `List[str] \| None`                   | No       | Filter by model companies/providers                                    |
-| `regions`   | `List[str] \| None`                   | No       | Filter by supported regions                                            |
-| `licenses`  | `List[str] \| None`                   | No       | Filter by license types                                                |
-| `timeout`   | `float \| httpx.Timeout \| None`      | No       | Override request timeout                                               |
+| Parameter    | Type                                  | Required | Description                                                                                    |
+| ------------ | ------------------------------------- | -------- | ---------------------------------------------------------------------------------------------- |
+| `type`       | `Literal["custom", "public"] \| None` | No       | Filter by model type. If `None`, returns both custom and public models                         |
+| `name`       | `str \| None`                         | No       | Filter models by name (partial match search)                                                   |
+| `key`        | `str \| None`                         | No       | Filter models by key (partial match search)                                                    |
+| `categories` | `List[str] \| None`                   | No       | Filter by categories: `Transformer`, `MoE`, `Open-Source`, `Closed-Source`                     |
+| `companies`  | `List[str] \| None`                   | No       | Filter by model companies/providers                                                            |
+| `regions`    | `List[str] \| None`                   | No       | Filter by supported regions                                                                    |
+| `licenses`   | `List[str] \| None`                   | No       | Filter by license types                                                                        |
+| `timeout`    | `float \| httpx.Timeout \| None`      | No       | Override request timeout                                                                       |
+
+> **Note:** When filtering by `categories`, `companies`, `regions`, or `licenses`, only public models are returned since custom models do not have these fields.
 
 #### Returns
 
@@ -185,17 +189,22 @@ if result:
 
 ## Benchmarks
 
-### `get(type=None, name=None, timeout=None)`
+### `get(type=None, name=None, key=None, categories=None, languages=None, timeout=None)`
 
 Retrieves a list of available benchmarks with optional filtering parameters. Both the `Stratix` and `AsyncStratix` clients have this method.
 
 #### Parameters
 
-| Parameter | Type                                  | Required | Description                                                                    |
-| --------- | ------------------------------------- | -------- | ------------------------------------------------------------------------------ |
-| `type`    | `Literal["custom", "public"] \| None` | No       | Filter by benchmark type. If `None`, returns both custom and public benchmarks |
-| `name`    | `str \| None`                         | No       | Filter benchmarks by name (partial match search)                               |
-| `timeout` | `float \| httpx.Timeout \| None`      | No       | Override request timeout                                                       |
+| Parameter    | Type                                  | Required | Description                                                                    |
+| ------------ | ------------------------------------- | -------- | ------------------------------------------------------------------------------ |
+| `type`       | `Literal["custom", "public"] \| None` | No       | Filter by benchmark type. If `None`, returns both custom and public benchmarks |
+| `name`       | `str \| None`                         | No       | Filter benchmarks by name (partial match search)                               |
+| `key`        | `str \| None`                         | No       | Filter benchmarks by key (partial match search)                                |
+| `categories` | `List[str] \| None`                   | No       | Filter by categories (e.g., `reasoning`, `knowledge`, `coding`)                |
+| `languages`  | `List[str] \| None`                   | No       | Filter by language (e.g., `english`, `french`)                                 |
+| `timeout`    | `float \| httpx.Timeout \| None`      | No       | Override request timeout                                                       |
+
+> **Note:** When filtering by `categories` or `languages`, only public benchmarks are returned since custom benchmarks do not have these fields.
 
 #### Returns
 
diff --git a/docs/api-reference/public-client.md b/docs/api-reference/public-client.md
index 31afc82..70a209a 100644
--- a/docs/api-reference/public-client.md
+++ b/docs/api-reference/public-client.md
@@ -286,16 +286,17 @@ Retrieves evaluations with optional pagination, sorting, and filtering.
 
 #### Parameters
 
-| Parameter         | Type                             | Required | Description                                                        |
-| ----------------- | -------------------------------- | -------- | ------------------------------------------------------------------ |
-| `page`            | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)                |
-| `page_size`       | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500)            |
-| `sort_by`         | `str \| None`                    | No       | Sort by field: `submittedAt`, `accuracy`, or `averageDuration`     |
-| `order`           | `str \| None`                    | No       | Sort order: `asc` or `desc`                                       |
-| `model_ids`       | `List[str] \| None`              | No       | Filter by model IDs                                                |
-| `benchmark_ids`   | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                                    |
-| `status`          | `EvaluationStatus \| None`       | No       | Filter by evaluation status                                        |
-| `timeout`         | `float \| httpx.Timeout \| None` | No       | Override request timeout                                           |
+| Parameter         | Type                             | Required | Description                                                                        |
+| ----------------- | -------------------------------- | -------- | ---------------------------------------------------------------------------------- |
+| `page`            | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)                                |
+| `page_size`       | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500)                            |
+| `sort_by`         | `str \| None`                    | No       | Sort by field: `submitted_at`, `accuracy`, or `average_duration`                   |
+| `order`           | `str \| None`                    | No       | Sort order: `asc` or `desc`                                                        |
+| `model_ids`       | `List[str] \| None`              | No       | Filter by model IDs                                                                |
+| `benchmark_ids`   | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                                                    |
+| `status`          | `EvaluationStatus \| None`       | No       | Filter by evaluation status                                                        |
+| `unique`          | `bool`                           | No       | If `True`, deduplicate by model+benchmark pair, keeping only the latest evaluation |
+| `timeout`         | `float \| httpx.Timeout \| None` | No       | Override request timeout                                                           |
 
 #### Returns
 
diff --git a/docs/api-reference/scorers.md b/docs/api-reference/scorers.md
new file mode 100644
index 0000000..652a815
--- /dev/null
+++ b/docs/api-reference/scorers.md
@@ -0,0 +1,176 @@
+# Scorers
+
+The `scorers` resource on the Stratix client allows you to create and manage custom scorers for evaluating benchmark results. Each scorer uses an LLM and a custom prompt to evaluate model outputs.
+
+## Overview
+
+A scorer defines a custom evaluation criterion backed by a specific LLM model and a prompt template. Custom scorers can be attached to custom benchmarks to provide additional scoring beyond the built-in metrics.
+
+### Using Synchronous Client
+
+```python
+from layerlens import Stratix
+
+client = Stratix()
+
+# Fetch a model to use for the scorer
+models = client.models.get(type="public", name="gpt-4o")
+model = models[0]
+
+# Create a scorer
+scorer = client.scorers.create(
+    name="Helpfulness Scorer",
+    description="Evaluates how helpful the response is",
+    model_id=model.id,
+    prompt="Rate the helpfulness of the following response on a scale of 0 to 1.",
+)
+
+if scorer:
+    print(f"Created scorer: {scorer.name} (id={scorer.id})")
+
+# List all scorers
+response = client.scorers.get_many()
+if response:
+    for s in response.scorers:
+        print(f"  {s.name}: {s.description}")
+```
+
+### Using Async Client
+
+```python
+import asyncio
+from layerlens import AsyncStratix
+
+async def main():
+    client = AsyncStratix()
+
+    scorer = await client.scorers.create(
+        name="Helpfulness Scorer",
+        description="Evaluates how helpful the response is",
+        model_id="model-abc123",
+        prompt="Rate the helpfulness of the following response on a scale of 0 to 1.",
+    )
+
+    if scorer:
+        print(f"Created scorer: {scorer.name}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Methods
+
+Both the `Stratix` (synchronous) and `AsyncStratix` (asynchronous) clients support the following methods.
+
+### `create(name, description, model_id, prompt, timeout=None)`
+
+Creates a new custom scorer.
+
+#### Parameters
+
+| Parameter     | Type                             | Required | Description                                        |
+| ------------- | -------------------------------- | -------- | -------------------------------------------------- |
+| `name`        | `str`                            | Yes      | Display name for the scorer                        |
+| `description` | `str`                            | Yes      | Description of what the scorer evaluates           |
+| `model_id`    | `str`                            | Yes      | ID of the LLM model to use for scoring             |
+| `prompt`      | `str`                            | Yes      | Prompt template used to evaluate model outputs     |
+| `timeout`     | `float \| httpx.Timeout \| None` | No       | Override request timeout                           |
+
+#### Returns
+
+Returns a `Scorer` object if successful, `None` if the scorer could not be created.
+
+### `get(id, timeout=None)`
+
+Retrieves a scorer by its unique identifier.
+
+#### Parameters
+
+| Parameter | Type                             | Required | Description              |
+| --------- | -------------------------------- | -------- | ------------------------ |
+| `id`      | `str`                            | Yes      | The unique scorer ID     |
+| `timeout` | `float \| httpx.Timeout \| None` | No       | Override request timeout |
+
+#### Returns
+
+Returns a `Scorer` object if found, `None` otherwise.
+
+### `get_many(page=None, page_size=None, timeout=None)`
+
+Retrieves multiple scorers with pagination.
+
+#### Parameters
+
+| Parameter   | Type                             | Required | Description                                          |
+| ----------- | -------------------------------- | -------- | ---------------------------------------------------- |
+| `page`      | `int \| None`                    | No       | Page number (1-based, defaults to 1)                 |
+| `page_size` | `int \| None`                    | No       | Number of scorers per page (default: 100, max: 500)  |
+| `timeout`   | `float \| httpx.Timeout \| None` | No       | Override request timeout                             |
+
+#### Returns
+
+Returns a `ScorersResponse` object containing:
+
+- `scorers`: List of `Scorer` objects
+- `count`: Number of scorers in this page
+- `total_count`: Total number of scorers
+
+Returns `None` if the request fails.
+
+### `update(id, name=None, description=None, model_id=None, prompt=None, timeout=None)`
+
+Updates an existing scorer. Only provided fields are modified; omitted fields remain unchanged.
+
+#### Parameters
+
+| Parameter     | Type                             | Required | Description                            |
+| ------------- | -------------------------------- | -------- | -------------------------------------- |
+| `id`          | `str`                            | Yes      | The unique scorer ID                   |
+| `name`        | `str \| None`                    | No       | Updated display name                   |
+| `description` | `str \| None`                    | No       | Updated description                    |
+| `model_id`    | `str \| None`                    | No       | Updated model ID                       |
+| `prompt`      | `str \| None`                    | No       | Updated prompt template                |
+| `timeout`     | `float \| httpx.Timeout \| None` | No       | Override request timeout               |
+
+#### Returns
+
+Returns `True` if the update succeeded, `False` otherwise.
+
+### `delete(id, timeout=None)`
+
+Deletes a scorer by its unique identifier.
+
+#### Parameters
+
+| Parameter | Type                             | Required | Description              |
+| --------- | -------------------------------- | -------- | ------------------------ |
+| `id`      | `str`                            | Yes      | The unique scorer ID     |
+| `timeout` | `float \| httpx.Timeout \| None` | No       | Override request timeout |
+
+#### Returns
+
+Returns `True` if the scorer was deleted, `False` otherwise.
+
+## Response Objects
+
+### Scorer Object Properties
+
+| Property          | Type          | Description                          |
+| ----------------- | ------------- | ------------------------------------ |
+| `id`              | `str`         | Unique scorer identifier             |
+| `organization_id` | `str`         | Organization the scorer belongs to   |
+| `project_id`      | `str`         | Project the scorer belongs to        |
+| `name`            | `str`         | Display name                         |
+| `description`     | `str \| None` | Description of what it evaluates     |
+| `model_id`        | `str \| None` | ID of the backing LLM model          |
+| `model_name`      | `str \| None` | Name of the backing LLM model        |
+| `model_key`       | `str \| None` | Key of the backing LLM model         |
+| `model_company`   | `str \| None` | Company that provides the model      |
+| `prompt`          | `str \| None` | Prompt template for scoring          |
+| `created_at`      | `str \| None` | ISO 8601 creation timestamp          |
+| `updated_at`      | `str \| None` | ISO 8601 last update timestamp       |
+
+## Next Steps
+
+- Learn about [Benchmarks](models-benchmarks.md) to attach custom scorers to custom benchmarks
+- Learn about [Judges](judges.md) for evaluating traces
diff --git a/docs/cli/commands.md b/docs/cli/commands.md
new file mode 100644
index 0000000..30e8aee
--- /dev/null
+++ b/docs/cli/commands.md
@@ -0,0 +1,423 @@
+# CLI — Command Reference
+
+Complete reference for all `stratix` CLI commands.
+
+## Command tree
+
+```
+stratix [global-options]
+├── trace
+│   ├── list          List traces
+│   ├── get           Get a trace by ID
+│   ├── search        Search traces
+│   ├── export        Export a trace as JSON
+│   └── delete        Delete a trace
+├── judge
+│   ├── list          List judges
+│   ├── get           Get a judge by ID
+│   ├── create        Create a new judge
+│   └── test          Test a judge against a trace
+├── evaluate
+│   ├── list          List evaluations
+│   ├── get           Get an evaluation by ID
+│   └── run           Run a new evaluation
+├── integration
+│   ├── list          List integrations
+│   └── test          Test an integration
+├── scorer
+│   ├── list          List scorers
+│   ├── get           Get a scorer by ID
+│   ├── create        Create a new scorer
+│   └── delete        Delete a scorer
+├── space
+│   ├── list          List evaluation spaces
+│   ├── get           Get a space by ID or slug
+│   ├── create        Create a new space
+│   └── delete        Delete a space
+├── bulk
+│   └── eval          Run evaluations in bulk
+├── ci
+│   └── report        Generate a CI summary report
+└── completion        Print shell completion setup
+```
+
+---
+
+## Global options
+
+These options are available on every command:
+
+```
+--api-key TEXT         API key (env: LAYERLENS_STRATIX_API_KEY)
+--host TEXT            API host
+--port INTEGER         API port
+--format [table|json]  Output format (default: table)
+--verbose, -v          Enable debug output
+--version              Show version and exit
+--help                 Show help and exit
+```
+
+---
+
+## trace
+
+Manage traces.
+
+### `trace list`
+
+List traces with optional filtering and pagination.
+
+```bash
+stratix trace list [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+| `--source` | text | Filter by source |
+| `--status` | text | Filter by status |
+| `--sort-by` | text | Sort field |
+| `--sort-order` | asc/desc | Sort order |
+
+### `trace get`
+
+Get a single trace by ID.
+
+```bash
+stratix trace get <ID>
+```
+
+### `trace search`
+
+Search traces by query string.
+
+```bash
+stratix trace search <QUERY> [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+| `--source` | text | Filter by source |
+| `--status` | text | Filter by status |
+| `--sort-by` | text | Sort field |
+| `--sort-order` | asc/desc | Sort order |
+
+### `trace export`
+
+Export a trace as JSON.
+
+```bash
+stratix trace export <ID> [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--output`, `-o` | path | Output file (default: stdout) |
+
+### `trace delete`
+
+Delete a trace by ID.
+
+```bash
+stratix trace delete <ID> [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--yes`, `-y` | flag | Skip confirmation prompt |
+
+---
+
+## judge
+
+Manage judges.
+
+### `judge list`
+
+```bash
+stratix judge list [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+
+### `judge get`
+
+```bash
+stratix judge get <ID>
+```
+
+### `judge create`
+
+Create a new judge.
+
+```bash
+stratix judge create [OPTIONS]
+```
+
+| Option | Type | Required | Description |
+| --- | --- | --- | --- |
+| `--name` | text | yes | Judge name |
+| `--goal` | text | yes | Evaluation goal description |
+| `--model-id` | text | no | Model ID for the judge |
+
+### `judge test`
+
+Test a judge by running it against a trace. Creates a trace evaluation.
+
+```bash
+stratix judge test [OPTIONS]
+```
+
+| Option | Type | Required | Description |
+| --- | --- | --- | --- |
+| `--judge-id` | text | yes | Judge ID to test |
+| `--trace-id` | text | yes | Trace ID to evaluate |
+
+---
+
+## evaluate
+
+Manage evaluations.
+
+### `evaluate list`
+
+```bash
+stratix evaluate list [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+| `--status` | text | Filter: pending, in-progress, success, failure |
+| `--sort-by` | submitted_at/accuracy/average_duration | Sort field |
+| `--order` | asc/desc | Sort order |
+
+### `evaluate get`
+
+```bash
+stratix evaluate get <ID>
+```
+
+### `evaluate run`
+
+Run a new evaluation. Accepts model/benchmark by ID, key, or name.
+
+```bash
+stratix evaluate run [OPTIONS]
+```
+
+| Option | Type | Required | Description |
+| --- | --- | --- | --- |
+| `--model` | text | yes | Model ID, key, or name |
+| `--benchmark` | text | yes | Benchmark ID, key, or name |
+| `--wait` | flag | no | Wait for evaluation to complete |
+
+---
+
+## integration
+
+Manage integrations.
+
+### `integration list`
+
+```bash
+stratix integration list [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+
+### `integration test`
+
+Test an integration by ID.
+
+```bash
+stratix integration test <ID>
+```
+
+---
+
+## scorer
+
+Manage scorers.
+
+### `scorer list`
+
+```bash
+stratix scorer list [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+
+### `scorer get`
+
+```bash
+stratix scorer get <ID>
+```
+
+### `scorer create`
+
+```bash
+stratix scorer create [OPTIONS]
+```
+
+| Option | Type | Required | Description |
+| --- | --- | --- | --- |
+| `--name` | text | yes | Name (3–64 characters) |
+| `--description` | text | yes | Description (10–500 characters) |
+| `--model-id` | text | yes | Model ID for scoring |
+| `--prompt` | text | yes | Scoring prompt |
+| `--dry-run` | flag | no | Preview without executing |
+
+### `scorer delete`
+
+```bash
+stratix scorer delete <ID> [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--yes`, `-y` | flag | Skip confirmation prompt |
+| `--dry-run` | flag | Preview without executing |
+
+---
+
+## space
+
+Manage evaluation spaces.
+
+### `space list`
+
+```bash
+stratix space list [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--page` | int | Page number |
+| `--page-size` | int | Results per page |
+| `--sort-by` | text | Sort field (e.g. weight, created_at) |
+| `--order` | asc/desc | Sort order |
+
+### `space get`
+
+```bash
+stratix space get <ID>
+```
+
+Accepts an ID or slug.
+
+### `space create`
+
+```bash
+stratix space create [OPTIONS]
+```
+
+| Option | Type | Required | Description |
+| --- | --- | --- | --- |
+| `--name` | text | yes | Space name |
+| `--description` | text | no | Description (max 500 characters) |
+| `--visibility` | private/public/tenant | no | Visibility level |
+| `--dry-run` | flag | no | Preview without executing |
+
+### `space delete`
+
+```bash
+stratix space delete <ID> [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--yes`, `-y` | flag | Skip confirmation prompt |
+| `--dry-run` | flag | Preview without executing |
+
+---
+
+## bulk
+
+Bulk operations.
+
+### `bulk eval`
+
+Run evaluations in bulk. Supports three modes:
+
+**Mode 1: JSONL file**
+
+```bash
+stratix bulk eval --file jobs.jsonl [OPTIONS]
+```
+
+Each line in the JSONL file is a JSON object with `model` and `benchmark` fields:
+
+```json
+{"model": "openai/gpt-4o", "benchmark": "arc-agi-2"}
+{"model": "anthropic/claude-3-opus", "benchmark": "arc-agi-2"}
+```
+
+**Mode 2: Single model + benchmark**
+
+```bash
+stratix bulk eval --model openai/gpt-4o --benchmark arc-agi-2 --wait [OPTIONS]
+```
+
+**Mode 3: Judge + trace IDs**
+
+```bash
+stratix bulk eval --judge-id <JUDGE_ID> --traces trace_ids.txt [OPTIONS]
+```
+
+The traces file contains one trace ID per line.
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--file` | path | JSONL file with evaluation jobs |
+| `--model` | text | Model ID/name (use with --benchmark) |
+| `--benchmark` | text | Benchmark ID/name (use with --model) |
+| `--judge-id` | text | Judge ID (use with --traces) |
+| `--traces` | path | File with trace IDs (one per line) |
+| `--dry-run` | flag | Preview without executing |
+| `--wait` | flag | Wait for all evaluations to complete |
+
+---
+
+## ci
+
+CI/CD pipeline helpers.
+
+### `ci report`
+
+Generate a markdown summary of recent evaluations, suitable for GitHub Actions job summaries.
+
+```bash
+stratix ci report [OPTIONS]
+```
+
+| Option | Type | Description |
+| --- | --- | --- |
+| `--output`, `-o` | path | Output file (default: stdout) |
+| `--limit` | int | Number of evaluations to include (default: 10) |
+| `--dry-run` | flag | Preview without fetching data |
+
+---
+
+## completion
+
+Print shell completion setup instructions.
+
+```bash
+stratix completion <SHELL>
+```
+
+Where `SHELL` is one of: `bash`, `zsh`, `fish`, `powershell`.
diff --git a/docs/cli/examples.md b/docs/cli/examples.md
new file mode 100644
index 0000000..2d0ba21
--- /dev/null
+++ b/docs/cli/examples.md
@@ -0,0 +1,291 @@
+# CLI — Workflow Examples
+
+Fifteen copy-paste workflows covering the most common CLI tasks.
+
+---
+
+## 1. Quick start: first trace to evaluation
+
+Set up, list traces, inspect one, and evaluate it with a judge.
+
+```bash
+# Configure
+export LAYERLENS_STRATIX_API_KEY="sk-..."
+
+# See what traces are available
+stratix trace list
+
+# Inspect a specific trace
+stratix trace get <TRACE_ID>
+
+# Create a judge and test it against the trace
+stratix judge create --name "Accuracy" --goal "Rate factual accuracy of the response" --model-id <MODEL_ID>
+stratix judge test --judge-id <JUDGE_ID> --trace-id <TRACE_ID>
+```
+
+---
+
+## 2. Run an evaluation and wait for results
+
+```bash
+# Run and block until done
+stratix evaluate run \
+  --model openai/gpt-4o \
+  --benchmark arc-agi-2 \
+  --wait
+
+# Or fire and check later
+stratix evaluate run --model openai/gpt-4o --benchmark arc-agi-2
+stratix evaluate list --status in-progress
+stratix evaluate get <EVAL_ID>
+```
+
+---
+
+## 3. Compare models on the same benchmark
+
+```bash
+stratix evaluate run --model openai/gpt-4o --benchmark arc-agi-2 --wait
+stratix evaluate run --model anthropic/claude-3-opus --benchmark arc-agi-2 --wait
+
+# List results sorted by accuracy
+stratix evaluate list --sort-by accuracy --order desc
+```
+
+---
+
+## 4. Judge workflow: create, test, tune, apply
+
+```bash
+# Create a judge
+stratix judge create \
+  --name "Helpfulness" \
+  --goal "Rate how helpful and actionable the response is on a 1-5 scale" \
+  --model-id <MODEL_ID>
+
+# Test against a sample trace
+stratix judge test --judge-id <JUDGE_ID> --trace-id <TRACE_ID>
+
+# Review the result
+stratix --format json judge get <JUDGE_ID>
+
+# Iterate: create a refined version
+stratix judge create \
+  --name "Helpfulness v2" \
+  --goal "Rate helpfulness on a 1-5 scale with justification" \
+  --model-id <MODEL_ID>
+```
+
+---
+
+## 5. Trace search and export
+
+```bash
+# Search for traces matching a keyword
+stratix trace search "customer support" --page-size 5
+
+# Export a trace to a file
+stratix trace export <TRACE_ID> -o trace_backup.json
+
+# Export as JSON for piping
+stratix --format json trace get <TRACE_ID> | jq '.id'
+```
+
+---
+
+## 6. Bulk evaluation from a JSONL file
+
+```bash
+# Create a jobs file
+cat > jobs.jsonl <<'EOF'
+{"model": "openai/gpt-4o", "benchmark": "arc-agi-2"}
+{"model": "openai/gpt-4o-mini", "benchmark": "arc-agi-2"}
+{"model": "anthropic/claude-3-opus", "benchmark": "arc-agi-2"}
+EOF
+
+# Dry-run to preview
+stratix bulk eval --file jobs.jsonl --dry-run
+
+# Execute and wait
+stratix bulk eval --file jobs.jsonl --wait
+```
+
+---
+
+## 7. Bulk trace evaluation with a judge
+
+```bash
+# Create a trace ID file
+stratix --format json trace list | jq -r '.[].id' > trace_ids.txt
+
+# Dry-run to preview
+stratix bulk eval \
+  --judge-id <JUDGE_ID> \
+  --traces trace_ids.txt \
+  --dry-run
+
+# Run trace evaluations for all traces
+stratix bulk eval \
+  --judge-id <JUDGE_ID> \
+  --traces trace_ids.txt
+```
+
+---
+
+## 8. CI/CD pipeline integration
+
+```bash
+# In your GitHub Actions workflow:
+stratix evaluate run \
+  --model openai/gpt-4o \
+  --benchmark arc-agi-2 \
+  --wait
+
+# Generate a summary for the GitHub job
+stratix ci report -o "$GITHUB_STEP_SUMMARY"
+
+# Or write the markdown report to a file for custom processing
+stratix ci report -o report.md
+```
+
+---
+
+## 9. Integration monitoring
+
+```bash
+# List all integrations
+stratix integration list
+
+# Test a specific integration
+stratix integration test <INTEGRATION_ID>
+
+# JSON output for scripting
+stratix --format json integration list | jq '.[] | select(.status != "active")'
+```
+
+---
+
+## 10. Scorer management
+
+```bash
+# List existing scorers
+stratix scorer list
+
+# Create a scorer with dry-run
+stratix scorer create \
+  --name "Code Quality" \
+  --description "Evaluates generated code for correctness, readability, and best practices" \
+  --model-id <MODEL_ID> \
+  --prompt "Score the following code on a 1-10 scale for quality..." \
+  --dry-run
+
+# Create for real
+stratix scorer create \
+  --name "Code Quality" \
+  --description "Evaluates generated code for correctness, readability, and best practices" \
+  --model-id <MODEL_ID> \
+  --prompt "Score the following code on a 1-10 scale for quality..."
+
+# Delete with confirmation
+stratix scorer delete <SCORER_ID>
+
+# Delete without prompt
+stratix scorer delete <SCORER_ID> -y
+```
+
+---
+
+## 11. Evaluation spaces
+
+```bash
+# List spaces
+stratix space list
+
+# Create a private space
+stratix space create \
+  --name "Q1 Model Comparison" \
+  --description "Comparing GPT-4o vs Claude 3 Opus for Q1 release" \
+  --visibility private
+
+# Create a public space (dry-run first)
+stratix space create \
+  --name "Public Leaderboard" \
+  --visibility public \
+  --dry-run
+
+# Get space details by slug or ID
+stratix space get q1-model-comparison
+
+# Clean up
+stratix space delete <SPACE_ID> -y
+```
+
+---
+
+## 12. JSON output and scripting
+
+```bash
+# Pipe trace IDs into a loop
+stratix --format json trace list | jq -r '.[].id' | while read id; do
+  echo "Exporting $id..."
+  stratix trace export "$id" -o "traces/${id}.json"
+done
+
+# Get evaluation accuracy as a number
+ACCURACY=$(stratix --format json evaluate get <EVAL_ID> | jq -r '.accuracy')
+echo "Accuracy: $ACCURACY"
+
+# Filter evaluations by status
+stratix --format json evaluate list | jq '[.[] | select(.status == "success")]'
+```
+
+---
+
+## 13. Pagination and sorting
+
+```bash
+# Page through traces
+stratix trace list --page 1 --page-size 20
+stratix trace list --page 2 --page-size 20
+
+# Sort evaluations
+stratix evaluate list --sort-by accuracy --order desc --page-size 5
+stratix evaluate list --sort-by submitted_at --order asc
+
+# Sort spaces
+stratix space list --sort-by created_at --order desc
+```
+
+---
+
+## 14. Verbose mode and debugging
+
+```bash
+# Enable verbose output to see HTTP requests
+stratix -v trace list
+
+# Combine with JSON output
+stratix -v --format json evaluate get <EVAL_ID>
+
+# Debug authentication issues
+stratix -v integration list
+```
+
+---
+
+## 15. Clean up resources
+
+```bash
+# Delete a trace (with confirmation prompt)
+stratix trace delete <TRACE_ID>
+
+# Delete without prompting
+stratix trace delete <TRACE_ID> -y
+
+# Delete a scorer (dry-run first)
+stratix scorer delete <SCORER_ID> --dry-run
+stratix scorer delete <SCORER_ID> -y
+
+# Delete a space
+stratix space delete <SPACE_ID> -y
+```
diff --git a/docs/cli/getting-started.md b/docs/cli/getting-started.md
new file mode 100644
index 0000000..1245eb7
--- /dev/null
+++ b/docs/cli/getting-started.md
@@ -0,0 +1,150 @@
+# CLI — Getting Started
+
+The LayerLens Stratix CLI provides terminal access to all platform features: traces, judges, evaluations, integrations, scorers, evaluation spaces, bulk operations, and CI/CD helpers.
+
+## Installation
+
+Install the SDK with the `cli` extra:
+
+```bash
+pip install "layerlens[cli]" --extra-index-url https://sdk.layerlens.ai/package
+```
+
+If you already have `layerlens` installed, add the CLI extra:
+
+```bash
+pip install "layerlens[cli]" --extra-index-url https://sdk.layerlens.ai/package
+```
+
+For local development from a cloned repo:
+
+```bash
+pip install -e ".[cli]"
+```
+
+Verify the installation:
+
+```bash
+stratix --version
+```
+
+## Configuration
+
+### API key
+
+The CLI requires a LayerLens Stratix API key. Set it as an environment variable (recommended):
+
+```bash
+export LAYERLENS_STRATIX_API_KEY="your-api-key"
+```
+
+Or pass it per-command:
+
+```bash
+stratix --api-key "your-api-key" trace list
+```
+
+### Custom host
+
+By default the CLI talks to `api.layerlens.ai`. Override with:
+
+```bash
+stratix --host my-instance.example.com trace list
+stratix --host my-instance.example.com --port 8443 trace list
+```
+
+## Global options
+
+These options belong to the top-level `stratix` command and go before the subcommand (e.g. `stratix --format json trace list`):
+
+| Option | Description |
+| --- | --- |
+| `--api-key` | API key (or set `LAYERLENS_STRATIX_API_KEY`) |
+| `--host` | API host |
+| `--port` | API port |
+| `--format` | Output format: `table` (default) or `json` |
+| `--verbose` / `-v` | Enable debug output |
+| `--version` | Print version and exit |
+
+## Output formats
+
+The default output is a human-readable table:
+
+```bash
+stratix trace list
+```
+
+```
+ID                                   Created              Filename         Evaluations
+───────────────────────────────────────────────────────────────────────────────────────
+a1b2c3d4-...                         2026-03-15 14:30     traces.jsonl     3
+e5f6a7b8-...                         2026-03-14 09:12     batch_02.json    1
+```
+
+Switch to JSON for scripting:
+
+```bash
+stratix --format json trace list
+```
+
+```json
+[
+  {
+    "id": "a1b2c3d4-...",
+    "created_at": "2026-03-15T14:30:00Z",
+    "filename": "traces.jsonl",
+    ...
+  }
+]
+```
+
+## Shell completions
+
+The CLI supports tab-completion for commands, options, and resource IDs.
+
+```bash
+# Print setup instructions for your shell
+stratix completion bash
+stratix completion zsh
+stratix completion fish
+stratix completion powershell
+```
+
+Follow the printed instructions to enable completions. After setup, you can tab-complete trace IDs, judge IDs, model names, and more.
+
+## First commands
+
+### List your traces
+
+```bash
+stratix trace list
+```
+
+### Run an evaluation
+
+```bash
+stratix evaluate run --model openai/gpt-4o --benchmark arc-agi-2 --wait
+```
+
+### Create a judge
+
+```bash
+stratix judge create --name "Response Quality" --goal "Rate accuracy and completeness" --model-id <MODEL_ID>
+```
+
+### Check integrations
+
+```bash
+stratix integration list
+```
+
+### Generate a CI report
+
+```bash
+stratix ci report -o summary.md
+```
+
+## Next steps
+
+- [Command Reference](commands.md) — all commands and their options
+- [Examples](examples.md) — 15 common workflows as copy-paste shell sessions
diff --git a/docs/examples/models-and-benchmarks.md b/docs/examples/models-and-benchmarks.md
index fc21c68..573eb00 100644
--- a/docs/examples/models-and-benchmarks.md
+++ b/docs/examples/models-and-benchmarks.md
@@ -30,6 +30,20 @@ async def main():
     models = await client.models.get(regions=region_names)
     print(f"Found {len(models)} models with regions {region_names}")
 
+    # --- Filter by categories
+    categories = ["Open-Source"]
+    models = await client.models.get(categories=categories)
+    print(f"Found {len(models)} open-source models")
+
+    # --- Filter by key
+    models = await client.models.get(key="gpt-4")
+    print(f"Found {len(models)} models matching key 'gpt-4'")
+
+    # --- Filter by license
+    licenses = ["apache-2.0"]
+    models = await client.models.get(licenses=licenses)
+    print(f"Found {len(models)} models with license {licenses}")
+
     # --- Filter by type
     model_type = "public"
     models = await client.models.get(type=model_type)
@@ -58,6 +72,20 @@ async def main():
     benchmarks = await client.benchmarks.get(name=benchmark_name)
     print(f"Found {len(benchmarks)} benchmarks with name {benchmark_name}")
 
+    # --- Filter by categories
+    categories = ["reasoning"]
+    benchmarks = await client.benchmarks.get(categories=categories)
+    print(f"Found {len(benchmarks)} benchmarks with categories {categories}")
+
+    # --- Filter by language
+    languages = ["english"]
+    benchmarks = await client.benchmarks.get(languages=languages)
+    print(f"Found {len(benchmarks)} english benchmarks")
+
+    # --- Filter by key
+    benchmarks = await client.benchmarks.get(key="mmlu")
+    print(f"Found {len(benchmarks)} benchmarks matching key 'mmlu'")
+
     # --- Filter by type
     benchmark_type = "public"
     benchmarks = await client.benchmarks.get(type=benchmark_type)
diff --git a/examples/cli/01_quickstart.sh b/examples/cli/01_quickstart.sh
new file mode 100755
index 0000000..9b9cb3d
--- /dev/null
+++ b/examples/cli/01_quickstart.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Quick start: configure, list traces, inspect, evaluate
+#
+# Usage: ./01_quickstart.sh [MODEL_ID]
+# MODEL_ID defaults to the hard-coded model ID below; pass your own to override it.
+set -euo pipefail
+
+MODEL_ID="${1:-67e1fe69e014f9fa6e50d7be}"
+
+# 1. List available traces
+echo "==> Listing traces..."
+stratix trace list --page-size 5
+
+# 2. Get the first trace ID (the [0] lookup fails if the listing is empty)
+TRACE_ID=$(stratix --format json trace list --page-size 1 \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)[0]['id'])")
+echo "==> First trace: $TRACE_ID"
+
+# 3. Inspect it
+stratix trace get "$TRACE_ID"
+
+# 4. Create a judge (capture ID from the "Judge created: <id>" line)
+echo "==> Creating judge..."
+JUDGE_ID=$(stratix judge create \
+  --name "Quick Start Judge $(date +%s)" \
+  --goal "Rate whether the response is accurate and helpful" \
+  --model-id "$MODEL_ID" \
+  | grep "^Judge created:" | awk '{print $NF}')
+echo "==> Created judge: $JUDGE_ID"
+
+# 5. Test the judge against the trace
+echo "==> Testing judge against trace..."
+stratix judge test --judge-id "$JUDGE_ID" --trace-id "$TRACE_ID"
diff --git a/examples/cli/02_evaluate.sh b/examples/cli/02_evaluate.sh
new file mode 100755
index 0000000..919bb40
--- /dev/null
+++ b/examples/cli/02_evaluate.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Run an evaluation and wait for results. Usage: ./02_evaluate.sh [MODEL] [BENCHMARK]
+set -euo pipefail
+
+MODEL="${1:-openai/gpt-4o}"
+BENCHMARK="${2:-arc-agi-2}"
+
+echo "==> Running evaluation: $MODEL on $BENCHMARK"
+stratix evaluate run \
+  --model "$MODEL" \
+  --benchmark "$BENCHMARK" \
+  --wait
+
+echo "==> Recent evaluations:"
+stratix evaluate list --sort-by submitted_at --order desc --page-size 5
diff --git a/examples/cli/03_judge_workflow.sh b/examples/cli/03_judge_workflow.sh
new file mode 100755
index 0000000..c999065
--- /dev/null
+++ b/examples/cli/03_judge_workflow.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Judge workflow: create, test, review
+#
+# Usage: ./03_judge_workflow.sh <TRACE_ID> [MODEL_ID]   (MODEL_ID defaults to the hard-coded ID below)
+set -euo pipefail
+
+TRACE_ID="${1:?Usage: $0 <TRACE_ID> [MODEL_ID]}"
+MODEL_ID="${2:-67e1fe69e014f9fa6e50d7be}"
+
+# Create a judge (capture ID from the "Judge created: <id>" line)
+echo "==> Creating judge..."
+JUDGE_ID=$(stratix judge create \
+  --name "Response Quality $(date +%s)" \
+  --goal "Rate the response for accuracy, completeness, and clarity on a 1-5 scale" \
+  --model-id "$MODEL_ID" \
+  | grep "^Judge created:" | awk '{print $NF}')
+echo "==> Judge ID: $JUDGE_ID"
+
+# Test against a trace
+echo "==> Testing judge..."
+stratix judge test --judge-id "$JUDGE_ID" --trace-id "$TRACE_ID"
+
+# Review judge details
+echo "==> Judge details:"
+stratix judge get "$JUDGE_ID"
diff --git a/examples/cli/04_bulk_eval.sh b/examples/cli/04_bulk_eval.sh
new file mode 100755
index 0000000..ed998fe
--- /dev/null
+++ b/examples/cli/04_bulk_eval.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Bulk evaluation from a JSONL file
+set -euo pipefail
+
+JOBS_FILE="${1:-/dev/stdin}"
+
+# Create a sample jobs file if none provided
+if [ "$JOBS_FILE" = "/dev/stdin" ]; then
+  JOBS_FILE=$(mktemp /tmp/layerlens-jobs-XXXXX.jsonl)
+  cat > "$JOBS_FILE" <<'EOF'
+{"model": "openai/gpt-4o", "benchmark": "arc-agi-2"}
+{"model": "openai/gpt-4o-mini", "benchmark": "arc-agi-2"}
+EOF
+  echo "==> Created sample jobs file: $JOBS_FILE"
+fi
+
+# Dry-run first
+echo "==> Dry-run:"
+stratix bulk eval --file "$JOBS_FILE" --dry-run
+
+echo ""
+read -p "Proceed? [y/N] " confirm
+if [[ "$confirm" =~ ^[Yy]$ ]]; then
+  stratix bulk eval --file "$JOBS_FILE" --wait
+fi
diff --git a/examples/cli/05_export_traces.sh b/examples/cli/05_export_traces.sh
new file mode 100755
index 0000000..618b88f
--- /dev/null
+++ b/examples/cli/05_export_traces.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Export all traces to individual JSON files
+# NOTE(review): `trace list` takes --page/--page-size, so this may only cover the first page; confirm.
+# Usage: ./05_export_traces.sh [OUTPUT_DIR]
+set -euo pipefail
+
+OUTPUT_DIR="${1:-./exported_traces}"
+mkdir -p "$OUTPUT_DIR"
+
+echo "==> Exporting traces to $OUTPUT_DIR/"
+
+stratix --format json trace list | python3 -c "
+import sys, json
+for t in json.load(sys.stdin):
+    print(t['id'])
+" | while read -r id; do
+  echo "  Exporting $id..."
+  stratix trace export "$id" -o "$OUTPUT_DIR/${id}.json"
+done
+
+echo "==> Done. Files in $OUTPUT_DIR/"
+ls -la "$OUTPUT_DIR/"
diff --git a/examples/cli/06_ci_report.sh b/examples/cli/06_ci_report.sh
new file mode 100755
index 0000000..b2017b3
--- /dev/null
+++ b/examples/cli/06_ci_report.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Generate a CI evaluation report (for GitHub Actions). Usage: ./06_ci_report.sh [OUTPUT]
+set -euo pipefail
+
+OUTPUT="${1:-summary.md}"
+
+echo "==> Generating CI report..."
+stratix ci report --limit 10 -o "$OUTPUT"
+
+echo "==> Report written to $OUTPUT"
+cat "$OUTPUT"
diff --git a/examples/cli/07_scorer_lifecycle.sh b/examples/cli/07_scorer_lifecycle.sh
new file mode 100755
index 0000000..8559673
--- /dev/null
+++ b/examples/cli/07_scorer_lifecycle.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Scorer lifecycle: create, list, inspect, delete
+#
+# Usage: ./07_scorer_lifecycle.sh <MODEL_ID>
+set -euo pipefail
+
+MODEL_ID="${1:?Usage: $0 <MODEL_ID>}"
+SCORER_NAME="CLI Demo $(date +%s)"
+
+# Create (dry-run)
+echo "==> Dry-run create:"
+stratix scorer create \
+  --name "$SCORER_NAME" \
+  --description "Evaluates generated code for correctness and readability" \
+  --model-id "$MODEL_ID" \
+  --prompt "Rate the following code on a 1-10 scale for correctness, readability, and adherence to best practices." \
+  --dry-run
+
+# Create for real
+echo ""
+echo "==> Creating scorer..."
+stratix scorer create \
+  --name "$SCORER_NAME" \
+  --description "Evaluates generated code for correctness and readability" \
+  --model-id "$MODEL_ID" \
+  --prompt "Rate the following code on a 1-10 scale for correctness, readability, and adherence to best practices."
+
+# Find the scorer by name in the list (the name is passed as an argument, not spliced into the Python source)
+echo ""
+echo "==> Finding scorer in list..."
+SCORER_ID=$(stratix --format json scorer list \
+  | python3 -c "import sys,json
+for s in json.load(sys.stdin):
+    if s['name'] == sys.argv[1]:
+        print(s['id']); break" "$SCORER_NAME")
+echo "==> Scorer ID: $SCORER_ID"
+
+# Inspect
+stratix scorer get "$SCORER_ID"
+
+# Delete
+echo ""
+echo "==> Cleaning up..."
+stratix scorer delete "$SCORER_ID" -y
+echo "==> Done."
diff --git a/examples/cli/08_spaces.sh b/examples/cli/08_spaces.sh
new file mode 100755
index 0000000..015b8e2
--- /dev/null
+++ b/examples/cli/08_spaces.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Evaluation spaces: create, list, inspect, delete
+set -euo pipefail
+
+# Create a private space (capture the first 24-hex-digit ID from the output; -E keeps grep portable)
+echo "==> Creating evaluation space..."
+SPACE_ID=$(stratix space create \
+  --name "CLI Demo Space" \
+  --description "Temporary space for CLI examples" \
+  --visibility private \
+  | grep -oE '[a-f0-9]{24}' | head -1)
+echo "==> Created space: $SPACE_ID"
+
+# List spaces
+echo ""
+echo "==> All spaces:"
+stratix space list
+
+# Get details
+echo ""
+echo "==> Space details:"
+stratix space get "$SPACE_ID"
+
+# Clean up
+echo ""
+echo "==> Deleting space..."
+stratix space delete "$SPACE_ID" -y
+echo "==> Done."
diff --git a/examples/cli/09_integration_check.sh b/examples/cli/09_integration_check.sh
new file mode 100755
index 0000000..268fc30
--- /dev/null
+++ b/examples/cli/09_integration_check.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Check integration health
+set -euo pipefail
+
+echo "==> Integrations:"
+stratix integration list
+
+# Test each integration (skip if none found)
+echo ""
+echo "==> Testing all integrations..."
+OUTPUT=$(stratix --format json integration list 2>&1)
+if echo "$OUTPUT" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then
+  echo "$OUTPUT" | python3 -c "
+import sys, json
+for i in json.load(sys.stdin):
+    print(i['id'], i.get('name', ''))
+" | while read -r id name; do
+    echo "  Testing $name ($id)..."
+    stratix integration test "$id" || echo "  FAILED: $name"
+  done
+else
+  echo "  No integrations to test."
+fi
diff --git a/examples/cli/10_compare_models.sh b/examples/cli/10_compare_models.sh
new file mode 100755
index 0000000..65730ff
--- /dev/null
+++ b/examples/cli/10_compare_models.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Compare multiple models on the same benchmark
+#
+# Usage: ./10_compare_models.sh [BENCHMARK] [MODEL1] [MODEL2] ...
+set -euo pipefail
+
+BENCHMARK="${1:-arc-agi-2}"
+shift 2>/dev/null || true
+
+if [ $# -eq 0 ]; then
+  MODELS=("openai/gpt-4o" "openai/gpt-4o-mini")
+else
+  MODELS=("$@")
+fi
+
+echo "==> Comparing ${#MODELS[@]} models on $BENCHMARK"
+
+for model in "${MODELS[@]}"; do
+  echo "  Running: $model"
+  stratix evaluate run --model "$model" --benchmark "$BENCHMARK" --wait &
+done
+
+# Wait for all background evaluations (a bare `wait` returns 0 even if one of them failed)
+wait
+
+echo ""
+echo "==> Results (sorted by accuracy):"
+stratix evaluate list --sort-by accuracy --order desc --page-size 10
diff --git a/examples/integrations.py b/examples/integrations.py
new file mode 100644
index 0000000..2621984
--- /dev/null
+++ b/examples/integrations.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Example: working with integrations via the Stratix SDK."""
+
+from layerlens import Stratix
+
+client = Stratix()
+
+# --- List all integrations
+response = client.integrations.get_many()
+
+if response is None or not response.integrations:
+    print("No integrations found.")
+else:
+    print(f"Found {response.total_count} integration(s):\n")
+    for integration in response.integrations:
+        print(f"  [{integration.id}] {integration.name}")
+        print(f"    Type:    {integration.type}")
+        print(f"    Status:  {integration.status}")
+        print(f"    Created: {integration.created_at}")
+        print()
+
+# --- List with pagination
+page1 = client.integrations.get_many(page=1, page_size=5)
+if page1:
+    print(f"Page 1: showing {page1.count} of {page1.total_count}")
+
+# --- Get a single integration by ID
+if response and response.integrations:
+    integration_id = response.integrations[0].id
+
+    integration = client.integrations.get(integration_id)
+    if integration:
+        print(f"\nIntegration detail:")
+        print(f"  ID:     {integration.id}")
+        print(f"  Name:   {integration.name}")
+        print(f"  Type:   {integration.type}")
+        print(f"  Status: {integration.status}")
+        print(f"  Config: {integration.config}")
+
+    # --- Test an integration
+    result = client.integrations.test(integration_id)
+    if result:
+        status = "OK" if result.success else "FAILED"
+        print(f"\nTest result: {status}")
+        if result.message:
+            print(f"  Message: {result.message}")
diff --git a/mypy.ini b/mypy.ini
index a5788f9..803803c 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -21,3 +21,12 @@ disallow_subclassing_any = True
 disallow_incomplete_defs = True
 disallow_untyped_decorators = True
 cache_fine_grained = True
+
+[mypy-click]
+ignore_missing_imports = True
+
+[mypy-layerlens.cli.*]
+ignore_missing_imports = True
+disallow_untyped_decorators = False
+disallow_untyped_defs = False
+disallow_any_generics = False
diff --git a/pyproject.toml b/pyproject.toml
index 783372f..fc8baa6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,12 +31,16 @@ classifiers = [
   "Topic :: Software Development :: Libraries :: Python Modules",
 ]
 
+[project.optional-dependencies]
+cli = ["click>=8.0.0"]
+
 [project.urls]
 Homepage = "https://github.com/LayerLens/stratix-python"
 Repository = "https://github.com/LayerLens/stratix-python"
 
 [project.scripts]
 layerlens = "layerlens.cli:main"
+stratix = "layerlens.cli:main"
 
 [tool.rye]
 managed = true
@@ -49,6 +53,7 @@ dev-dependencies = [
   "ruff",
   "build",
   "twine==6.1.0",
+  "click>=8.0.0",
 ]
 
 [tool.rye.scripts]
@@ -132,14 +137,15 @@ known-first-party = ["openai", "tests"]
 "scripts/**.py" = ["T201", "T203"]
 "tests/**.py" = ["T201", "T203"]
 "examples/**.py" = ["T201", "T203"]
-"src/layerlens/cli.py" = ["T201", "T203"]
+"src/layerlens/cli/**" = ["T201", "T203"]
 
 [tool.pyright]
 include = ["src", "tests"]
 exclude = ["**/__pycache__"]
 reportMissingTypeStubs = false
 
-# Less strict settings for tests
+# Less strict settings for tests and cli
 executionEnvironments = [
+  { root = "src/layerlens/cli", reportMissingImports = false, reportFunctionMemberAccess = false, reportCallIssue = false, reportArgumentType = false, reportAttributeAccessIssue = false },
   { root = "tests", reportGeneralTypeIssues = false, reportOptionalSubscript = false, reportOptionalMemberAccess = false, reportUntypedFunctionDecorator = false, reportUnknownArgumentType = false, reportUnknownMemberType = false, reportUnknownVariableType = false, reportUnnecessaryIsInstance = false, reportUnnecessaryComparison = false, reportArgumentType = false, reportCallIssue = false },
 ]
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 2aaa85b..81a18f2 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -21,10 +21,16 @@ certifi==2025.7.14
     # via httpcore
     # via httpx
     # via requests
+cffi==2.0.0
+    # via cryptography
 charset-normalizer==3.4.3
     # via requests
+click==8.1.8
+    # via layerlens
 coverage==7.10.2
     # via pytest-cov
+cryptography==46.0.5
+    # via secretstorage
 docutils==0.22
     # via readme-renderer
 exceptiongroup==1.3.0
@@ -35,7 +41,7 @@ h11==0.16.0
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
-    # via test-atlas-lzok
+    # via layerlens
 id==1.5.0
     # via twine
 idna==3.10
@@ -54,6 +60,9 @@ jaraco-context==6.0.1
     # via keyring
 jaraco-functools==4.2.1
     # via keyring
+jeepney==0.9.0
+    # via keyring
+    # via secretstorage
 keyring==25.6.0
     # via twine
 markdown-it-py==3.0.0
@@ -79,8 +88,10 @@ pathspec==0.12.1
 pluggy==1.6.0
     # via pytest
     # via pytest-cov
+pycparser==2.23
+    # via cffi
 pydantic==2.11.7
-    # via test-atlas-lzok
+    # via layerlens
 pydantic-core==2.33.2
     # via pydantic
 pygments==2.19.2
@@ -97,7 +108,6 @@ readme-renderer==44.0
     # via twine
 requests==2.32.5
     # via id
-    # via layerlens
     # via requests-toolbelt
     # via twine
 requests-toolbelt==1.0.0
@@ -107,6 +117,8 @@ rfc3986==2.0.0
 rich==14.1.0
     # via twine
 ruff==0.12.7
+secretstorage==3.3.3
+    # via keyring
 sniffio==1.3.1
     # via anyio
 tomli==2.2.1
@@ -115,9 +127,9 @@ tomli==2.2.1
     # via mypy
     # via pytest
 twine==6.1.0
-types-requests==2.32.4.20250809
 typing-extensions==4.14.1
     # via anyio
+    # via cryptography
     # via exceptiongroup
     # via mypy
     # via pydantic
@@ -129,6 +141,5 @@ typing-inspection==0.4.1
 urllib3==2.5.0
     # via requests
     # via twine
-    # via types-requests
 zipp==3.23.0
     # via importlib-metadata
diff --git a/requirements.lock b/requirements.lock
index 540f4d6..1a890c9 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -17,9 +17,8 @@ anyio==4.9.0
 certifi==2025.7.14
     # via httpcore
     # via httpx
-    # via requests
-charset-normalizer==3.4.3
-    # via requests
+click==8.1.8
+    # via layerlens
 exceptiongroup==1.3.0
     # via anyio
 h11==0.16.0
@@ -27,17 +26,14 @@ h11==0.16.0
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
-    # via test-atlas-lzok
+    # via layerlens
 idna==3.10
     # via anyio
     # via httpx
-    # via requests
 pydantic==2.11.7
-    # via test-atlas-lzok
+    # via layerlens
 pydantic-core==2.33.2
     # via pydantic
-requests==2.32.5
-    # via atlas
 sniffio==1.3.1
     # via anyio
 typing-extensions==4.14.1
@@ -48,5 +44,3 @@ typing-extensions==4.14.1
     # via typing-inspection
 typing-inspection==0.4.1
     # via pydantic
-urllib3==2.5.0
-    # via requests
diff --git a/src/layerlens/__init__.py b/src/layerlens/__init__.py
index 78a69f9..5c6adf6 100644
--- a/src/layerlens/__init__.py
+++ b/src/layerlens/__init__.py
@@ -1,6 +1,7 @@
 from .models import (
     Judge,
     Trace,
+    Integration,
     JudgeVersion,
     JudgeSnapshot,
     BenchmarkPrompt,
@@ -47,6 +48,7 @@
     "Client",
     "ComparisonResult",
     "ComparisonResponse",
+    "Integration",
     "Judge",
     "JudgeOptimizationRun",
     "JudgeSnapshot",
diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py
index e7688e0..032e15b 100644
--- a/src/layerlens/_client.py
+++ b/src/layerlens/_client.py
@@ -21,8 +21,11 @@
     from .resources.models import Models, AsyncModels
     from .resources.traces import Traces, AsyncTraces
     from .resources.results import Results, AsyncResults
+    from .resources.scorers import Scorers, AsyncScorers
     from .resources.benchmarks import Benchmarks, AsyncBenchmarks
     from .resources.evaluations import Evaluations, AsyncEvaluations
+    from .resources.integrations import Integrations, AsyncIntegrations
+    from .resources.evaluation_spaces import EvaluationSpaces, AsyncEvaluationSpaces
     from .resources.trace_evaluations import TraceEvaluations, AsyncTraceEvaluations
     from .resources.judge_optimizations import JudgeOptimizations, AsyncJudgeOptimizations
 
@@ -124,6 +127,24 @@ def trace_evaluations(self) -> TraceEvaluations:
 
         return TraceEvaluations(self)
 
+    @cached_property
+    def integrations(self) -> Integrations:
+        from .resources.integrations import Integrations
+
+        return Integrations(self)
+
+    @cached_property
+    def scorers(self) -> Scorers:
+        from .resources.scorers import Scorers
+
+        return Scorers(self)
+
+    @cached_property
+    def evaluation_spaces(self) -> EvaluationSpaces:
+        from .resources.evaluation_spaces import EvaluationSpaces
+
+        return EvaluationSpaces(self)
+
     @cached_property
     def public(self) -> PublicClient:
         from ._public_client import PublicClient
@@ -293,6 +314,24 @@ def trace_evaluations(self) -> AsyncTraceEvaluations:
 
         return AsyncTraceEvaluations(self)
 
+    @cached_property
+    def integrations(self) -> AsyncIntegrations:
+        from .resources.integrations import AsyncIntegrations
+
+        return AsyncIntegrations(self)
+
+    @cached_property
+    def scorers(self) -> AsyncScorers:
+        from .resources.scorers import AsyncScorers
+
+        return AsyncScorers(self)
+
+    @cached_property
+    def evaluation_spaces(self) -> AsyncEvaluationSpaces:
+        from .resources.evaluation_spaces import AsyncEvaluationSpaces
+
+        return AsyncEvaluationSpaces(self)
+
     @cached_property
     def public(self) -> AsyncPublicClient:
         from ._public_client import AsyncPublicClient
diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py
index ddcf27c..438b4b5 100644
--- a/src/layerlens/_version.py
+++ b/src/layerlens/_version.py
@@ -1,4 +1,4 @@
-__version__ = "1.4.0"
+__version__ = "1.5.0"
 
 # Will be templated during the build
 __git_commit__ = "__GIT_COMMIT__"
diff --git a/src/layerlens/cli.py b/src/layerlens/cli.py
deleted file mode 100644
index 5c899c7..0000000
--- a/src/layerlens/cli.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from __future__ import annotations
-
-import sys
-
-from ._version import __version__
-
-
-def main() -> None:
-    if len(sys.argv) > 1 and sys.argv[1] in ("--version", "-v"):
-        print(f"layerlens {__version__}")
-        sys.exit(0)
-
-    print(f"layerlens {__version__}")
-    print("See https://layerlens.gitbook.io/stratix-python-sdk for documentation.")
-    print("\nUsage:")
-    print("  layerlens --version   Show version")
diff --git a/src/layerlens/cli/__init__.py b/src/layerlens/cli/__init__.py
new file mode 100644
index 0000000..04234ab
--- /dev/null
+++ b/src/layerlens/cli/__init__.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+import sys
+
+
+def main() -> None:
+    """Console entry point: answer a version request directly, otherwise run the click app."""
+    # Only a leading --version or a lone -v prints the version (no click needed, so it
+    # works without the [cli] extra); "-v" ahead of a command is click's --verbose flag.
+    if sys.argv[1:2] == ["--version"] or sys.argv[1:] == ["-v"]:
+        from .._version import __version__
+        print(f"stratix {__version__}")  # noqa: T201
+        sys.exit(0)
+
+    try:
+        import click  # noqa: F401
+    except ImportError:
+        print(  # noqa: T201
+            "CLI dependencies not installed. Install them with:\n\n  pip install layerlens[cli]\n"
+        )
+        sys.exit(1)
+
+    from ._app import cli
+    cli()
+
+
+__all__ = ["main"]
diff --git a/src/layerlens/cli/_app.py b/src/layerlens/cli/_app.py
new file mode 100644
index 0000000..489d965
--- /dev/null
+++ b/src/layerlens/cli/_app.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import click
+
+from .._version import __version__
+from .commands.ci import ci
+from .commands.bulk import bulk
+from .commands.judge import judge
+from .commands.space import space
+from .commands.trace import trace
+from .commands.scorer import scorer
+from .commands.evaluate import evaluate
+from .commands.integration import integration
+
+
+@click.group()
+@click.option(
+    "--api-key",
+    envvar="LAYERLENS_STRATIX_API_KEY",
+    default=None,
+    help="API key for authentication.",
+)
+@click.option("--host", default=None, help="API host (e.g. api.layerlens.ai).")
+@click.option("--port", default=None, type=int, help="API port.")
+@click.option(
+    "--format",
+    "output_format",
+    type=click.Choice(["table", "json"]),
+    default="table",
+    help="Output format.",
+)
+@click.option("--verbose", "-v", is_flag=True, default=False, help="Enable verbose/debug output.")
+@click.option("--quiet", "-q", is_flag=True, default=False, help="Suppress the startup banner.")
+@click.version_option(version=__version__, prog_name="layerlens")
+@click.pass_context
+def cli(
+    ctx: click.Context,
+    api_key: str | None,
+    host: str | None,
+    port: int | None,
+    output_format: str,
+    verbose: bool,
+    quiet: bool,
+) -> None:
+    """LayerLens Stratix CLI — manage traces, judges, evaluations, integrations, and more."""
+    import sys
+
+    # The banner goes to stderr, and only when stderr is a terminal and --quiet is absent.
+    if not quiet and sys.stderr.isatty():
+        from ._banner import banner
+
+        click.echo(banner(__version__), err=True)
+
+    # Shared settings are stored on ctx.obj for the subcommands.
+    settings = ctx.ensure_object(dict)
+    settings.update(api_key=api_key, output_format=output_format, verbose=verbose)
+
+    # Derive the API base URL from --host/--port: https only when the port is unset
+    # or 443, and ports 80/443 are not written into the URL.
+    if host is None:
+        settings["base_url"] = None
+        return
+
+    scheme = "http" if port not in (None, 443) else "https"
+    authority = f"{host}:{port}" if port and port not in (80, 443) else host
+    settings["base_url"] = f"{scheme}://{authority}/api/v1"
+
+
+# Register the command groups on the root CLI in their original order.
+# Core commands
+_core_commands = (trace, judge, evaluate, integration)
+# Additional commands
+_additional_commands = (scorer, space, bulk, ci)
+
+for _command in _core_commands + _additional_commands:
+    cli.add_command(_command)
+
+# Drop the helper names so the module namespace stays as it was.
+del _core_commands, _additional_commands, _command
+
+
+@cli.command("completion")
+@click.argument("shell", type=click.Choice(["bash", "zsh", "fish", "powershell"]))
+def completion(shell: str) -> None:
+    """Print shell completion setup instructions.
+
+    \b
+    Examples:
+      stratix completion bash
+      stratix completion zsh
+      stratix completion fish
+      stratix completion powershell
+    """
+    # Use the program name click recorded for this invocation instead of the "_"
+    # environment variable, which not every shell exports to child processes.
+    prog = click.get_current_context().find_root().info_name
+    if prog not in ("layerlens", "stratix"):
+        prog = "layerlens"
+    env_var = f"_{prog.upper()}_COMPLETE"
+
+    instructions = {
+        "bash": f'eval "$({env_var}=bash_source {prog})"',
+        "zsh": f'eval "$({env_var}=zsh_source {prog})"',
+        "fish": f"{env_var}=fish_source {prog} | source",
+        # NOTE(review): click documents built-in completion for bash, zsh and fish only;
+        # confirm that something registers a handler for "powershell_source".
+        "powershell": (
+            f"Register-ArgumentCompleter -Native -CommandName {prog} -ScriptBlock {{\n"
+            "    param($wordToComplete, $commandAst, $cursorPosition)\n"
+            f'    $env:{env_var} = "powershell_source"\n'
+            f'    {prog} | ForEach-Object {{ [System.Management.Automation.CompletionResult]::new($_, $_, "ParameterValue", $_) }}\n'
+            f"    Remove-Item Env:{env_var}\n"
+            "}"
+        ),
+    }
+    if shell == "powershell":
+        click.echo(f"Add this to your PowerShell profile:\n\n{instructions[shell]}")
+    else:
+        click.echo(f"Add this to your shell profile:\n\n  {instructions[shell]}")
diff --git a/src/layerlens/cli/_banner.py b/src/layerlens/cli/_banner.py
new file mode 100644
index 0000000..8d01c8a
--- /dev/null
+++ b/src/layerlens/cli/_banner.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+# ANSI color codes
+_CYAN = "\033[38;2;54;191;250m"  # #36BFFA
+_GRAY = "\033[90m"
+_RESET = "\033[0m"
+
+_ART = r"""
+  ____ _____ ____      _  _____ _____  __
+ / ___|_   _|  _ \    / \|_   _|_ _\ \/ /
+ \___ \ | | | |_) |  / _ \ | |  | | \  /
+  ___) || | |  _ <  / ___ \| |  | | /  \
+ |____/ |_| |_| \_\/_/   \_\_| |___/_/\_\
+"""
+
+
+def banner(version: str) -> str:
+    """Build the startup banner: the ASCII art in cyan above a gray version line."""
+    art = _ART.rstrip("\n")
+    tagline = f"  v{version} — layerlens.ai"
+    pieces = (_CYAN, art, _RESET, "\n", _GRAY, tagline, _RESET, "\n")
+    return "".join(pieces)
diff --git a/src/layerlens/cli/_client.py b/src/layerlens/cli/_client.py
new file mode 100644
index 0000000..a727363
--- /dev/null
+++ b/src/layerlens/cli/_client.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import re
+import sys
+import functools
+import traceback as tb
+from typing import Any, Callable
+
+import click
+
+from .._client import Stratix
+from .._exceptions import StratixError, NotFoundError, AuthenticationError
+
+
+def get_client(ctx: click.Context) -> Stratix:
+    """Build a Stratix client from the ``api_key``/``base_url`` stored in ``ctx.obj``.
+
+    If constructing the client raises ``StratixError``, print it to stderr and exit with status 1.
+    """
+    try:
+        return Stratix(api_key=ctx.obj.get("api_key"), base_url=ctx.obj.get("base_url"))
+    except StratixError as exc:
+        click.echo(f"Error: {exc}", err=True)
+        sys.exit(1)
+
+
+def handle_errors(fn: Callable[..., Any]) -> Callable[..., Any]:
+    """Decorator that turns SDK and unexpected errors into a stderr message and exit status 1."""
+
+    @functools.wraps(fn)
+    @click.pass_context
+    def wrapper(ctx: click.Context, *args: Any, **kwargs: Any) -> Any:
+        try:
+            return ctx.invoke(fn, *args, **kwargs)
+        except AuthenticationError:
+            click.echo("Error: Invalid or missing API key.", err=True)
+            sys.exit(1)
+        except NotFoundError as e:
+            click.echo(f"Error: Resource not found. {e}", err=True)
+            sys.exit(1)
+        except StratixError as e:
+            click.echo(f"Error: {e}", err=True)
+            sys.exit(1)
+        except (click.exceptions.Exit, click.exceptions.Abort, click.exceptions.ClickException):
+            raise  # click's own exit/abort/usage errors keep click's normal handling
+        except Exception as e:
+            if (ctx.obj or {}).get("verbose"):  # ctx.obj is None when no parent set it
+                tb.print_exc()
+            click.echo(f"Unexpected error: {e}", err=True)
+            sys.exit(1)
+
+    return wrapper
+
+
+_UUID_RE = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE)
+
+
+def _is_uuid(value: str) -> bool:
+    return _UUID_RE.match(value) is not None  # 8-4-4-4-12 hex groups, any letter case
+
+
+def resolve_model(client: Stratix, identifier: str) -> Any:
+    """Find a model from an ID, key, or name, trying the lookups in that order.
+
+    The ID lookup only runs for UUID-shaped identifiers. The first lookup that
+    yields a result wins; a name search returns its first match. None if all miss.
+    """
+    models = client.models
+
+    # ID lookup, only attempted for UUID-shaped input
+    by_id = models.get_by_id(identifier) if _is_uuid(identifier) else None
+    if by_id:
+        return by_id
+
+    by_key = models.get_by_key(identifier)
+    if by_key:
+        return by_key
+
+    by_name = models.get(name=identifier)
+    return by_name[0] if by_name else None
+
+
+def resolve_benchmark(client: Stratix, identifier: str) -> Any:
+    """Find a benchmark from an ID, key, or name, trying the lookups in that order.
+
+    The ID lookup only runs for UUID-shaped identifiers. The first lookup that
+    yields a result wins; a name search returns its first match. None if all miss.
+    """
+    benchmarks = client.benchmarks
+
+    # ID lookup, only attempted for UUID-shaped input
+    by_id = benchmarks.get_by_id(identifier) if _is_uuid(identifier) else None
+    if by_id:
+        return by_id
+
+    by_key = benchmarks.get_by_key(identifier)
+    if by_key:
+        return by_key
+
+    by_name = benchmarks.get(name=identifier)
+    return by_name[0] if by_name else None
diff --git a/src/layerlens/cli/_completions.py b/src/layerlens/cli/_completions.py
new file mode 100644
index 0000000..909df23
--- /dev/null
+++ b/src/layerlens/cli/_completions.py
@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+from typing import Any, List
+
+import click
+
+
+def _get_client_silent(ctx: click.Context) -> Any:
+    """Try to create a Stratix client for autocompletion, returning None on failure."""
+    try:
+        from .._client import Stratix
+
+        # --api-key/--host/--port are declared on the root group, so they live in the
+        # root context's params, not in those of the subcommand being completed.
+        opts = ctx.find_root().params
+        host, port = opts.get("host"), opts.get("port")
+        base_url = None
+        if host:
+            scheme = "https" if port in (None, 443) else "http"
+            if port and port not in (80, 443):
+                base_url = f"{scheme}://{host}:{port}/api/v1"
+            else:
+                base_url = f"{scheme}://{host}/api/v1"
+        return Stratix(api_key=opts.get("api_key") or None, base_url=base_url)
+    except Exception:
+        return None  # completion must stay silent, so every failure means "no client"
+
+
+def complete_trace(
+    ctx: click.Context, _param: click.Parameter, incomplete: str
+) -> List[click.shell_completion.CompletionItem]:
+    """Suggest trace IDs that start with the text typed so far.
+
+    Best effort: with no usable client or on any lookup error, nothing is suggested.
+    """
+    matches: List[click.shell_completion.CompletionItem] = []
+    api = _get_client_silent(ctx)
+    if api:
+        try:
+            page = api.traces.get_many(search=incomplete or None, page_size=20)
+            for trace in (page.traces if page else None) or ():
+                if trace.id.startswith(incomplete):
+                    matches.append(click.shell_completion.CompletionItem(trace.id, help=trace.filename))
+        except Exception:
+            matches = []
+    return matches
+
+
+def complete_judge(
+    ctx: click.Context, _param: click.Parameter, incomplete: str
+) -> List[click.shell_completion.CompletionItem]:
+    """Suggest judge IDs whose ID or (case-insensitive) name starts with the typed text.
+
+    Best effort: with no usable client or on any lookup error, nothing is suggested.
+    """
+    matches: List[click.shell_completion.CompletionItem] = []
+    api = _get_client_silent(ctx)
+    if api:
+        try:
+            page = api.judges.get_many(page_size=50)
+            for judge in (page.judges if page else None) or ():
+                if judge.id.startswith(incomplete) or judge.name.lower().startswith(incomplete.lower()):
+                    matches.append(click.shell_completion.CompletionItem(judge.id, help=judge.name))
+        except Exception:
+            matches = []
+    return matches
+
+
+def complete_model(
+    ctx: click.Context, _param: click.Parameter, incomplete: str
+) -> List[click.shell_completion.CompletionItem]:
+    """Suggest model keys for models whose ID, key, or name starts with the typed text.
+
+    Best effort: with no usable client or on any lookup error, nothing is suggested.
+    """
+    api = _get_client_silent(ctx)
+    if not api:
+        return []
+    # Keys and names are compared case-insensitively; IDs exactly as typed.
+    needle = incomplete.lower()
+    try:
+        return [
+            click.shell_completion.CompletionItem(model.key, help=model.name)
+            for model in api.models.get() or ()
+            if model.id.startswith(incomplete)
+            or model.key.lower().startswith(needle)
+            or model.name.lower().startswith(needle)
+        ]
+    except Exception:
+        return []
+
+
+def complete_benchmark(
+    ctx: click.Context, _param: click.Parameter, incomplete: str
+) -> List[click.shell_completion.CompletionItem]:
+    """Suggest benchmark keys for benchmarks whose ID, key, or name starts with the typed text.
+
+    Best effort: with no usable client or on any lookup error, nothing is suggested.
+    """
+    api = _get_client_silent(ctx)
+    if not api:
+        return []
+    # Keys and names are compared case-insensitively; IDs exactly as typed.
+    needle = incomplete.lower()
+    try:
+        return [
+            click.shell_completion.CompletionItem(benchmark.key, help=benchmark.name)
+            for benchmark in api.benchmarks.get() or ()
+            if benchmark.id.startswith(incomplete)
+            or benchmark.key.lower().startswith(needle)
+            or benchmark.name.lower().startswith(needle)
+        ]
+    except Exception:
+        return []
+
+
+def complete_evaluation(
+    ctx: click.Context, _param: click.Parameter, incomplete: str
+) -> List[click.shell_completion.CompletionItem]:
+    """Suggest evaluation IDs that start with the text typed so far.
+
+    Each suggestion's help text is "<model name> / <benchmark name>". Best effort:
+    with no usable client or on any lookup error, nothing is suggested.
+    """
+    matches: List[click.shell_completion.CompletionItem] = []
+    api = _get_client_silent(ctx)
+    if api:
+        try:
+            page = api.evaluations.get_many(page_size=20)
+            for ev in (page.evaluations if page else None) or ():
+                if ev.id.startswith(incomplete):
+                    # "?" stands in for either name when the attribute is absent.
+                    label = f"{getattr(ev, 'model_name', '?')} / {getattr(ev, 'benchmark_name', '?')}"
+                    matches.append(click.shell_completion.CompletionItem(ev.id, help=label))
+        except Exception:
+            matches = []
+    return matches
+
+
+def complete_integration(
+    ctx: click.Context, _param: click.Parameter, incomplete: str
+) -> List[click.shell_completion.CompletionItem]:
+    """Suggest integration IDs whose ID or (case-insensitive) name starts with the typed text.
+
+    Best effort: with no usable client or on any lookup error, nothing is suggested.
+    """
+    matches: List[click.shell_completion.CompletionItem] = []
+    api = _get_client_silent(ctx)
+    if api:
+        try:
+            page = api.integrations.get_many(page_size=50)
+            for item in (page.integrations if page else None) or ():
+                if item.id.startswith(incomplete) or item.name.lower().startswith(incomplete.lower()):
+                    matches.append(click.shell_completion.CompletionItem(item.id, help=item.name))
+        except Exception:
+            matches = []
+    return matches
diff --git a/src/layerlens/cli/_formatter.py b/src/layerlens/cli/_formatter.py
new file mode 100644
index 0000000..46423e4
--- /dev/null
+++ b/src/layerlens/cli/_formatter.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List, Tuple, Optional
+
+
+def to_dict(obj: Any) -> Any:
+    """Return ``obj`` dumped to a dict when it is a Pydantic model, otherwise ``obj`` itself.
+
+    ``model_dump()`` (Pydantic v2) is preferred over ``dict()`` (v1); plain dicts pass through.
+    """
+    for dumper in ("model_dump", "dict"):
+        if hasattr(obj, dumper):
+            return getattr(obj, dumper)()
+    return obj
+
+
+def format_table(items: List[Any], columns: List[Tuple[str, str]], max_col_width: int = 40) -> str:
+    """Lay out ``items`` as a plain-text table with one column per entry in ``columns``.
+
+    Columns are joined with two spaces. Cell values wider than their column are
+    truncated; header labels are padded but never shortened.
+
+    Args:
+        items: Pydantic models or dicts, one per table row.
+        columns: ``(field_key, header_label)`` pairs, in display order.
+        max_col_width: Upper bound on the width of any column.
+
+    Returns:
+        The header line, a dashed separator and one line per item, or
+        ``"No results found."`` when ``items`` is empty.
+    """
+    if not items:
+        return "No results found."
+
+    # Render every cell up front so the column widths can be measured.
+    table: List[Dict[str, str]] = []
+    for item in items:
+        record = item if isinstance(item, dict) else to_dict(item)
+        table.append({key: _format_value(record.get(key)) for key, _ in columns})
+
+    # A column is as wide as its header or longest cell, capped at max_col_width.
+    widths: Dict[str, int] = {}
+    for key, header in columns:
+        longest_cell = max(len(cells[key]) for cells in table)
+        widths[key] = min(max(len(header), longest_cell), max_col_width)
+
+    # Header labels padded to width, then a dashed rule of the same widths.
+    header_line = "  ".join(header.ljust(widths[key]) for key, header in columns)
+    separator = "  ".join("-" * widths[key] for key, _ in columns)
+    # Data rows: each cell truncated, then padded, to its column width.
+    body = [
+        "  ".join(_truncate(cells[key], widths[key]).ljust(widths[key]) for key, _ in columns)
+        for cells in table
+    ]
+
+    return "\n".join([header_line, separator, *body])
+
+
+def format_output(data: Any, output_format: str, columns: Optional[List[Tuple[str, str]]] = None) -> str:
+    """Render ``data`` for display in the requested output format.
+
+    Args:
+        data: A list of items, a single item, or a dict.
+        output_format: ``"json"`` for JSON; any other value selects the table layout.
+        columns: ``(field_key, header_label)`` pairs used when ``data`` is a list.
+
+    Returns:
+        JSON text, a table, or key/value lines for a single item.
+    """
+    wants_table = output_format != "json"
+    # Single item
+    if wants_table and not isinstance(data, list):
+        return format_single(data)
+
+    # List with a column spec
+    if wants_table and columns:
+        return format_table(data, columns)
+
+    # JSON was requested, or a list arrived without columns to lay it out.
+    return _format_json(data)
+
+
+def format_single(item: Any) -> str:
+    """Render one item as aligned ``Label value`` lines; items that do not convert to a dict use ``str()``."""
+    record = item if isinstance(item, dict) else to_dict(item)
+    if not isinstance(record, dict):
+        return str(record)
+
+    # Labels are left-aligned in a field four columns wider than the longest raw key.
+    label_width = max((len(name) for name in record), default=0) + 4
+    return "\n".join(
+        f"{name.replace('_', ' ').title():<{label_width}} {_format_value(value)}"
+        for name, value in record.items()
+    )
+
+
+def _format_json(data: Any) -> str:
+    """Serialize ``data`` (or each element of a list) via ``to_dict`` as 2-space-indented JSON."""
+    payload = list(map(to_dict, data)) if isinstance(data, list) else to_dict(data)
+    # default=str stringifies any value json cannot encode natively.
+    return json.dumps(payload, indent=2, default=str)
+
+
+def _format_value(val: Any) -> str:
+    """Turn a cell value into display text: "-" for None, Yes/No, 4-decimal floats, JSON for containers."""
+    if val is None:
+        return "-"
+    if isinstance(val, (dict, list)):
+        return json.dumps(val, default=str)
+    if isinstance(val, float):
+        return f"{val:.4f}"
+    # Only real booleans become Yes/No; ints (including 0 and 1) and everything
+    # else fall through to str().
+    return ("Yes" if val else "No") if isinstance(val, bool) else str(val)
+
+
+def _truncate(s: str, width: int) -> str:
+    """Cut ``s`` to at most ``width`` characters, ending a cut with an ellipsis ("" when width <= 0)."""
+    if len(s) <= width:
+        return s
+    return (s[: width - 1] + "\u2026") if width > 0 else ""
diff --git a/src/layerlens/cli/commands/__init__.py b/src/layerlens/cli/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/layerlens/cli/commands/bulk.py b/src/layerlens/cli/commands/bulk.py
new file mode 100644
index 0000000..ea9b704
--- /dev/null
+++ b/src/layerlens/cli/commands/bulk.py
@@ -0,0 +1,183 @@
+from __future__ import annotations
+
+import sys
+import json
+
+import click
+
+from .._client import get_client, handle_errors, resolve_model, resolve_benchmark
+from .._formatter import format_output
+
+EVALUATION_COLUMNS = [  # (field_key, header_label) pairs; NOTE(review): nothing in this module reads it
+    ("id", "ID"),
+    ("status", "Status"),
+    ("model_name", "Model"),
+    ("benchmark_name", "Benchmark"),
+]
+
+
+@click.group()
+def bulk() -> None:  # the examples only use flag combinations that bulk_eval accepts
+    """Bulk operations.
+
+    \b
+    Examples:
+      stratix bulk eval --file jobs.jsonl
+      stratix bulk eval --judge-id <id> --traces trace_ids.txt
+    """
+
+
+@bulk.command("eval")
+@click.option(
+    "--file",
+    "file_path",
+    type=click.Path(exists=True),
+    help='JSONL file with evaluation jobs (each line: {"model": ..., "benchmark": ...}).',
+)
+@click.option("--model", "model_id", default=None, help="Model ID/name (use with --benchmark).")
+@click.option("--benchmark", "benchmark_id", default=None, help="Benchmark ID/name (use with --model).")
+@click.option("--judge-id", default=None, help="Judge ID (use with --traces).")
+@click.option(
+    "--traces", "traces_file", type=click.Path(exists=True), default=None, help="File with trace IDs (one per line)."
+)
+@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
+@click.option("--wait", is_flag=True, default=False, help="Wait for all evaluations to complete.")
+@click.pass_context
+@handle_errors
+def bulk_eval(
+    ctx: click.Context,
+    file_path: str | None,
+    model_id: str | None,
+    benchmark_id: str | None,
+    judge_id: str | None,
+    traces_file: str | None,
+    dry_run: bool,
+    wait: bool,
+) -> None:
+    """Run evaluations in bulk from a jobs file, a model/benchmark pair, or a trace-ID file.
+
+    \b
+    Three modes:
+    1. JSONL file: each line is {"model": "<id>", "benchmark": "<id>"}
+    2. Model + benchmark: run a single evaluation (optionally --wait)
+    3. Judge + traces file: evaluate many traces with a judge
+
+    \b
+    Examples:
+      stratix bulk eval --file jobs.jsonl
+      stratix bulk eval --file jobs.jsonl --dry-run
+      stratix bulk eval --model gpt-4 --benchmark mmlu --wait
+      stratix bulk eval --judge-id <id> --traces trace_ids.txt
+    """
+    client = get_client(ctx)
+
+    if file_path:  # Mode 1: JSONL jobs file; unusable lines are reported and skipped
+        jobs = []
+        with open(file_path, encoding="utf-8") as f:
+            for line in filter(None, map(str.strip, f)):
+                try:
+                    job = json.loads(line)
+                except json.JSONDecodeError:
+                    click.echo(f"Skipping invalid JSON line: {line}", err=True)
+                    continue
+                if isinstance(job, dict) and all(isinstance(job.get(k), str) for k in ("model", "benchmark")):
+                    jobs.append(job)
+                else:
+                    click.echo(f"Skipping job without model/benchmark: {line}", err=True)
+
+        if not jobs:
+            click.echo("No valid jobs found in file.", err=True)
+            sys.exit(1)
+
+        if dry_run:
+            click.echo(f"[dry-run] Would create {len(jobs)} evaluation(s):")
+            for job in jobs:
+                click.echo(f"  model={job.get('model')} benchmark={job.get('benchmark')}")
+            return
+
+        click.echo(f"Creating {len(jobs)} evaluation(s)...")
+        evaluations = []
+        for i, job in enumerate(jobs, 1):
+            m = resolve_model(client, job["model"])
+            b = resolve_benchmark(client, job["benchmark"])
+            if m is None or b is None:
+                click.echo(f"  [{i}] SKIP - model={job['model']} or benchmark={job['benchmark']} not found", err=True)
+                continue
+
+            ev = client.evaluations.create(model=m, benchmark=b)
+            if ev:
+                click.echo(f"  [{i}] Created: {ev.id}")
+                evaluations.append(ev)
+            else:
+                click.echo(f"  [{i}] FAIL", err=True)
+
+        click.echo(f"\n{len(evaluations)} evaluation(s) created.")
+
+        if wait and evaluations:
+            click.echo("Waiting for completion...")
+            for ev in evaluations:
+                result = client.evaluations.wait_for_completion(ev)
+                if result:
+                    click.echo(f"  {result.id}: {result.status}")
+
+    elif model_id and benchmark_id:  # Mode 2: a single model x benchmark evaluation
+        if traces_file:  # rejected up front, before any API lookup is made
+            click.echo("Error: --traces requires --judge-id, not --model/--benchmark.", err=True)
+            sys.exit(1)
+
+        model = resolve_model(client, model_id)
+        if model is None:
+            click.echo(f"Model not found: {model_id}", err=True)
+            sys.exit(1)
+        benchmark = resolve_benchmark(client, benchmark_id)
+        if benchmark is None:
+            click.echo(f"Benchmark not found: {benchmark_id}", err=True)
+            sys.exit(1)
+
+        if dry_run:
+            click.echo(f"[dry-run] Would create evaluation: {model.name} x {benchmark.name}")
+            return
+
+        click.echo(f"Creating evaluation: {model.name} x {benchmark.name}")
+        ev = client.evaluations.create(model=model, benchmark=benchmark)
+        if ev is None:
+            click.echo("Failed to create evaluation.", err=True)
+            sys.exit(1)
+
+        click.echo(f"Evaluation created: {ev.id}")
+        if wait:
+            click.echo("Waiting for completion...")
+            finished = client.evaluations.wait_for_completion(ev)
+            if finished:
+                click.echo(f"Evaluation finished: {finished.status}")
+            ev = finished or ev  # keep the created evaluation so the output is never "None"
+
+        click.echo(format_output(ev, ctx.obj["output_format"]))
+    elif judge_id and traces_file:  # Mode 3: judge + traces file
+        with open(traces_file, encoding="utf-8") as f:
+            trace_ids = [line.strip() for line in f if line.strip()]
+
+        if not trace_ids:
+            click.echo("No trace IDs found in file.", err=True)
+            sys.exit(1)
+
+        if dry_run:
+            click.echo(f"[dry-run] Would create {len(trace_ids)} trace evaluation(s) with judge {judge_id}:")
+            for tid in trace_ids:
+                click.echo(f"  trace={tid}")
+            return
+
+        click.echo(f"Creating {len(trace_ids)} trace evaluation(s) with judge {judge_id}...")
+        results = []
+        for i, trace_id in enumerate(trace_ids, 1):
+            te = client.trace_evaluations.create(trace_id=trace_id, judge_id=judge_id)
+            if te:
+                click.echo(f"  [{i}] Created: {te.id} (trace={trace_id})")
+                results.append(te)
+            else:
+                click.echo(f"  [{i}] FAIL (trace={trace_id})", err=True)
+
+        click.echo(f"\n{len(results)} trace evaluation(s) created.")
+    else:
+        click.echo("Provide --file, --model + --benchmark, or --judge-id + --traces.", err=True)
+        sys.exit(1)
diff --git a/src/layerlens/cli/commands/ci.py b/src/layerlens/cli/commands/ci.py
new file mode 100644
index 0000000..eaec4e4
--- /dev/null
+++ b/src/layerlens/cli/commands/ci.py
@@ -0,0 +1,126 @@
+from __future__ import annotations
+
+import sys
+import json
+from datetime import datetime
+
+import click
+
+from .._client import get_client, handle_errors
+from .._formatter import to_dict
+
+
+@click.group()
+def ci() -> None:  # --format is a root-group option, so the example places it before "ci"
+    """CI/CD pipeline helpers.
+
+    \b
+    Examples:
+      stratix ci report
+      stratix --format json ci report
+      stratix ci report --output summary.md
+    """
+
+
+@ci.command("report")
+@click.option(
+    "--output",
+    "-o",
+    "output_file",
+    default=None,
+    type=click.Path(),
+    help="Output file path.",
+)
+@click.option("--limit", default=10, type=int, help="Number of recent evaluations to include.")
+@click.option("--dry-run", is_flag=True, default=False, help="Preview without fetching data.")
+@click.pass_context
+@handle_errors
+def ci_report(ctx: click.Context, output_file: str | None, limit: int, dry_run: bool) -> None:
+    """Generate a CI summary report.
+
+    Outputs a markdown report suitable for GitHub Actions job summaries.
+
+    \b
+    Examples:
+      stratix ci report
+      stratix ci report --output summary.md
+      stratix --format json ci report --limit 20
+      stratix ci report >> $GITHUB_STEP_SUMMARY
+    """
+    if dry_run:
+        click.echo("[dry-run] Would generate CI report")
+        return
+
+    client = get_client(ctx)
+    evals_resp = client.evaluations.get_many(page_size=limit, sort_by="submitted_at", order="desc")
+    if evals_resp is None or not evals_resp.evaluations:
+        click.echo("No evaluations found for report.", err=True)
+        sys.exit(1)
+
+    evaluations = evals_resp.evaluations
+
+    if ctx.obj["output_format"] == "json":
+        from datetime import timezone  # utcnow() is deprecated; emit an explicit UTC offset instead
+        report = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "total_evaluations": len(evaluations),
+            "evaluations": [to_dict(e) for e in evaluations],
+            "summary": {
+                "passed": sum(1 for e in evaluations if e.status == "success"),
+                "failed": sum(1 for e in evaluations if e.status == "failure"),
+                "pending": sum(1 for e in evaluations if e.status in ("pending", "in-progress")),
+            },
+        }
+        content = json.dumps(report, indent=2, default=str)
+    else:
+        content = _build_markdown_report(evaluations)
+
+    if output_file:
+        with open(output_file, "w", encoding="utf-8") as f:  # fixed encoding, not the locale default
+            f.write(content)
+        click.echo(f"Report written to {output_file}")
+    else:
+        click.echo(content)
+
+
+def _build_markdown_report(evaluations: list) -> str:
+    """Render evaluations as a markdown report: a summary table, then one row per evaluation."""
+    from datetime import timezone
+
+    generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
+    passed = sum(1 for e in evaluations if e.status == "success")
+    failed = sum(1 for e in evaluations if e.status == "failure")
+    pending = sum(1 for e in evaluations if e.status in ("pending", "in-progress"))
+
+    lines = [
+        "# Stratix Evaluation Report",
+        "",
+        f"Generated: {generated}",
+        "",
+        "## Summary",
+        "",
+        "| Metric | Count |",
+        "|--------|-------|",
+        f"| Total | {len(evaluations)} |",
+        f"| Passed | {passed} |",
+        f"| Failed | {failed} |",
+        f"| Pending | {pending} |",
+        "",
+        "## Evaluations",
+        "",
+        "| ID | Model | Benchmark | Status | Accuracy |",
+        "|-----|-------|-----------|--------|----------|",
+    ]
+
+    for e in evaluations:
+        eid = str(getattr(e, "id", None) or "-")[:12]  # a missing or None id renders as "-"
+        model = getattr(e, "model_name", "-") or "-"
+        benchmark = getattr(e, "benchmark_name", "-") or "-"
+        status = getattr(e, "status", "-") or "-"
+        accuracy = getattr(e, "accuracy", None)
+        acc_str = f"{accuracy:.2%}" if accuracy is not None else "-"
+        emoji = "+" if status == "success" else ("-" if status == "failure" else " ")
+        lines.append(f"| `{eid}` | {model} | {benchmark} | {emoji} {status} | {acc_str} |")
+
+    lines.extend(["", "---", "*Generated by `stratix ci report`*"])
+    return "\n".join(lines)
diff --git a/src/layerlens/cli/commands/evaluate.py b/src/layerlens/cli/commands/evaluate.py
new file mode 100644
index 0000000..6371590
--- /dev/null
+++ b/src/layerlens/cli/commands/evaluate.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import sys
+
+import click
+
+from .._client import get_client, handle_errors, resolve_model, resolve_benchmark
+from .._formatter import format_output
+from .._completions import complete_model, complete_benchmark, complete_evaluation
+
+EVALUATION_COLUMNS = [
+    ("id", "ID"),
+    ("status", "Status"),
+    ("model_name", "Model"),
+    ("benchmark_name", "Benchmark"),
+    ("accuracy", "Accuracy"),
+    ("submitted_at", "Submitted"),
+]
+
+
+@click.group()
+def evaluate() -> None:
+    """Manage evaluations.
+
+    \b
+    Examples:
+      stratix evaluate list
+      stratix evaluate get <evaluation-id>
+      stratix evaluate run --model gpt-4 --benchmark mmlu --wait
+    """
+
+
+@evaluate.command("list")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.option("--status", default=None, help="Filter by status (pending, in-progress, success, failure).")
+@click.option(
+    "--sort-by", default=None, type=click.Choice(["submitted_at", "accuracy", "average_duration"]), help="Sort field."
+)
+@click.option("--order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.")
+@click.pass_context
+@handle_errors
+def list_evaluations(
+    ctx: click.Context,
+    page: int | None,
+    page_size: int | None,
+    status: str | None,
+    sort_by: str | None,
+    order: str | None,
+) -> None:
+    """List evaluations with optional filtering and pagination.
+
+    \b
+    Examples:
+      stratix evaluate list
+      stratix evaluate list --status success --sort-by accuracy --order desc
+      stratix evaluate list --page-size 5
+    """
+    from ...models import EvaluationStatus
+
+    client = get_client(ctx)
+
+    eval_status = None
+    if status:
+        try:
+            eval_status = EvaluationStatus(status)
+        except ValueError:
+            valid = ", ".join(s.value for s in EvaluationStatus)
+            click.echo(f"Invalid status: {status}. Valid: {valid}", err=True)
+            sys.exit(1)
+
+    listing = client.evaluations.get_many(
+        page=page,
+        page_size=page_size,
+        status=eval_status,
+        sort_by=sort_by,  # type: ignore[arg-type]
+        order=order,  # type: ignore[arg-type]
+    )
+    if listing is None or not listing.evaluations:
+        click.echo("No evaluations found.")
+        return
+
+    if ctx.obj["verbose"]:
+        paging = listing.pagination
+        summary = f"Showing page {paging.page} of {paging.total_pages} ({paging.total_count} total)"
+        click.echo(summary, err=True)
+
+    # Rows are rendered with the module's column layout in the selected output format.
+    click.echo(format_output(listing.evaluations, ctx.obj["output_format"], EVALUATION_COLUMNS))
+
+
+@evaluate.command("get")
+@click.argument("id", shell_complete=complete_evaluation)
+@click.pass_context
+@handle_errors
+def get_evaluation(ctx: click.Context, id: str) -> None:
+    """Get an evaluation by ID.
+
+    \b
+    Examples:
+      stratix evaluate get abc123
+      stratix evaluate get abc123 --format json
+    """
+    found = get_client(ctx).evaluations.get_by_id(id)
+    if found is None:
+        click.echo(f"Evaluation {id} not found.", err=True)
+        sys.exit(1)
+
+    # A single object is formatted without a column list, unlike the list command.
+    rendered = format_output(found, ctx.obj["output_format"])
+    click.echo(rendered)
+
+
+@evaluate.command("run")
+@click.option("--model", "model_id", required=True, shell_complete=complete_model, help="Model ID, key, or name.")
+@click.option(
+    "--benchmark", "benchmark_id", required=True, shell_complete=complete_benchmark, help="Benchmark ID, key, or name."
+)
+@click.option("--wait", is_flag=True, default=False, help="Wait for evaluation to complete.")
+@click.pass_context
+@handle_errors
+def run_evaluation(ctx: click.Context, model_id: str, benchmark_id: str, wait: bool) -> None:
+    """Run an evaluation with a model and benchmark.
+
+    The --model and --benchmark options accept an ID, key, or name.
+
+    \b
+    Examples:
+      stratix evaluate run --model gpt-4 --benchmark mmlu
+      stratix evaluate run --model abc123-uuid --benchmark def456-uuid --wait
+      stratix evaluate run --model "GPT-4" --benchmark "MMLU" --wait --format json
+    """
+    client = get_client(ctx)
+
+    if ctx.obj["verbose"]:
+        click.echo(f"Resolving model: {model_id}", err=True)
+
+    model = resolve_model(client, model_id)
+    if model is None:
+        click.echo(f"Model not found: {model_id}", err=True)
+        sys.exit(1)
+
+    if ctx.obj["verbose"]:
+        click.echo(f"Resolved model: {model.name} ({model.id})", err=True)
+        click.echo(f"Resolving benchmark: {benchmark_id}", err=True)
+
+    benchmark = resolve_benchmark(client, benchmark_id)
+    if benchmark is None:
+        click.echo(f"Benchmark not found: {benchmark_id}", err=True)
+        sys.exit(1)
+
+    if ctx.obj["verbose"]:
+        click.echo(f"Resolved benchmark: {benchmark.name} ({benchmark.id})", err=True)
+
+    # Progress goes to stderr: stdout must hold only the result so `--format json` stays parseable.
+    click.echo(f"Creating evaluation: {model.name} x {benchmark.name}", err=True)
+
+    evaluation = client.evaluations.create(model=model, benchmark=benchmark)
+    if evaluation is None:
+        click.echo("Failed to create evaluation.", err=True)
+        sys.exit(1)
+
+    click.echo(f"Evaluation created: {evaluation.id} (status: {evaluation.status})", err=True)
+
+    if wait:
+        click.echo("Waiting for completion...", err=True)
+        evaluation = client.evaluations.wait_for_completion(evaluation)
+        if evaluation is None:
+            click.echo("Evaluation disappeared while waiting.", err=True)
+            sys.exit(1)
+        click.echo(f"Evaluation finished: {evaluation.status}", err=True)
+
+    click.echo(format_output(evaluation, ctx.obj["output_format"]))
diff --git a/src/layerlens/cli/commands/integration.py b/src/layerlens/cli/commands/integration.py
new file mode 100644
index 0000000..4675ffe
--- /dev/null
+++ b/src/layerlens/cli/commands/integration.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import sys
+
+import click
+
+from .._client import get_client, handle_errors
+from .._formatter import format_output
+from .._completions import complete_integration
+
+INTEGRATION_COLUMNS = [
+    ("id", "ID"),
+    ("name", "Name"),
+    ("type", "Type"),
+    ("status", "Status"),
+    ("created_at", "Created"),
+]
+
+
+@click.group()
+def integration() -> None:
+    """Manage integrations.
+
+    \b
+    Examples:
+      stratix integration list
+      stratix integration test <integration-id>
+    """
+
+
+@integration.command("list")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.pass_context
+@handle_errors
+def list_integrations(ctx: click.Context, page: int | None, page_size: int | None) -> None:
+    """List integrations with optional pagination.
+
+    \b
+    Examples:
+      stratix integration list
+      stratix integration list --page-size 10
+    """
+    listing = get_client(ctx).integrations.get_many(page=page, page_size=page_size)
+    if listing is None or not listing.integrations:
+        click.echo("No integrations found.")
+        return
+
+    # Paging details are diagnostics: shown only in verbose mode and written to stderr.
+    if ctx.obj["verbose"]:
+        click.echo(f"Showing {listing.count} of {listing.total_count} integrations", err=True)
+
+    rendered = format_output(listing.integrations, ctx.obj["output_format"], INTEGRATION_COLUMNS)
+    click.echo(rendered)
+
+
+@integration.command("test")
+@click.argument("id", shell_complete=complete_integration)
+@click.pass_context
+@handle_errors
+def test_integration(ctx: click.Context, id: str) -> None:
+    """Test an integration by ID.
+
+    \b
+    Examples:
+      stratix integration test abc123
+      stratix integration test abc123 --format json
+    """
+    client = get_client(ctx)
+    result = client.integrations.test(id)
+    if result is None:
+        click.echo(f"Failed to test integration {id}.", err=True)
+        sys.exit(1)
+
+    # With --format json, stdout must hold only the JSON document so it can be piped to a
+    # parser; the human-readable summary is therefore redirected to stderr in that mode.
+    as_json = ctx.obj["output_format"] == "json"
+    verdict = "OK" if result.success else "FAILED"
+    click.echo(f"Integration {id}: {verdict}", err=as_json)
+    if result.message:
+        click.echo(f"Message: {result.message}", err=as_json)
+
+    # NOTE(review): a FAILED test still exits 0 here; consider a non-zero exit code for CI.
+    if as_json:
+        click.echo(format_output(result, "json"))
diff --git a/src/layerlens/cli/commands/judge.py b/src/layerlens/cli/commands/judge.py
new file mode 100644
index 0000000..16f222a
--- /dev/null
+++ b/src/layerlens/cli/commands/judge.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import sys
+
+import click
+
+from .._client import get_client, handle_errors
+from .._formatter import format_output
+from .._completions import complete_judge, complete_model, complete_trace
+
+JUDGE_COLUMNS = [
+    ("id", "ID"),
+    ("name", "Name"),
+    ("version", "Version"),
+    ("run_count", "Runs"),
+    ("created_at", "Created"),
+]
+
+
+@click.group()
+def judge() -> None:
+    """Manage judges.
+
+    \b
+    Examples:
+      stratix judge list
+      stratix judge get <judge-id>
+      stratix judge create --name "Quality" --goal "Evaluate response quality"
+      stratix judge test --judge-id <id> --trace-id <id>
+    """
+
+
+@judge.command("list")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.pass_context
+@handle_errors
+def list_judges(ctx: click.Context, page: int | None, page_size: int | None) -> None:
+    """List judges with optional pagination.
+
+    \b
+    Examples:
+      stratix judge list
+      stratix judge list --page-size 5
+    """
+    listing = get_client(ctx).judges.get_many(page=page, page_size=page_size)
+    if listing is None or not listing.judges:
+        click.echo("No judges found.")
+        return
+
+    # The count line is a diagnostic: verbose mode only, and on stderr rather than stdout.
+    if ctx.obj["verbose"]:
+        click.echo(f"Showing {listing.count} of {listing.total_count} judges", err=True)
+
+    rendered = format_output(listing.judges, ctx.obj["output_format"], JUDGE_COLUMNS)
+    click.echo(rendered)
+
+
+@judge.command("get")
+@click.argument("id", shell_complete=complete_judge)
+@click.pass_context
+@handle_errors
+def get_judge(ctx: click.Context, id: str) -> None:
+    """Get a judge by ID.
+
+    \b
+    Examples:
+      stratix judge get abc123
+      stratix judge get abc123 --format json
+    """
+    found = get_client(ctx).judges.get(id)
+    if found is None:
+        click.echo(f"Judge {id} not found.", err=True)
+        sys.exit(1)
+
+    # A single judge is formatted without a column list, unlike the list command.
+    rendered = format_output(found, ctx.obj["output_format"])
+    click.echo(rendered)
+
+
+@judge.command("create")
+@click.option("--name", required=True, help="Judge name.")
+@click.option("--goal", required=True, help="Evaluation goal description.")
+@click.option("--model-id", default=None, shell_complete=complete_model, help="Model ID for the judge.")
+@click.pass_context
+@handle_errors
+def create_judge(ctx: click.Context, name: str, goal: str, model_id: str | None) -> None:
+    """Create a new judge.
+
+    \b
+    Examples:
+      stratix judge create --name "Quality" --goal "Evaluate response quality"
+      stratix judge create --name "Safety" --goal "Check for harmful content" --model-id abc123
+    """
+    client = get_client(ctx)
+    j = client.judges.create(name=name, evaluation_goal=goal, model_id=model_id)
+    if j is None:
+        click.echo("Failed to create judge.", err=True)
+        sys.exit(1)
+
+    # Status goes to stderr so stdout holds only the formatted judge (e.g. parseable JSON).
+    click.echo(f"Judge created: {j.id}", err=True)
+    click.echo(format_output(j, ctx.obj["output_format"]))
+
+
+@judge.command("test")
+@click.option("--judge-id", required=True, shell_complete=complete_judge, help="Judge ID to test with.")
+@click.option("--trace-id", required=True, shell_complete=complete_trace, help="Trace ID to evaluate.")
+@click.pass_context
+@handle_errors
+def test_judge(ctx: click.Context, judge_id: str, trace_id: str) -> None:
+    """Test a judge by evaluating a trace.
+
+    Creates a trace evaluation using the specified judge and trace.
+
+    \b
+    Examples:
+      stratix judge test --judge-id abc123 --trace-id def456
+      stratix judge test --judge-id abc123 --trace-id def456 --format json
+    """
+    client = get_client(ctx)
+    te = client.trace_evaluations.create(trace_id=trace_id, judge_id=judge_id)
+    if te is None:
+        click.echo("Failed to create trace evaluation.", err=True)
+        sys.exit(1)
+
+    # Status goes to stderr so stdout holds only the formatted result (e.g. parseable JSON).
+    click.echo(f"Trace evaluation created: {te.id}", err=True)
+    click.echo(format_output(te, ctx.obj["output_format"]))
diff --git a/src/layerlens/cli/commands/scorer.py b/src/layerlens/cli/commands/scorer.py
new file mode 100644
index 0000000..5924775
--- /dev/null
+++ b/src/layerlens/cli/commands/scorer.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import sys
+
+import click
+
+from .._client import get_client, handle_errors
+from .._formatter import format_output
+
+SCORER_COLUMNS = [
+    ("id", "ID"),
+    ("name", "Name"),
+    ("model_name", "Model"),
+    ("model_company", "Company"),
+    ("created_at", "Created"),
+]
+
+
+@click.group()
+def scorer() -> None:
+    """Manage scorers.
+
+    \b
+    Examples:
+      stratix scorer list
+      stratix scorer get <scorer-id>
+      stratix scorer create --name "Quality" --description "..." --model-id <id> --prompt "..."
+      stratix scorer delete <scorer-id>
+    """
+
+
+@scorer.command("list")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.pass_context
+@handle_errors
+def list_scorers(ctx: click.Context, page: int | None, page_size: int | None) -> None:
+    """List scorers with optional pagination.
+
+    \b
+    Examples:
+      stratix scorer list
+      stratix scorer list --page-size 10
+    """
+    listing = get_client(ctx).scorers.get_many(page=page, page_size=page_size)
+    if listing is None or not listing.scorers:
+        click.echo("No scorers found.")
+        return
+
+    # The count line is a diagnostic: verbose mode only, and on stderr rather than stdout.
+    if ctx.obj["verbose"]:
+        click.echo(f"Showing {listing.count} of {listing.total_count} scorers", err=True)
+
+    rendered = format_output(listing.scorers, ctx.obj["output_format"], SCORER_COLUMNS)
+    click.echo(rendered)
+
+
+@scorer.command("get")
+@click.argument("id")
+@click.pass_context
+@handle_errors
+def get_scorer(ctx: click.Context, id: str) -> None:
+    """Get a scorer by ID.
+
+    \b
+    Examples:
+      stratix scorer get abc123
+      stratix scorer get abc123 --format json
+    """
+    found = get_client(ctx).scorers.get(id)
+    if found is None:
+        click.echo(f"Scorer {id} not found.", err=True)
+        sys.exit(1)
+
+    # A single scorer is formatted without a column list, unlike the list command.
+    rendered = format_output(found, ctx.obj["output_format"])
+    click.echo(rendered)
+
+
+@scorer.command("create")
+@click.option("--name", required=True, help="Scorer name (3-64 chars).")
+@click.option("--description", required=True, help="Scorer description (10-500 chars).")
+@click.option("--model-id", required=True, help="Model ID to use for scoring.")
+@click.option("--prompt", required=True, help="Scoring prompt.")
+@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
+@click.pass_context
+@handle_errors
+def create_scorer(ctx: click.Context, name: str, description: str, model_id: str, prompt: str, dry_run: bool) -> None:
+    """Create a new scorer.
+
+    \b
+    Examples:
+      stratix scorer create --name "Quality" --description "Evaluate quality" --model-id abc123 --prompt "Rate the quality..."
+      stratix scorer create --name "Test" --description "Test scorer" --model-id abc123 --prompt "..." --dry-run
+    """
+    if dry_run:
+        click.echo(f"[dry-run] Would create scorer: {name}")
+        click.echo(f"  Model: {model_id}")
+        click.echo(f"  Prompt: {prompt[:80]}{'...' if len(prompt) > 80 else ''}")
+        return
+
+    client = get_client(ctx)
+    s = client.scorers.create(name=name, description=description, model_id=model_id, prompt=prompt)
+    if s is None:
+        click.echo("Failed to create scorer.", err=True)
+        sys.exit(1)
+
+    # Status goes to stderr so stdout holds only the formatted scorer (e.g. parseable JSON).
+    click.echo(f"Scorer created: {s.id}", err=True)
+    click.echo(format_output(s, ctx.obj["output_format"]))
+
+
+@scorer.command("delete")
+@click.argument("id")
+@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
+@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
+@click.pass_context
+@handle_errors
+def delete_scorer(ctx: click.Context, id: str, yes: bool, dry_run: bool) -> None:
+    """Delete a scorer by ID.
+
+    \b
+    Examples:
+      stratix scorer delete abc123
+      stratix scorer delete abc123 --yes
+      stratix scorer delete abc123 --dry-run
+    """
+    if dry_run:
+        click.echo(f"[dry-run] Would delete scorer {id}")
+        return
+
+    if not yes:
+        click.confirm(f"Are you sure you want to delete scorer {id}?", abort=True)
+
+    # Guard-clause form: report a failed delete and exit first; success falls through.
+    deleted = get_client(ctx).scorers.delete(id)
+    if not deleted:
+        click.echo(f"Failed to delete scorer {id}.", err=True)
+        sys.exit(1)
+
+    click.echo(f"Scorer {id} deleted.")
diff --git a/src/layerlens/cli/commands/space.py b/src/layerlens/cli/commands/space.py
new file mode 100644
index 0000000..f7482cf
--- /dev/null
+++ b/src/layerlens/cli/commands/space.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import sys
+
+import click
+
+from .._client import get_client, handle_errors
+from .._formatter import format_output
+
+SPACE_COLUMNS = [
+    ("id", "ID"),
+    ("name", "Name"),
+    ("visibility", "Visibility"),
+    ("models_count", "Models"),
+    ("benchmarks_count", "Benchmarks"),
+    ("evaluations_count", "Evaluations"),
+    ("created_at", "Created"),
+]
+
+
+@click.group()
+def space() -> None:
+    """Manage evaluation spaces.
+
+    \b
+    Examples:
+      stratix space list
+      stratix space get <space-id>
+      stratix space create --name "My Space"
+      stratix space delete <space-id>
+    """
+
+
+@space.command("list")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.option("--sort-by", default=None, help="Sort field (e.g. weight, created_at).")
+@click.option("--order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.")
+@click.pass_context
+@handle_errors
+def list_spaces(
+    ctx: click.Context, page: int | None, page_size: int | None, sort_by: str | None, order: str | None
+) -> None:
+    """List evaluation spaces with optional pagination.
+
+    \b
+    Examples:
+      stratix space list
+      stratix space list --page-size 10
+      stratix space list --sort-by created_at --order desc
+    """
+    client = get_client(ctx)
+    listing = client.evaluation_spaces.get_many(page=page, page_size=page_size, sort_by=sort_by, order=order)
+    if listing is None or not listing.evaluation_spaces:
+        click.echo("No evaluation spaces found.")
+        return
+
+    if ctx.obj["verbose"]:
+        click.echo(f"Showing {listing.count} of {listing.total_count} evaluation spaces", err=True)
+
+    # Rows are rendered with the module's column layout in the selected output format.
+    click.echo(format_output(listing.evaluation_spaces, ctx.obj["output_format"], SPACE_COLUMNS))
+
+
+@space.command("get")
+@click.argument("id")
+@click.pass_context
+@handle_errors
+def get_space(ctx: click.Context, id: str) -> None:
+    """Get an evaluation space by ID or slug.
+
+    \b
+    Examples:
+      stratix space get abc123
+      stratix space get my-space-slug
+      stratix space get abc123 --format json
+    """
+    found = get_client(ctx).evaluation_spaces.get(id)
+    if found is None:
+        click.echo(f"Evaluation space {id} not found.", err=True)
+        sys.exit(1)
+
+    # A single space is formatted without a column list, unlike the list command.
+    rendered = format_output(found, ctx.obj["output_format"])
+    click.echo(rendered)
+
+
+@space.command("create")
+@click.option("--name", required=True, help="Space name.")
+@click.option("--description", default=None, help="Space description (max 500 chars).")
+@click.option(
+    "--visibility", default=None, type=click.Choice(["private", "public", "tenant"]), help="Visibility level."
+)
+@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
+@click.pass_context
+@handle_errors
+def create_space(ctx: click.Context, name: str, description: str | None, visibility: str | None, dry_run: bool) -> None:
+    """Create a new evaluation space.
+
+    \b
+    Examples:
+      stratix space create --name "Production"
+      stratix space create --name "Public Board" --visibility public
+      stratix space create --name "Test" --dry-run
+    """
+    if dry_run:
+        click.echo(f"[dry-run] Would create evaluation space: {name}")
+        if visibility:
+            click.echo(f"  Visibility: {visibility}")
+        return
+
+    client = get_client(ctx)
+    s = client.evaluation_spaces.create(name=name, description=description, visibility=visibility)
+    if s is None:
+        click.echo("Failed to create evaluation space.", err=True)
+        sys.exit(1)
+
+    # Status goes to stderr so stdout holds only the formatted space (e.g. parseable JSON).
+    click.echo(f"Evaluation space created: {s.id}", err=True)
+    click.echo(format_output(s, ctx.obj["output_format"]))
+
+
+@space.command("delete")
+@click.argument("id")
+@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
+@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
+@click.pass_context
+@handle_errors
+def delete_space(ctx: click.Context, id: str, yes: bool, dry_run: bool) -> None:
+    """Delete an evaluation space by ID.
+
+    \b
+    Examples:
+      stratix space delete abc123
+      stratix space delete abc123 --yes
+      stratix space delete abc123 --dry-run
+    """
+    if dry_run:
+        click.echo(f"[dry-run] Would delete evaluation space {id}")
+        return
+
+    if not yes:
+        click.confirm(f"Are you sure you want to delete evaluation space {id}?", abort=True)
+
+    # Guard-clause form: report a failed delete and exit first; success falls through.
+    deleted = get_client(ctx).evaluation_spaces.delete(id)
+    if not deleted:
+        click.echo(f"Failed to delete evaluation space {id}.", err=True)
+        sys.exit(1)
+
+    click.echo(f"Evaluation space {id} deleted.")
diff --git a/src/layerlens/cli/commands/trace.py b/src/layerlens/cli/commands/trace.py
new file mode 100644
index 0000000..3671693
--- /dev/null
+++ b/src/layerlens/cli/commands/trace.py
@@ -0,0 +1,203 @@
+from __future__ import annotations
+
+import sys
+import json
+
+import click
+
+from .._client import get_client, handle_errors
+from .._formatter import to_dict, format_output
+from .._completions import complete_trace
+
+TRACE_COLUMNS = [
+    ("id", "ID"),
+    ("created_at", "Created"),
+    ("filename", "Filename"),
+    ("evaluations_count", "Evaluations"),
+]
+
+
+@click.group()
+def trace() -> None:
+    """Manage traces.
+
+    \b
+    Examples:
+      stratix trace list
+      stratix trace get <trace-id>
+      stratix trace search "user login"
+      stratix trace export <trace-id> --output trace.json
+      stratix trace delete <trace-id> --yes
+    """
+
+
+@trace.command("list")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.option("--source", default=None, help="Filter by source.")
+@click.option("--status", default=None, help="Filter by status.")
+@click.option("--sort-by", default=None, help="Sort field.")
+@click.option("--sort-order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.")
+@click.pass_context
+@handle_errors
+def list_traces(
+    ctx: click.Context,
+    page: int | None,
+    page_size: int | None,
+    source: str | None,
+    status: str | None,
+    sort_by: str | None,
+    sort_order: str | None,
+) -> None:
+    """List traces with optional filtering and pagination.
+
+    \b
+    Examples:
+      stratix trace list
+      stratix trace list --page-size 10
+      stratix trace list --source sdk --sort-by created_at --sort-order desc
+    """
+    listing = get_client(ctx).traces.get_many(
+        page=page,
+        page_size=page_size,
+        source=source,
+        status=status,
+        sort_by=sort_by,
+        sort_order=sort_order,
+    )
+    if listing is None or not listing.traces:
+        click.echo("No traces found.")
+        return
+
+    # The count line is a diagnostic: verbose mode only, and on stderr rather than stdout.
+    if ctx.obj["verbose"]:
+        click.echo(f"Showing {listing.count} of {listing.total_count} traces", err=True)
+
+    rendered = format_output(listing.traces, ctx.obj["output_format"], TRACE_COLUMNS)
+    click.echo(rendered)
+
+
+@trace.command("get")
+@click.argument("id", shell_complete=complete_trace)
+@click.pass_context
+@handle_errors
+def get_trace(ctx: click.Context, id: str) -> None:
+    """Get a trace by ID.
+
+    \b
+    Examples:
+      stratix trace get abc123-def4-5678-ghij-klmnopqrstuv
+      stratix trace get abc123 --format json
+    """
+    record = get_client(ctx).traces.get(id)
+    if record is None:
+        click.echo(f"Trace {id} not found.", err=True)
+        sys.exit(1)
+
+    # A single trace is formatted without a column list, unlike the list command.
+    rendered = format_output(record, ctx.obj["output_format"])
+    click.echo(rendered)
+
+
+@trace.command("search")
+@click.argument("query")
+@click.option("--page", default=None, type=int, help="Page number.")
+@click.option("--page-size", default=None, type=int, help="Results per page.")
+@click.option("--source", default=None, help="Filter by source.")
+@click.option("--status", default=None, help="Filter by status.")
+@click.option("--sort-by", default=None, help="Sort field.")
+@click.option("--sort-order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.")
+@click.pass_context
+@handle_errors
+def search_traces(
+    ctx: click.Context,
+    query: str,
+    page: int | None,
+    page_size: int | None,
+    source: str | None,
+    status: str | None,
+    sort_by: str | None,
+    sort_order: str | None,
+) -> None:
+    """Search traces by query string.
+
+    \b
+    Examples:
+      stratix trace search "user login"
+      stratix trace search "error" --source sdk --page-size 5
+    """
+    matches = get_client(ctx).traces.get_many(
+        search=query,
+        page=page,
+        page_size=page_size,
+        source=source,
+        status=status,
+        sort_by=sort_by,
+        sort_order=sort_order,
+    )
+    if matches is None or not matches.traces:
+        click.echo("No traces found matching your query.")
+        return
+
+    # The count line is a diagnostic: verbose mode only, and on stderr rather than stdout.
+    if ctx.obj["verbose"]:
+        click.echo(f"Found {matches.count} of {matches.total_count} traces", err=True)
+
+    rendered = format_output(matches.traces, ctx.obj["output_format"], TRACE_COLUMNS)
+    click.echo(rendered)
+
+
+@trace.command("export")
+@click.argument("id", shell_complete=complete_trace)
+@click.option(
+    "--output", "-o", "output_file", default=None, type=click.Path(), help="Output file path (default: stdout)."
+)
+@click.pass_context
+@handle_errors
+def export_trace(ctx: click.Context, id: str, output_file: str | None) -> None:
+    """Export a trace as JSON.
+
+    \b
+    Examples:
+      stratix trace export abc123
+      stratix trace export abc123 --output trace.json
+    """
+    record = get_client(ctx).traces.get(id)
+    if record is None:
+        click.echo(f"Trace {id} not found.", err=True)
+        sys.exit(1)
+
+    # default=str makes json.dumps stringify any value it cannot serialize natively.
+    payload = json.dumps(to_dict(record), indent=2, default=str)
+    if not output_file:
+        click.echo(payload)
+        return
+
+    with open(output_file, "w") as f:
+        f.write(payload)
+    click.echo(f"Trace exported to {output_file}")
+
+
+@trace.command("delete")
+@click.argument("id", shell_complete=complete_trace)
+@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
+@click.pass_context
+@handle_errors
+def delete_trace(ctx: click.Context, id: str, yes: bool) -> None:
+    """Delete a trace by ID.
+
+    \b
+    Examples:
+      stratix trace delete abc123
+      stratix trace delete abc123 --yes
+    """
+    if not yes:
+        click.confirm(f"Are you sure you want to delete trace {id}?", abort=True)
+
+    # Guard-clause form: report a failed delete and exit first; success falls through.
+    deleted = get_client(ctx).traces.delete(id)
+    if not deleted:
+        click.echo(f"Failed to delete trace {id}.", err=True)
+        sys.exit(1)
+
+    click.echo(f"Trace {id} deleted.")
diff --git a/src/layerlens/models/__init__.py b/src/layerlens/models/__init__.py
index 26bcfb9..b4aa7a5 100644
--- a/src/layerlens/models/__init__.py
+++ b/src/layerlens/models/__init__.py
@@ -5,6 +5,7 @@
     ModelsResponse,
     TracesResponse,
     ResultsResponse,
+    ScorersResponse,
     UploadURLResponse,
     BenchmarksResponse,
     CreateJudgeResponse,
@@ -14,8 +15,11 @@
     UpdateJudgeResponse,
     CostEstimateResponse,
     CreateTracesResponse,
+    IntegrationsResponse,
     OrganizationResponse,
     CreateBenchmarkResponse,
+    TestIntegrationResponse,
+    EvaluationSpacesResponse,
     TraceEvaluationsResponse,
     CreateEvaluationsResponse,
     JudgeOptimizationRunsResponse,
@@ -38,6 +42,7 @@
     PublicModelsListResponse,
     PublicBenchmarksListResponse,
 )
+from .scorer import Scorer
 from .benchmark import Benchmark, CustomBenchmark, PublicBenchmark
 from .evaluation import (
     Result,
@@ -52,7 +57,14 @@
     PerformanceDetails,
     EvaluationModelInfo,
 )
+from .integration import Integration
 from .organization import Project, Organization
+from .evaluation_space import (
+    EvaluationSpace,
+    EvaluationSpaceFilters,
+    EvaluationSpaceModelFilter,
+    EvaluationSpaceDatasetFilter,
+)
 from .trace_evaluation import (
     JudgeSnapshot,
     TraceEvaluation,
@@ -86,6 +98,14 @@
     "CustomModel",
     "DeleteJudgeResponse",
     "EstimateJudgeOptimizationCostResponse",
+    "EvaluationSpace",
+    "EvaluationSpaceDatasetFilter",
+    "EvaluationSpaceFilters",
+    "EvaluationSpaceModelFilter",
+    "EvaluationSpacesResponse",
+    "Integration",
+    "IntegrationsResponse",
+    "TestIntegrationResponse",
     "AnalysisSummary",
     "ErrorAnalysis",
     "Evaluation",
@@ -120,6 +140,8 @@
     "Result",
     "ResultMetrics",
     "ResultsResponse",
+    "Scorer",
+    "ScorersResponse",
     "Trace",
     "TraceEvaluation",
     "TraceEvaluationResult",
diff --git a/src/layerlens/models/api.py b/src/layerlens/models/api.py
index 72a2390..308f89f 100644
--- a/src/layerlens/models/api.py
+++ b/src/layerlens/models/api.py
@@ -1,15 +1,18 @@
 from __future__ import annotations
 
-from typing import List
+from typing import List, Optional
 
 from pydantic import Field, BaseModel, ConfigDict
 
 from .judge import Judge
 from .model import Model
 from .trace import TraceWithEvaluations
+from .scorer import Scorer
 from .benchmark import Benchmark
 from .evaluation import Result, Evaluation
+from .integration import Integration
 from .organization import Organization
+from .evaluation_space import EvaluationSpace
 from .trace_evaluation import TraceEvaluation, TraceEvaluationResult
 from .judge_optimization import JudgeOptimizationRun
 
@@ -152,3 +155,30 @@ class ApplyJudgeOptimizationResultResponse(BaseModel):
     judge_id: str
     new_version: int
     message: str
+
+
+class IntegrationsResponse(BaseModel):
+    # Integration list payload; the CLI prints it as "Showing {count} of {total_count}".
+    integrations: List[Integration]
+    count: int
+    total_count: int
+
+
+class TestIntegrationResponse(BaseModel):
+    # Outcome of testing one integration: a success flag plus an optional message.
+    success: bool
+    message: Optional[str] = None
+
+
+class ScorersResponse(BaseModel):
+    # Scorer list payload; the CLI prints it as "Showing {count} of {total_count}".
+    scorers: List[Scorer]
+    count: int
+    total_count: int
+
+
+class EvaluationSpacesResponse(BaseModel):
+    # Evaluation-space list payload; the CLI prints "Showing {count} of {total_count}".
+    evaluation_spaces: List[EvaluationSpace]
+    count: int
+    total_count: int
diff --git a/src/layerlens/models/evaluation.py b/src/layerlens/models/evaluation.py
index 7d16a3c..f2f090d 100644
--- a/src/layerlens/models/evaluation.py
+++ b/src/layerlens/models/evaluation.py
@@ -40,6 +40,8 @@ class EvaluationDataset(BaseModel):
 
 
 class EvaluationModelInfo(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
     model_name: str = ""
     performance: Any = None
 
@@ -71,7 +73,7 @@ class EvaluationSummary(BaseModel):
 
 
 class Evaluation(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(populate_by_name=True, protected_namespaces=())
 
     id: str
     status: EvaluationStatus
diff --git a/src/layerlens/models/evaluation_space.py b/src/layerlens/models/evaluation_space.py
new file mode 100644
index 0000000..3ab0be5
--- /dev/null
+++ b/src/layerlens/models/evaluation_space.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from typing import List, Optional
+
+from pydantic import BaseModel, ConfigDict
+
+
+class EvaluationSpaceModelFilter(BaseModel):
+    """Model-side filter lists of an evaluation space; every list defaults to empty."""
+
+    ids: List[str] = []
+    vendors: List[str] = []
+    regions: List[str] = []
+
+
+class EvaluationSpaceDatasetFilter(BaseModel):
+    """Dataset-side filter lists of an evaluation space; every list defaults to empty."""
+
+    ids: List[str] = []
+    categories: List[str] = []
+    languages: List[str] = []
+
+
+class EvaluationSpaceFilters(BaseModel):
+    """Filter set carried by ``EvaluationSpace.filters``."""
+
+    # ``model_filters`` starts with "model_", which Pydantic v2 reserves by default;
+    # clearing protected_namespaces avoids the namespace-conflict warning, matching
+    # the other models in this package that declare model_* fields.
+    model_config = ConfigDict(protected_namespaces=())
+
+    model_filters: Optional[EvaluationSpaceModelFilter] = None
+    dataset_filters: Optional[EvaluationSpaceDatasetFilter] = None
+    providers: List[str] = []
+
+
+class EvaluationSpace(BaseModel):
+    """An evaluation space record; only ``id`` and ``name`` are required."""
+
+    id: str
+    organization_id: Optional[str] = None
+    project_id: Optional[str] = None
+    name: str
+    description: Optional[str] = None
+    filters: Optional[EvaluationSpaceFilters] = None
+    owner: Optional[str] = None
+    visibility: Optional[str] = None
+    is_featured: bool = False
+    is_partner: bool = False
+    partner_name: Optional[str] = None
+    created_at: Optional[str] = None
+    image_path: Optional[str] = None
+    weight: int = 0
+    slug: Optional[str] = None
+    models_count: int = 0
+    benchmarks_count: int = 0
+    evaluations_count: int = 0
diff --git a/src/layerlens/models/integration.py b/src/layerlens/models/integration.py
new file mode 100644
index 0000000..5262259
--- /dev/null
+++ b/src/layerlens/models/integration.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel
+
+
+class Integration(BaseModel):  # One integration record, parsed from the integrations endpoints.
+    id: str  # Required.
+    organization_id: str  # Required.
+    project_id: str  # NOTE(review): required, yet the endpoint URL is organization-scoped; confirm it is always sent.
+    name: str  # Required.
+    type: Optional[str] = None  # Optional; None when absent from the payload.
+    status: Optional[str] = None  # Optional; None when absent from the payload.
+    created_at: Optional[str] = None  # Kept as the raw string from the payload; not parsed into a datetime.
+    config: Dict[str, Any] = {}  # Free-form mapping; empty when absent from the payload.
diff --git a/src/layerlens/models/judge.py b/src/layerlens/models/judge.py
index 8b68439..db416a1 100644
--- a/src/layerlens/models/judge.py
+++ b/src/layerlens/models/judge.py
@@ -2,10 +2,12 @@
 
 from typing import List, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
 
 
 class JudgeVersion(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
     version: int
     name: str
     evaluation_goal: str
@@ -17,6 +19,8 @@ class JudgeVersion(BaseModel):
 
 
 class Judge(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
     id: str
     organization_id: str
     project_id: str
diff --git a/src/layerlens/models/scorer.py b/src/layerlens/models/scorer.py
new file mode 100644
index 0000000..585d064
--- /dev/null
+++ b/src/layerlens/models/scorer.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict
+
+
+class Scorer(BaseModel):
+    """A scorer record as parsed from the scorers endpoints.
+
+    Only ``name`` is required; every other field is optional and defaults to
+    ``None``.
+    """
+
+    # Several fields start with "model_", which Pydantic v2 reserves by default;
+    # clearing protected_namespaces avoids the namespace-conflict warning, as the
+    # other models in this package with model_* fields already do.
+    model_config = ConfigDict(protected_namespaces=())
+
+    id: Optional[str] = None
+    organization_id: Optional[str] = None
+    project_id: Optional[str] = None
+    name: str
+    description: Optional[str] = None
+    model_id: Optional[str] = None
+    model_name: Optional[str] = None
+    model_key: Optional[str] = None
+    model_company: Optional[str] = None
+    prompt: Optional[str] = None
+    created_at: Optional[str] = None
+    updated_at: Optional[str] = None
diff --git a/src/layerlens/models/trace_evaluation.py b/src/layerlens/models/trace_evaluation.py
index 98f32db..b17b485 100644
--- a/src/layerlens/models/trace_evaluation.py
+++ b/src/layerlens/models/trace_evaluation.py
@@ -14,7 +14,7 @@ class TraceEvaluationStatus(str, Enum):
 
 
 class JudgeSnapshot(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(populate_by_name=True, protected_namespaces=())
 
     name: str
     version: int
diff --git a/src/layerlens/resources/benchmarks/benchmarks.py b/src/layerlens/resources/benchmarks/benchmarks.py
index b4ecf69..fca94c6 100644
--- a/src/layerlens/resources/benchmarks/benchmarks.py
+++ b/src/layerlens/resources/benchmarks/benchmarks.py
@@ -83,8 +83,22 @@ def cast_benchmark(b: Benchmark, bench_type: str) -> Benchmark:
             if resp:
                 benchmarks.extend([cast_benchmark(b, type) for b in resp.data.benchmarks])
 
-        if name:
-            benchmarks = [b for b in benchmarks if name.lower() in b.name.lower()]
+        # Exclude custom benchmarks when filtering by fields they don't have
+        if categories:
+            cat_set = {c.lower() for c in categories}
+            benchmarks = [
+                b
+                for b in benchmarks
+                if isinstance(b, PublicBenchmark) and b.categories and any(c.lower() in cat_set for c in b.categories)
+            ]
+
+        if languages:
+            lang_set = {l.lower() for l in languages}
+            benchmarks = [
+                b
+                for b in benchmarks
+                if isinstance(b, PublicBenchmark) and b.language and b.language.lower() in lang_set
+            ]
 
         return benchmarks
 
@@ -356,8 +370,22 @@ def cast_benchmark(b: Benchmark, bench_type: str) -> Benchmark:
             if resp:
                 benchmarks.extend([cast_benchmark(b, type) for b in resp.data.benchmarks])
 
-        if name:
-            benchmarks = [b for b in benchmarks if name.lower() in b.name.lower()]
+        # Exclude custom benchmarks when filtering by fields they don't have
+        if categories:
+            cat_set = {c.lower() for c in categories}
+            benchmarks = [
+                b
+                for b in benchmarks
+                if isinstance(b, PublicBenchmark) and b.categories and any(c.lower() in cat_set for c in b.categories)
+            ]
+
+        if languages:
+            lang_set = {l.lower() for l in languages}
+            benchmarks = [
+                b
+                for b in benchmarks
+                if isinstance(b, PublicBenchmark) and b.language and b.language.lower() in lang_set
+            ]
 
         return benchmarks
 
diff --git a/src/layerlens/resources/evaluation_spaces/__init__.py b/src/layerlens/resources/evaluation_spaces/__init__.py
new file mode 100644
index 0000000..60e7ca2
--- /dev/null
+++ b/src/layerlens/resources/evaluation_spaces/__init__.py
@@ -0,0 +1,3 @@
+from .evaluation_spaces import EvaluationSpaces, AsyncEvaluationSpaces
+
+__all__ = ["EvaluationSpaces", "AsyncEvaluationSpaces"]
diff --git a/src/layerlens/resources/evaluation_spaces/evaluation_spaces.py b/src/layerlens/resources/evaluation_spaces/evaluation_spaces.py
new file mode 100644
index 0000000..0442c4d
--- /dev/null
+++ b/src/layerlens/resources/evaluation_spaces/evaluation_spaces.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional
+
+import httpx
+
+from ...models import EvaluationSpace, EvaluationSpacesResponse
+from ..._resource import SyncAPIResource, AsyncAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+DEFAULT_PAGE = 1  # page requested when the caller passes no page
+DEFAULT_PAGE_SIZE = 100  # page size used when the caller passes no page_size
+MAX_PAGE_SIZE = 500  # upper bound applied to a caller-supplied page_size
+
+
+def _unwrap(resp: Any) -> Any:
+    # Peel the {"status": ..., "data": ...} envelope; anything else passes through untouched.
+    is_envelope = isinstance(resp, dict) and "status" in resp and "data" in resp
+    return resp["data"] if is_envelope else resp
+
+
+class EvaluationSpaces(SyncAPIResource):
+    """Synchronous client for the project-scoped ``evaluation-spaces`` endpoints."""
+
+    def _base_url(self) -> str:
+        return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/evaluation-spaces"
+
+    def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[EvaluationSpace]:
+        """Fetch one evaluation space by id; ``None`` if the reply is not a valid space payload."""
+        data = _unwrap(self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict))
+        try:
+            return EvaluationSpace(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def get_many(
+        self,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        sort_by: Optional[str] = None,
+        order: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationSpacesResponse]:
+        """Return one page of evaluation spaces.
+
+        ``page_size`` is clamped to ``1..MAX_PAGE_SIZE``; ``sort_by`` and ``order`` are sent only when
+        non-empty. Returns ``None`` when the reply is empty, not a dict, or cannot be parsed.
+        """
+        size = DEFAULT_PAGE_SIZE if page_size is None else min(max(page_size, 1), MAX_PAGE_SIZE)
+        params: Dict[str, Any] = {"page": str(DEFAULT_PAGE if page is None else page), "page_size": str(size)}
+        if sort_by:
+            params["sort_by"] = sort_by
+        if order:
+            params["order"] = order
+
+        resp = self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict)
+        data = _unwrap(resp) if resp and isinstance(resp, dict) else None
+        if not isinstance(data, dict):
+            return None
+
+        try:  # items are parsed inside the try so one malformed entry yields None instead of raising
+            spaces = [EvaluationSpace(**s) if isinstance(s, dict) else s for s in data.get("evaluation_spaces", [])]
+            count = data.get("count", len(spaces))
+            total_count = data.get("total_count", count)
+            return EvaluationSpacesResponse(evaluation_spaces=spaces, count=count, total_count=total_count)
+        except Exception:
+            return None
+
+    def create(
+        self,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        visibility: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationSpace]:
+        """Create an evaluation space.
+
+        Falsy ``description``/``visibility`` are omitted; ``None`` is returned for an unparseable reply.
+        """
+        body: Dict[str, Any] = {"name": name}
+        if description:
+            body["description"] = description
+        if visibility:
+            body["visibility"] = visibility
+        data = _unwrap(self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict))
+        try:
+            return EvaluationSpace(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def update(
+        self,
+        id: str,
+        *,
+        description: Optional[str] = None,
+        visibility: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationSpace]:
+        """Patch the fields that are not ``None``; returns ``None`` if the reply is not a valid space payload."""
+        body: Dict[str, Any] = {}
+        if description is not None:
+            body["description"] = description
+        if visibility is not None:
+            body["visibility"] = visibility
+        data = _unwrap(self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict))
+        try:
+            return EvaluationSpace(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool:
+        """Delete an evaluation space; ``True`` if the request completed, ``False`` if it raised."""
+        try:
+            self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
+        except Exception:
+            return False
+        return True
+
+
+class AsyncEvaluationSpaces(AsyncAPIResource):
+    """Asynchronous client for the project-scoped ``evaluation-spaces`` endpoints."""
+
+    def _base_url(self) -> str:
+        return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/evaluation-spaces"
+
+    async def get(
+        self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT
+    ) -> Optional[EvaluationSpace]:
+        """Fetch one evaluation space by id; ``None`` if the reply is not a valid space payload."""
+        data = _unwrap(await self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict))
+        try:
+            return EvaluationSpace(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    async def get_many(
+        self,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        sort_by: Optional[str] = None,
+        order: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationSpacesResponse]:
+        """Return one page of evaluation spaces.
+
+        ``page_size`` is clamped to ``1..MAX_PAGE_SIZE``; ``sort_by`` and ``order`` are sent only when
+        non-empty. Returns ``None`` when the reply is empty, not a dict, or cannot be parsed.
+        """
+        size = DEFAULT_PAGE_SIZE if page_size is None else min(max(page_size, 1), MAX_PAGE_SIZE)
+        params: Dict[str, Any] = {"page": str(DEFAULT_PAGE if page is None else page), "page_size": str(size)}
+        if sort_by:
+            params["sort_by"] = sort_by
+        if order:
+            params["order"] = order
+        resp = await self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict)
+        data = _unwrap(resp) if resp and isinstance(resp, dict) else None
+        if not isinstance(data, dict):
+            return None
+        try:  # items are parsed inside the try so one malformed entry yields None instead of raising
+            spaces = [EvaluationSpace(**s) if isinstance(s, dict) else s for s in data.get("evaluation_spaces", [])]
+            count = data.get("count", len(spaces))
+            total_count = data.get("total_count", count)
+            return EvaluationSpacesResponse(evaluation_spaces=spaces, count=count, total_count=total_count)
+        except Exception:
+            return None
+
+    async def create(
+        self,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        visibility: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationSpace]:
+        """Create an evaluation space.
+
+        Falsy ``description``/``visibility`` are omitted; ``None`` is returned for an unparseable reply.
+        """
+        body: Dict[str, Any] = {"name": name}
+        if description:
+            body["description"] = description
+        if visibility:
+            body["visibility"] = visibility
+        data = _unwrap(await self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict))
+        try:
+            return EvaluationSpace(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    async def update(
+        self,
+        id: str,
+        *,
+        description: Optional[str] = None,
+        visibility: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[EvaluationSpace]:
+        """Patch the fields that are not ``None``; returns ``None`` if the reply is not a valid space payload."""
+        body: Dict[str, Any] = {}
+        if description is not None:
+            body["description"] = description
+        if visibility is not None:
+            body["visibility"] = visibility
+        data = _unwrap(await self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict))
+        try:
+            return EvaluationSpace(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    async def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool:
+        """Delete an evaluation space; ``True`` if the request completed, ``False`` if it raised."""
+        try:
+            await self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
+        except Exception:
+            return False
+        return True
diff --git a/src/layerlens/resources/integrations/__init__.py b/src/layerlens/resources/integrations/__init__.py
new file mode 100644
index 0000000..101d042
--- /dev/null
+++ b/src/layerlens/resources/integrations/__init__.py
@@ -0,0 +1,3 @@
+from .integrations import Integrations, AsyncIntegrations
+
+__all__ = ["Integrations", "AsyncIntegrations"]
diff --git a/src/layerlens/resources/integrations/integrations.py b/src/layerlens/resources/integrations/integrations.py
new file mode 100644
index 0000000..ecd856b
--- /dev/null
+++ b/src/layerlens/resources/integrations/integrations.py
@@ -0,0 +1,186 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional
+
+import httpx
+
+from ...models import (
+    Integration,
+    IntegrationsResponse,
+    TestIntegrationResponse,
+)
+from ..._resource import SyncAPIResource, AsyncAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+DEFAULT_PAGE = 1  # page requested when the caller passes no page
+DEFAULT_PAGE_SIZE = 100  # page size used when the caller passes no page_size
+MAX_PAGE_SIZE = 500  # upper bound applied to a caller-supplied page_size
+
+
+def _unwrap(resp: Any) -> Any:
+    """Return ``resp["data"]`` for a status/data envelope, else ``resp`` unchanged."""
+    if not isinstance(resp, dict):
+        return resp
+    return resp["data"] if "data" in resp and "status" in resp else resp
+
+
+class Integrations(SyncAPIResource):
+    """Synchronous client for the organization-scoped ``integrations`` endpoints."""
+
+    def _base_url(self) -> str:
+        return f"/organizations/{self._client.organization_id}/integrations"
+
+    def get(
+        self,
+        id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[Integration]:
+        """Fetch one integration by id; ``None`` if the reply is not a valid integration payload."""
+        resp = self._get(
+            f"{self._base_url()}/{id}",
+            timeout=timeout,
+            cast_to=dict,
+        )
+        data = _unwrap(resp)
+        try:
+            return Integration(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def get_many(
+        self,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[IntegrationsResponse]:
+        """Return one page of integrations.
+
+        ``page_size`` is clamped to ``1..MAX_PAGE_SIZE``. Returns ``None`` when the reply is empty,
+        not a dict, or cannot be parsed into an ``IntegrationsResponse``.
+        """
+        size = DEFAULT_PAGE_SIZE if page_size is None else min(max(page_size, 1), MAX_PAGE_SIZE)
+        params: Dict[str, Any] = {"page": str(DEFAULT_PAGE if page is None else page), "page_size": str(size)}
+
+        resp = self._get(
+            self._base_url(),
+            params=params,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        data = _unwrap(resp) if resp and isinstance(resp, dict) else None
+        if not isinstance(data, dict):
+            return None
+
+        try:
+            # Items are parsed inside the try so one malformed entry yields None instead of raising.
+            items = [i if isinstance(i, Integration) else Integration(**i) for i in data.get("integrations", [])]
+            count = data.get("count", len(items))
+            total_count = data.get("total_count", count)
+            return IntegrationsResponse(integrations=items, count=count, total_count=total_count)
+        except Exception:
+            return None
+
+    def test(
+        self,
+        id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[TestIntegrationResponse]:
+        """Trigger the API's test call for one integration.
+
+        POSTs an empty body to ``<base>/<id>/test``; ``None`` if the reply is not a valid test result.
+        """
+        resp = self._post(
+            f"{self._base_url()}/{id}/test",
+            body={},
+            timeout=timeout,
+            cast_to=dict,
+        )
+        data = _unwrap(resp)
+        try:
+            return TestIntegrationResponse(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+
+class AsyncIntegrations(AsyncAPIResource):
+    """Asynchronous client for the organization-scoped ``integrations`` endpoints."""
+
+    def _base_url(self) -> str:
+        return f"/organizations/{self._client.organization_id}/integrations"
+
+    async def get(
+        self,
+        id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[Integration]:
+        """Fetch one integration by id; ``None`` if the reply is not a valid integration payload."""
+        resp = await self._get(
+            f"{self._base_url()}/{id}",
+            timeout=timeout,
+            cast_to=dict,
+        )
+        data = _unwrap(resp)
+        try:
+            return Integration(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    async def get_many(
+        self,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[IntegrationsResponse]:
+        """Return one page of integrations.
+
+        ``page_size`` is clamped to ``1..MAX_PAGE_SIZE``. Returns ``None`` when the reply is empty,
+        not a dict, or cannot be parsed into an ``IntegrationsResponse``.
+        """
+        size = DEFAULT_PAGE_SIZE if page_size is None else min(max(page_size, 1), MAX_PAGE_SIZE)
+        params: Dict[str, Any] = {"page": str(DEFAULT_PAGE if page is None else page), "page_size": str(size)}
+
+        resp = await self._get(
+            self._base_url(),
+            params=params,
+            timeout=timeout,
+            cast_to=dict,
+        )
+        data = _unwrap(resp) if resp and isinstance(resp, dict) else None
+        if not isinstance(data, dict):
+            return None
+
+        try:
+            # Items are parsed inside the try so one malformed entry yields None instead of raising.
+            items = [i if isinstance(i, Integration) else Integration(**i) for i in data.get("integrations", [])]
+            count = data.get("count", len(items))
+            total_count = data.get("total_count", count)
+            return IntegrationsResponse(integrations=items, count=count, total_count=total_count)
+        except Exception:
+            return None
+
+    async def test(
+        self,
+        id: str,
+        *,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[TestIntegrationResponse]:
+        """Trigger the API's test call for one integration.
+
+        POSTs an empty body to ``<base>/<id>/test``; ``None`` if the reply is not a valid test result.
+        """
+        resp = await self._post(
+            f"{self._base_url()}/{id}/test",
+            body={},
+            timeout=timeout,
+            cast_to=dict,
+        )
+        data = _unwrap(resp)
+        try:
+            return TestIntegrationResponse(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
diff --git a/src/layerlens/resources/models/models.py b/src/layerlens/resources/models/models.py
index 7a25377..411afed 100644
--- a/src/layerlens/resources/models/models.py
+++ b/src/layerlens/resources/models/models.py
@@ -5,6 +5,55 @@
 import httpx
 
 from ...models import Model, CustomModel, PublicModel, ModelsResponse, CreateModelResponse
+
+
+def _exclude_custom_models(
+    models: List[Model],
+    *,
+    categories: Optional[List[str]] = None,
+    companies: Optional[List[str]] = None,
+    regions: Optional[List[str]] = None,
+    licenses: Optional[List[str]] = None,
+) -> List[Model]:
+    """Apply the public-only filters client-side, dropping custom models.
+
+    Categories, companies, regions and licenses are read from ``PublicModel``
+    attributes, so as soon as one of those filters is supplied every model that
+    is not a ``PublicModel`` is removed. Matching is case-insensitive, and the
+    active filters are combined (a model must pass each of them). With no
+    filter supplied the input list is returned unchanged.
+    """
+    if categories:
+        wanted_cats = {c.lower() for c in categories}
+
+        def in_category(candidate: Model) -> bool:
+            # "open-source"/"closed-source" key off open_weights; any other
+            # category has to equal the model's architecture type.
+            if not isinstance(candidate, PublicModel):
+                return False
+            arch = (candidate.architecture_type or "").lower()
+            if "open-source" in wanted_cats and candidate.open_weights:
+                return True
+            if "closed-source" in wanted_cats and not candidate.open_weights and arch:
+                return True
+            return bool(arch) and arch in wanted_cats
+
+        models = [m for m in models if in_category(m)]
+
+    # The remaining filters share one shape: public model, non-empty field, lowercase match.
+    if companies:
+        wanted = {c.lower() for c in companies}
+        models = [m for m in models if isinstance(m, PublicModel) and m.company and m.company.lower() in wanted]
+    if regions:
+        wanted = {r.lower() for r in regions}
+        models = [m for m in models if isinstance(m, PublicModel) and m.region and m.region.lower() in wanted]
+    if licenses:
+        wanted = {lic.lower() for lic in licenses}
+        models = [m for m in models if isinstance(m, PublicModel) and m.license and m.license.lower() in wanted]
+
+    return models
+
+
 from ..._resource import SyncAPIResource, AsyncAPIResource
 from ..._constants import DEFAULT_TIMEOUT
 
@@ -66,8 +115,13 @@ def cast_model(m: Model, model_type: str) -> Model:
             if resp:
                 models.extend([cast_model(m, type) for m in resp.data.models])
 
-        if name:
-            models = [m for m in models if name.lower() in m.name.lower()]
+        models = _exclude_custom_models(
+            models,
+            categories=categories,
+            companies=companies,
+            regions=regions,
+            licenses=licenses,
+        )
 
         return models
 
@@ -252,8 +306,13 @@ def cast_model(m: Model, model_type: str) -> Model:
             if resp:
                 models.extend([cast_model(m, type) for m in resp.data.models])
 
-        if name:
-            models = [m for m in models if name.lower() in m.name.lower()]
+        models = _exclude_custom_models(
+            models,
+            categories=categories,
+            companies=companies,
+            regions=regions,
+            licenses=licenses,
+        )
 
         return models
 
diff --git a/src/layerlens/resources/scorers/__init__.py b/src/layerlens/resources/scorers/__init__.py
new file mode 100644
index 0000000..ac715f0
--- /dev/null
+++ b/src/layerlens/resources/scorers/__init__.py
@@ -0,0 +1,3 @@
+from .scorers import Scorers, AsyncScorers
+
+__all__ = ["Scorers", "AsyncScorers"]
diff --git a/src/layerlens/resources/scorers/scorers.py b/src/layerlens/resources/scorers/scorers.py
new file mode 100644
index 0000000..4696389
--- /dev/null
+++ b/src/layerlens/resources/scorers/scorers.py
@@ -0,0 +1,230 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional
+
+import httpx
+
+from ...models import Scorer, ScorersResponse
+from ..._resource import SyncAPIResource, AsyncAPIResource
+from ..._constants import DEFAULT_TIMEOUT
+
+DEFAULT_PAGE = 1  # page requested when the caller passes no page
+DEFAULT_PAGE_SIZE = 100  # page size used when the caller passes no page_size
+MAX_PAGE_SIZE = 500  # upper bound applied to a caller-supplied page_size
+
+
+def _unwrap(resp: Any) -> Any:
+    # Only a dict carrying both "status" and "data" is treated as an envelope to strip.
+    has_envelope = isinstance(resp, dict) and {"data", "status"} <= resp.keys()
+    return resp["data"] if has_envelope else resp
+
+
+def _pascal_to_snake(key: str) -> str:
+    """Turn a PascalCase key into snake_case, keeping acronym runs together (``HTTPCode`` -> ``http_code``)."""
+    import re
+    acronyms_split = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", key)  # break an acronym off the word after it
+    return re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", acronyms_split).lower()
+
+
+def _normalize_keys(d: Dict[str, Any]) -> Dict[str, Any]:
+    """Return ``d`` with snake_case keys when the payload looks PascalCase-keyed.
+
+    Only the first key is inspected; empty or non-dict input is returned as-is.
+    """
+    probe = next(iter(d), "") if d and isinstance(d, dict) else ""
+    if not (probe and probe[0].isupper()):
+        return d
+    return {_pascal_to_snake(name): value for name, value in d.items()}
+
+
+class Scorers(SyncAPIResource):
+    """Synchronous client for the project-scoped ``scorers`` endpoints."""
+
+    def _base_url(self) -> str:
+        return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/scorers"
+
+    def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[Scorer]:
+        """Fetch one scorer by id; ``None`` if the reply is not a valid scorer payload."""
+        data = _unwrap(self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict))
+        try:
+            return Scorer(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def get_many(
+        self,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[ScorersResponse]:
+        """Return one page of scorers.
+
+        ``page_size`` is clamped to ``1..MAX_PAGE_SIZE``. Returns ``None`` when the reply is empty,
+        not a dict, or cannot be parsed into a ``ScorersResponse``.
+        """
+        size = DEFAULT_PAGE_SIZE if page_size is None else min(max(page_size, 1), MAX_PAGE_SIZE)
+        params: Dict[str, Any] = {"page": str(DEFAULT_PAGE if page is None else page), "page_size": str(size)}
+
+        resp = self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict)
+        data = _unwrap(resp) if resp and isinstance(resp, dict) else None
+        if not isinstance(data, dict):
+            return None
+
+        try:
+            # Items are parsed inside the try so one malformed entry yields None instead of raising.
+            scorers = [Scorer(**s) if isinstance(s, dict) else s for s in data.get("scorers", [])]
+            count = data.get("count", len(scorers))
+            total_count = data.get("total_count", count)
+            return ScorersResponse(scorers=scorers, count=count, total_count=total_count)
+        except Exception:
+            return None
+
+    def create(
+        self,
+        *,
+        name: str,
+        description: str,
+        model_id: str,
+        prompt: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[Scorer]:
+        """Create a scorer; reply keys are normalised to snake_case first, ``None`` if it cannot be parsed."""
+        body: Dict[str, Any] = {
+            "name": name,
+            "description": description,
+            "model_id": model_id,
+            "prompt": prompt,
+        }
+        data = _unwrap(self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict))
+        try:
+            return Scorer(**_normalize_keys(data)) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def update(
+        self,
+        id: str,
+        *,
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+        model_id: Optional[str] = None,
+        prompt: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Patch the fields that are not ``None``; ``True`` if the request completed, ``False`` if it raised."""
+        body: Dict[str, Any] = {}
+        if name is not None:
+            body["name"] = name
+        if description is not None:
+            body["description"] = description
+        if model_id is not None:
+            body["model_id"] = model_id
+        if prompt is not None:
+            body["prompt"] = prompt
+        try:
+            self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict)
+        except Exception:
+            return False
+        return True
+
+    def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool:
+        """Delete a scorer; ``True`` if the request completed, ``False`` if it raised."""
+        try:
+            self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
+        except Exception:
+            return False
+        return True
+
+
+class AsyncScorers(AsyncAPIResource):
+    """Asynchronous client for the project-scoped ``scorers`` endpoints."""
+
+    def _base_url(self) -> str:
+        return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/scorers"
+
+    async def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[Scorer]:
+        """Fetch one scorer by id; ``None`` if the reply is not a valid scorer payload."""
+        data = _unwrap(await self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict))
+        try:
+            return Scorer(**data) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    async def get_many(
+        self,
+        *,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[ScorersResponse]:
+        """Return one page of scorers.
+
+        ``page_size`` is clamped to ``1..MAX_PAGE_SIZE``. Returns ``None`` when the reply is empty,
+        not a dict, or cannot be parsed into a ``ScorersResponse``.
+        """
+        size = DEFAULT_PAGE_SIZE if page_size is None else min(max(page_size, 1), MAX_PAGE_SIZE)
+        params: Dict[str, Any] = {"page": str(DEFAULT_PAGE if page is None else page), "page_size": str(size)}
+        resp = await self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict)
+        data = _unwrap(resp) if resp and isinstance(resp, dict) else None
+        if not isinstance(data, dict):
+            return None
+        try:
+            # Items are parsed inside the try so one malformed entry yields None instead of raising.
+            scorers = [Scorer(**s) if isinstance(s, dict) else s for s in data.get("scorers", [])]
+            count = data.get("count", len(scorers))
+            total_count = data.get("total_count", count)
+            return ScorersResponse(scorers=scorers, count=count, total_count=total_count)
+        except Exception:
+            return None
+
+    async def create(
+        self,
+        *,
+        name: str,
+        description: str,
+        model_id: str,
+        prompt: str,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> Optional[Scorer]:
+        """Create a scorer; reply keys are normalised to snake_case first, ``None`` if it cannot be parsed."""
+        body: Dict[str, Any] = {"name": name, "description": description, "model_id": model_id, "prompt": prompt}
+        data = _unwrap(await self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict))
+        try:
+            return Scorer(**_normalize_keys(data)) if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    async def update(
+        self,
+        id: str,
+        *,
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+        model_id: Optional[str] = None,
+        prompt: Optional[str] = None,
+        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Patch the fields that are not ``None``; ``True`` if the request completed, ``False`` if it raised."""
+        body: Dict[str, Any] = {}
+        if name is not None:
+            body["name"] = name
+        if description is not None:
+            body["description"] = description
+        if model_id is not None:
+            body["model_id"] = model_id
+        if prompt is not None:
+            body["prompt"] = prompt
+        try:
+            await self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict)
+        except Exception:
+            return False
+        return True
+
+    async def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool:
+        """Delete a scorer; ``True`` if the request completed, ``False`` if it raised."""
+        try:
+            await self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
+        except Exception:
+            return False
+        return True
diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
new file mode 100644
index 0000000..aee5547
--- /dev/null
+++ b/tests/cli/conftest.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import pytest
+from click.testing import CliRunner
+
+
+@pytest.fixture
+def runner():
+    """Click CLI test runner with stderr captured separately from stdout."""
+    try:
+        return CliRunner(mix_stderr=False)  # Click < 8.2
+    except TypeError:
+        # Click 8.2 removed ``mix_stderr``; stdout and stderr are always captured separately.
+        return CliRunner()
+
+
+@pytest.fixture
+def cli_env():
+    """Environment variables for CLI tests."""
+    return {"LAYERLENS_STRATIX_API_KEY": "test-key-123"}
diff --git a/tests/cli/test_client.py b/tests/cli/test_client.py
new file mode 100644
index 0000000..d6ffaf3
--- /dev/null
+++ b/tests/cli/test_client.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from unittest.mock import Mock
+
+import pytest
+
+from layerlens.cli._client import _is_uuid, resolve_model, resolve_benchmark
+
+
+class TestIsUuid:
+    """Test UUID detection: dashed 8-4-4-4-12 hex strings pass, other identifier shapes fail."""
+
+    def test_valid_uuid(self):
+        assert _is_uuid("550e8400-e29b-41d4-a716-446655440000") is True
+
+    def test_valid_uuid_uppercase(self):
+        assert _is_uuid("550E8400-E29B-41D4-A716-446655440000") is True  # same value, upper-case hex
+
+    def test_short_id(self):
+        assert _is_uuid("abc123") is False
+
+    def test_mongo_id(self):
+        assert _is_uuid("69805c582a3c129a75d168b8") is False  # 24 hex chars, no dashes
+
+    def test_empty(self):
+        assert _is_uuid("") is False
+
+    def test_model_key(self):
+        assert _is_uuid("openai/gpt-4o") is False  # provider/model key, not an id
+
+
+class TestResolveModel:
+    """Test model resolution by ID, key, or name."""
+
+    @pytest.fixture
+    def client(self):
+        c = Mock()
+        c.models = Mock()  # every models.* call returns a Mock unless a test overrides it
+        return c
+
+    def test_resolve_by_uuid(self, client):
+        """UUID-like identifier tries get_by_id first."""
+        model = Mock(id="550e8400-e29b-41d4-a716-446655440000")
+        client.models.get_by_id.return_value = model
+
+        result = resolve_model(client, "550e8400-e29b-41d4-a716-446655440000")
+
+        assert result is model
+        client.models.get_by_id.assert_called_once()  # pins the call count only, not the arguments
+
+    def test_resolve_by_key(self, client):
+        """Non-UUID identifier tries get_by_key."""
+        model = Mock(id="m-1")
+        client.models.get_by_key.return_value = model
+
+        result = resolve_model(client, "openai/gpt-4o")
+
+        assert result is model
+        client.models.get_by_key.assert_called_once_with("openai/gpt-4o")
+
+    def test_resolve_by_name(self, client):
+        """Falls back to name search when the key lookup returns None."""
+        model = Mock(id="m-1")
+        client.models.get_by_key.return_value = None
+        client.models.get.return_value = [model]  # name search yields a list; the first entry is expected back
+
+        result = resolve_model(client, "GPT-4")
+
+        assert result is model
+        client.models.get.assert_called_once_with(name="GPT-4")
+
+    def test_resolve_not_found(self, client):
+        """Returns None when both the key lookup and the name search return None."""
+        client.models.get_by_key.return_value = None
+        client.models.get.return_value = None
+
+        result = resolve_model(client, "nonexistent")
+
+        assert result is None
+
+    def test_resolve_uuid_fallback_to_key(self, client):
+        """UUID that fails get_by_id falls back to get_by_key."""
+        client.models.get_by_id.return_value = None
+        model = Mock(id="m-1")
+        client.models.get_by_key.return_value = model
+
+        result = resolve_model(client, "550e8400-e29b-41d4-a716-446655440000")
+
+        assert result is model
+
+
+class TestResolveBenchmark:
+    """Test benchmark resolution: lookup by key, and the not-found result."""
+
+    @pytest.fixture
+    def client(self):
+        c = Mock()
+        c.benchmarks = Mock()  # every benchmarks.* call returns a Mock unless a test overrides it
+        return c
+
+    def test_resolve_by_key(self, client):
+        bm = Mock(id="b-1")
+        client.benchmarks.get_by_key.return_value = bm
+
+        result = resolve_benchmark(client, "arc-agi-2")
+
+        assert result is bm
+
+    def test_resolve_not_found(self, client):
+        client.benchmarks.get_by_key.return_value = None
+        client.benchmarks.get.return_value = None
+
+        result = resolve_benchmark(client, "nonexistent")
+
+        assert result is None
diff --git a/tests/cli/test_commands.py b/tests/cli/test_commands.py
new file mode 100644
index 0000000..ec6c915
--- /dev/null
+++ b/tests/cli/test_commands.py
@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+import json
+from unittest.mock import Mock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from layerlens.cli._app import cli
+
+# Environment handed to every invocation so the CLI always sees an API key.
+ENV = {"LAYERLENS_STRATIX_API_KEY": "test"}
+
+
+@pytest.fixture
+def runner():
+    """CliRunner whose stderr is captured separately from stdout on any Click 8.x."""
+    try:
+        return CliRunner(mix_stderr=False)  # Click < 8.2
+    except TypeError:
+        # Click 8.2 removed ``mix_stderr``; the two streams are always captured separately.
+        return CliRunner()
+
+
+class TestTraceCommands:
+    """Test trace CLI commands."""
+
+    @pytest.fixture
+    def mock_traces(self):
+        """A single mocked trace whose ``model_dump`` mirrors its attributes."""
+        trace = Mock()
+        trace.id = "trace-123"
+        trace.created_at = "2026-01-01T00:00:00Z"
+        trace.filename = "test.jsonl"
+        trace.evaluations_count = 2
+        # Make to_dict work
+        trace.model_dump.return_value = {
+            "id": "trace-123",
+            "created_at": "2026-01-01T00:00:00Z",
+            "filename": "test.jsonl",
+            "evaluations_count": 2,
+        }
+        return trace
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_trace_list(self, mock_get_client, runner, mock_traces):
+        """trace list displays traces in table format."""
+        client = Mock()
+        resp = Mock()
+        resp.traces = [mock_traces]
+        resp.count = 1
+        resp.total_count = 1
+        client.traces.get_many.return_value = resp
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["trace", "list"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "trace-123" in result.output
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_trace_list_empty(self, mock_get_client, runner):
+        """trace list shows message when no traces found."""
+        client = Mock()
+        client.traces.get_many.return_value = Mock(traces=[])
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["trace", "list"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "No traces found" in result.output
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_trace_get(self, mock_get_client, runner, mock_traces):
+        """trace get displays a single trace."""
+        client = Mock()
+        client.traces.get.return_value = mock_traces
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["trace", "get", "trace-123"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "trace-123" in result.output
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_trace_get_not_found(self, mock_get_client, runner):
+        """trace get exits with error when trace not found."""
+        client = Mock()
+        client.traces.get.return_value = None
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["trace", "get", "nonexistent"], env=ENV)
+
+        assert result.exit_code != 0
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_trace_delete_confirms(self, mock_get_client, runner):
+        """trace delete prompts for confirmation and deletes once the user answers yes."""
+        client = Mock()
+        client.traces.delete.return_value = True
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["trace", "delete", "trace-123"], input="y\n", env=ENV)
+
+        assert result.exit_code == 0
+        client.traces.delete.assert_called_once()
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_trace_delete_skip_confirm(self, mock_get_client, runner):
+        """trace delete --yes skips confirmation."""
+        client = Mock()
+        client.traces.delete.return_value = True
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["trace", "delete", "trace-123", "--yes"], env=ENV)
+
+        assert result.exit_code == 0
+        client.traces.delete.assert_called_once()
+
+
+class TestJudgeCommands:
+    """Test judge CLI commands."""
+
+    @patch("layerlens.cli.commands.judge.get_client")
+    def test_judge_list(self, mock_get_client, runner):
+        """judge list displays judges."""
+        judge = Mock()
+        judge.model_dump.return_value = {
+            "id": "j-1",
+            "name": "Quality",
+            "version": 1,
+            "run_count": 5,
+            "created_at": "2026-01-01T00:00:00Z",
+        }
+        client = Mock()
+        resp = Mock()
+        resp.judges = [judge]
+        resp.count = 1
+        resp.total_count = 1
+        client.judges.get_many.return_value = resp
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["judge", "list"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "Quality" in result.output
+
+    @patch("layerlens.cli.commands.judge.get_client")
+    def test_judge_create(self, mock_get_client, runner):
+        """judge create creates and displays a judge."""
+        judge = Mock()
+        judge.id = "j-new"
+        judge.model_dump.return_value = {"id": "j-new", "name": "Test"}
+        client = Mock()
+        client.judges.create.return_value = judge
+        mock_get_client.return_value = client
+
+        result = runner.invoke(
+            cli,
+            ["judge", "create", "--name", "Test", "--goal", "Evaluate accuracy and completeness"],
+            env=ENV,
+        )
+
+        assert result.exit_code == 0
+        assert "j-new" in result.output
+
+    @patch("layerlens.cli.commands.judge.get_client")
+    def test_judge_test(self, mock_get_client, runner):
+        """judge test creates a trace evaluation."""
+        te = Mock()
+        te.id = "te-1"
+        te.model_dump.return_value = {"id": "te-1", "trace_id": "t-1", "judge_id": "j-1", "status": "pending"}
+        client = Mock()
+        client.trace_evaluations.create.return_value = te
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["judge", "test", "--judge-id", "j-1", "--trace-id", "t-1"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "te-1" in result.output
+
+
+class TestEvaluateCommands:
+    """Test evaluate CLI commands."""
+
+    @patch("layerlens.cli.commands.evaluate.get_client")
+    def test_evaluate_list(self, mock_get_client, runner):
+        """evaluate list displays evaluations."""
+        ev = Mock()
+        ev.model_dump.return_value = {
+            "id": "ev-1",
+            "status": "success",
+            "model_name": "GPT-4",
+            "benchmark_name": "MATH",
+            "accuracy": 0.95,
+            "submitted_at": 1700000000,
+        }
+        client = Mock()
+        resp = Mock()
+        resp.evaluations = [ev]
+        resp.pagination = Mock(page=1, total_pages=1, total_count=1)
+        client.evaluations.get_many.return_value = resp
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["evaluate", "list"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "GPT-4" in result.output
+
+
+class TestScorerCommands:
+    """Test scorer CLI commands."""
+
+    @patch("layerlens.cli.commands.scorer.get_client")
+    def test_scorer_list(self, mock_get_client, runner):
+        """scorer list displays scorers."""
+        scorer = Mock()
+        scorer.model_dump.return_value = {
+            "id": "s-1",
+            "name": "Quality",
+            "model_name": "GPT-4",
+            "model_company": "OpenAI",
+            "created_at": "2026-01-01",
+        }
+        client = Mock()
+        resp = Mock()
+        resp.scorers = [scorer]
+        resp.count = 1
+        resp.total_count = 1
+        client.scorers.get_many.return_value = resp
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["scorer", "list"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "Quality" in result.output
+
+    @patch("layerlens.cli.commands.scorer.get_client")
+    def test_scorer_create_dry_run(self, mock_get_client, runner):
+        """scorer create --dry-run previews without executing."""
+        result = runner.invoke(
+            cli,
+            [
+                "scorer",
+                "create",
+                "--name",
+                "Test",
+                "--description",
+                "A test scorer for quality",
+                "--model-id",
+                "m-1",
+                "--prompt",
+                "Rate quality",
+                "--dry-run",
+            ],
+            env=ENV,
+        )
+
+        assert result.exit_code == 0
+        assert "[dry-run]" in result.output
+        mock_get_client.assert_not_called()
+
+    @patch("layerlens.cli.commands.scorer.get_client")
+    def test_scorer_delete_yes(self, mock_get_client, runner):
+        """scorer delete --yes skips confirmation."""
+        client = Mock()
+        client.scorers.delete.return_value = True
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["scorer", "delete", "s-1", "--yes"], env=ENV)
+
+        assert result.exit_code == 0
+        client.scorers.delete.assert_called_once_with("s-1")
+
+
+class TestSpaceCommands:
+    """Test space CLI commands."""
+
+    @patch("layerlens.cli.commands.space.get_client")
+    def test_space_create_dry_run(self, mock_get_client, runner):
+        """space create --dry-run previews without executing."""
+        result = runner.invoke(cli, ["space", "create", "--name", "Test Space", "--dry-run"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "[dry-run]" in result.output
+        mock_get_client.assert_not_called()
+
+
+class TestBulkCommands:
+    """Test bulk CLI commands."""
+
+    @patch("layerlens.cli.commands.bulk.get_client")
+    def test_bulk_eval_file_dry_run(self, _mock_get_client, runner, tmp_path):
+        """bulk eval --file --dry-run previews jobs."""
+        jobs_file = tmp_path / "jobs.jsonl"  # tmp_path is removed by pytest; no manual unlink needed
+        jobs_file.write_text(
+            '{"model": "gpt-4", "benchmark": "mmlu"}\n{"model": "claude", "benchmark": "mmlu"}\n'
+        )
+
+        result = runner.invoke(cli, ["bulk", "eval", "--file", str(jobs_file), "--dry-run"], env=ENV)
+
+        assert result.exit_code == 0, f"stdout={result.output!r} stderr={result.stderr!r}"
+        assert "[dry-run]" in result.output
+        assert "2 evaluation(s)" in result.output
+
+    def test_bulk_eval_no_args(self, runner):
+        """bulk eval with no arguments shows error."""
+        result = runner.invoke(cli, ["bulk", "eval"], env=ENV)
+
+        assert result.exit_code != 0
+
+    @patch("layerlens.cli.commands.bulk.get_client")
+    def test_bulk_eval_judge_traces_dry_run(self, _mock_get_client, runner, tmp_path):
+        """bulk eval --judge-id --traces --dry-run previews trace evaluations."""
+        traces_file = tmp_path / "traces.txt"
+        traces_file.write_text("t-1\nt-2\nt-3\n")
+
+        result = runner.invoke(
+            cli,
+            ["bulk", "eval", "--judge-id", "j-1", "--traces", str(traces_file), "--dry-run"],
+            env=ENV,
+        )
+
+        assert result.exit_code == 0
+        assert "3 trace evaluation(s)" in result.output
+
+
+class TestCiCommands:
+    """Test ci CLI commands."""
+
+    def test_ci_report_dry_run(self, runner):
+        """ci report --dry-run previews."""
+        result = runner.invoke(cli, ["ci", "report", "--dry-run"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "[dry-run]" in result.output
+
+    @patch("layerlens.cli.commands.ci.get_client")
+    def test_ci_report_markdown(self, mock_get_client, runner):
+        """ci report generates markdown."""
+        ev = Mock()
+        ev.id = "ev-1"
+        ev.status = "success"
+        ev.model_name = "GPT-4"
+        ev.benchmark_name = "MATH"
+        ev.accuracy = 0.95
+        ev.model_dump.return_value = {"id": "ev-1", "status": "success"}
+
+        client = Mock()
+        resp = Mock()
+        resp.evaluations = [ev]
+        client.evaluations.get_many.return_value = resp
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["ci", "report"], env=ENV)
+
+        assert result.exit_code == 0
+        assert "# Stratix Evaluation Report" in result.output
+        assert "GPT-4" in result.output
+
+    @patch("layerlens.cli.commands.ci.get_client")
+    def test_ci_report_to_file(self, mock_get_client, runner, tmp_path):
+        """ci report --output writes to file."""
+        ev = Mock()
+        ev.id = "ev-1"
+        ev.status = "success"
+        ev.model_name = "GPT-4"
+        ev.benchmark_name = "MATH"
+        ev.accuracy = 0.95
+        ev.model_dump.return_value = {"id": "ev-1"}
+
+        client = Mock()
+        resp = Mock()
+        resp.evaluations = [ev]
+        client.evaluations.get_many.return_value = resp
+        mock_get_client.return_value = client
+
+        out_file = tmp_path / "report.md"
+        result = runner.invoke(cli, ["ci", "report", "-o", str(out_file)], env=ENV)
+
+        assert result.exit_code == 0
+        assert out_file.exists()
+        content = out_file.read_text()
+        assert "Stratix Evaluation Report" in content
+
+
+class TestGlobalOptions:
+    """Test global CLI options."""
+
+    def test_version(self, runner):
+        """--version prints version."""
+        result = runner.invoke(cli, ["--version"])
+        assert result.exit_code == 0
+        assert "layerlens" in result.output
+
+    def test_help(self, runner):
+        """--help shows all command groups."""
+        result = runner.invoke(cli, ["--help"])
+        assert result.exit_code == 0
+        for cmd in ["trace", "judge", "evaluate", "integration", "scorer", "space", "bulk", "ci"]:
+            assert cmd in result.output
+
+    @patch("layerlens.cli.commands.trace.get_client")
+    def test_json_format(self, mock_get_client, runner):
+        """--format json outputs JSON."""
+        trace = Mock()
+        trace.model_dump.return_value = {"id": "t-1", "filename": "test.json"}
+        client = Mock()
+        client.traces.get.return_value = trace
+        mock_get_client.return_value = client
+
+        result = runner.invoke(cli, ["--format", "json", "trace", "get", "t-1"], env=ENV)
+
+        assert result.exit_code == 0
+        # Parse stdout only: on Click >= 8.2 ``result.output`` also interleaves stderr.
+        parsed = json.loads(result.stdout)
+        assert parsed["id"] == "t-1"
+
+    def test_quiet_flag(self, runner):
+        """--quiet suppresses banner."""
+        result = runner.invoke(cli, ["-q", "--help"])
+        assert result.exit_code == 0
+        # Banner goes to stderr; with -q it should be empty
+        assert "STRATIX" not in result.stderr
diff --git a/tests/cli/test_formatter.py b/tests/cli/test_formatter.py
new file mode 100644
index 0000000..a8ea441
--- /dev/null
+++ b/tests/cli/test_formatter.py
@@ -0,0 +1,191 @@
+from __future__ import annotations
+
+import json
+
+import pytest
+from pydantic import BaseModel
+
+from layerlens.cli._formatter import (
+    to_dict,
+    _truncate,
+    format_table,
+    _format_value,
+    format_output,
+    format_single,
+)
+
+
+class SampleModel(BaseModel):  # minimal pydantic model used as formatter input in the tests below
+    id: str
+    name: str
+    score: float = 0.0  # optional; tests that omit it rely on this default
+
+
+class TestToDict:
+    """Test to_dict conversion for various input types."""
+
+    def test_pydantic_v2_model(self):
+        """Pydantic model with model_dump is converted to a plain dict of its fields."""
+        m = SampleModel(id="1", name="test", score=0.5)
+        result = to_dict(m)
+        assert result == {"id": "1", "name": "test", "score": 0.5}
+
+    def test_dict_passthrough(self):
+        """Dict input is returned as-is."""
+        d = {"key": "value"}
+        assert to_dict(d) is d  # identity, not just equality: no copy is made
+
+    def test_other_type_passthrough(self):
+        """Non-model, non-dict input is returned as-is."""
+        assert to_dict("hello") == "hello"
+        assert to_dict(42) == 42
+
+
+class TestFormatValue:
+    """Test _format_value display conversion (every result is a string)."""
+
+    def test_none(self):
+        assert _format_value(None) == "-"
+
+    def test_bool_true(self):
+        assert _format_value(True) == "Yes"
+
+    def test_bool_false(self):
+        assert _format_value(False) == "No"
+
+    def test_float(self):
+        assert _format_value(3.14159) == "3.1416"  # rounded to four decimal places
+
+    def test_dict(self):
+        result = _format_value({"a": 1})
+        assert json.loads(result) == {"a": 1}  # rendered as JSON text that round-trips
+
+    def test_list(self):
+        result = _format_value([1, 2])
+        assert json.loads(result) == [1, 2]  # rendered as JSON text that round-trips
+
+    def test_string(self):
+        assert _format_value("hello") == "hello"
+
+    def test_int(self):
+        assert _format_value(42) == "42"
+
+
+class TestTruncate:
+    """Test _truncate string truncation."""
+
+    def test_short_string(self):
+        assert _truncate("abc", 10) == "abc"
+
+    def test_exact_width(self):
+        assert _truncate("abcde", 5) == "abcde"  # length equal to the width is left untouched
+
+    def test_long_string(self):
+        result = _truncate("abcdefgh", 5)
+        assert len(result) == 5  # the ellipsis counts toward the width
+        assert result.endswith("\u2026")  # U+2026 HORIZONTAL ELLIPSIS, a single character
+
+    def test_single_char_width(self):
+        result = _truncate("abcdef", 1)
+        assert result == "\u2026"  # width 1 leaves room for the ellipsis only
+
+
+class TestFormatTable:
+    """Test format_table rendering."""
+
+    @pytest.fixture
+    def columns(self):
+        return [("id", "ID"), ("name", "Name")]  # (item key, header label) pairs
+
+    def test_empty_list(self, columns):
+        result = format_table([], columns)
+        assert result == "No results found."
+
+    def test_single_row(self, columns):
+        items = [{"id": "1", "name": "Alice"}]
+        result = format_table(items, columns)
+        lines = result.split("\n")
+        assert len(lines) == 3  # header, separator, row
+        assert "ID" in lines[0]
+        assert "Name" in lines[0]
+        assert "1" in lines[2]
+        assert "Alice" in lines[2]
+
+    def test_multiple_rows(self, columns):
+        items = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
+        result = format_table(items, columns)
+        lines = result.split("\n")
+        assert len(lines) == 4  # header, separator, 2 rows
+
+    def test_pydantic_models(self, columns):
+        items = [SampleModel(id="1", name="Test")]
+        result = format_table(items, columns)
+        assert "Test" in result
+
+    def test_column_width_adapts(self):
+        columns = [("val", "V")]
+        items = [{"val": "short"}, {"val": "a much longer value here"}]
+        result = format_table(items, columns)
+        lines = result.split("\n")
+        # Header should be at least as wide as longest value
+        assert len(lines[0]) >= len("a much longer value here")
+
+    def test_truncation_at_max_width(self):
+        columns = [("val", "V")]
+        items = [{"val": "x" * 100}]
+        result = format_table(items, columns, max_col_width=20)
+        data_line = result.split("\n")[2]  # index 2 is the first data row (0 header, 1 separator)
+        assert len(data_line.strip()) <= 20
+
+
+class TestFormatOutput:
+    """Test format_output dispatch on the format name ("json" or "table") and list vs single item."""
+
+    @pytest.fixture
+    def columns(self):
+        return [("id", "ID"), ("name", "Name")]  # (item key, header label) pairs
+
+    def test_json_format_list(self, columns):
+        items = [{"id": "1", "name": "A"}]
+        result = format_output(items, "json", columns)
+        parsed = json.loads(result)
+        assert isinstance(parsed, list)
+        assert parsed[0]["id"] == "1"
+
+    def test_json_format_single(self):
+        item = {"id": "1", "name": "A"}
+        result = format_output(item, "json")  # columns omitted for a single item
+        parsed = json.loads(result)
+        assert parsed["id"] == "1"
+
+    def test_table_format_list(self, columns):
+        items = [{"id": "1", "name": "A"}]
+        result = format_output(items, "table", columns)
+        assert "ID" in result
+        assert "Name" in result
+
+    def test_table_format_single(self):
+        item = {"id": "1", "name": "Test"}
+        result = format_output(item, "table")  # columns omitted for a single item
+        assert "Test" in result
+
+
+class TestFormatSingle:
+    """Test format_single key-value rendering."""
+
+    def test_dict_input(self):
+        result = format_single({"name": "Alice", "age": 30})
+        assert "Name" in result  # keys are shown capitalised
+        assert "Alice" in result
+        assert "Age" in result
+        assert "30" in result
+
+    def test_pydantic_model(self):
+        m = SampleModel(id="1", name="Test", score=0.5)
+        result = format_single(m)
+        assert "Id" in result  # field name "id" is shown capitalised
+        assert "1" in result
+
+    def test_non_dict(self):
+        result = format_single("just a string")
+        assert result == "just a string"  # non-dict, non-model input comes back unchanged
diff --git a/tests/resources/test_benchmarks.py b/tests/resources/test_benchmarks.py
index 9e03e22..53843c0 100644
--- a/tests/resources/test_benchmarks.py
+++ b/tests/resources/test_benchmarks.py
@@ -976,3 +976,179 @@ def test_upload_file_sends_correct_upload_request(self, mock_put, benchmarks_res
         assert body["filename"] == "data.jsonl"
         assert "type" in body
         assert "size" in body
+
+
+class TestBenchmarksClientSideFiltering:
+    """Test client-side filtering for benchmarks (fixes API not filtering custom objects)."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def benchmarks_resource(self, mock_client):
+        return Benchmarks(mock_client)
+
+    @pytest.fixture
+    def public_reasoning(self):
+        return PublicBenchmark(
+            id="pub-1",
+            key="mmlu",
+            name="MMLU",
+            language="english",
+            categories=["reasoning", "knowledge"],
+        )
+
+    @pytest.fixture
+    def public_coding(self):
+        return PublicBenchmark(
+            id="pub-2",
+            key="humaneval",
+            name="HumanEval",
+            language="english",
+            categories=["coding"],
+        )
+
+    @pytest.fixture
+    def public_french(self):
+        return PublicBenchmark(
+            id="pub-3",
+            key="french-bench",
+            name="French Bench",
+            language="french",
+            categories=["reasoning"],
+        )
+
+    @pytest.fixture
+    def custom_bench(self):
+        return CustomBenchmark(
+            id="custom-1",
+            key="my-bench",
+            name="My Custom Benchmark",
+        )
+
+    def _mock_responses(self, resource, custom_list, public_list):
+        """Route ``_get`` calls: params type "custom" gets custom_list, any other call gets public_list."""
+        custom_resp = BenchmarksResponse(data=BenchmarksResponse.Data(benchmarks=custom_list))
+        public_resp = BenchmarksResponse(data=BenchmarksResponse.Data(benchmarks=public_list))
+        resource._get.side_effect = lambda *_, **kwargs: (
+            custom_resp if kwargs.get("params", {}).get("type") == "custom" else public_resp
+        )
+
+    def test_filter_by_categories_excludes_custom(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_reasoning,
+        public_coding,
+    ):
+        """Filtering by categories excludes custom benchmarks (they have no categories)."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_reasoning, public_coding])
+
+        result = benchmarks_resource.get(categories=["reasoning"])
+
+        assert len(result) == 1
+        assert result[0].key == "mmlu"
+        assert isinstance(result[0], PublicBenchmark)
+
+    def test_filter_by_categories_no_match_returns_empty(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_coding,
+    ):
+        """Filtering by a category that no benchmark matches returns empty list."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_coding])
+
+        result = benchmarks_resource.get(categories=["math"])
+
+        assert result == []
+
+    def test_filter_by_languages_excludes_custom(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_reasoning,
+        public_french,
+    ):
+        """Filtering by language excludes custom benchmarks (they have no language)."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_reasoning, public_french])
+
+        result = benchmarks_resource.get(languages=["french"])
+
+        assert len(result) == 1
+        assert result[0].key == "french-bench"
+
+    def test_filter_by_languages_no_match_returns_empty(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_reasoning,
+    ):
+        """Filtering by a language that no benchmark matches returns empty list."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_reasoning])
+
+        result = benchmarks_resource.get(languages=["spanish"])
+
+        assert result == []
+
+    def test_filter_by_key_sends_param_to_api(
+        self,
+        benchmarks_resource,
+        public_reasoning,
+    ):
+        """Filtering by key sends the key param to the API."""
+        self._mock_responses(benchmarks_resource, [], [public_reasoning])
+
+        benchmarks_resource.get(key="mmlu")
+        # Guard against a vacuous pass: the loop below asserts nothing if no request was made.
+        assert benchmarks_resource._get.call_args_list
+        for c in benchmarks_resource._get.call_args_list:
+            assert c.kwargs["params"]["key"] == "mmlu"
+
+    def test_combined_filters_categories_and_languages(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_reasoning,
+        public_french,
+    ):
+        """Multiple filters are applied together (AND logic)."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_reasoning, public_french])
+
+        result = benchmarks_resource.get(categories=["reasoning"], languages=["french"])
+
+        assert len(result) == 1
+        assert result[0].key == "french-bench"
+
+    def test_no_filters_returns_all(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_reasoning,
+        public_coding,
+    ):
+        """When no filters are applied, all benchmarks are returned."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_reasoning, public_coding])
+
+        result = benchmarks_resource.get()
+
+        assert len(result) == 3
+
+    def test_filter_case_insensitive(
+        self,
+        benchmarks_resource,
+        custom_bench,
+        public_reasoning,
+    ):
+        """Filters are case-insensitive."""
+        self._mock_responses(benchmarks_resource, [custom_bench], [public_reasoning])
+
+        result = benchmarks_resource.get(categories=["REASONING"])
+
+        assert len(result) == 1
+        assert result[0].key == "mmlu"
diff --git a/tests/resources/test_evaluation_spaces.py b/tests/resources/test_evaluation_spaces.py
new file mode 100644
index 0000000..8be2f16
--- /dev/null
+++ b/tests/resources/test_evaluation_spaces.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from unittest.mock import Mock
+
+import pytest
+
+from layerlens.models.evaluation_space import EvaluationSpace
+from layerlens.resources.evaluation_spaces.evaluation_spaces import EvaluationSpaces
+
+
+class TestEvaluationSpaces:
+    """Unit tests for the EvaluationSpaces resource, driven through a Mock client."""
+
+    @pytest.fixture
+    def mock_client(self):
+        stub = Mock()
+        stub.organization_id = "org-123"
+        stub.project_id = "proj-456"
+        stub.get_cast = Mock()
+        stub.post_cast = Mock()
+        stub.put_cast = Mock()
+        stub.delete_cast = Mock()
+        return stub
+
+    @pytest.fixture
+    def spaces_resource(self, mock_client):
+        return EvaluationSpaces(mock_client)
+
+    @pytest.fixture
+    def sample_space_data(self):
+        return {
+            "id": "sp-123",
+            "organization_id": "org-123",
+            "project_id": "proj-456",
+            "name": "Q1 Comparison",
+            "description": "Compare models for Q1",
+            "visibility": "private",
+            "owner": "admin@test.com",
+            "created_at": "2026-01-01T00:00:00Z",
+        }
+
+    def test_base_url(self, spaces_resource):
+        """_base_url() is scoped to the client's organization and project ids."""
+        assert spaces_resource._base_url() == "/organizations/org-123/projects/proj-456/evaluation-spaces"
+
+    def test_get_success(self, spaces_resource, sample_space_data):
+        """get() turns a success envelope into an EvaluationSpace."""
+        envelope = {"status": "success", "data": sample_space_data}
+        spaces_resource._get.return_value = envelope
+        space = spaces_resource.get("sp-123")
+
+        assert isinstance(space, EvaluationSpace)
+        assert space.name == "Q1 Comparison"
+
+    def test_get_not_found(self, spaces_resource):
+        """get() yields None when the underlying _get call returns None."""
+        spaces_resource._get.return_value = None
+
+        missing = spaces_resource.get("nonexistent")
+
+        assert missing is None
+
+    def test_get_many_success(self, spaces_resource, sample_space_data):
+        """get_many() exposes the listed spaces through .evaluation_spaces."""
+        listing = {"evaluation_spaces": [sample_space_data], "count": 1, "total_count": 1}
+        envelope = {"status": "success", "data": listing}
+        spaces_resource._get.return_value = envelope
+
+        page = spaces_resource.get_many()
+
+        assert page is not None
+        spaces = page.evaluation_spaces
+        assert len(spaces) == 1
+        assert spaces[0].name == "Q1 Comparison"
+
+    def test_get_many_pagination(self, spaces_resource):
+        """get_many() forwards page, page_size, sort_by and order; page values arrive as strings."""
+        empty_listing = {"evaluation_spaces": [], "count": 0, "total_count": 0}
+        spaces_resource._get.return_value = {
+            "status": "success",
+            "data": empty_listing,
+        }
+        spaces_resource.get_many(page=2, page_size=10, sort_by="created_at", order="desc")
+
+        sent_params = spaces_resource._get.call_args.kwargs["params"]
+        assert sent_params["page"] == "2"
+        assert sent_params["page_size"] == "10"
+        assert sent_params["sort_by"] == "created_at"
+        assert sent_params["order"] == "desc"
+
+    def test_create_success(self, spaces_resource, sample_space_data):
+        """create() returns the space described by the success envelope."""
+        envelope = {"status": "success", "data": sample_space_data}
+        spaces_resource._post.return_value = envelope
+        created = spaces_resource.create(name="Q1 Comparison", description="Compare models for Q1")
+
+        assert created is not None
+        assert created.name == "Q1 Comparison"
+
+    def test_create_request_body(self, spaces_resource):
+        """create() places name, description and visibility in the POST body."""
+        spaces_resource._post.return_value = {"status": "success", "data": {"name": "Test"}}
+
+        spaces_resource.create(name="Test", description="Desc", visibility="public")
+
+        posted = spaces_resource._post.call_args.kwargs["body"]
+        assert posted["name"] == "Test"
+        assert posted["description"] == "Desc"
+        assert posted["visibility"] == "public"
+
+    def test_delete_success(self, spaces_resource):
+        """delete() reports True when the underlying call returns normally."""
+        spaces_resource._delete.return_value = {}
+
+        deleted = spaces_resource.delete("sp-123")
+
+        assert deleted is True
+
+    def test_delete_failure(self, spaces_resource):
+        """delete() reports False when the underlying call raises."""
+        spaces_resource._delete.side_effect = Exception("error")
+
+        deleted = spaces_resource.delete("sp-123")
+
+        assert deleted is False
diff --git a/tests/resources/test_integrations.py b/tests/resources/test_integrations.py
new file mode 100644
index 0000000..c371a8c
--- /dev/null
+++ b/tests/resources/test_integrations.py
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+from unittest.mock import Mock
+
+import pytest
+
+from layerlens._constants import DEFAULT_TIMEOUT
+from layerlens.models.integration import Integration
+from layerlens.resources.integrations.integrations import Integrations
+
+
+class TestIntegrations:
+    """Unit tests for the Integrations resource, driven through a Mock client."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        client.post_cast = Mock()
+        client.delete_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def integrations_resource(self, mock_client):
+        return Integrations(mock_client)
+
+    @pytest.fixture
+    def sample_integration_data(self):
+        return {
+            "id": "int-123",
+            "organization_id": "org-123",
+            "project_id": "proj-456",
+            "name": "Langfuse Prod",
+            "type": "langfuse",
+            "status": "active",
+            "created_at": "2026-01-01T00:00:00Z",
+        }
+
+    def test_base_url_org_level(self, integrations_resource):
+        """Base URL is at organization level (no project_id)."""
+        url = integrations_resource._base_url()
+        assert url == "/organizations/org-123/integrations"
+        assert "project" not in url
+
+    def test_get_success(self, integrations_resource, sample_integration_data):
+        """get returns Integration on success (bare, un-enveloped payload)."""
+        integrations_resource._get.return_value = sample_integration_data
+
+        result = integrations_resource.get("int-123")
+
+        assert isinstance(result, Integration)
+        assert result.id == "int-123"
+        assert result.name == "Langfuse Prod"
+
+    def test_get_with_envelope(self, integrations_resource, sample_integration_data):
+        """get handles {status, data} envelope and returns the wrapped record."""
+        integrations_resource._get.return_value = {"status": "success", "data": sample_integration_data}
+
+        result = integrations_resource.get("int-123")
+        assert isinstance(result, Integration)
+        assert result.id == "int-123"
+
+    def test_get_not_found(self, integrations_resource):
+        """get returns None when not found."""
+        integrations_resource._get.return_value = None
+
+        result = integrations_resource.get("nonexistent")
+
+        assert result is None
+
+    def test_get_many_success(self, integrations_resource, sample_integration_data):
+        """get_many returns IntegrationsResponse."""
+        integrations_resource._get.return_value = {
+            "status": "success",
+            "data": {"integrations": [sample_integration_data], "count": 1, "total_count": 1},
+        }
+
+        result = integrations_resource.get_many()
+
+        assert result is not None
+        assert len(result.integrations) == 1
+        assert result.integrations[0].name == "Langfuse Prod"
+        assert result.count == 1
+
+    def test_get_many_empty(self, integrations_resource):
+        """get_many returns empty list."""
+        integrations_resource._get.return_value = {
+            "status": "success",
+            "data": {"integrations": [], "count": 0, "total_count": 0},
+        }
+
+        result = integrations_resource.get_many()
+
+        assert result is not None
+        assert len(result.integrations) == 0
+
+    def test_get_many_pagination(self, integrations_resource, sample_integration_data):
+        """get_many passes pagination parameters (as strings) to the org-level URL."""
+        integrations_resource._get.return_value = {
+            "status": "success",
+            "data": {"integrations": [sample_integration_data], "count": 1, "total_count": 10},
+        }
+
+        integrations_resource.get_many(page=2, page_size=5)
+
+        integrations_resource._get.assert_called_once_with(
+            "/organizations/org-123/integrations",
+            params={"page": "2", "page_size": "5"},
+            timeout=DEFAULT_TIMEOUT,
+            cast_to=dict,
+        )
+
+    def test_test_integration_success(self, integrations_resource):
+        """test returns TestIntegrationResponse."""
+        integrations_resource._post.return_value = {
+            "status": "success",
+            "data": {"success": True, "message": "Connection OK"},
+        }
+
+        result = integrations_resource.test("int-123")
+
+        assert result is not None
+        assert result.success is True
+        assert result.message == "Connection OK"
+
+    def test_test_integration_failure(self, integrations_resource):
+        """test returns failure result, including the failure message."""
+        integrations_resource._post.return_value = {
+            "status": "success",
+            "data": {"success": False, "message": "Connection refused"},
+        }
+
+        result = integrations_resource.test("int-123")
+        assert result is not None
+        assert result.success is False
+        assert result.message == "Connection refused"
diff --git a/tests/resources/test_models_resource.py b/tests/resources/test_models_resource.py
index 4852a00..085177b 100644
--- a/tests/resources/test_models_resource.py
+++ b/tests/resources/test_models_resource.py
@@ -843,3 +843,242 @@ def test_create_custom_returns_none_on_error_envelope(self, models_resource):
         )
 
         assert result is None
+
+
+class TestModelsClientSideFiltering:
+    """Test client-side filtering for models (fixes API not filtering custom objects)."""
+
+    @pytest.fixture
+    def mock_client(self):
+        client = Mock()
+        client.organization_id = "org-123"
+        client.project_id = "proj-456"
+        client.get_cast = Mock()
+        return client
+
+    @pytest.fixture
+    def models_resource(self, mock_client):
+        return Models(mock_client)
+
+    @pytest.fixture
+    def public_openai(self):
+        return PublicModel(
+            id="pub-1",
+            key="gpt-4",
+            name="GPT-4",
+            description="OpenAI model",
+            company="OpenAI",
+            architecture_type="Transformer",
+            open_weights=False,
+            region="us-east-1",
+            license="proprietary",
+        )
+
+    @pytest.fixture
+    def public_meta(self):
+        return PublicModel(
+            id="pub-2",
+            key="llama-3",
+            name="Llama 3",
+            description="Meta model",
+            company="Meta",
+            architecture_type="Transformer",
+            open_weights=True,
+            region="us-west-2",
+            license="llama-community",
+        )
+
+    @pytest.fixture
+    def public_mistral(self):
+        return PublicModel(
+            id="pub-3",
+            key="mixtral",
+            name="Mixtral",
+            description="Mistral MoE",
+            company="Mistral",
+            architecture_type="MoE",
+            open_weights=True,
+            region="eu-west-1",
+            license="apache-2.0",
+        )
+
+    @pytest.fixture
+    def custom_model(self):
+        return CustomModel(
+            id="custom-1",
+            key="my-model",
+            name="My Custom Model",
+            description="Custom",
+            max_tokens=4096,
+        )
+
+    def _mock_responses(self, resource, custom_list, public_list):
+        """Helper: make _get return custom_list when params type is "custom", else public_list."""
+        custom_resp = ModelsResponse(data=ModelsResponse.Data(models=custom_list))
+        public_resp = ModelsResponse(data=ModelsResponse.Data(models=public_list))
+        resource._get.side_effect = lambda *_, **kwargs: (
+            custom_resp if kwargs.get("params", {}).get("type") == "custom" else public_resp
+        )
+
+    def test_filter_by_companies_excludes_custom(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_meta,
+    ):
+        """Filtering by company excludes custom models (they have no company)."""
+        self._mock_responses(models_resource, [custom_model], [public_openai, public_meta])
+
+        result = models_resource.get(companies=["OpenAI"])
+
+        assert len(result) == 1
+        assert result[0].key == "gpt-4"
+
+    def test_filter_by_companies_no_match_returns_empty(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+    ):
+        """Filtering by a company with no match returns empty list."""
+        self._mock_responses(models_resource, [custom_model], [public_openai])
+
+        result = models_resource.get(companies=["Google"])
+
+        assert result == []
+
+    def test_filter_by_regions_excludes_custom(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_mistral,
+    ):
+        """Filtering by region excludes custom models."""
+        self._mock_responses(models_resource, [custom_model], [public_openai, public_mistral])
+
+        result = models_resource.get(regions=["eu-west-1"])
+
+        assert len(result) == 1
+        assert result[0].key == "mixtral"
+
+    def test_filter_by_licenses_excludes_custom(
+        self,
+        models_resource,
+        custom_model,
+        public_meta,
+        public_mistral,
+    ):
+        """Filtering by license excludes custom models."""
+        self._mock_responses(models_resource, [custom_model], [public_meta, public_mistral])
+
+        result = models_resource.get(licenses=["apache-2.0"])
+
+        assert len(result) == 1
+        assert result[0].key == "mixtral"
+
+    def test_filter_by_categories_open_source(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_meta,
+    ):
+        """Filtering by 'Open-Source' category matches models with open_weights=True."""
+        self._mock_responses(models_resource, [custom_model], [public_openai, public_meta])
+
+        result = models_resource.get(categories=["Open-Source"])
+
+        assert len(result) == 1
+        assert result[0].key == "llama-3"
+
+    def test_filter_by_categories_closed_source(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_meta,
+    ):
+        """Filtering by 'Closed-Source' matches models with open_weights=False."""
+        self._mock_responses(models_resource, [custom_model], [public_openai, public_meta])
+
+        result = models_resource.get(categories=["Closed-Source"])
+
+        assert len(result) == 1
+        assert result[0].key == "gpt-4"
+
+    def test_filter_by_categories_architecture(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_mistral,
+    ):
+        """Filtering by architecture type category (MoE) works."""
+        self._mock_responses(models_resource, [custom_model], [public_openai, public_mistral])
+
+        result = models_resource.get(categories=["MoE"])
+
+        assert len(result) == 1
+        assert result[0].key == "mixtral"
+
+    def test_filter_by_key_sends_param_to_api(
+        self,
+        models_resource,
+        public_openai,
+    ):
+        """Filtering by key sends the key param on every API call (and at least one is made)."""
+        self._mock_responses(models_resource, [], [public_openai])
+        models_resource.get(key="gpt")
+        calls = models_resource._get.call_args_list
+        assert calls, "no API call recorded; the per-call check below would pass vacuously"
+        for c in calls:
+            assert c.kwargs["params"]["key"] == "gpt"
+
+    def test_combined_filters(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_meta,
+        public_mistral,
+    ):
+        """Multiple filters work together (AND logic)."""
+        self._mock_responses(
+            models_resource,
+            [custom_model],
+            [public_openai, public_meta, public_mistral],
+        )
+
+        result = models_resource.get(categories=["Open-Source"], regions=["eu-west-1"])
+
+        assert len(result) == 1
+        assert result[0].key == "mixtral"
+
+    def test_no_filters_returns_all(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+        public_meta,
+    ):
+        """When no filters are applied, all models are returned."""
+        self._mock_responses(models_resource, [custom_model], [public_openai, public_meta])
+
+        result = models_resource.get()
+
+        assert len(result) == 3
+
+    def test_filter_excludes_all_when_no_match(
+        self,
+        models_resource,
+        custom_model,
+        public_openai,
+    ):
+        """Filtering with no matches returns empty list, including no custom models."""
+        self._mock_responses(models_resource, [custom_model], [public_openai])
+
+        result = models_resource.get(regions=["ap-southeast-1"])
+
+        assert result == []
diff --git a/tests/resources/test_scorers.py b/tests/resources/test_scorers.py
new file mode 100644
index 0000000..c1d4b61
--- /dev/null
+++ b/tests/resources/test_scorers.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+
+from unittest.mock import Mock
+
+import pytest
+
+from layerlens._constants import DEFAULT_TIMEOUT
+from layerlens.models.scorer import Scorer
+from layerlens.resources.scorers.scorers import Scorers, _normalize_keys, _pascal_to_snake
+
+
+class TestPascalToSnake:
+    """Test PascalCase to snake_case conversion, one identifier shape per test."""
+
+    def test_simple(self):
+        assert _pascal_to_snake("Name") == "name"
+
+    def test_two_words(self):
+        assert _pascal_to_snake("ModelId") == "model_id"
+
+    def test_three_words(self):
+        assert _pascal_to_snake("ModelCompanyName") == "model_company_name"
+
+    def test_consecutive_caps(self):
+        assert _pascal_to_snake("ModelID") == "model_id"
+
+    def test_already_snake(self):
+        assert _pascal_to_snake("model_id") == "model_id"
+
+    def test_single_char(self):
+        assert _pascal_to_snake("A") == "a"
+
+
+class TestNormalizeKeys:
+    """Checks for _normalize_keys on PascalCase, snake_case and empty dicts."""
+
+    def test_pascal_keys(self):
+        pascal = {"Name": "test", "ModelId": "m-1", "ModelCompany": "OpenAI"}
+        normalized = _normalize_keys(pascal)
+        assert normalized["name"] == "test"
+        assert normalized["model_id"] == "m-1"
+        assert normalized["model_company"] == "OpenAI"
+
+    def test_snake_keys_passthrough(self):
+        snake = {"name": "test", "model_id": "m-1"}
+        returned = _normalize_keys(snake)
+        assert returned is snake  # identity: the input dict itself comes back, not a copy
+
+    def test_empty_dict(self):
+        empty = {}
+        assert _normalize_keys(empty) == {}
+
+
+class TestScorers:
+    """Unit tests for the Scorers resource, driven through a Mock client."""
+
+    @pytest.fixture
+    def mock_client(self):
+        stub = Mock()
+        stub.organization_id = "org-123"
+        stub.project_id = "proj-456"
+        stub.get_cast = Mock()
+        stub.post_cast = Mock()
+        stub.patch_cast = Mock()
+        stub.delete_cast = Mock()
+        return stub
+
+    @pytest.fixture
+    def scorers_resource(self, mock_client):
+        return Scorers(mock_client)
+
+    @pytest.fixture
+    def sample_scorer_data(self):
+        return {
+            "id": "s-123",
+            "organization_id": "org-123",
+            "project_id": "proj-456",
+            "name": "Quality Scorer",
+            "description": "Rates quality",
+            "model_id": "m-1",
+            "model_name": "GPT-4",
+            "model_key": "openai/gpt-4",
+            "model_company": "OpenAI",
+            "prompt": "Rate quality",
+            "created_at": "2026-01-01T00:00:00Z",
+            "updated_at": "2026-01-01T00:00:00Z",
+        }
+
+    def test_base_url(self, scorers_resource):
+        """_base_url() is scoped to the client's organization and project ids."""
+        assert scorers_resource._base_url() == "/organizations/org-123/projects/proj-456/scorers"
+
+    def test_get_success(self, scorers_resource, sample_scorer_data):
+        """get() builds a Scorer from a bare (un-enveloped) payload."""
+        scorers_resource._get.return_value = sample_scorer_data
+
+        scorer = scorers_resource.get("s-123")
+
+        assert isinstance(scorer, Scorer)
+        assert scorer.name == "Quality Scorer"
+
+    def test_get_with_envelope(self, scorers_resource, sample_scorer_data):
+        """get() also accepts the payload wrapped in a {status, data} envelope."""
+        envelope = {"status": "success", "data": sample_scorer_data}
+        scorers_resource._get.return_value = envelope
+        scorer = scorers_resource.get("s-123")
+
+        assert isinstance(scorer, Scorer)
+        assert scorer.id == "s-123"
+
+    def test_get_not_found(self, scorers_resource):
+        """get() yields None when the underlying _get call returns None."""
+        scorers_resource._get.return_value = None
+
+        missing = scorers_resource.get("nonexistent")
+
+        assert missing is None
+
+    def test_get_many_success(self, scorers_resource, sample_scorer_data):
+        """get_many() exposes the listed scorers through .scorers."""
+        listing = {"scorers": [sample_scorer_data], "count": 1, "total_count": 1}
+        envelope = {"status": "success", "data": listing}
+        scorers_resource._get.return_value = envelope
+
+        page = scorers_resource.get_many()
+
+        assert page is not None
+        scorers = page.scorers
+        assert len(scorers) == 1
+        assert scorers[0].name == "Quality Scorer"
+
+    def test_get_many_empty(self, scorers_resource):
+        """get_many() returns a response with an empty .scorers list when none exist."""
+        empty_listing = {"scorers": [], "count": 0, "total_count": 0}
+        scorers_resource._get.return_value = {
+            "status": "success",
+            "data": empty_listing,
+        }
+        page = scorers_resource.get_many()
+
+        assert page is not None
+        assert len(page.scorers) == 0
+
+    def test_create_with_pascal_response(self, scorers_resource):
+        """create() copes with a response whose data keys are PascalCase."""
+        pascal_payload = {
+            "Name": "New Scorer",
+            "Description": "Desc",
+            "ModelID": "m-1",
+            "ModelName": "GPT-4",
+            "ModelCompany": "OpenAI",
+            "ModelKey": "openai/gpt-4",
+            "Prompt": "Rate it",
+            "CreatedAt": "2026-01-01",
+            "UpdatedAt": "2026-01-01",
+        }
+        scorers_resource._post.return_value = {
+            "status": "success",
+            "data": pascal_payload,
+        }
+        created = scorers_resource.create(name="New Scorer", description="Desc", model_id="m-1", prompt="Rate it")
+
+        assert created is not None
+        assert created.name == "New Scorer"
+        assert created.model_name == "GPT-4"
+
+    def test_create_request_parameters(self, scorers_resource):
+        """create() POSTs name, description, model_id and prompt to the scorers URL."""
+        scorers_resource._post.return_value = {"status": "success", "data": {"Name": "X", "Prompt": "Y"}}
+        scorers_resource.create(name="X", description="D", model_id="m-1", prompt="Y")
+
+        expected_body = {"name": "X", "description": "D", "model_id": "m-1", "prompt": "Y"}
+        scorers_resource._post.assert_called_once_with(
+            "/organizations/org-123/projects/proj-456/scorers",
+            body=expected_body,
+            timeout=DEFAULT_TIMEOUT,
+            cast_to=dict,
+        )
+
+    def test_delete_success(self, scorers_resource):
+        """delete() reports True when the underlying call returns normally."""
+        scorers_resource._delete.return_value = {}
+
+        deleted = scorers_resource.delete("s-123")
+
+        assert deleted is True
+
+    def test_delete_failure(self, scorers_resource):
+        """delete() reports False when the underlying call raises."""
+        scorers_resource._delete.side_effect = Exception("not found")
+
+        deleted = scorers_resource.delete("s-123")
+
+        assert deleted is False
+
+    def test_update_sends_patch(self, scorers_resource):
+        """update() issues one PATCH whose body holds only the supplied field."""
+        scorers_resource._patch.return_value = {}
+
+        updated = scorers_resource.update("s-123", name="Updated")
+
+        assert updated is True
+        scorers_resource._patch.assert_called_once()
+        patched_body = scorers_resource._patch.call_args.kwargs["body"]
+        assert patched_body == {"name": "Updated"}