diff --git a/README.md b/README.md index 2b9871a..536e80b 100644 --- a/README.md +++ b/README.md @@ -248,15 +248,62 @@ Catch the most specific exception first. The hierarchy: Note: Only `StratixError`, `APIError`, `BadRequestError`, `AuthenticationError`, and `NotFoundError` are exported from the top-level package. For other exception types, import from `layerlens._exceptions`. +## CLI + +The LayerLens CLI lets you manage traces, judges, evaluations, integrations, and more from the terminal. + +### Install + +```bash +pip install "layerlens[cli]" --extra-index-url https://sdk.layerlens.ai/package +``` + +### Configure + +```bash +export LAYERLENS_STRATIX_API_KEY="your-api-key" +``` + +### Usage + +```bash +stratix --help # Show all commands +stratix trace list # List traces +stratix evaluate run \ + --model openai/gpt-4o \ + --benchmark arc-agi-2 --wait # Run an evaluation and wait for results +stratix judge create \ + --name "Quality" \ + --goal "Rate response quality" \ + --model-id <model-id> # Create a judge +stratix ci report -o summary.md # Generate CI report +``` + +Shell completions are available for bash, zsh, fish, and powershell: + +```bash +stratix completion bash # Print setup instructions +``` + +Full CLI docs: [docs/cli/](docs/cli/) + +| Guide | Description | +| --- | --- | +| [Getting Started](docs/cli/getting-started.md) | Installation, configuration, first commands | +| [Command Reference](docs/cli/commands.md) | All commands and options | +| [Examples](docs/cli/examples.md) | 15 common workflows as copy-paste shell sessions | + ## Requirements - Python 3.8+ - Dependencies: `httpx`, `pydantic`, `requests` +- CLI extra: `click>=8.0.0` ## Documentation Full API reference and examples are available in the [docs/](docs/) directory: +- [CLI Guide](docs/cli/) (getting started, command reference, workflow examples) - [API Reference](docs/api-reference/) (client config, all resource methods, error handling) - [Code Examples](docs/examples/) (evaluations, judges, 
traces) - [Troubleshooting](docs/troubleshooting/) (auth issues, error codes) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 619c641..7d2c039 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -15,11 +15,17 @@ * [Results](api-reference/results.md) * [Models & Benchmarks](api-reference/models-benchmarks.md) * [Judges](api-reference/judges.md) + * [Scorers](api-reference/scorers.md) * [Traces](api-reference/traces.md) * [Trace Evaluations](api-reference/trace-evaluations.md) * [Judge Optimizations](api-reference/judge-optimizations.md) * [Error Handling](api-reference/errors.md) +## CLI +* [Getting Started](cli/getting-started.md) +* [Command Reference](cli/commands.md) +* [Workflow Examples](cli/examples.md) + ## Code Examples * [Overview](examples/README.md) * [Creating Evaluations](examples/creating-evaluations.md) diff --git a/docs/api-reference/evaluations.md b/docs/api-reference/evaluations.md index a9039e0..5f4d59e 100644 --- a/docs/api-reference/evaluations.md +++ b/docs/api-reference/evaluations.md @@ -177,22 +177,23 @@ async def get_evaluation(): asyncio.run(get_evaluation()) ``` -### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, timeout=None)` +### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, unique=False, timeout=None)` Retrieves multiple evaluations with optional pagination, sorting, and filtering. 
#### Parameters -| Parameter | Type | Required | Description | -| --------------- | -------------------------------- | -------- | ------------------------------------------------------- | -| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | -| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | -| `sort_by` | `str \| None` | No | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` | -| `order` | `str \| None` | No | Sort order: `asc` or `desc` | -| `model_ids` | `List[str] \| None` | No | Filter by model IDs | -| `benchmark_ids` | `List[str] \| None` | No | Filter by benchmark/dataset IDs | -| `status` | `EvaluationStatus \| None` | No | Filter by evaluation status | -| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | +| Parameter | Type | Required | Description | +| --------------- | -------------------------------- | -------- | ----------------------------------------------------------------------------------- | +| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | +| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | +| `sort_by` | `str \| None` | No | Sort by field: `submitted_at`, `accuracy`, or `average_duration` | +| `order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `model_ids` | `List[str] \| None` | No | Filter by model IDs | +| `benchmark_ids` | `List[str] \| None` | No | Filter by benchmark/dataset IDs | +| `status` | `EvaluationStatus \| None` | No | Filter by evaluation status | +| `unique` | `bool` | No | If `True`, deduplicate by model+benchmark pair, keeping only the latest evaluation | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | #### Returns @@ -222,6 +223,13 @@ response = client.evaluations.get_many( if response: for evaluation in response.evaluations: print(f"{evaluation.id}: 
accuracy={evaluation.accuracy:.2f}%") + +# Get only the latest evaluation per model+benchmark pair +response = client.evaluations.get_many( + unique=True, + sort_by="accuracy", + order="desc", +) ``` ### `get_results(page=None, page_size=None, timeout=None)` diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md index 550190c..bef109a 100644 --- a/docs/api-reference/models-benchmarks.md +++ b/docs/api-reference/models-benchmarks.md @@ -35,20 +35,24 @@ benchmarks = client.benchmarks.get() ## Models -### `get(type=None, name=None, companies=None, regions=None, licenses=None, timeout=None)` +### `get(type=None, name=None, key=None, categories=None, companies=None, regions=None, licenses=None, timeout=None)` Retrieves a list of available models with optional filtering parameters. Both the `Stratix` and `AsyncStratix` clients have this method. #### Parameters -| Parameter | Type | Required | Description | -| ----------- | ------------------------------------- | -------- | ---------------------------------------------------------------------- | -| `type` | `Literal["custom", "public"] \| None` | No | Filter by model type. If `None`, returns both custom and public models | -| `name` | `str \| None` | No | Filter models by name (partial match search) | -| `companies` | `List[str] \| None` | No | Filter by model companies/providers | -| `regions` | `List[str] \| None` | No | Filter by supported regions | -| `licenses` | `List[str] \| None` | No | Filter by license types | -| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | +| Parameter | Type | Required | Description | +| ------------ | ------------------------------------- | -------- | ---------------------------------------------------------------------------------------------- | +| `type` | `Literal["custom", "public"] \| None` | No | Filter by model type. 
If `None`, returns both custom and public models | +| `name` | `str \| None` | No | Filter models by name (partial match search) | +| `key` | `str \| None` | No | Filter models by key (partial match search) | +| `categories` | `List[str] \| None` | No | Filter by categories: `Transformer`, `MoE`, `Open-Source`, `Closed-Source` | +| `companies` | `List[str] \| None` | No | Filter by model companies/providers | +| `regions` | `List[str] \| None` | No | Filter by supported regions | +| `licenses` | `List[str] \| None` | No | Filter by license types | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +> **Note:** When filtering by `categories`, `companies`, `regions`, or `licenses`, only public models are returned since custom models do not have these fields. #### Returns @@ -185,17 +189,22 @@ if result: ## Benchmarks -### `get(type=None, name=None, timeout=None)` +### `get(type=None, name=None, key=None, categories=None, languages=None, timeout=None)` Retrieves a list of available benchmarks with optional filtering parameters. Both the `Stratix` and `AsyncStratix` clients have this method. #### Parameters -| Parameter | Type | Required | Description | -| --------- | ------------------------------------- | -------- | ------------------------------------------------------------------------------ | -| `type` | `Literal["custom", "public"] \| None` | No | Filter by benchmark type. If `None`, returns both custom and public benchmarks | -| `name` | `str \| None` | No | Filter benchmarks by name (partial match search) | -| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | +| Parameter | Type | Required | Description | +| ------------ | ------------------------------------- | -------- | ------------------------------------------------------------------------------ | +| `type` | `Literal["custom", "public"] \| None` | No | Filter by benchmark type. 
If `None`, returns both custom and public benchmarks | +| `name` | `str \| None` | No | Filter benchmarks by name (partial match search) | +| `key` | `str \| None` | No | Filter benchmarks by key (partial match search) | +| `categories` | `List[str] \| None` | No | Filter by categories (e.g., `reasoning`, `knowledge`, `coding`) | +| `languages` | `List[str] \| None` | No | Filter by language (e.g., `english`, `french`) | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +> **Note:** When filtering by `categories` or `languages`, only public benchmarks are returned since custom benchmarks do not have these fields. #### Returns diff --git a/docs/api-reference/public-client.md b/docs/api-reference/public-client.md index 31afc82..70a209a 100644 --- a/docs/api-reference/public-client.md +++ b/docs/api-reference/public-client.md @@ -286,16 +286,17 @@ Retrieves evaluations with optional pagination, sorting, and filtering. #### Parameters -| Parameter | Type | Required | Description | -| ----------------- | -------------------------------- | -------- | ------------------------------------------------------------------ | -| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | -| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | -| `sort_by` | `str \| None` | No | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` | -| `order` | `str \| None` | No | Sort order: `asc` or `desc` | -| `model_ids` | `List[str] \| None` | No | Filter by model IDs | -| `benchmark_ids` | `List[str] \| None` | No | Filter by benchmark/dataset IDs | -| `status` | `EvaluationStatus \| None` | No | Filter by evaluation status | -| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | +| Parameter | Type | Required | Description | +| ----------------- | -------------------------------- | -------- | 
---------------------------------------------------------------------------------- | +| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | +| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | +| `sort_by` | `str \| None` | No | Sort by field: `submitted_at`, `accuracy`, or `average_duration` | +| `order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `model_ids` | `List[str] \| None` | No | Filter by model IDs | +| `benchmark_ids` | `List[str] \| None` | No | Filter by benchmark/dataset IDs | +| `status` | `EvaluationStatus \| None` | No | Filter by evaluation status | +| `unique` | `bool` | No | If `True`, deduplicate by model+benchmark pair, keeping only the latest evaluation | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | #### Returns diff --git a/docs/api-reference/scorers.md b/docs/api-reference/scorers.md new file mode 100644 index 0000000..652a815 --- /dev/null +++ b/docs/api-reference/scorers.md @@ -0,0 +1,176 @@ +# Scorers + +The `scorers` resource on the Stratix client allows you to create and manage custom scorers for evaluating benchmark results. Scorers use an LLM model to evaluate model outputs using a custom prompt. + +## Overview + +A scorer defines a custom evaluation criterion backed by a specific LLM model and a prompt template. Custom scorers can be attached to custom benchmarks to provide additional scoring beyond the built-in metrics. 
+ +### Using Synchronous Client + +```python +from layerlens import Stratix + +client = Stratix() + +# Fetch a model to use for the scorer +models = client.models.get(type="public", name="gpt-4o") +model = models[0] + +# Create a scorer +scorer = client.scorers.create( + name="Helpfulness Scorer", + description="Evaluates how helpful the response is", + model_id=model.id, + prompt="Rate the helpfulness of the following response on a scale of 0 to 1.", +) + +if scorer: + print(f"Created scorer: {scorer.name} (id={scorer.id})") + +# List all scorers +response = client.scorers.get_many() +if response: + for s in response.scorers: + print(f" {s.name}: {s.description}") +``` + +### Using Async Client + +```python +import asyncio +from layerlens import AsyncStratix + +async def main(): + client = AsyncStratix() + + scorer = await client.scorers.create( + name="Helpfulness Scorer", + description="Evaluates how helpful the response is", + model_id="model-abc123", + prompt="Rate the helpfulness of the following response on a scale of 0 to 1.", + ) + + if scorer: + print(f"Created scorer: {scorer.name}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Methods + +Both the `Stratix` (synchronous) and `AsyncStratix` (asynchronous) clients support the following methods. + +### `create(name, description, model_id, prompt, timeout=None)` + +Creates a new custom scorer. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| ------------- | -------------------------------- | -------- | -------------------------------------------------- | +| `name` | `str` | Yes | Display name for the scorer | +| `description` | `str` | Yes | Description of what the scorer evaluates | +| `model_id` | `str` | Yes | ID of the LLM model to use for scoring | +| `prompt` | `str` | Yes | Prompt template used to evaluate model outputs | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `Scorer` object if successful, `None` if the scorer could not be created. + +### `get(id, timeout=None)` + +Retrieves a scorer by its unique identifier. + +#### Parameters + +| Parameter | Type | Required | Description | +| --------- | -------------------------------- | -------- | ------------------------ | +| `id` | `str` | Yes | The unique scorer ID | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `Scorer` object if found, `None` otherwise. + +### `get_many(page=None, page_size=None, timeout=None)` + +Retrieves multiple scorers with pagination. + +#### Parameters + +| Parameter | Type | Required | Description | +| ----------- | -------------------------------- | -------- | ---------------------------------------------------- | +| `page` | `int \| None` | No | Page number (1-based, defaults to 1) | +| `page_size` | `int \| None` | No | Number of scorers per page (default: 100, max: 500) | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `ScorersResponse` object containing: + +- `scorers`: List of `Scorer` objects +- `count`: Number of scorers in this page +- `total_count`: Total number of scorers + +Returns `None` if the request fails. + +### `update(id, name=None, description=None, model_id=None, prompt=None, timeout=None)` + +Updates an existing scorer. 
Only provided fields are modified; omitted fields remain unchanged. + +#### Parameters + +| Parameter | Type | Required | Description | +| ------------- | -------------------------------- | -------- | -------------------------------------- | +| `id` | `str` | Yes | The unique scorer ID | +| `name` | `str \| None` | No | Updated display name | +| `description` | `str \| None` | No | Updated description | +| `model_id` | `str \| None` | No | Updated model ID | +| `prompt` | `str \| None` | No | Updated prompt template | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `True` if the update succeeded, `False` otherwise. + +### `delete(id, timeout=None)` + +Deletes a scorer by its unique identifier. + +#### Parameters + +| Parameter | Type | Required | Description | +| --------- | -------------------------------- | -------- | ------------------------ | +| `id` | `str` | Yes | The unique scorer ID | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `True` if the scorer was deleted, `False` otherwise. 
+ +## Response Objects + +### Scorer Object Properties + +| Property | Type | Description | +| ----------------- | ------------- | ------------------------------------ | +| `id` | `str` | Unique scorer identifier | +| `organization_id` | `str` | Organization the scorer belongs to | +| `project_id` | `str` | Project the scorer belongs to | +| `name` | `str` | Display name | +| `description` | `str \| None` | Description of what it evaluates | +| `model_id` | `str \| None` | ID of the backing LLM model | +| `model_name` | `str \| None` | Name of the backing LLM model | +| `model_key` | `str \| None` | Key of the backing LLM model | +| `model_company` | `str \| None` | Company that provides the model | +| `prompt` | `str \| None` | Prompt template for scoring | +| `created_at` | `str \| None` | ISO 8601 creation timestamp | +| `updated_at` | `str \| None` | ISO 8601 last update timestamp | + +## Next Steps + +- Learn about [Benchmarks](models-benchmarks.md) to attach custom scorers to custom benchmarks +- Learn about [Judges](judges.md) for evaluating traces diff --git a/docs/cli/commands.md b/docs/cli/commands.md new file mode 100644 index 0000000..30e8aee --- /dev/null +++ b/docs/cli/commands.md @@ -0,0 +1,423 @@ +# CLI — Command Reference + +Complete reference for all `stratix` CLI commands. 
+ +## Command tree + +``` +stratix [global-options] +├── trace +│ ├── list List traces +│ ├── get Get a trace by ID +│ ├── search Search traces +│ ├── export Export a trace as JSON +│ └── delete Delete a trace +├── judge +│ ├── list List judges +│ ├── get Get a judge by ID +│ ├── create Create a new judge +│ └── test Test a judge against a trace +├── evaluate +│ ├── list List evaluations +│ ├── get Get an evaluation by ID +│ └── run Run a new evaluation +├── integration +│ ├── list List integrations +│ └── test Test an integration +├── scorer +│ ├── list List scorers +│ ├── get Get a scorer by ID +│ ├── create Create a new scorer +│ └── delete Delete a scorer +├── space +│ ├── list List evaluation spaces +│ ├── get Get a space by ID or slug +│ ├── create Create a new space +│ └── delete Delete a space +├── bulk +│ └── eval Run evaluations in bulk +├── ci +│ └── report Generate a CI summary report +└── completion Print shell completion setup +``` + +--- + +## Global options + +These options are available on every command: + +``` +--api-key TEXT API key (env: LAYERLENS_STRATIX_API_KEY) +--host TEXT API host +--port INTEGER API port +--format [table|json] Output format (default: table) +--verbose, -v Enable debug output +--version Show version and exit +--help Show help and exit +``` + +--- + +## trace + +Manage traces. + +### `trace list` + +List traces with optional filtering and pagination. + +```bash +stratix trace list [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | +| `--source` | text | Filter by source | +| `--status` | text | Filter by status | +| `--sort-by` | text | Sort field | +| `--sort-order` | asc/desc | Sort order | + +### `trace get` + +Get a single trace by ID. + +```bash +stratix trace get <trace-id> +``` + +### `trace search` + +Search traces by query string. 
+ +```bash +stratix trace search <query> [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | +| `--source` | text | Filter by source | +| `--status` | text | Filter by status | +| `--sort-by` | text | Sort field | +| `--sort-order` | asc/desc | Sort order | + +### `trace export` + +Export a trace as JSON. + +```bash +stratix trace export <trace-id> [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--output`, `-o` | path | Output file (default: stdout) | + +### `trace delete` + +Delete a trace by ID. + +```bash +stratix trace delete <trace-id> [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--yes`, `-y` | flag | Skip confirmation prompt | + +--- + +## judge + +Manage judges. + +### `judge list` + +```bash +stratix judge list [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | + +### `judge get` + +```bash +stratix judge get <judge-id> +``` + +### `judge create` + +Create a new judge. + +```bash +stratix judge create [OPTIONS] +``` + +| Option | Type | Required | Description | +| --- | --- | --- | --- | +| `--name` | text | yes | Judge name | +| `--goal` | text | yes | Evaluation goal description | +| `--model-id` | text | no | Model ID for the judge | + +### `judge test` + +Test a judge by running it against a trace. Creates a trace evaluation. + +```bash +stratix judge test [OPTIONS] +``` + +| Option | Type | Required | Description | +| --- | --- | --- | --- | +| `--judge-id` | text | yes | Judge ID to test | +| `--trace-id` | text | yes | Trace ID to evaluate | + +--- + +## evaluate + +Manage evaluations. 
+ +### `evaluate list` + +```bash +stratix evaluate list [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | +| `--status` | text | Filter: pending, in-progress, success, failure | +| `--sort-by` | submitted_at/accuracy/average_duration | Sort field | +| `--order` | asc/desc | Sort order | + +### `evaluate get` + +```bash +stratix evaluate get <evaluation-id> +``` + +### `evaluate run` + +Run a new evaluation. Accepts model/benchmark by ID, key, or name. + +```bash +stratix evaluate run [OPTIONS] +``` + +| Option | Type | Required | Description | +| --- | --- | --- | --- | +| `--model` | text | yes | Model ID, key, or name | +| `--benchmark` | text | yes | Benchmark ID, key, or name | +| `--wait` | flag | no | Wait for evaluation to complete | + +--- + +## integration + +Manage integrations. + +### `integration list` + +```bash +stratix integration list [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | + +### `integration test` + +Test an integration by ID. + +```bash +stratix integration test <integration-id> +``` + +--- + +## scorer + +Manage scorers. 
+ +### `scorer list` + +```bash +stratix scorer list [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | + +### `scorer get` + +```bash +stratix scorer get <scorer-id> +``` + +### `scorer create` + +```bash +stratix scorer create [OPTIONS] +``` + +| Option | Type | Required | Description | +| --- | --- | --- | --- | +| `--name` | text | yes | Name (3–64 characters) | +| `--description` | text | yes | Description (10–500 characters) | +| `--model-id` | text | yes | Model ID for scoring | +| `--prompt` | text | yes | Scoring prompt | +| `--dry-run` | flag | no | Preview without executing | + +### `scorer delete` + +```bash +stratix scorer delete <scorer-id> [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--yes`, `-y` | flag | Skip confirmation prompt | +| `--dry-run` | flag | Preview without executing | + +--- + +## space + +Manage evaluation spaces. + +### `space list` + +```bash +stratix space list [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--page` | int | Page number | +| `--page-size` | int | Results per page | +| `--sort-by` | text | Sort field (e.g. weight, created_at) | +| `--order` | asc/desc | Sort order | + +### `space get` + +```bash +stratix space get <id-or-slug> +``` + +Accepts an ID or slug. + +### `space create` + +```bash +stratix space create [OPTIONS] +``` + +| Option | Type | Required | Description | +| --- | --- | --- | --- | +| `--name` | text | yes | Space name | +| `--description` | text | no | Description (max 500 characters) | +| `--visibility` | private/public/tenant | no | Visibility level | +| `--dry-run` | flag | no | Preview without executing | + +### `space delete` + +```bash +stratix space delete <space-id> [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--yes`, `-y` | flag | Skip confirmation prompt | +| `--dry-run` | flag | Preview without executing | + +--- + +## bulk + +Bulk operations. 
+ +### `bulk eval` + +Run evaluations in bulk. Supports three modes: + +**Mode 1: JSONL file** + +```bash +stratix bulk eval --file jobs.jsonl [OPTIONS] +``` + +Each line in the JSONL file is a JSON object with `model` and `benchmark` fields: + +```json +{"model": "openai/gpt-4o", "benchmark": "arc-agi-2"} +{"model": "anthropic/claude-3-opus", "benchmark": "arc-agi-2"} +``` + +**Mode 2: Single model + benchmark** + +```bash +stratix bulk eval --model openai/gpt-4o --benchmark arc-agi-2 --wait [OPTIONS] +``` + +**Mode 3: Judge + trace IDs** + +```bash +stratix bulk eval --judge-id <judge-id> --traces trace_ids.txt [OPTIONS] +``` + +The traces file contains one trace ID per line. + +| Option | Type | Description | +| --- | --- | --- | +| `--file` | path | JSONL file with evaluation jobs | +| `--model` | text | Model ID/name (use with --benchmark) | +| `--benchmark` | text | Benchmark ID/name (use with --model) | +| `--judge-id` | text | Judge ID (use with --traces) | +| `--traces` | path | File with trace IDs (one per line) | +| `--dry-run` | flag | Preview without executing | +| `--wait` | flag | Wait for all evaluations to complete | + +--- + +## ci + +CI/CD pipeline helpers. + +### `ci report` + +Generate a markdown summary of recent evaluations, suitable for GitHub Actions job summaries. + +```bash +stratix ci report [OPTIONS] +``` + +| Option | Type | Description | +| --- | --- | --- | +| `--output`, `-o` | path | Output file (default: stdout) | +| `--limit` | int | Number of evaluations to include (default: 10) | +| `--dry-run` | flag | Preview without fetching data | + +--- + +## completion + +Print shell completion setup instructions. + +```bash +stratix completion <SHELL> +``` + +Where `SHELL` is one of: `bash`, `zsh`, `fish`, `powershell`. 
diff --git a/docs/cli/examples.md b/docs/cli/examples.md new file mode 100644 index 0000000..2d0ba21 --- /dev/null +++ b/docs/cli/examples.md @@ -0,0 +1,291 @@ +# CLI — Workflow Examples + +Fifteen copy-paste workflows covering the most common CLI tasks. + +--- + +## 1. Quick start: first trace to evaluation + +Set up, list traces, inspect one, and evaluate it with a judge. + +```bash +# Configure +export LAYERLENS_STRATIX_API_KEY="sk-..." + +# See what traces are available +stratix trace list + +# Inspect a specific trace +stratix trace get <trace-id> + +# Create a judge and test it against the trace +stratix judge create --name "Accuracy" --goal "Rate factual accuracy of the response" --model-id <model-id> +stratix judge test --judge-id <judge-id> --trace-id <trace-id> +``` + +--- + +## 2. Run an evaluation and wait for results + +```bash +# Run and block until done +stratix evaluate run \ + --model openai/gpt-4o \ + --benchmark arc-agi-2 \ + --wait + +# Or fire and check later +stratix evaluate run --model openai/gpt-4o --benchmark arc-agi-2 +stratix evaluate list --status in-progress +stratix evaluate get <evaluation-id> +``` + +--- + +## 3. Compare models on the same benchmark + +```bash +stratix evaluate run --model openai/gpt-4o --benchmark arc-agi-2 --wait +stratix evaluate run --model anthropic/claude-3-opus --benchmark arc-agi-2 --wait + +# List results sorted by accuracy +stratix evaluate list --sort-by accuracy --order desc +``` + +--- + +## 4. Judge workflow: create, test, tune, apply + +```bash +# Create a judge +stratix judge create \ + --name "Helpfulness" \ + --goal "Rate how helpful and actionable the response is on a 1-5 scale" \ + --model-id <model-id> + +# Test against a sample trace +stratix judge test --judge-id <judge-id> --trace-id <trace-id> + +# Review the result +stratix --format json judge get <judge-id> + +# Iterate: create a refined version +stratix judge create \ + --name "Helpfulness v2" \ + --goal "Rate helpfulness on a 1-5 scale with justification" \ + --model-id <model-id> +``` + +--- + +## 5. 
Trace search and export + +```bash +# Search for traces matching a keyword +stratix trace search "customer support" --page-size 5 + +# Export a trace to a file +stratix trace export <trace-id> -o trace_backup.json + +# Export as JSON for piping +stratix --format json trace get <trace-id> | jq '.id' +``` + +--- + +## 6. Bulk evaluation from a JSONL file + +```bash +# Create a jobs file +cat > jobs.jsonl <<'EOF' +{"model": "openai/gpt-4o", "benchmark": "arc-agi-2"} +{"model": "openai/gpt-4o-mini", "benchmark": "arc-agi-2"} +{"model": "anthropic/claude-3-opus", "benchmark": "arc-agi-2"} +EOF + +# Dry-run to preview +stratix bulk eval --file jobs.jsonl --dry-run + +# Execute and wait +stratix bulk eval --file jobs.jsonl --wait +``` + +--- + +## 7. Bulk trace evaluation with a judge + +```bash +# Create a trace ID file +stratix --format json trace list | jq -r '.[].id' > trace_ids.txt + +# Dry-run to preview +stratix bulk eval \ + --judge-id <judge-id> \ + --traces trace_ids.txt \ + --dry-run + +# Run trace evaluations for all traces +stratix bulk eval \ + --judge-id <judge-id> \ + --traces trace_ids.txt +``` + +--- + +## 8. CI/CD pipeline integration + +```bash +# In your GitHub Actions workflow: +stratix evaluate run \ + --model openai/gpt-4o \ + --benchmark arc-agi-2 \ + --wait + +# Generate a summary for the GitHub job +stratix ci report -o "$GITHUB_STEP_SUMMARY" + +# Or write the markdown report to a file for custom processing +stratix ci report -o report.md +``` + +--- + +## 9. Integration monitoring + +```bash +# List all integrations +stratix integration list + +# Test a specific integration +stratix integration test <integration-id> + +# JSON output for scripting +stratix --format json integration list | jq '.[] | select(.status != "active")' +``` + +--- + +## 10. 
Scorer management + +```bash +# List existing scorers +stratix scorer list + +# Create a scorer with dry-run +stratix scorer create \ + --name "Code Quality" \ + --description "Evaluates generated code for correctness, readability, and best practices" \ + --model-id <model-id> \ + --prompt "Score the following code on a 1-10 scale for quality..." \ + --dry-run + +# Create for real +stratix scorer create \ + --name "Code Quality" \ + --description "Evaluates generated code for correctness, readability, and best practices" \ + --model-id <model-id> \ + --prompt "Score the following code on a 1-10 scale for quality..." + +# Delete with confirmation +stratix scorer delete <scorer-id> + +# Delete without prompt +stratix scorer delete <scorer-id> -y +``` + +--- + +## 11. Evaluation spaces + +```bash +# List spaces +stratix space list + +# Create a private space +stratix space create \ + --name "Q1 Model Comparison" \ + --description "Comparing GPT-4o vs Claude 3 Opus for Q1 release" \ + --visibility private + +# Create a public space (dry-run first) +stratix space create \ + --name "Public Leaderboard" \ + --visibility public \ + --dry-run + +# Get space details by slug or ID +stratix space get q1-model-comparison + +# Clean up +stratix space delete <space-id> -y +``` + +--- + +## 12. JSON output and scripting + +```bash +# Pipe trace IDs into a loop +stratix --format json trace list | jq -r '.[].id' | while read -r id; do + echo "Exporting $id..." + stratix trace export "$id" -o "traces/${id}.json" +done + +# Get evaluation accuracy as a number +ACCURACY=$(stratix --format json evaluate get <evaluation-id> | jq -r '.accuracy') +echo "Accuracy: $ACCURACY" + +# Filter evaluations by status +stratix --format json evaluate list | jq '[.[] | select(.status == "success")]' +``` + +--- + +## 13. 
Pagination and sorting + +```bash +# Page through traces +stratix trace list --page 1 --page-size 20 +stratix trace list --page 2 --page-size 20 + +# Sort evaluations +stratix evaluate list --sort-by accuracy --order desc --page-size 5 +stratix evaluate list --sort-by submitted_at --order asc + +# Sort spaces +stratix space list --sort-by created_at --order desc +``` + +--- + +## 14. Verbose mode and debugging + +```bash +# Enable verbose output to see HTTP requests +stratix -v trace list + +# Combine with JSON output +stratix -v --format json evaluate get <evaluation-id> + +# Debug authentication issues +stratix -v integration list +``` + +--- + +## 15. Clean up resources + +```bash +# Delete a trace (with confirmation prompt) +stratix trace delete <trace-id> + +# Delete without prompting +stratix trace delete <trace-id> -y + +# Delete a scorer (dry-run first) +stratix scorer delete <scorer-id> --dry-run +stratix scorer delete <scorer-id> -y + +# Delete a space +stratix space delete <space-id> -y +``` diff --git a/docs/cli/getting-started.md b/docs/cli/getting-started.md new file mode 100644 index 0000000..1245eb7 --- /dev/null +++ b/docs/cli/getting-started.md @@ -0,0 +1,150 @@ +# CLI — Getting Started + +The LayerLens Stratix CLI provides terminal access to all platform features: traces, judges, evaluations, integrations, scorers, evaluation spaces, bulk operations, and CI/CD helpers. + +## Installation + +Install the SDK with the `cli` extra: + +```bash +pip install "layerlens[cli]" --extra-index-url https://sdk.layerlens.ai/package +``` + +If you already have `layerlens` installed, add the CLI extra: + +```bash +pip install "layerlens[cli]" --extra-index-url https://sdk.layerlens.ai/package +``` + +For local development from a cloned repo: + +```bash +pip install -e ".[cli]" +``` + +Verify the installation: + +```bash +stratix --version +``` + +## Configuration + +### API key + +The CLI requires a LayerLens Stratix API key. 
Set it as an environment variable (recommended): + +```bash +export LAYERLENS_STRATIX_API_KEY="your-api-key" +``` + +Or pass it per-command: + +```bash +stratix --api-key "your-api-key" trace list +``` + +### Custom host + +By default the CLI talks to `api.layerlens.ai`. Override with: + +```bash +stratix --host my-instance.example.com trace list +stratix --host my-instance.example.com --port 8443 trace list +``` + +## Global options + +Every command accepts these options: + +| Option | Description | +| --- | --- | +| `--api-key` | API key (or set `LAYERLENS_STRATIX_API_KEY`) | +| `--host` | API host | +| `--port` | API port | +| `--format` | Output format: `table` (default) or `json` | +| `--verbose` / `-v` | Enable debug output | +| `--version` | Print version and exit | + +## Output formats + +The default output is a human-readable table: + +```bash +stratix trace list +``` + +``` +ID Created Filename Evaluations +─────────────────────────────────────────────────────────────────────────────────────── +a1b2c3d4-... 2026-03-15 14:30 traces.jsonl 3 +e5f6a7b8-... 2026-03-14 09:12 batch_02.json 1 +``` + +Switch to JSON for scripting: + +```bash +stratix --format json trace list +``` + +```json +[ + { + "id": "a1b2c3d4-...", + "created_at": "2026-03-15T14:30:00Z", + "filename": "traces.jsonl", + ... + } +] +``` + +## Shell completions + +The CLI supports tab-completion for commands, options, and resource IDs. + +```bash +# Print setup instructions for your shell +stratix completion bash +stratix completion zsh +stratix completion fish +stratix completion powershell +``` + +Follow the printed instructions to enable completions. After setup, you can tab-complete trace IDs, judge IDs, model names, and more. 
+ +## First commands + +### List your traces + +```bash +stratix trace list +``` + +### Run an evaluation + +```bash +stratix evaluate run --model openai/gpt-4o --benchmark arc-agi-2 --wait +``` + +### Create a judge + +```bash +stratix judge create --name "Response Quality" --goal "Rate accuracy and completeness" --model-id <model-id> +``` + +### Check integrations + +```bash +stratix integration list +``` + +### Generate a CI report + +```bash +stratix ci report -o summary.md +``` + +## Next steps + +- [Command Reference](commands.md) — all commands and their options +- [Examples](examples.md) — 15 common workflows as copy-paste shell sessions diff --git a/docs/examples/models-and-benchmarks.md b/docs/examples/models-and-benchmarks.md index fc21c68..573eb00 100644 --- a/docs/examples/models-and-benchmarks.md +++ b/docs/examples/models-and-benchmarks.md @@ -30,6 +30,20 @@ async def main(): models = await client.models.get(regions=region_names) print(f"Found {len(models)} models with regions {region_names}") + # --- Filter by categories + categories = ["Open-Source"] + models = await client.models.get(categories=categories) + print(f"Found {len(models)} open-source models") + + # --- Filter by key + models = await client.models.get(key="gpt-4") + print(f"Found {len(models)} models matching key 'gpt-4'") + + # --- Filter by license + licenses = ["apache-2.0"] + models = await client.models.get(licenses=licenses) + print(f"Found {len(models)} models with license {licenses}") + # --- Filter by type model_type = "public" models = await client.models.get(type=model_type) @@ -58,6 +72,20 @@ async def main(): benchmarks = await client.benchmarks.get(name=benchmark_name) print(f"Found {len(benchmarks)} benchmarks with name {benchmark_name}") + # --- Filter by categories + categories = ["reasoning"] + benchmarks = await client.benchmarks.get(categories=categories) + print(f"Found {len(benchmarks)} benchmarks with categories {categories}") + + # --- Filter by language + languages = 
#!/usr/bin/env bash
# Quick start: list traces, inspect the first one, create a judge, and test
# that judge against the trace.
#
# Usage: ./01_quickstart.sh [MODEL_ID]
#   MODEL_ID  Model used to back the judge. Defaults to the hard-coded ID
#             assigned below; pass your own if that model is not available.
set -euo pipefail

MODEL_ID="${1:-67e1fe69e014f9fa6e50d7be}"

# 1. List available traces
echo "==> Listing traces..."
stratix trace list --page-size 5

# 2. Get the first trace ID. The helper prints an empty line when the list is
#    empty so the script can report that instead of dying on an IndexError.
TRACE_ID=$(stratix --format json trace list --page-size 1 \
  | python3 -c "import sys,json; rows = json.load(sys.stdin); print(rows[0]['id'] if rows else '')")
if [ -z "$TRACE_ID" ]; then
  echo "Error: no traces found; nothing to inspect or test against." >&2
  exit 1
fi
echo "==> First trace: $TRACE_ID"

# 3. Inspect it
stratix trace get "$TRACE_ID"

# 4. Create a judge (capture the ID from the "Judge created: <id>" line).
#    awk exits 0 even when nothing matches, so a missing line is reported
#    below rather than silently aborting the script via pipefail.
echo "==> Creating judge..."
JUDGE_ID=$(stratix judge create \
  --name "Quick Start Judge $(date +%s)" \
  --goal "Rate whether the response is accurate and helpful" \
  --model-id "$MODEL_ID" \
  | awk '/^Judge created:/ {print $NF}')
if [ -z "$JUDGE_ID" ]; then
  echo "Error: could not find a 'Judge created:' line in the CLI output." >&2
  exit 1
fi
echo "==> Created judge: $JUDGE_ID"

# 5. Test the judge against the trace
echo "==> Testing judge against trace..."
stratix judge test --judge-id "$JUDGE_ID" --trace-id "$TRACE_ID"
#!/usr/bin/env bash
# Bulk evaluation from a JSONL file
#
# Usage: ./04_bulk_eval.sh [JOBS_FILE]
#   JOBS_FILE  JSONL file with one {"model": ..., "benchmark": ...} object per
#              line. When omitted, a two-job sample file is generated in a
#              temporary directory that is removed when the script exits.
set -euo pipefail

JOBS_FILE="${1:-}"

# Create a sample jobs file if none provided
if [ -z "$JOBS_FILE" ]; then
  # Only trailing X's are randomised by BSD/macOS mktemp, so make a temporary
  # directory and keep the .jsonl extension on a file inside it.
  TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/layerlens-jobs-XXXXXX")
  trap 'rm -rf "$TMP_DIR"' EXIT
  JOBS_FILE="$TMP_DIR/jobs.jsonl"
  cat > "$JOBS_FILE" <<'EOF'
{"model": "openai/gpt-4o", "benchmark": "arc-agi-2"}
{"model": "openai/gpt-4o-mini", "benchmark": "arc-agi-2"}
EOF
  echo "==> Created sample jobs file: $JOBS_FILE"
fi

# Dry-run first
echo "==> Dry-run:"
stratix bulk eval --file "$JOBS_FILE" --dry-run

echo ""
# -r keeps backslashes literal; the fallback treats EOF (no terminal attached)
# as "no" instead of letting `set -e` abort the script.
read -r -p "Proceed? [y/N] " confirm || confirm=""
if [[ "$confirm" =~ ^[Yy]$ ]]; then
  stratix bulk eval --file "$JOBS_FILE" --wait
fi
#!/usr/bin/env bash
# Write a CI evaluation report (for GitHub Actions) and print it.
#
# Usage: ./06_ci_report.sh [OUTPUT]
#   OUTPUT  Destination file for the report (default: summary.md).
set -euo pipefail

report_path="${1:-summary.md}"

# Generate the report, then echo it back so it also shows up in the job log.
main() {
  printf '%s\n' "==> Generating CI report..."
  stratix ci report --limit 10 -o "$report_path"

  printf '%s\n' "==> Report written to $report_path"
  cat "$report_path"
}

main
#!/usr/bin/env bash
# Evaluation spaces: create, list, inspect, delete
set -euo pipefail

# Create a private space (capture ID from output).
# grep -E is used instead of -P because BSD/macOS grep has no PCRE mode.
# `|| true` stops pipefail from aborting silently when no ID is printed; the
# explicit check below reports the problem instead.
echo "==> Creating evaluation space..."
SPACE_ID=$(stratix space create \
  --name "CLI Demo Space" \
  --description "Temporary space for CLI examples" \
  --visibility private \
  | grep -oE '[a-f0-9]{24}' | head -1) || true
if [ -z "$SPACE_ID" ]; then
  echo "Error: could not find a space ID in the 'space create' output." >&2
  exit 1
fi
echo "==> Created space: $SPACE_ID"

# List spaces
echo ""
echo "==> All spaces:"
stratix space list

# Get details
echo ""
echo "==> Space details:"
stratix space get "$SPACE_ID"

# Clean up
echo ""
echo "==> Deleting space..."
stratix space delete "$SPACE_ID" -y
echo "==> Done."
#!/usr/bin/env bash
# Compare multiple models on the same benchmark
#
# Usage: ./10_compare_models.sh [BENCHMARK] [MODEL1] [MODEL2] ...
#   BENCHMARK  Benchmark to run (default: arc-agi-2).
#   MODELn     Models to evaluate (default: openai/gpt-4o openai/gpt-4o-mini).
#
# Exits non-zero if any evaluation fails.
set -euo pipefail

BENCHMARK="${1:-arc-agi-2}"
# Drop the benchmark argument only when one was actually given.
if [ $# -gt 0 ]; then
  shift
fi

if [ $# -eq 0 ]; then
  MODELS=("openai/gpt-4o" "openai/gpt-4o-mini")
else
  MODELS=("$@")
fi

echo "==> Comparing ${#MODELS[@]} models on $BENCHMARK"

# Launch every evaluation in the background and remember its PID so each exit
# status can be collected; a bare `wait` would discard failures.
PIDS=()
for model in "${MODELS[@]}"; do
  echo " Running: $model"
  stratix evaluate run --model "$model" --benchmark "$BENCHMARK" --wait &
  PIDS+=("$!")
done

# Wait for all background evaluations
FAILED=0
for i in "${!PIDS[@]}"; do
  if ! wait "${PIDS[$i]}"; then
    echo " FAILED: ${MODELS[$i]}" >&2
    FAILED=$((FAILED + 1))
  fi
done

echo ""
echo "==> Results (sorted by accuracy):"
stratix evaluate list --sort-by accuracy --order desc --page-size 10

if [ "$FAILED" -gt 0 ]; then
  echo "==> $FAILED evaluation(s) failed." >&2
  exit 1
fi
client.integrations.get(integration_id) + if integration: + print(f"\nIntegration detail:") + print(f" ID: {integration.id}") + print(f" Name: {integration.name}") + print(f" Type: {integration.type}") + print(f" Status: {integration.status}") + print(f" Config: {integration.config}") + + # --- Test an integration + result = client.integrations.test(integration_id) + if result: + status = "OK" if result.success else "FAILED" + print(f"\nTest result: {status}") + if result.message: + print(f" Message: {result.message}") diff --git a/mypy.ini b/mypy.ini index a5788f9..803803c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -21,3 +21,12 @@ disallow_subclassing_any = True disallow_incomplete_defs = True disallow_untyped_decorators = True cache_fine_grained = True + +[mypy-click] +ignore_missing_imports = True + +[mypy-layerlens.cli.*] +ignore_missing_imports = True +disallow_untyped_decorators = False +disallow_untyped_defs = False +disallow_any_generics = False diff --git a/pyproject.toml b/pyproject.toml index 783372f..fc8baa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,12 +31,16 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] +[project.optional-dependencies] +cli = ["click>=8.0.0"] + [project.urls] Homepage = "https://github.com/LayerLens/stratix-python" Repository = "https://github.com/LayerLens/stratix-python" [project.scripts] layerlens = "layerlens.cli:main" +stratix = "layerlens.cli:main" [tool.rye] managed = true @@ -49,6 +53,7 @@ dev-dependencies = [ "ruff", "build", "twine==6.1.0", + "click>=8.0.0", ] [tool.rye.scripts] @@ -132,14 +137,15 @@ known-first-party = ["openai", "tests"] "scripts/**.py" = ["T201", "T203"] "tests/**.py" = ["T201", "T203"] "examples/**.py" = ["T201", "T203"] -"src/layerlens/cli.py" = ["T201", "T203"] +"src/layerlens/cli/**" = ["T201", "T203"] [tool.pyright] include = ["src", "tests"] exclude = ["**/__pycache__"] reportMissingTypeStubs = false -# Less strict settings for tests +# Less strict 
settings for tests and cli executionEnvironments = [ + { root = "src/layerlens/cli", reportMissingImports = false, reportFunctionMemberAccess = false, reportCallIssue = false, reportArgumentType = false, reportAttributeAccessIssue = false }, { root = "tests", reportGeneralTypeIssues = false, reportOptionalSubscript = false, reportOptionalMemberAccess = false, reportUntypedFunctionDecorator = false, reportUnknownArgumentType = false, reportUnknownMemberType = false, reportUnknownVariableType = false, reportUnnecessaryIsInstance = false, reportUnnecessaryComparison = false, reportArgumentType = false, reportCallIssue = false }, ] diff --git a/requirements-dev.lock b/requirements-dev.lock index 2aaa85b..81a18f2 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -21,10 +21,16 @@ certifi==2025.7.14 # via httpcore # via httpx # via requests +cffi==2.0.0 + # via cryptography charset-normalizer==3.4.3 # via requests +click==8.1.8 + # via layerlens coverage==7.10.2 # via pytest-cov +cryptography==46.0.5 + # via secretstorage docutils==0.22 # via readme-renderer exceptiongroup==1.3.0 @@ -35,7 +41,7 @@ h11==0.16.0 httpcore==1.0.9 # via httpx httpx==0.28.1 - # via test-atlas-lzok + # via layerlens id==1.5.0 # via twine idna==3.10 @@ -54,6 +60,9 @@ jaraco-context==6.0.1 # via keyring jaraco-functools==4.2.1 # via keyring +jeepney==0.9.0 + # via keyring + # via secretstorage keyring==25.6.0 # via twine markdown-it-py==3.0.0 @@ -79,8 +88,10 @@ pathspec==0.12.1 pluggy==1.6.0 # via pytest # via pytest-cov +pycparser==2.23 + # via cffi pydantic==2.11.7 - # via test-atlas-lzok + # via layerlens pydantic-core==2.33.2 # via pydantic pygments==2.19.2 @@ -97,7 +108,6 @@ readme-renderer==44.0 # via twine requests==2.32.5 # via id - # via layerlens # via requests-toolbelt # via twine requests-toolbelt==1.0.0 @@ -107,6 +117,8 @@ rfc3986==2.0.0 rich==14.1.0 # via twine ruff==0.12.7 +secretstorage==3.3.3 + # via keyring sniffio==1.3.1 # via anyio tomli==2.2.1 @@ -115,9 +127,9 
@@ tomli==2.2.1 # via mypy # via pytest twine==6.1.0 -types-requests==2.32.4.20250809 typing-extensions==4.14.1 # via anyio + # via cryptography # via exceptiongroup # via mypy # via pydantic @@ -129,6 +141,5 @@ typing-inspection==0.4.1 urllib3==2.5.0 # via requests # via twine - # via types-requests zipp==3.23.0 # via importlib-metadata diff --git a/requirements.lock b/requirements.lock index 540f4d6..1a890c9 100644 --- a/requirements.lock +++ b/requirements.lock @@ -17,9 +17,8 @@ anyio==4.9.0 certifi==2025.7.14 # via httpcore # via httpx - # via requests -charset-normalizer==3.4.3 - # via requests +click==8.1.8 + # via layerlens exceptiongroup==1.3.0 # via anyio h11==0.16.0 @@ -27,17 +26,14 @@ h11==0.16.0 httpcore==1.0.9 # via httpx httpx==0.28.1 - # via test-atlas-lzok + # via layerlens idna==3.10 # via anyio # via httpx - # via requests pydantic==2.11.7 - # via test-atlas-lzok + # via layerlens pydantic-core==2.33.2 # via pydantic -requests==2.32.5 - # via atlas sniffio==1.3.1 # via anyio typing-extensions==4.14.1 @@ -48,5 +44,3 @@ typing-extensions==4.14.1 # via typing-inspection typing-inspection==0.4.1 # via pydantic -urllib3==2.5.0 - # via requests diff --git a/src/layerlens/__init__.py b/src/layerlens/__init__.py index 78a69f9..5c6adf6 100644 --- a/src/layerlens/__init__.py +++ b/src/layerlens/__init__.py @@ -1,6 +1,7 @@ from .models import ( Judge, Trace, + Integration, JudgeVersion, JudgeSnapshot, BenchmarkPrompt, @@ -47,6 +48,7 @@ "Client", "ComparisonResult", "ComparisonResponse", + "Integration", "Judge", "JudgeOptimizationRun", "JudgeSnapshot", diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py index e7688e0..032e15b 100644 --- a/src/layerlens/_client.py +++ b/src/layerlens/_client.py @@ -21,8 +21,11 @@ from .resources.models import Models, AsyncModels from .resources.traces import Traces, AsyncTraces from .resources.results import Results, AsyncResults + from .resources.scorers import Scorers, AsyncScorers from .resources.benchmarks 
import Benchmarks, AsyncBenchmarks from .resources.evaluations import Evaluations, AsyncEvaluations + from .resources.integrations import Integrations, AsyncIntegrations + from .resources.evaluation_spaces import EvaluationSpaces, AsyncEvaluationSpaces from .resources.trace_evaluations import TraceEvaluations, AsyncTraceEvaluations from .resources.judge_optimizations import JudgeOptimizations, AsyncJudgeOptimizations @@ -124,6 +127,24 @@ def trace_evaluations(self) -> TraceEvaluations: return TraceEvaluations(self) + @cached_property + def integrations(self) -> Integrations: + from .resources.integrations import Integrations + + return Integrations(self) + + @cached_property + def scorers(self) -> Scorers: + from .resources.scorers import Scorers + + return Scorers(self) + + @cached_property + def evaluation_spaces(self) -> EvaluationSpaces: + from .resources.evaluation_spaces import EvaluationSpaces + + return EvaluationSpaces(self) + @cached_property def public(self) -> PublicClient: from ._public_client import PublicClient @@ -293,6 +314,24 @@ def trace_evaluations(self) -> AsyncTraceEvaluations: return AsyncTraceEvaluations(self) + @cached_property + def integrations(self) -> AsyncIntegrations: + from .resources.integrations import AsyncIntegrations + + return AsyncIntegrations(self) + + @cached_property + def scorers(self) -> AsyncScorers: + from .resources.scorers import AsyncScorers + + return AsyncScorers(self) + + @cached_property + def evaluation_spaces(self) -> AsyncEvaluationSpaces: + from .resources.evaluation_spaces import AsyncEvaluationSpaces + + return AsyncEvaluationSpaces(self) + @cached_property def public(self) -> AsyncPublicClient: from ._public_client import AsyncPublicClient diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py index ddcf27c..438b4b5 100644 --- a/src/layerlens/_version.py +++ b/src/layerlens/_version.py @@ -1,4 +1,4 @@ -__version__ = "1.4.0" +__version__ = "1.5.0" # Will be templated during the build 
def _is_version_request(argv: list[str]) -> bool:
    """Return True when *argv* asks for the version without needing click.

    ``--version`` as the first argument always counts. ``-v`` is honoured only
    as a legacy alias when it is the sole argument: the click application
    defines ``-v`` as ``--verbose``, so an invocation such as
    ``stratix -v trace list`` must reach click instead of printing the version
    and exiting.
    """
    args = argv[1:]
    if not args:
        return False
    return args[0] == "--version" or args == ["-v"]


def main() -> None:
    """Console-script entry point for the CLI.

    Prints the version without importing click (so it works without the
    ``[cli]`` extra), prints an install hint and exits with status 1 when
    click is missing, and otherwise hands control to the click application.
    """
    # Handle --version before importing click so it works without the [cli] extra
    if _is_version_request(sys.argv):
        from .._version import __version__

        print(f"stratix {__version__}")  # noqa: T201
        sys.exit(0)

    try:
        import click  # noqa: F401
    except ImportError:
        print(  # noqa: T201
            "CLI dependencies not installed. Install them with:\n\n pip install layerlens[cli]\n"
        )
        sys.exit(1)

    # Imported late: _app imports click at module level.
    from ._app import cli

    cli()
@cli.command("completion")
@click.argument("shell", type=click.Choice(["bash", "zsh", "fish", "powershell"]))
def completion(shell: str) -> None:
    """Print shell completion setup instructions.

    \b
    Examples:
      stratix completion bash
      stratix completion zsh
      stratix completion fish
      stratix completion powershell
    """
    # NOTE: the docstring above is rendered by click as the command's --help
    # text, so implementation notes are kept in comments.
    import os

    # Detect which command name was used to invoke the CLI. click records it
    # as the root context's info_name; the "$_" environment variable is only
    # set by some shells and can be stale when the CLI is launched from
    # another program. Any extension (e.g. ".exe") is dropped before matching.
    invoked_as = click.get_current_context().find_root().info_name or ""
    prog = os.path.splitext(os.path.basename(invoked_as))[0]
    if prog not in ("layerlens", "stratix"):
        prog = "layerlens"
    # Name of the environment variable that switches the program into
    # completion mode.
    env_var = f"_{prog.upper()}_COMPLETE"

    instructions = {
        "bash": f'eval "$({env_var}=bash_source {prog})"',
        "zsh": f'eval "$({env_var}=zsh_source {prog})"',
        "fish": f"{env_var}=fish_source {prog} | source",
        "powershell": (
            f"Register-ArgumentCompleter -Native -CommandName {prog} -ScriptBlock {{\n"
            " param($wordToComplete, $commandAst, $cursorPosition)\n"
            f' $env:{env_var} = "powershell_source"\n'
            f' {prog} | ForEach-Object {{ [System.Management.Automation.CompletionResult]::new($_, $_, "ParameterValue", $_) }}\n'
            f" Remove-Item Env:{env_var}\n"
            "}"
        ),
    }
    if shell == "powershell":
        click.echo(f"Add this to your PowerShell profile:\n\n{instructions[shell]}")
    else:
        click.echo(f"Add this to your shell profile:\n\n {instructions[shell]}")
def get_client(ctx: click.Context) -> Stratix:
    """Build the SDK client from the options stored on the CLI context.

    Reads ``api_key`` and ``base_url`` from ``ctx.obj``. If constructing the
    client raises ``StratixError``, the error is written to stderr and the
    process exits with status 1.
    """
    options = ctx.obj
    try:
        client = Stratix(
            api_key=options.get("api_key"),
            base_url=options.get("base_url"),
        )
    except StratixError as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)
    else:
        return client
_UUID_RE = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE)
# 24-character hexadecimal IDs such as "67e1fe69e014f9fa6e50d7be", the form the
# CLI example scripts pass as model and space IDs.
_HEX24_RE = re.compile(r"[0-9a-f]{24}", re.IGNORECASE)


def _is_uuid(value: str) -> bool:
    """Return True when *value* is a canonical hyphenated UUID string."""
    return bool(_UUID_RE.match(value))


def _looks_like_id(value: str) -> bool:
    """Return True when *value* has the shape of a resource ID (UUID or 24 hex characters)."""
    return _is_uuid(value) or _HEX24_RE.fullmatch(value) is not None


def _resolve(resource: Any, identifier: str) -> Any:
    """Look *identifier* up on *resource* by ID, then key, then name.

    *resource* must provide ``get_by_id``, ``get_by_key`` and ``get(name=...)``.
    The ID lookup is attempted only for ID-shaped identifiers; every miss
    falls through to the next strategy. Returns the first match or ``None``.
    """
    # Try by ID first if it looks like one; a miss still falls through, so an
    # ID-shaped key or name keeps resolving exactly as before.
    if _looks_like_id(identifier):
        found = resource.get_by_id(identifier)
        if found:
            return found

    # Try by key
    found = resource.get_by_key(identifier)
    if found:
        return found

    # Try by name; the first match wins.
    matches = resource.get(name=identifier)
    if matches:
        return matches[0]

    return None


def resolve_model(client: Stratix, identifier: str) -> Any:
    """Resolve a model by ID, key, or name.

    Returns the matching model, or ``None`` when nothing matches.
    """
    return _resolve(client.models, identifier)


def resolve_benchmark(client: Stratix, identifier: str) -> Any:
    """Resolve a benchmark by ID, key, or name.

    Returns the matching benchmark, or ``None`` when nothing matches.
    """
    return _resolve(client.benchmarks, identifier)
def _context_param(ctx: click.Context, name: str) -> Any:
    """Return the first non-None value of parameter *name* on *ctx* or an ancestor.

    NOTE(review): presumably ``api_key``/``host``/``port`` are options of the
    root group, in which case they live on a parent context rather than on the
    sub-command context passed to a completion callback - confirm against the
    root command definition.
    """
    current = ctx
    while current is not None:
        value = (getattr(current, "params", None) or {}).get(name)
        if value is not None:
            return value
        current = getattr(current, "parent", None)
    return None


def _get_client_silent(ctx: click.Context) -> Any:
    """Try to create a Stratix client for autocompletion, returning None on failure."""
    try:
        from .._client import Stratix

        api_key = _context_param(ctx, "api_key") or None
        base_url = None
        host = _context_param(ctx, "host")
        port = _context_param(ctx, "port")
        if host:
            scheme = "https" if port in (None, 443) else "http"
            if port and port not in (80, 443):
                base_url = f"{scheme}://{host}:{port}/api/v1"
            else:
                base_url = f"{scheme}://{host}/api/v1"

        return Stratix(api_key=api_key, base_url=base_url)
    except Exception:
        # Completion must never break the user's shell: any failure means "no client".
        return None


def _has_prefix(value: Any, prefix: str, ignore_case: bool = False) -> bool:
    """Return True when *value* (if not None) starts with *prefix*."""
    if value is None:
        # A missing name/key must not abort the whole candidate list.
        return False
    text = str(value)
    if ignore_case:
        return text.lower().startswith(prefix.lower())
    return text.startswith(prefix)


def complete_trace(
    ctx: click.Context, _param: click.Parameter, incomplete: str
) -> List[click.shell_completion.CompletionItem]:
    """Autocomplete trace IDs."""
    client = _get_client_silent(ctx)
    if not client:
        return []
    try:
        resp = client.traces.get_many(search=incomplete or None, page_size=20)
        if resp and resp.traces:
            return [
                click.shell_completion.CompletionItem(t.id, help=t.filename)
                for t in resp.traces
                if _has_prefix(t.id, incomplete)
            ]
    except Exception:
        # Best-effort: a failed lookup simply yields no suggestions.
        return []
    return []


def complete_judge(
    ctx: click.Context, _param: click.Parameter, incomplete: str
) -> List[click.shell_completion.CompletionItem]:
    """Autocomplete judge IDs (matching on ID or, case-insensitively, on name)."""
    client = _get_client_silent(ctx)
    if not client:
        return []
    try:
        resp = client.judges.get_many(page_size=50)
        if resp and resp.judges:
            return [
                click.shell_completion.CompletionItem(j.id, help=j.name)
                for j in resp.judges
                if _has_prefix(j.id, incomplete) or _has_prefix(j.name, incomplete, ignore_case=True)
            ]
    except Exception:
        # Best-effort: a failed lookup simply yields no suggestions.
        return []
    return []


def complete_model(
    ctx: click.Context, _param: click.Parameter, incomplete: str
) -> List[click.shell_completion.CompletionItem]:
    """Autocomplete model IDs, keys, and names (the suggested value is the key)."""
    client = _get_client_silent(ctx)
    if not client:
        return []
    try:
        models = client.models.get()
        if models:
            return [
                click.shell_completion.CompletionItem(m.key, help=m.name)
                for m in models
                if _has_prefix(m.id, incomplete)
                or _has_prefix(m.key, incomplete, ignore_case=True)
                or _has_prefix(m.name, incomplete, ignore_case=True)
            ]
    except Exception:
        # Best-effort: a failed lookup simply yields no suggestions.
        return []
    return []


def complete_benchmark(
    ctx: click.Context, _param: click.Parameter, incomplete: str
) -> List[click.shell_completion.CompletionItem]:
    """Autocomplete benchmark IDs, keys, and names (the suggested value is the key)."""
    client = _get_client_silent(ctx)
    if not client:
        return []
    try:
        benchmarks = client.benchmarks.get()
        if benchmarks:
            return [
                click.shell_completion.CompletionItem(b.key, help=b.name)
                for b in benchmarks
                if _has_prefix(b.id, incomplete)
                or _has_prefix(b.key, incomplete, ignore_case=True)
                or _has_prefix(b.name, incomplete, ignore_case=True)
            ]
    except Exception:
        # Best-effort: a failed lookup simply yields no suggestions.
        return []
    return []


def complete_evaluation(
    ctx: click.Context, _param: click.Parameter, incomplete: str
) -> List[click.shell_completion.CompletionItem]:
    """Autocomplete evaluation IDs."""
    client = _get_client_silent(ctx)
    if not client:
        return []
    try:
        resp = client.evaluations.get_many(page_size=20)
        if resp and resp.evaluations:
            return [
                click.shell_completion.CompletionItem(
                    e.id,
                    help=f"{getattr(e, 'model_name', '?')} / {getattr(e, 'benchmark_name', '?')}",
                )
                for e in resp.evaluations
                if _has_prefix(e.id, incomplete)
            ]
    except Exception:
        # Best-effort: a failed lookup simply yields no suggestions.
        return []
    return []


def complete_integration(
    ctx: click.Context, _param: click.Parameter, incomplete: str
) -> List[click.shell_completion.CompletionItem]:
    """Autocomplete integration IDs (matching on ID or, case-insensitively, on name)."""
    client = _get_client_silent(ctx)
    if not client:
        return []
    try:
        resp = client.integrations.get_many(page_size=50)
        if resp and resp.integrations:
            return [
                click.shell_completion.CompletionItem(i.id, help=i.name)
                for i in resp.integrations
                if _has_prefix(i.id, incomplete) or _has_prefix(i.name, incomplete, ignore_case=True)
            ]
    except Exception:
        # Best-effort: a failed lookup simply yields no suggestions.
        return []
    return []
def to_dict(obj: Any) -> Any:
    """Convert a Pydantic model (v1 or v2) to a dict.

    Objects exposing ``model_dump()`` or ``dict()`` are converted through that
    method; anything else (including plain dicts) is returned unchanged.
    """
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    if hasattr(obj, "dict"):
        return obj.dict()
    return obj


def _field(record: Any, key: str) -> Any:
    """Read *key* from a dict, or from an attribute when *record* is a plain object."""
    if isinstance(record, dict):
        return record.get(key)
    return getattr(record, key, None)


def format_table(items: List[Any], columns: List[Tuple[str, str]], max_col_width: int = 40) -> str:
    """Render items as a fixed-width text table.

    Args:
        items: List of Pydantic models, dicts, or plain objects with attributes.
        columns: List of (field_key, header_label) tuples.
        max_col_width: Maximum column width before truncation (values below 1
            are treated as 1).

    Returns:
        Formatted table string, or "No results found." for an empty list.
    """
    if not items:
        return "No results found."

    # A non-positive limit would make truncation produce text wider than the column.
    limit = max(max_col_width, 1)

    rows: List[Dict[str, str]] = []
    for item in items:
        record = item if isinstance(item, dict) else to_dict(item)
        rows.append({key: _format_value(_field(record, key)) for key, _ in columns})

    # Each column is as wide as its widest cell or header, capped at the limit.
    widths: Dict[str, int] = {}
    for key, header in columns:
        widest_cell = max(len(row[key]) for row in rows)
        widths[key] = min(max(len(header), widest_cell), limit)

    # Headers are truncated like cells so an over-long label cannot break alignment.
    header_line = " ".join(_truncate(header, widths[key]).ljust(widths[key]) for key, header in columns)
    separator = " ".join("-" * widths[key] for key, _ in columns)

    lines = [header_line, separator]
    for row in rows:
        lines.append(" ".join(_truncate(row[key], widths[key]).ljust(widths[key]) for key, _ in columns))

    return "\n".join(lines)


def format_output(data: Any, output_format: str, columns: Optional[List[Tuple[str, str]]] = None) -> str:
    """Format data as table or JSON.

    Args:
        data: A list of items, a single item, or a dict.
        output_format: "json" selects JSON; any other value selects text output.
        columns: For lists in text mode, list of (field_key, header_label)
            tuples. A list without columns falls back to JSON.

    Returns:
        Formatted string.
    """
    if output_format == "json":
        return _format_json(data)

    if isinstance(data, list):
        if columns:
            return format_table(data, columns)
        return _format_json(data)

    return format_single(data)


def format_single(item: Any) -> str:
    """Format a single item as aligned "Label value" lines, one per field."""
    record = item if isinstance(item, dict) else to_dict(item)
    if not isinstance(record, dict):
        return str(record)

    # Labels keep the key's length (underscores become spaces), so the key
    # length is the label length.
    label_width = (max(len(k) for k in record) if record else 0) + 4
    lines = []
    for key, value in record.items():
        label = key.replace("_", " ").title()
        lines.append(f"{label:<{label_width}} {_format_value(value)}")
    return "\n".join(lines)


def _format_json(data: Any) -> str:
    """Format data as pretty-printed JSON (non-serializable values via ``str``)."""
    if isinstance(data, list):
        return json.dumps([to_dict(item) for item in data], indent=2, default=str)
    return json.dumps(to_dict(data), indent=2, default=str)


def _format_value(val: Any) -> str:
    """Convert a value to a display string."""
    if val is None:
        return "-"
    # bool is checked before the numeric branch so flags do not fall through to str().
    if isinstance(val, bool):
        return "Yes" if val else "No"
    if isinstance(val, float):
        return f"{val:.4f}"
    if isinstance(val, (dict, list)):
        return json.dumps(val, default=str)
    return str(val)


def _truncate(s: str, width: int) -> str:
    """Truncate a string to width, adding ellipsis if needed."""
    if len(s) <= width:
        return s
    if width <= 0:
        # s[: width - 1] would slice from the end and return almost the whole string.
        return ""
    return s[: width - 1] + "\u2026"
EVALUATION_COLUMNS = [
    ("id", "ID"),
    ("status", "Status"),
    ("model_name", "Model"),
    ("benchmark_name", "Benchmark"),
]


@click.group()
def bulk() -> None:
    """Bulk operations.

    \b
    Examples:
      stratix bulk eval --file jobs.jsonl
      stratix bulk eval --judge-id <judge-id> --traces trace_ids.txt
    """


def _load_jobs(file_path: str) -> list:
    """Parse a JSONL job file, skipping (and reporting) lines that cannot be used."""
    jobs = []
    with open(file_path, encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            try:
                job = json.loads(line)
            except json.JSONDecodeError:
                click.echo(f"Skipping invalid JSON line: {line}", err=True)
                continue
            # Valid JSON is not necessarily a usable job: reject non-objects and
            # objects without both keys here instead of crashing later.
            if not isinstance(job, dict) or not job.get("model") or not job.get("benchmark"):
                click.echo(f"Skipping job without 'model' and 'benchmark': {line}", err=True)
                continue
            jobs.append(job)
    return jobs


def _bulk_from_file(client, file_path: str, dry_run: bool, wait: bool) -> None:
    """Mode 1: create one evaluation per job in a JSONL file."""
    jobs = _load_jobs(file_path)
    if not jobs:
        click.echo("No valid jobs found in file.", err=True)
        sys.exit(1)

    if dry_run:
        click.echo(f"[dry-run] Would create {len(jobs)} evaluation(s):")
        for job in jobs:
            click.echo(f" model={job.get('model')} benchmark={job.get('benchmark')}")
        return

    click.echo(f"Creating {len(jobs)} evaluation(s)...")
    evaluations = []
    for i, job in enumerate(jobs, 1):
        # str(): identifiers in JSON may be numbers, but the resolvers expect text.
        m = resolve_model(client, str(job["model"]))
        b = resolve_benchmark(client, str(job["benchmark"]))
        if m is None or b is None:
            click.echo(
                f" [{i}] SKIP - model={job.get('model')} or benchmark={job.get('benchmark')} not found", err=True
            )
            continue

        ev = client.evaluations.create(model=m, benchmark=b)
        if ev:
            click.echo(f" [{i}] Created: {ev.id}")
            evaluations.append(ev)
        else:
            click.echo(f" [{i}] FAIL", err=True)

    click.echo(f"\n{len(evaluations)} evaluation(s) created.")

    if wait and evaluations:
        click.echo("Waiting for completion...")
        for ev in evaluations:
            result = client.evaluations.wait_for_completion(ev)
            if result:
                click.echo(f" {result.id}: {result.status}")


def _bulk_single(
    ctx: click.Context,
    client,
    model_id: str,
    benchmark_id: str,
    traces_file: str | None,
    dry_run: bool,
    wait: bool,
) -> None:
    """Mode 2: create a single evaluation from --model and --benchmark."""
    # Reject the invalid option combination before spending API calls on lookups.
    if traces_file:
        click.echo("Error: --traces requires --judge-id, not --model/--benchmark.", err=True)
        sys.exit(1)

    model = resolve_model(client, model_id)
    if model is None:
        click.echo(f"Model not found: {model_id}", err=True)
        sys.exit(1)
    benchmark = resolve_benchmark(client, benchmark_id)
    if benchmark is None:
        click.echo(f"Benchmark not found: {benchmark_id}", err=True)
        sys.exit(1)

    if dry_run:
        click.echo(f"[dry-run] Would create evaluation: {model.name} x {benchmark.name}")
        return

    click.echo(f"Creating evaluation: {model.name} x {benchmark.name}")
    ev = client.evaluations.create(model=model, benchmark=benchmark)
    if ev is None:
        click.echo("Failed to create evaluation.", err=True)
        sys.exit(1)

    click.echo(f"Evaluation created: {ev.id}")
    if wait:
        click.echo("Waiting for completion...")
        ev = client.evaluations.wait_for_completion(ev)
        if ev is None:
            # Without this guard the formatter below would print the text "None".
            click.echo("Evaluation disappeared while waiting.", err=True)
            sys.exit(1)
        click.echo(f"Evaluation finished: {ev.status}")

    click.echo(format_output(ev, ctx.obj["output_format"]))


def _bulk_traces(client, judge_id: str, traces_file: str, dry_run: bool) -> None:
    """Mode 3: evaluate every trace ID listed in a file with one judge."""
    with open(traces_file, encoding="utf-8") as f:
        trace_ids = [line.strip() for line in f if line.strip()]

    if not trace_ids:
        click.echo("No trace IDs found in file.", err=True)
        sys.exit(1)

    if dry_run:
        click.echo(f"[dry-run] Would create {len(trace_ids)} trace evaluation(s) with judge {judge_id}:")
        for tid in trace_ids:
            click.echo(f" trace={tid}")
        return

    click.echo(f"Creating {len(trace_ids)} trace evaluation(s) with judge {judge_id}...")
    results = []
    for i, trace_id in enumerate(trace_ids, 1):
        te = client.trace_evaluations.create(trace_id=trace_id, judge_id=judge_id)
        if te:
            click.echo(f" [{i}] Created: {te.id} (trace={trace_id})")
            results.append(te)
        else:
            click.echo(f" [{i}] FAIL (trace={trace_id})", err=True)

    click.echo(f"\n{len(results)} trace evaluation(s) created.")


@bulk.command("eval")
@click.option(
    "--file",
    "file_path",
    type=click.Path(exists=True),
    help='JSONL file with evaluation jobs (each line: {"model": ..., "benchmark": ...}).',
)
@click.option("--model", "model_id", default=None, help="Model ID/name (use with --benchmark).")
@click.option("--benchmark", "benchmark_id", default=None, help="Benchmark ID/name (use with --model).")
@click.option("--judge-id", default=None, help="Judge ID (use with --traces).")
@click.option(
    "--traces", "traces_file", type=click.Path(exists=True), default=None, help="File with trace IDs (one per line)."
)
@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
@click.option("--wait", is_flag=True, default=False, help="Wait for all evaluations to complete.")
@click.pass_context
@handle_errors
def bulk_eval(
    ctx: click.Context,
    file_path: str | None,
    model_id: str | None,
    benchmark_id: str | None,
    judge_id: str | None,
    traces_file: str | None,
    dry_run: bool,
    wait: bool,
) -> None:
    """Run evaluations in bulk from a file.

    \b
    Three modes:
      1. JSONL file: each line is {"model": "<model>", "benchmark": "<benchmark>"}
      2. Model + benchmark: run a single evaluation (optionally --wait)
      3. Judge + traces file: evaluate many traces with a judge

    \b
    Examples:
      stratix bulk eval --file jobs.jsonl
      stratix bulk eval --file jobs.jsonl --dry-run
      stratix bulk eval --model gpt-4 --benchmark mmlu --wait
      stratix bulk eval --judge-id <judge-id> --traces trace_ids.txt
    """
    client = get_client(ctx)

    # Mode precedence is unchanged: --file, then --model/--benchmark, then --judge-id/--traces.
    if file_path:
        _bulk_from_file(client, file_path, dry_run, wait)
    elif model_id and benchmark_id:
        _bulk_single(ctx, client, model_id, benchmark_id, traces_file, dry_run, wait)
    elif judge_id and traces_file:
        _bulk_traces(client, judge_id, traces_file, dry_run)
    else:
        click.echo("Provide --file, --model + --benchmark, or --judge-id + --traces.", err=True)
        sys.exit(1)
@click.group()
def ci() -> None:
    """CI/CD pipeline helpers.

    \b
    Examples:
      stratix ci report
      stratix ci report --format json
      stratix ci report --output summary.md
    """


@ci.command("report")
@click.option(
    "--output",
    "-o",
    "output_file",
    default=None,
    type=click.Path(),
    help="Output file path.",
)
@click.option("--limit", default=10, type=int, help="Number of recent evaluations to include.")
@click.option("--dry-run", is_flag=True, default=False, help="Preview without fetching data.")
@click.pass_context
@handle_errors
def ci_report(ctx: click.Context, output_file: str | None, limit: int, dry_run: bool) -> None:
    """Generate a CI summary report.

    Outputs a markdown report suitable for GitHub Actions job summaries.

    \b
    Examples:
      stratix ci report
      stratix ci report --output summary.md
      stratix ci report --limit 20 --format json
      stratix ci report >> $GITHUB_STEP_SUMMARY
    """
    # Local import: only `datetime` is imported at module level.
    from datetime import timezone

    if dry_run:
        click.echo("[dry-run] Would generate CI report")
        return

    client = get_client(ctx)

    evals_resp = client.evaluations.get_many(page_size=limit, sort_by="submitted_at", order="desc")
    if evals_resp is None or not evals_resp.evaluations:
        click.echo("No evaluations found for report.", err=True)
        sys.exit(1)

    evaluations = evals_resp.evaluations

    if ctx.obj["output_format"] == "json":
        # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
        # tzinfo so the serialized timestamp keeps its previous naive format.
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        report = {
            "generated_at": generated_at,
            "total_evaluations": len(evaluations),
            "evaluations": [to_dict(e) for e in evaluations],
            "summary": {
                "passed": sum(1 for e in evaluations if e.status == "success"),
                "failed": sum(1 for e in evaluations if e.status == "failure"),
                "pending": sum(1 for e in evaluations if e.status in ("pending", "in-progress")),
            },
        }
        content = json.dumps(report, indent=2, default=str)
    else:
        content = _build_markdown_report(evaluations)

    if output_file:
        # Explicit encoding: model/benchmark names may contain non-ASCII text and
        # the platform default encoding is not guaranteed to handle it.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(content)
        click.echo(f"Report written to {output_file}")
    else:
        click.echo(content)
def _build_markdown_report(evaluations: list) -> str:
    """Build the markdown CI report for *evaluations*.

    The report has a title, a UTC generation timestamp, a summary table
    (total / passed / failed / pending) and one table row per evaluation.
    Missing or empty attributes are rendered as "-".
    """
    # Local import: only `datetime` is imported at module level.
    from datetime import timezone

    def text(obj, attr: str) -> str:
        # Missing, None and empty values all render as "-".
        value = getattr(obj, attr, None)
        return str(value) if value else "-"

    # datetime.utcnow() is deprecated; an aware UTC "now" formats identically here.
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    statuses = [getattr(e, "status", None) for e in evaluations]
    passed = sum(1 for s in statuses if s == "success")
    failed = sum(1 for s in statuses if s == "failure")
    pending = sum(1 for s in statuses if s in ("pending", "in-progress"))

    lines = [
        "# Stratix Evaluation Report",
        "",
        f"Generated: {generated}",
        "",
        "## Summary",
        "",
        "| Metric | Count |",
        "|--------|-------|",
        f"| Total | {len(evaluations)} |",
        f"| Passed | {passed} |",
        f"| Failed | {failed} |",
        f"| Pending | {pending} |",
        "",
        "## Evaluations",
        "",
        "| ID | Model | Benchmark | Status | Accuracy |",
        "|-----|-------|-----------|--------|----------|",
    ]

    for e in evaluations:
        # The ID is shortened to 12 characters; a None ID must not be sliced.
        eid = text(e, "id")[:12]
        model = text(e, "model_name")
        benchmark = text(e, "benchmark_name")
        status = getattr(e, "status", None) or "-"
        accuracy = getattr(e, "accuracy", None)
        acc_str = f"{accuracy:.2%}" if accuracy is not None else "-"
        emoji = "+" if status == "success" else ("-" if status == "failure" else " ")
        lines.append(f"| `{eid}` | {model} | {benchmark} | {emoji} {status} | {acc_str} |")

    lines.append("")
    lines.append("---")
    lines.append("*Generated by `stratix ci report`*")

    return "\n".join(lines)
EVALUATION_COLUMNS = [
    ("id", "ID"),
    ("status", "Status"),
    ("model_name", "Model"),
    ("benchmark_name", "Benchmark"),
    ("accuracy", "Accuracy"),
    ("submitted_at", "Submitted"),
]


@click.group()
def evaluate() -> None:
    """Manage evaluations.

    \b
    Examples:
      stratix evaluate list
      stratix evaluate get
      stratix evaluate run --model gpt-4 --benchmark mmlu --wait
    """


@evaluate.command("list")
@click.option("--page", default=None, type=int, help="Page number.")
@click.option("--page-size", default=None, type=int, help="Results per page.")
@click.option("--status", default=None, help="Filter by status (pending, in-progress, success, failure).")
@click.option(
    "--sort-by", default=None, type=click.Choice(["submitted_at", "accuracy", "average_duration"]), help="Sort field."
)
@click.option("--order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.")
@click.pass_context
@handle_errors
def list_evaluations(
    ctx: click.Context,
    page: int | None,
    page_size: int | None,
    status: str | None,
    sort_by: str | None,
    order: str | None,
) -> None:
    """List evaluations with optional filtering and pagination.

    \b
    Examples:
      stratix evaluate list
      stratix evaluate list --status success --sort-by accuracy --order desc
      stratix evaluate list --page-size 5
    """
    from ...models import EvaluationStatus

    client = get_client(ctx)

    # Translate the free-text --status value into the SDK enum, if one was given.
    status_filter = None
    if status:
        try:
            status_filter = EvaluationStatus(status)
        except ValueError:
            click.echo(f"Invalid status: {status}. Valid: {', '.join(s.value for s in EvaluationStatus)}", err=True)
            sys.exit(1)

    listing = client.evaluations.get_many(
        page=page,
        page_size=page_size,
        status=status_filter,
        sort_by=sort_by,  # type: ignore[arg-type]
        order=order,  # type: ignore[arg-type]
    )
    if listing is None or not listing.evaluations:
        click.echo("No evaluations found.")
        return

    if ctx.obj["verbose"]:
        click.echo(
            f"Showing page {listing.pagination.page} of {listing.pagination.total_pages} ({listing.pagination.total_count} total)",
            err=True,
        )

    click.echo(format_output(listing.evaluations, ctx.obj["output_format"], EVALUATION_COLUMNS))


@evaluate.command("get")
@click.argument("id", shell_complete=complete_evaluation)
@click.pass_context
@handle_errors
def get_evaluation(ctx: click.Context, id: str) -> None:
    """Get an evaluation by ID.

    \b
    Examples:
      stratix evaluate get abc123
      stratix evaluate get abc123 --format json
    """
    found = get_client(ctx).evaluations.get_by_id(id)
    if found is None:
        click.echo(f"Evaluation {id} not found.", err=True)
        sys.exit(1)

    click.echo(format_output(found, ctx.obj["output_format"]))


@evaluate.command("run")
@click.option("--model", "model_id", required=True, shell_complete=complete_model, help="Model ID, key, or name.")
@click.option(
    "--benchmark", "benchmark_id", required=True, shell_complete=complete_benchmark, help="Benchmark ID, key, or name."
)
@click.option("--wait", is_flag=True, default=False, help="Wait for evaluation to complete.")
@click.pass_context
@handle_errors
def run_evaluation(ctx: click.Context, model_id: str, benchmark_id: str, wait: bool) -> None:
    """Run an evaluation with a model and benchmark.

    The --model and --benchmark options accept an ID, key, or name.

    \b
    Examples:
      stratix evaluate run --model gpt-4 --benchmark mmlu
      stratix evaluate run --model abc123-uuid --benchmark def456-uuid --wait
      stratix evaluate run --model "GPT-4" --benchmark "MMLU" --wait --format json
    """
    client = get_client(ctx)
    # Diagnostics go to stderr so they never mix with the formatted result on stdout.
    verbose = ctx.obj["verbose"]

    if verbose:
        click.echo(f"Resolving model: {model_id}", err=True)
    model = resolve_model(client, model_id)
    if model is None:
        click.echo(f"Model not found: {model_id}", err=True)
        sys.exit(1)

    if verbose:
        click.echo(f"Resolved model: {model.name} ({model.id})", err=True)
        click.echo(f"Resolving benchmark: {benchmark_id}", err=True)
    benchmark = resolve_benchmark(client, benchmark_id)
    if benchmark is None:
        click.echo(f"Benchmark not found: {benchmark_id}", err=True)
        sys.exit(1)

    if verbose:
        click.echo(f"Resolved benchmark: {benchmark.name} ({benchmark.id})", err=True)

    click.echo(f"Creating evaluation: {model.name} x {benchmark.name}")
    created = client.evaluations.create(model=model, benchmark=benchmark)
    if created is None:
        click.echo("Failed to create evaluation.", err=True)
        sys.exit(1)
    click.echo(f"Evaluation created: {created.id} (status: {created.status})")

    final = created
    if wait:
        click.echo("Waiting for completion...")
        final = client.evaluations.wait_for_completion(created)
        if final is None:
            click.echo("Evaluation disappeared while waiting.", err=True)
            sys.exit(1)
        click.echo(f"Evaluation finished: {final.status}")

    click.echo(format_output(final, ctx.obj["output_format"]))
INTEGRATION_COLUMNS = [
    ("id", "ID"),
    ("name", "Name"),
    ("type", "Type"),
    ("status", "Status"),
    ("created_at", "Created"),
]


@click.group()
def integration() -> None:
    """Manage integrations.

    \b
    Examples:
      stratix integration list
      stratix integration test
    """


@integration.command("list")
@click.option("--page", default=None, type=int, help="Page number.")
@click.option("--page-size", default=None, type=int, help="Results per page.")
@click.pass_context
@handle_errors
def list_integrations(ctx: click.Context, page: int | None, page_size: int | None) -> None:
    """List integrations with optional pagination.

    \b
    Examples:
      stratix integration list
      stratix integration list --page-size 10
    """
    client = get_client(ctx)
    result = client.integrations.get_many(page=page, page_size=page_size)
    if result is None or not result.integrations:
        click.echo("No integrations found.")
        return

    if ctx.obj["verbose"]:
        click.echo(f"Showing {result.count} of {result.total_count} integrations", err=True)

    output = format_output(result.integrations, ctx.obj["output_format"], INTEGRATION_COLUMNS)
    click.echo(output)


@integration.command("test")
@click.argument("id", shell_complete=complete_integration)
@click.pass_context
@handle_errors
def test_integration(ctx: click.Context, id: str) -> None:
    """Test an integration by ID.

    Exits with status 1 when the test call fails or reports failure.

    \b
    Examples:
      stratix integration test abc123
      stratix integration test abc123 --format json
    """
    client = get_client(ctx)
    result = client.integrations.test(id)
    if result is None:
        click.echo(f"Failed to test integration {id}.", err=True)
        sys.exit(1)

    if result.success:
        click.echo(f"Integration {id}: OK")
    else:
        click.echo(f"Integration {id}: FAILED")

    if result.message:
        click.echo(f"Message: {result.message}")

    if ctx.obj["output_format"] == "json":
        output = format_output(result, "json")
        click.echo(output)

    if not result.success:
        # A failed check must be visible to scripts and CI jobs, not only in the
        # printed text; previously this path exited with status 0.
        sys.exit(1)
JUDGE_COLUMNS = [
    ("id", "ID"),
    ("name", "Name"),
    ("version", "Version"),
    ("run_count", "Runs"),
    ("created_at", "Created"),
]


@click.group()
def judge() -> None:
    """Manage judges.

    \b
    Examples:
      stratix judge list
      stratix judge get
      stratix judge create --name "Quality" --goal "Evaluate response quality"
      stratix judge test --judge-id --trace-id
    """


@judge.command("list")
@click.option("--page", default=None, type=int, help="Page number.")
@click.option("--page-size", default=None, type=int, help="Results per page.")
@click.pass_context
@handle_errors
def list_judges(ctx: click.Context, page: int | None, page_size: int | None) -> None:
    """List judges with optional pagination.

    \b
    Examples:
      stratix judge list
      stratix judge list --page-size 5
    """
    listing = get_client(ctx).judges.get_many(page=page, page_size=page_size)
    # A missing response and an empty page are reported the same way.
    if listing is None or not listing.judges:
        click.echo("No judges found.")
        return

    if ctx.obj["verbose"]:
        click.echo(f"Showing {listing.count} of {listing.total_count} judges", err=True)

    click.echo(format_output(listing.judges, ctx.obj["output_format"], JUDGE_COLUMNS))


@judge.command("get")
@click.argument("id", shell_complete=complete_judge)
@click.pass_context
@handle_errors
def get_judge(ctx: click.Context, id: str) -> None:
    """Get a judge by ID.

    \b
    Examples:
      stratix judge get abc123
      stratix judge get abc123 --format json
    """
    found = get_client(ctx).judges.get(id)
    if found is None:
        click.echo(f"Judge {id} not found.", err=True)
        sys.exit(1)

    click.echo(format_output(found, ctx.obj["output_format"]))


@judge.command("create")
@click.option("--name", required=True, help="Judge name.")
@click.option("--goal", required=True, help="Evaluation goal description.")
@click.option("--model-id", default=None, shell_complete=complete_model, help="Model ID for the judge.")
@click.pass_context
@handle_errors
def create_judge(ctx: click.Context, name: str, goal: str, model_id: str | None) -> None:
    """Create a new judge.

    \b
    Examples:
      stratix judge create --name "Quality" --goal "Evaluate response quality"
      stratix judge create --name "Safety" --goal "Check for harmful content" --model-id abc123
    """
    # The CLI's --goal option is passed to the SDK as `evaluation_goal`.
    created = get_client(ctx).judges.create(name=name, evaluation_goal=goal, model_id=model_id)
    if created is None:
        click.echo("Failed to create judge.", err=True)
        sys.exit(1)

    click.echo(f"Judge created: {created.id}")
    click.echo(format_output(created, ctx.obj["output_format"]))
@judge.command("test")
@click.option("--judge-id", required=True, shell_complete=complete_judge, help="Judge ID to test with.")
@click.option("--trace-id", required=True, shell_complete=complete_trace, help="Trace ID to evaluate.")
@click.pass_context
@handle_errors
def test_judge(ctx: click.Context, judge_id: str, trace_id: str) -> None:
    """Test a judge by evaluating a trace.

    Creates a trace evaluation using the specified judge and trace.

    \b
    Examples:
      stratix judge test --judge-id abc123 --trace-id def456
      stratix judge test --judge-id abc123 --trace-id def456 --format json
    """
    trace_eval = get_client(ctx).trace_evaluations.create(trace_id=trace_id, judge_id=judge_id)
    if trace_eval is None:
        click.echo("Failed to create trace evaluation.", err=True)
        sys.exit(1)

    click.echo(f"Trace evaluation created: {trace_eval.id}")
    click.echo(format_output(trace_eval, ctx.obj["output_format"]))


SCORER_COLUMNS = [
    ("id", "ID"),
    ("name", "Name"),
    ("model_name", "Model"),
    ("model_company", "Company"),
    ("created_at", "Created"),
]


@click.group()
def scorer() -> None:
    """Manage scorers.

    \b
    Examples:
      stratix scorer list
      stratix scorer get
      stratix scorer create --name "Quality" --description "..." --model-id --prompt "..."
      stratix scorer delete
    """


@scorer.command("list")
@click.option("--page", default=None, type=int, help="Page number.")
@click.option("--page-size", default=None, type=int, help="Results per page.")
@click.pass_context
@handle_errors
def list_scorers(ctx: click.Context, page: int | None, page_size: int | None) -> None:
    """List scorers with optional pagination.

    \b
    Examples:
      stratix scorer list
      stratix scorer list --page-size 10
    """
    listing = get_client(ctx).scorers.get_many(page=page, page_size=page_size)
    # A missing response and an empty page are reported the same way.
    if listing is None or not listing.scorers:
        click.echo("No scorers found.")
        return

    if ctx.obj["verbose"]:
        click.echo(f"Showing {listing.count} of {listing.total_count} scorers", err=True)

    click.echo(format_output(listing.scorers, ctx.obj["output_format"], SCORER_COLUMNS))


@scorer.command("get")
@click.argument("id")
@click.pass_context
@handle_errors
def get_scorer(ctx: click.Context, id: str) -> None:
    """Get a scorer by ID.

    \b
    Examples:
      stratix scorer get abc123
      stratix scorer get abc123 --format json
    """
    found = get_client(ctx).scorers.get(id)
    if found is None:
        click.echo(f"Scorer {id} not found.", err=True)
        sys.exit(1)

    click.echo(format_output(found, ctx.obj["output_format"]))
@scorer.command("create")
@click.option("--name", required=True, help="Scorer name (3-64 chars).")
@click.option("--description", required=True, help="Scorer description (10-500 chars).")
@click.option("--model-id", required=True, help="Model ID to use for scoring.")
@click.option("--prompt", required=True, help="Scoring prompt.")
@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
@click.pass_context
@handle_errors
def create_scorer(ctx: click.Context, name: str, description: str, model_id: str, prompt: str, dry_run: bool) -> None:
    """Create a new scorer.

    \b
    Examples:
      stratix scorer create --name "Quality" --description "Evaluate quality" --model-id abc123 --prompt "Rate the quality..."
      stratix scorer create --name "Test" --description "Test scorer" --model-id abc123 --prompt "..." --dry-run
    """
    if dry_run:
        # Only the first 80 characters of the prompt are previewed.
        prompt_preview = prompt[:80] + ("..." if len(prompt) > 80 else "")
        click.echo(f"[dry-run] Would create scorer: {name}")
        click.echo(f" Model: {model_id}")
        click.echo(f" Prompt: {prompt_preview}")
        return

    created = get_client(ctx).scorers.create(name=name, description=description, model_id=model_id, prompt=prompt)
    if created is None:
        click.echo("Failed to create scorer.", err=True)
        sys.exit(1)

    click.echo(f"Scorer created: {created.id}")
    click.echo(format_output(created, ctx.obj["output_format"]))


@scorer.command("delete")
@click.argument("id")
@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.")
@click.pass_context
@handle_errors
def delete_scorer(ctx: click.Context, id: str, yes: bool, dry_run: bool) -> None:
    """Delete a scorer by ID.

    \b
    Examples:
      stratix scorer delete abc123
      stratix scorer delete abc123 --yes
      stratix scorer delete abc123 --dry-run
    """
    if dry_run:
        click.echo(f"[dry-run] Would delete scorer {id}")
        return

    # The confirmation happens before a client is created or any request is made.
    if not yes:
        click.confirm(f"Are you sure you want to delete scorer {id}?", abort=True)

    deleted = get_client(ctx).scorers.delete(id)
    if not deleted:
        click.echo(f"Failed to delete scorer {id}.", err=True)
        sys.exit(1)
    click.echo(f"Scorer {id} deleted.")
+ + \b + Examples: + stratix space list + stratix space get + stratix space create --name "My Space" + stratix space delete + """ + + +@space.command("list") +@click.option("--page", default=None, type=int, help="Page number.") +@click.option("--page-size", default=None, type=int, help="Results per page.") +@click.option("--sort-by", default=None, help="Sort field (e.g. weight, created_at).") +@click.option("--order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.") +@click.pass_context +@handle_errors +def list_spaces( + ctx: click.Context, page: int | None, page_size: int | None, sort_by: str | None, order: str | None +) -> None: + """List evaluation spaces with optional pagination. + + \b + Examples: + stratix space list + stratix space list --page-size 10 + stratix space list --sort-by created_at --order desc + """ + client = get_client(ctx) + result = client.evaluation_spaces.get_many(page=page, page_size=page_size, sort_by=sort_by, order=order) + if result is None or not result.evaluation_spaces: + click.echo("No evaluation spaces found.") + return + + if ctx.obj["verbose"]: + click.echo(f"Showing {result.count} of {result.total_count} evaluation spaces", err=True) + + output = format_output(result.evaluation_spaces, ctx.obj["output_format"], SPACE_COLUMNS) + click.echo(output) + + +@space.command("get") +@click.argument("id") +@click.pass_context +@handle_errors +def get_space(ctx: click.Context, id: str) -> None: + """Get an evaluation space by ID or slug. 
+ + \b + Examples: + stratix space get abc123 + stratix space get my-space-slug + stratix space get abc123 --format json + """ + client = get_client(ctx) + s = client.evaluation_spaces.get(id) + if s is None: + click.echo(f"Evaluation space {id} not found.", err=True) + sys.exit(1) + + output = format_output(s, ctx.obj["output_format"]) + click.echo(output) + + +@space.command("create") +@click.option("--name", required=True, help="Space name.") +@click.option("--description", default=None, help="Space description (max 500 chars).") +@click.option( + "--visibility", default=None, type=click.Choice(["private", "public", "tenant"]), help="Visibility level." +) +@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.") +@click.pass_context +@handle_errors +def create_space(ctx: click.Context, name: str, description: str | None, visibility: str | None, dry_run: bool) -> None: + """Create a new evaluation space. + + \b + Examples: + stratix space create --name "Production" + stratix space create --name "Public Board" --visibility public + stratix space create --name "Test" --dry-run + """ + if dry_run: + click.echo(f"[dry-run] Would create evaluation space: {name}") + if visibility: + click.echo(f" Visibility: {visibility}") + return + + client = get_client(ctx) + s = client.evaluation_spaces.create(name=name, description=description, visibility=visibility) + if s is None: + click.echo("Failed to create evaluation space.", err=True) + sys.exit(1) + + click.echo(f"Evaluation space created: {s.id}") + output = format_output(s, ctx.obj["output_format"]) + click.echo(output) + + +@space.command("delete") +@click.argument("id") +@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.") +@click.option("--dry-run", is_flag=True, default=False, help="Preview without executing.") +@click.pass_context +@handle_errors +def delete_space(ctx: click.Context, id: str, yes: bool, dry_run: bool) -> None: + """Delete an 
evaluation space by ID. + + \b + Examples: + stratix space delete abc123 + stratix space delete abc123 --yes + stratix space delete abc123 --dry-run + """ + if dry_run: + click.echo(f"[dry-run] Would delete evaluation space {id}") + return + + if not yes: + click.confirm(f"Are you sure you want to delete evaluation space {id}?", abort=True) + + client = get_client(ctx) + success = client.evaluation_spaces.delete(id) + if success: + click.echo(f"Evaluation space {id} deleted.") + else: + click.echo(f"Failed to delete evaluation space {id}.", err=True) + sys.exit(1) diff --git a/src/layerlens/cli/commands/trace.py b/src/layerlens/cli/commands/trace.py new file mode 100644 index 0000000..3671693 --- /dev/null +++ b/src/layerlens/cli/commands/trace.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import sys +import json + +import click + +from .._client import get_client, handle_errors +from .._formatter import to_dict, format_output +from .._completions import complete_trace + +TRACE_COLUMNS = [ + ("id", "ID"), + ("created_at", "Created"), + ("filename", "Filename"), + ("evaluations_count", "Evaluations"), +] + + +@click.group() +def trace() -> None: + """Manage traces. 
+ + \b + Examples: + stratix trace list + stratix trace get + stratix trace search "user login" + stratix trace export --output trace.json + stratix trace delete --yes + """ + + +@trace.command("list") +@click.option("--page", default=None, type=int, help="Page number.") +@click.option("--page-size", default=None, type=int, help="Results per page.") +@click.option("--source", default=None, help="Filter by source.") +@click.option("--status", default=None, help="Filter by status.") +@click.option("--sort-by", default=None, help="Sort field.") +@click.option("--sort-order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.") +@click.pass_context +@handle_errors +def list_traces( + ctx: click.Context, + page: int | None, + page_size: int | None, + source: str | None, + status: str | None, + sort_by: str | None, + sort_order: str | None, +) -> None: + """List traces with optional filtering and pagination. + + \b + Examples: + stratix trace list + stratix trace list --page-size 10 + stratix trace list --source sdk --sort-by created_at --sort-order desc + """ + client = get_client(ctx) + result = client.traces.get_many( + page=page, + page_size=page_size, + source=source, + status=status, + sort_by=sort_by, + sort_order=sort_order, + ) + if result is None or not result.traces: + click.echo("No traces found.") + return + + if ctx.obj["verbose"]: + click.echo(f"Showing {result.count} of {result.total_count} traces", err=True) + + output = format_output(result.traces, ctx.obj["output_format"], TRACE_COLUMNS) + click.echo(output) + + +@trace.command("get") +@click.argument("id", shell_complete=complete_trace) +@click.pass_context +@handle_errors +def get_trace(ctx: click.Context, id: str) -> None: + """Get a trace by ID. 
+ + \b + Examples: + stratix trace get abc123-def4-5678-ghij-klmnopqrstuv + stratix trace get abc123 --format json + """ + client = get_client(ctx) + trace = client.traces.get(id) + if trace is None: + click.echo(f"Trace {id} not found.", err=True) + sys.exit(1) + + output = format_output(trace, ctx.obj["output_format"]) + click.echo(output) + + +@trace.command("search") +@click.argument("query") +@click.option("--page", default=None, type=int, help="Page number.") +@click.option("--page-size", default=None, type=int, help="Results per page.") +@click.option("--source", default=None, help="Filter by source.") +@click.option("--status", default=None, help="Filter by status.") +@click.option("--sort-by", default=None, help="Sort field.") +@click.option("--sort-order", default=None, type=click.Choice(["asc", "desc"]), help="Sort order.") +@click.pass_context +@handle_errors +def search_traces( + ctx: click.Context, + query: str, + page: int | None, + page_size: int | None, + source: str | None, + status: str | None, + sort_by: str | None, + sort_order: str | None, +) -> None: + """Search traces by query string. + + \b + Examples: + stratix trace search "user login" + stratix trace search "error" --source sdk --page-size 5 + """ + client = get_client(ctx) + result = client.traces.get_many( + search=query, + page=page, + page_size=page_size, + source=source, + status=status, + sort_by=sort_by, + sort_order=sort_order, + ) + if result is None or not result.traces: + click.echo("No traces found matching your query.") + return + + if ctx.obj["verbose"]: + click.echo(f"Found {result.count} of {result.total_count} traces", err=True) + + output = format_output(result.traces, ctx.obj["output_format"], TRACE_COLUMNS) + click.echo(output) + + +@trace.command("export") +@click.argument("id", shell_complete=complete_trace) +@click.option( + "--output", "-o", "output_file", default=None, type=click.Path(), help="Output file path (default: stdout)." 
+) +@click.pass_context +@handle_errors +def export_trace(ctx: click.Context, id: str, output_file: str | None) -> None: + """Export a trace as JSON. + + \b + Examples: + stratix trace export abc123 + stratix trace export abc123 --output trace.json + """ + client = get_client(ctx) + trace = client.traces.get(id) + if trace is None: + click.echo(f"Trace {id} not found.", err=True) + sys.exit(1) + + json_str = json.dumps(to_dict(trace), indent=2, default=str) + + if output_file: + with open(output_file, "w") as f: + f.write(json_str) + click.echo(f"Trace exported to {output_file}") + else: + click.echo(json_str) + + +@trace.command("delete") +@click.argument("id", shell_complete=complete_trace) +@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.") +@click.pass_context +@handle_errors +def delete_trace(ctx: click.Context, id: str, yes: bool) -> None: + """Delete a trace by ID. + + \b + Examples: + stratix trace delete abc123 + stratix trace delete abc123 --yes + """ + if not yes: + click.confirm(f"Are you sure you want to delete trace {id}?", abort=True) + + client = get_client(ctx) + success = client.traces.delete(id) + if success: + click.echo(f"Trace {id} deleted.") + else: + click.echo(f"Failed to delete trace {id}.", err=True) + sys.exit(1) diff --git a/src/layerlens/models/__init__.py b/src/layerlens/models/__init__.py index 26bcfb9..b4aa7a5 100644 --- a/src/layerlens/models/__init__.py +++ b/src/layerlens/models/__init__.py @@ -5,6 +5,7 @@ ModelsResponse, TracesResponse, ResultsResponse, + ScorersResponse, UploadURLResponse, BenchmarksResponse, CreateJudgeResponse, @@ -14,8 +15,11 @@ UpdateJudgeResponse, CostEstimateResponse, CreateTracesResponse, + IntegrationsResponse, OrganizationResponse, CreateBenchmarkResponse, + TestIntegrationResponse, + EvaluationSpacesResponse, TraceEvaluationsResponse, CreateEvaluationsResponse, JudgeOptimizationRunsResponse, @@ -38,6 +42,7 @@ PublicModelsListResponse, 
PublicBenchmarksListResponse, ) +from .scorer import Scorer from .benchmark import Benchmark, CustomBenchmark, PublicBenchmark from .evaluation import ( Result, @@ -52,7 +57,14 @@ PerformanceDetails, EvaluationModelInfo, ) +from .integration import Integration from .organization import Project, Organization +from .evaluation_space import ( + EvaluationSpace, + EvaluationSpaceFilters, + EvaluationSpaceModelFilter, + EvaluationSpaceDatasetFilter, +) from .trace_evaluation import ( JudgeSnapshot, TraceEvaluation, @@ -86,6 +98,14 @@ "CustomModel", "DeleteJudgeResponse", "EstimateJudgeOptimizationCostResponse", + "EvaluationSpace", + "EvaluationSpaceDatasetFilter", + "EvaluationSpaceFilters", + "EvaluationSpaceModelFilter", + "EvaluationSpacesResponse", + "Integration", + "IntegrationsResponse", + "TestIntegrationResponse", "AnalysisSummary", "ErrorAnalysis", "Evaluation", @@ -120,6 +140,8 @@ "Result", "ResultMetrics", "ResultsResponse", + "Scorer", + "ScorersResponse", "Trace", "TraceEvaluation", "TraceEvaluationResult", diff --git a/src/layerlens/models/api.py b/src/layerlens/models/api.py index 72a2390..308f89f 100644 --- a/src/layerlens/models/api.py +++ b/src/layerlens/models/api.py @@ -1,15 +1,18 @@ from __future__ import annotations -from typing import List +from typing import List, Optional from pydantic import Field, BaseModel, ConfigDict from .judge import Judge from .model import Model from .trace import TraceWithEvaluations +from .scorer import Scorer from .benchmark import Benchmark from .evaluation import Result, Evaluation +from .integration import Integration from .organization import Organization +from .evaluation_space import EvaluationSpace from .trace_evaluation import TraceEvaluation, TraceEvaluationResult from .judge_optimization import JudgeOptimizationRun @@ -152,3 +155,26 @@ class ApplyJudgeOptimizationResultResponse(BaseModel): judge_id: str new_version: int message: str + + +class IntegrationsResponse(BaseModel): + integrations: 
List[Integration] + count: int + total_count: int + + +class TestIntegrationResponse(BaseModel): + success: bool + message: Optional[str] = None + + +class ScorersResponse(BaseModel): + scorers: List[Scorer] + count: int + total_count: int + + +class EvaluationSpacesResponse(BaseModel): + evaluation_spaces: List[EvaluationSpace] + count: int + total_count: int diff --git a/src/layerlens/models/evaluation.py b/src/layerlens/models/evaluation.py index 7d16a3c..f2f090d 100644 --- a/src/layerlens/models/evaluation.py +++ b/src/layerlens/models/evaluation.py @@ -40,6 +40,8 @@ class EvaluationDataset(BaseModel): class EvaluationModelInfo(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + model_name: str = "" performance: Any = None @@ -71,7 +73,7 @@ class EvaluationSummary(BaseModel): class Evaluation(BaseModel): - model_config = ConfigDict(populate_by_name=True) + model_config = ConfigDict(populate_by_name=True, protected_namespaces=()) id: str status: EvaluationStatus diff --git a/src/layerlens/models/evaluation_space.py b/src/layerlens/models/evaluation_space.py new file mode 100644 index 0000000..3ab0be5 --- /dev/null +++ b/src/layerlens/models/evaluation_space.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from typing import List, Optional + +from pydantic import BaseModel + + +class EvaluationSpaceModelFilter(BaseModel): + ids: List[str] = [] + vendors: List[str] = [] + regions: List[str] = [] + + +class EvaluationSpaceDatasetFilter(BaseModel): + ids: List[str] = [] + categories: List[str] = [] + languages: List[str] = [] + + +class EvaluationSpaceFilters(BaseModel): + model_filters: Optional[EvaluationSpaceModelFilter] = None + dataset_filters: Optional[EvaluationSpaceDatasetFilter] = None + providers: List[str] = [] + + +class EvaluationSpace(BaseModel): + id: str + organization_id: Optional[str] = None + project_id: Optional[str] = None + name: str + description: Optional[str] = None + filters: Optional[EvaluationSpaceFilters] = None 
+ owner: Optional[str] = None + visibility: Optional[str] = None + is_featured: bool = False + is_partner: bool = False + partner_name: Optional[str] = None + created_at: Optional[str] = None + image_path: Optional[str] = None + weight: int = 0 + slug: Optional[str] = None + models_count: int = 0 + benchmarks_count: int = 0 + evaluations_count: int = 0 diff --git a/src/layerlens/models/integration.py b/src/layerlens/models/integration.py new file mode 100644 index 0000000..5262259 --- /dev/null +++ b/src/layerlens/models/integration.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional + +from pydantic import BaseModel + + +class Integration(BaseModel): + id: str + organization_id: str + project_id: str + name: str + type: Optional[str] = None + status: Optional[str] = None + created_at: Optional[str] = None + config: Dict[str, Any] = {} diff --git a/src/layerlens/models/judge.py b/src/layerlens/models/judge.py index 8b68439..db416a1 100644 --- a/src/layerlens/models/judge.py +++ b/src/layerlens/models/judge.py @@ -2,10 +2,12 @@ from typing import List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict class JudgeVersion(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + version: int name: str evaluation_goal: str @@ -17,6 +19,8 @@ class JudgeVersion(BaseModel): class Judge(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + id: str organization_id: str project_id: str diff --git a/src/layerlens/models/scorer.py b/src/layerlens/models/scorer.py new file mode 100644 index 0000000..585d064 --- /dev/null +++ b/src/layerlens/models/scorer.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel + + +class Scorer(BaseModel): + id: Optional[str] = None + organization_id: Optional[str] = None + project_id: Optional[str] = None + name: str + description: Optional[str] = None + model_id: Optional[str] = 
None + model_name: Optional[str] = None + model_key: Optional[str] = None + model_company: Optional[str] = None + prompt: Optional[str] = None + created_at: Optional[str] = None + updated_at: Optional[str] = None diff --git a/src/layerlens/models/trace_evaluation.py b/src/layerlens/models/trace_evaluation.py index 98f32db..b17b485 100644 --- a/src/layerlens/models/trace_evaluation.py +++ b/src/layerlens/models/trace_evaluation.py @@ -14,7 +14,7 @@ class TraceEvaluationStatus(str, Enum): class JudgeSnapshot(BaseModel): - model_config = ConfigDict(populate_by_name=True) + model_config = ConfigDict(populate_by_name=True, protected_namespaces=()) name: str version: int diff --git a/src/layerlens/resources/benchmarks/benchmarks.py b/src/layerlens/resources/benchmarks/benchmarks.py index b4ecf69..fca94c6 100644 --- a/src/layerlens/resources/benchmarks/benchmarks.py +++ b/src/layerlens/resources/benchmarks/benchmarks.py @@ -83,8 +83,22 @@ def cast_benchmark(b: Benchmark, bench_type: str) -> Benchmark: if resp: benchmarks.extend([cast_benchmark(b, type) for b in resp.data.benchmarks]) - if name: - benchmarks = [b for b in benchmarks if name.lower() in b.name.lower()] + # Exclude custom benchmarks when filtering by fields they don't have + if categories: + cat_set = {c.lower() for c in categories} + benchmarks = [ + b + for b in benchmarks + if isinstance(b, PublicBenchmark) and b.categories and any(c.lower() in cat_set for c in b.categories) + ] + + if languages: + lang_set = {l.lower() for l in languages} + benchmarks = [ + b + for b in benchmarks + if isinstance(b, PublicBenchmark) and b.language and b.language.lower() in lang_set + ] return benchmarks @@ -356,8 +370,22 @@ def cast_benchmark(b: Benchmark, bench_type: str) -> Benchmark: if resp: benchmarks.extend([cast_benchmark(b, type) for b in resp.data.benchmarks]) - if name: - benchmarks = [b for b in benchmarks if name.lower() in b.name.lower()] + # Exclude custom benchmarks when filtering by fields they don't have 
+ if categories: + cat_set = {c.lower() for c in categories} + benchmarks = [ + b + for b in benchmarks + if isinstance(b, PublicBenchmark) and b.categories and any(c.lower() in cat_set for c in b.categories) + ] + + if languages: + lang_set = {l.lower() for l in languages} + benchmarks = [ + b + for b in benchmarks + if isinstance(b, PublicBenchmark) and b.language and b.language.lower() in lang_set + ] return benchmarks diff --git a/src/layerlens/resources/evaluation_spaces/__init__.py b/src/layerlens/resources/evaluation_spaces/__init__.py new file mode 100644 index 0000000..60e7ca2 --- /dev/null +++ b/src/layerlens/resources/evaluation_spaces/__init__.py @@ -0,0 +1,3 @@ +from .evaluation_spaces import EvaluationSpaces, AsyncEvaluationSpaces + +__all__ = ["EvaluationSpaces", "AsyncEvaluationSpaces"] diff --git a/src/layerlens/resources/evaluation_spaces/evaluation_spaces.py b/src/layerlens/resources/evaluation_spaces/evaluation_spaces.py new file mode 100644 index 0000000..0442c4d --- /dev/null +++ b/src/layerlens/resources/evaluation_spaces/evaluation_spaces.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional + +import httpx + +from ...models import EvaluationSpace, EvaluationSpacesResponse +from ..._resource import SyncAPIResource, AsyncAPIResource +from ..._constants import DEFAULT_TIMEOUT + +DEFAULT_PAGE = 1 +DEFAULT_PAGE_SIZE = 100 +MAX_PAGE_SIZE = 500 + + +def _unwrap(resp: Any) -> Any: + if isinstance(resp, dict) and "data" in resp and "status" in resp: + return resp["data"] + return resp + + +class EvaluationSpaces(SyncAPIResource): + def _base_url(self) -> str: + return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/evaluation-spaces" + + def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[EvaluationSpace]: + resp = self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, 
dict): + try: + return EvaluationSpace(**data) + except Exception: + return None + return None + + def get_many( + self, + *, + page: Optional[int] = None, + page_size: Optional[int] = None, + sort_by: Optional[str] = None, + order: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationSpacesResponse]: + params: Dict[str, Any] = {} + effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE + effective_page = page if page is not None else DEFAULT_PAGE + params["page"] = str(effective_page) + params["page_size"] = str(effective_page_size) + if sort_by: + params["sort_by"] = sort_by + if order: + params["order"] = order + + resp = self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict) + if not resp or not isinstance(resp, dict): + return None + data = _unwrap(resp) + if not isinstance(data, dict): + return None + + spaces = [EvaluationSpace(**s) if isinstance(s, dict) else s for s in data.get("evaluation_spaces", [])] + count: int = data.get("count", len(spaces)) + total_count: int = data.get("total_count", count) + try: + return EvaluationSpacesResponse(evaluation_spaces=spaces, count=count, total_count=total_count) + except Exception: + return None + + def create( + self, + *, + name: str, + description: Optional[str] = None, + visibility: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationSpace]: + body: Dict[str, Any] = {"name": name} + if description: + body["description"] = description + if visibility: + body["visibility"] = visibility + resp = self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return EvaluationSpace(**data) + except Exception: + return None + return None + + def update( + self, + id: str, + *, + description: Optional[str] = None, + visibility: Optional[str] = None, + timeout: float | 
httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationSpace]: + body: Dict[str, Any] = {} + if description is not None: + body["description"] = description + if visibility is not None: + body["visibility"] = visibility + resp = self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return EvaluationSpace(**data) + except Exception: + return None + return None + + def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool: + try: + self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict) + return True + except Exception: + return False + + +class AsyncEvaluationSpaces(AsyncAPIResource): + def _base_url(self) -> str: + return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/evaluation-spaces" + + async def get( + self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT + ) -> Optional[EvaluationSpace]: + resp = await self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return EvaluationSpace(**data) + except Exception: + return None + return None + + async def get_many( + self, + *, + page: Optional[int] = None, + page_size: Optional[int] = None, + sort_by: Optional[str] = None, + order: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationSpacesResponse]: + params: Dict[str, Any] = {} + effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE + effective_page = page if page is not None else DEFAULT_PAGE + params["page"] = str(effective_page) + params["page_size"] = str(effective_page_size) + if sort_by: + params["sort_by"] = sort_by + if order: + params["order"] = order + resp = await self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict) + if not resp or not 
isinstance(resp, dict): + return None + data = _unwrap(resp) + if not isinstance(data, dict): + return None + spaces = [EvaluationSpace(**s) if isinstance(s, dict) else s for s in data.get("evaluation_spaces", [])] + count: int = data.get("count", len(spaces)) + total_count: int = data.get("total_count", count) + try: + return EvaluationSpacesResponse(evaluation_spaces=spaces, count=count, total_count=total_count) + except Exception: + return None + + async def create( + self, + *, + name: str, + description: Optional[str] = None, + visibility: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationSpace]: + body: Dict[str, Any] = {"name": name} + if description: + body["description"] = description + if visibility: + body["visibility"] = visibility + resp = await self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return EvaluationSpace(**data) + except Exception: + return None + return None + + async def update( + self, + id: str, + *, + description: Optional[str] = None, + visibility: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationSpace]: + body: Dict[str, Any] = {} + if description is not None: + body["description"] = description + if visibility is not None: + body["visibility"] = visibility + resp = await self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return EvaluationSpace(**data) + except Exception: + return None + return None + + async def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool: + try: + await self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict) + return True + except Exception: + return False diff --git a/src/layerlens/resources/integrations/__init__.py b/src/layerlens/resources/integrations/__init__.py 
new file mode 100644 index 0000000..101d042 --- /dev/null +++ b/src/layerlens/resources/integrations/__init__.py @@ -0,0 +1,3 @@ +from .integrations import Integrations, AsyncIntegrations + +__all__ = ["Integrations", "AsyncIntegrations"] diff --git a/src/layerlens/resources/integrations/integrations.py b/src/layerlens/resources/integrations/integrations.py new file mode 100644 index 0000000..ecd856b --- /dev/null +++ b/src/layerlens/resources/integrations/integrations.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional + +import httpx + +from ...models import ( + Integration, + IntegrationsResponse, + TestIntegrationResponse, +) +from ..._resource import SyncAPIResource, AsyncAPIResource +from ..._constants import DEFAULT_TIMEOUT + +DEFAULT_PAGE = 1 +DEFAULT_PAGE_SIZE = 100 +MAX_PAGE_SIZE = 500 + + +def _unwrap(resp: Any) -> Any: + """Unwrap {"status": ..., "data": ...} envelope if present.""" + if isinstance(resp, dict) and "data" in resp and "status" in resp: + return resp["data"] + return resp + + +class Integrations(SyncAPIResource): + def _base_url(self) -> str: + return f"/organizations/{self._client.organization_id}/integrations" + + def get( + self, + id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[Integration]: + resp = self._get( + f"{self._base_url()}/{id}", + timeout=timeout, + cast_to=dict, + ) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return Integration(**data) + except Exception: + return None + return None + + def get_many( + self, + *, + page: Optional[int] = None, + page_size: Optional[int] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[IntegrationsResponse]: + params: Dict[str, Any] = {} + + effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE + effective_page = page if page is not None else DEFAULT_PAGE + + params["page"] = str(effective_page) + 
params["page_size"] = str(effective_page_size) + + resp = self._get( + self._base_url(), + params=params, + timeout=timeout, + cast_to=dict, + ) + if not resp or not isinstance(resp, dict): + return None + + data = _unwrap(resp) + if not isinstance(data, dict): + return None + + integrations = [i if isinstance(i, Integration) else Integration(**i) for i in data.get("integrations", [])] + count: int = data.get("count", len(integrations)) + total_count: int = data.get("total_count", count) + + try: + return IntegrationsResponse(integrations=integrations, count=count, total_count=total_count) + except Exception: + return None + + def test( + self, + id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[TestIntegrationResponse]: + resp = self._post( + f"{self._base_url()}/{id}/test", + body={}, + timeout=timeout, + cast_to=dict, + ) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return TestIntegrationResponse(**data) + except Exception: + return None + return None + + +class AsyncIntegrations(AsyncAPIResource): + def _base_url(self) -> str: + return f"/organizations/{self._client.organization_id}/integrations" + + async def get( + self, + id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[Integration]: + resp = await self._get( + f"{self._base_url()}/{id}", + timeout=timeout, + cast_to=dict, + ) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return Integration(**data) + except Exception: + return None + return None + + async def get_many( + self, + *, + page: Optional[int] = None, + page_size: Optional[int] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[IntegrationsResponse]: + params: Dict[str, Any] = {} + + effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE + effective_page = page if page is not None else DEFAULT_PAGE + + params["page"] = str(effective_page) + 
params["page_size"] = str(effective_page_size) + + resp = await self._get( + self._base_url(), + params=params, + timeout=timeout, + cast_to=dict, + ) + if not resp or not isinstance(resp, dict): + return None + + data = _unwrap(resp) + if not isinstance(data, dict): + return None + + integrations = [i if isinstance(i, Integration) else Integration(**i) for i in data.get("integrations", [])] + count: int = data.get("count", len(integrations)) + total_count: int = data.get("total_count", count) + + try: + return IntegrationsResponse(integrations=integrations, count=count, total_count=total_count) + except Exception: + return None + + async def test( + self, + id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[TestIntegrationResponse]: + resp = await self._post( + f"{self._base_url()}/{id}/test", + body={}, + timeout=timeout, + cast_to=dict, + ) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return TestIntegrationResponse(**data) + except Exception: + return None + return None diff --git a/src/layerlens/resources/models/models.py b/src/layerlens/resources/models/models.py index 7a25377..411afed 100644 --- a/src/layerlens/resources/models/models.py +++ b/src/layerlens/resources/models/models.py @@ -5,6 +5,55 @@ import httpx from ...models import Model, CustomModel, PublicModel, ModelsResponse, CreateModelResponse + + +def _exclude_custom_models( + models: List[Model], + *, + categories: Optional[List[str]] = None, + companies: Optional[List[str]] = None, + regions: Optional[List[str]] = None, + licenses: Optional[List[str]] = None, +) -> List[Model]: + """Exclude custom models when filtering by fields they don't have. + + The API correctly filters public models and custom models by name/key, + but custom models don't have categories/companies/regions/licenses fields, + so they must be excluded from results when those filters are active. 
+ """ + if categories: + cat_set = {c.lower() for c in categories} + + def matches_category(m: Model) -> bool: + if not isinstance(m, PublicModel): + return False + arch = (m.architecture_type or "").lower() + for cat in cat_set: + if cat == "open-source" and m.open_weights: + return True + if cat == "closed-source" and not m.open_weights and arch: + return True + if arch and cat == arch: + return True + return False + + models = [m for m in models if matches_category(m)] + + if companies: + comp_set = {c.lower() for c in companies} + models = [m for m in models if isinstance(m, PublicModel) and m.company and m.company.lower() in comp_set] + + if regions: + reg_set = {r.lower() for r in regions} + models = [m for m in models if isinstance(m, PublicModel) and m.region and m.region.lower() in reg_set] + + if licenses: + lic_set = {l.lower() for l in licenses} + models = [m for m in models if isinstance(m, PublicModel) and m.license and m.license.lower() in lic_set] + + return models + + from ..._resource import SyncAPIResource, AsyncAPIResource from ..._constants import DEFAULT_TIMEOUT @@ -66,8 +115,13 @@ def cast_model(m: Model, model_type: str) -> Model: if resp: models.extend([cast_model(m, type) for m in resp.data.models]) - if name: - models = [m for m in models if name.lower() in m.name.lower()] + models = _exclude_custom_models( + models, + categories=categories, + companies=companies, + regions=regions, + licenses=licenses, + ) return models @@ -252,8 +306,13 @@ def cast_model(m: Model, model_type: str) -> Model: if resp: models.extend([cast_model(m, type) for m in resp.data.models]) - if name: - models = [m for m in models if name.lower() in m.name.lower()] + models = _exclude_custom_models( + models, + categories=categories, + companies=companies, + regions=regions, + licenses=licenses, + ) return models diff --git a/src/layerlens/resources/scorers/__init__.py b/src/layerlens/resources/scorers/__init__.py new file mode 100644 index 0000000..ac715f0 --- 
/dev/null +++ b/src/layerlens/resources/scorers/__init__.py @@ -0,0 +1,3 @@ +from .scorers import Scorers, AsyncScorers + +__all__ = ["Scorers", "AsyncScorers"] diff --git a/src/layerlens/resources/scorers/scorers.py b/src/layerlens/resources/scorers/scorers.py new file mode 100644 index 0000000..4696389 --- /dev/null +++ b/src/layerlens/resources/scorers/scorers.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional + +import httpx + +from ...models import Scorer, ScorersResponse +from ..._resource import SyncAPIResource, AsyncAPIResource +from ..._constants import DEFAULT_TIMEOUT + +DEFAULT_PAGE = 1 +DEFAULT_PAGE_SIZE = 100 +MAX_PAGE_SIZE = 500 + + +def _unwrap(resp: Any) -> Any: + if isinstance(resp, dict) and "data" in resp and "status" in resp: + return resp["data"] + return resp + + +def _pascal_to_snake(key: str) -> str: + """Convert PascalCase key to snake_case.""" + import re + + return re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", key)).lower() + + +def _normalize_keys(d: Dict[str, Any]) -> Dict[str, Any]: + """Normalize a dict's keys from PascalCase to snake_case if needed.""" + if not d or not isinstance(d, dict): + return d + # Check if keys are PascalCase (first key starts with uppercase) + first_key = next(iter(d), "") + if first_key and first_key[0].isupper(): + return {_pascal_to_snake(k): v for k, v in d.items()} + return d + + +class Scorers(SyncAPIResource): + def _base_url(self) -> str: + return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/scorers" + + def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[Scorer]: + resp = self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict) + data = _unwrap(resp) + if isinstance(data, dict): + try: + return Scorer(**data) + except Exception: + return None + return None + + def get_many( + self, + *, + page: Optional[int] = None, + 
def _scorer_page_params(page: Optional[int], page_size: Optional[int]) -> Dict[str, Any]:
    """Build the pagination query parameters shared by the sync and async clients.

    ``page_size`` is clamped to ``[1, MAX_PAGE_SIZE]``; omitted values fall
    back to ``DEFAULT_PAGE`` / ``DEFAULT_PAGE_SIZE``. Values are sent as strings.
    """
    effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE
    effective_page = page if page is not None else DEFAULT_PAGE
    return {"page": str(effective_page), "page_size": str(effective_page_size)}


def _parse_scorer(resp: Any, *, normalize: bool = False) -> Optional[Scorer]:
    """Turn a raw response into a ``Scorer``, or ``None`` if it cannot be built.

    With ``normalize=True`` the payload keys are passed through
    ``_normalize_keys`` first. Only ``create`` asks for this, as before.
    NOTE(review): ``get``/``get_many`` do not normalize; confirm whether those
    endpoints can return PascalCase keys as well.
    """
    data = _unwrap(resp)
    if not isinstance(data, dict):
        return None
    if normalize:
        data = _normalize_keys(data)
    try:
        return Scorer(**data)
    except Exception:
        # Best-effort contract of this resource: an unparseable payload is
        # reported as "no result" rather than raised to the caller.
        return None


def _parse_scorers_page(resp: Any) -> Optional[ScorersResponse]:
    """Turn a raw list response into a ``ScorersResponse``.

    Returns ``None`` when the response is empty, not a dict, or cannot be
    converted. Item construction happens inside the ``try`` so a single
    malformed item yields ``None`` like every other parse failure here,
    instead of raising.
    """
    if not resp or not isinstance(resp, dict):
        return None
    data = _unwrap(resp)
    if not isinstance(data, dict):
        return None
    try:
        # ``or []`` also covers an explicit null for the "scorers" key.
        raw_items = data.get("scorers") or []
        scorers = [Scorer(**s) if isinstance(s, dict) else s for s in raw_items]
        count: int = data.get("count", len(scorers))
        total_count: int = data.get("total_count", count)
        return ScorersResponse(scorers=scorers, count=count, total_count=total_count)
    except Exception:
        return None


def _scorer_update_body(
    name: Optional[str],
    description: Optional[str],
    model_id: Optional[str],
    prompt: Optional[str],
) -> Dict[str, Any]:
    """Collect only the fields the caller actually supplied (not ``None``)."""
    candidates = {"name": name, "description": description, "model_id": model_id, "prompt": prompt}
    return {field: value for field, value in candidates.items() if value is not None}


class Scorers(SyncAPIResource):
    """Synchronous client for the project-scoped ``/scorers`` endpoints."""

    def _base_url(self) -> str:
        return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/scorers"

    def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[Scorer]:
        """Fetch one scorer by id; ``None`` if the payload cannot be parsed."""
        resp = self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
        return _parse_scorer(resp)

    def get_many(
        self,
        *,
        page: Optional[int] = None,
        page_size: Optional[int] = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> Optional[ScorersResponse]:
        """Fetch one page of scorers; ``None`` if the payload cannot be parsed."""
        params = _scorer_page_params(page, page_size)
        resp = self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict)
        return _parse_scorers_page(resp)

    def create(
        self,
        *,
        name: str,
        description: str,
        model_id: str,
        prompt: str,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> Optional[Scorer]:
        """Create a scorer and return it; ``None`` if the response cannot be parsed."""
        body: Dict[str, Any] = {
            "name": name,
            "description": description,
            "model_id": model_id,
            "prompt": prompt,
        }
        resp = self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict)
        return _parse_scorer(resp, normalize=True)

    def update(
        self,
        id: str,
        *,
        name: Optional[str] = None,
        description: Optional[str] = None,
        model_id: Optional[str] = None,
        prompt: Optional[str] = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> bool:
        """PATCH the supplied fields; ``True`` if the request did not raise, else ``False``."""
        body = _scorer_update_body(name, description, model_id, prompt)
        try:
            self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict)
            return True
        except Exception:
            return False

    def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool:
        """DELETE the scorer; ``True`` if the request did not raise, else ``False``."""
        try:
            self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
            return True
        except Exception:
            return False


class AsyncScorers(AsyncAPIResource):
    """Asynchronous counterpart of ``Scorers``; same endpoints and return contract."""

    def _base_url(self) -> str:
        return f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/scorers"

    async def get(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> Optional[Scorer]:
        """Fetch one scorer by id; ``None`` if the payload cannot be parsed."""
        resp = await self._get(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
        return _parse_scorer(resp)

    async def get_many(
        self,
        *,
        page: Optional[int] = None,
        page_size: Optional[int] = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> Optional[ScorersResponse]:
        """Fetch one page of scorers; ``None`` if the payload cannot be parsed."""
        params = _scorer_page_params(page, page_size)
        resp = await self._get(self._base_url(), params=params, timeout=timeout, cast_to=dict)
        return _parse_scorers_page(resp)

    async def create(
        self,
        *,
        name: str,
        description: str,
        model_id: str,
        prompt: str,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> Optional[Scorer]:
        """Create a scorer and return it; ``None`` if the response cannot be parsed."""
        body: Dict[str, Any] = {"name": name, "description": description, "model_id": model_id, "prompt": prompt}
        resp = await self._post(self._base_url(), body=body, timeout=timeout, cast_to=dict)
        return _parse_scorer(resp, normalize=True)

    async def update(
        self,
        id: str,
        *,
        name: Optional[str] = None,
        description: Optional[str] = None,
        model_id: Optional[str] = None,
        prompt: Optional[str] = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> bool:
        """PATCH the supplied fields; ``True`` if the request did not raise, else ``False``."""
        body = _scorer_update_body(name, description, model_id, prompt)
        try:
            await self._patch(f"{self._base_url()}/{id}", body=body, timeout=timeout, cast_to=dict)
            return True
        except Exception:
            return False

    async def delete(self, id: str, *, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT) -> bool:
        """DELETE the scorer; ``True`` if the request did not raise, else ``False``."""
        try:
            await self._delete(f"{self._base_url()}/{id}", timeout=timeout, cast_to=dict)
            return True
        except Exception:
            return False
@pytest.fixture
def runner():
    """Click CLI test runner whose stderr is captured separately from stdout.

    Click 8.2 removed the ``mix_stderr`` argument (stderr is always captured
    separately there), so passing it raises ``TypeError``. Fall back to the
    plain constructor so the suite runs on every Click release the package
    allows (``click>=8.0.0``).
    """
    try:
        return CliRunner(mix_stderr=False)
    except TypeError:
        return CliRunner()


@pytest.fixture
def cli_env():
    """Environment variables for CLI tests."""
    return {"LAYERLENS_STRATIX_API_KEY": "test-key-123"}
class TestIsUuid:
    """``_is_uuid`` recognises canonical UUID strings and rejects everything else."""

    def test_valid_uuid(self):
        candidate = "550e8400-e29b-41d4-a716-446655440000"
        assert _is_uuid(candidate) is True

    def test_valid_uuid_uppercase(self):
        candidate = "550E8400-E29B-41D4-A716-446655440000"
        assert _is_uuid(candidate) is True

    def test_short_id(self):
        candidate = "abc123"
        assert _is_uuid(candidate) is False

    def test_mongo_id(self):
        candidate = "69805c582a3c129a75d168b8"
        assert _is_uuid(candidate) is False

    def test_empty(self):
        candidate = ""
        assert _is_uuid(candidate) is False

    def test_model_key(self):
        candidate = "openai/gpt-4o"
        assert _is_uuid(candidate) is False


class TestResolveModel:
    """``resolve_model`` looks a model up by id, then key, then name."""

    @pytest.fixture
    def client(self):
        return Mock(models=Mock())

    def test_resolve_by_uuid(self, client):
        """A UUID-shaped identifier is looked up through ``get_by_id``."""
        identifier = "550e8400-e29b-41d4-a716-446655440000"
        expected = Mock(id=identifier)
        client.models.get_by_id.return_value = expected

        found = resolve_model(client, identifier)

        assert found is expected
        client.models.get_by_id.assert_called_once()

    def test_resolve_by_key(self, client):
        """A non-UUID identifier is looked up through ``get_by_key``."""
        expected = Mock(id="m-1")
        client.models.get_by_key.return_value = expected

        found = resolve_model(client, "openai/gpt-4o")

        assert found is expected
        client.models.get_by_key.assert_called_once_with("openai/gpt-4o")

    def test_resolve_by_name(self, client):
        """When the key lookup finds nothing, a name search is used."""
        expected = Mock(id="m-1")
        client.models.get_by_key.return_value = None
        client.models.get.return_value = [expected]

        found = resolve_model(client, "GPT-4")

        assert found is expected
        client.models.get.assert_called_once_with(name="GPT-4")

    def test_resolve_not_found(self, client):
        """``None`` comes back when neither lookup finds a model."""
        client.models.get_by_key.return_value = None
        client.models.get.return_value = None

        found = resolve_model(client, "nonexistent")

        assert found is None

    def test_resolve_uuid_fallback_to_key(self, client):
        """A UUID that ``get_by_id`` cannot find is retried through ``get_by_key``."""
        expected = Mock(id="m-1")
        client.models.get_by_id.return_value = None
        client.models.get_by_key.return_value = expected

        found = resolve_model(client, "550e8400-e29b-41d4-a716-446655440000")

        assert found is expected


class TestResolveBenchmark:
    """``resolve_benchmark`` looks a benchmark up by id, key, or name."""

    @pytest.fixture
    def client(self):
        return Mock(benchmarks=Mock())

    def test_resolve_by_key(self, client):
        expected = Mock(id="b-1")
        client.benchmarks.get_by_key.return_value = expected

        found = resolve_benchmark(client, "arc-agi-2")

        assert found is expected

    def test_resolve_not_found(self, client):
        client.benchmarks.get_by_key.return_value = None
        client.benchmarks.get.return_value = None

        found = resolve_benchmark(client, "nonexistent")

        assert found is None
# Environment passed to every CLI invocation in this module.
API_ENV = {"LAYERLENS_STRATIX_API_KEY": "test"}


@pytest.fixture
def runner():
    """Click CLI test runner whose stderr is captured separately from stdout.

    One module-level fixture replaces the identical per-class copies. Click
    8.2 removed the ``mix_stderr`` argument (stderr is always captured
    separately there), so fall back to the plain constructor when it is
    rejected.
    """
    try:
        return CliRunner(mix_stderr=False)
    except TypeError:
        return CliRunner()


class TestTraceCommands:
    """Test trace CLI commands."""

    @pytest.fixture
    def mock_traces(self):
        trace = Mock()
        trace.id = "trace-123"
        trace.created_at = "2026-01-01T00:00:00Z"
        trace.filename = "test.jsonl"
        trace.evaluations_count = 2
        # Make to_dict work
        trace.model_dump.return_value = {
            "id": "trace-123",
            "created_at": "2026-01-01T00:00:00Z",
            "filename": "test.jsonl",
            "evaluations_count": 2,
        }
        return trace

    @patch("layerlens.cli.commands.trace.get_client")
    def test_trace_list(self, mock_get_client, runner, mock_traces):
        """trace list displays traces in table format."""
        client = Mock()
        resp = Mock()
        resp.traces = [mock_traces]
        resp.count = 1
        resp.total_count = 1
        client.traces.get_many.return_value = resp
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["trace", "list"], env=API_ENV)

        assert result.exit_code == 0
        assert "trace-123" in result.output

    @patch("layerlens.cli.commands.trace.get_client")
    def test_trace_list_empty(self, mock_get_client, runner):
        """trace list shows message when no traces found."""
        client = Mock()
        client.traces.get_many.return_value = Mock(traces=[])
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["trace", "list"], env=API_ENV)

        assert result.exit_code == 0
        assert "No traces found" in result.output

    @patch("layerlens.cli.commands.trace.get_client")
    def test_trace_get(self, mock_get_client, runner, mock_traces):
        """trace get displays a single trace."""
        client = Mock()
        client.traces.get.return_value = mock_traces
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["trace", "get", "trace-123"], env=API_ENV)

        assert result.exit_code == 0
        assert "trace-123" in result.output

    @patch("layerlens.cli.commands.trace.get_client")
    def test_trace_get_not_found(self, mock_get_client, runner):
        """trace get exits with error when trace not found."""
        client = Mock()
        client.traces.get.return_value = None
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["trace", "get", "nonexistent"], env=API_ENV)

        assert result.exit_code != 0

    @patch("layerlens.cli.commands.trace.get_client")
    def test_trace_delete_confirms(self, mock_get_client, runner):
        """trace delete prompts for confirmation."""
        client = Mock()
        mock_get_client.return_value = client

        # Only the delete call is checked here, so the result is not bound.
        runner.invoke(cli, ["trace", "delete", "trace-123"], input="y\n", env=API_ENV)

        client.traces.delete.assert_called_once()

    @patch("layerlens.cli.commands.trace.get_client")
    def test_trace_delete_skip_confirm(self, mock_get_client, runner):
        """trace delete --yes skips confirmation."""
        client = Mock()
        client.traces.delete.return_value = True
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["trace", "delete", "trace-123", "--yes"], env=API_ENV)

        assert result.exit_code == 0
        client.traces.delete.assert_called_once()


class TestJudgeCommands:
    """Test judge CLI commands."""

    @patch("layerlens.cli.commands.judge.get_client")
    def test_judge_list(self, mock_get_client, runner):
        """judge list displays judges."""
        judge = Mock()
        judge.model_dump.return_value = {
            "id": "j-1",
            "name": "Quality",
            "version": 1,
            "run_count": 5,
            "created_at": "2026-01-01T00:00:00Z",
        }
        client = Mock()
        resp = Mock()
        resp.judges = [judge]
        resp.count = 1
        resp.total_count = 1
        client.judges.get_many.return_value = resp
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["judge", "list"], env=API_ENV)

        assert result.exit_code == 0
        assert "Quality" in result.output

    @patch("layerlens.cli.commands.judge.get_client")
    def test_judge_create(self, mock_get_client, runner):
        """judge create creates and displays a judge."""
        judge = Mock()
        judge.id = "j-new"
        judge.model_dump.return_value = {"id": "j-new", "name": "Test"}
        client = Mock()
        client.judges.create.return_value = judge
        mock_get_client.return_value = client

        result = runner.invoke(
            cli,
            ["judge", "create", "--name", "Test", "--goal", "Evaluate accuracy and completeness"],
            env=API_ENV,
        )

        assert result.exit_code == 0
        assert "j-new" in result.output

    @patch("layerlens.cli.commands.judge.get_client")
    def test_judge_test(self, mock_get_client, runner):
        """judge test creates a trace evaluation."""
        te = Mock()
        te.id = "te-1"
        te.model_dump.return_value = {"id": "te-1", "trace_id": "t-1", "judge_id": "j-1", "status": "pending"}
        client = Mock()
        client.trace_evaluations.create.return_value = te
        mock_get_client.return_value = client

        result = runner.invoke(
            cli,
            ["judge", "test", "--judge-id", "j-1", "--trace-id", "t-1"],
            env=API_ENV,
        )

        assert result.exit_code == 0
        assert "te-1" in result.output


class TestEvaluateCommands:
    """Test evaluate CLI commands."""

    @patch("layerlens.cli.commands.evaluate.get_client")
    def test_evaluate_list(self, mock_get_client, runner):
        """evaluate list displays evaluations."""
        ev = Mock()
        ev.model_dump.return_value = {
            "id": "ev-1",
            "status": "success",
            "model_name": "GPT-4",
            "benchmark_name": "MATH",
            "accuracy": 0.95,
            "submitted_at": 1700000000,
        }
        client = Mock()
        resp = Mock()
        resp.evaluations = [ev]
        resp.pagination = Mock(page=1, total_pages=1, total_count=1)
        client.evaluations.get_many.return_value = resp
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["evaluate", "list"], env=API_ENV)

        assert result.exit_code == 0
        assert "GPT-4" in result.output


class TestScorerCommands:
    """Test scorer CLI commands."""

    @patch("layerlens.cli.commands.scorer.get_client")
    def test_scorer_list(self, mock_get_client, runner):
        """scorer list displays scorers."""
        scorer = Mock()
        scorer.model_dump.return_value = {
            "id": "s-1",
            "name": "Quality",
            "model_name": "GPT-4",
            "model_company": "OpenAI",
            "created_at": "2026-01-01",
        }
        client = Mock()
        resp = Mock()
        resp.scorers = [scorer]
        resp.count = 1
        resp.total_count = 1
        client.scorers.get_many.return_value = resp
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["scorer", "list"], env=API_ENV)

        assert result.exit_code == 0
        assert "Quality" in result.output

    @patch("layerlens.cli.commands.scorer.get_client")
    def test_scorer_create_dry_run(self, mock_get_client, runner):
        """scorer create --dry-run previews without executing."""
        result = runner.invoke(
            cli,
            [
                "scorer",
                "create",
                "--name",
                "Test",
                "--description",
                "A test scorer for quality",
                "--model-id",
                "m-1",
                "--prompt",
                "Rate quality",
                "--dry-run",
            ],
            env=API_ENV,
        )

        assert result.exit_code == 0
        assert "[dry-run]" in result.output
        mock_get_client.assert_not_called()

    @patch("layerlens.cli.commands.scorer.get_client")
    def test_scorer_delete_yes(self, mock_get_client, runner):
        """scorer delete --yes skips confirmation."""
        client = Mock()
        client.scorers.delete.return_value = True
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["scorer", "delete", "s-1", "--yes"], env=API_ENV)

        assert result.exit_code == 0
        client.scorers.delete.assert_called_once_with("s-1")


class TestSpaceCommands:
    """Test space CLI commands."""

    @patch("layerlens.cli.commands.space.get_client")
    def test_space_create_dry_run(self, mock_get_client, runner):
        """space create --dry-run previews without executing."""
        result = runner.invoke(
            cli,
            ["space", "create", "--name", "Test Space", "--dry-run"],
            env=API_ENV,
        )

        assert result.exit_code == 0
        assert "[dry-run]" in result.output
        mock_get_client.assert_not_called()


class TestBulkCommands:
    """Test bulk CLI commands."""

    @patch("layerlens.cli.commands.bulk.get_client")
    def test_bulk_eval_file_dry_run(self, _mock_get_client, runner, tmp_path):
        """bulk eval --file --dry-run previews jobs."""
        # tmp_path is cleaned up by pytest, matching the sibling traces test.
        jobs_file = tmp_path / "jobs.jsonl"
        jobs_file.write_text(
            '{"model": "gpt-4", "benchmark": "mmlu"}\n' '{"model": "claude", "benchmark": "mmlu"}\n'
        )

        result = runner.invoke(
            cli,
            ["bulk", "eval", "--file", str(jobs_file), "--dry-run"],
            env=API_ENV,
        )

        assert result.exit_code == 0, f"stdout={result.output!r} stderr={result.stderr!r}"
        assert "[dry-run]" in result.output
        assert "2 evaluation(s)" in result.output

    def test_bulk_eval_no_args(self, runner):
        """bulk eval with no arguments shows error."""
        result = runner.invoke(cli, ["bulk", "eval"], env=API_ENV)

        assert result.exit_code != 0

    @patch("layerlens.cli.commands.bulk.get_client")
    def test_bulk_eval_judge_traces_dry_run(self, _mock_get_client, runner, tmp_path):
        """bulk eval --judge-id --traces --dry-run previews trace evaluations."""
        traces_file = tmp_path / "traces.txt"
        traces_file.write_text("t-1\nt-2\nt-3\n")

        result = runner.invoke(
            cli,
            ["bulk", "eval", "--judge-id", "j-1", "--traces", str(traces_file), "--dry-run"],
            env=API_ENV,
        )

        assert result.exit_code == 0
        assert "3 trace evaluation(s)" in result.output


class TestCiCommands:
    """Test ci CLI commands."""

    def test_ci_report_dry_run(self, runner):
        """ci report --dry-run previews."""
        result = runner.invoke(cli, ["ci", "report", "--dry-run"], env=API_ENV)

        assert result.exit_code == 0
        assert "[dry-run]" in result.output

    @patch("layerlens.cli.commands.ci.get_client")
    def test_ci_report_markdown(self, mock_get_client, runner):
        """ci report generates markdown."""
        ev = Mock()
        ev.id = "ev-1"
        ev.status = "success"
        ev.model_name = "GPT-4"
        ev.benchmark_name = "MATH"
        ev.accuracy = 0.95
        ev.model_dump.return_value = {"id": "ev-1", "status": "success"}

        client = Mock()
        resp = Mock()
        resp.evaluations = [ev]
        client.evaluations.get_many.return_value = resp
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["ci", "report"], env=API_ENV)

        assert result.exit_code == 0
        assert "# Stratix Evaluation Report" in result.output
        assert "GPT-4" in result.output

    @patch("layerlens.cli.commands.ci.get_client")
    def test_ci_report_to_file(self, mock_get_client, runner, tmp_path):
        """ci report --output writes to file."""
        ev = Mock()
        ev.id = "ev-1"
        ev.status = "success"
        ev.model_name = "GPT-4"
        ev.benchmark_name = "MATH"
        ev.accuracy = 0.95
        ev.model_dump.return_value = {"id": "ev-1"}

        client = Mock()
        resp = Mock()
        resp.evaluations = [ev]
        client.evaluations.get_many.return_value = resp
        mock_get_client.return_value = client

        out_file = tmp_path / "report.md"
        result = runner.invoke(cli, ["ci", "report", "-o", str(out_file)], env=API_ENV)

        assert result.exit_code == 0
        assert out_file.exists()
        content = out_file.read_text()
        assert "Stratix Evaluation Report" in content


class TestGlobalOptions:
    """Test global CLI options."""

    def test_version(self, runner):
        """--version prints version."""
        result = runner.invoke(cli, ["--version"])
        assert result.exit_code == 0
        assert "layerlens" in result.output

    def test_help(self, runner):
        """--help shows all command groups."""
        result = runner.invoke(cli, ["--help"])
        assert result.exit_code == 0
        for cmd in ["trace", "judge", "evaluate", "integration", "scorer", "space", "bulk", "ci"]:
            assert cmd in result.output

    @patch("layerlens.cli.commands.trace.get_client")
    def test_json_format(self, mock_get_client, runner):
        """--format json outputs JSON."""
        trace = Mock()
        trace.model_dump.return_value = {"id": "t-1", "filename": "test.json"}
        client = Mock()
        client.traces.get.return_value = trace
        mock_get_client.return_value = client

        result = runner.invoke(cli, ["--format", "json", "trace", "get", "t-1"], env=API_ENV)

        assert result.exit_code == 0
        import json

        # Parse stdout only: on Click >= 8.2 ``result.output`` also contains
        # anything written to stderr, which would not be valid JSON.
        parsed = json.loads(result.stdout)
        assert parsed["id"] == "t-1"

    def test_quiet_flag(self, runner):
        """--quiet suppresses banner."""
        result = runner.invoke(cli, ["-q", "--help"])
        assert result.exit_code == 0
        # Banner goes to stderr; with -q it should be empty
        assert "STRATIX" not in result.stderr
class SampleModel(BaseModel):
    id: str
    name: str
    score: float = 0.0


class TestToDict:
    """``to_dict`` converts models and passes every other input through."""

    def test_pydantic_v2_model(self):
        """A model exposing ``model_dump`` becomes a plain dict."""
        model = SampleModel(id="1", name="test", score=0.5)
        expected = {"id": "1", "name": "test", "score": 0.5}
        assert to_dict(model) == expected

    def test_dict_passthrough(self):
        """A dict comes back as the very same object."""
        original = {"key": "value"}
        converted = to_dict(original)
        assert converted is original

    def test_other_type_passthrough(self):
        """Values that are neither model nor dict come back unchanged."""
        for value in ("hello", 42):
            assert to_dict(value) == value
class TestFormatValue:
    """``_format_value`` turns a cell value into its display string."""

    def test_none(self):
        shown = _format_value(None)
        assert shown == "-"

    def test_bool_true(self):
        shown = _format_value(True)
        assert shown == "Yes"

    def test_bool_false(self):
        shown = _format_value(False)
        assert shown == "No"

    def test_float(self):
        shown = _format_value(3.14159)
        assert shown == "3.1416"

    def test_dict(self):
        payload = {"a": 1}
        assert json.loads(_format_value(payload)) == {"a": 1}

    def test_list(self):
        payload = [1, 2]
        assert json.loads(_format_value(payload)) == [1, 2]

    def test_string(self):
        shown = _format_value("hello")
        assert shown == "hello"

    def test_int(self):
        shown = _format_value(42)
        assert shown == "42"


class TestTruncate:
    """``_truncate`` shortens text to a width, ending cut text with an ellipsis."""

    def test_short_string(self):
        shortened = _truncate("abc", 10)
        assert shortened == "abc"

    def test_exact_width(self):
        shortened = _truncate("abcde", 5)
        assert shortened == "abcde"

    def test_long_string(self):
        shortened = _truncate("abcdefgh", 5)
        assert len(shortened) == 5
        assert shortened.endswith("\u2026")

    def test_single_char_width(self):
        shortened = _truncate("abcdef", 1)
        assert shortened == "\u2026"


class TestFormatTable:
    """``format_table`` renders rows under a header and separator line."""

    @pytest.fixture
    def columns(self):
        return [("id", "ID"), ("name", "Name")]

    def test_empty_list(self, columns):
        rendered = format_table([], columns)
        assert rendered == "No results found."

    def test_single_row(self, columns):
        rows = [{"id": "1", "name": "Alice"}]
        rendered_lines = format_table(rows, columns).split("\n")
        assert len(rendered_lines) == 3  # header, separator, row
        header_line = rendered_lines[0]
        data_line = rendered_lines[2]
        assert "ID" in header_line
        assert "Name" in header_line
        assert "1" in data_line
        assert "Alice" in data_line

    def test_multiple_rows(self, columns):
        rows = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
        rendered_lines = format_table(rows, columns).split("\n")
        assert len(rendered_lines) == 4  # header, separator, 2 rows

    def test_pydantic_models(self, columns):
        rows = [SampleModel(id="1", name="Test")]
        rendered = format_table(rows, columns)
        assert "Test" in rendered

    def test_column_width_adapts(self):
        longest = "a much longer value here"
        rows = [{"val": "short"}, {"val": longest}]
        rendered_lines = format_table(rows, [("val", "V")]).split("\n")
        # Header should be at least as wide as longest value
        assert len(rendered_lines[0]) >= len(longest)

    def test_truncation_at_max_width(self):
        rows = [{"val": "x" * 100}]
        rendered = format_table(rows, [("val", "V")], max_col_width=20)
        data_line = rendered.split("\n")[2]
        assert len(data_line.strip()) <= 20
class TestFormatOutput:
    """``format_output`` dispatches on the requested format name."""

    @pytest.fixture
    def columns(self):
        return [("id", "ID"), ("name", "Name")]

    def test_json_format_list(self, columns):
        rows = [{"id": "1", "name": "A"}]
        decoded = json.loads(format_output(rows, "json", columns))
        assert isinstance(decoded, list)
        assert decoded[0]["id"] == "1"

    def test_json_format_single(self):
        record = {"id": "1", "name": "A"}
        decoded = json.loads(format_output(record, "json"))
        assert decoded["id"] == "1"

    def test_table_format_list(self, columns):
        rows = [{"id": "1", "name": "A"}]
        rendered = format_output(rows, "table", columns)
        assert "ID" in rendered
        assert "Name" in rendered

    def test_table_format_single(self):
        record = {"id": "1", "name": "Test"}
        rendered = format_output(record, "table")
        assert "Test" in rendered
class TestFormatSingle:
    """Checks the key-value rendering produced by format_single."""

    def test_dict_input(self):
        """Title-cased keys and stringified values of a dict appear in the output."""
        rendered = format_single({"name": "Alice", "age": 30})
        for fragment in ("Name", "Alice", "Age", "30"):
            assert fragment in rendered

    def test_pydantic_model(self):
        """A SampleModel instance is rendered with its field label and value."""
        model = SampleModel(id="1", name="Test", score=0.5)
        rendered = format_single(model)
        for fragment in ("Id", "1"):
            assert fragment in rendered

    def test_non_dict(self):
        """A plain string input comes back as the same string."""
        text = format_single("just a string")
        assert text == "just a string"
def _mock_responses(self, resource, custom_list, public_list):
    """Install a ``_get`` side effect that serves custom or public benchmarks.

    The canned response is chosen from the ``type`` entry of the ``params``
    keyword argument: ``"custom"`` yields the custom list, anything else
    (including a missing ``params``) yields the public list.
    """
    custom_resp = BenchmarksResponse(data=BenchmarksResponse.Data(benchmarks=custom_list))
    public_resp = BenchmarksResponse(data=BenchmarksResponse.Data(benchmarks=public_list))

    def fake_get(*_args, **kwargs):
        # Positional arguments (the URL) are ignored; only params["type"] matters.
        requested_type = kwargs.get("params", {}).get("type")
        if requested_type == "custom":
            return custom_resp
        return public_resp

    resource._get.side_effect = fake_get
def test_filter_by_key_sends_param_to_api(
    self,
    benchmarks_resource,
    public_reasoning,
):
    """Filtering by key sends the key param on every recorded API call."""
    self._mock_responses(benchmarks_resource, [], [public_reasoning])

    benchmarks_resource.get(key="mmlu")

    recorded_calls = benchmarks_resource._get.call_args_list
    # Guard against a vacuous pass: with zero recorded calls the loop below
    # would not execute and the test would succeed without checking anything.
    assert recorded_calls, "expected get(key=...) to issue at least one API call"

    # Verify key param was sent in every API call
    for c in recorded_calls:
        assert c.kwargs["params"]["key"] == "mmlu"
class TestEvaluationSpaces:
    """Covers the EvaluationSpaces resource: URL building, reads, create and delete."""

    @pytest.fixture
    def mock_client(self):
        """Mock client with org/project ids and a Mock for each *_cast method."""
        stub = Mock()
        stub.organization_id = "org-123"
        stub.project_id = "proj-456"
        for method_name in ("get_cast", "post_cast", "put_cast", "delete_cast"):
            setattr(stub, method_name, Mock())
        return stub

    @pytest.fixture
    def spaces_resource(self, mock_client):
        """EvaluationSpaces resource built on the mock client."""
        resource = EvaluationSpaces(mock_client)
        return resource

    @pytest.fixture
    def sample_space_data(self):
        """Raw payload describing one evaluation space."""
        payload = {
            "id": "sp-123",
            "organization_id": "org-123",
            "project_id": "proj-456",
            "name": "Q1 Comparison",
            "description": "Compare models for Q1",
            "visibility": "private",
            "owner": "admin@test.com",
            "created_at": "2026-01-01T00:00:00Z",
        }
        return payload

    def test_base_url(self, spaces_resource):
        """The base URL contains both the organization and the project id."""
        url = spaces_resource._base_url()
        assert url == "/organizations/org-123/projects/proj-456/evaluation-spaces"

    def test_get_success(self, spaces_resource, sample_space_data):
        """get parses an enveloped payload into an EvaluationSpace."""
        envelope = {"status": "success", "data": sample_space_data}
        spaces_resource._get.return_value = envelope

        space = spaces_resource.get("sp-123")

        assert isinstance(space, EvaluationSpace)
        assert space.name == "Q1 Comparison"

    def test_get_not_found(self, spaces_resource):
        """get yields None when the underlying request returns None."""
        spaces_resource._get.return_value = None

        space = spaces_resource.get("nonexistent")

        assert space is None

    def test_get_many_success(self, spaces_resource, sample_space_data):
        """get_many exposes the listed spaces on the response object."""
        listing = {"evaluation_spaces": [sample_space_data], "count": 1, "total_count": 1}
        spaces_resource._get.return_value = {"status": "success", "data": listing}

        response = spaces_resource.get_many()

        assert response is not None
        assert len(response.evaluation_spaces) == 1
        assert response.evaluation_spaces[0].name == "Q1 Comparison"

    def test_get_many_pagination(self, spaces_resource):
        """get_many forwards pagination and sort arguments as string params."""
        listing = {"evaluation_spaces": [], "count": 0, "total_count": 0}
        spaces_resource._get.return_value = {"status": "success", "data": listing}

        spaces_resource.get_many(page=2, page_size=10, sort_by="created_at", order="desc")

        # call_args unpacks to (positional args, keyword args).
        _, request_kwargs = spaces_resource._get.call_args
        sent_params = request_kwargs["params"]
        expected = {
            "page": "2",
            "page_size": "10",
            "sort_by": "created_at",
            "order": "desc",
        }
        for param_name, param_value in expected.items():
            assert sent_params[param_name] == param_value

    def test_create_success(self, spaces_resource, sample_space_data):
        """create returns a space carrying the name from the response payload."""
        envelope = {"status": "success", "data": sample_space_data}
        spaces_resource._post.return_value = envelope

        created = spaces_resource.create(name="Q1 Comparison", description="Compare models for Q1")

        assert created is not None
        assert created.name == "Q1 Comparison"

    def test_create_request_body(self, spaces_resource):
        """create puts name, description and visibility into the request body."""
        spaces_resource._post.return_value = {"status": "success", "data": {"name": "Test"}}

        spaces_resource.create(name="Test", description="Desc", visibility="public")

        _, request_kwargs = spaces_resource._post.call_args
        sent_body = request_kwargs["body"]
        assert sent_body["name"] == "Test"
        assert sent_body["description"] == "Desc"
        assert sent_body["visibility"] == "public"

    def test_delete_success(self, spaces_resource):
        """delete reports True when the request completes without raising."""
        spaces_resource._delete.return_value = {}

        deleted = spaces_resource.delete("sp-123")

        assert deleted is True

    def test_delete_failure(self, spaces_resource):
        """delete reports False when the request raises an exception."""
        spaces_resource._delete.side_effect = Exception("error")

        deleted = spaces_resource.delete("sp-123")

        assert deleted is False
def test_test_integration_success(self, integrations_resource):
    """test() exposes success and message from a successful check payload."""
    check_outcome = {"success": True, "message": "Connection OK"}
    integrations_resource._post.return_value = {"status": "success", "data": check_outcome}

    response = integrations_resource.test("int-123")

    assert response is not None
    assert response.success is True
    assert response.message == "Connection OK"
def test_filter_by_companies_excludes_custom(
    self,
    models_resource,
    custom_model,
    public_openai,
    public_meta,
):
    """A company filter keeps only the matching public model and drops the custom one."""
    available_public = [public_openai, public_meta]
    self._mock_responses(models_resource, [custom_model], available_public)

    matches = models_resource.get(companies=["OpenAI"])

    assert len(matches) == 1
    sole_match = matches[0]
    assert sole_match.key == "gpt-4"
def test_filter_by_key_sends_param_to_api(
    self,
    models_resource,
    public_openai,
):
    """Filtering by key sends the key param on every recorded API call."""
    self._mock_responses(models_resource, [], [public_openai])

    models_resource.get(key="gpt")

    recorded_calls = models_resource._get.call_args_list
    # Guard against a vacuous pass: with zero recorded calls the loop below
    # would not execute and the test would succeed without checking anything.
    assert recorded_calls, "expected get(key=...) to issue at least one API call"

    for c in recorded_calls:
        assert c.kwargs["params"]["key"] == "gpt"
class TestPascalToSnake:
    """Exercises _pascal_to_snake on PascalCase, acronym and snake_case inputs."""

    def test_simple(self):
        """A single capitalised word is lower-cased."""
        converted = _pascal_to_snake("Name")
        assert converted == "name"

    def test_two_words(self):
        """Two PascalCase words are joined with an underscore."""
        converted = _pascal_to_snake("ModelId")
        assert converted == "model_id"

    def test_three_words(self):
        """A longer compound name is split at the capital (input has two words)."""
        converted = _pascal_to_snake("ModelCompany")
        assert converted == "model_company"

    def test_consecutive_caps(self):
        """A trailing acronym ("ID") stays one segment instead of splitting per letter."""
        converted = _pascal_to_snake("ModelID")
        assert converted == "model_id"

    def test_already_snake(self):
        """Input that is already snake_case is returned unchanged."""
        converted = _pascal_to_snake("model_id")
        assert converted == "model_id"

    def test_single_char(self):
        """A single upper-case letter is lower-cased."""
        converted = _pascal_to_snake("A")
        assert converted == "a"
class TestScorers:
    """Covers the Scorers resource: URL building, reads, create, update and delete."""

    @pytest.fixture
    def mock_client(self):
        """Mock client with org/project ids and a Mock for each *_cast method."""
        stub = Mock()
        stub.organization_id = "org-123"
        stub.project_id = "proj-456"
        for method_name in ("get_cast", "post_cast", "patch_cast", "delete_cast"):
            setattr(stub, method_name, Mock())
        return stub

    @pytest.fixture
    def scorers_resource(self, mock_client):
        """Scorers resource built on the mock client."""
        resource = Scorers(mock_client)
        return resource

    @pytest.fixture
    def sample_scorer_data(self):
        """Raw snake_case payload describing one scorer."""
        timestamp = "2026-01-01T00:00:00Z"
        payload = {
            "id": "s-123",
            "organization_id": "org-123",
            "project_id": "proj-456",
            "name": "Quality Scorer",
            "description": "Rates quality",
            "model_id": "m-1",
            "model_name": "GPT-4",
            "model_key": "openai/gpt-4",
            "model_company": "OpenAI",
            "prompt": "Rate quality",
            "created_at": timestamp,
            "updated_at": timestamp,
        }
        return payload

    def test_base_url(self, scorers_resource):
        """The base URL contains both the organization and the project id."""
        url = scorers_resource._base_url()
        assert url == "/organizations/org-123/projects/proj-456/scorers"

    def test_get_success(self, scorers_resource, sample_scorer_data):
        """get parses a bare (non-enveloped) payload into a Scorer."""
        scorers_resource._get.return_value = sample_scorer_data

        scorer = scorers_resource.get("s-123")

        assert isinstance(scorer, Scorer)
        assert scorer.name == "Quality Scorer"

    def test_get_with_envelope(self, scorers_resource, sample_scorer_data):
        """get also accepts the payload wrapped in a {status, data} envelope."""
        envelope = {"status": "success", "data": sample_scorer_data}
        scorers_resource._get.return_value = envelope

        scorer = scorers_resource.get("s-123")

        assert isinstance(scorer, Scorer)
        assert scorer.id == "s-123"

    def test_get_not_found(self, scorers_resource):
        """get yields None when the underlying request returns None."""
        scorers_resource._get.return_value = None

        scorer = scorers_resource.get("nonexistent")

        assert scorer is None

    def test_get_many_success(self, scorers_resource, sample_scorer_data):
        """get_many exposes the listed scorers on the response object."""
        listing = {"scorers": [sample_scorer_data], "count": 1, "total_count": 1}
        scorers_resource._get.return_value = {"status": "success", "data": listing}

        response = scorers_resource.get_many()

        assert response is not None
        assert len(response.scorers) == 1
        assert response.scorers[0].name == "Quality Scorer"

    def test_get_many_empty(self, scorers_resource):
        """get_many returns a response with an empty scorer list when none exist."""
        listing = {"scorers": [], "count": 0, "total_count": 0}
        scorers_resource._get.return_value = {"status": "success", "data": listing}

        response = scorers_resource.get_many()

        assert response is not None
        assert len(response.scorers) == 0

    def test_create_with_pascal_response(self, scorers_resource):
        """create copes with a response whose data keys are PascalCase."""
        pascal_payload = {
            "Name": "New Scorer",
            "Description": "Desc",
            "ModelID": "m-1",
            "ModelName": "GPT-4",
            "ModelCompany": "OpenAI",
            "ModelKey": "openai/gpt-4",
            "Prompt": "Rate it",
            "CreatedAt": "2026-01-01",
            "UpdatedAt": "2026-01-01",
        }
        scorers_resource._post.return_value = {"status": "success", "data": pascal_payload}

        created = scorers_resource.create(
            name="New Scorer",
            description="Desc",
            model_id="m-1",
            prompt="Rate it",
        )

        assert created is not None
        assert created.name == "New Scorer"
        assert created.model_name == "GPT-4"

    def test_create_request_parameters(self, scorers_resource):
        """create posts the four fields to the scorers URL with the default timeout."""
        scorers_resource._post.return_value = {"status": "success", "data": {"Name": "X", "Prompt": "Y"}}

        scorers_resource.create(name="X", description="D", model_id="m-1", prompt="Y")

        expected_kwargs = {
            "body": {"name": "X", "description": "D", "model_id": "m-1", "prompt": "Y"},
            "timeout": DEFAULT_TIMEOUT,
            "cast_to": dict,
        }
        scorers_resource._post.assert_called_once_with(
            "/organizations/org-123/projects/proj-456/scorers",
            **expected_kwargs,
        )

    def test_delete_success(self, scorers_resource):
        """delete reports True when the request completes without raising."""
        scorers_resource._delete.return_value = {}

        deleted = scorers_resource.delete("s-123")

        assert deleted is True

    def test_delete_failure(self, scorers_resource):
        """delete reports False when the request raises an exception."""
        scorers_resource._delete.side_effect = Exception("not found")

        deleted = scorers_resource.delete("s-123")

        assert deleted is False

    def test_update_sends_patch(self, scorers_resource):
        """update issues exactly one PATCH whose body holds only the supplied field."""
        scorers_resource._patch.return_value = {}

        updated = scorers_resource.update("s-123", name="Updated")

        assert updated is True
        scorers_resource._patch.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, request_kwargs = scorers_resource._patch.call_args
        assert request_kwargs["body"] == {"name": "Updated"}