From 2f30eec83ed5e470758b8c85e170737e1a8a0bc5 Mon Sep 17 00:00:00 2001 From: Marin Peko <26385728+m-peko@users.noreply.github.com> Date: Fri, 27 Feb 2026 07:56:30 +0100 Subject: [PATCH 1/3] PublicClient, model & benchmarks management (#50) * Add public resources to SDK * Add model and benchmark managing functionality to the SDK * Bump version * Return evaluation summary * Fetch public evaluations --- docs/SUMMARY.md | 1 + docs/api-reference/client.md | 19 + docs/api-reference/evaluations.md | 87 ++- docs/api-reference/models-benchmarks.md | 220 +++++++- docs/api-reference/public-client.md | 419 ++++++++++++++ examples/compare_evaluations.py | 92 +++ examples/create_custom_benchmark.py | 50 ++ examples/create_custom_model.py | 41 ++ examples/create_smart_benchmark.py | 51 ++ examples/evaluation_sorting.py | 92 +++ examples/get_evaluation.py | 2 +- examples/manage_project_models_benchmarks.py | 41 ++ examples/public_benchmarks.py | 60 ++ examples/public_evaluations.py | 66 +++ examples/public_models.py | 49 ++ pyproject.toml | 2 +- src/layerlens/__init__.py | 19 + src/layerlens/_client.py | 13 + src/layerlens/_public_client.py | 219 +++++++ src/layerlens/_resource.py | 23 + src/layerlens/_version.py | 2 +- src/layerlens/models/__init__.py | 46 +- src/layerlens/models/api.py | 12 + src/layerlens/models/evaluation.py | 73 ++- src/layerlens/models/public.py | 94 +++ .../resources/benchmarks/benchmarks.py | 375 +++++++++++- .../resources/comparisons/__init__.py | 3 + .../resources/comparisons/comparisons.py | 89 +++ .../resources/evaluations/evaluations.py | 49 +- src/layerlens/resources/models/models.py | 172 +++++- .../resources/public_benchmarks/__init__.py | 3 + .../public_benchmarks/public_benchmarks.py | 263 +++++++++ .../resources/public_evaluations/__init__.py | 3 + .../public_evaluations/public_evaluations.py | 218 +++++++ .../resources/public_models/__init__.py | 3 + .../resources/public_models/public_models.py | 139 +++++ tests/resources/test_benchmarks.py | 
533 +++++++++++++++++- tests/resources/test_evaluations.py | 426 ++++++++++++++ tests/resources/test_models_resource.py | 290 +++++++++- 39 files changed, 4327 insertions(+), 32 deletions(-) create mode 100644 docs/api-reference/public-client.md create mode 100644 examples/compare_evaluations.py create mode 100644 examples/create_custom_benchmark.py create mode 100644 examples/create_custom_model.py create mode 100644 examples/create_smart_benchmark.py create mode 100644 examples/evaluation_sorting.py create mode 100644 examples/manage_project_models_benchmarks.py create mode 100644 examples/public_benchmarks.py create mode 100644 examples/public_evaluations.py create mode 100644 examples/public_models.py create mode 100644 src/layerlens/_public_client.py create mode 100644 src/layerlens/models/public.py create mode 100644 src/layerlens/resources/comparisons/__init__.py create mode 100644 src/layerlens/resources/comparisons/comparisons.py create mode 100644 src/layerlens/resources/public_benchmarks/__init__.py create mode 100644 src/layerlens/resources/public_benchmarks/public_benchmarks.py create mode 100644 src/layerlens/resources/public_evaluations/__init__.py create mode 100644 src/layerlens/resources/public_evaluations/public_evaluations.py create mode 100644 src/layerlens/resources/public_models/__init__.py create mode 100644 src/layerlens/resources/public_models/public_models.py diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 3002655..a89a4b7 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -9,6 +9,7 @@ ## API Reference * [Client Configuration](api-reference/client.md) +* [Public Client](api-reference/public-client.md) * [Evaluations](api-reference/evaluations.md) * [Results](api-reference/results.md) * [Models & Benchmarks](api-reference/models-benchmarks.md) diff --git a/docs/api-reference/client.md b/docs/api-reference/client.md index 97f13bc..a7e534e 100644 --- a/docs/api-reference/client.md +++ b/docs/api-reference/client.md @@ -51,6 +51,25 @@ 
The client automatically loads configuration from these environment variables: LAYERLENS_STRATIX_API_KEY="your_api_key_here" ``` +## Public Client + +For accessing public endpoints (models, benchmarks, evaluations, comparisons), use `PublicClient` or `AsyncPublicClient`. See the [Public Client](public-client.md) reference for full details. + +```python +from layerlens import PublicClient + +# Loads API key from the "LAYERLENS_STRATIX_API_KEY" environment variable +public = PublicClient() +models = public.models.get(companies=["OpenAI"]) +``` + +You can also access public endpoints from an authenticated client via the `.public` property: + +```python +client = Stratix() +public_models = client.public.models.get(query="claude") +``` + ## Timeout Configuration ### Simple Timeout diff --git a/docs/api-reference/evaluations.md b/docs/api-reference/evaluations.md index d2b0972..a9039e0 100644 --- a/docs/api-reference/evaluations.md +++ b/docs/api-reference/evaluations.md @@ -177,17 +177,22 @@ async def get_evaluation(): asyncio.run(get_evaluation()) ``` -### `get_many(page=None, page_size=None, timeout=None)` +### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, timeout=None)` -Retrieves multiple evaluations with optional pagination support. +Retrieves multiple evaluations with optional pagination, sorting, and filtering. 
#### Parameters -| Parameter | Type | Required | Description | -| ----------- | -------------------------------- | -------- | ------------------------------------------------------- | -| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | -| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | -| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | +| Parameter | Type | Required | Description | +| --------------- | -------------------------------- | -------- | ------------------------------------------------------- | +| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | +| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | +| `sort_by` | `str \| None` | No | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` | +| `order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `model_ids` | `List[str] \| None` | No | Filter by model IDs | +| `benchmark_ids` | `List[str] \| None` | No | Filter by benchmark/dataset IDs | +| `status` | `EvaluationStatus \| None` | No | Filter by evaluation status | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | #### Returns @@ -198,6 +203,27 @@ Returns an `EvaluationsResponse` object containing: Returns `None` if the request fails. +#### Example + +```python +from layerlens import Stratix +from layerlens.models import EvaluationStatus + +client = Stratix() + +# Get top evaluations by accuracy +response = client.evaluations.get_many( + sort_by="accuracy", + order="desc", + status=EvaluationStatus.SUCCESS, + page_size=10, +) + +if response: + for evaluation in response.evaluations: + print(f"{evaluation.id}: accuracy={evaluation.accuracy:.2f}%") +``` + ### `get_results(page=None, page_size=None, timeout=None)` Fetches results for this evaluation with pagination support. This is a synchronous method. 
@@ -378,16 +404,43 @@ The `create`, `get_by_id` and `get_many` method returns an `Evaluation` objects ### Evaluation Object Properties -| Property | Type | Description | -| ------------------ | ------------------ | --------------------------------------------------------- | -| `id` | `str` | Unique evaluation identifier | -| `status` | `EvaluationStatus` | Current evaluation status (enum) | -| `submitted_at` | `int` | Unix timestamp when evaluation was submitted | -| `finished_at` | `int` | Unix timestamp when evaluation finished | -| `model_id` | `str` | ID of the model used in the evaluation | -| `benchmark_id` | `str` | ID of the benchmark used (aliased as "dataset_id" in API) | -| `average_duration` | `int` | Average response time in milliseconds | -| `accuracy` | `float` | Overall accuracy score (0.0 to 1.0) | +| Property | Type | Description | +| -------------------- | ----------------------------- | --------------------------------------------------------- | +| `id` | `str` | Unique evaluation identifier | +| `status` | `EvaluationStatus` | Current evaluation status (enum) | +| `status_description` | `str` | Human-readable status description (default: `""`) | +| `submitted_at` | `int` | Unix timestamp when evaluation was submitted | +| `finished_at` | `int` | Unix timestamp when evaluation finished | +| `model_id` | `str` | ID of the model used in the evaluation | +| `model_name` | `str` | Name of the model (default: `""`) | +| `model_key` | `str` | Key identifier of the model (default: `""`) | +| `model_company` | `str` | Company/provider of the model (default: `""`) | +| `benchmark_id` | `str` | ID of the benchmark used (aliased as "dataset_id" in API) | +| `benchmark_name` | `str` | Name of the benchmark (aliased as "dataset_name" in API, default: `""`) | +| `average_duration` | `int` | Average response time in milliseconds | +| `accuracy` | `float` | Overall accuracy score (0.0 to 1.0) | +| `readability_score` | `float` | Readability score (default: 
`0.0`) | +| `toxicity_score` | `float` | Toxicity score (default: `0.0`) | +| `ethics_score` | `float` | Ethics score (default: `0.0`) | +| `failed_prompt_count`| `int` | Number of failed prompts (default: `0`) | +| `queue_id` | `int` | Queue identifier (default: `0`) | +| `summary` | `EvaluationSummary \| None` | Rich evaluation summary (see below, default: `None`) | + +### EvaluationSummary Object + +The `summary` field contains a rich analysis of the evaluation when available. + +| Property | Type | Description | +| --------------------- | ------------------------------- | ---------------------------------------- | +| `name` | `str` | Summary title | +| `goal` | `str` | Goal of the evaluation | +| `metrics` | `List[EvaluationMetric]` | Metrics used (each has `name`, `description`) | +| `task_types` | `List[EvaluationTaskType]` | Task types (each has `name`, `description`) | +| `dataset` | `EvaluationDataset \| None` | Dataset info (`total_size`, `training_size`, `test_size`, `characteristics`) | +| `model` | `EvaluationModelInfo \| None` | Model info (`model_name`, `performance`) | +| `performance_details` | `PerformanceDetails \| None` | Strengths and challenges lists | +| `error_analysis` | `ErrorAnalysis \| None` | Common failure modes and example | +| `analysis_summary` | `AnalysisSummary \| None` | Key takeaways list | #### Evaluation Status diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md index 1681b75..935cbbc 100644 --- a/docs/api-reference/models-benchmarks.md +++ b/docs/api-reference/models-benchmarks.md @@ -95,6 +95,87 @@ Retrieves a specific model by its unique key. Both the `Stratix` and `AsyncStrat Returns an `Optional[Model]` - a single `Model` object if found, or `None` if the model doesn't exist or there's an error. +### `add(*model_ids, timeout=None)` + +Adds public models to the project by their IDs. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| ------------ | -------------------------------- | -------- | ------------------------------ | +| `*model_ids` | `str` | Yes | One or more model IDs to add | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `bool` - `True` if the operation succeeded, `False` otherwise. + +#### Example + +```python +client = Stratix() +success = client.models.add("model-id-1", "model-id-2") +``` + +### `remove(*model_ids, timeout=None)` + +Removes models from the project by their IDs. + +#### Parameters + +| Parameter | Type | Required | Description | +| ------------ | -------------------------------- | -------- | --------------------------------- | +| `*model_ids` | `str` | Yes | One or more model IDs to remove | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `bool` - `True` if the operation succeeded, `False` otherwise. + +### `create_custom(name, key, description, api_url, max_tokens, api_key=None, timeout=None)` + +Creates a custom model backed by an OpenAI-compatible API endpoint. This allows you to evaluate any model accessible via a chat completions endpoint. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| ------------- | -------------------------------- | -------- | --------------------------------------------------------------------------------- | +| `name` | `str` | Yes | Model name (max 256 characters) | +| `key` | `str` | Yes | Unique model key, lowercase alphanumeric with dots/hyphens/slashes (max 256 chars)| +| `description` | `str` | Yes | Model description (max 500 characters) | +| `api_url` | `str` | Yes | Base URL of the OpenAI-compatible API endpoint | +| `max_tokens` | `int` | Yes | Maximum number of tokens the model supports | +| `api_key` | `str \| None` | No | API key for the model provider | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns an `Optional[CreateModelResponse]` containing: + +- `organization_id`: Organization identifier +- `project_id`: Project identifier +- `model_id`: The newly created model's identifier + +Returns `None` if the request fails. + +#### Example + +```python +client = Stratix() + +result = client.models.create_custom( + name="My Custom Model", + key="my-org/custom-model-v1", + description="Custom fine-tuned model served via vLLM", + api_url="https://my-model-endpoint.example.com/v1", + api_key="my-provider-api-key", + max_tokens=4096, +) + +if result: + print(f"Created model: {result.model_id}") +``` + ## Benchmarks ### `get(type=None, name=None, timeout=None)` @@ -111,8 +192,6 @@ Retrieves a list of available benchmarks with optional filtering parameters. Bot #### Returns -Returns a `List[Benchmark]` containing available benchmarks that match the filter criteria. Returns `None` if no benchmarks are found or if there's an error. - Returns `Optional[List[Benchmark]]` - a list of `Benchmark` objects that match the filter criteria. Returns an empty list `[]` if no benchmarks match the criteria, or `None` if there's an error. 
#### Benchmark Object Properties @@ -154,3 +233,140 @@ Retrieves a specific benchmark by its unique key. Both the `Stratix` and `AsyncS #### Returns Returns an `Optional[Benchmark]` - a single `Benchmark` object if found, or `None` if the benchmark doesn't exist or there's an error. + +### `add(*benchmark_ids, timeout=None)` + +Adds benchmarks to the project by their IDs. + +#### Parameters + +| Parameter | Type | Required | Description | +| ---------------- | -------------------------------- | -------- | ---------------------------------- | +| `*benchmark_ids` | `str` | Yes | One or more benchmark IDs to add | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `bool` - `True` if the operation succeeded, `False` otherwise. + +#### Example + +```python +client = Stratix() +success = client.benchmarks.add("benchmark-id-1", "benchmark-id-2") +``` + +### `remove(*benchmark_ids, timeout=None)` + +Removes benchmarks from the project by their IDs. + +#### Parameters + +| Parameter | Type | Required | Description | +| ---------------- | -------------------------------- | -------- | ------------------------------------- | +| `*benchmark_ids` | `str` | Yes | One or more benchmark IDs to remove | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `bool` - `True` if the operation succeeded, `False` otherwise. + +### `create_custom(name, description, file_path, additional_metrics=None, custom_scorer_ids=None, input_type=None, timeout=None)` + +Creates a custom benchmark by uploading a JSONL file. The file should contain one JSON object per line with `input` and `truth` fields. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| -------------------- | -------------------------------- | -------- | -------------------------------------------------------------------- | +| `name` | `str` | Yes | Benchmark name (max 64 characters) | +| `description` | `str` | Yes | Benchmark description (max 280 characters) | +| `file_path` | `str` | Yes | Path to a JSONL file with benchmark prompts | +| `additional_metrics` | `List[str] \| None` | No | Additional metrics: `readability`, `toxicity`, `hallucination` | +| `custom_scorer_ids` | `List[str] \| None` | No | List of custom scorer IDs to use | +| `input_type` | `str \| None` | No | Input type: `messages` or `json_payload` | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### JSONL File Format + +Each line should be a JSON object: + +```json +{"input": "What is 2+2?", "truth": "4"} +{"input": "Capital of France?", "truth": "Paris"} +``` + +Optional fields: `subset` (for grouping prompts into categories). + +#### Returns + +Returns an `Optional[CreateBenchmarkResponse]` containing: + +- `organization_id`: Organization identifier +- `project_id`: Project identifier +- `benchmark_id`: The newly created benchmark's identifier + +Returns `None` if the request fails. + +#### Example + +```python +client = Stratix() + +result = client.benchmarks.create_custom( + name="QA Benchmark", + description="Tests model factual accuracy", + file_path="benchmark_data.jsonl", + additional_metrics=["hallucination"], +) + +if result: + print(f"Created benchmark: {result.benchmark_id}") +``` + +### `create_smart(name, description, system_prompt, file_paths, metrics=None, timeout=None)` + +Creates a smart benchmark from uploaded files. The platform uses AI to automatically generate benchmark prompts from the provided documents. The benchmark is generated asynchronously. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| --------------- | -------------------------------- | -------- | ------------------------------------------------------------------- | +| `name` | `str` | Yes | Benchmark name (max 256 characters) | +| `description` | `str` | Yes | Benchmark description (max 500 characters) | +| `system_prompt` | `str` | Yes | System prompt guiding benchmark generation (max 4000 characters) | +| `file_paths` | `List[str]` | Yes | List of file paths to upload (1-20 files, max 50 MB each) | +| `metrics` | `List[str] \| None` | No | Additional metrics: `readability`, `toxicity`, `hallucination` | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Supported File Types + +`.txt`, `.pdf`, `.html`, `.docx`, `.csv`, `.json`, `.jsonl`, `.parquet` + +#### Returns + +Returns an `Optional[CreateBenchmarkResponse]` containing: + +- `organization_id`: Organization identifier +- `project_id`: Project identifier +- `benchmark_id`: The newly created benchmark's identifier + +Returns `None` if the request fails. 
+ +#### Example + +```python +client = Stratix() + +result = client.benchmarks.create_smart( + name="Product Knowledge Benchmark", + description="Evaluates model knowledge of product docs", + system_prompt="Generate QA pairs testing understanding of product features.", + file_paths=["product_docs.pdf", "faq.txt"], + metrics=["hallucination"], +) + +if result: + print(f"Smart benchmark created: {result.benchmark_id}") + print("Check the dashboard for generation progress.") +``` diff --git a/docs/api-reference/public-client.md b/docs/api-reference/public-client.md new file mode 100644 index 0000000..9a79d1e --- /dev/null +++ b/docs/api-reference/public-client.md @@ -0,0 +1,419 @@ +# Public Client + +The `PublicClient` (synchronous) and `AsyncPublicClient` (asynchronous) classes provide access to public LayerLens API endpoints for browsing public models, benchmarks, benchmark content, fetching evaluations, and comparing evaluation results. + +## Basic Usage + +### Synchronous Client + +```python +from layerlens import PublicClient + +# Loads API key from the "LAYERLENS_STRATIX_API_KEY" environment variable +client = PublicClient() + +# Browse public models +models = client.models.get(companies=["OpenAI"]) + +# Browse public benchmarks +benchmarks = client.benchmarks.get(languages=["English"]) +``` + +### Asynchronous Client + +```python +import asyncio +from layerlens import AsyncPublicClient + +async def main(): + client = AsyncPublicClient() + + models = await client.models.get(companies=["OpenAI"]) + benchmarks = await client.benchmarks.get(languages=["English"]) + +asyncio.run(main()) +``` + +### Accessing from an Authenticated Client + +If you already have an authenticated `Stratix` or `AsyncStratix` client, you can access public endpoints through the `.public` property: + +```python +from layerlens import Stratix + +client = Stratix() # requires API key + +# Access public endpoints through the authenticated client +public_models = 
client.public.models.get(query="claude") +``` + +## Constructor Parameters + +### `PublicClient(api_key, base_url, timeout)` and `AsyncPublicClient(api_key, base_url, timeout)` + +| Parameter | Type | Required | Default | Description | +| ---------- | -------------------------------- | -------- | --------------- | ----------------------------- | +| `api_key` | `str \| None` | Yes\* | `None` | Your LayerLens Stratix API key | +| `base_url` | `str \| httpx.URL \| None` | No | Stratix API URL | Custom API base URL | +| `timeout` | `float \| httpx.Timeout \| None` | No | 10 minutes | Request timeout configuration | + +\*Required unless set via the `LAYERLENS_STRATIX_API_KEY` environment variable + +## Public Models + +### `models.get(...)` + +Retrieves a list of public models with optional filtering, sorting, and pagination. + +#### Parameters + +| Parameter | Type | Required | Description | +| -------------------- | ---------------------- | -------- | -------------------------------------------------------------------------------------------- | +| `query` | `str \| None` | No | Full-text search on model name | +| `name` | `str \| None` | No | Filter by model name | +| `key` | `str \| None` | No | Filter by model key | +| `ids` | `List[str] \| None` | No | Filter by specific model IDs | +| `categories` | `List[str] \| None` | No | Filter by categories (e.g. 
`transformer`, `moe`, `open-source`, `closed-source`, `usa`, `china`, `size-sm`, `size-md`, `size-lg`, `size-xl`) | +| `companies` | `List[str] \| None` | No | Filter by company names | +| `regions` | `List[str] \| None` | No | Filter by regions | +| `licenses` | `List[str] \| None` | No | Filter by license types | +| `sizes` | `List[str] \| None` | No | Filter by size (Small, Medium, Large, Extra Large) | +| `sort_by` | `str \| None` | No | Sort column: `name`, `createdAt`, `releasedAt`, `architectureType`, `contextLength`, `license`, `region` | +| `order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `page` | `int \| None` | No | Page number (1-based) | +| `page_size` | `int \| None` | No | Results per page | +| `include_deprecated` | `bool \| None` | No | Include deprecated models (default: false) | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `PublicModelsListResponse` containing: + +- `models`: List of `PublicModelDetail` objects +- `categories`: List of available category strings +- `count`: Number of results in current page +- `total_count`: Total number of matching results + +Returns `None` if the request fails. 
+ +#### PublicModelDetail Properties + +| Property | Type | Description | +| ---------------------- | ---------------- | ---------------------------------- | +| `id` | `str` | Unique model identifier | +| `key` | `str` | Unique model key | +| `name` | `str` | Human-readable model name | +| `description` | `str \| None` | Text description | +| `company` | `str \| None` | Model provider company | +| `released_at` | `int \| None` | Release timestamp | +| `parameters` | `float \| None` | Number of parameters | +| `modality` | `str \| None` | Model modality | +| `context_length` | `int \| None` | Maximum context length | +| `architecture_type` | `str \| None` | Architecture type | +| `license` | `str \| None` | License type | +| `open_weights` | `bool \| None` | Whether weights are open | +| `region` | `str \| None` | Region | +| `key_takeaways` | `List[str] \| None` | Key takeaways | +| `deprecated` | `bool \| None` | Whether the model is deprecated | +| `cost_per_input_token` | `str \| None` | Cost per input token | +| `cost_per_output_token`| `str \| None` | Cost per output token | + +#### Example + +```python +from layerlens import PublicClient + +client = PublicClient() + +# Get newest OpenAI models +response = client.models.get( + companies=["OpenAI"], + sort_by="releasedAt", + order="desc", + page_size=5, +) + +for model in response.models: + print(f"{model.name} - {model.context_length} context length") +``` + +## Public Benchmarks + +### `benchmarks.get(...)` + +Retrieves a list of public benchmarks with optional filtering, sorting, and pagination. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| -------------------- | ---------------------- | -------- | ------------------------------------------ | +| `query` | `str \| None` | No | Full-text search | +| `name` | `str \| None` | No | Filter by name | +| `key` | `str \| None` | No | Filter by key | +| `ids` | `List[str] \| None` | No | Filter by specific IDs | +| `categories` | `List[str] \| None` | No | Filter by categories | +| `languages` | `List[str] \| None` | No | Filter by languages | +| `sort_by` | `str \| None` | No | Sort column (currently: `name`) | +| `order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `page` | `int \| None` | No | Page number (1-based) | +| `page_size` | `int \| None` | No | Results per page | +| `include_deprecated` | `bool \| None` | No | Include deprecated benchmarks | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `PublicBenchmarksListResponse` containing: + +- `datasets`: List of `PublicBenchmarkDetail` objects +- `categories`: List of available category strings +- `count`: Number of results in current page +- `total_count`: Total number of matching results + +Returns `None` if the request fails. 
+ +#### PublicBenchmarkDetail Properties + +| Property | Type | Description | +| ----------------- | ------------------ | ------------------------------------- | +| `id` | `str` | Unique benchmark identifier | +| `key` | `str` | Unique benchmark key | +| `name` | `str` | Human-readable name | +| `description` | `str \| None` | Text description | +| `prompt_count` | `int \| None` | Number of prompts in the benchmark | +| `language` | `str \| None` | Language of the benchmark | +| `categories` | `List[str] \| None`| Categories | +| `characteristics` | `List[str] \| None`| Characteristics | +| `deprecated` | `bool \| None` | Whether the benchmark is deprecated | +| `is_public` | `bool \| None` | Whether the benchmark is public | + +### `benchmarks.get_prompts(benchmark_id, ...)` + +Fetches prompts/content from a public benchmark with optional search and pagination. + +#### Parameters + +| Parameter | Type | Required | Description | +| -------------- | ---------------------- | -------- | ---------------------------------------------- | +| `benchmark_id` | `str` | Yes | The benchmark ID to fetch prompts from | +| `page` | `int \| None` | No | Page number (1-based) | +| `page_size` | `int \| None` | No | Results per page | +| `search_field` | `str \| None` | No | Search field: `id`, `input`, or `truth` | +| `search_value` | `str \| None` | No | Search value | +| `sort_by` | `str \| None` | No | Sort field: `id`, `input`, or `truth` | +| `sort_order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `BenchmarkPromptsResponse` containing: + +- `status`: Response status string +- `data.prompts`: List of `BenchmarkPrompt` objects +- `data.count`: Total number of prompts + +Returns `None` if the request fails. 
+ +#### BenchmarkPrompt Properties + +| Property | Type | Description | +| -------- | ----- | -------------------------------------- | +| `id` | `str` | Unique prompt identifier | +| `input` | `str \| List \| Dict` | The prompt input | +| `truth` | `str` | The expected/ground truth answer | + +### `benchmarks.get_all_prompts(benchmark_id, timeout=None)` + +Fetches all prompts from a benchmark by automatically handling pagination. + +#### Parameters + +| Parameter | Type | Required | Description | +| -------------- | ---------------------- | -------- | --------------------------------------- | +| `benchmark_id` | `str` | Yes | The benchmark ID to fetch prompts from | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `List[BenchmarkPrompt]` containing all prompts in the benchmark. + +#### Example + +```python +from layerlens import PublicClient + +client = PublicClient() + +# List benchmarks +benchmarks = client.benchmarks.get(query="mmlu") + +if benchmarks and benchmarks.datasets: + benchmark = benchmarks.datasets[0] + + # Get first page of prompts + prompts = client.benchmarks.get_prompts(benchmark.id, page=1, page_size=10) + + if prompts: + print(f"Total prompts: {prompts.data.count}") + for prompt in prompts.data.prompts: + print(f" Input: {str(prompt.input)[:80]}...") + print(f" Truth: {prompt.truth[:50]}") + + # Or fetch all prompts at once + all_prompts = client.benchmarks.get_all_prompts(benchmark.id) + print(f"All prompts: {len(all_prompts)}") +``` + +## Evaluations + +### `evaluations.get_by_id(id, ...)` + +Retrieves a single evaluation by its unique identifier, including the full evaluation summary. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| --------- | -------------------------------- | -------- | -------------------------------- | +| `id` | `str` | Yes | The unique evaluation identifier | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns an `Evaluation` object if found, `None` otherwise. See [Evaluations](evaluations.md) for the full `Evaluation` object properties. + +### `evaluations.get_many(...)` + +Retrieves evaluations for a given organization and project with optional pagination, sorting, and filtering. + +#### Parameters + +| Parameter | Type | Required | Description | +| ----------------- | -------------------------------- | -------- | ------------------------------------------------------------------ | +| `organization_id` | `str` | Yes | Organization ID (MongoDB ObjectID format) | +| `project_id` | `str` | Yes | Project ID (MongoDB ObjectID format) | +| `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | +| `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | +| `sort_by` | `str \| None` | No | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` | +| `order` | `str \| None` | No | Sort order: `asc` or `desc` | +| `model_ids` | `List[str] \| None` | No | Filter by model IDs | +| `benchmark_ids` | `List[str] \| None` | No | Filter by benchmark/dataset IDs | +| `status` | `EvaluationStatus \| None` | No | Filter by evaluation status | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns an `EvaluationsResponse` object containing: + +- `evaluations`: List of `Evaluation` objects +- `pagination`: Pagination metadata with `page`, `page_size`, `total_pages`, and `total_count` + +Returns `None` if the request fails. 
+ +#### Example + +```python +from layerlens import PublicClient +from layerlens.models import EvaluationStatus + +client = PublicClient() + +# Get a specific evaluation by ID (with full summary) +evaluation = client.evaluations.get_by_id("eval_abc123") +if evaluation: + print(f"{evaluation.model_name} on {evaluation.benchmark_name}: {evaluation.accuracy:.2f}%") + if evaluation.summary: + print(f"Goal: {evaluation.summary.goal}") + for takeaway in getattr(evaluation.summary.analysis_summary, "key_takeaways", None) or []: + print(f" - {takeaway}") + +# List evaluations for an organization/project +response = client.evaluations.get_many( + organization_id="683e63925ef7e1c53c1f4b28", + project_id="683e63925ef7e1c53c1f4b29", + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + page_size=10, +) +if response: + print(f"Top evaluations ({response.pagination.total_count} total):") + for e in response.evaluations: + print(f" {e.model_name}: {e.accuracy:.2f}%") +``` + +## Comparisons + +### `comparisons.compare(...)` + +Compares results between two evaluations side-by-side. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| ------------------ | ---------------------- | -------- | -------------------------------------------------------------------------- | +| `evaluation_id_1` | `str` | Yes | First evaluation ID | +| `evaluation_id_2` | `str` | Yes | Second evaluation ID | +| `page` | `int \| None` | No | Page number (1-based) | +| `page_size` | `int \| None` | No | Results per page | +| `outcome_filter` | `str \| None` | No | Filter by outcome (see below) | +| `search` | `str \| None` | No | Search within results | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Outcome Filter Options + +| Value | Description | +| -------------------- | ---------------------------------------------- | +| `"all"` | All results (default) | +| `"both_succeed"` | Both models answered correctly | +| `"both_fail"` | Both models answered incorrectly | +| `"reference_fails"` | First model fails, second succeeds | +| `"comparison_fails"` | Second model fails, first succeeds | + +#### Returns + +Returns a `ComparisonResponse` containing: + +- `results`: List of `ComparisonResult` objects +- `total_count`: Total number of comparable results +- `correct_count_1`: Number of correct answers for evaluation 1 +- `total_results_1`: Total results for evaluation 1 +- `correct_count_2`: Number of correct answers for evaluation 2 +- `total_results_2`: Total results for evaluation 2 + +Returns `None` if the request fails. 
+ +#### ComparisonResult Properties + +| Property | Type | Description | +| ------------- | --------------- | ------------------------------------- | +| `result_id_1` | `int \| None` | Result ID from evaluation 1 | +| `result_id_2` | `int \| None` | Result ID from evaluation 2 | +| `prompt` | `str` | The prompt text | +| `truth` | `str` | The ground truth answer | +| `result1` | `str \| None` | Model 1's response | +| `score1` | `float \| None` | Model 1's score | +| `result2` | `str \| None` | Model 2's response | +| `score2` | `float \| None` | Model 2's score | + +#### Example + +```python +from layerlens import PublicClient + +client = PublicClient() + +comparison = client.comparisons.compare( + evaluation_id_1="eval-abc", + evaluation_id_2="eval-def", + outcome_filter="reference_fails", + page=1, + page_size=20, +) + +if comparison: + print(f"Eval 1: {comparison.correct_count_1}/{comparison.total_results_1}") + print(f"Eval 2: {comparison.correct_count_2}/{comparison.total_results_2}") + + for result in comparison.results: + print(f" Prompt: {result.prompt[:80]}...") + print(f" Model 1 score: {result.score1}, Model 2 score: {result.score2}") +``` diff --git a/examples/compare_evaluations.py b/examples/compare_evaluations.py new file mode 100644 index 0000000..eb292b4 --- /dev/null +++ b/examples/compare_evaluations.py @@ -0,0 +1,92 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import Stratix +from layerlens.models import EvaluationStatus + + +def main(): + # Construct client (API key from env or inline) + client = Stratix() + + # --- Get successful evaluations to find a comparable pair + response = client.evaluations.get_many( + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + page_size=100, + ) + + if not response or len(response.evaluations) < 2: + print("Need at least 2 successful evaluations to compare, exiting") + return + + # Find two evaluations on the same benchmark + eval_1 = None + eval_2 = None + for i, e1 in 
enumerate(response.evaluations): + for e2 in response.evaluations[i + 1 :]: + if e1.benchmark_id == e2.benchmark_id and e1.id != e2.id: + eval_1 = e1 + eval_2 = e2 + break + if eval_1: + break + + if not eval_1 or not eval_2: + print("No two evaluations share the same benchmark, exiting") + return + + print(f"Comparing evaluations on the same benchmark ({eval_1.benchmark_id}):") + print(f" Evaluation 1: {eval_1.id} (accuracy={eval_1.accuracy:.2f}%)") + print(f" Evaluation 2: {eval_2.id} (accuracy={eval_2.accuracy:.2f}%)") + + # --- Get comparison results + comparison = client.public.comparisons.compare( + evaluation_id_1=eval_1.id, + evaluation_id_2=eval_2.id, + page=1, + page_size=10, + ) + + if comparison: + print(f"\n=== Comparison Summary ===") + print(f"Evaluation 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Evaluation 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") + print(f"Total compared: {comparison.total_count}") + + # --- Show individual results + if comparison.results: + print(f"\nFirst {len(comparison.results)} results:") + for result in comparison.results: + score_indicator_1 = "✓" if result.score1 and result.score1 > 0.5 else "✗" + score_indicator_2 = "✓" if result.score2 and result.score2 > 0.5 else "✗" + print(f" Prompt: {result.prompt[:80]}...") + print(f" Model 1: {score_indicator_1} (score={result.score1})") + print(f" Model 2: {score_indicator_2} (score={result.score2})") + print() + + # --- Filter by outcome: where only model 1 fails + comparison = client.public.comparisons.compare( + evaluation_id_1=eval_1.id, + evaluation_id_2=eval_2.id, + outcome_filter="reference_fails", + ) + + if comparison: + print(f"\n=== Where Model 1 Fails but Model 2 Succeeds ===") + print(f"Found {comparison.total_count} such cases") + + # --- Filter by outcome: where both models fail + comparison = client.public.comparisons.compare( + evaluation_id_1=eval_1.id, + evaluation_id_2=eval_2.id, + 
outcome_filter="both_fail", + ) + + if comparison: + print(f"\n=== Where Both Models Fail ===") + print(f"Found {comparison.total_count} such cases") + + +if __name__ == "__main__": + main() diff --git a/examples/create_custom_benchmark.py b/examples/create_custom_benchmark.py new file mode 100644 index 0000000..4b263de --- /dev/null +++ b/examples/create_custom_benchmark.py @@ -0,0 +1,50 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import Stratix + + +def main(): + # Construct client (API key from env or inline) + client = Stratix() + + # --- Create a custom benchmark from a JSONL file + # + # The JSONL file should have one JSON object per line with these fields: + # {"input": "What is 2+2?", "truth": "4"} + # {"input": "Capital of France?", "truth": "Paris"} + # + # Optional fields: "subset" (for grouping prompts) + + result = client.benchmarks.create_custom( + name="My Custom Benchmark", + description="A simple test benchmark for QA evaluation", + file_path="path/to/benchmark.jsonl", + ) + + if result: + print(f"Custom benchmark created: {result.benchmark_id}") + else: + print("Failed to create custom benchmark") + + # --- Create with additional metrics and input type + result = client.benchmarks.create_custom( + name="Advanced Benchmark", + description="Benchmark with toxicity and readability scoring", + file_path="path/to/benchmark.jsonl", + additional_metrics=["toxicity", "readability"], + input_type="messages", + ) + + if result: + print(f"Advanced benchmark created: {result.benchmark_id}") + + # --- Verify the benchmark was added to the project + benchmarks = client.benchmarks.get(type="custom") + if benchmarks: + print(f"\nCustom benchmarks in project ({len(benchmarks)}):") + for b in benchmarks: + print(f" - {b.name} (id={b.id})") + + +if __name__ == "__main__": + main() diff --git a/examples/create_custom_model.py b/examples/create_custom_model.py new file mode 100644 index 0000000..8d375f6 --- /dev/null +++ b/examples/create_custom_model.py 
@@ -0,0 +1,41 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import Stratix + + +def main(): + # Construct client (API key from env or inline) + client = Stratix() + + # --- Create a custom model backed by an OpenAI-compatible API + # + # Custom models let you evaluate any model accessible via an + # OpenAI-compatible chat completions endpoint. + # + # Key format: lowercase alphanumeric with dots, hyphens, slashes + # e.g. "my-org/custom-llama-3.1-70b" + + result = client.models.create_custom( + name="My Custom Model", + key="my-org/custom-model-v1", + description="Custom fine-tuned model served via vLLM", + api_url="https://my-model-endpoint.example.com/v1", + api_key="my-provider-api-key", + max_tokens=4096, + ) + + if result: + print(f"Custom model created: {result.model_id}") + else: + print("Failed to create custom model") + + # --- Verify the model was added to the project + models = client.models.get(type="custom") + if models: + print(f"\nCustom models in project ({len(models)}):") + for m in models: + print(f" - {m.name} (id={m.id}, key={m.key})") + + +if __name__ == "__main__": + main() diff --git a/examples/create_smart_benchmark.py b/examples/create_smart_benchmark.py new file mode 100644 index 0000000..9c628d3 --- /dev/null +++ b/examples/create_smart_benchmark.py @@ -0,0 +1,51 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import Stratix + + +def main(): + # Construct client (API key from env or inline) + client = Stratix() + + # --- Create a smart benchmark from source files + # + # Smart benchmarks use AI to automatically generate benchmark prompts + # from your uploaded documents. Supported file types include: + # .txt, .pdf, .html, .docx, .csv, .json, .jsonl, .parquet + # + # You provide a system prompt that guides how the AI generates + # evaluation questions from the source material. 
+ + result = client.benchmarks.create_smart( + name="Product Knowledge Benchmark", + description="Evaluates model knowledge of our product documentation", + system_prompt=( + "Generate question-answer pairs that test understanding of the " + "product features, capabilities, and limitations described in " + "the provided documents. Each question should have a clear, " + "factual answer derived from the source material." + ), + file_paths=[ + "path/to/product_docs.pdf", + "path/to/faq.txt", + ], + metrics=["hallucination"], + ) + + if result: + print(f"Smart benchmark created: {result.benchmark_id}") + print("The benchmark is being generated asynchronously.") + print("Check the dashboard for progress.") + else: + print("Failed to create smart benchmark") + + # --- Verify the benchmark was added to the project + benchmarks = client.benchmarks.get(type="custom") + if benchmarks: + print(f"\nCustom benchmarks in project ({len(benchmarks)}):") + for b in benchmarks: + print(f" - {b.name} (id={b.id})") + + +if __name__ == "__main__": + main() diff --git a/examples/evaluation_sorting.py b/examples/evaluation_sorting.py new file mode 100644 index 0000000..cb1906f --- /dev/null +++ b/examples/evaluation_sorting.py @@ -0,0 +1,92 @@ +#!/usr/bin/env -S poetry run python + +import asyncio + +from layerlens import AsyncStratix +from layerlens.models import EvaluationStatus + + +async def main(): + # Construct async client (requires API key) + client = AsyncStratix() + + # --- Get evaluations sorted by accuracy (highest first) + response = await client.evaluations.get_many( + sort_by="accuracy", + order="desc", + page_size=10, + ) + if response: + print(f"Top {len(response.evaluations)} evaluations by accuracy:") + for evaluation in response.evaluations: + print(f" - {evaluation.id}: accuracy={evaluation.accuracy:.2f}%, status={evaluation.status.value}") + + # --- Get evaluations sorted by submission date (newest first) + response = await client.evaluations.get_many( + 
sort_by="submittedAt", + order="desc", + page_size=5, + ) + if response: + print(f"\nLatest {len(response.evaluations)} evaluations:") + for evaluation in response.evaluations: + print(f" - {evaluation.id}: submitted_at={evaluation.submitted_at}") + + # --- Get evaluations sorted by average duration (fastest first) + response = await client.evaluations.get_many( + sort_by="averageDuration", + order="asc", + page_size=5, + ) + if response: + print(f"\nFastest {len(response.evaluations)} evaluations:") + for evaluation in response.evaluations: + print(f" - {evaluation.id}: avg_duration={evaluation.average_duration}ms") + + # --- Filter by status (only successful evaluations) + response = await client.evaluations.get_many( + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + ) + if response: + print(f"\nSuccessful evaluations: {response.pagination.total_count}") + + # --- Filter by specific model IDs + # Replace with actual model IDs from your organization + response = await client.evaluations.get_many( + model_ids=["your-model-id"], + sort_by="accuracy", + order="desc", + ) + if response: + print(f"\nEvaluations for specified model: {response.pagination.total_count}") + + # --- Filter by specific benchmark IDs + # Replace with actual benchmark IDs from your organization + response = await client.evaluations.get_many( + benchmark_ids=["your-benchmark-id"], + sort_by="submittedAt", + order="desc", + ) + if response: + print(f"\nEvaluations for specified benchmark: {response.pagination.total_count}") + + # --- Combine sorting, filtering, and pagination + response = await client.evaluations.get_many( + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + page=1, + page_size=20, + ) + if response: + print(f"\nPage 1 of successful evaluations (sorted by accuracy):") + print(f" Total: {response.pagination.total_count}") + print(f" Pages: {response.pagination.total_pages}") + for evaluation in response.evaluations: + print(f" - 
{evaluation.id}: accuracy={evaluation.accuracy:.2f}%") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/get_evaluation.py b/examples/get_evaluation.py index 45d667c..a6d8fe6 100644 --- a/examples/get_evaluation.py +++ b/examples/get_evaluation.py @@ -10,7 +10,7 @@ async def main(): client = AsyncStratix() # --- Get evaluation by id - evaluation_id = "eval_123" + evaluation_id = "699f1426c1212b2d9c78e947" evaluation = await client.evaluations.get_by_id(evaluation_id) print(f"Found evaluation {evaluation.id}") print(evaluation) diff --git a/examples/manage_project_models_benchmarks.py b/examples/manage_project_models_benchmarks.py new file mode 100644 index 0000000..0067051 --- /dev/null +++ b/examples/manage_project_models_benchmarks.py @@ -0,0 +1,41 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import Stratix + + +def main(): + # Construct client (API key from env or inline) + client = Stratix() + + # --- Add public models to the project + success = client.models.add("model-id-1", "model-id-2") + print(f"Add models: {'success' if success else 'failed'}") + + # --- Remove a model from the project + success = client.models.remove("model-id-1") + print(f"Remove model: {'success' if success else 'failed'}") + + # --- Add public benchmarks to the project + success = client.benchmarks.add("benchmark-id-1") + print(f"Add benchmark: {'success' if success else 'failed'}") + + # --- Remove a benchmark from the project + success = client.benchmarks.remove("benchmark-id-1") + print(f"Remove benchmark: {'success' if success else 'failed'}") + + # --- List current models and benchmarks + models = client.models.get() + if models: + print(f"\nModels in project ({len(models)}):") + for m in models: + print(f" - {m.name} (id={m.id})") + + benchmarks = client.benchmarks.get() + if benchmarks: + print(f"\nBenchmarks in project ({len(benchmarks)}):") + for b in benchmarks: + print(f" - {b.name} (id={b.id})") + + +if __name__ == "__main__": + 
main() diff --git a/examples/public_benchmarks.py b/examples/public_benchmarks.py new file mode 100644 index 0000000..396c0d9 --- /dev/null +++ b/examples/public_benchmarks.py @@ -0,0 +1,60 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import PublicClient + + +def main(): + # Construct public client (API key from env or inline) + client = PublicClient() + + # --- Browse all public benchmarks + response = client.benchmarks.get(page=1, page_size=10) + print(f"Found {response.total_count} public benchmarks (showing first {len(response.datasets)})") + for benchmark in response.datasets: + print(f" - {benchmark.name} (prompts={benchmark.prompt_count}, language={benchmark.language})") + + # --- Filter by language + response = client.benchmarks.get(languages=["English"]) + print(f"\nFound {response.total_count} English benchmarks") + + # --- List the categories reported in the response (no category filter is applied here) + if response.categories: + print(f"\nAvailable categories: {response.categories}") + + # --- Search by name + response = client.benchmarks.get(query="mmlu") + print(f"\nFound {response.total_count} benchmarks matching 'mmlu'") + for benchmark in response.datasets: + print(f" - {benchmark.name}: {benchmark.description[:80] if benchmark.description else 'N/A'}...") + + # --- Get benchmark prompts (content download) + if response.datasets: + benchmark = response.datasets[0] + print(f"\nFetching prompts for '{benchmark.name}' (id={benchmark.id})...") + + prompts_response = client.benchmarks.get_prompts( + benchmark.id, + page=1, + page_size=5, + ) + + if prompts_response: + print(f"Total prompts: {prompts_response.data.count}") + print(f"Showing first {len(prompts_response.data.prompts)} prompts:") + for prompt in prompts_response.data.prompts: + input_preview = str(prompt.input)[:80] + truth_preview = prompt.truth[:50] if prompt.truth else "N/A" + print(f" - Input: {input_preview}...") + print(f" Truth: {truth_preview}") + print() + + # --- Get all prompts (auto-paginates) + if response.datasets: +
benchmark = response.datasets[0] + print(f"Fetching ALL prompts for '{benchmark.name}'...") + all_prompts = client.benchmarks.get_all_prompts(benchmark.id) + print(f"Retrieved {len(all_prompts)} total prompts") + + +if __name__ == "__main__": + main() diff --git a/examples/public_evaluations.py b/examples/public_evaluations.py new file mode 100644 index 0000000..f28236f --- /dev/null +++ b/examples/public_evaluations.py @@ -0,0 +1,66 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import PublicClient +from layerlens.models import EvaluationStatus + + +def main(): + # Construct public client (API key from LAYERLENS_STRATIX_API_KEY env var or inline) + client = PublicClient() + + # --- Get a specific evaluation by ID + evaluation_id = "699f1426c1212b2d9c78e947" + evaluation = client.evaluations.get_by_id(evaluation_id) + if evaluation: + print(f"Evaluation: {evaluation.id}") + print(f" Model: {evaluation.model_name} ({evaluation.model_company})") + print(f" Benchmark: {evaluation.benchmark_name}") + print(f" Status: {evaluation.status.value}") + print(f" Accuracy: {evaluation.accuracy:.2f}%") + + if evaluation.summary: + print(f" Summary: {evaluation.summary.name}") + print(f" Goal: {evaluation.summary.goal}") + if evaluation.summary.metrics: + print(f" Metrics: {', '.join(m.name for m in evaluation.summary.metrics)}") + if evaluation.summary.performance_details: + print(f" Strengths: {evaluation.summary.performance_details.strengths}") + if evaluation.summary.analysis_summary: + print(f" Key takeaways: {evaluation.summary.analysis_summary.key_takeaways}") + else: + print(f"Evaluation {evaluation_id} not found") + + # --- List evaluations for a specific organization/project + organization_id = "683e63925ef7e1c53c1f4b28" + project_id = "683e63925ef7e1c53c1f4b29" + + response = client.evaluations.get_many( + organization_id=organization_id, + project_id=project_id, + page=1, + page_size=5, + sort_by="submittedAt", + order="desc", + ) + if response: + 
print(f"\nLatest evaluations ({response.pagination.total_count} total):") + for e in response.evaluations: + print(f" - {e.id}: {e.model_name} on {e.benchmark_name} -> {e.accuracy:.2f}% ({e.status.value})") + + # --- Filter by status (only successful) + response = client.evaluations.get_many( + organization_id=organization_id, + project_id=project_id, + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + page_size=5, + ) + if response: + print(f"\nTop successful evaluations ({response.pagination.total_count} total):") + for e in response.evaluations: + print(f" - {e.model_name}: {e.accuracy:.2f}%") + + +if __name__ == "__main__": + main() diff --git a/examples/public_models.py b/examples/public_models.py new file mode 100644 index 0000000..7d86550 --- /dev/null +++ b/examples/public_models.py @@ -0,0 +1,49 @@ +#!/usr/bin/env -S poetry run python + +from layerlens import PublicClient + + +def main(): + # Construct public client (API key from env or inline) + client = PublicClient() + + # --- Browse all public models (first page) + response = client.models.get(page=1, page_size=10) + print(f"Found {response.total_count} public models (showing first {len(response.models)})") + for model in response.models: + print(f" - {model.name} ({model.company})") + + # --- Search models by query + response = client.models.get(query="gpt") + print(f"\nFound {response.total_count} models matching 'gpt'") + for model in response.models: + print(f" - {model.name}") + + # --- Filter by company + companies = ["OpenAI", "Anthropic"] + response = client.models.get(companies=companies) + print(f"\nFound {response.total_count} models from {companies}") + for model in response.models: + print(f" - {model.name} ({model.company})") + + # --- Filter by region + response = client.models.get(regions=["usa"]) + print(f"\nFound {response.total_count} models in region 'usa'") + + # --- Filter by category + response = client.models.get(categories=["open-source"]) + 
print(f"\nFound {response.total_count} open-source models") + + # --- Sort by release date (newest first) + response = client.models.get(sort_by="releasedAt", order="desc", page_size=5) + print(f"\nNewest 5 models:") + for model in response.models: + print(f" - {model.name} (released_at={model.released_at})") + + # --- Include deprecated models + response = client.models.get(include_deprecated=True) + print(f"\nTotal models (including deprecated): {response.total_count}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 1d08dfa..ef86a87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "layerlens" -version = "1.1.1" +version = "1.2.0" description = "The official Python library for the LayerLens Stratix API" license = "Apache-2.0" authors = [{ name = "LayerLens", email = "support@layerlens.ai" }] diff --git a/src/layerlens/__init__.py b/src/layerlens/__init__.py index a5fcc80..78a69f9 100644 --- a/src/layerlens/__init__.py +++ b/src/layerlens/__init__.py @@ -3,15 +3,23 @@ Trace, JudgeVersion, JudgeSnapshot, + BenchmarkPrompt, TraceEvaluation, + ComparisonResult, + PublicModelDetail, + ComparisonResponse, OptimizationBudget, TraceEvaluationStep, JudgeOptimizationRun, TraceWithEvaluations, OptimizationRunStatus, + PublicBenchmarkDetail, TraceEvaluationResult, TraceEvaluationStatus, TraceEvaluationSummary, + BenchmarkPromptsResponse, + PublicModelsListResponse, + PublicBenchmarksListResponse, ) from ._client import Atlas, Client, Stratix, AsyncAtlas, AsyncClient, AsyncStratix from ._exceptions import ( @@ -22,17 +30,23 @@ BadRequestError, AuthenticationError, ) +from ._public_client import PublicClient, AsyncPublicClient __all__ = [ "APIError", "AsyncAtlas", "AsyncClient", + "AsyncPublicClient", "AsyncStratix", "Atlas", "AtlasError", "AuthenticationError", "BadRequestError", + "BenchmarkPrompt", + "BenchmarkPromptsResponse", "Client", + "ComparisonResult", + "ComparisonResponse", "Judge", 
"JudgeOptimizationRun", "JudgeSnapshot", @@ -40,6 +54,11 @@ "NotFoundError", "OptimizationBudget", "OptimizationRunStatus", + "PublicBenchmarkDetail", + "PublicBenchmarksListResponse", + "PublicClient", + "PublicModelDetail", + "PublicModelsListResponse", "Stratix", "StratixError", "Trace", diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py index 9599ab9..e146f29 100644 --- a/src/layerlens/_client.py +++ b/src/layerlens/_client.py @@ -17,6 +17,7 @@ from ._base_client import BaseClient, BaseAsyncClient if TYPE_CHECKING: + from ._public_client import PublicClient, AsyncPublicClient from .resources.judges import Judges, AsyncJudges from .resources.models import Models, AsyncModels from .resources.traces import Traces, AsyncTraces @@ -124,6 +125,12 @@ def trace_evaluations(self) -> TraceEvaluations: return TraceEvaluations(self) + @cached_property + def public(self) -> PublicClient: + from ._public_client import PublicClient + + return PublicClient(api_key=self.api_key, base_url=str(self.base_url), timeout=self.timeout) + @property @override def auth_headers(self) -> dict[str, str]: @@ -287,6 +294,12 @@ def trace_evaluations(self) -> AsyncTraceEvaluations: return AsyncTraceEvaluations(self) + @cached_property + def public(self) -> AsyncPublicClient: + from ._public_client import AsyncPublicClient + + return AsyncPublicClient(api_key=self.api_key, base_url=str(self.base_url), timeout=self.timeout) + @property @override def auth_headers(self) -> dict[str, str]: diff --git a/src/layerlens/_public_client.py b/src/layerlens/_public_client.py new file mode 100644 index 0000000..fb04979 --- /dev/null +++ b/src/layerlens/_public_client.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import os +from http import HTTPStatus +from typing import TYPE_CHECKING, Any, Union, Mapping +from functools import cached_property +from typing_extensions import Self, override + +import httpx + +from . 
from . import _exceptions
from ._utils import is_mapping
from ._constants import DEFAULT_TIMEOUT
from ._exceptions import APIStatusError
from ._base_client import BaseClient, BaseAsyncClient

if TYPE_CHECKING:
    from .resources.comparisons import Comparisons, AsyncComparisons
    from .resources.public_models import PublicModelsResource, AsyncPublicModelsResource
    from .resources.public_benchmarks import PublicBenchmarksResource, AsyncPublicBenchmarksResource
    from .resources.public_evaluations import PublicEvaluationsResource, AsyncPublicEvaluationsResource


__all__ = ["PublicClient", "AsyncPublicClient"]


def _make_status_error(
    err_msg: str,
    *,
    body: object,
    response: httpx.Response,
) -> APIStatusError:
    """Build the exception that corresponds to an HTTP error response.

    Args:
        err_msg: Message to attach to the exception.
        body: Decoded response body. When it is a mapping, its ``"error"``
            entry (if present) is used as the exception body instead.
        response: The HTTP response that triggered the error.

    Returns:
        A specific ``_exceptions`` error for 400, 401, 403, 404, 409, 422 and
        429, ``InternalServerError`` for any status >= 500, and a plain
        ``APIStatusError`` otherwise.
    """
    data = body.get("error", body) if is_mapping(body) else body
    status = response.status_code

    if status == HTTPStatus.BAD_REQUEST:
        return _exceptions.BadRequestError(err_msg, response=response, body=data)
    if status == HTTPStatus.UNAUTHORIZED:
        return _exceptions.AuthenticationError(err_msg, response=response, body=data)
    if status == HTTPStatus.FORBIDDEN:
        return _exceptions.PermissionDeniedError(err_msg, response=response, body=data)
    if status == HTTPStatus.NOT_FOUND:
        return _exceptions.NotFoundError(err_msg, response=response, body=data)
    if status == HTTPStatus.CONFLICT:
        return _exceptions.ConflictError(err_msg, response=response, body=data)
    if status == HTTPStatus.UNPROCESSABLE_ENTITY:
        return _exceptions.UnprocessableEntityError(err_msg, response=response, body=data)
    if status == HTTPStatus.TOO_MANY_REQUESTS:
        return _exceptions.RateLimitError(err_msg, response=response, body=data)
    if status >= HTTPStatus.INTERNAL_SERVER_ERROR:
        return _exceptions.InternalServerError(err_msg, response=response, body=data)

    return APIStatusError(err_msg, response=response, body=data)


def _resolve_api_key(api_key: str | None) -> str:
    """Return the explicit API key, falling back to LAYERLENS_STRATIX_API_KEY; raise if neither is set."""
    if api_key is None:
        api_key = os.environ.get("LAYERLENS_STRATIX_API_KEY")
    if api_key is None or api_key == "":
        raise _exceptions.StratixError(
            "The api_key client option must be set either by passing api_key to the client or by setting the LAYERLENS_STRATIX_API_KEY environment variable",
        )
    return api_key


def _resolve_base_url(base_url: str | httpx.URL | None) -> str | httpx.URL:
    """Return the explicit base URL, else the environment override, else the hosted default."""
    if base_url is None:
        # The ATLAS variable is consulted only when the STRATIX one is unset or empty.
        base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL")
    if base_url is None:
        base_url = "https://api.layerlens.ai/api/v1"
    return base_url


class PublicClient(BaseClient):
    """Client for accessing public LayerLens API endpoints."""

    api_key: str

    def __init__(
        self,
        *,
        api_key: str | None = None,
        base_url: str | httpx.URL | None = None,
        timeout: Union[float, httpx.Timeout, None] = DEFAULT_TIMEOUT,
    ) -> None:
        """Create a synchronous public client.

        Args:
            api_key: API key; read from ``LAYERLENS_STRATIX_API_KEY`` when omitted.
            base_url: API root; read from ``LAYERLENS_STRATIX_BASE_URL`` or
                ``LAYERLENS_ATLAS_BASE_URL`` when omitted, otherwise the
                hosted default is used.
            timeout: Timeout forwarded to the base client.

        Raises:
            StratixError: If no non-empty API key can be resolved.
        """
        # Resolve the key first so a missing key fails before anything else is set up.
        self.api_key = _resolve_api_key(api_key)
        super().__init__(base_url=_resolve_base_url(base_url), timeout=timeout)

    # Resource classes are imported lazily inside each accessor; at module
    # level they are only imported under TYPE_CHECKING.

    @cached_property
    def models(self) -> PublicModelsResource:
        from .resources.public_models import PublicModelsResource

        return PublicModelsResource(self)

    @cached_property
    def benchmarks(self) -> PublicBenchmarksResource:
        from .resources.public_benchmarks import PublicBenchmarksResource

        return PublicBenchmarksResource(self)

    @cached_property
    def comparisons(self) -> Comparisons:
        from .resources.comparisons import Comparisons

        return Comparisons(self)

    @cached_property
    def evaluations(self) -> PublicEvaluationsResource:
        from .resources.public_evaluations import PublicEvaluationsResource

        return PublicEvaluationsResource(self)

    @property
    @override
    def auth_headers(self) -> dict[str, str]:
        """Headers that authenticate a request with this client's API key."""
        return {"x-api-key": self.api_key}

    def copy(
        self,
        *,
        api_key: str | None = None,
        base_url: str | httpx.URL | None = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
        _extra_kwargs: Mapping[str, Any] = {},
    ) -> Self:
        """Return a new client of the same class with selected options replaced.

        Args:
            api_key: Replacement API key; falsy keeps the current one.
            base_url: Replacement base URL; falsy keeps the current one.
            timeout: Replacement timeout. When left at its default the current
                client's timeout is kept; any other value (including ``None``)
                is passed through to the new client unchanged.
            _extra_kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            A new client instance; this instance is not modified.
        """
        return self.__class__(
            api_key=api_key or self.api_key,
            base_url=base_url or self.base_url,
            # Identity check: only an omitted argument falls back to the current
            # timeout. `self.timeout or timeout` silently discarded the override.
            timeout=self.timeout if timeout is DEFAULT_TIMEOUT else timeout,
            **_extra_kwargs,
        )

    with_options = copy

    @override
    def _make_status_error(
        self,
        err_msg: str,
        *,
        body: object,
        response: httpx.Response,
    ) -> APIStatusError:
        return _make_status_error(err_msg, body=body, response=response)


class AsyncPublicClient(BaseAsyncClient):
    """Async client for accessing public LayerLens API endpoints."""

    api_key: str

    def __init__(
        self,
        *,
        api_key: str | None = None,
        base_url: str | httpx.URL | None = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> None:
        """Create an asynchronous public client.

        Args:
            api_key: API key; read from ``LAYERLENS_STRATIX_API_KEY`` when omitted.
            base_url: API root; read from ``LAYERLENS_STRATIX_BASE_URL`` or
                ``LAYERLENS_ATLAS_BASE_URL`` when omitted, otherwise the
                hosted default is used.
            timeout: Timeout forwarded to the base client.

        Raises:
            StratixError: If no non-empty API key can be resolved.
        """
        self.api_key = _resolve_api_key(api_key)
        super().__init__(base_url=_resolve_base_url(base_url), timeout=timeout)

    @cached_property
    def models(self) -> AsyncPublicModelsResource:
        from .resources.public_models import AsyncPublicModelsResource

        return AsyncPublicModelsResource(self)

    @cached_property
    def benchmarks(self) -> AsyncPublicBenchmarksResource:
        from .resources.public_benchmarks import AsyncPublicBenchmarksResource

        return AsyncPublicBenchmarksResource(self)

    @cached_property
    def comparisons(self) -> AsyncComparisons:
        from .resources.comparisons import AsyncComparisons

        return AsyncComparisons(self)

    @cached_property
    def evaluations(self) -> AsyncPublicEvaluationsResource:
        from .resources.public_evaluations import AsyncPublicEvaluationsResource

        return AsyncPublicEvaluationsResource(self)

    @property
    @override
    def auth_headers(self) -> dict[str, str]:
        """Headers that authenticate a request with this client's API key."""
        return {"x-api-key": self.api_key}

    def copy(
        self,
        *,
        api_key: str | None = None,
        base_url: str | httpx.URL | None = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
        _extra_kwargs: Mapping[str, Any] = {},
    ) -> Self:
        """Return a new client of the same class with selected options replaced.

        Args:
            api_key: Replacement API key; falsy keeps the current one.
            base_url: Replacement base URL; falsy keeps the current one.
            timeout: Replacement timeout. When left at its default the current
                client's timeout is kept; any other value (including ``None``)
                is passed through to the new client unchanged.
            _extra_kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            A new client instance; this instance is not modified.
        """
        return self.__class__(
            api_key=api_key or self.api_key,
            base_url=base_url or self.base_url,
            # Identity check: only an omitted argument falls back to the current
            # timeout. `self.timeout or timeout` silently discarded the override.
            timeout=self.timeout if timeout is DEFAULT_TIMEOUT else timeout,
            **_extra_kwargs,
        )

    with_options = copy

    @override
    def _make_status_error(
        self,
        err_msg: str,
        *,
        body: object,
        response: httpx.Response,
    ) -> APIStatusError:
        return _make_status_error(err_msg, body=body, response=response)
DeleteJudgeResponse, EvaluationsResponse, UpdateJudgeResponse, CostEstimateResponse, CreateTracesResponse, OrganizationResponse, + CreateBenchmarkResponse, TraceEvaluationsResponse, CreateEvaluationsResponse, JudgeOptimizationRunsResponse, @@ -25,8 +27,31 @@ from .judge import Judge, JudgeVersion from .model import Model, CustomModel, PublicModel from .trace import Trace, TraceWithEvaluations, TraceEvaluationSummary +from .public import ( + BenchmarkPrompt, + ComparisonResult, + PublicModelDetail, + ComparisonResponse, + BenchmarkPromptsData, + PublicBenchmarkDetail, + BenchmarkPromptsResponse, + PublicModelsListResponse, + PublicBenchmarksListResponse, +) from .benchmark import Benchmark, CustomBenchmark, PublicBenchmark -from .evaluation import Result, Evaluation, EvaluationStatus +from .evaluation import ( + Result, + Evaluation, + ErrorAnalysis, + AnalysisSummary, + EvaluationMetric, + EvaluationStatus, + EvaluationDataset, + EvaluationSummary, + EvaluationTaskType, + PerformanceDetails, + EvaluationModelInfo, +) from .organization import Project, Organization from .trace_evaluation import ( JudgeSnapshot, @@ -44,9 +69,16 @@ __all__ = [ "ApplyJudgeOptimizationResultResponse", "Benchmark", + "BenchmarkPrompt", + "BenchmarkPromptsData", + "BenchmarkPromptsResponse", "BenchmarksResponse", + "ComparisonResult", + "ComparisonResponse", "CostEstimateResponse", + "CreateBenchmarkResponse", "CreateEvaluationsResponse", + "CreateModelResponse", "CreateJudgeOptimizationRunResponse", "CreateJudgeResponse", "CreateTracesResponse", @@ -54,8 +86,15 @@ "CustomModel", "DeleteJudgeResponse", "EstimateJudgeOptimizationCostResponse", + "AnalysisSummary", + "ErrorAnalysis", "Evaluation", + "EvaluationDataset", + "EvaluationMetric", + "EvaluationModelInfo", "EvaluationStatus", + "EvaluationSummary", + "EvaluationTaskType", "EvaluationsResponse", "Judge", "JudgeOptimizationRun", @@ -70,9 +109,14 @@ "Organization", "OrganizationResponse", "Pagination", + "PerformanceDetails", 
"Project", "PublicBenchmark", + "PublicBenchmarkDetail", + "PublicBenchmarksListResponse", "PublicModel", + "PublicModelDetail", + "PublicModelsListResponse", "Result", "ResultMetrics", "ResultsResponse", diff --git a/src/layerlens/models/api.py b/src/layerlens/models/api.py index 398ebe4..bb36468 100644 --- a/src/layerlens/models/api.py +++ b/src/layerlens/models/api.py @@ -95,6 +95,18 @@ class UploadURLResponse(BaseModel): url: str +class CreateBenchmarkResponse(BaseModel): + organization_id: str + project_id: str + benchmark_id: str + + +class CreateModelResponse(BaseModel): + organization_id: str + project_id: str + model_id: str + + class CreateTracesResponse(BaseModel): trace_ids: List[str] diff --git a/src/layerlens/models/evaluation.py b/src/layerlens/models/evaluation.py index 651c60b..859d09f 100644 --- a/src/layerlens/models/evaluation.py +++ b/src/layerlens/models/evaluation.py @@ -1,7 +1,7 @@ from __future__ import annotations from enum import Enum -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional from datetime import timedelta import httpx @@ -22,17 +22,76 @@ class EvaluationStatus(str, Enum): CANCELLED = "cancelled" +class EvaluationMetric(BaseModel): + name: str + description: str = "" + + +class EvaluationTaskType(BaseModel): + name: str + description: str = "" + + +class EvaluationDataset(BaseModel): + total_size: int = 0 + training_size: int = 0 + test_size: int = 0 + characteristics: List[str] = [] + + +class EvaluationModelInfo(BaseModel): + model_name: str = "" + performance: Any = None + + +class PerformanceDetails(BaseModel): + strengths: List[str] = [] + challenges: List[str] = [] + + +class ErrorAnalysis(BaseModel): + common_failure_modes: List[str] = [] + example: str = "" + + +class AnalysisSummary(BaseModel): + key_takeaways: List[str] = [] + + +class EvaluationSummary(BaseModel): + name: str = "" + goal: str = "" + metrics: List[EvaluationMetric] = [] + task_types: 
List[EvaluationTaskType] = [] + dataset: Optional[EvaluationDataset] = None + model: Optional[EvaluationModelInfo] = None + performance_details: Optional[PerformanceDetails] = None + error_analysis: Optional[ErrorAnalysis] = None + analysis_summary: Optional[AnalysisSummary] = None + + class Evaluation(BaseModel): model_config = ConfigDict(populate_by_name=True) id: str status: EvaluationStatus + status_description: str = "" submitted_at: int finished_at: int model_id: str + model_name: str = "" + model_key: str = "" + model_company: str = "" benchmark_id: str = Field(..., alias="dataset_id") + benchmark_name: str = Field("", alias="dataset_name") average_duration: int accuracy: float + readability_score: float = 0.0 + toxicity_score: float = 0.0 + ethics_score: float = 0.0 + failed_prompt_count: int = 0 + queue_id: int = 0 + summary: Optional[EvaluationSummary] = None _client: "Optional[Stratix | AsyncStratix]" = None @@ -134,9 +193,15 @@ def wait_for_completion( ) if evaluation: self.status = evaluation.status + self.status_description = evaluation.status_description self.finished_at = evaluation.finished_at self.average_duration = evaluation.average_duration self.accuracy = evaluation.accuracy + self.readability_score = evaluation.readability_score + self.toxicity_score = evaluation.toxicity_score + self.ethics_score = evaluation.ethics_score + self.failed_prompt_count = evaluation.failed_prompt_count + self.summary = evaluation.summary return self @@ -156,9 +221,15 @@ async def wait_for_completion_async( ) if evaluation: self.status = evaluation.status + self.status_description = evaluation.status_description self.finished_at = evaluation.finished_at self.average_duration = evaluation.average_duration self.accuracy = evaluation.accuracy + self.readability_score = evaluation.readability_score + self.toxicity_score = evaluation.toxicity_score + self.ethics_score = evaluation.ethics_score + self.failed_prompt_count = evaluation.failed_prompt_count + self.summary = 
evaluation.summary return self diff --git a/src/layerlens/models/public.py b/src/layerlens/models/public.py new file mode 100644 index 0000000..5905a09 --- /dev/null +++ b/src/layerlens/models/public.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import Any, Dict, List, Union, Optional + +from pydantic import BaseModel, ConfigDict + + +class PublicModelDetail(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + id: str + key: str + name: str + description: Optional[str] = None + company: Optional[str] = None + created_at: Optional[Union[int, str]] = None + released_at: Optional[Union[int, str]] = None + parameters: Optional[float] = None + modality: Optional[str] = None + context_length: Optional[int] = None + architecture_type: Optional[str] = None + license: Optional[str] = None + open_weights: Optional[bool] = None + region: Optional[str] = None + key_takeaways: Optional[List[str]] = None + deprecated: Optional[bool] = None + cost_per_input_token: Optional[str] = None + cost_per_output_token: Optional[str] = None + + +class PublicModelsListResponse(BaseModel): + models: List[PublicModelDetail] + categories: List[str] = [] + count: int + total_count: int + + +class PublicBenchmarkDetail(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + id: str + key: str + name: str + description: Optional[str] = None + created_at: Optional[Union[int, str]] = None + prompt_count: Optional[int] = None + language: Optional[str] = None + categories: Optional[List[str]] = None + characteristics: Optional[List[str]] = None + deprecated: Optional[bool] = None + is_public: Optional[bool] = None + + +class PublicBenchmarksListResponse(BaseModel): + datasets: List[PublicBenchmarkDetail] + categories: List[str] = [] + count: int + total_count: int + + +class BenchmarkPrompt(BaseModel): + id: str + input: Union[str, List[Dict[str, Any]], Dict[str, Any]] + truth: str + + +class BenchmarkPromptsData(BaseModel): + prompts: 
MAX_UPLOAD_SIZE = 50 * 1024 * 1024  # 50 MB

# Content types for the upload formats this module names explicitly. They are
# consulted before `mimetypes` so the result for these extensions does not
# depend on the host's MIME database.
_CONTENT_TYPES_BY_EXTENSION = {
    ".jsonl": "application/jsonl",
    ".json": "application/json",
    ".csv": "text/csv",
    ".parquet": "application/x-parquet",
}


def _get_content_type(filename: str) -> str:
    """Return the content type to declare for *filename*.

    The extension is matched case-insensitively against the explicit table
    first; other files fall back to ``mimetypes.guess_type`` and finally to
    ``application/octet-stream``.

    Args:
        filename: File name or path; only its final extension and, for the
            fallback, the ``mimetypes`` guess are used.

    Returns:
        A MIME type string; never empty.
    """
    ext = os.path.splitext(filename)[1].lower()
    known = _CONTENT_TYPES_BY_EXTENSION.get(ext)
    if known is not None:
        return known
    guessed, _ = mimetypes.guess_type(filename)
    return guessed or "application/octet-stream"
= list(dict.fromkeys(current_ids + list(benchmark_ids))) + return self._patch_project_benchmarks(new_ids, timeout) + + def remove( + self, + *benchmark_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Remove benchmarks from the project by their IDs.""" + current = self.get(timeout=timeout) or [] + remove_set = set(benchmark_ids) + new_ids = [b.id for b in current if b.id not in remove_set] + return self._patch_project_benchmarks(new_ids, timeout) + + def _patch_project_benchmarks( + self, + dataset_ids: List[str], + timeout: float | httpx.Timeout | None, + ) -> bool: + url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + resp = self._patch( + url, + body={"datasets": dataset_ids}, + timeout=timeout, + cast_to=dict, + ) + return isinstance(resp, dict) and "id" in resp + + def _upload_file( + self, + file_path: str, + benchmark_name: str, + timeout: float | httpx.Timeout | None, + ) -> str: + """Upload a file and return the filename for use in benchmark creation.""" + file_path = os.path.abspath(file_path) + filename = os.path.basename(file_path) + file_size = os.path.getsize(file_path) + + if file_size > MAX_UPLOAD_SIZE: + raise ValueError(f"File size {file_size} exceeds maximum of {MAX_UPLOAD_SIZE} bytes (50 MB)") + + content_type = _get_content_type(filename) + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + + raw_resp = self._post( + f"{base}/upload", + body={"key": benchmark_name, "filename": filename, "type": content_type, "size": file_size}, + timeout=timeout, + cast_to=dict, + ) + # Unwrap {"status": ..., "data": {...}} envelope if present + resp = raw_resp + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if not isinstance(resp, dict) or "url" not in resp: + raise ValueError("Failed to get upload URL") + + with open(file_path, "rb") as f: + put_resp = httpx.put( + resp["url"], + content=f.read(), 
+ headers={"Content-Type": content_type}, + timeout=timeout if isinstance(timeout, httpx.Timeout) else httpx.Timeout(timeout), + ) + put_resp.raise_for_status() + + return filename + + def create_custom( + self, + *, + name: str, + description: str, + file_path: str, + additional_metrics: Optional[List[str]] = None, + custom_scorer_ids: Optional[List[str]] = None, + input_type: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[CreateBenchmarkResponse]: + """Create a custom benchmark by uploading a JSONL file. + + Args: + name: Benchmark name (max 64 characters). + description: Benchmark description (max 280 characters). + file_path: Path to a JSONL file with benchmark prompts. + additional_metrics: Optional metrics: "readability", "toxicity", "hallucination". + custom_scorer_ids: Optional list of custom scorer IDs. + input_type: Optional input type: "messages" or "json_payload". + timeout: Request timeout override. + + Returns: + CreateBenchmarkResponse with benchmark_id, or None on failure. 
+ """ + filename = self._upload_file(file_path, name, timeout) + + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + body: Dict[str, Any] = {"name": name, "description": description, "file": filename} + if additional_metrics: + body["additional_metrics"] = additional_metrics + if custom_scorer_ids: + body["custom_scorers"] = custom_scorer_ids + if input_type: + body["input_type"] = input_type + + resp = self._post( + f"{base}/custom-benchmarks", + body=body, + timeout=timeout, + cast_to=dict, + ) + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if isinstance(resp, dict) and "benchmark_id" in resp: + return CreateBenchmarkResponse(**resp) + return None + + def create_smart( + self, + *, + name: str, + description: str, + system_prompt: str, + file_paths: List[str], + metrics: Optional[List[str]] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[CreateBenchmarkResponse]: + """Create a smart benchmark from uploaded files. + + The platform will use AI to generate benchmark prompts from the provided files. + + Args: + name: Benchmark name (max 256 characters). + description: Benchmark description (max 500 characters). + system_prompt: System prompt for benchmark generation (max 4000 characters). + file_paths: List of file paths to upload (1-20 files). + metrics: Optional metrics: "readability", "toxicity", "hallucination". + timeout: Request timeout override. + + Returns: + CreateBenchmarkResponse with benchmark_id, or None on failure. 
+ """ + filenames = [] + for fp in file_paths: + filename = self._upload_file(fp, name, timeout) + filenames.append(filename) + + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + body: Dict[str, Any] = { + "name": name, + "description": description, + "system_prompt": system_prompt, + "files": filenames, + } + if metrics: + body["metrics"] = metrics + + resp = self._post( + f"{base}/smart-benchmarks", + body=body, + timeout=timeout, + cast_to=dict, + ) + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if isinstance(resp, dict) and "benchmark_id" in resp: + return CreateBenchmarkResponse(**resp) + return None + class AsyncBenchmarks(AsyncAPIResource): async def get( @@ -188,3 +383,181 @@ async def get_by_key( if benchmark.key == key: return benchmark return None + + async def add( + self, + *benchmark_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Add benchmarks to the project by their IDs.""" + current = await self.get(timeout=timeout) or [] + current_ids = [b.id for b in current] + new_ids = list(dict.fromkeys(current_ids + list(benchmark_ids))) + return await self._patch_project_benchmarks(new_ids, timeout) + + async def remove( + self, + *benchmark_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Remove benchmarks from the project by their IDs.""" + current = await self.get(timeout=timeout) or [] + remove_set = set(benchmark_ids) + new_ids = [b.id for b in current if b.id not in remove_set] + return await self._patch_project_benchmarks(new_ids, timeout) + + async def _patch_project_benchmarks( + self, + dataset_ids: List[str], + timeout: float | httpx.Timeout | None, + ) -> bool: + url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + resp = await self._patch( + url, + body={"datasets": dataset_ids}, + timeout=timeout, + cast_to=dict, + ) + return 
isinstance(resp, dict) and "id" in resp + + async def _upload_file( + self, + file_path: str, + benchmark_name: str, + timeout: float | httpx.Timeout | None, + ) -> str: + """Upload a file and return the filename for use in benchmark creation.""" + file_path = os.path.abspath(file_path) + filename = os.path.basename(file_path) + file_size = os.path.getsize(file_path) + + if file_size > MAX_UPLOAD_SIZE: + raise ValueError(f"File size {file_size} exceeds maximum of {MAX_UPLOAD_SIZE} bytes (50 MB)") + + content_type = _get_content_type(filename) + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + + raw_resp = await self._post( + f"{base}/upload", + body={"key": benchmark_name, "filename": filename, "type": content_type, "size": file_size}, + timeout=timeout, + cast_to=dict, + ) + # Unwrap {"status": ..., "data": {...}} envelope if present + resp = raw_resp + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if not isinstance(resp, dict) or "url" not in resp: + raise ValueError("Failed to get upload URL") + + async with httpx.AsyncClient() as http: + with open(file_path, "rb") as f: + put_resp = await http.put( + resp["url"], + content=f.read(), + headers={"Content-Type": content_type}, + timeout=timeout if isinstance(timeout, httpx.Timeout) else httpx.Timeout(timeout), + ) + put_resp.raise_for_status() + + return filename + + async def create_custom( + self, + *, + name: str, + description: str, + file_path: str, + additional_metrics: Optional[List[str]] = None, + custom_scorer_ids: Optional[List[str]] = None, + input_type: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[CreateBenchmarkResponse]: + """Create a custom benchmark by uploading a JSONL file. + + Args: + name: Benchmark name (max 64 characters). + description: Benchmark description (max 280 characters). + file_path: Path to a JSONL file with benchmark prompts. 
+ additional_metrics: Optional metrics: "readability", "toxicity", "hallucination". + custom_scorer_ids: Optional list of custom scorer IDs. + input_type: Optional input type: "messages" or "json_payload". + timeout: Request timeout override. + + Returns: + CreateBenchmarkResponse with benchmark_id, or None on failure. + """ + filename = await self._upload_file(file_path, name, timeout) + + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + body: Dict[str, Any] = {"name": name, "description": description, "file": filename} + if additional_metrics: + body["additional_metrics"] = additional_metrics + if custom_scorer_ids: + body["custom_scorers"] = custom_scorer_ids + if input_type: + body["input_type"] = input_type + + resp = await self._post( + f"{base}/custom-benchmarks", + body=body, + timeout=timeout, + cast_to=dict, + ) + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if isinstance(resp, dict) and "benchmark_id" in resp: + return CreateBenchmarkResponse(**resp) + return None + + async def create_smart( + self, + *, + name: str, + description: str, + system_prompt: str, + file_paths: List[str], + metrics: Optional[List[str]] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[CreateBenchmarkResponse]: + """Create a smart benchmark from uploaded files. + + The platform will use AI to generate benchmark prompts from the provided files. + + Args: + name: Benchmark name (max 256 characters). + description: Benchmark description (max 500 characters). + system_prompt: System prompt for benchmark generation (max 4000 characters). + file_paths: List of file paths to upload (1-20 files). + metrics: Optional metrics: "readability", "toxicity", "hallucination". + timeout: Request timeout override. + + Returns: + CreateBenchmarkResponse with benchmark_id, or None on failure. 
from __future__ import annotations

from typing import Literal, Optional

import httpx

from ...models import ComparisonResponse
from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource
from ..._constants import DEFAULT_TIMEOUT


def _comparison_query(
    evaluation_id_1: str,
    evaluation_id_2: str,
    page: Optional[int],
    page_size: Optional[int],
    outcome_filter: Optional[str],
    search: Optional[str],
) -> dict[str, str]:
    """Assemble the query string for the comparison endpoint.

    The two evaluation IDs are always sent. ``page`` and ``pageSize`` are sent
    whenever they are not ``None``; ``outcomeFilter`` and ``search`` only when
    they are non-empty.
    """
    query = {
        "evaluation_id_1": evaluation_id_1,
        "evaluation_id_2": evaluation_id_2,
    }
    candidates = (
        ("page", None if page is None else str(page)),
        ("pageSize", None if page_size is None else str(page_size)),
        ("outcomeFilter", outcome_filter or None),
        ("search", search or None),
    )
    query.update((key, value) for key, value in candidates if value is not None)
    return query


class Comparisons(SyncPublicAPIResource):
    def compare(
        self,
        *,
        evaluation_id_1: str,
        evaluation_id_2: str,
        page: Optional[int] = None,
        page_size: Optional[int] = None,
        outcome_filter: Optional[
            Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
        ] = None,
        search: Optional[str] = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> Optional[ComparisonResponse]:
        """Fetch the comparison of two evaluations.

        Args:
            evaluation_id_1: ID of the first evaluation.
            evaluation_id_2: ID of the second evaluation.
            page: Optional page number, sent as ``page``.
            page_size: Optional page size, sent as ``pageSize``.
            outcome_filter: Optional outcome filter, sent as ``outcomeFilter``.
            search: Optional search text, sent as ``search``.
            timeout: Request timeout.

        Returns:
            The validated ``ComparisonResponse``, or ``None`` when the
            response is not a dict.
        """
        payload = self._get(
            "/results/comparison",
            params=_comparison_query(evaluation_id_1, evaluation_id_2, page, page_size, outcome_filter, search),
            timeout=timeout,
            cast_to=dict,
        )
        return ComparisonResponse.model_validate(payload) if isinstance(payload, dict) else None


class AsyncComparisons(AsyncPublicAPIResource):
    async def compare(
        self,
        *,
        evaluation_id_1: str,
        evaluation_id_2: str,
        page: Optional[int] = None,
        page_size: Optional[int] = None,
        outcome_filter: Optional[
            Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"]
        ] = None,
        search: Optional[str] = None,
        timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
    ) -> Optional[ComparisonResponse]:
        """Fetch the comparison of two evaluations (async).

        Args:
            evaluation_id_1: ID of the first evaluation.
            evaluation_id_2: ID of the second evaluation.
            page: Optional page number, sent as ``page``.
            page_size: Optional page size, sent as ``pageSize``.
            outcome_filter: Optional outcome filter, sent as ``outcomeFilter``.
            search: Optional search text, sent as ``search``.
            timeout: Request timeout.

        Returns:
            The validated ``ComparisonResponse``, or ``None`` when the
            response is not a dict.
        """
        payload = await self._get(
            "/results/comparison",
            params=_comparison_query(evaluation_id_1, evaluation_id_2, page, page_size, outcome_filter, search),
            timeout=timeout,
            cast_to=dict,
        )
        return ComparisonResponse.model_validate(payload) if isinstance(payload, dict) else None
CreateEvaluationsResponse, ) @@ -80,14 +81,24 @@ def get_many( *, page: Optional[int] = None, page_size: Optional[int] = None, + sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None, + order: Optional[Literal["asc", "desc"]] = None, + model_ids: Optional[List[str]] = None, + benchmark_ids: Optional[List[str]] = None, + status: Optional[EvaluationStatus] = None, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, ) -> Optional[EvaluationsResponse]: """ - Get evaluations with optional pagination. + Get evaluations with optional pagination, sorting, and filtering. Args: page: Page number for pagination (1-based, defaults to 1 if not provided) page_size: Number of evaluations per page (default: 100, optional) + sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration) + order: Sort order (asc or desc) + model_ids: Filter by model IDs + benchmark_ids: Filter by benchmark/dataset IDs + status: Filter by evaluation status timeout: Request timeout Returns: @@ -104,6 +115,17 @@ def get_many( params["page"] = str(effective_page) params["pageSize"] = str(effective_page_size) + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if model_ids: + params["models"] = ",".join(model_ids) + if benchmark_ids: + params["datasets"] = ",".join(benchmark_ids) + if status: + params["status"] = status.value + resp = self._get( f"/evaluations", params=params, @@ -214,14 +236,24 @@ async def get_many( *, page: Optional[int] = None, page_size: Optional[int] = None, + sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None, + order: Optional[Literal["asc", "desc"]] = None, + model_ids: Optional[List[str]] = None, + benchmark_ids: Optional[List[str]] = None, + status: Optional[EvaluationStatus] = None, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, ) -> Optional[EvaluationsResponse]: """ - Get evaluations with optional pagination. 
+ Get evaluations with optional pagination, sorting, and filtering. Args: page: Page number for pagination (1-based, defaults to 1 if not provided) page_size: Number of evaluations per page (default: 100, optional) + sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration) + order: Sort order (asc or desc) + model_ids: Filter by model IDs + benchmark_ids: Filter by benchmark/dataset IDs + status: Filter by evaluation status timeout: Request timeout Returns: @@ -238,6 +270,17 @@ async def get_many( params["page"] = str(effective_page) params["pageSize"] = str(effective_page_size) + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if model_ids: + params["models"] = ",".join(model_ids) + if benchmark_ids: + params["datasets"] = ",".join(benchmark_ids) + if status: + params["status"] = status.value + resp = await self._get( f"/evaluations", params=params, diff --git a/src/layerlens/resources/models/models.py b/src/layerlens/resources/models/models.py index 7e0cd70..122a3ae 100644 --- a/src/layerlens/resources/models/models.py +++ b/src/layerlens/resources/models/models.py @@ -1,10 +1,10 @@ from __future__ import annotations -from typing import List, Literal, Optional +from typing import Any, Dict, List, Literal, Optional import httpx -from ...models import Model, CustomModel, PublicModel, ModelsResponse +from ...models import Model, CustomModel, PublicModel, ModelsResponse, CreateModelResponse from ..._resource import SyncAPIResource, AsyncAPIResource from ..._constants import DEFAULT_TIMEOUT @@ -104,6 +104,90 @@ def get_by_key( return model return None + def add( + self, + *model_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Add models to the project by their IDs.""" + current = self.get(timeout=timeout) or [] + current_ids = [m.id for m in current] + new_ids = list(dict.fromkeys(current_ids + list(model_ids))) + return self._patch_project_models(new_ids, timeout) + + def 
remove( + self, + *model_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Remove models from the project by their IDs.""" + current = self.get(timeout=timeout) or [] + remove_set = set(model_ids) + new_ids = [m.id for m in current if m.id not in remove_set] + return self._patch_project_models(new_ids, timeout) + + def _patch_project_models( + self, + model_ids: List[str], + timeout: float | httpx.Timeout | None, + ) -> bool: + url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + resp = self._patch( + url, + body={"models": model_ids}, + timeout=timeout, + cast_to=dict, + ) + return isinstance(resp, dict) and "id" in resp + + def create_custom( + self, + *, + name: str, + key: str, + description: str, + api_url: str, + max_tokens: int, + api_key: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[CreateModelResponse]: + """Create a custom model backed by an OpenAI-compatible API. + + Args: + name: Model name (max 256 characters). + key: Unique model key, lowercase alphanumeric with dots/hyphens/slashes (max 256 characters). + description: Model description (max 500 characters). + api_url: Base URL of the OpenAI-compatible API endpoint. + max_tokens: Maximum number of tokens the model supports. + api_key: Optional API key for the model provider. + timeout: Request timeout override. + + Returns: + CreateModelResponse with model_id, or None on failure. 
+ """ + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + body: Dict[str, Any] = { + "name": name, + "key": key, + "description": description, + "api_url": api_url, + "max_tokens": max_tokens, + } + if api_key is not None: + body["api_key"] = api_key + + resp = self._post( + f"{base}/custom-models", + body=body, + timeout=timeout, + cast_to=dict, + ) + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if isinstance(resp, dict) and "model_id" in resp: + return CreateModelResponse(**resp) + return None + class AsyncModels(AsyncAPIResource): async def get( @@ -199,3 +283,87 @@ async def get_by_key( if model.key == key: return model return None + + async def add( + self, + *model_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Add models to the project by their IDs.""" + current = await self.get(timeout=timeout) or [] + current_ids = [m.id for m in current] + new_ids = list(dict.fromkeys(current_ids + list(model_ids))) + return await self._patch_project_models(new_ids, timeout) + + async def remove( + self, + *model_ids: str, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Remove models from the project by their IDs.""" + current = await self.get(timeout=timeout) or [] + remove_set = set(model_ids) + new_ids = [m.id for m in current if m.id not in remove_set] + return await self._patch_project_models(new_ids, timeout) + + async def _patch_project_models( + self, + model_ids: List[str], + timeout: float | httpx.Timeout | None, + ) -> bool: + url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + resp = await self._patch( + url, + body={"models": model_ids}, + timeout=timeout, + cast_to=dict, + ) + return isinstance(resp, dict) and "id" in resp + + async def create_custom( + self, + *, + name: str, + key: str, + description: str, + api_url: str, + max_tokens: int, + api_key: 
Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[CreateModelResponse]: + """Create a custom model backed by an OpenAI-compatible API. + + Args: + name: Model name (max 256 characters). + key: Unique model key, lowercase alphanumeric with dots/hyphens/slashes (max 256 characters). + description: Model description (max 500 characters). + api_url: Base URL of the OpenAI-compatible API endpoint. + max_tokens: Maximum number of tokens the model supports. + api_key: Optional API key for the model provider. + timeout: Request timeout override. + + Returns: + CreateModelResponse with model_id, or None on failure. + """ + base = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}" + body: Dict[str, Any] = { + "name": name, + "key": key, + "description": description, + "api_url": api_url, + "max_tokens": max_tokens, + } + if api_key is not None: + body["api_key"] = api_key + + resp = await self._post( + f"{base}/custom-models", + body=body, + timeout=timeout, + cast_to=dict, + ) + if isinstance(resp, dict) and "data" in resp and "status" in resp: + resp = resp["data"] + if isinstance(resp, dict) and "model_id" in resp: + return CreateModelResponse(**resp) + return None diff --git a/src/layerlens/resources/public_benchmarks/__init__.py b/src/layerlens/resources/public_benchmarks/__init__.py new file mode 100644 index 0000000..52ea1d4 --- /dev/null +++ b/src/layerlens/resources/public_benchmarks/__init__.py @@ -0,0 +1,3 @@ +from .public_benchmarks import PublicBenchmarksResource, AsyncPublicBenchmarksResource + +__all__ = ["PublicBenchmarksResource", "AsyncPublicBenchmarksResource"] diff --git a/src/layerlens/resources/public_benchmarks/public_benchmarks.py b/src/layerlens/resources/public_benchmarks/public_benchmarks.py new file mode 100644 index 0000000..f73d75e --- /dev/null +++ b/src/layerlens/resources/public_benchmarks/public_benchmarks.py @@ -0,0 +1,263 @@ +from __future__ import annotations 
+ +import math +from typing import List, Literal, Optional + +import httpx + +from ...models import ( + BenchmarkPrompt, + BenchmarkPromptsResponse, + PublicBenchmarksListResponse, +) +from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource +from ..._constants import DEFAULT_TIMEOUT + +DEFAULT_PROMPTS_PAGE_SIZE = 100 +MAX_PROMPTS_PAGE_SIZE = 500 + + +class PublicBenchmarksResource(SyncPublicAPIResource): + def get( + self, + *, + query: Optional[str] = None, + name: Optional[str] = None, + key: Optional[str] = None, + ids: Optional[List[str]] = None, + categories: Optional[List[str]] = None, + languages: Optional[List[str]] = None, + sort_by: Optional[Literal["name"]] = None, + order: Optional[Literal["asc", "desc"]] = None, + page: Optional[int] = None, + page_size: Optional[int] = None, + include_deprecated: Optional[bool] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[PublicBenchmarksListResponse]: + params = {} + if query: + params["query"] = query + if name: + params["name"] = name + if key: + params["key"] = key + if ids: + params["ids"] = ",".join(ids) + if categories: + params["categories"] = ",".join(categories) + if languages: + params["languages"] = ",".join(languages) + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if page is not None: + params["page"] = str(page) + if page_size is not None: + params["pageSize"] = str(page_size) + if include_deprecated is not None: + params["include_deprecated"] = str(include_deprecated).lower() + + resp = self._get( + "/datasets", + params=params, + timeout=timeout, + cast_to=dict, + ) + + if not isinstance(resp, dict): + return None + + return PublicBenchmarksListResponse.model_validate(resp) + + def get_prompts( + self, + benchmark_id: str, + *, + page: Optional[int] = None, + page_size: Optional[int] = None, + search_field: Optional[Literal["id", "input", "truth"]] = None, + search_value: Optional[str] = None, + sort_by: 
Optional[Literal["id", "input", "truth"]] = None, + sort_order: Optional[Literal["asc", "desc"]] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[BenchmarkPromptsResponse]: + params = {} + if page is not None: + params["page"] = str(page) + if page_size is not None: + params["pageSize"] = str(page_size) + if search_field: + params["search"] = search_field + if search_value: + params["searchValue"] = search_value + if sort_by: + params["sortBy"] = sort_by + if sort_order: + params["sortOrder"] = sort_order + + resp = self._get( + f"/datasets/{benchmark_id}/prompts", + params=params, + timeout=timeout, + cast_to=dict, + ) + + if not isinstance(resp, dict): + return None + + return BenchmarkPromptsResponse.model_validate(resp) + + def get_all_prompts( + self, + benchmark_id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> List[BenchmarkPrompt]: + all_prompts: List[BenchmarkPrompt] = [] + page = 1 + page_size = DEFAULT_PROMPTS_PAGE_SIZE + + while True: + resp = self.get_prompts( + benchmark_id, + page=page, + page_size=page_size, + timeout=timeout, + ) + if resp is None or not resp.data.prompts: + break + + all_prompts.extend(resp.data.prompts) + + total_count = resp.data.count + total_pages = math.ceil(total_count / page_size) if total_count > 0 else 0 + if page >= total_pages: + break + + page += 1 + + return all_prompts + + +class AsyncPublicBenchmarksResource(AsyncPublicAPIResource): + async def get( + self, + *, + query: Optional[str] = None, + name: Optional[str] = None, + key: Optional[str] = None, + ids: Optional[List[str]] = None, + categories: Optional[List[str]] = None, + languages: Optional[List[str]] = None, + sort_by: Optional[Literal["name"]] = None, + order: Optional[Literal["asc", "desc"]] = None, + page: Optional[int] = None, + page_size: Optional[int] = None, + include_deprecated: Optional[bool] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> 
Optional[PublicBenchmarksListResponse]: + params = {} + if query: + params["query"] = query + if name: + params["name"] = name + if key: + params["key"] = key + if ids: + params["ids"] = ",".join(ids) + if categories: + params["categories"] = ",".join(categories) + if languages: + params["languages"] = ",".join(languages) + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if page is not None: + params["page"] = str(page) + if page_size is not None: + params["pageSize"] = str(page_size) + if include_deprecated is not None: + params["include_deprecated"] = str(include_deprecated).lower() + + resp = await self._get( + "/datasets", + params=params, + timeout=timeout, + cast_to=dict, + ) + + if not isinstance(resp, dict): + return None + + return PublicBenchmarksListResponse.model_validate(resp) + + async def get_prompts( + self, + benchmark_id: str, + *, + page: Optional[int] = None, + page_size: Optional[int] = None, + search_field: Optional[Literal["id", "input", "truth"]] = None, + search_value: Optional[str] = None, + sort_by: Optional[Literal["id", "input", "truth"]] = None, + sort_order: Optional[Literal["asc", "desc"]] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[BenchmarkPromptsResponse]: + params = {} + if page is not None: + params["page"] = str(page) + if page_size is not None: + params["pageSize"] = str(page_size) + if search_field: + params["search"] = search_field + if search_value: + params["searchValue"] = search_value + if sort_by: + params["sortBy"] = sort_by + if sort_order: + params["sortOrder"] = sort_order + + resp = await self._get( + f"/datasets/{benchmark_id}/prompts", + params=params, + timeout=timeout, + cast_to=dict, + ) + + if not isinstance(resp, dict): + return None + + return BenchmarkPromptsResponse.model_validate(resp) + + async def get_all_prompts( + self, + benchmark_id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> 
List[BenchmarkPrompt]: + all_prompts: List[BenchmarkPrompt] = [] + page = 1 + page_size = DEFAULT_PROMPTS_PAGE_SIZE + + while True: + resp = await self.get_prompts( + benchmark_id, + page=page, + page_size=page_size, + timeout=timeout, + ) + if resp is None or not resp.data.prompts: + break + + all_prompts.extend(resp.data.prompts) + + total_count = resp.data.count + total_pages = math.ceil(total_count / page_size) if total_count > 0 else 0 + if page >= total_pages: + break + + page += 1 + + return all_prompts diff --git a/src/layerlens/resources/public_evaluations/__init__.py b/src/layerlens/resources/public_evaluations/__init__.py new file mode 100644 index 0000000..e1b1781 --- /dev/null +++ b/src/layerlens/resources/public_evaluations/__init__.py @@ -0,0 +1,3 @@ +from .public_evaluations import PublicEvaluationsResource, AsyncPublicEvaluationsResource + +__all__ = ["PublicEvaluationsResource", "AsyncPublicEvaluationsResource"] diff --git a/src/layerlens/resources/public_evaluations/public_evaluations.py b/src/layerlens/resources/public_evaluations/public_evaluations.py new file mode 100644 index 0000000..ddd1cdf --- /dev/null +++ b/src/layerlens/resources/public_evaluations/public_evaluations.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import math +from typing import List, Literal, Optional + +import httpx + +from ...models import ( + Evaluation, + EvaluationStatus, + EvaluationsResponse, +) +from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource +from ..._constants import DEFAULT_TIMEOUT + +DEFAULT_PAGE = 1 +DEFAULT_PAGE_SIZE = 100 +MAX_PAGE_SIZE = 500 + + +class PublicEvaluationsResource(SyncPublicAPIResource): + def get_by_id( + self, + id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[Evaluation]: + evaluation = self._get( + f"/evaluations/{id}", + timeout=timeout, + cast_to=Evaluation, + ) + if isinstance(evaluation, Evaluation): + return evaluation + return None + + def get_many( + 
self, + *, + organization_id: str, + project_id: str, + page: Optional[int] = None, + page_size: Optional[int] = None, + sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None, + order: Optional[Literal["asc", "desc"]] = None, + model_ids: Optional[List[str]] = None, + benchmark_ids: Optional[List[str]] = None, + status: Optional[EvaluationStatus] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationsResponse]: + """ + Get evaluations with optional pagination, sorting, and filtering. + + Args: + organization_id: Organization ID (required) + project_id: Project ID (required) + page: Page number for pagination (1-based, defaults to 1 if not provided) + page_size: Number of evaluations per page (default: 100, optional) + sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration) + order: Sort order (asc or desc) + model_ids: Filter by model IDs + benchmark_ids: Filter by benchmark/dataset IDs + status: Filter by evaluation status + timeout: Request timeout + + Returns: + EvaluationsResponse object or None + """ + params = { + "organizationID": organization_id, + "projectID": project_id, + } + + effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE + effective_page = page if page is not None else DEFAULT_PAGE + + params["page"] = str(effective_page) + params["pageSize"] = str(effective_page_size) + + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if model_ids: + params["models"] = ",".join(model_ids) + if benchmark_ids: + params["datasets"] = ",".join(benchmark_ids) + if status: + params["status"] = status.value + + resp = self._get( + "/evaluations", + params=params, + timeout=timeout, + cast_to=dict, + ) + if not resp or not isinstance(resp, dict): + return None + + evaluations = [e if isinstance(e, Evaluation) else Evaluation(**e) for e in resp.get("evaluations", [])] + + total_count = 
resp.get("total_count", 0) + total_pages = math.ceil(total_count / effective_page_size) if total_count > 0 and effective_page_size > 0 else 0 + + resp_with_pagination = { + "evaluations": evaluations, + "pagination": { + "page": effective_page, + "page_size": effective_page_size, + "total_pages": total_pages, + "total_count": total_count, + }, + } + + try: + return EvaluationsResponse.model_validate(resp_with_pagination) + except Exception: + return None + + +class AsyncPublicEvaluationsResource(AsyncPublicAPIResource): + async def get_by_id( + self, + id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[Evaluation]: + evaluation = await self._get( + f"/evaluations/{id}", + timeout=timeout, + cast_to=Evaluation, + ) + if isinstance(evaluation, Evaluation): + return evaluation + return None + + async def get_many( + self, + *, + organization_id: str, + project_id: str, + page: Optional[int] = None, + page_size: Optional[int] = None, + sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None, + order: Optional[Literal["asc", "desc"]] = None, + model_ids: Optional[List[str]] = None, + benchmark_ids: Optional[List[str]] = None, + status: Optional[EvaluationStatus] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[EvaluationsResponse]: + """ + Get evaluations with optional pagination, sorting, and filtering. 
+ + Args: + organization_id: Organization ID (required) + project_id: Project ID (required) + page: Page number for pagination (1-based, defaults to 1 if not provided) + page_size: Number of evaluations per page (default: 100, optional) + sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration) + order: Sort order (asc or desc) + model_ids: Filter by model IDs + benchmark_ids: Filter by benchmark/dataset IDs + status: Filter by evaluation status + timeout: Request timeout + + Returns: + EvaluationsResponse object or None + """ + params = { + "organizationID": organization_id, + "projectID": project_id, + } + + effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE + effective_page = page if page is not None else DEFAULT_PAGE + + params["page"] = str(effective_page) + params["pageSize"] = str(effective_page_size) + + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if model_ids: + params["models"] = ",".join(model_ids) + if benchmark_ids: + params["datasets"] = ",".join(benchmark_ids) + if status: + params["status"] = status.value + + resp = await self._get( + "/evaluations", + params=params, + timeout=timeout, + cast_to=dict, + ) + if not resp or not isinstance(resp, dict): + return None + + evaluations = [e if isinstance(e, Evaluation) else Evaluation(**e) for e in resp.get("evaluations", [])] + + total_count = resp.get("total_count", 0) + total_pages = math.ceil(total_count / effective_page_size) if total_count > 0 and effective_page_size > 0 else 0 + + resp_with_pagination = { + "evaluations": evaluations, + "pagination": { + "page": effective_page, + "page_size": effective_page_size, + "total_pages": total_pages, + "total_count": total_count, + }, + } + + try: + return EvaluationsResponse.model_validate(resp_with_pagination) + except Exception: + return None diff --git a/src/layerlens/resources/public_models/__init__.py 
b/src/layerlens/resources/public_models/__init__.py new file mode 100644 index 0000000..cc73f53 --- /dev/null +++ b/src/layerlens/resources/public_models/__init__.py @@ -0,0 +1,3 @@ +from .public_models import PublicModelsResource, AsyncPublicModelsResource + +__all__ = ["PublicModelsResource", "AsyncPublicModelsResource"] diff --git a/src/layerlens/resources/public_models/public_models.py b/src/layerlens/resources/public_models/public_models.py new file mode 100644 index 0000000..3b23b41 --- /dev/null +++ b/src/layerlens/resources/public_models/public_models.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +from typing import List, Literal, Optional + +import httpx + +from ...models import PublicModelsListResponse +from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource +from ..._constants import DEFAULT_TIMEOUT + + +class PublicModelsResource(SyncPublicAPIResource): + def get( + self, + *, + query: Optional[str] = None, + name: Optional[str] = None, + key: Optional[str] = None, + ids: Optional[List[str]] = None, + categories: Optional[List[str]] = None, + companies: Optional[List[str]] = None, + regions: Optional[List[str]] = None, + licenses: Optional[List[str]] = None, + sizes: Optional[List[str]] = None, + sort_by: Optional[ + Literal["name", "createdAt", "releasedAt", "architectureType", "contextLength", "license", "region"] + ] = None, + order: Optional[Literal["asc", "desc"]] = None, + page: Optional[int] = None, + page_size: Optional[int] = None, + include_deprecated: Optional[bool] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[PublicModelsListResponse]: + params = {} + if query: + params["query"] = query + if name: + params["name"] = name + if key: + params["key"] = key + if ids: + params["ids"] = ",".join(ids) + if categories: + params["categories"] = ",".join(categories) + if companies: + params["companies"] = ",".join(companies) + if regions: + params["regions"] = ",".join(regions) + if 
licenses: + params["licenses"] = ",".join(licenses) + if sizes: + params["sizes"] = ",".join(sizes) + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if page is not None: + params["page"] = str(page) + if page_size is not None: + params["pageSize"] = str(page_size) + if include_deprecated is not None: + params["include_deprecated"] = str(include_deprecated).lower() + + resp = self._get( + "/models", + params=params, + timeout=timeout, + cast_to=dict, + ) + + if not isinstance(resp, dict): + return None + + return PublicModelsListResponse.model_validate(resp) + + +class AsyncPublicModelsResource(AsyncPublicAPIResource): + async def get( + self, + *, + query: Optional[str] = None, + name: Optional[str] = None, + key: Optional[str] = None, + ids: Optional[List[str]] = None, + categories: Optional[List[str]] = None, + companies: Optional[List[str]] = None, + regions: Optional[List[str]] = None, + licenses: Optional[List[str]] = None, + sizes: Optional[List[str]] = None, + sort_by: Optional[ + Literal["name", "createdAt", "releasedAt", "architectureType", "contextLength", "license", "region"] + ] = None, + order: Optional[Literal["asc", "desc"]] = None, + page: Optional[int] = None, + page_size: Optional[int] = None, + include_deprecated: Optional[bool] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[PublicModelsListResponse]: + params = {} + if query: + params["query"] = query + if name: + params["name"] = name + if key: + params["key"] = key + if ids: + params["ids"] = ",".join(ids) + if categories: + params["categories"] = ",".join(categories) + if companies: + params["companies"] = ",".join(companies) + if regions: + params["regions"] = ",".join(regions) + if licenses: + params["licenses"] = ",".join(licenses) + if sizes: + params["sizes"] = ",".join(sizes) + if sort_by: + params["sortBy"] = sort_by + if order: + params["order"] = order + if page is not None: + params["page"] = str(page) + if 
page_size is not None: + params["pageSize"] = str(page_size) + if include_deprecated is not None: + params["include_deprecated"] = str(include_deprecated).lower() + + resp = await self._get( + "/models", + params=params, + timeout=timeout, + cast_to=dict, + ) + + if not isinstance(resp, dict): + return None + + return PublicModelsListResponse.model_validate(resp) diff --git a/tests/resources/test_benchmarks.py b/tests/resources/test_benchmarks.py index 2155105..9e03e22 100644 --- a/tests/resources/test_benchmarks.py +++ b/tests/resources/test_benchmarks.py @@ -1,4 +1,4 @@ -from unittest.mock import Mock, call +from unittest.mock import Mock, call, patch import httpx import pytest @@ -8,6 +8,7 @@ CustomBenchmark, PublicBenchmark, BenchmarksResponse, + CreateBenchmarkResponse, ) from layerlens._constants import DEFAULT_TIMEOUT from layerlens.resources.benchmarks.benchmarks import Benchmarks @@ -445,3 +446,533 @@ def test_get_benchmarks_mixed_benchmark_types(self, benchmarks_resource): assert isinstance(result[1], PublicBenchmark) assert result[0].key == "my-bench" assert result[1].key == "mmlu" + + +class TestBenchmarksAdd: + """Test Benchmarks.add() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.patch_cast = Mock() + return client + + @pytest.fixture + def benchmarks_resource(self, mock_client): + return Benchmarks(mock_client) + + def test_add_single_benchmark(self, benchmarks_resource): + """add() merges new ID with current benchmarks and PATCHes.""" + existing = PublicBenchmark(id="b1", key="b1", name="B1") + benchmarks_resource.get = Mock(return_value=[existing]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + result = benchmarks_resource.add("b2") + + assert result is True + benchmarks_resource._patch.assert_called_once_with( + "/organizations/org-123/projects/proj-456", + body={"datasets": ["b1", "b2"]}, + 
timeout=DEFAULT_TIMEOUT, + cast_to=dict, + ) + + def test_add_multiple_benchmarks(self, benchmarks_resource): + """add() handles multiple benchmark IDs.""" + benchmarks_resource.get = Mock(return_value=[]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + result = benchmarks_resource.add("b1", "b2", "b3") + + assert result is True + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert call_body == {"datasets": ["b1", "b2", "b3"]} + + def test_add_deduplicates(self, benchmarks_resource): + """add() deduplicates IDs already in the project.""" + existing = PublicBenchmark(id="b1", key="b1", name="B1") + benchmarks_resource.get = Mock(return_value=[existing]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + benchmarks_resource.add("b1", "b2") + + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert call_body == {"datasets": ["b1", "b2"]} + + def test_add_returns_false_on_failure(self, benchmarks_resource): + """add() returns False when PATCH fails.""" + benchmarks_resource.get = Mock(return_value=[]) + benchmarks_resource._patch.return_value = "error" + + result = benchmarks_resource.add("b1") + + assert result is False + + def test_add_with_none_get_response(self, benchmarks_resource): + """add() handles None from get() gracefully.""" + benchmarks_resource.get = Mock(return_value=None) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + result = benchmarks_resource.add("b1") + + assert result is True + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert call_body == {"datasets": ["b1"]} + + def test_add_uses_datasets_field(self, benchmarks_resource): + """add() sends 'datasets' (not 'benchmarks') in the PATCH body.""" + benchmarks_resource.get = Mock(return_value=[]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + benchmarks_resource.add("b1") + + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert "datasets" in 
call_body + assert "benchmarks" not in call_body + + +class TestBenchmarksRemove: + """Test Benchmarks.remove() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.patch_cast = Mock() + return client + + @pytest.fixture + def benchmarks_resource(self, mock_client): + return Benchmarks(mock_client) + + def test_remove_single_benchmark(self, benchmarks_resource): + """remove() removes specified ID and PATCHes remaining.""" + b1 = PublicBenchmark(id="b1", key="b1", name="B1") + b2 = PublicBenchmark(id="b2", key="b2", name="B2") + benchmarks_resource.get = Mock(return_value=[b1, b2]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + result = benchmarks_resource.remove("b1") + + assert result is True + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert call_body == {"datasets": ["b2"]} + + def test_remove_multiple_benchmarks(self, benchmarks_resource): + """remove() handles removing multiple IDs.""" + b1 = PublicBenchmark(id="b1", key="b1", name="B1") + b2 = PublicBenchmark(id="b2", key="b2", name="B2") + b3 = PublicBenchmark(id="b3", key="b3", name="B3") + benchmarks_resource.get = Mock(return_value=[b1, b2, b3]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + benchmarks_resource.remove("b1", "b3") + + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert call_body == {"datasets": ["b2"]} + + def test_remove_nonexistent_id(self, benchmarks_resource): + """remove() ignores IDs that aren't in the project.""" + b1 = PublicBenchmark(id="b1", key="b1", name="B1") + benchmarks_resource.get = Mock(return_value=[b1]) + benchmarks_resource._patch.return_value = {"id": "proj-456"} + + benchmarks_resource.remove("nonexistent") + + call_body = benchmarks_resource._patch.call_args.kwargs["body"] + assert call_body == {"datasets": ["b1"]} + + def 
test_remove_returns_false_on_failure(self, benchmarks_resource): + """remove() returns False when PATCH fails.""" + benchmarks_resource.get = Mock(return_value=[]) + benchmarks_resource._patch.return_value = None + + result = benchmarks_resource.remove("b1") + + assert result is False + + +class TestBenchmarksCreateCustom: + """Test Benchmarks.create_custom() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.post_cast = Mock() + return client + + @pytest.fixture + def benchmarks_resource(self, mock_client): + return Benchmarks(mock_client) + + @pytest.fixture + def tmp_jsonl(self, tmp_path): + """Create a temporary JSONL file.""" + f = tmp_path / "test.jsonl" + f.write_text('{"input": "What is 2+2?", "truth": "4"}\n') + return str(f) + + def test_create_custom_success_with_envelope(self, benchmarks_resource, tmp_jsonl): + """create_custom() unwraps envelope and returns CreateBenchmarkResponse.""" + # Mock _upload_file to skip actual upload + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + benchmarks_resource._post.return_value = { + "status": "success", + "data": { + "benchmark_id": "bench-123", + "organization_id": "org-123", + "project_id": "proj-456", + }, + } + + result = benchmarks_resource.create_custom( + name="Test Benchmark", + description="Test description", + file_path=tmp_jsonl, + ) + + assert isinstance(result, CreateBenchmarkResponse) + assert result.benchmark_id == "bench-123" + assert result.organization_id == "org-123" + + def test_create_custom_success_without_envelope(self, benchmarks_resource, tmp_jsonl): + """create_custom() works when response has no envelope.""" + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + benchmarks_resource._post.return_value = { + "benchmark_id": "bench-123", + "organization_id": "org-123", + "project_id": "proj-456", + } + + result = 
benchmarks_resource.create_custom( + name="Test", + description="Test", + file_path=tmp_jsonl, + ) + + assert isinstance(result, CreateBenchmarkResponse) + assert result.benchmark_id == "bench-123" + + def test_create_custom_sends_correct_body(self, benchmarks_resource, tmp_jsonl): + """create_custom() sends all fields in the request body.""" + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_custom( + name="My Bench", + description="A benchmark", + file_path=tmp_jsonl, + additional_metrics=["toxicity", "readability"], + custom_scorer_ids=["scorer-1"], + input_type="messages", + ) + + call_kwargs = benchmarks_resource._post.call_args.kwargs + assert call_kwargs["body"] == { + "name": "My Bench", + "description": "A benchmark", + "file": "test.jsonl", + "additional_metrics": ["toxicity", "readability"], + "custom_scorers": ["scorer-1"], + "input_type": "messages", + } + + def test_create_custom_omits_optional_fields(self, benchmarks_resource, tmp_jsonl): + """create_custom() does not include optional fields when not provided.""" + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_custom( + name="Bench", + description="Desc", + file_path=tmp_jsonl, + ) + + call_body = benchmarks_resource._post.call_args.kwargs["body"] + assert "additional_metrics" not in call_body + assert "custom_scorers" not in call_body + assert "input_type" not in call_body + + def test_create_custom_correct_url(self, benchmarks_resource, tmp_jsonl): + """create_custom() posts to the correct endpoint.""" + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + 
benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_custom( + name="B", + description="D", + file_path=tmp_jsonl, + ) + + call_args = benchmarks_resource._post.call_args + assert call_args[0][0] == "/organizations/org-123/projects/proj-456/custom-benchmarks" + + def test_create_custom_returns_none_on_failure(self, benchmarks_resource, tmp_jsonl): + """create_custom() returns None when response is unexpected.""" + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + benchmarks_resource._post.return_value = "not-a-dict" + + result = benchmarks_resource.create_custom( + name="B", + description="D", + file_path=tmp_jsonl, + ) + + assert result is None + + def test_create_custom_calls_upload_file(self, benchmarks_resource, tmp_jsonl): + """create_custom() calls _upload_file with correct args.""" + benchmarks_resource._upload_file = Mock(return_value="test.jsonl") + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_custom( + name="My Bench", + description="Desc", + file_path=tmp_jsonl, + ) + + benchmarks_resource._upload_file.assert_called_once_with(tmp_jsonl, "My Bench", DEFAULT_TIMEOUT) + + +class TestBenchmarksCreateSmart: + """Test Benchmarks.create_smart() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.post_cast = Mock() + return client + + @pytest.fixture + def benchmarks_resource(self, mock_client): + return Benchmarks(mock_client) + + def test_create_smart_success_with_envelope(self, benchmarks_resource): + """create_smart() unwraps envelope and returns CreateBenchmarkResponse.""" + benchmarks_resource._upload_file = Mock(return_value="doc.txt") + 
benchmarks_resource._post.return_value = { + "status": "success", + "data": { + "benchmark_id": "smart-123", + "organization_id": "org-123", + "project_id": "proj-456", + }, + } + + result = benchmarks_resource.create_smart( + name="Smart Bench", + description="Smart benchmark", + system_prompt="Generate QA pairs", + file_paths=["/tmp/doc.txt"], + ) + + assert isinstance(result, CreateBenchmarkResponse) + assert result.benchmark_id == "smart-123" + + def test_create_smart_sends_correct_body(self, benchmarks_resource): + """create_smart() sends all fields in the request body.""" + benchmarks_resource._upload_file = Mock(side_effect=["doc1.txt", "doc2.pdf"]) + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_smart( + name="Smart", + description="Desc", + system_prompt="Generate pairs", + file_paths=["/tmp/doc1.txt", "/tmp/doc2.pdf"], + metrics=["hallucination"], + ) + + call_kwargs = benchmarks_resource._post.call_args.kwargs + assert call_kwargs["body"] == { + "name": "Smart", + "description": "Desc", + "system_prompt": "Generate pairs", + "files": ["doc1.txt", "doc2.pdf"], + "metrics": ["hallucination"], + } + + def test_create_smart_uploads_all_files(self, benchmarks_resource): + """create_smart() calls _upload_file for each file path.""" + benchmarks_resource._upload_file = Mock(side_effect=["a.txt", "b.pdf", "c.csv"]) + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_smart( + name="S", + description="D", + system_prompt="P", + file_paths=["/tmp/a.txt", "/tmp/b.pdf", "/tmp/c.csv"], + ) + + assert benchmarks_resource._upload_file.call_count == 3 + + def test_create_smart_correct_url(self, benchmarks_resource): + """create_smart() posts to the correct endpoint.""" + benchmarks_resource._upload_file = 
Mock(return_value="doc.txt") + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_smart( + name="S", + description="D", + system_prompt="P", + file_paths=["/tmp/doc.txt"], + ) + + call_args = benchmarks_resource._post.call_args + assert call_args[0][0] == "/organizations/org-123/projects/proj-456/smart-benchmarks" + + def test_create_smart_omits_metrics_when_none(self, benchmarks_resource): + """create_smart() does not include metrics when not provided.""" + benchmarks_resource._upload_file = Mock(return_value="doc.txt") + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"benchmark_id": "x", "organization_id": "o", "project_id": "p"}, + } + + benchmarks_resource.create_smart( + name="S", + description="D", + system_prompt="P", + file_paths=["/tmp/doc.txt"], + ) + + call_body = benchmarks_resource._post.call_args.kwargs["body"] + assert "metrics" not in call_body + + def test_create_smart_returns_none_on_failure(self, benchmarks_resource): + """create_smart() returns None when response is unexpected.""" + benchmarks_resource._upload_file = Mock(return_value="doc.txt") + benchmarks_resource._post.return_value = None + + result = benchmarks_resource.create_smart( + name="S", + description="D", + system_prompt="P", + file_paths=["/tmp/doc.txt"], + ) + + assert result is None + + +class TestBenchmarksUploadFile: + """Test Benchmarks._upload_file() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.post_cast = Mock() + return client + + @pytest.fixture + def benchmarks_resource(self, mock_client): + return Benchmarks(mock_client) + + @pytest.fixture + def tmp_jsonl(self, tmp_path): + """Create a temporary JSONL file.""" + f = tmp_path / "data.jsonl" + f.write_text('{"input": "test", "truth": 
"answer"}\n') + return str(f) + + @patch("layerlens.resources.benchmarks.benchmarks.httpx.put") + def test_upload_file_success_with_envelope(self, mock_put, benchmarks_resource, tmp_jsonl): + """_upload_file() unwraps envelope and uploads to presigned URL.""" + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"url": "https://s3.example.com/upload?signed=1"}, + } + mock_put.return_value = Mock(status_code=200, raise_for_status=Mock()) + + result = benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT) + + assert result == "data.jsonl" + mock_put.assert_called_once() + assert mock_put.call_args.args[0] == "https://s3.example.com/upload?signed=1" + + @patch("layerlens.resources.benchmarks.benchmarks.httpx.put") + def test_upload_file_success_without_envelope(self, mock_put, benchmarks_resource, tmp_jsonl): + """_upload_file() works when response has no envelope.""" + benchmarks_resource._post.return_value = { + "url": "https://s3.example.com/upload?signed=1", + } + mock_put.return_value = Mock(status_code=200, raise_for_status=Mock()) + + result = benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT) + + assert result == "data.jsonl" + + def test_upload_file_raises_on_missing_url(self, benchmarks_resource, tmp_jsonl): + """_upload_file() raises ValueError when URL is missing.""" + benchmarks_resource._post.return_value = {"status": "success", "data": {"no_url": True}} + + with pytest.raises(ValueError, match="Failed to get upload URL"): + benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT) + + def test_upload_file_raises_on_invalid_response(self, benchmarks_resource, tmp_jsonl): + """_upload_file() raises ValueError when response is not a dict.""" + benchmarks_resource._post.return_value = "not-a-dict" + + with pytest.raises(ValueError, match="Failed to get upload URL"): + benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT) + + def 
test_upload_file_raises_on_oversized_file(self, benchmarks_resource, tmp_path): + """_upload_file() raises ValueError when file exceeds size limit.""" + big_file = tmp_path / "big.jsonl" + # Create a file that appears to be larger than MAX_UPLOAD_SIZE + big_file.write_text("x") + + with patch("os.path.getsize", return_value=51 * 1024 * 1024): + with pytest.raises(ValueError, match="exceeds maximum"): + benchmarks_resource._upload_file(str(big_file), "my-bench", DEFAULT_TIMEOUT) + + @patch("layerlens.resources.benchmarks.benchmarks.httpx.put") + def test_upload_file_sends_correct_upload_request(self, mock_put, benchmarks_resource, tmp_jsonl): + """_upload_file() sends correct metadata to upload endpoint.""" + benchmarks_resource._post.return_value = { + "status": "success", + "data": {"url": "https://s3.example.com/upload"}, + } + mock_put.return_value = Mock(status_code=200, raise_for_status=Mock()) + + benchmarks_resource._upload_file(tmp_jsonl, "my-bench", DEFAULT_TIMEOUT) + + post_kwargs = benchmarks_resource._post.call_args.kwargs + body = post_kwargs["body"] + assert body["key"] == "my-bench" + assert body["filename"] == "data.jsonl" + assert "type" in body + assert "size" in body diff --git a/tests/resources/test_evaluations.py b/tests/resources/test_evaluations.py index 02480a3..0d40f08 100644 --- a/tests/resources/test_evaluations.py +++ b/tests/resources/test_evaluations.py @@ -5,12 +5,22 @@ from layerlens.models import ( Evaluation, + ErrorAnalysis, + AnalysisSummary, + EvaluationMetric, EvaluationStatus, + EvaluationDataset, + EvaluationSummary, + EvaluationTaskType, + PerformanceDetails, EvaluationsResponse, CreateEvaluationsResponse, ) from layerlens._constants import DEFAULT_TIMEOUT from layerlens.resources.evaluations.evaluations import Evaluations +from layerlens.resources.public_evaluations.public_evaluations import ( + PublicEvaluationsResource, +) class TestEvaluations: @@ -446,3 +456,419 @@ def test_create_evaluation_end_to_end_flow(self): 
assert "/organizations/test-org/projects/test-project/evaluations" in call_args[0][0] assert call_args.kwargs["body"][0]["model_id"] == mock_model.id assert call_args.kwargs["body"][0]["dataset_id"] == mock_benchmark.id + + +class TestEvaluationModelFields: + """Test Evaluation model parses all backend fields.""" + + @pytest.fixture + def full_evaluation_data(self): + return { + "id": "eval-full", + "status": "success", + "status_description": "Evaluation completed successfully", + "submitted_at": 1640995200, + "finished_at": 1640995800, + "model_id": "model-456", + "model_name": "GPT-4", + "model_key": "gpt-4", + "model_company": "OpenAI", + "dataset_id": "benchmark-789", + "dataset_name": "MMLU", + "average_duration": 2500, + "accuracy": 0.89, + "readability_score": 0.75, + "toxicity_score": 0.02, + "ethics_score": 0.95, + "failed_prompt_count": 3, + "queue_id": 42, + "summary": { + "name": "GPT-4 on MMLU", + "goal": "Evaluate general knowledge", + "metrics": [ + {"name": "accuracy", "description": "Correctness of responses"}, + {"name": "toxicity", "description": "Harmful content detection"}, + ], + "task_types": [ + {"name": "multiple_choice", "description": "Select correct answer"}, + ], + "dataset": { + "total_size": 15908, + "training_size": 0, + "test_size": 15908, + "characteristics": ["multi-domain", "multiple-choice"], + }, + "model": { + "model_name": "GPT-4", + "performance": {"overall": 0.89}, + }, + "performance_details": { + "strengths": ["Strong reasoning", "Good factual recall"], + "challenges": ["Abstract math", "Ambiguous questions"], + }, + "error_analysis": { + "common_failure_modes": ["Off-by-one errors", "Misinterpreting negation"], + "example": "Q: Which is NOT true? 
A: Selected a true statement.", + }, + "analysis_summary": { + "key_takeaways": [ + "Strong overall performance at 89%", + "Struggles with negation-based questions", + ], + }, + }, + } + + def test_parse_all_fields(self, full_evaluation_data): + """Evaluation model parses all backend fields correctly.""" + evaluation = Evaluation(**full_evaluation_data) + + assert evaluation.id == "eval-full" + assert evaluation.status == EvaluationStatus.SUCCESS + assert evaluation.status_description == "Evaluation completed successfully" + assert evaluation.model_id == "model-456" + assert evaluation.model_name == "GPT-4" + assert evaluation.model_key == "gpt-4" + assert evaluation.model_company == "OpenAI" + assert evaluation.benchmark_id == "benchmark-789" + assert evaluation.benchmark_name == "MMLU" + assert evaluation.accuracy == 0.89 + assert evaluation.readability_score == 0.75 + assert evaluation.toxicity_score == 0.02 + assert evaluation.ethics_score == 0.95 + assert evaluation.failed_prompt_count == 3 + assert evaluation.queue_id == 42 + + def test_parse_summary(self, full_evaluation_data): + """Evaluation model parses nested summary correctly.""" + evaluation = Evaluation(**full_evaluation_data) + + assert evaluation.summary is not None + summary = evaluation.summary + assert isinstance(summary, EvaluationSummary) + assert summary.name == "GPT-4 on MMLU" + assert summary.goal == "Evaluate general knowledge" + + def test_parse_summary_metrics(self, full_evaluation_data): + """Summary metrics are parsed correctly.""" + evaluation = Evaluation(**full_evaluation_data) + metrics = evaluation.summary.metrics + + assert len(metrics) == 2 + assert isinstance(metrics[0], EvaluationMetric) + assert metrics[0].name == "accuracy" + assert metrics[1].name == "toxicity" + + def test_parse_summary_task_types(self, full_evaluation_data): + """Summary task types are parsed correctly.""" + evaluation = Evaluation(**full_evaluation_data) + task_types = evaluation.summary.task_types + + 
assert len(task_types) == 1 + assert isinstance(task_types[0], EvaluationTaskType) + assert task_types[0].name == "multiple_choice" + + def test_parse_summary_dataset(self, full_evaluation_data): + """Summary dataset info is parsed correctly.""" + evaluation = Evaluation(**full_evaluation_data) + dataset = evaluation.summary.dataset + + assert isinstance(dataset, EvaluationDataset) + assert dataset.total_size == 15908 + assert dataset.test_size == 15908 + assert "multi-domain" in dataset.characteristics + + def test_parse_summary_performance_details(self, full_evaluation_data): + """Summary performance details are parsed correctly.""" + evaluation = Evaluation(**full_evaluation_data) + perf = evaluation.summary.performance_details + + assert isinstance(perf, PerformanceDetails) + assert len(perf.strengths) == 2 + assert "Strong reasoning" in perf.strengths + assert len(perf.challenges) == 2 + + def test_parse_summary_error_analysis(self, full_evaluation_data): + """Summary error analysis is parsed correctly.""" + evaluation = Evaluation(**full_evaluation_data) + errors = evaluation.summary.error_analysis + + assert isinstance(errors, ErrorAnalysis) + assert len(errors.common_failure_modes) == 2 + assert "Off-by-one errors" in errors.common_failure_modes + assert "NOT true" in errors.example + + def test_parse_summary_analysis_summary(self, full_evaluation_data): + """Summary analysis summary is parsed correctly.""" + evaluation = Evaluation(**full_evaluation_data) + analysis = evaluation.summary.analysis_summary + + assert isinstance(analysis, AnalysisSummary) + assert len(analysis.key_takeaways) == 2 + + def test_missing_optional_fields_default(self): + """Evaluation model uses defaults for missing optional fields.""" + minimal_data = { + "id": "eval-min", + "status": "pending", + "submitted_at": 1640995200, + "finished_at": 0, + "model_id": "m1", + "dataset_id": "b1", + "average_duration": 0, + "accuracy": 0.0, + } + evaluation = Evaluation(**minimal_data) + + 
assert evaluation.status_description == "" + assert evaluation.model_name == "" + assert evaluation.model_key == "" + assert evaluation.model_company == "" + assert evaluation.benchmark_name == "" + assert evaluation.readability_score == 0.0 + assert evaluation.toxicity_score == 0.0 + assert evaluation.ethics_score == 0.0 + assert evaluation.failed_prompt_count == 0 + assert evaluation.queue_id == 0 + assert evaluation.summary is None + + def test_null_summary_field(self): + """Evaluation model handles null summary.""" + data = { + "id": "eval-no-summary", + "status": "in-progress", + "submitted_at": 1640995200, + "finished_at": 0, + "model_id": "m1", + "dataset_id": "b1", + "average_duration": 0, + "accuracy": 0.0, + "summary": None, + } + evaluation = Evaluation(**data) + assert evaluation.summary is None + + def test_get_by_id_returns_full_evaluation(self): + """get_by_id returns Evaluation with all fields populated.""" + mock_client = Mock() + mock_client.organization_id = "org-123" + mock_client.project_id = "proj-456" + mock_client.get_cast = Mock() + + full_eval = Evaluation( + id="eval-123", + status=EvaluationStatus.SUCCESS, + status_description="Done", + submitted_at=1640995200, + finished_at=1640995800, + model_id="m1", + model_name="GPT-4", + model_key="gpt-4", + model_company="OpenAI", + dataset_id="b1", + dataset_name="MMLU", + average_duration=2500, + accuracy=0.89, + readability_score=0.75, + toxicity_score=0.02, + ethics_score=0.95, + failed_prompt_count=3, + queue_id=42, + summary=EvaluationSummary( + name="Test", + goal="Evaluate", + metrics=[EvaluationMetric(name="accuracy", description="Correctness")], + ), + ) + mock_client.get_cast.return_value = full_eval + + evaluations = Evaluations(mock_client) + result = evaluations.get_by_id("eval-123") + + assert result is not None + assert result.model_name == "GPT-4" + assert result.benchmark_name == "MMLU" + assert result.readability_score == 0.75 + assert result.summary is not None + assert 
result.summary.goal == "Evaluate" + + +class TestPublicEvaluationsResource: + """Test PublicEvaluationsResource for the public client.""" + + @pytest.fixture + def mock_public_client(self): + """Mock PublicClient.""" + client = Mock() + client.get_cast = Mock() + return client + + @pytest.fixture + def public_evaluations(self, mock_public_client): + """PublicEvaluationsResource instance.""" + return PublicEvaluationsResource(mock_public_client) + + @pytest.fixture + def sample_evaluation_data(self): + return { + "id": "eval-pub-123", + "status": "success", + "status_description": "Done", + "submitted_at": 1640995200, + "finished_at": 1640995800, + "model_id": "model-456", + "model_name": "GPT-4", + "dataset_id": "benchmark-789", + "dataset_name": "MMLU", + "average_duration": 2500, + "accuracy": 0.89, + "summary": { + "name": "GPT-4 on MMLU", + "goal": "Evaluate general knowledge", + "metrics": [{"name": "accuracy", "description": "Correctness"}], + }, + } + + def test_get_by_id_success(self, public_evaluations, sample_evaluation_data): + """get_by_id returns Evaluation on success.""" + evaluation = Evaluation(**sample_evaluation_data) + public_evaluations._get.return_value = evaluation + + result = public_evaluations.get_by_id("eval-pub-123") + + assert isinstance(result, Evaluation) + assert result.id == "eval-pub-123" + assert result.model_name == "GPT-4" + assert result.summary is not None + assert result.summary.name == "GPT-4 on MMLU" + + def test_get_by_id_correct_url(self, public_evaluations, sample_evaluation_data): + """get_by_id calls correct endpoint.""" + evaluation = Evaluation(**sample_evaluation_data) + public_evaluations._get.return_value = evaluation + + public_evaluations.get_by_id("eval-pub-123") + + public_evaluations._get.assert_called_once_with( + "/evaluations/eval-pub-123", + timeout=DEFAULT_TIMEOUT, + cast_to=Evaluation, + ) + + def test_get_by_id_returns_none_on_invalid(self, public_evaluations): + """get_by_id returns None when response 
is not Evaluation.""" + public_evaluations._get.return_value = None + + result = public_evaluations.get_by_id("nonexistent") + + assert result is None + + def test_get_by_id_no_client_attached(self, public_evaluations, sample_evaluation_data): + """get_by_id does not attach client (public client has no org/project).""" + evaluation = Evaluation(**sample_evaluation_data) + public_evaluations._get.return_value = evaluation + + result = public_evaluations.get_by_id("eval-pub-123") + + assert result._client is None + + def test_get_many_success(self, public_evaluations, sample_evaluation_data): + """get_many returns EvaluationsResponse with evaluations.""" + resp = { + "evaluations": [sample_evaluation_data], + "total_count": 1, + } + public_evaluations._get.return_value = resp + + result = public_evaluations.get_many( + organization_id="org-123", + project_id="proj-456", + ) + + assert isinstance(result, EvaluationsResponse) + assert len(result.evaluations) == 1 + assert result.evaluations[0].id == "eval-pub-123" + + def test_get_many_sends_org_and_project(self, public_evaluations, sample_evaluation_data): + """get_many sends organizationID and projectID as params.""" + resp = {"evaluations": [sample_evaluation_data], "total_count": 1} + public_evaluations._get.return_value = resp + + public_evaluations.get_many( + organization_id="org-abc", + project_id="proj-xyz", + ) + + call_args = public_evaluations._get.call_args + params = call_args.kwargs.get("params") or call_args[1].get("params") + assert params["organizationID"] == "org-abc" + assert params["projectID"] == "proj-xyz" + + def test_get_many_with_filters(self, public_evaluations, sample_evaluation_data): + """get_many passes filter parameters correctly.""" + resp = {"evaluations": [sample_evaluation_data], "total_count": 1} + public_evaluations._get.return_value = resp + + public_evaluations.get_many( + organization_id="org-123", + project_id="proj-456", + page=2, + page_size=50, + sort_by="accuracy", + 
order="desc", + model_ids=["m1", "m2"], + benchmark_ids=["b1"], + status=EvaluationStatus.SUCCESS, + ) + + call_args = public_evaluations._get.call_args + params = call_args.kwargs.get("params") or call_args[1].get("params") + assert params["page"] == "2" + assert params["pageSize"] == "50" + assert params["sortBy"] == "accuracy" + assert params["order"] == "desc" + assert params["models"] == "m1,m2" + assert params["datasets"] == "b1" + assert params["status"] == "success" + + def test_get_many_pagination(self, public_evaluations, sample_evaluation_data): + """get_many computes pagination correctly.""" + resp = {"evaluations": [sample_evaluation_data] * 3, "total_count": 25} + public_evaluations._get.return_value = resp + + result = public_evaluations.get_many( + organization_id="org-123", + project_id="proj-456", + page=1, + page_size=10, + ) + + assert result.pagination.page == 1 + assert result.pagination.page_size == 10 + assert result.pagination.total_count == 25 + assert result.pagination.total_pages == 3 # ceil(25/10) + + def test_get_many_returns_none_on_invalid(self, public_evaluations): + """get_many returns None when response is invalid.""" + public_evaluations._get.return_value = "not-a-dict" + + result = public_evaluations.get_many( + organization_id="org-123", + project_id="proj-456", + ) + + assert result is None + + def test_get_many_empty_results(self, public_evaluations): + """get_many handles empty evaluations list.""" + resp = {"evaluations": [], "total_count": 0} + public_evaluations._get.return_value = resp + + result = public_evaluations.get_many( + organization_id="org-123", + project_id="proj-456", + ) + + assert isinstance(result, EvaluationsResponse) + assert len(result.evaluations) == 0 + assert result.pagination.total_count == 0 diff --git a/tests/resources/test_models_resource.py b/tests/resources/test_models_resource.py index 92eb7b4..4852a00 100644 --- a/tests/resources/test_models_resource.py +++ 
b/tests/resources/test_models_resource.py @@ -3,7 +3,7 @@ import httpx import pytest -from layerlens.models import CustomModel, PublicModel, ModelsResponse +from layerlens.models import CustomModel, PublicModel, ModelsResponse, CreateModelResponse from layerlens._constants import DEFAULT_TIMEOUT from layerlens.resources.models.models import Models @@ -555,3 +555,291 @@ def test_get_models_large_parameters_handling(self, models_resource): assert result[0].context_length == 200000 assert isinstance(result[0].parameters, float) assert isinstance(result[0].context_length, int) + + +class TestModelsAdd: + """Test Models.add() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.patch_cast = Mock() + return client + + @pytest.fixture + def models_resource(self, mock_client): + return Models(mock_client) + + def test_add_single_model(self, models_resource): + """add() merges new ID with current models and PATCHes.""" + existing = PublicModel(id="m1", key="m1", name="M1", description="") + models_resource.get = Mock(return_value=[existing]) + models_resource._patch.return_value = {"id": "proj-456"} + + result = models_resource.add("m2") + + assert result is True + models_resource._patch.assert_called_once_with( + "/organizations/org-123/projects/proj-456", + body={"models": ["m1", "m2"]}, + timeout=DEFAULT_TIMEOUT, + cast_to=dict, + ) + + def test_add_multiple_models(self, models_resource): + """add() handles multiple model IDs.""" + models_resource.get = Mock(return_value=[]) + models_resource._patch.return_value = {"id": "proj-456"} + + result = models_resource.add("m1", "m2", "m3") + + assert result is True + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["m1", "m2", "m3"]} + + def test_add_deduplicates(self, models_resource): + """add() deduplicates IDs already in the project.""" + existing = 
PublicModel(id="m1", key="m1", name="M1", description="") + models_resource.get = Mock(return_value=[existing]) + models_resource._patch.return_value = {"id": "proj-456"} + + models_resource.add("m1", "m2") + + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["m1", "m2"]} + + def test_add_returns_false_on_failure(self, models_resource): + """add() returns False when PATCH fails.""" + models_resource.get = Mock(return_value=[]) + models_resource._patch.return_value = "error" + + result = models_resource.add("m1") + + assert result is False + + def test_add_with_none_get_response(self, models_resource): + """add() handles None from get() gracefully.""" + models_resource.get = Mock(return_value=None) + models_resource._patch.return_value = {"id": "proj-456"} + + result = models_resource.add("m1") + + assert result is True + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["m1"]} + + +class TestModelsRemove: + """Test Models.remove() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.patch_cast = Mock() + return client + + @pytest.fixture + def models_resource(self, mock_client): + return Models(mock_client) + + def test_remove_single_model(self, models_resource): + """remove() removes specified ID and PATCHes remaining.""" + m1 = PublicModel(id="m1", key="m1", name="M1", description="") + m2 = PublicModel(id="m2", key="m2", name="M2", description="") + models_resource.get = Mock(return_value=[m1, m2]) + models_resource._patch.return_value = {"id": "proj-456"} + + result = models_resource.remove("m1") + + assert result is True + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["m2"]} + + def test_remove_multiple_models(self, models_resource): + """remove() handles removing multiple IDs.""" + m1 = 
PublicModel(id="m1", key="m1", name="M1", description="") + m2 = PublicModel(id="m2", key="m2", name="M2", description="") + m3 = PublicModel(id="m3", key="m3", name="M3", description="") + models_resource.get = Mock(return_value=[m1, m2, m3]) + models_resource._patch.return_value = {"id": "proj-456"} + + models_resource.remove("m1", "m3") + + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["m2"]} + + def test_remove_nonexistent_id(self, models_resource): + """remove() ignores IDs that aren't in the project.""" + m1 = PublicModel(id="m1", key="m1", name="M1", description="") + models_resource.get = Mock(return_value=[m1]) + models_resource._patch.return_value = {"id": "proj-456"} + + models_resource.remove("nonexistent") + + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["m1"]} + + def test_remove_returns_false_on_failure(self, models_resource): + """remove() returns False when PATCH fails.""" + models_resource.get = Mock(return_value=[]) + models_resource._patch.return_value = None + + result = models_resource.remove("m1") + + assert result is False + + +class TestModelsCreateCustom: + """Test Models.create_custom() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.post_cast = Mock() + return client + + @pytest.fixture + def models_resource(self, mock_client): + return Models(mock_client) + + def test_create_custom_success_with_envelope(self, models_resource): + """create_custom() unwraps envelope and returns CreateModelResponse.""" + models_resource._post.return_value = { + "status": "success", + "data": { + "model_id": "new-model-123", + "organization_id": "org-123", + "project_id": "proj-456", + }, + } + + result = models_resource.create_custom( + name="Test Model", + key="test/model-v1", + description="A test model", + 
api_url="https://api.example.com/v1", + max_tokens=4096, + ) + + assert isinstance(result, CreateModelResponse) + assert result.model_id == "new-model-123" + assert result.organization_id == "org-123" + assert result.project_id == "proj-456" + + def test_create_custom_success_without_envelope(self, models_resource): + """create_custom() works when response has no envelope.""" + models_resource._post.return_value = { + "model_id": "new-model-123", + "organization_id": "org-123", + "project_id": "proj-456", + } + + result = models_resource.create_custom( + name="Test Model", + key="test/model-v1", + description="A test model", + api_url="https://api.example.com/v1", + max_tokens=4096, + ) + + assert isinstance(result, CreateModelResponse) + assert result.model_id == "new-model-123" + + def test_create_custom_sends_correct_body(self, models_resource): + """create_custom() sends all required fields in the request body.""" + models_resource._post.return_value = { + "status": "success", + "data": {"model_id": "x", "organization_id": "o", "project_id": "p"}, + } + + models_resource.create_custom( + name="My Model", + key="my/model", + description="desc", + api_url="https://example.com/v1", + max_tokens=8192, + api_key="sk-secret", + ) + + call_kwargs = models_resource._post.call_args.kwargs + assert call_kwargs["body"] == { + "name": "My Model", + "key": "my/model", + "description": "desc", + "api_url": "https://example.com/v1", + "max_tokens": 8192, + "api_key": "sk-secret", + } + + def test_create_custom_omits_api_key_when_none(self, models_resource): + """create_custom() does not include api_key when not provided.""" + models_resource._post.return_value = { + "status": "success", + "data": {"model_id": "x", "organization_id": "o", "project_id": "p"}, + } + + models_resource.create_custom( + name="My Model", + key="my/model", + description="desc", + api_url="https://example.com/v1", + max_tokens=4096, + ) + + call_body = models_resource._post.call_args.kwargs["body"] + 
assert "api_key" not in call_body + + def test_create_custom_correct_url(self, models_resource): + """create_custom() posts to the correct endpoint.""" + models_resource._post.return_value = { + "status": "success", + "data": {"model_id": "x", "organization_id": "o", "project_id": "p"}, + } + + models_resource.create_custom( + name="M", + key="k", + description="d", + api_url="https://x.com", + max_tokens=1, + ) + + call_args = models_resource._post.call_args + assert call_args[0][0] == "/organizations/org-123/projects/proj-456/custom-models" + + def test_create_custom_returns_none_on_failure(self, models_resource): + """create_custom() returns None when response is unexpected.""" + models_resource._post.return_value = "not-a-dict" + + result = models_resource.create_custom( + name="M", + key="k", + description="d", + api_url="https://x.com", + max_tokens=1, + ) + + assert result is None + + def test_create_custom_returns_none_on_error_envelope(self, models_resource): + """create_custom() returns None when response has no model_id.""" + models_resource._post.return_value = {"status": "error", "data": {"message": "failed"}} + + result = models_resource.create_custom( + name="M", + key="k", + description="d", + api_url="https://x.com", + max_tokens=1, + ) + + assert result is None From dc1c2427f09d7322bd3ace31fc747550446370cc Mon Sep 17 00:00:00 2001 From: Marin Peko <26385728+m-peko@users.noreply.github.com> Date: Fri, 27 Feb 2026 08:25:44 +0100 Subject: [PATCH 2/3] Improve comparisons resource (#51) --- docs/api-reference/public-client.md | 52 ++++- examples/compare_evaluations.py | 95 ++++----- examples/public_evaluations.py | 9 +- .../resources/comparisons/comparisons.py | 123 ++++++++++- .../public_evaluations/public_evaluations.py | 18 +- tests/resources/test_comparisons.py | 201 ++++++++++++++++++ tests/resources/test_evaluations.py | 36 +--- 7 files changed, 407 insertions(+), 127 deletions(-) create mode 100644 tests/resources/test_comparisons.py diff --git 
a/docs/api-reference/public-client.md b/docs/api-reference/public-client.md index 9a79d1e..31afc82 100644 --- a/docs/api-reference/public-client.md +++ b/docs/api-reference/public-client.md @@ -282,14 +282,12 @@ Returns an `Evaluation` object if found, `None` otherwise. See [Evaluations](eva ### `evaluations.get_many(...)` -Retrieves evaluations for a given organization and project with optional pagination, sorting, and filtering. +Retrieves evaluations with optional pagination, sorting, and filtering. #### Parameters | Parameter | Type | Required | Description | | ----------------- | -------------------------------- | -------- | ------------------------------------------------------------------ | -| `organization_id` | `str` | Yes | Organization ID (MongoDB ObjectID format) | -| `project_id` | `str` | Yes | Project ID (MongoDB ObjectID format) | | `page` | `int \| None` | No | Page number for pagination (1-based, defaults to 1) | | `page_size` | `int \| None` | No | Number of evaluations per page (default: 100, max: 500) | | `sort_by` | `str \| None` | No | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` | @@ -325,10 +323,8 @@ if evaluation: for takeaway in evaluation.summary.analysis_summary.key_takeaways: print(f" - {takeaway}") -# List evaluations for an organization/project +# List successful evaluations sorted by accuracy response = client.evaluations.get_many( - organization_id="683e63925ef7e1c53c1f4b28", - project_id="683e63925ef7e1c53c1f4b29", status=EvaluationStatus.SUCCESS, sort_by="accuracy", order="desc", @@ -417,3 +413,47 @@ if comparison: print(f" Prompt: {result.prompt[:80]}...") print(f" Model 1 score: {result.score1}, Model 2 score: {result.score2}") ``` + +### `comparisons.compare_models(...)` + +Compares two models on a benchmark by automatically finding their most recent successful evaluations. This is a convenience method that wraps `compare()`. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +| ---------------- | ---------------------- | -------- | ------------------------------------------ | +| `benchmark_id` | `str` | Yes | Benchmark ID to compare on | +| `model_id_1` | `str` | Yes | First model ID | +| `model_id_2` | `str` | Yes | Second model ID | +| `page` | `int \| None` | No | Page number (1-based) | +| `page_size` | `int \| None` | No | Results per page | +| `outcome_filter` | `str \| None` | No | Filter by outcome (same options as `compare`) | +| `search` | `str \| None` | No | Search within results | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns a `ComparisonResponse` (same as `compare()`), or `None` if the comparison request fails. + +Raises `ValueError` if no successful evaluation is found for either model on the given benchmark. + +#### Example + +```python +from layerlens import PublicClient + +client = PublicClient() + +# Compare two models on AIME 2025 - no need to look up evaluation IDs +comparison = client.comparisons.compare_models( + benchmark_id="682bddc1e014f9fa440f8a91", + model_id_1="699f9761e014f9c3072b0513", + model_id_2="699f9761e014f9c3072b0512", + page=1, + page_size=10, +) + +if comparison: + print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") +``` diff --git a/examples/compare_evaluations.py b/examples/compare_evaluations.py index eb292b4..2293e8d 100644 --- a/examples/compare_evaluations.py +++ b/examples/compare_evaluations.py @@ -1,74 +1,49 @@ #!/usr/bin/env -S poetry run python -from layerlens import Stratix -from layerlens.models import EvaluationStatus +from layerlens import PublicClient def main(): - # Construct client (API key from env or inline) - client = Stratix() - - # --- Get successful evaluations to find a comparable pair - response = 
client.evaluations.get_many( - status=EvaluationStatus.SUCCESS, - sort_by="accuracy", - order="desc", - page_size=100, - ) - - if not response or len(response.evaluations) < 2: - print("Need at least 2 successful evaluations to compare, exiting") - return - - # Find two evaluations on the same benchmark - eval_1 = None - eval_2 = None - for i, e1 in enumerate(response.evaluations): - for e2 in response.evaluations[i + 1 :]: - if e1.benchmark_id == e2.benchmark_id and e1.id != e2.id: - eval_1 = e1 - eval_2 = e2 - break - if eval_1: - break - - if not eval_1 or not eval_2: - print("No two evaluations share the same benchmark, exiting") - return - - print(f"Comparing evaluations on the same benchmark ({eval_1.benchmark_id}):") - print(f" Evaluation 1: {eval_1.id} (accuracy={eval_1.accuracy:.2f}%)") - print(f" Evaluation 2: {eval_2.id} (accuracy={eval_2.accuracy:.2f}%)") - - # --- Get comparison results - comparison = client.public.comparisons.compare( - evaluation_id_1=eval_1.id, - evaluation_id_2=eval_2.id, + # Construct public client (API key from LAYERLENS_STRATIX_API_KEY env var or inline) + client = PublicClient() + + # --- Compare two models on a benchmark using compare_models + # Just provide the benchmark and two model IDs - the SDK automatically + # finds the most recent successful evaluation for each model. 
+ benchmark_id = "682bddc1e014f9fa440f8a91" # AIME 2025 + model_id_1 = "699f9761e014f9c3072b0513" # Qwen3.5 27B + model_id_2 = "699f9761e014f9c3072b0512" # Qwen3.5 122B A10B + + print(f"Comparing models on benchmark {benchmark_id}...") + comparison = client.comparisons.compare_models( + benchmark_id=benchmark_id, + model_id_1=model_id_1, + model_id_2=model_id_2, page=1, page_size=10, ) if comparison: print(f"\n=== Comparison Summary ===") - print(f"Evaluation 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") - print(f"Evaluation 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") + print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") print(f"Total compared: {comparison.total_count}") - # --- Show individual results if comparison.results: print(f"\nFirst {len(comparison.results)} results:") for result in comparison.results: - score_indicator_1 = "✓" if result.score1 and result.score1 > 0.5 else "✗" - score_indicator_2 = "✓" if result.score2 and result.score2 > 0.5 else "✗" + s1 = "Y" if result.score1 and result.score1 > 0.5 else "N" + s2 = "Y" if result.score2 and result.score2 > 0.5 else "N" print(f" Prompt: {result.prompt[:80]}...") - print(f" Model 1: {score_indicator_1} (score={result.score1})") - print(f" Model 2: {score_indicator_2} (score={result.score2})") + print(f" Model 1: {s1} (score={result.score1})") + print(f" Model 2: {s2} (score={result.score2})") print() - # --- Filter by outcome: where only model 1 fails - comparison = client.public.comparisons.compare( - evaluation_id_1=eval_1.id, - evaluation_id_2=eval_2.id, + # --- Filter: where model 1 fails but model 2 succeeds + comparison = client.comparisons.compare_models( + benchmark_id=benchmark_id, + model_id_1=model_id_1, + model_id_2=model_id_2, outcome_filter="reference_fails", ) @@ -76,16 +51,18 @@ def main(): print(f"\n=== Where Model 1 
Fails but Model 2 Succeeds ===") print(f"Found {comparison.total_count} such cases") - # --- Filter by outcome: where both models fail - comparison = client.public.comparisons.compare( - evaluation_id_1=eval_1.id, - evaluation_id_2=eval_2.id, - outcome_filter="both_fail", + # --- You can also compare using evaluation IDs directly + comparison = client.comparisons.compare( + evaluation_id_1="699f9938a03d70bf6607081f", # Qwen3.5 27B on AIME 2025 + evaluation_id_2="699f991ca782d00ebd666ba1", # Qwen3.5 122B A10B on AIME 2025 + page=1, + page_size=5, ) if comparison: - print(f"\n=== Where Both Models Fail ===") - print(f"Found {comparison.total_count} such cases") + print(f"\n=== Direct Comparison by Evaluation IDs ===") + print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") if __name__ == "__main__": diff --git a/examples/public_evaluations.py b/examples/public_evaluations.py index f28236f..a8eb588 100644 --- a/examples/public_evaluations.py +++ b/examples/public_evaluations.py @@ -30,13 +30,8 @@ def main(): else: print(f"Evaluation {evaluation_id} not found") - # --- List evaluations for a specific organization/project - organization_id = "683e63925ef7e1c53c1f4b28" - project_id = "683e63925ef7e1c53c1f4b29" - + # --- List latest evaluations response = client.evaluations.get_many( - organization_id=organization_id, - project_id=project_id, page=1, page_size=5, sort_by="submittedAt", @@ -49,8 +44,6 @@ def main(): # --- Filter by status (only successful) response = client.evaluations.get_many( - organization_id=organization_id, - project_id=project_id, status=EvaluationStatus.SUCCESS, sort_by="accuracy", order="desc", diff --git a/src/layerlens/resources/comparisons/comparisons.py b/src/layerlens/resources/comparisons/comparisons.py index eef469a..ed4851c 100644 --- a/src/layerlens/resources/comparisons/comparisons.py +++ 
b/src/layerlens/resources/comparisons/comparisons.py @@ -4,10 +4,19 @@ import httpx -from ...models import ComparisonResponse +from ...models import EvaluationStatus, ComparisonResponse, EvaluationsResponse from ..._resource import SyncPublicAPIResource, AsyncPublicAPIResource from ..._constants import DEFAULT_TIMEOUT +_OUTCOME_FILTER = Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"] + + +def _find_evaluation_id(response: Optional[EvaluationsResponse], model_id: str, benchmark_id: str) -> str: + """Extract the first evaluation ID from a response, or raise ValueError.""" + if not response or not response.evaluations: + raise ValueError(f"No successful evaluation found for model '{model_id}' on benchmark '{benchmark_id}'") + return str(response.evaluations[0].id) + class Comparisons(SyncPublicAPIResource): def compare( @@ -17,9 +26,7 @@ def compare( evaluation_id_2: str, page: Optional[int] = None, page_size: Optional[int] = None, - outcome_filter: Optional[ - Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"] - ] = None, + outcome_filter: Optional[_OUTCOME_FILTER] = None, search: Optional[str] = None, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, ) -> Optional[ComparisonResponse]: @@ -48,6 +55,58 @@ def compare( return ComparisonResponse.model_validate(resp) + def compare_models( + self, + *, + benchmark_id: str, + model_id_1: str, + model_id_2: str, + page: Optional[int] = None, + page_size: Optional[int] = None, + outcome_filter: Optional[_OUTCOME_FILTER] = None, + search: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[ComparisonResponse]: + """Compare two models on a benchmark by automatically finding their evaluations. + + Finds the most recent successful evaluation for each model on the given + benchmark, then compares the results side-by-side. + + Raises: + ValueError: If no successful evaluation is found for either model. 
+ """ + resp1 = self._client.evaluations.get_many( + model_ids=[model_id_1], + benchmark_ids=[benchmark_id], + status=EvaluationStatus.SUCCESS, + sort_by="submittedAt", + order="desc", + page_size=1, + timeout=timeout, + ) + eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id) + + resp2 = self._client.evaluations.get_many( + model_ids=[model_id_2], + benchmark_ids=[benchmark_id], + status=EvaluationStatus.SUCCESS, + sort_by="submittedAt", + order="desc", + page_size=1, + timeout=timeout, + ) + eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id) + + return self.compare( + evaluation_id_1=eval_id_1, + evaluation_id_2=eval_id_2, + page=page, + page_size=page_size, + outcome_filter=outcome_filter, + search=search, + timeout=timeout, + ) + class AsyncComparisons(AsyncPublicAPIResource): async def compare( @@ -57,9 +116,7 @@ async def compare( evaluation_id_2: str, page: Optional[int] = None, page_size: Optional[int] = None, - outcome_filter: Optional[ - Literal["all", "both_succeed", "both_fail", "reference_fails", "comparison_fails"] - ] = None, + outcome_filter: Optional[_OUTCOME_FILTER] = None, search: Optional[str] = None, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, ) -> Optional[ComparisonResponse]: @@ -87,3 +144,55 @@ async def compare( return None return ComparisonResponse.model_validate(resp) + + async def compare_models( + self, + *, + benchmark_id: str, + model_id_1: str, + model_id_2: str, + page: Optional[int] = None, + page_size: Optional[int] = None, + outcome_filter: Optional[_OUTCOME_FILTER] = None, + search: Optional[str] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> Optional[ComparisonResponse]: + """Compare two models on a benchmark by automatically finding their evaluations. + + Finds the most recent successful evaluation for each model on the given + benchmark, then compares the results side-by-side. + + Raises: + ValueError: If no successful evaluation is found for either model. 
+ """ + resp1 = await self._client.evaluations.get_many( + model_ids=[model_id_1], + benchmark_ids=[benchmark_id], + status=EvaluationStatus.SUCCESS, + sort_by="submittedAt", + order="desc", + page_size=1, + timeout=timeout, + ) + eval_id_1 = _find_evaluation_id(resp1, model_id_1, benchmark_id) + + resp2 = await self._client.evaluations.get_many( + model_ids=[model_id_2], + benchmark_ids=[benchmark_id], + status=EvaluationStatus.SUCCESS, + sort_by="submittedAt", + order="desc", + page_size=1, + timeout=timeout, + ) + eval_id_2 = _find_evaluation_id(resp2, model_id_2, benchmark_id) + + return await self.compare( + evaluation_id_1=eval_id_1, + evaluation_id_2=eval_id_2, + page=page, + page_size=page_size, + outcome_filter=outcome_filter, + search=search, + timeout=timeout, + ) diff --git a/src/layerlens/resources/public_evaluations/public_evaluations.py b/src/layerlens/resources/public_evaluations/public_evaluations.py index ddd1cdf..71f736d 100644 --- a/src/layerlens/resources/public_evaluations/public_evaluations.py +++ b/src/layerlens/resources/public_evaluations/public_evaluations.py @@ -37,8 +37,6 @@ def get_by_id( def get_many( self, *, - organization_id: str, - project_id: str, page: Optional[int] = None, page_size: Optional[int] = None, sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None, @@ -52,8 +50,6 @@ def get_many( Get evaluations with optional pagination, sorting, and filtering. 
Args: - organization_id: Organization ID (required) - project_id: Project ID (required) page: Page number for pagination (1-based, defaults to 1 if not provided) page_size: Number of evaluations per page (default: 100, optional) sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration) @@ -66,10 +62,7 @@ def get_many( Returns: EvaluationsResponse object or None """ - params = { - "organizationID": organization_id, - "projectID": project_id, - } + params: dict[str, str] = {} effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE effective_page = page if page is not None else DEFAULT_PAGE @@ -137,8 +130,6 @@ async def get_by_id( async def get_many( self, *, - organization_id: str, - project_id: str, page: Optional[int] = None, page_size: Optional[int] = None, sort_by: Optional[Literal["submittedAt", "accuracy", "averageDuration"]] = None, @@ -152,8 +143,6 @@ async def get_many( Get evaluations with optional pagination, sorting, and filtering. 
Args: - organization_id: Organization ID (required) - project_id: Project ID (required) page: Page number for pagination (1-based, defaults to 1 if not provided) page_size: Number of evaluations per page (default: 100, optional) sort_by: Sort evaluations by field (submittedAt, accuracy, averageDuration) @@ -166,10 +155,7 @@ async def get_many( Returns: EvaluationsResponse object or None """ - params = { - "organizationID": organization_id, - "projectID": project_id, - } + params: dict[str, str] = {} effective_page_size = min(max(page_size, 1), MAX_PAGE_SIZE) if page_size is not None else DEFAULT_PAGE_SIZE effective_page = page if page is not None else DEFAULT_PAGE diff --git a/tests/resources/test_comparisons.py b/tests/resources/test_comparisons.py new file mode 100644 index 0000000..85dfa2f --- /dev/null +++ b/tests/resources/test_comparisons.py @@ -0,0 +1,201 @@ +from unittest.mock import Mock + +import pytest + +from layerlens.models import ( + Evaluation, + Pagination, + EvaluationStatus, + ComparisonResponse, + EvaluationsResponse, +) +from layerlens.resources.comparisons.comparisons import Comparisons + + +def _make_eval(eval_id: str, model_id: str, benchmark_id: str) -> Evaluation: + return Evaluation( + id=eval_id, + status=EvaluationStatus.SUCCESS, + submitted_at=1640995200, + finished_at=1640995800, + model_id=model_id, + dataset_id=benchmark_id, + average_duration=2500, + accuracy=0.89, + ) + + +def _make_eval_response(evaluations: list[Evaluation]) -> EvaluationsResponse: + return EvaluationsResponse( + evaluations=evaluations, + pagination=Pagination( + page=1, + page_size=1, + total_pages=1, + total_count=len(evaluations), + ), + ) + + +class TestCompareModels: + """Test Comparisons.compare_models convenience method.""" + + @pytest.fixture + def mock_public_client(self): + client = Mock() + client.get_cast = Mock() + client.evaluations = Mock() + return client + + @pytest.fixture + def comparisons(self, mock_public_client): + return 
Comparisons(mock_public_client) + + def test_compare_models_success(self, comparisons, mock_public_client): + """compare_models finds evaluations for both models and calls compare.""" + eval1 = _make_eval("eval-1", "model-a", "bench-1") + eval2 = _make_eval("eval-2", "model-b", "bench-1") + + mock_public_client.evaluations.get_many.side_effect = [ + _make_eval_response([eval1]), + _make_eval_response([eval2]), + ] + + comparisons._get.return_value = { + "results": [], + "total_count": 0, + "correct_count_1": 5, + "total_results_1": 10, + "correct_count_2": 7, + "total_results_2": 10, + } + + result = comparisons.compare_models( + benchmark_id="bench-1", + model_id_1="model-a", + model_id_2="model-b", + ) + + assert isinstance(result, ComparisonResponse) + + # Verify get_many was called correctly for both models + calls = mock_public_client.evaluations.get_many.call_args_list + assert len(calls) == 2 + + assert calls[0].kwargs["model_ids"] == ["model-a"] + assert calls[0].kwargs["benchmark_ids"] == ["bench-1"] + assert calls[0].kwargs["status"] == EvaluationStatus.SUCCESS + assert calls[0].kwargs["sort_by"] == "submittedAt" + assert calls[0].kwargs["order"] == "desc" + assert calls[0].kwargs["page_size"] == 1 + + assert calls[1].kwargs["model_ids"] == ["model-b"] + + # Verify compare was called with the found evaluation IDs + compare_call = comparisons._get.call_args + params = compare_call.kwargs.get("params") or compare_call[1].get("params") + assert params["evaluation_id_1"] == "eval-1" + assert params["evaluation_id_2"] == "eval-2" + + def test_compare_models_model_1_not_found(self, comparisons, mock_public_client): + """compare_models raises ValueError when model 1 has no evaluation.""" + mock_public_client.evaluations.get_many.return_value = _make_eval_response([]) + + with pytest.raises(ValueError, match="model-a"): + comparisons.compare_models( + benchmark_id="bench-1", + model_id_1="model-a", + model_id_2="model-b", + ) + + def 
test_compare_models_model_2_not_found(self, comparisons, mock_public_client): + """compare_models raises ValueError when model 2 has no evaluation.""" + eval1 = _make_eval("eval-1", "model-a", "bench-1") + + mock_public_client.evaluations.get_many.side_effect = [ + _make_eval_response([eval1]), + _make_eval_response([]), + ] + + with pytest.raises(ValueError, match="model-b"): + comparisons.compare_models( + benchmark_id="bench-1", + model_id_1="model-a", + model_id_2="model-b", + ) + + def test_compare_models_none_response(self, comparisons, mock_public_client): + """compare_models raises ValueError when get_many returns None.""" + mock_public_client.evaluations.get_many.return_value = None + + with pytest.raises(ValueError, match="model-a"): + comparisons.compare_models( + benchmark_id="bench-1", + model_id_1="model-a", + model_id_2="model-b", + ) + + def test_compare_models_passes_through_params(self, comparisons, mock_public_client): + """compare_models forwards pagination, filter, and search to compare.""" + eval1 = _make_eval("eval-1", "model-a", "bench-1") + eval2 = _make_eval("eval-2", "model-b", "bench-1") + + mock_public_client.evaluations.get_many.side_effect = [ + _make_eval_response([eval1]), + _make_eval_response([eval2]), + ] + comparisons._get.return_value = { + "results": [], + "total_count": 0, + "correct_count_1": 0, + "total_results_1": 0, + "correct_count_2": 0, + "total_results_2": 0, + } + + comparisons.compare_models( + benchmark_id="bench-1", + model_id_1="model-a", + model_id_2="model-b", + page=2, + page_size=50, + outcome_filter="both_succeed", + search="test query", + ) + + compare_call = comparisons._get.call_args + params = compare_call.kwargs.get("params") or compare_call[1].get("params") + assert params["page"] == "2" + assert params["pageSize"] == "50" + assert params["outcomeFilter"] == "both_succeed" + assert params["search"] == "test query" + + def test_compare_models_picks_most_recent(self, comparisons, mock_public_client): + 
"""compare_models requests sort by submittedAt desc to get the most recent.""" + eval1 = _make_eval("eval-1", "model-a", "bench-1") + eval2 = _make_eval("eval-2", "model-b", "bench-1") + + mock_public_client.evaluations.get_many.side_effect = [ + _make_eval_response([eval1]), + _make_eval_response([eval2]), + ] + comparisons._get.return_value = { + "results": [], + "total_count": 0, + "correct_count_1": 0, + "total_results_1": 0, + "correct_count_2": 0, + "total_results_2": 0, + } + + comparisons.compare_models( + benchmark_id="bench-1", + model_id_1="model-a", + model_id_2="model-b", + ) + + for call in mock_public_client.evaluations.get_many.call_args_list: + assert call.kwargs["sort_by"] == "submittedAt" + assert call.kwargs["order"] == "desc" + assert call.kwargs["page_size"] == 1 + assert call.kwargs["status"] == EvaluationStatus.SUCCESS diff --git a/tests/resources/test_evaluations.py b/tests/resources/test_evaluations.py index 0d40f08..8a337eb 100644 --- a/tests/resources/test_evaluations.py +++ b/tests/resources/test_evaluations.py @@ -780,38 +780,18 @@ def test_get_many_success(self, public_evaluations, sample_evaluation_data): } public_evaluations._get.return_value = resp - result = public_evaluations.get_many( - organization_id="org-123", - project_id="proj-456", - ) + result = public_evaluations.get_many() assert isinstance(result, EvaluationsResponse) assert len(result.evaluations) == 1 assert result.evaluations[0].id == "eval-pub-123" - def test_get_many_sends_org_and_project(self, public_evaluations, sample_evaluation_data): - """get_many sends organizationID and projectID as params.""" - resp = {"evaluations": [sample_evaluation_data], "total_count": 1} - public_evaluations._get.return_value = resp - - public_evaluations.get_many( - organization_id="org-abc", - project_id="proj-xyz", - ) - - call_args = public_evaluations._get.call_args - params = call_args.kwargs.get("params") or call_args[1].get("params") - assert params["organizationID"] == 
"org-abc" - assert params["projectID"] == "proj-xyz" - def test_get_many_with_filters(self, public_evaluations, sample_evaluation_data): """get_many passes filter parameters correctly.""" resp = {"evaluations": [sample_evaluation_data], "total_count": 1} public_evaluations._get.return_value = resp public_evaluations.get_many( - organization_id="org-123", - project_id="proj-456", page=2, page_size=50, sort_by="accuracy", @@ -830,6 +810,8 @@ def test_get_many_with_filters(self, public_evaluations, sample_evaluation_data) assert params["models"] == "m1,m2" assert params["datasets"] == "b1" assert params["status"] == "success" + assert "organizationID" not in params + assert "projectID" not in params def test_get_many_pagination(self, public_evaluations, sample_evaluation_data): """get_many computes pagination correctly.""" @@ -837,8 +819,6 @@ def test_get_many_pagination(self, public_evaluations, sample_evaluation_data): public_evaluations._get.return_value = resp result = public_evaluations.get_many( - organization_id="org-123", - project_id="proj-456", page=1, page_size=10, ) @@ -852,10 +832,7 @@ def test_get_many_returns_none_on_invalid(self, public_evaluations): """get_many returns None when response is invalid.""" public_evaluations._get.return_value = "not-a-dict" - result = public_evaluations.get_many( - organization_id="org-123", - project_id="proj-456", - ) + result = public_evaluations.get_many() assert result is None @@ -864,10 +841,7 @@ def test_get_many_empty_results(self, public_evaluations): resp = {"evaluations": [], "total_count": 0} public_evaluations._get.return_value = resp - result = public_evaluations.get_many( - organization_id="org-123", - project_id="proj-456", - ) + result = public_evaluations.get_many() assert isinstance(result, EvaluationsResponse) assert len(result.evaluations) == 0 From 99a6a152f6eda06d34026c671fce6577e084b147 Mon Sep 17 00:00:00 2001 From: Leandro Echevarria Date: Fri, 27 Feb 2026 04:38:10 -0300 Subject: [PATCH 3/3] Feat 
| LAY-885 cicd publish (#7) * feat | LAY-885 Added ground files for our publishing CICD * feat | LAY-885 Added twine to publish the package. Updated scripts. * feat | LAY-885 Addressing PR feedback * Couple of fixes to release process --------- Co-authored-by: m-peko --- .github/workflows/publish-sdk.yaml | 63 ++++++++++++++++++++ .github/workflows/release-tag.yaml | 78 +++++++++++++++++++++++++ .github/workflows/test-publish-sdk.yaml | 41 +++++++++++++ Makefile | 48 +++++++++++++++ pyproject.toml | 13 ++++- requirements-dev.lock | 58 ++++++++++++++++-- requirements.lock | 6 +- scripts/get_version.sh | 10 +--- scripts/publish.sh | 24 ++++++++ scripts/push-release-tag.sh | 72 +++++++++++++++++++++++ scripts/template-version.sh | 28 +++++++++ scripts/validate-release-tag.sh | 62 ++++++++++++++++++++ src/layerlens/_version.py | 3 + 13 files changed, 488 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/publish-sdk.yaml create mode 100644 .github/workflows/release-tag.yaml create mode 100644 .github/workflows/test-publish-sdk.yaml create mode 100644 Makefile create mode 100755 scripts/publish.sh create mode 100755 scripts/push-release-tag.sh create mode 100644 scripts/template-version.sh create mode 100755 scripts/validate-release-tag.sh diff --git a/.github/workflows/publish-sdk.yaml b/.github/workflows/publish-sdk.yaml new file mode 100644 index 0000000..bc27b0d --- /dev/null +++ b/.github/workflows/publish-sdk.yaml @@ -0,0 +1,63 @@ +# This workflow is used to publish the Python SDK to the actual PyPI. +# It is triggered by a tag push, and will only publish if the tag is valid. +# The tag must match the format sdk-v*.*.* + +name: Publish Python SDK + +on: + push: + tags: + - "sdk-v*.*.*" # Trigger on version tags like sdk-v0.1.0 etc. 
+ +jobs: + validate: + runs-on: ubuntu-latest + environment: production + outputs: + release_tag: ${{ steps.set_release_tag.outputs.release_tag }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for checking branch + - name: Set release tag + id: set_release_tag + # ensure the tag is valid (matches code, is on main, etc) + run: | + RELEASE_TAG=${GITHUB_REF#refs/tags/} + echo "Using tag: $RELEASE_TAG" + ./scripts/validate-release-tag.sh "$RELEASE_TAG" + echo "RELEASE_TAG=$RELEASE_TAG" >> $GITHUB_ENV + echo "release_tag=$RELEASE_TAG" >> $GITHUB_OUTPUT + + build-and-publish: + needs: validate + runs-on: ubuntu-latest + environment: production + + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + RELEASE_TAG: ${{ needs.validate.outputs.release_tag }} + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install build dependencies + run: make install-build-deps + - name: Build + run: make build + - name: Test wheel + run: make test-wheel + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: sdk-dist + path: dist/ + retention-days: 5 + - name: Publish to PyPI + run: make _publish + env: + PYPI_REPO: pypi diff --git a/.github/workflows/release-tag.yaml b/.github/workflows/release-tag.yaml new file mode 100644 index 0000000..ae495c3 --- /dev/null +++ b/.github/workflows/release-tag.yaml @@ -0,0 +1,78 @@ +# This workflow creates and pushes a release tag using the push-release-tag.sh script. +# It can be triggered manually and will prompt for confirmation before creating the tag. 
+ +name: Create Release Tag + +on: + workflow_dispatch: + inputs: + dry_run: + description: "Run in dry-run mode (show what would be done without actually creating/pushing the tag)" + required: false + type: boolean + default: true + confirm_release: + description: "Type 'YES' to confirm you want to create and push the release tag" + required: true + type: string + +jobs: + check-branch: + runs-on: ubuntu-latest + environment: production + steps: + - name: Check if running on release branch + run: | + if [ "${{ github.ref }}" != "refs/heads/release" ]; then + echo "Error: This workflow can only be run from the 'release' branch." + echo "Current branch: ${{ github.ref }}" + echo "Please switch to the 'release' branch and try again." + exit 1 + fi + echo "Running on release branch - proceeding with workflow." + + create-release-tag: + runs-on: ubuntu-latest + needs: check-branch + environment: production + if: github.ref == 'refs/heads/release' + + permissions: + contents: write # Required to create and push tags + + steps: + - name: Validate confirmation + if: github.event.inputs.confirm_release != 'YES' && github.event.inputs.dry_run != 'true' + run: | + echo "Error: You must type 'YES' in the confirm_release input to proceed with creating a release tag." + echo "Received: '${{ github.event.inputs.confirm_release }}'" + exit 1 + + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history and tags + + - name: Make scripts executable + run: | + chmod +x scripts/push-release-tag.sh + chmod +x scripts/get_version.sh + + - name: Configure Git + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Run push-release-tag script (dry-run) + if: github.event.inputs.dry_run == 'true' + run: | + echo "Running in dry-run mode..." 
+ make push-release-tag DRY_RUN=--dry-run + + - name: Run push-release-tag script + if: github.event.inputs.dry_run != 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + echo "Creating and pushing release tag..." + # Override the interactive confirmation since we already confirmed via workflow input + echo "YES" | make push-release-tag diff --git a/.github/workflows/test-publish-sdk.yaml b/.github/workflows/test-publish-sdk.yaml new file mode 100644 index 0000000..9703970 --- /dev/null +++ b/.github/workflows/test-publish-sdk.yaml @@ -0,0 +1,41 @@ +# This workflow is used to publish the Python SDK to TestPyPI. You do not need to upgrade the +# version number to use this workflow. +# Only upgrade the version number when you are ready to publish to PyPI. +# The script will automatically add an "rc" suffix to the version number for test.pypi.org releases. + +name: Publish Python SDK to TestPyPI + +on: + workflow_dispatch: + inputs: + ref: + description: "Publish the given Git ref to test.pypi.org (branch, tag, or commit SHA)" + required: true + type: string + default: "main" + +jobs: + build-and-publish-test: + runs-on: ubuntu-latest + + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + PYPI_REPO: testpypi + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.ref }} + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install build dependencies + run: make install-build-deps + - name: Build + run: make build + - name: Test wheel + run: make test-wheel + - name: Publish to TestPyPI + run: make _publish diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e511695 --- /dev/null +++ b/Makefile @@ -0,0 +1,48 @@ +install-build-deps: + pip install build twine + +build: clean _template-version + python -m build + # Restore the original version file after the build + git checkout src/layerlens/_version.py + +test-wheel: + pip install dist/*.whl
+ python -c "import layerlens; print('Package imported successfully')" + +clean: + rm -rf build dist + +_publish: + ./scripts/publish.sh + +_template-version: + @bash scripts/template-version.sh + +_check-git-clean: + @if [ -n "$$(git status --porcelain)" ]; then \ + echo "Error: Git working directory is not clean. Won't run publish."; \ + exit 1; \ + fi + +_verify-build-publish: _check-git-clean build test-wheel _publish + +publish-to-testpypi: export PYPI_REPO := testpypi +publish-to-testpypi: _verify-build-publish + +publish-to-pypi: export PYPI_REPO := pypi +publish-to-pypi: _verify-build-publish + +push-release-tag: + @bash scripts/push-release-tag.sh $(DRY_RUN) + +help: + @echo "Available targets:" + @echo " build - Build Python package" + @echo " clean - Remove build artifacts" + @echo " help - Show this help message" + @echo " install-build-deps - Install build dependencies for CI" + @echo " test-wheel - Run tests against built wheel" + @echo " publish-to-pypi - Publish to PyPI" + @echo " publish-to-testpypi - Publish to TestPyPI" + @echo " push-release-tag - Create and push a release tag" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ef86a87..f5efccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,14 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.version] +path = "src/layerlens/_version.py" +pattern = '__version__ = "(?P<version>[^"]+)"' + [project] name = "layerlens" -version = "1.2.0" +dynamic = ["version"] description = "The official Python library for the LayerLens Stratix API" license = "Apache-2.0" authors = [{ name = "LayerLens", email = "support@layerlens.ai" }] @@ -30,7 +38,6 @@ Repository = "https://github.com/LayerLens/stratix-python" [project.scripts] layerlens = "layerlens.cli:main" - [tool.rye] managed = true # version pins are in requirements-dev.lock @@ -41,6 +48,8 @@ dev-dependencies = [ "pytest-cov>=6.2.1", "ruff", "types-requests", + "build", +
"twine==6.1.0", ] [tool.rye.scripts] diff --git a/requirements-dev.lock b/requirements-dev.lock index 9c0f730..2aaa85b 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -4,7 +4,7 @@ # last locked with the following flags: # pre: false # features: [] -# all-features: false +# all-features: true # with-sources: false # generate-hashes: false # universal: false @@ -14,6 +14,9 @@ annotated-types==0.7.0 # via pydantic anyio==4.9.0 # via httpx +backports-tarfile==1.2.0 + # via jaraco-context +build==1.3.0 certifi==2025.7.14 # via httpcore # via httpx @@ -22,6 +25,8 @@ charset-normalizer==3.4.3 # via requests coverage==7.10.2 # via pytest-cov +docutils==0.22 + # via readme-renderer exceptiongroup==1.3.0 # via anyio # via pytest @@ -30,44 +35,86 @@ h11==0.16.0 httpcore==1.0.9 # via httpx httpx==0.28.1 - # via atlas + # via test-atlas-lzok +id==1.5.0 + # via twine idna==3.10 # via anyio # via httpx # via requests +importlib-metadata==8.7.0 + # via build + # via keyring + # via twine iniconfig==2.1.0 # via pytest +jaraco-classes==3.4.0 + # via keyring +jaraco-context==6.0.1 + # via keyring +jaraco-functools==4.2.1 + # via keyring +keyring==25.6.0 + # via twine +markdown-it-py==3.0.0 + # via rich +mdurl==0.1.2 + # via markdown-it-py +more-itertools==10.7.0 + # via jaraco-classes + # via jaraco-functools mypy==1.17.0 mypy-extensions==1.1.0 # via mypy +nh3==0.3.0 + # via readme-renderer nodeenv==1.9.1 # via pyright packaging==25.0 + # via build # via pytest + # via twine pathspec==0.12.1 # via mypy pluggy==1.6.0 # via pytest # via pytest-cov pydantic==2.11.7 - # via atlas + # via test-atlas-lzok pydantic-core==2.33.2 # via pydantic pygments==2.19.2 # via pytest + # via readme-renderer + # via rich +pyproject-hooks==1.2.0 + # via build pyright==1.1.399 pytest==8.4.1 # via pytest-cov pytest-cov==6.2.1 +readme-renderer==44.0 + # via twine requests==2.32.5 - # via atlas + # via id + # via layerlens + # via requests-toolbelt + # via twine +requests-toolbelt==1.0.0 + 
# via twine +rfc3986==2.0.0 + # via twine +rich==14.1.0 + # via twine ruff==0.12.7 sniffio==1.3.1 # via anyio tomli==2.2.1 + # via build # via coverage # via mypy # via pytest +twine==6.1.0 types-requests==2.32.4.20250809 typing-extensions==4.14.1 # via anyio @@ -81,4 +128,7 @@ typing-inspection==0.4.1 # via pydantic urllib3==2.5.0 # via requests + # via twine # via types-requests +zipp==3.23.0 + # via importlib-metadata diff --git a/requirements.lock b/requirements.lock index 887d3cc..540f4d6 100644 --- a/requirements.lock +++ b/requirements.lock @@ -4,7 +4,7 @@ # last locked with the following flags: # pre: false # features: [] -# all-features: false +# all-features: true # with-sources: false # generate-hashes: false # universal: false @@ -27,13 +27,13 @@ h11==0.16.0 httpcore==1.0.9 # via httpx httpx==0.28.1 - # via atlas + # via test-atlas-lzok idna==3.10 # via anyio # via httpx # via requests pydantic==2.11.7 - # via atlas + # via test-atlas-lzok pydantic-core==2.33.2 # via pydantic requests==2.32.5 diff --git a/scripts/get_version.sh b/scripts/get_version.sh index 42caa27..04ab2da 100755 --- a/scripts/get_version.sh +++ b/scripts/get_version.sh @@ -6,24 +6,16 @@ set -e ROOT_DIR=$(git rev-parse --show-toplevel) VERSION_FILE="$ROOT_DIR/src/layerlens/_version.py" -echo "Debug: ROOT_DIR=$ROOT_DIR" >&2 -echo "Debug: VERSION_FILE=$VERSION_FILE" >&2 - if [ ! 
-f "$VERSION_FILE" ]; then echo "Error: Version file not found at $VERSION_FILE" >&2 exit 1 fi -echo "Debug: File exists, content:" >&2 -cat "$VERSION_FILE" >&2 - VERSION=$(grep -E '^__version__\s*=' "$VERSION_FILE" | grep -o '".*"' | tr -d '"') -echo "Debug: Extracted version='$VERSION'" >&2 - if [ -z "$VERSION" ]; then echo "Error: Could not extract version from $VERSION_FILE" >&2 exit 1 fi -echo "$VERSION" \ No newline at end of file +echo "$VERSION" diff --git a/scripts/publish.sh b/scripts/publish.sh new file mode 100755 index 0000000..e6ac0f7 --- /dev/null +++ b/scripts/publish.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Publish the package to PyPI or TestPyPI depending +# on the PYPI_REPO (pypi | testpypi) environment variable + +if [ -z "$PYPI_REPO" ]; then + echo "Error: PYPI_REPO environment variable must be set" + exit 1 +fi + +if [ "$PYPI_REPO" != "pypi" ] && [ "$PYPI_REPO" != "testpypi" ]; then + echo "Error: PYPI_REPO must be either 'pypi' or 'testpypi'" + exit 1 +fi + +VERSION=$(bash scripts/get_version.sh) + +if [ -z "$VERSION" ]; then + echo "Error: Could not determine version" + exit 1 +fi + +echo "Publishing version $VERSION to $PYPI_REPO" + +twine upload --repository "$PYPI_REPO" dist/* \ No newline at end of file diff --git a/scripts/push-release-tag.sh b/scripts/push-release-tag.sh new file mode 100755 index 0000000..64ba944 --- /dev/null +++ b/scripts/push-release-tag.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -euo pipefail + +ROOT_DIR=$(git rev-parse --show-toplevel) + +# Parse command line arguments +DRY_RUN=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) + DRY_RUN=true + shift + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 [--dry-run]" + exit 1 + ;; + esac +done + +git fetch --tags --prune + +REPO_URL="https://github.com/LayerLens/atlas-python" +TAG_PREFIX="sdk-v" +COMMIT=$(git rev-parse --short HEAD) +VERSION=$(bash "$ROOT_DIR/scripts/get_version.sh") +TAG="${TAG_PREFIX}${VERSION}" + +if git rev-parse "$TAG" >/dev/null 2>&1; 
then + echo "Error: Tag $TAG already exists" + exit 1 +fi + +# Find the most recent version tag +LAST_RELEASE=$(git tag -l "${TAG_PREFIX}*" --sort=-v:refname | head -n 1) + +echo "================================================" +echo " Atlas Python SDK Release" +echo "================================================" +echo "version: ${TAG}" +echo "commit: ${COMMIT}" +echo "code: ${REPO_URL}/commit/${COMMIT}" +echo "changeset: ${REPO_URL}/compare/${LAST_RELEASE}...${COMMIT}" + +if [ "$DRY_RUN" = true ]; then + exit 0 +fi + +echo "" +echo "" +echo "Are you ready to release version ${VERSION}? Type 'YES' to continue:" +read -r CONFIRMATION + +if [ "$CONFIRMATION" != "YES" ]; then + echo "Release cancelled." + exit 1 +fi + +# Create and push the tag +echo "" +echo "Creating and pushing tag ${TAG}" +echo "" + +git tag "$TAG" "$COMMIT" +git push origin "$TAG" + +echo "" +echo "Tag ${TAG} has been created and pushed to origin. Check GitHub Actions for build progress:" +echo "https://github.com/LayerLens/atlas-python/actions/workflows/publish-sdk.yaml" +echo "" \ No newline at end of file diff --git a/scripts/template-version.sh b/scripts/template-version.sh new file mode 100644 index 0000000..d3d8b84 --- /dev/null +++ b/scripts/template-version.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -e + +VERSION_FILE="src/layerlens/_version.py" + +GIT_COMMIT=$(git rev-parse HEAD) + +sed_inplace() { + if [[ "$OSTYPE" == "darwin"* ]]; then + sed -i '' "$@" + else + sed -i "$@" + fi +} + +# Update git commit hash +sed_inplace "s/__GIT_COMMIT__/$GIT_COMMIT/g" "$VERSION_FILE" + +# Get current version +CURRENT_VERSION=$(grep '__version__ = ' "$VERSION_FILE" | cut -d'"' -f2) + +# If we're uploading to testpypi, add a run number to the version so we can +# test multiple times. 
+if [[ "$PYPI_REPO" == "testpypi" ]] && [[ -n "$GITHUB_RUN_NUMBER" ]]; then + NEW_VERSION="${CURRENT_VERSION}rc${GITHUB_RUN_NUMBER}" + sed_inplace "s/__version__ = \".*\"/__version__ = \"$NEW_VERSION\"/" "$VERSION_FILE" +fi diff --git a/scripts/validate-release-tag.sh b/scripts/validate-release-tag.sh new file mode 100755 index 0000000..175464e --- /dev/null +++ b/scripts/validate-release-tag.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Validate release requirements +# - Checks if the tag matches naming convention (sdk-v*.*.*) +# - Checks if the tag matches the version in the package +# - Ensures we're releasing from the release branch + +set -e + +# Get the tag from the first command line argument +if [ $# -eq 0 ]; then + echo "ERROR: Release tag argument not provided" + echo "Usage: $0 <release-tag>" + exit 1 +fi + +ROOT_DIR=$(git rev-parse --show-toplevel) + +# Fetch the latest tags to ensure we're up to date +git fetch --tags --prune --force + +TAG=$1 + +# Check if tag starts with sdk-v +if [[ ! "$TAG" =~ ^sdk-v ]]; then + echo "ERROR: Tag must start with 'sdk-v'" + exit 1 +fi + +# Extract version without the 'sdk-v' prefix +VERSION=${TAG#sdk-v} + +PACKAGE_VERSION=$(bash "$ROOT_DIR/scripts/get_version.sh") + +# Check if the tag version matches the package version +if [ "$VERSION" != "$PACKAGE_VERSION" ]; then + echo "ERROR: Tag version ($VERSION) does not match package version ($PACKAGE_VERSION)" + exit 1 +fi + +CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) +if [ "$CURRENT_BRANCH" != "release" ]; then + # If we're in detached HEAD state (which is likely in GitHub Actions with a tag), + # we need to check if the tag is on the release branch + if ! git rev-parse "$TAG" &>/dev/null; then + echo "ERROR: Tag $TAG does not exist in the repository" + exit 1 + fi + + TAG_COMMIT=$(git rev-parse "$TAG") + + # Ensure we have release branch history + git fetch origin release --depth=1000 + + # Check if tag is on release branch + if !
git merge-base --is-ancestor "$TAG_COMMIT" origin/release; then + echo "ERROR: Tag $TAG is not on the release branch" + exit 1 + fi +fi + +# All checks passed +exit 0 \ No newline at end of file diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py index c68196d..8fb65ee 100644 --- a/src/layerlens/_version.py +++ b/src/layerlens/_version.py @@ -1 +1,4 @@ __version__ = "1.2.0" + +# Will be templated during the build +__git_commit__ = "__GIT_COMMIT__"