From bc9b0825ebfa4f37f4e5060aba1a3870f1f29058 Mon Sep 17 00:00:00 2001 From: Marin Peko <26385728+m-peko@users.noreply.github.com> Date: Wed, 11 Mar 2026 12:17:20 +0100 Subject: [PATCH] Merge pull request #57 from LayerLens/bugfix/minor-issues Bugfix/minor issues --- README.md | 266 +++++++++++++++ docs/README.md | 282 ++++++++++++---- docs/SUMMARY.md | 6 +- docs/api-reference/models-benchmarks.md | 14 + docs/api-reference/traces.md | 8 + docs/examples/README.md | 39 +++ docs/examples/creating-evaluations.md | 337 ++++++++++++++----- docs/examples/judges-and-traces.md | 301 ++++++++++------- docs/examples/models-and-benchmarks.md | 256 ++++++++++++++ docs/examples/public-api.md | 279 +++++++++++++++ docs/examples/retrieving-results.md | 194 +++++++++++ examples/all_results_no_pagination.py | 2 +- examples/async_client.py | 2 +- examples/async_client_simple.py | 2 +- examples/async_judges_and_traces.py | 2 +- examples/async_run_evaluations.py | 2 +- examples/client.py | 2 +- examples/client_simple.py | 2 +- examples/compare_evaluations.py | 2 +- examples/create_custom_benchmark.py | 2 +- examples/create_custom_model.py | 6 +- examples/create_smart_benchmark.py | 2 +- examples/evaluation_sorting.py | 2 +- examples/fetch_results_async.py | 2 +- examples/get_benchmarks.py | 2 +- examples/get_evaluation.py | 2 +- examples/get_models.py | 2 +- examples/judge_optimizations.py | 2 +- examples/judges.py | 2 +- examples/manage_project_models_benchmarks.py | 2 +- examples/paginated_results.py | 2 +- examples/public_benchmarks.py | 2 +- examples/public_evaluations.py | 2 +- examples/public_models.py | 2 +- examples/trace_evaluations.py | 2 +- examples/traces.py | 2 +- pyproject.toml | 4 +- src/layerlens/_base_client.py | 128 ++++--- src/layerlens/_client.py | 12 +- src/layerlens/_constants.py | 2 + src/layerlens/_public_client.py | 6 +- src/layerlens/_version.py | 2 +- src/layerlens/cli.py | 16 + 43 files changed, 1861 insertions(+), 345 deletions(-) create mode 100644 
README.md create mode 100644 docs/examples/models-and-benchmarks.md create mode 100644 docs/examples/public-api.md create mode 100644 docs/examples/retrieving-results.md create mode 100644 src/layerlens/cli.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..a6de2ce --- /dev/null +++ b/README.md @@ -0,0 +1,266 @@ +# LayerLens Stratix Python SDK + +The official Python library for the [LayerLens Stratix](https://layerlens.ai) evaluation API. + +[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) + +## Installation + +```bash +pip install layerlens --extra-index-url https://sdk.layerlens.ai/package +``` + +## Authentication + +Set your API key as an environment variable: + +```bash +export LAYERLENS_STRATIX_API_KEY="your-api-key" +``` + +Or pass it directly when creating a client: + +```python +from layerlens import Stratix + +client = Stratix(api_key="your-api-key") +``` + +## Quick Start + +### Run an evaluation + +```python +import os +from layerlens import Stratix + +client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) + +# Get a model and benchmark by key +model = client.models.get_by_key("openai/gpt-4o") +benchmark = client.benchmarks.get_by_key("arc-agi-2") + +# Create an evaluation (pass the full model and benchmark objects) +evaluation = client.evaluations.create( + model=model, + benchmark=benchmark, +) + +# Wait for results (pass the evaluation object, not just the ID) +result = client.evaluations.wait_for_completion(evaluation) +print(f"Accuracy: {result.accuracy}") +``` + +### Async usage + +```python +import os +import asyncio +from layerlens import AsyncStratix + +async def main(): + client = AsyncStratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) + + model = await client.models.get_by_key("openai/gpt-4o") + benchmark = await 
client.benchmarks.get_by_key("arc-agi-2") + + evaluation = await client.evaluations.create( + model=model, + benchmark=benchmark, + ) + + result = await client.evaluations.wait_for_completion(evaluation) + print(f"Accuracy: {result.accuracy}") + +asyncio.run(main()) +``` + +### Public endpoints + +Public models, benchmarks, and evaluations are accessible through `client.public`. Note: the public client still requires an API key. + +```python +import os +from layerlens import Stratix + +client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) + +# Browse public models +models = client.public.models.get() +for model in models.models: + print(f"{model.key}: {model.name}") +``` + +Or instantiate the public client directly: + +```python +import os +from layerlens import PublicClient + +public = PublicClient(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) +models = public.models.get() +``` + +## Resources + +The SDK provides access to these resource types: + +| Resource | Description | +| ---------------------------- | ----------------------------------------------------------------------------- | +| `client.models` | Manage models (get, get_by_key, add, remove, create_custom) | +| `client.benchmarks` | Manage benchmarks (get, get_by_key, add, remove, create_custom, create_smart) | +| `client.evaluations` | Create evaluations and wait for results | +| `client.judges` | CRUD operations for evaluation judges | +| `client.traces` | Upload trace files and manage traces | +| `client.trace_evaluations` | Run trace-level evaluations with judges | +| `client.judge_optimizations` | Optimize judge configurations | +| `client.results` | Retrieve evaluation results | +| `client.public` | Public models, benchmarks, evaluations, and comparisons | + +Every resource is available in both sync (`Stratix`) and async (`AsyncStratix`) clients. 
+ +## Examples + +### Working with judges + +```python +# Create a judge (name and evaluation_goal are required) +judge = client.judges.create( + name="Response Quality Judge", + evaluation_goal="Rate whether the response is accurate, complete, and well-structured", +) + +# List judges (returns a JudgesResponse with .judges list) +response = client.judges.get_many() +for j in response.judges: + print(f"{j.name} (id: {j.id})") + +# Update a judge +client.judges.update(judge.id, name="Updated Judge Name") + +# Delete a judge +client.judges.delete(judge.id) +``` + +### Uploading and evaluating traces + +Trace upload works with JSON or JSONL files (up to 50 MB). The SDK handles presigned S3 uploads automatically. + +```python +# Upload a trace file (pass a file path, not raw data) +result = client.traces.upload("./my_traces.json") +print(f"Uploaded trace IDs: {result.trace_ids}") + +# List traces +traces = client.traces.get_many() +for t in traces.traces: + print(f"Trace {t.id}") + +# Create a trace evaluation +trace_eval = client.trace_evaluations.create( + trace_id=t.id, + judge_id=judge.id, +) + +# Get results +results = client.trace_evaluations.get_results(trace_eval.id) +``` + +### Custom models + +Custom models require an OpenAI-compatible API endpoint. 
+ +```python +response = client.models.create_custom( + name="My Fine-tuned Model", + key="my-org/custom-model-v1", + description="Fine-tuned GPT for medical Q&A", + api_url="https://my-api.example.com/v1", + max_tokens=4096, + api_key=os.environ.get("MY_PROVIDER_API_KEY"), # optional +) +print(f"Created model: {response.model_id}") +``` + +## Client aliases + +For backward compatibility, multiple import names are available: + +```python +from layerlens import Stratix # Primary +from layerlens import AsyncStratix # Async primary +from layerlens import Client # Alias for Stratix +from layerlens import AsyncClient # Alias for AsyncStratix +from layerlens import Atlas # Legacy alias +from layerlens import AsyncAtlas # Legacy alias +from layerlens import PublicClient # Public endpoints +from layerlens import AsyncPublicClient +``` + +## Configuration + +| Environment Variable | Description | Default | +| ---------------------------- | ------------------------- | --------------------------------- | +| `LAYERLENS_STRATIX_API_KEY` | Your API key | (required) | +| `LAYERLENS_STRATIX_BASE_URL` | Override the API base URL | `https://api.layerlens.ai/api/v1` | + +Legacy env vars (`LAYERLENS_ATLAS_API_KEY`, `LAYERLENS_ATLAS_BASE_URL`) are also supported. + +## Error handling + +The SDK raises typed exceptions for API errors: + +```python +import os +from layerlens import Stratix, StratixError, APIError, BadRequestError, NotFoundError + +client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) + +try: + result = client.models.get_by_id("nonexistent-id") +except NotFoundError as e: + print(f"Not found (HTTP {e.status_code}): {e.message}") +except BadRequestError as e: + print(f"Bad request: {e.message}") +except APIError as e: + print(f"API error: {e.message}") +except StratixError as e: + print(f"Client error: {e}") +``` + +Catch the most specific exception first. 
The hierarchy: + +- `StratixError` (base for all SDK errors) + - `APIError` (base for all API-related errors) + - `APIConnectionError` (network issues) + - `APITimeoutError` (request timed out) + - `APIResponseValidationError` (response didn't match expected schema) + - `APIStatusError` (HTTP 4xx/5xx) + - `BadRequestError` (400) + - `AuthenticationError` (401) + - `PermissionDeniedError` (403) + - `NotFoundError` (404) + - `ConflictError` (409) + - `UnprocessableEntityError` (422) + - `RateLimitError` (429) + - `InternalServerError` (500+) + +Note: Only `StratixError`, `APIError`, `BadRequestError`, `AuthenticationError`, and `NotFoundError` are exported from the top-level package. For other exception types, import from `layerlens._exceptions`. + +## Requirements + +- Python 3.8+ +- Dependencies: `httpx`, `pydantic`, `requests` + +## Documentation + +Full API reference and examples are available in the [docs/](docs/) directory: + +- [API Reference](docs/api-reference/) (client config, all resource methods, error handling) +- [Code Examples](docs/examples/) (evaluations, judges, traces) +- [Troubleshooting](docs/troubleshooting/) (auth issues, error codes) + +## License + +Apache 2.0. See [LICENSE](LICENSE) for details. diff --git a/docs/README.md b/docs/README.md index d1dbfa4..a6de2ce 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,98 +1,266 @@ -# Layerlens Python SDK Documentation +# LayerLens Stratix Python SDK -Welcome to the official documentation for the Layerlens Python SDK for the Stratix platform. This library provides convenient programmatic to the Stratix platform from any Python 3.8+ application. +The official Python library for the [LayerLens Stratix](https://layerlens.ai) evaluation API. -## What is Stratix? 
+[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) -Stratix is an evaluation platform that allows you to benchmark AI models against various datasets and metrics. The Python SDK provides two HTTP clients (syncronous and asynchronous) powered by [httpx](https://github.com/encode/httpx) and [Pydantic](https://pydantic.dev/) models for type-safe API interactions. +## Installation -## Quick Start +```bash +pip install layerlens --extra-index-url https://sdk.layerlens.ai/package +``` -### Install LayerLens python sdk +## Authentication -Install the layerlens python sdk using the following command +Set your API key as an environment variable: ```bash -pip install layerlens --index-url https://sdk.layerlens.ai/package +export LAYERLENS_STRATIX_API_KEY="your-api-key" ``` -### Generate an api key on the Stratix platform +Or pass it directly when creating a client: -Login to your organization at [app.layerlens.ai](https://app.layerlens.ai) to generate an api key. Admin users of organizations can generate a keys in the settings page. - -Run this command to add your API key to your environment: +```python +from layerlens import Stratix -```bash -export LAYERLENS_STRATIX_API_KEY="YOUR_API_KEY" +client = Stratix(api_key="your-api-key") ``` -### Running an evaluation on the Stratix platform - -Before triggering an evaluation using the sdk, login to your organization at [app.layerlens.ai](https://app.layerlens.ai) to ensure that the model and benchmark you are trying to evaluate has been added to your organizations dashboard. 
+## Quick Start -#### Using synchronous client +### Run an evaluation ```python +import os from layerlens import Stratix - # Construct sync client - client = Stratix() +client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) - # --- Models replace with the model key you want to run - model = client.models.get_by_key("openai/gpt-4o") +# Get a model and benchmark by key +model = client.models.get_by_key("openai/gpt-4o") +benchmark = client.benchmarks.get_by_key("arc-agi-2") - if not model: - print("Model not found") +# Create an evaluation (pass the full model and benchmark objects) +evaluation = client.evaluations.create( + model=model, + benchmark=benchmark, +) - # --- Benchmarks replace with the benchmark name you want to run - benchmark = client.benchmarks.get_by_key("aime2024") +# Wait for results (pass the evaluation object, not just the ID) +result = client.evaluations.wait_for_completion(evaluation) +print(f"Accuracy: {result.accuracy}") +``` + +### Async usage - if not benchmark: - print("benchmark not found") +```python +import os +import asyncio +from layerlens import AsyncStratix - # --- Create evaluation - evaluation = client.evaluations.create( +async def main(): + client = AsyncStratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) + + model = await client.models.get_by_key("openai/gpt-4o") + benchmark = await client.benchmarks.get_by_key("arc-agi-2") + + evaluation = await client.evaluations.create( model=model, benchmark=benchmark, ) + + result = await client.evaluations.wait_for_completion(evaluation) + print(f"Accuracy: {result.accuracy}") + +asyncio.run(main()) ``` -#### Using Async Client +### Public endpoints + +Public models, benchmarks, and evaluations are accessible through `client.public`. Note: the public client still requires an API key. 
```python -import asyncio -from layerlens import AsyncStratix +import os +from layerlens import Stratix -async def run_evaluation_async(): - # Construct async client - client = AsyncStratix() +client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) - # --- Model to use - model = await client.models.get_by_key("openai/gpt-4o") +# Browse public models +models = client.public.models.get() +for model in models.models: + print(f"{model.key}: {model.name}") +``` - if not model: - print("Model not found") - return +Or instantiate the public client directly: - # --- Benchmark to use - benchmark = await client.benchmarks.get_by_key("aime2024") +```python +import os +from layerlens import PublicClient - if not benchmark: - print("benchmark not found") - return +public = PublicClient(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) +models = public.models.get() +``` - # --- Create evaluation - evaluation = await client.evaluations.create( - model=model, - benchmark=benchmark, - ) +## Resources + +The SDK provides access to these resource types: + +| Resource | Description | +| ---------------------------- | ----------------------------------------------------------------------------- | +| `client.models` | Manage models (get, get_by_key, add, remove, create_custom) | +| `client.benchmarks` | Manage benchmarks (get, get_by_key, add, remove, create_custom, create_smart) | +| `client.evaluations` | Create evaluations and wait for results | +| `client.judges` | CRUD operations for evaluation judges | +| `client.traces` | Upload trace files and manage traces | +| `client.trace_evaluations` | Run trace-level evaluations with judges | +| `client.judge_optimizations` | Optimize judge configurations | +| `client.results` | Retrieve evaluation results | +| `client.public` | Public models, benchmarks, evaluations, and comparisons | + +Every resource is available in both sync (`Stratix`) and async (`AsyncStratix`) clients. 
-if __name__ == "__main__": - asyncio.run(main()) +## Examples + +### Working with judges + +```python +# Create a judge (name and evaluation_goal are required) +judge = client.judges.create( + name="Response Quality Judge", + evaluation_goal="Rate whether the response is accurate, complete, and well-structured", +) + +# List judges (returns a JudgesResponse with .judges list) +response = client.judges.get_many() +for j in response.judges: + print(f"{j.name} (id: {j.id})") + +# Update a judge +client.judges.update(judge.id, name="Updated Judge Name") + +# Delete a judge +client.judges.delete(judge.id) +``` + +### Uploading and evaluating traces + +Trace upload works with JSON or JSONL files (up to 50 MB). The SDK handles presigned S3 uploads automatically. + +```python +# Upload a trace file (pass a file path, not raw data) +result = client.traces.upload("./my_traces.json") +print(f"Uploaded trace IDs: {result.trace_ids}") + +# List traces +traces = client.traces.get_many() +for t in traces.traces: + print(f"Trace {t.id}") + +# Create a trace evaluation +trace_eval = client.trace_evaluations.create( + trace_id=t.id, + judge_id=judge.id, +) + +# Get results +results = client.trace_evaluations.get_results(trace_eval.id) +``` + +### Custom models + +Custom models require an OpenAI-compatible API endpoint. 
+ +```python +response = client.models.create_custom( + name="My Fine-tuned Model", + key="my-org/custom-model-v1", + description="Fine-tuned GPT for medical Q&A", + api_url="https://my-api.example.com/v1", + max_tokens=4096, + api_key=os.environ.get("MY_PROVIDER_API_KEY"), # optional +) +print(f"Created model: {response.model_id}") +``` + +## Client aliases + +For backward compatibility, multiple import names are available: + +```python +from layerlens import Stratix # Primary +from layerlens import AsyncStratix # Async primary +from layerlens import Client # Alias for Stratix +from layerlens import AsyncClient # Alias for AsyncStratix +from layerlens import Atlas # Legacy alias +from layerlens import AsyncAtlas # Legacy alias +from layerlens import PublicClient # Public endpoints +from layerlens import AsyncPublicClient +``` + +## Configuration + +| Environment Variable | Description | Default | +| ---------------------------- | ------------------------- | --------------------------------- | +| `LAYERLENS_STRATIX_API_KEY` | Your API key | (required) | +| `LAYERLENS_STRATIX_BASE_URL` | Override the API base URL | `https://api.layerlens.ai/api/v1` | + +Legacy env vars (`LAYERLENS_ATLAS_API_KEY`, `LAYERLENS_ATLAS_BASE_URL`) are also supported. + +## Error handling + +The SDK raises typed exceptions for API errors: + +```python +import os +from layerlens import Stratix, StratixError, APIError, BadRequestError, NotFoundError + +client = Stratix(api_key=os.environ.get("LAYERLENS_STRATIX_API_KEY")) + +try: + result = client.models.get_by_id("nonexistent-id") +except NotFoundError as e: + print(f"Not found (HTTP {e.status_code}): {e.message}") +except BadRequestError as e: + print(f"Bad request: {e.message}") +except APIError as e: + print(f"API error: {e.message}") +except StratixError as e: + print(f"Client error: {e}") ``` -## Next steps +Catch the most specific exception first. 
The hierarchy: + +- `StratixError` (base for all SDK errors) + - `APIError` (base for all API-related errors) + - `APIConnectionError` (network issues) + - `APITimeoutError` (request timed out) + - `APIResponseValidationError` (response didn't match expected schema) + - `APIStatusError` (HTTP 4xx/5xx) + - `BadRequestError` (400) + - `AuthenticationError` (401) + - `PermissionDeniedError` (403) + - `NotFoundError` (404) + - `ConflictError` (409) + - `UnprocessableEntityError` (422) + - `RateLimitError` (429) + - `InternalServerError` (500+) + +Note: Only `StratixError`, `APIError`, `BadRequestError`, `AuthenticationError`, and `NotFoundError` are exported from the top-level package. For other exception types, import from `layerlens._exceptions`. + +## Requirements + +- Python 3.8+ +- Dependencies: `httpx`, `pydantic`, `requests` + +## Documentation + +Full API reference and examples are available in this directory: + +- [API Reference](api-reference/) (client config, all resource methods, error handling) +- [Code Examples](examples/) (evaluations, judges, traces) +- [Troubleshooting](troubleshooting/) (auth issues, error codes) + +## License -- **[API Reference](api-reference/)** - Complete documentation of all available methods -- **[Code Examples](examples/)** - Practical examples for common use cases -- **[Troubleshooting](troubleshooting/)** - Solutions to common issues +Apache 2.0. See [LICENSE](../LICENSE) for details. 
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index a89a4b7..43bc303 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -21,10 +21,10 @@ ## Code Examples * [Creating Evaluations](examples/creating-evaluations.md) -* [Judges and Traces](examples/judges-and-traces.md) * [Retrieving Results](examples/retrieving-results.md) -* [Working with Timeouts](examples/timeouts.md) -* [Advanced Usage Patterns](examples/advanced-usage.md) +* [Models and Benchmarks](examples/models-and-benchmarks.md) +* [Judges and Traces](examples/judges-and-traces.md) +* [Public API](examples/public-api.md) ## Troubleshooting * [Common Issues](troubleshooting/common-issues.md) diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md index 935cbbc..550190c 100644 --- a/docs/api-reference/models-benchmarks.md +++ b/docs/api-reference/models-benchmarks.md @@ -132,6 +132,13 @@ Removes models from the project by their IDs. Returns `bool` - `True` if the operation succeeded, `False` otherwise. +#### Example + +```python +client = Stratix() +success = client.models.remove("model-id-1", "model-id-2") +``` + ### `create_custom(name, key, description, api_url, max_tokens, api_key=None, timeout=None)` Creates a custom model backed by an OpenAI-compatible API endpoint. This allows you to evaluate any model accessible via a chat completions endpoint. @@ -271,6 +278,13 @@ Removes benchmarks from the project by their IDs. Returns `bool` - `True` if the operation succeeded, `False` otherwise. +#### Example + +```python +client = Stratix() +success = client.benchmarks.remove("benchmark-id-1", "benchmark-id-2") +``` + ### `create_custom(name, description, file_path, additional_metrics=None, custom_scorer_ids=None, input_type=None, timeout=None)` Creates a custom benchmark by uploading a JSONL file. The file should contain one JSON object per line with `input` and `truth` fields. 
diff --git a/docs/api-reference/traces.md b/docs/api-reference/traces.md index ce4bfd8..9114fef 100644 --- a/docs/api-reference/traces.md +++ b/docs/api-reference/traces.md @@ -168,6 +168,14 @@ Deletes a trace by its unique identifier. Returns `True` if the trace was deleted, `False` otherwise. +#### Example + +```python +deleted = client.traces.delete("trace-abc123") +if deleted: + print("Trace deleted successfully") +``` + ### `get_sources(timeout=None)` Retrieves the list of available trace sources for the current project. diff --git a/docs/examples/README.md b/docs/examples/README.md index 65afe60..35f1cc7 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -1,2 +1,41 @@ # Examples +This section provides practical code examples for common SDK use cases. All examples are available as runnable scripts in the [`examples/`](../../examples/) directory. + +## Quick Reference + +| Example | Description | +| ------- | ----------- | +| [`client_simple.py`](../../examples/client_simple.py) | Minimal sync client usage | +| [`client.py`](../../examples/client.py) | Full sync evaluation workflow | +| [`async_client_simple.py`](../../examples/async_client_simple.py) | Minimal async client usage | +| [`async_client.py`](../../examples/async_client.py) | Full async evaluation workflow | +| [`async_run_evaluations.py`](../../examples/async_run_evaluations.py) | Run multiple evaluations in parallel | +| [`get_models.py`](../../examples/get_models.py) | Filter models by name, company, region, type | +| [`get_benchmarks.py`](../../examples/get_benchmarks.py) | Filter benchmarks by name and type | +| [`get_evaluation.py`](../../examples/get_evaluation.py) | Fetch an evaluation by ID | +| [`evaluation_sorting.py`](../../examples/evaluation_sorting.py) | Sort and filter evaluations | +| [`compare_evaluations.py`](../../examples/compare_evaluations.py) | Compare two models on a benchmark | +| [`paginated_results.py`](../../examples/paginated_results.py) | Paginate 
through evaluation results | +| [`all_results_no_pagination.py`](../../examples/all_results_no_pagination.py) | Fetch all results at once | +| [`fetch_results_async.py`](../../examples/fetch_results_async.py) | Fetch results for multiple evaluations concurrently | +| [`create_custom_model.py`](../../examples/create_custom_model.py) | Create a custom model with an OpenAI-compatible API | +| [`create_custom_benchmark.py`](../../examples/create_custom_benchmark.py) | Create a custom benchmark from a JSONL file | +| [`create_smart_benchmark.py`](../../examples/create_smart_benchmark.py) | Create an AI-generated benchmark from documents | +| [`manage_project_models_benchmarks.py`](../../examples/manage_project_models_benchmarks.py) | Add/remove models and benchmarks from a project | +| [`judges.py`](../../examples/judges.py) | Create, list, update, and delete judges | +| [`traces.py`](../../examples/traces.py) | Upload, list, get, and delete traces | +| [`trace_evaluations.py`](../../examples/trace_evaluations.py) | Run judges on traces, estimate cost, get results | +| [`async_judges_and_traces.py`](../../examples/async_judges_and_traces.py) | Async judge and trace evaluation workflow | +| [`judge_optimizations.py`](../../examples/judge_optimizations.py) | Estimate, run, and apply judge optimizations | +| [`public_models.py`](../../examples/public_models.py) | Browse, search, and filter public models | +| [`public_benchmarks.py`](../../examples/public_benchmarks.py) | Browse public benchmarks and download prompts | +| [`public_evaluations.py`](../../examples/public_evaluations.py) | Get public evaluation details and results | + +## Guides + +- [Creating Evaluations](creating-evaluations.md) - Sync, async, and parallel evaluations +- [Retrieving Results](retrieving-results.md) - Paginated, bulk, and concurrent result fetching +- [Models and Benchmarks](models-and-benchmarks.md) - Filtering, custom models, custom/smart benchmarks, project management +- [Judges and 
Traces](judges-and-traces.md) - Judge CRUD, trace uploads, trace evaluations, and optimizations +- [Public API](public-api.md) - Public models, benchmarks, evaluations, and comparisons diff --git a/docs/examples/creating-evaluations.md b/docs/examples/creating-evaluations.md index ffec8cf..47f5adc 100644 --- a/docs/examples/creating-evaluations.md +++ b/docs/examples/creating-evaluations.md @@ -1,14 +1,14 @@ # Creating Evaluations -Examples for creating evaluations on the Stratix platform using the Layerlens python sdk. +Examples for creating evaluations on the Stratix platform using the LayerLens Python SDK. -> Before running the below examples ensure the model and benchmark being run are present on your organiztion. +> Before running the examples below, ensure that the model and benchmark you want to evaluate have been added to your organization. ## Basic Evaluation ### Using Synchronous Client -Below is an example showing how to trigger an evaluation, waiting for it to complete and finally fetching the evaluations results. 
+> Source: [`examples/client.py`](../../examples/client.py) ```python from layerlens import Stratix @@ -16,23 +16,20 @@ from layerlens import Stratix # Construct sync client (API key from env or inline) client = Stratix() -# --- Models replace with the model key you want to run -model = client.models.get_by_key("openai/gpt-4o") +# --- Models +models = client.models.get() +print(f"Found {len(models)} models") -if not model: - print("Model not found") - -# --- Benchmarks replace with the benchmark name you want to run -benchmark = client.benchmarks.get_by_key("aime2024") - -if not benchmark: - print("benchmark not found") +# --- Benchmarks +benchmarks = client.benchmarks.get() +print(f"Found {len(benchmarks)} benchmarks") # --- Create evaluation evaluation = client.evaluations.create( - model=model, - benchmark=benchmark, + model=models[0], + benchmark=benchmarks[0], ) +print(f"Created evaluation {evaluation.id}, status={evaluation.status}") # --- Wait for completion evaluation = client.evaluations.wait_for_completion( @@ -40,17 +37,41 @@ evaluation = client.evaluations.wait_for_completion( interval_seconds=10, timeout_seconds=600, # 10 minutes ) +print(f"Evaluation {evaluation.id} finished with status={evaluation.status}") # --- Results if evaluation.is_success: - # Loads the first page of results results = client.results.get(evaluation=evaluation) print("Results:", results) +else: + print("Evaluation did not succeed, no results to show.") +``` + +### Minimal Sync Example + +> Source: [`examples/client_simple.py`](../../examples/client_simple.py) + +```python +from layerlens import Stratix + +client = Stratix() + +models = client.models.get(type="public", name="gpt-4o") +model = models[0] + +benchmarks = client.benchmarks.get(type="public", name="simpleQA") +benchmark = benchmarks[0] +evaluation = client.evaluations.create( + model=model, + benchmark=benchmark, +) ``` ### Using Async Client +> Source: 
[`examples/async_client_simple.py`](../../examples/async_client_simple.py) + ```python import asyncio @@ -58,65 +79,128 @@ from layerlens import AsyncStratix async def main(): - # Construct async client client = AsyncStratix() - # --- Model to use - model = await client.models.get_by_key("openai/gpt-4o") + models = await client.models.get() + print(f"Found {len(models)} models") - if not model: - print("Model not found") - return + benchmarks = await client.benchmarks.get() + print(f"Found {len(benchmarks)} benchmarks") - # --- Benchmark to use - benchmark = await client.benchmarks.get_by_key("aime2024") + evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0]) + print(f"Created evaluation {evaluation.id}, status={evaluation.status}") - if not benchmark: - print("benchmark not found") - return + await evaluation.wait_for_completion_async(interval_seconds=10, timeout_seconds=600) + print(f"Evaluation {evaluation.id} finished with status={evaluation.status}") + if evaluation.is_success: + results = await evaluation.get_results_async() + print("Results:", results) + else: + print("Evaluation did not succeed, no results to show.") - # --- Create evaluation - evaluation = await client.evaluations.create(model=model, benchmark=benchmark) +if __name__ == "__main__": + asyncio.run(main()) +``` - await evaluation.wait_for_completion_async(interval_seconds=10) +## Sorting and Filtering Evaluations - # --- Results - if evaluation.is_success: - results = await evaluation.get_results_async() +> Source: [`examples/evaluation_sorting.py`](../../examples/evaluation_sorting.py) + +```python +import asyncio + +from layerlens import AsyncStratix +from layerlens.models import EvaluationStatus + + +async def main(): + client = AsyncStratix() + + # --- Sort by accuracy (highest first) + response = await client.evaluations.get_many( + sort_by="accuracy", + order="desc", + page_size=10, + ) + if response: + print(f"Top {len(response.evaluations)} evaluations 
by accuracy:") + for evaluation in response.evaluations: + print(f" - {evaluation.id}: accuracy={evaluation.accuracy:.2f}%") + + # --- Filter by status (only successful) + response = await client.evaluations.get_many( + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + ) + if response: + print(f"Successful evaluations: {response.pagination.total_count}") + + # --- Filter by model or benchmark IDs + response = await client.evaluations.get_many( + model_ids=["your-model-id"], + sort_by="accuracy", + order="desc", + ) + + # --- Combine sorting, filtering, and pagination + response = await client.evaluations.get_many( + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + page=1, + page_size=20, + ) + if response: + print(f"Page 1: {response.pagination.total_count} total, {response.pagination.total_pages} pages") if __name__ == "__main__": asyncio.run(main()) ``` -## Error Handling +## Comparing Evaluations + +> Source: [`examples/compare_evaluations.py`](../../examples/compare_evaluations.py) ```python -from layerlens import Stratix -import layerlens +from layerlens import PublicClient -client = Stratix() +client = PublicClient() -try: - models = client.models.get() - benchmarks = client.benchmarks.get() +# Compare two models on a benchmark +comparison = client.comparisons.compare_models( + benchmark_id="682bddc1e014f9fa440f8a91", + model_id_1="699f9761e014f9c3072b0513", + model_id_2="699f9761e014f9c3072b0512", + page=1, + page_size=10, +) - evaluation = client.evaluations.create( - model=models[0], - benchmark=benchmarks[0] - ) +if comparison: + print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") -except layerlens.AuthenticationError: - print("Check your API key") -except layerlens.NotFoundError: - print("Model or benchmark not found") -except layerlens.APIError as e: - print(f"API error: {e}") +# Filter: 
where model 1 fails but model 2 succeeds +comparison = client.comparisons.compare_models( + benchmark_id="682bddc1e014f9fa440f8a91", + model_id_1="699f9761e014f9c3072b0513", + model_id_2="699f9761e014f9c3072b0512", + outcome_filter="reference_fails", +) + +# Or compare using evaluation IDs directly +comparison = client.comparisons.compare( + evaluation_id_1="699f9938a03d70bf6607081f", + evaluation_id_2="699f991ca782d00ebd666ba1", +) ``` -## Triggering Multiple Evaluations +## Running Multiple Evaluations in Parallel + +> Source: [`examples/async_run_evaluations.py`](../../examples/async_run_evaluations.py) ```python import asyncio @@ -125,81 +209,136 @@ from layerlens import AsyncStratix async def create_and_run_evaluation(client, model, benchmark, eval_number): - """Create and run a single evaluation, tracking progress.""" try: - print(f"Starting evaluation #{eval_number}...") - - # Create evaluation evaluation = await client.evaluations.create(model=model, benchmark=benchmark) - # Wait for completion evaluation = await client.evaluations.wait_for_completion( evaluation, interval_seconds=10, - timeout_seconds=600, # 10 minutes + timeout_seconds=600, ) - # Get results if successful if evaluation.is_success: results = await client.results.get_all(evaluation=evaluation) - return results + print(f"Evaluation #{eval_number} completed with {len(results)} results") + return eval_number, evaluation.id, len(results), True else: - return None + return eval_number, evaluation.id, 0, False except Exception as e: - print(f"✗ Error in evaluation #{eval_number}: {e}") + print(f"Error in evaluation #{eval_number}: {e}") return eval_number, None, 0, False async def main(): - # Construct async client client = AsyncStratix() - # --- Models models = await client.models.get() + benchmarks = await client.benchmarks.get() + + num_evaluations = 3 + tasks = [ + create_and_run_evaluation(client, models[0], benchmarks[0], i + 1) + for i in range(num_evaluations) + ] + + results = await 
asyncio.gather(*tasks, return_exceptions=True) - # --- Benchmarks + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Fetching Results + +### Paginated Results + +> Source: [`examples/paginated_results.py`](../../examples/paginated_results.py) + +```python +import asyncio + +from layerlens import AsyncStratix + + +async def main(): + client = AsyncStratix() + + models = await client.models.get() benchmarks = await client.benchmarks.get() - # Use first model and benchmark for all evaluations - target_model = models[0] - target_benchmark = benchmarks[0] + evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0]) + evaluation = await client.evaluations.wait_for_completion(evaluation, interval_seconds=10, timeout_seconds=600) - print(f"Using model: {target_model}") - print(f"Using benchmark: {target_benchmark}") + if evaluation.is_success: + all_results = [] + page = 1 + page_size = 50 - # Create 3 evaluation tasks - num_evaluations = 3 - print(f"Starting {num_evaluations} evaluations in parallel...") + while True: + results_data = await client.results.get_by_id( + evaluation_id=evaluation.id, page=page, page_size=page_size + ) + + if not results_data or not results_data.results: + break - tasks = [create_and_run_evaluation(client, target_model, target_benchmark, i + 1) for i in range(num_evaluations)] + all_results.extend(results_data.results) - # Execute all evaluations concurrently - await asyncio.gather(*tasks, return_exceptions=True) + if page >= results_data.pagination.total_pages: + break + page += 1 + + print(f"Total results collected: {len(all_results)}") if __name__ == "__main__": asyncio.run(main()) ``` -## Fetching Results of Multiple Evaluations Async +### All Results Without Pagination + +> Source: [`examples/all_results_no_pagination.py`](../../examples/all_results_no_pagination.py) ```python +import asyncio + +from layerlens import AsyncStratix + + +async def main(): + client = AsyncStratix() + + models = await 
client.models.get() + benchmarks = await client.benchmarks.get() + + evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0]) + evaluation = await client.evaluations.wait_for_completion(evaluation, interval_seconds=10, timeout_seconds=600) + + # Fetch all results at once + results = await client.results.get_all(evaluation=evaluation) + print(f"Found {len(results)} results") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Fetch Results for Multiple Evaluations Concurrently + +> Source: [`examples/fetch_results_async.py`](../../examples/fetch_results_async.py) + +```python import asyncio from layerlens import AsyncStratix async def fetch_evaluation_results(client, evaluation_id): - """Fetch results for a single evaluation and print when loaded.""" try: - print(f"Fetching evaluation {evaluation_id}...") evaluation = await client.evaluations.get_by_id(evaluation_id) - # Get all results for this evaluation results = await client.results.get_all(evaluation=evaluation) print(f"Loaded {len(results)} results for evaluation {evaluation_id}") - return evaluation_id, results except Exception as e: print(f"Error fetching evaluation {evaluation_id}: {e}") @@ -207,26 +346,42 @@ async def fetch_evaluation_results(client, evaluation_id): async def main(): - # Construct async client client = AsyncStratix() - # List of example evaluation IDs to fetch - - evaluation_ids = ["68a65a3de7ad047fbd8e7d4", "688a54c673f6b2835cc7278"] + evaluation_ids = ["68a65a3de7ad047fb5d8e7d4", "688a254c673f6b2835cc7278"] - print(f"Starting async fetch for {len(evaluation_ids)} evaluations...") - - # Create tasks for concurrent execution tasks = [fetch_evaluation_results(client, eval_id) for eval_id in evaluation_ids] - - # Execute all tasks concurrently and print results as they complete results = await asyncio.gather(*tasks, return_exceptions=True) - print("=" * 80) - print("Summary:") successful = sum(1 for _, result in results if result is not None and not 
isinstance(result, Exception)) + print(f"Successfully fetched {successful}/{len(evaluation_ids)} evaluations") + if __name__ == "__main__": asyncio.run(main()) +``` +## Error Handling + +```python +from layerlens import Stratix +import layerlens + +client = Stratix() + +try: + models = client.models.get() + benchmarks = client.benchmarks.get() + + evaluation = client.evaluations.create( + model=models[0], + benchmark=benchmarks[0], + ) + +except layerlens.AuthenticationError: + print("Check your API key") +except layerlens.NotFoundError: + print("Model or benchmark not found") +except layerlens.APIError as e: + print(f"API error: {e}") ``` diff --git a/docs/examples/judges-and-traces.md b/docs/examples/judges-and-traces.md index 8ac976e..9d5e80b 100644 --- a/docs/examples/judges-and-traces.md +++ b/docs/examples/judges-and-traces.md @@ -1,245 +1,319 @@ # Judges and Traces -Examples for working with judges, traces, and trace evaluations on the Stratix platform using the Layerlens Python SDK. +Examples for working with judges, traces, and trace evaluations on the Stratix platform using the LayerLens Python SDK. 
## Creating and Managing Judges -### Basic Judge CRUD +> Source: [`examples/judges.py`](../../examples/judges.py) ```python +import time + from layerlens import Stratix client = Stratix() -# Fetch a model to use for the judge +# Fetch a model to use as the judge's LLM models = client.models.get(type="public", name="gpt-4o") model = models[0] +print(f"Using model: {model.name} ({model.id})") -# Create a judge (model_id is required) +# --- Create a judge judge = client.judges.create( - name="Code Quality Judge", + name=f"Code Quality Judge {int(time.time())}", evaluation_goal="Evaluate the quality of code output including correctness, readability, and style", model_id=model.id, ) -print(f"Created judge: {judge.name} (v{judge.version})") +print(f"Created judge {judge.id}: {judge.name}") -# Get a judge by ID +# --- Get a judge by ID judge = client.judges.get(judge.id) +print(f"Judge: {judge.name}, version: {judge.version}") -# List all judges with pagination -response = client.judges.get_many(page=1, page_size=50) +# --- List all judges +response = client.judges.get_many() +print(f"Found {response.total_count} judges") for j in response.judges: - print(f" {j.name}: v{j.version}, {j.run_count} runs") + print(f" - {j.name} (v{j.version}, {j.run_count} runs)") -# Update a judge (creates a new version) -client.judges.update( +# --- Update a judge (creates a new version) +updated = client.judges.update( judge.id, - evaluation_goal="Evaluate code for correctness, readability, style, and security", + name="Updated Code Quality Judge", + evaluation_goal="Evaluate code output for correctness, readability, style, and security", ) +print(f"Updated judge {updated.id}") -# Delete a judge -client.judges.delete(judge.id) +# --- Delete a judge +deleted = client.judges.delete(judge.id) +print(f"Deleted judge {deleted.id}") ``` ## Uploading and Managing Traces -### Upload Trace Files +> Source: [`examples/traces.py`](../../examples/traces.py) ```python +import os + from layerlens import 
Stratix client = Stratix() -# Upload a JSONL file containing multiple traces -result = client.traces.upload("./traces.jsonl") +# --- Upload traces from a file +traces_file = os.path.join(os.path.dirname(__file__), "traces.jsonl") +result = client.traces.upload(traces_file) print(f"Uploaded {len(result.trace_ids)} traces") -# Upload a single JSON trace -result = client.traces.upload("./single-trace.json") -``` - -### Browse and Filter Traces - -```python -from layerlens import Stratix - -client = Stratix() - -# List all traces +# --- List traces response = client.traces.get_many() -print(f"Total traces: {response.total_count}") +print(f"Found {response.total_count} traces") +for trace in response.traces[:5]: + print(f" - {trace.id}: {trace.filename}") -# Filter by time range and sort -response = client.traces.get_many( - time_range="7d", +# --- List traces with filters +filtered = client.traces.get_many( sort_by="created_at", sort_order="desc", + page_size=10, ) +print(f"Filtered traces: {filtered.count}") -# Search traces -response = client.traces.get_many(search="authentication") +# --- Get a single trace +trace = client.traces.get(result.trace_ids[0]) +print(f"Trace {trace.id}: {len(trace.data)} data keys") -# Get available sources +# --- Get available sources sources = client.traces.get_sources() print(f"Sources: {sources}") -``` - -### Get Trace Details - -```python -from layerlens import Stratix - -client = Stratix() -trace = client.traces.get("trace-abc123") -if trace: - print(f"Filename: {trace.filename}") - print(f"Created: {trace.created_at}") - print(f"Data keys: {list(trace.data.keys())}") +# --- Delete a trace +deleted = client.traces.delete(trace.id) +print(f"Deleted: {deleted}") ``` ## Running Trace Evaluations -### Estimate Cost Before Running +> Source: [`examples/trace_evaluations.py`](../../examples/trace_evaluations.py) ```python +import time + from layerlens import Stratix client = Stratix() -# Get trace IDs to evaluate -traces_response = 
client.traces.get_many(page_size=10) +# Fetch a model and create a judge +models = client.models.get(type="public", name="gpt-4o") +model = models[0] + +judge = client.judges.create( + name=f"Trace Eval Demo Judge {int(time.time())}", + evaluation_goal="Evaluate whether the response is accurate, complete, and well-structured", + model_id=model.id, +) +print(f"Created judge {judge.id}: {judge.name}") + +# --- Get existing traces to evaluate +traces_response = client.traces.get_many(page_size=3) trace_ids = [t.id for t in traces_response.traces] +print(f"Found {len(trace_ids)} traces to evaluate") -# Estimate cost +# --- Estimate cost before running estimate = client.trace_evaluations.estimate_cost( trace_ids=trace_ids, - judge_id="judge-123", + judge_id=judge.id, ) -print(f"Cost for {estimate.trace_count} traces: ${estimate.estimated_cost:.4f}") -print(f"Model: {estimate.model}") -``` - -### Run a Judge on a Trace +print(f"Estimated cost: ${estimate.estimated_cost:.4f} for {estimate.trace_count} traces") -```python -import time -from layerlens import Stratix - -client = Stratix() - -# Create an evaluation +# --- Run a judge on the first trace evaluation = client.trace_evaluations.create( - trace_id="trace-abc", - judge_id="judge-xyz", + trace_id=trace_ids[0], + judge_id=judge.id, ) -print(f"Evaluation {evaluation.id}: {evaluation.status}") +print(f"Created evaluation {evaluation.id}, status: {evaluation.status}") -# Wait for evaluation to complete (evaluations run asynchronously on the server) +# --- Wait for evaluation to complete for _ in range(30): evaluation = client.trace_evaluations.get(evaluation.id) + print(f"Evaluation status: {evaluation.status}") if evaluation.status.value in ("success", "failure"): break time.sleep(2) -# Get results (only available after evaluation completes) +# --- Get evaluation results try: results_response = client.trace_evaluations.get_results(evaluation.id) - if results_response: + if results_response and results_response.results: 
for result in results_response.results: - print(f"Score: {result.score}, Passed: {result.passed}") - print(f"Reasoning: {result.reasoning}") - print(f"Latency: {result.latency_ms}ms, Cost: ${result.total_cost:.4f}") - for step in result.steps: - print(f" Step {step.step}: {step.reasoning}") + print(f" Score: {result.score}, Passed: {result.passed}") + print(f" Reasoning: {result.reasoning}") + if result.steps: + for step in result.steps: + print(f" Step {step.step}: {step.reasoning}") + else: + print(" No results returned") except Exception: - print("Results not available yet") + print(" No results yet (evaluation may still be in progress)") + +# --- List all trace evaluations +response = client.trace_evaluations.get_many() +print(f"Found {response.total} trace evaluations") + +# --- Clean up +client.judges.delete(judge.id) ``` -### Browse Evaluation Results +## Judge Optimizations + +> Source: [`examples/judge_optimizations.py`](../../examples/judge_optimizations.py) + +Optimization requires that the judge has at least 10 annotations (trace evaluation results). Run trace evaluations first to build up annotation data. 
```python +import time + +import layerlens from layerlens import Stratix client = Stratix() -# List all evaluations -response = client.trace_evaluations.get_many() -print(f"Total evaluations: {response.total}") +models = client.models.get(type="public", name="gpt-4o") +model = models[0] -# Filter by judge and outcome -response = client.trace_evaluations.get_many( - judge_id="judge-123", - outcome="pass", - sort_by="created_at", - sort_order="desc", +judge = client.judges.create( + name=f"Optimization Demo Judge {int(time.time())}", + evaluation_goal="Evaluate whether the response is accurate, complete, and well-structured", + model_id=model.id, ) -# Filter by trace -response = client.trace_evaluations.get_many( - trace_id="trace-abc", +# --- Estimate cost +estimate = client.judge_optimizations.estimate( + judge_id=judge.id, + budget="medium", ) +if estimate: + print(f"Estimated cost: ${estimate.estimated_cost:.4f}") + print(f" Annotations: {estimate.annotation_count}, Budget: {estimate.budget}") + +# --- Create an optimization run +try: + run = client.judge_optimizations.create( + judge_id=judge.id, + budget="medium", + ) +except layerlens.BadRequestError as e: + print(f"Cannot start optimization: {e}") + print("Tip: Run trace evaluations with this judge first to build up annotations.") + client.judges.delete(judge.id) + exit(0) + +# --- Poll for completion +optimization = None +for i in range(60): + optimization = client.judge_optimizations.get(run.id) + if not optimization: + break + print(f" [{i * 5}s] Status: {optimization.status}") + if optimization.status.value in ("success", "failure"): + print(f" Baseline accuracy: {optimization.baseline_accuracy}") + print(f" Optimized accuracy: {optimization.optimized_accuracy}") + break + time.sleep(5) + +# --- List optimization runs +response = client.judge_optimizations.get_many(judge_id=judge.id) +if response: + print(f"Found {response.total} optimization runs") + +# --- Apply optimization results +if optimization and 
optimization.status.value == "success": + result = client.judge_optimizations.apply(run.id) + if result: + print(f"Applied optimization: new version v{result.new_version}") + +client.judges.delete(judge.id) ``` -## Async Workflows +## Async Judges and Traces -### Run Evaluations Concurrently +> Source: [`examples/async_judges_and_traces.py`](../../examples/async_judges_and_traces.py) ```python +import os +import time import asyncio + from layerlens import Stratix, AsyncStratix + async def main(): - # Fetch a model for judge creation + # Fetch a model using sync client sync_client = Stratix() models = sync_client.models.get(type="public", name="gpt-4o") model = models[0] client = AsyncStratix() - # Create a judge (model_id is required) + # --- Create a judge judge = await client.judges.create( - name="Response Quality Judge", - evaluation_goal="Evaluate whether the response is accurate and well-structured", + name=f"Response Quality Judge {int(time.time())}", + evaluation_goal="Evaluate whether the response is accurate, helpful, and well-structured", model_id=model.id, ) + print(f"Created judge {judge.id}: {judge.name}") - # Upload traces - result = await client.traces.upload("./traces.jsonl") + # --- Upload traces + traces_file = os.path.join(os.path.dirname(__file__), "traces.jsonl") + result = await client.traces.upload(traces_file) print(f"Uploaded {len(result.trace_ids)} traces") - # Get traces to evaluate - traces_response = await client.traces.get_many(page_size=5) - trace_ids = [t.id for t in traces_response.traces] + # --- List traces + traces_response = await client.traces.get_many(page_size=10) + trace_ids = [t.id for t in traces_response.traces[:5]] + + # --- Estimate cost + estimate = await client.trace_evaluations.estimate_cost( + trace_ids=trace_ids, + judge_id=judge.id, + ) + print(f"Estimated cost: ${estimate.estimated_cost:.4f}") - # Run evaluations concurrently - tasks = [ - client.trace_evaluations.create(trace_id=tid, judge_id=judge.id) - for 
tid in trace_ids - ] + # --- Run evaluations concurrently + tasks = [client.trace_evaluations.create(trace_id=tid, judge_id=judge.id) for tid in trace_ids] evaluations = await asyncio.gather(*tasks) for evaluation in evaluations: if evaluation: - print(f"Evaluation {evaluation.id}: {evaluation.status}") + print(f" Evaluation {evaluation.id}: {evaluation.status}") - # Wait for evaluations to complete, then fetch results + # --- Wait and fetch results await asyncio.sleep(10) for evaluation in evaluations: if not evaluation: continue try: results_response = await client.trace_evaluations.get_results(evaluation.id) - if results_response: + if results_response and results_response.results: for result in results_response.results: - print(f"Score: {result.score}, Passed: {result.passed}") + print(f" Score: {result.score}, Passed: {result.passed}") + else: + print(f" Evaluation {evaluation.id}: no results yet") except Exception: - print(f"Evaluation {evaluation.id}: results not available yet") + print(f" Evaluation {evaluation.id}: results not available yet") + + await client.judges.delete(judge.id) + if __name__ == "__main__": asyncio.run(main()) ``` +## See Also + +- [Models and Benchmarks](models-and-benchmarks.md) - Custom models, custom/smart benchmarks, project management +- [Public API](public-api.md) - Public models, benchmarks, evaluations, and comparisons + ## Error Handling ```python @@ -249,7 +323,6 @@ import layerlens client = Stratix() try: - # Fetch a model for the judge models = client.models.get(type="public", name="gpt-4o") model = models[0] diff --git a/docs/examples/models-and-benchmarks.md b/docs/examples/models-and-benchmarks.md new file mode 100644 index 0000000..fc21c68 --- /dev/null +++ b/docs/examples/models-and-benchmarks.md @@ -0,0 +1,256 @@ +# Models and Benchmarks + +Examples for browsing, filtering, creating, and managing models and benchmarks using the LayerLens Python SDK. 
+ +## Filtering Models + +> Source: [`examples/get_models.py`](../../examples/get_models.py) + +```python +import asyncio + +from layerlens import AsyncStratix + + +async def main(): + client = AsyncStratix() + + # --- Filter by name + model_name = "gpt-4o" + models = await client.models.get(name=model_name) + print(f"Found {len(models)} models with name {model_name}") + + # --- Filter by company + company_names = ["openai", "anthropic"] + models = await client.models.get(companies=company_names) + print(f"Found {len(models)} models with companies {company_names}") + + # --- Filter by region + region_names = ["usa"] + models = await client.models.get(regions=region_names) + print(f"Found {len(models)} models with regions {region_names}") + + # --- Filter by type + model_type = "public" + models = await client.models.get(type=model_type) + print(f"Found {len(models)} models with type {model_type}") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Filtering Benchmarks + +> Source: [`examples/get_benchmarks.py`](../../examples/get_benchmarks.py) + +```python +import asyncio + +from layerlens import AsyncStratix + + +async def main(): + client = AsyncStratix() + + # --- Filter by name + benchmark_name = "mmlu" + benchmarks = await client.benchmarks.get(name=benchmark_name) + print(f"Found {len(benchmarks)} benchmarks with name {benchmark_name}") + + # --- Filter by type + benchmark_type = "public" + benchmarks = await client.benchmarks.get(type=benchmark_type) + print(f"Found {len(benchmarks)} benchmarks with type {benchmark_type}") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Creating a Custom Model + +> Source: [`examples/create_custom_model.py`](../../examples/create_custom_model.py) + +Custom models let you evaluate any model accessible via an OpenAI-compatible chat completions endpoint. 
+ +```python +import os + +from layerlens import Stratix + + +def main(): + client = Stratix() + + result = client.models.create_custom( + name="My Custom Model", + key="my-org/custom-model-v1", + description="Custom fine-tuned model served via vLLM", + api_url="https://my-model-endpoint.example.com/v1", + api_key=os.environ["MY_PROVIDER_API_KEY"], + max_tokens=4096, + ) + + if result: + print(f"Custom model created: {result.model_id}") + else: + print("Failed to create custom model") + + # Verify the model was added + models = client.models.get(type="custom") + if models: + print(f"\nCustom models in project ({len(models)}):") + for m in models: + print(f" - {m.name} (id={m.id}, key={m.key})") + + +if __name__ == "__main__": + main() +``` + +## Creating a Custom Benchmark + +> Source: [`examples/create_custom_benchmark.py`](../../examples/create_custom_benchmark.py) + +Custom benchmarks are created from JSONL files with `input` and `truth` fields. + +```python +from layerlens import Stratix + + +def main(): + client = Stratix() + + # Basic custom benchmark + result = client.benchmarks.create_custom( + name="My Custom Benchmark", + description="A simple test benchmark for QA evaluation", + file_path="path/to/benchmark.jsonl", + ) + + if result: + print(f"Custom benchmark created: {result.benchmark_id}") + + # With additional metrics and input type + result = client.benchmarks.create_custom( + name="Advanced Benchmark", + description="Benchmark with toxicity and readability scoring", + file_path="path/to/benchmark.jsonl", + additional_metrics=["toxicity", "readability"], + input_type="messages", + ) + + if result: + print(f"Advanced benchmark created: {result.benchmark_id}") + + # Verify + benchmarks = client.benchmarks.get(type="custom") + if benchmarks: + print(f"\nCustom benchmarks in project ({len(benchmarks)}):") + for b in benchmarks: + print(f" - {b.name} (id={b.id})") + + +if __name__ == "__main__": + main() +``` + +### JSONL File Format + +Each line should 
be a JSON object: + +```json +{"input": "What is 2+2?", "truth": "4"} +{"input": "Capital of France?", "truth": "Paris"} +``` + +Optional field: `subset` (for grouping prompts into categories). + +## Creating a Smart Benchmark + +> Source: [`examples/create_smart_benchmark.py`](../../examples/create_smart_benchmark.py) + +Smart benchmarks use AI to automatically generate benchmark prompts from uploaded documents. Supported file types: `.txt`, `.pdf`, `.html`, `.docx`, `.csv`, `.json`, `.jsonl`, `.parquet`. + +```python +from layerlens import Stratix + + +def main(): + client = Stratix() + + result = client.benchmarks.create_smart( + name="Product Knowledge Benchmark", + description="Evaluates model knowledge of our product documentation", + system_prompt=( + "Generate question-answer pairs that test understanding of the " + "product features, capabilities, and limitations described in " + "the provided documents. Each question should have a clear, " + "factual answer derived from the source material." + ), + file_paths=[ + "path/to/product_docs.pdf", + "path/to/faq.txt", + ], + metrics=["hallucination"], + ) + + if result: + print(f"Smart benchmark created: {result.benchmark_id}") + print("The benchmark is being generated asynchronously.") + print("Check the dashboard for progress.") + else: + print("Failed to create smart benchmark") + + +if __name__ == "__main__": + main() +``` + +## Managing Project Models and Benchmarks + +> Source: [`examples/manage_project_models_benchmarks.py`](../../examples/manage_project_models_benchmarks.py) + +Add and remove public models and benchmarks from your project. 
+ +```python +from layerlens import Stratix + + +def main(): + client = Stratix() + + # --- Add public models to the project + success = client.models.add("model-id-1", "model-id-2") + print(f"Add models: {'success' if success else 'failed'}") + + # --- Remove a model from the project + success = client.models.remove("model-id-1") + print(f"Remove model: {'success' if success else 'failed'}") + + # --- Add public benchmarks to the project + success = client.benchmarks.add("benchmark-id-1") + print(f"Add benchmark: {'success' if success else 'failed'}") + + # --- Remove a benchmark from the project + success = client.benchmarks.remove("benchmark-id-1") + print(f"Remove benchmark: {'success' if success else 'failed'}") + + # --- List current models and benchmarks + models = client.models.get() + if models: + print(f"\nModels in project ({len(models)}):") + for m in models: + print(f" - {m.name} (id={m.id})") + + benchmarks = client.benchmarks.get() + if benchmarks: + print(f"\nBenchmarks in project ({len(benchmarks)}):") + for b in benchmarks: + print(f" - {b.name} (id={b.id})") + + +if __name__ == "__main__": + main() +``` diff --git a/docs/examples/public-api.md b/docs/examples/public-api.md new file mode 100644 index 0000000..5ace255 --- /dev/null +++ b/docs/examples/public-api.md @@ -0,0 +1,279 @@ +# Public API + +Examples for browsing public models, benchmarks, evaluations, and comparing results using the LayerLens Python SDK. + +The public API is accessed through `client.public` on a `Stratix` client, or by instantiating `PublicClient` directly. An API key is still required. 
+ +```python +from layerlens import Stratix, PublicClient + +# Via the main client +client = Stratix() +public = client.public + +# Or directly +public = PublicClient() +``` + +## Public Models + +> Source: [`examples/public_models.py`](../../examples/public_models.py) + +```python +from layerlens import PublicClient + + +def main(): + client = PublicClient() + + # --- Browse all public models (first page) + response = client.models.get(page=1, page_size=10) + print(f"Found {response.total_count} public models (showing first {len(response.models)})") + for model in response.models: + print(f" - {model.name} ({model.company})") + + # --- Search models by query + response = client.models.get(query="gpt") + print(f"\nFound {response.total_count} models matching 'gpt'") + for model in response.models: + print(f" - {model.name}") + + # --- Filter by company + companies = ["OpenAI", "Anthropic"] + response = client.models.get(companies=companies) + print(f"\nFound {response.total_count} models from {companies}") + for model in response.models: + print(f" - {model.name} ({model.company})") + + # --- Filter by region + response = client.models.get(regions=["usa"]) + print(f"\nFound {response.total_count} models in region 'usa'") + + # --- Filter by category + response = client.models.get(categories=["open-source"]) + print(f"\nFound {response.total_count} open-source models") + + # --- Sort by release date (newest first) + response = client.models.get(sort_by="releasedAt", order="desc", page_size=5) + print(f"\nNewest 5 models:") + for model in response.models: + print(f" - {model.name} (released_at={model.released_at})") + + # --- Include deprecated models + response = client.models.get(include_deprecated=True) + print(f"\nTotal models (including deprecated): {response.total_count}") + + # --- Discover available filter values + response = client.models.get(page=1, page_size=1) + print(f"\nAvailable filter values:") + print(f" Categories: {response.categories}") + print(f" 
Companies: {response.companies}") + print(f" Regions: {response.regions}") + print(f" Licenses: {response.licenses}") + print(f" Sizes: {response.sizes}") + + +if __name__ == "__main__": + main() +``` + +## Public Benchmarks + +> Source: [`examples/public_benchmarks.py`](../../examples/public_benchmarks.py) + +```python +from layerlens import PublicClient + + +def main(): + client = PublicClient() + + # --- Browse all public benchmarks + response = client.benchmarks.get(page=1, page_size=10) + print(f"Found {response.total_count} public benchmarks (showing first {len(response.datasets)})") + for benchmark in response.datasets: + print(f" - {benchmark.name} (prompts={benchmark.prompt_count}, language={benchmark.language})") + + # --- Filter by language + response = client.benchmarks.get(languages=["English"]) + print(f"\nFound {response.total_count} English benchmarks") + + # --- Discover available filter values + if response.categories: + print(f"\nAvailable categories: {response.categories}") + if response.languages: + print(f"Available languages: {response.languages}") + + # --- Search by name + response = client.benchmarks.get(query="mmlu") + print(f"\nFound {response.total_count} benchmarks matching 'mmlu'") + for benchmark in response.datasets: + print(f" - {benchmark.name}: {benchmark.description[:80] if benchmark.description else 'N/A'}...") + + # --- Get benchmark prompts (paginated) + if response.datasets: + benchmark = response.datasets[0] + print(f"\nFetching prompts for '{benchmark.name}' (id={benchmark.id})...") + + prompts_response = client.benchmarks.get_prompts( + benchmark.id, + page=1, + page_size=5, + ) + + if prompts_response: + print(f"Total prompts: {prompts_response.data.count}") + print(f"Showing first {len(prompts_response.data.prompts)} prompts:") + for prompt in prompts_response.data.prompts: + input_preview = str(prompt.input)[:80] + truth_preview = prompt.truth[:50] if prompt.truth else "N/A" + print(f" - Input: {input_preview}...") + 
print(f" Truth: {truth_preview}") + + # --- Get all prompts (auto-paginates) + if response.datasets: + benchmark = response.datasets[0] + print(f"\nFetching ALL prompts for '{benchmark.name}'...") + all_prompts = client.benchmarks.get_all_prompts(benchmark.id) + print(f"Retrieved {len(all_prompts)} total prompts") + + +if __name__ == "__main__": + main() +``` + +## Public Evaluations + +> Source: [`examples/public_evaluations.py`](../../examples/public_evaluations.py) + +```python +from layerlens import PublicClient +from layerlens.models import EvaluationStatus + + +def main(): + client = PublicClient() + + # --- Get a specific evaluation by ID + evaluation_id = "699f1426c1212b2d9c78e947" + evaluation = client.evaluations.get_by_id(evaluation_id) + if evaluation: + print(f"Evaluation: {evaluation.id}") + print(f" Model: {evaluation.model_name} ({evaluation.model_company})") + print(f" Benchmark: {evaluation.benchmark_name}") + print(f" Status: {evaluation.status.value}") + print(f" Accuracy: {evaluation.accuracy:.2f}%") + + if evaluation.summary: + print(f" Summary: {evaluation.summary.name}") + print(f" Goal: {evaluation.summary.goal}") + if evaluation.summary.metrics: + print(f" Metrics: {', '.join(m.name for m in evaluation.summary.metrics)}") + if evaluation.summary.performance_details: + print(f" Strengths: {evaluation.summary.performance_details.strengths}") + if evaluation.summary.analysis_summary: + print(f" Key takeaways: {evaluation.summary.analysis_summary.key_takeaways}") + else: + print(f"Evaluation {evaluation_id} not found") + + # --- List latest evaluations + response = client.evaluations.get_many( + page=1, + page_size=5, + sort_by="submittedAt", + order="desc", + ) + if response: + print(f"\nLatest evaluations ({response.pagination.total_count} total):") + for e in response.evaluations: + print(f" - {e.id}: {e.model_name} on {e.benchmark_name} -> {e.accuracy:.2f}% ({e.status.value})") + + # --- Filter by status (only successful) + response = 
client.evaluations.get_many( + status=EvaluationStatus.SUCCESS, + sort_by="accuracy", + order="desc", + page_size=5, + ) + if response: + print(f"\nTop successful evaluations ({response.pagination.total_count} total):") + for e in response.evaluations: + print(f" - {e.model_name}: {e.accuracy:.2f}%") + + +if __name__ == "__main__": + main() +``` + +## Comparing Evaluations + +> Source: [`examples/compare_evaluations.py`](../../examples/compare_evaluations.py) + +Compare how two models perform on the same benchmark, prompt by prompt. + +```python +from layerlens import PublicClient + + +def main(): + client = PublicClient() + + # --- Compare two models on a benchmark + # The SDK automatically finds the most recent successful evaluation for each model. + benchmark_id = "682bddc1e014f9fa440f8a91" # AIME 2025 + model_id_1 = "699f9761e014f9c3072b0513" # Qwen3.5 27B + model_id_2 = "699f9761e014f9c3072b0512" # Qwen3.5 122B A10B + + print(f"Comparing models on benchmark {benchmark_id}...") + comparison = client.comparisons.compare_models( + benchmark_id=benchmark_id, + model_id_1=model_id_1, + model_id_2=model_id_2, + page=1, + page_size=10, + ) + + if comparison: + print(f"\n=== Comparison Summary ===") + print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") + print(f"Total compared: {comparison.total_count}") + + if comparison.results: + print(f"\nFirst {len(comparison.results)} results:") + for result in comparison.results: + s1 = "Y" if result.score1 and result.score1 > 0.5 else "N" + s2 = "Y" if result.score2 and result.score2 > 0.5 else "N" + print(f" Prompt: {result.prompt[:80]}...") + print(f" Model 1: {s1} (score={result.score1})") + print(f" Model 2: {s2} (score={result.score2})") + + # --- Filter: where model 1 fails but model 2 succeeds + comparison = client.comparisons.compare_models( + benchmark_id=benchmark_id, + model_id_1=model_id_1, + 
model_id_2=model_id_2, + outcome_filter="reference_fails", + ) + + if comparison: + print(f"\n=== Where Model 1 Fails but Model 2 Succeeds ===") + print(f"Found {comparison.total_count} such cases") + + # --- Compare using evaluation IDs directly + comparison = client.comparisons.compare( + evaluation_id_1="699f9938a03d70bf6607081f", + evaluation_id_2="699f991ca782d00ebd666ba1", + page=1, + page_size=5, + ) + + if comparison: + print(f"\n=== Direct Comparison by Evaluation IDs ===") + print(f"Model 1: {comparison.correct_count_1}/{comparison.total_results_1} correct") + print(f"Model 2: {comparison.correct_count_2}/{comparison.total_results_2} correct") + + +if __name__ == "__main__": + main() +``` diff --git a/docs/examples/retrieving-results.md b/docs/examples/retrieving-results.md new file mode 100644 index 0000000..d54da79 --- /dev/null +++ b/docs/examples/retrieving-results.md @@ -0,0 +1,194 @@ +# Retrieving Results + +Examples for fetching evaluation results using the LayerLens Python SDK, including pagination, bulk fetching, and concurrent retrieval. + +## Paginated Results + +> Source: [`examples/paginated_results.py`](../../examples/paginated_results.py) + +Walk through results page by page with full control over page size. 
+ +```python +import asyncio + +from layerlens import AsyncStratix + + +async def main(): + client = AsyncStratix() + + models = await client.models.get() + benchmarks = await client.benchmarks.get() + + evaluation = await client.evaluations.create(model=models[0], benchmark=benchmarks[0]) + evaluation = await client.evaluations.wait_for_completion( + evaluation, interval_seconds=10, timeout_seconds=600 + ) + + if evaluation.is_success: + print("Fetching all results with pagination...") + + all_results = [] + page = 1 + page_size = 50 + + while True: + print(f"Fetching page {page} (page size: {page_size})...") + + results_data = await client.results.get_by_id( + evaluation_id=evaluation.id, page=page, page_size=page_size + ) + + if not results_data or not results_data.results: + print("No more results to fetch") + break + + all_results.extend(results_data.results) + + if page == 1: + total_count = results_data.pagination.total_count + total_pages = results_data.pagination.total_pages + print(f"Total results: {total_count:,}") + print(f"Total pages: {total_pages}") + + print(f"Page {page}: Retrieved {len(results_data.results)} results") + print(f"Running total: {len(all_results):,} results") + + if page >= results_data.pagination.total_pages: + print("Reached last page") + break + + page += 1 + + print(f"\nTotal results collected: {len(all_results):,}") + + if all_results: + correct_answers = sum(1 for r in all_results if r.score > 0.5) + accuracy = correct_answers / len(all_results) + avg_score = sum(r.score for r in all_results) / len(all_results) + + print(f"Overall accuracy: {accuracy:.1%} ({correct_answers:,}/{len(all_results):,})") + print(f"Average score: {avg_score:.3f}") + + print(f"\nFirst 3 results:") + for i, result in enumerate(all_results[:3], 1): + print(f" {i}. 
Score: {result.score:.3f}, Subset: {result.subset}") + print(f" Prompt: {result.prompt[:100]}...") + print(f" Response: {result.result[:100]}...") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## All Results Without Pagination + +> Source: [`examples/all_results_no_pagination.py`](../../examples/all_results_no_pagination.py) + +Use `get_all()` to fetch every result in a single call. This is simpler, but it loads every result into memory at once. + +```python +import asyncio + +from layerlens import AsyncStratix + + +async def main(): + client = AsyncStratix() + + models = await client.models.get() + benchmarks = await client.benchmarks.get() + + evaluation = await client.evaluations.create( + model=models[0], + benchmark=benchmarks[0], + ) + + evaluation = await client.evaluations.wait_for_completion( + evaluation, + interval_seconds=10, + timeout_seconds=600, + ) + + # Fetch all results at once + results = await client.results.get_all(evaluation=evaluation) + print(f"Found {len(results)} results") + print(results) + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Fetch Results for Multiple Evaluations Concurrently + +> Source: [`examples/fetch_results_async.py`](../../examples/fetch_results_async.py) + +Use `asyncio.gather` to load results for several evaluations concurrently.
+ +```python +import asyncio + +from layerlens import AsyncStratix + + +async def fetch_evaluation_results(client, evaluation_id): + """Fetch results for a single evaluation and print when loaded.""" + try: + print(f"Fetching evaluation {evaluation_id}...") + evaluation = await client.evaluations.get_by_id(evaluation_id) + print(f"Found evaluation {evaluation.id}, status={evaluation.status}") + + results = await client.results.get_all(evaluation=evaluation) + print(f"Loaded {len(results)} results for evaluation {evaluation_id}") + print(f"Results for {evaluation_id}: {results}") + print("-" * 80) + + return evaluation_id, results + except Exception as e: + print(f"Error fetching evaluation {evaluation_id}: {e}") + return evaluation_id, None + + +async def main(): + client = AsyncStratix() + + # Replace with your own evaluation IDs + evaluation_ids = ["68a65a3de7ad047fb5d8e7d4", "688a254c673f6b2835cc7278"] + + print(f"Starting async fetch for {len(evaluation_ids)} evaluations...") + print("=" * 80) + + tasks = [fetch_evaluation_results(client, eval_id) for eval_id in evaluation_ids] + results = await asyncio.gather(*tasks, return_exceptions=True) + + print("=" * 80) + print("Summary:") + successful = sum(1 for _, result in results if result is not None and not isinstance(result, Exception)) + print(f"Successfully fetched results for {successful}/{len(evaluation_ids)} evaluations") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Using the Evaluation Object Helpers + +Results can also be fetched directly from an `Evaluation` object when a client is attached: + +```python +from layerlens import Stratix + +client = Stratix() + +# Get results via the client (`evaluation` is an Evaluation object you already hold) +results_response = client.results.get(evaluation=evaluation, page=1, page_size=50) + +# Or via the evaluation object (client must be attached) +results_response = evaluation.get_results(page=1, page_size=50) +all_results = evaluation.get_all_results() + +# Async equivalents (must be awaited inside an `async def`, not at module level) +results_response = await
evaluation.get_results_async(page=1, page_size=50) +all_results = await evaluation.get_all_results_async() +``` diff --git a/examples/all_results_no_pagination.py b/examples/all_results_no_pagination.py index 173828a..ea0390b 100644 --- a/examples/all_results_no_pagination.py +++ b/examples/all_results_no_pagination.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/async_client.py b/examples/async_client.py index c64c2de..596878f 100644 --- a/examples/async_client.py +++ b/examples/async_client.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/async_client_simple.py b/examples/async_client_simple.py index 1c4623e..8ebfafc 100644 --- a/examples/async_client_simple.py +++ b/examples/async_client_simple.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/async_judges_and_traces.py b/examples/async_judges_and_traces.py index de78e0e..90ca657 100644 --- a/examples/async_judges_and_traces.py +++ b/examples/async_judges_and_traces.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import os import time diff --git a/examples/async_run_evaluations.py b/examples/async_run_evaluations.py index 3764e17..8469f76 100644 --- a/examples/async_run_evaluations.py +++ b/examples/async_run_evaluations.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/client.py b/examples/client.py index 8047cea..eead9b1 100644 --- a/examples/client.py +++ b/examples/client.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import Stratix diff --git a/examples/client_simple.py b/examples/client_simple.py index 7f1c7c6..52dad3b 100644 --- a/examples/client_simple.py +++ b/examples/client_simple.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env 
python3 from layerlens import Stratix diff --git a/examples/compare_evaluations.py b/examples/compare_evaluations.py index 2293e8d..1f71704 100644 --- a/examples/compare_evaluations.py +++ b/examples/compare_evaluations.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import PublicClient diff --git a/examples/create_custom_benchmark.py b/examples/create_custom_benchmark.py index 4b263de..d7f1aba 100644 --- a/examples/create_custom_benchmark.py +++ b/examples/create_custom_benchmark.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import Stratix diff --git a/examples/create_custom_model.py b/examples/create_custom_model.py index 8d375f6..6325922 100644 --- a/examples/create_custom_model.py +++ b/examples/create_custom_model.py @@ -1,4 +1,6 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 + +import os from layerlens import Stratix @@ -20,7 +22,7 @@ def main(): key="my-org/custom-model-v1", description="Custom fine-tuned model served via vLLM", api_url="https://my-model-endpoint.example.com/v1", - api_key="my-provider-api-key", + api_key=os.environ["MY_PROVIDER_API_KEY"], max_tokens=4096, ) diff --git a/examples/create_smart_benchmark.py b/examples/create_smart_benchmark.py index 9c628d3..af16a7f 100644 --- a/examples/create_smart_benchmark.py +++ b/examples/create_smart_benchmark.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import Stratix diff --git a/examples/evaluation_sorting.py b/examples/evaluation_sorting.py index cb1906f..ff48e44 100644 --- a/examples/evaluation_sorting.py +++ b/examples/evaluation_sorting.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/fetch_results_async.py b/examples/fetch_results_async.py index d9ef929..91c9350 100644 --- a/examples/fetch_results_async.py +++ b/examples/fetch_results_async.py @@ -1,4 +1,4 @@ 
-#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/get_benchmarks.py b/examples/get_benchmarks.py index 169ffa9..6699e8e 100644 --- a/examples/get_benchmarks.py +++ b/examples/get_benchmarks.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/get_evaluation.py b/examples/get_evaluation.py index a6d8fe6..2f99379 100644 --- a/examples/get_evaluation.py +++ b/examples/get_evaluation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/get_models.py b/examples/get_models.py index c8d14de..c1fb9bb 100644 --- a/examples/get_models.py +++ b/examples/get_models.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/judge_optimizations.py b/examples/judge_optimizations.py index 9681ec9..2cb59ff 100644 --- a/examples/judge_optimizations.py +++ b/examples/judge_optimizations.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 """ Judge Optimizations example. 
diff --git a/examples/judges.py b/examples/judges.py index b6a3e3e..ab3e940 100644 --- a/examples/judges.py +++ b/examples/judges.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import time diff --git a/examples/manage_project_models_benchmarks.py b/examples/manage_project_models_benchmarks.py index 0067051..07bbab9 100644 --- a/examples/manage_project_models_benchmarks.py +++ b/examples/manage_project_models_benchmarks.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import Stratix diff --git a/examples/paginated_results.py b/examples/paginated_results.py index 2f135e7..3e69cf7 100644 --- a/examples/paginated_results.py +++ b/examples/paginated_results.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import asyncio diff --git a/examples/public_benchmarks.py b/examples/public_benchmarks.py index 491c51b..ecfd23e 100644 --- a/examples/public_benchmarks.py +++ b/examples/public_benchmarks.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import PublicClient diff --git a/examples/public_evaluations.py b/examples/public_evaluations.py index a8eb588..824063a 100644 --- a/examples/public_evaluations.py +++ b/examples/public_evaluations.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import PublicClient from layerlens.models import EvaluationStatus diff --git a/examples/public_models.py b/examples/public_models.py index be22040..122ba2d 100644 --- a/examples/public_models.py +++ b/examples/public_models.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 from layerlens import PublicClient diff --git a/examples/trace_evaluations.py b/examples/trace_evaluations.py index e956598..5100d72 100644 --- a/examples/trace_evaluations.py +++ b/examples/trace_evaluations.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import 
time diff --git a/examples/traces.py b/examples/traces.py index af99970..af03792 100644 --- a/examples/traces.py +++ b/examples/traces.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S poetry run python +#!/usr/bin/env python3 import os diff --git a/pyproject.toml b/pyproject.toml index f5efccc..783372f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ dynamic = ["version"] description = "The official Python library for the LayerLens Stratix API" license = "Apache-2.0" authors = [{ name = "LayerLens", email = "support@layerlens.ai" }] -dependencies = ["httpx>=0.23.0, <1", "pydantic>=1.9.0, <3", "requests"] +dependencies = ["httpx>=0.23.0, <1", "pydantic>=1.9.0, <3"] requires-python = ">= 3.8" classifiers = [ "Typing :: Typed", @@ -47,7 +47,6 @@ dev-dependencies = [ "pyright==1.1.399", "pytest-cov>=6.2.1", "ruff", - "types-requests", "build", "twine==6.1.0", ] @@ -133,6 +132,7 @@ known-first-party = ["openai", "tests"] "scripts/**.py" = ["T201", "T203"] "tests/**.py" = ["T201", "T203"] "examples/**.py" = ["T201", "T203"] +"src/layerlens/cli.py" = ["T201", "T203"] [tool.pyright] include = ["src", "tests"] diff --git a/src/layerlens/_base_client.py b/src/layerlens/_base_client.py index 6fbf3d1..8cd7650 100644 --- a/src/layerlens/_base_client.py +++ b/src/layerlens/_base_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import time import logging from typing import Any, Dict, Type, Union, TypeVar, Optional @@ -12,6 +13,11 @@ ResponseT = TypeVar("ResponseT") +MAX_RETRIES = 2 +RETRY_STATUS_CODES = {429, 500, 502, 503, 504} +INITIAL_RETRY_DELAY = 0.5 +MAX_RETRY_DELAY = 8.0 + log: logging.Logger = logging.getLogger(__name__) log.addFilter(SensitiveHeadersFilter()) @@ -52,27 +58,45 @@ def _request_cast( **kwargs: Any, ) -> Union[ResponseT, httpx.Response]: combined_headers = {**self.default_headers, **(headers or {})} - - response = super().request( - method=method, - url=url, - json=body, - params=params, - headers=combined_headers, - **kwargs, 
- ) - - try: - response.raise_for_status() - except httpx.HTTPStatusError as err: - log.debug("Encountered httpx.HTTPStatusError", exc_info=True) - log.debug("Re-raising status error") - raise self._make_status_error_from_response(err.response) from None - - if cast_to: - data = response.json() - return cast_to(**data) - return response + retries_left = MAX_RETRIES + delay = INITIAL_RETRY_DELAY + + while True: + response = super().request( + method=method, + url=url, + json=body, + params=params, + headers=combined_headers, + **kwargs, + ) + + if response.status_code in RETRY_STATUS_CODES and retries_left > 0: + retry_after = response.headers.get("retry-after") + sleep_time = float(retry_after) if retry_after and retry_after.replace(".", "", 1).isdecimal() else delay + sleep_time = min(sleep_time, MAX_RETRY_DELAY) + log.debug( + "Retrying request after %.1fs (status %d, %d retries left)", + sleep_time, + response.status_code, + retries_left, + ) + time.sleep(sleep_time) + delay = min(delay * 2, MAX_RETRY_DELAY) + retries_left -= 1 + continue + + try: + response.raise_for_status() + except httpx.HTTPStatusError as err: + log.debug("Encountered httpx.HTTPStatusError", exc_info=True) + log.debug("Re-raising status error") + raise self._make_status_error_from_response(err.response) from None + + if cast_to: + data = response.json() + return cast_to(**data) + return response def get_cast( self, @@ -177,28 +201,48 @@ async def _request_cast( headers: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> Union[ResponseT, httpx.Response]: - combined_headers = {**self.default_headers, **(headers or {})} - - response = await super().request( - method=method, - url=url, - json=body, - params=params, - headers=combined_headers, - **kwargs, - ) + import asyncio - try: - response.raise_for_status() - except httpx.HTTPStatusError as err: - log.debug("Encountered httpx.HTTPStatusError", exc_info=True) - log.debug("Re-raising status error") - raise self._make_status_error_from_response(err.response) from None - - if cast_to: - data =
data = response.json() - return cast_to(**data) - return response + combined_headers = {**self.default_headers, **(headers or {})} + retries_left = MAX_RETRIES + delay = INITIAL_RETRY_DELAY + + while True: + response = await super().request( + method=method, + url=url, + json=body, + params=params, + headers=combined_headers, + **kwargs, + ) + + if response.status_code in RETRY_STATUS_CODES and retries_left > 0: + retry_after = response.headers.get("retry-after") + sleep_time = float(retry_after) if retry_after else delay + sleep_time = min(sleep_time, MAX_RETRY_DELAY) + log.debug( + "Retrying request after %.1fs (status %d, %d retries left)", + sleep_time, + response.status_code, + retries_left, + ) + await asyncio.sleep(sleep_time) + delay = min(delay * 2, MAX_RETRY_DELAY) + retries_left -= 1 + continue + + try: + response.raise_for_status() + except httpx.HTTPStatusError as err: + log.debug("Encountered httpx.HTTPStatusError", exc_info=True) + log.debug("Re-raising status error") + raise self._make_status_error_from_response(err.response) from None + + if cast_to: + data = response.json() + return cast_to(**data) + return response async def get_cast( self, diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py index e146f29..e7688e0 100644 --- a/src/layerlens/_client.py +++ b/src/layerlens/_client.py @@ -7,12 +7,11 @@ from typing_extensions import Self, override import httpx -import requests from . 
import _exceptions from ._utils import is_mapping from .models import Organization, OrganizationResponse -from ._constants import DEFAULT_TIMEOUT +from ._constants import DEFAULT_TIMEOUT, DEFAULT_BASE_URL from ._exceptions import StratixError, APIStatusError from ._base_client import BaseClient, BaseAsyncClient @@ -59,7 +58,7 @@ def __init__( if base_url is None: base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL") if base_url is None: - base_url = "https://api.layerlens.ai/api/v1" + base_url = DEFAULT_BASE_URL super().__init__( base_url=base_url, @@ -231,7 +230,7 @@ def __init__( if base_url is None: base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL") if base_url is None: - base_url = "https://api.layerlens.ai/api/v1" + base_url = DEFAULT_BASE_URL super().__init__(base_url=base_url, timeout=timeout) @@ -354,8 +353,9 @@ def _make_status_error( def _get_organization(self) -> Optional[Organization]: url = f"{self.base_url}organizations" - response = requests.get(url, headers=self.default_headers, timeout=30) - response.raise_for_status() + with httpx.Client(timeout=30) as http: + response = http.get(url, headers=self.default_headers) + response.raise_for_status() data = response.json() diff --git a/src/layerlens/_constants.py b/src/layerlens/_constants.py index 625df7c..2440945 100644 --- a/src/layerlens/_constants.py +++ b/src/layerlens/_constants.py @@ -2,3 +2,5 @@ # default timeout is 10 minutes DEFAULT_TIMEOUT = httpx.Timeout(timeout=600, connect=5.0) + +DEFAULT_BASE_URL = "https://api.layerlens.ai/api/v1" diff --git a/src/layerlens/_public_client.py b/src/layerlens/_public_client.py index fb04979..057793e 100644 --- a/src/layerlens/_public_client.py +++ b/src/layerlens/_public_client.py @@ -10,7 +10,7 @@ from . 
import _exceptions from ._utils import is_mapping -from ._constants import DEFAULT_TIMEOUT +from ._constants import DEFAULT_TIMEOUT, DEFAULT_BASE_URL from ._exceptions import APIStatusError from ._base_client import BaseClient, BaseAsyncClient @@ -75,7 +75,7 @@ def __init__( if base_url is None: base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL") if base_url is None: - base_url = "https://api.layerlens.ai/api/v1" + base_url = DEFAULT_BASE_URL super().__init__(base_url=base_url, timeout=timeout) @@ -159,7 +159,7 @@ def __init__( if base_url is None: base_url = os.environ.get("LAYERLENS_STRATIX_BASE_URL") or os.environ.get("LAYERLENS_ATLAS_BASE_URL") if base_url is None: - base_url = "https://api.layerlens.ai/api/v1" + base_url = DEFAULT_BASE_URL super().__init__(base_url=base_url, timeout=timeout) diff --git a/src/layerlens/_version.py b/src/layerlens/_version.py index df29efa..41607c3 100644 --- a/src/layerlens/_version.py +++ b/src/layerlens/_version.py @@ -1,4 +1,4 @@ -__version__ = "1.3.0" +__version__ = "1.3.1" # Will be templated during the build __git_commit__ = "__GIT_COMMIT__" diff --git a/src/layerlens/cli.py b/src/layerlens/cli.py new file mode 100644 index 0000000..5c899c7 --- /dev/null +++ b/src/layerlens/cli.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +import sys + +from ._version import __version__ + + +def main() -> None: + if len(sys.argv) > 1 and sys.argv[1] in ("--version", "-v"): + print(f"layerlens {__version__}") + sys.exit(0) + + print(f"layerlens {__version__}") + print("See https://layerlens.gitbook.io/stratix-python-sdk for documentation.") + print("\nUsage:") + print(" layerlens --version Show version")