LayerLens · m-peko · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/.github/workflows/publish-sdk.yaml b/.github/workflows/publish-sdk.yaml
@@ -0,0 +1,63 @@
+# This workflow is used to publish the Python SDK to the actual PyPI.
+# It is triggered by a tag push, and will only publish if the tag is valid.
+# The tag must match the format sdk-v*.*.*
+
+name: Publish Python SDK
+
+on:
+  push:
+    tags: # Glob filter only; the validate job re-checks the tag with scripts/validate-release-tag.sh.
+      - "sdk-v*.*.*" # Trigger on version tags like sdk-v0.1.0 etc.
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    environment: production
+    outputs:
+      release_tag: ${{ steps.set_release_tag.outputs.release_tag }} # Consumed by build-and-publish.
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Fetch all history for checking branch
+      - name: Set release tag
+        id: set_release_tag
+        # Ensure the tag is valid (matches code, is on main, etc) before exporting it.
+        run: |
+          RELEASE_TAG="${GITHUB_REF#refs/tags/}"
+          echo "Using tag: $RELEASE_TAG"
+          ./scripts/validate-release-tag.sh "$RELEASE_TAG"
+          echo "RELEASE_TAG=$RELEASE_TAG" >> "$GITHUB_ENV"
+          echo "release_tag=$RELEASE_TAG" >> "$GITHUB_OUTPUT"
+
+  build-and-publish:
+    needs: validate
+    runs-on: ubuntu-latest
+    environment: production
+
+    env:
+      RELEASE_TAG: ${{ needs.validate.outputs.release_tag }} # Tag exported by the validate job.
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - name: Install build dependencies
+        run: make install-build-deps
+      - name: Build
+        run: make build
+      - name: Test wheel
+        run: make test-wheel
+      - name: Upload build artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: sdk-dist
+          path: dist/
+          retention-days: 5
+      - name: Publish to PyPI
+        run: make _publish
+        env:
+          PYPI_REPO: pypi
+          TWINE_USERNAME: __token__ # Upload credentials live on this step only, so that
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} # install/build/test steps never see the token.
diff --git a/.github/workflows/release-tag.yaml b/.github/workflows/release-tag.yaml
@@ -0,0 +1,78 @@
+# This workflow creates and pushes a release tag using the push-release-tag.sh script.
+# It is triggered manually from the 'release' branch; unless dry_run is set, you must type 'YES' in confirm_release.
+
+name: Create Release Tag
+
+on:
+  workflow_dispatch: # Manual trigger; both jobs below also require the run to start from the 'release' branch.
+    inputs:
+      dry_run:
+        description: "Run in dry-run mode (show what would be done without actually creating/pushing the tag)"
+        required: false
+        type: boolean
+        default: true # Dry run unless the caller explicitly turns it off.
+      confirm_release:
+        description: "Type 'YES' to confirm you want to create and push the release tag"
+        required: true # Only compared against 'YES' when dry_run is off (see the "Validate confirmation" step).
+        type: string
+jobs:
+  check-branch:
+    runs-on: ubuntu-latest
+    environment: production
+    steps: # The ref is read from the GITHUB_REF variable rather than expression-interpolated into the script.
+      - name: Check if running on release branch
+        run: |
+          if [ "$GITHUB_REF" != "refs/heads/release" ]; then
+            echo "Error: This workflow can only be run from the 'release' branch."
+            echo "Current branch: $GITHUB_REF"
+            echo "Please switch to the 'release' branch and try again."
+            exit 1
+          fi
+          echo "Running on release branch - proceeding with workflow."
+
+  create-release-tag:
+    runs-on: ubuntu-latest
+    needs: check-branch
+    environment: production
+    if: github.ref == 'refs/heads/release'
+
+    permissions:
+      contents: write # Required to create and push tags
+
+    steps:
+      - name: Validate confirmation
+        if: github.event.inputs.confirm_release != 'YES' && github.event.inputs.dry_run != 'true'
+        env:
+          CONFIRM_RELEASE: ${{ github.event.inputs.confirm_release }} # Passed via env so the typed text is data, not script.
+        run: |
+          echo "Error: You must type 'YES' in the confirm_release input to proceed with creating a release tag."
+          echo "Received: '$CONFIRM_RELEASE'"
+          exit 1
+
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Fetch all history and tags
+
+      - name: Make scripts executable
+        run: chmod +x scripts/push-release-tag.sh scripts/get_version.sh
+
+      - name: Configure Git
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Run push-release-tag script (dry-run)
+        if: github.event.inputs.dry_run == 'true'
+        run: |
+          echo "Running in dry-run mode..."
+          make push-release-tag DRY_RUN=--dry-run
+
+      - name: Run push-release-tag script
+        if: github.event.inputs.dry_run != 'true'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "Creating and pushing release tag..."
+          # Override the interactive confirmation since we already confirmed via workflow input
+          echo "YES" | make push-release-tag
diff --git a/.github/workflows/test-publish-sdk.yaml b/.github/workflows/test-publish-sdk.yaml
@@ -0,0 +1,41 @@
+# This workflow publishes the Python SDK to TestPyPI. You do not need to bump the
+# version number to use this workflow.
+# Only bump the version number when you are ready to publish to PyPI.
+# The script will automatically add an "rc" suffix to the version number for test.pypi.org releases.
+
+name: Publish Python SDK to TestPyPI
+
+on:
+  workflow_dispatch:
+    inputs:
+      ref: # Passed to actions/checkout below as the ref to build and publish.
+        description: "Publish the given Git ref to test.pypi.org (branch, tag, or commit SHA)"
+        required: true
+        type: string
+        default: "main"
+
+jobs:
+  build-and-publish-test:
+    runs-on: ubuntu-latest
+    env:
+      PYPI_REPO: testpypi # Left job-wide as before; NOTE(review): confirm whether the build scripts read it.
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.ref }}
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - name: Install build dependencies
+        run: make install-build-deps
+      - name: Build
+        run: make build
+      - name: Test wheel
+        run: make test-wheel
+      - name: Publish to TestPyPI
+        run: make _publish
+        env:
+          TWINE_USERNAME: __token__ # Upload credentials live on this step only, so that
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} # install/build/test steps never see the token.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,69 @@
+# Build, test, and publish helpers for the Python SDK. Run `make help` for a summary.
+
+# These targets are commands, not files. Declaring them phony keeps a file or
+# directory with the same name (for example `clean` or `help`) from making
+# make treat the target as already up to date.
+.PHONY: install-build-deps build test-wheel clean help push-release-tag
+.PHONY: publish-to-testpypi publish-to-pypi
+.PHONY: _publish _template-version _check-git-clean _verify-build-publish
+
+# Install the packaging tools used by the build and publish steps.
+install-build-deps:
+	pip install build twine
+
+# Build the distribution from a clean tree after templating the version.
+# The version file is restored even when the build fails; the target fails
+# if either the build or the restore fails.
+build: clean _template-version
+	python -m build; status=$$?; \
+	git checkout src/layerlens/_version.py || status=$$?; \
+	exit $$status
+
+# Install the built wheel and check that the package imports.
+test-wheel:
+	pip install dist/*.whl
+	python -c "import layerlens; print('Package imported successfully')"
+
+# Remove build output directories.
+clean:
+	rm -rf build dist
+
+# Run scripts/publish.sh; the publish-to-* targets below export PYPI_REPO for it.
+_publish:
+	./scripts/publish.sh
+
+# Presumably rewrites src/layerlens/_version.py (build restores that file afterwards).
+_template-version:
+	@bash scripts/template-version.sh
+
+# Fail when `git status --porcelain` reports any change.
+_check-git-clean:
+	@if [ -n "$$(git status --porcelain)" ]; then \
+		echo "Error: Git working directory is not clean. Won't run publish."; \
+		exit 1; \
+	fi
+
+# Prerequisites are listed in the order they are meant to run.
+_verify-build-publish: _check-git-clean build test-wheel _publish
+
+# Each publish target exports PYPI_REPO to the recipes of its prerequisites.
+publish-to-testpypi: export PYPI_REPO := testpypi
+publish-to-testpypi: _verify-build-publish
+
+publish-to-pypi: export PYPI_REPO := pypi
+publish-to-pypi: _verify-build-publish
+
+# DRY_RUN is passed through as an extra argument (e.g. DRY_RUN=--dry-run).
+push-release-tag:
+	@bash scripts/push-release-tag.sh $(DRY_RUN)
+
+help:
+	@echo "Available targets:"
+	@echo "  build               - Build Python package"
+	@echo "  clean               - Remove build artifacts"
+	@echo "  help                - Show this help message"
+	@echo "  install-build-deps  - Install build dependencies for CI"
+	@echo "  test-wheel          - Run tests against built wheel"
+	@echo "  publish-to-pypi     - Publish to PyPI"
+	@echo "  publish-to-testpypi - Publish to TestPyPI"
+	@echo "  push-release-tag    - Create and push a release tag"
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
@@ -9,6 +9,7 @@
 
 ## API Reference
 * [Client Configuration](api-reference/client.md)
+* [Public Client](api-reference/public-client.md)
 * [Evaluations](api-reference/evaluations.md)
 * [Results](api-reference/results.md)
 * [Models & Benchmarks](api-reference/models-benchmarks.md)

diff --git a/docs/api-reference/client.md b/docs/api-reference/client.md
@@ -51,6 +51,25 @@ The client automatically loads configuration from these environment variables:
 LAYERLENS_STRATIX_API_KEY="your_api_key_here"
 ```
 
+## Public Client
+
+For accessing public endpoints (models, benchmarks, comparisons), use `PublicClient` or `AsyncPublicClient`. See the [Public Client](public-client.md) reference for full details.
+
+```python
+from layerlens import PublicClient
+
+# Loads API key from the "LAYERLENS_STRATIX_API_KEY" environment variable
+public = PublicClient()
+models = public.models.get(companies=["OpenAI"])
+```
+
+You can also access public endpoints from an authenticated client via the `.public` property:
+
+```python
+client = Stratix()
+public_models = client.public.models.get(query="claude")
+```
+
 ## Timeout Configuration
 
 ### Simple Timeout

diff --git a/docs/api-reference/evaluations.md b/docs/api-reference/evaluations.md
@@ -177,17 +177,22 @@ async def get_evaluation():
 asyncio.run(get_evaluation())
 ```
 
-### `get_many(page=None, page_size=None, timeout=None)`
+### `get_many(page=None, page_size=None, sort_by=None, order=None, model_ids=None, benchmark_ids=None, status=None, timeout=None)`
 
-Retrieves multiple evaluations with optional pagination support.
+Retrieves multiple evaluations with optional pagination, sorting, and filtering.
 
 #### Parameters
 
-| Parameter   | Type                             | Required | Description                                             |
-| ----------- | -------------------------------- | -------- | ------------------------------------------------------- |
-| `page`      | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)     |
-| `page_size` | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500) |
-| `timeout`   | `float \| httpx.Timeout \| None` | No       | Override request timeout                                |
+| Parameter       | Type                             | Required | Description                                             |
+| --------------- | -------------------------------- | -------- | ------------------------------------------------------- |
+| `page`          | `int \| None`                    | No       | Page number for pagination (1-based, defaults to 1)     |
+| `page_size`     | `int \| None`                    | No       | Number of evaluations per page (default: 100, max: 500) |
+| `sort_by`       | `str \| None`                    | No       | Sort by field: `submittedAt`, `accuracy`, or `averageDuration` |
+| `order`         | `str \| None`                    | No       | Sort order: `asc` or `desc`                             |
+| `model_ids`     | `List[str] \| None`              | No       | Filter by model IDs                                     |
+| `benchmark_ids` | `List[str] \| None`              | No       | Filter by benchmark/dataset IDs                         |
+| `status`        | `EvaluationStatus \| None`       | No       | Filter by evaluation status                             |
+| `timeout`       | `float \| httpx.Timeout \| None` | No       | Override request timeout                                |
 
 #### Returns
 
@@ -198,6 +203,27 @@ Returns an `EvaluationsResponse` object containing:
 
 Returns `None` if the request fails.
 
+#### Example
+
+```python
+from layerlens import Stratix
+from layerlens.models import EvaluationStatus
+
+client = Stratix()
+
+# Get top evaluations by accuracy
+response = client.evaluations.get_many(
+    sort_by="accuracy",
+    order="desc",
+    status=EvaluationStatus.SUCCESS,
+    page_size=10,
+)
+
+if response:
+    for evaluation in response.evaluations:
+        print(f"{evaluation.id}: accuracy={evaluation.accuracy:.2f}%")
+```
+
 ### `get_results(page=None, page_size=None, timeout=None)`
 
 Fetches results for this evaluation with pagination support. This is a synchronous method.
@@ -378,16 +404,43 @@ The `create`, `get_by_id` and `get_many` method returns an `Evaluation` objects
 
 ### Evaluation Object Properties
 
-| Property           | Type               | Description                                               |
-| ------------------ | ------------------ | --------------------------------------------------------- |
-| `id`               | `str`              | Unique evaluation identifier                              |
-| `status`           | `EvaluationStatus` | Current evaluation status (enum)                          |
-| `submitted_at`     | `int`              | Unix timestamp when evaluation was submitted              |
-| `finished_at`      | `int`              | Unix timestamp when evaluation finished                   |
-| `model_id`         | `str`              | ID of the model used in the evaluation                    |
-| `benchmark_id`     | `str`              | ID of the benchmark used (aliased as "dataset_id" in API) |
-| `average_duration` | `int`              | Average response time in milliseconds                     |
-| `accuracy`         | `float`            | Overall accuracy score (0.0 to 1.0)                       |
+| Property             | Type                          | Description                                               |
+| -------------------- | ----------------------------- | --------------------------------------------------------- |
+| `id`                 | `str`                         | Unique evaluation identifier                              |
+| `status`             | `EvaluationStatus`            | Current evaluation status (enum)                          |
+| `status_description` | `str`                         | Human-readable status description (default: `""`)         |
+| `submitted_at`       | `int`                         | Unix timestamp when evaluation was submitted              |
+| `finished_at`        | `int`                         | Unix timestamp when evaluation finished                   |
+| `model_id`           | `str`                         | ID of the model used in the evaluation                    |
+| `model_name`         | `str`                         | Name of the model (default: `""`)                         |
+| `model_key`          | `str`                         | Key identifier of the model (default: `""`)               |
+| `model_company`      | `str`                         | Company/provider of the model (default: `""`)             |
+| `benchmark_id`       | `str`                         | ID of the benchmark used (aliased as "dataset_id" in API) |
+| `benchmark_name`     | `str`                         | Name of the benchmark (aliased as "dataset_name" in API, default: `""`) |
+| `average_duration`   | `int`                         | Average response time in milliseconds                     |
+| `accuracy`           | `float`                       | Overall accuracy score (0.0 to 1.0)                       |
+| `readability_score`  | `float`                       | Readability score (default: `0.0`)                        |
+| `toxicity_score`     | `float`                       | Toxicity score (default: `0.0`)                           |
+| `ethics_score`       | `float`                       | Ethics score (default: `0.0`)                             |
+| `failed_prompt_count`| `int`                         | Number of failed prompts (default: `0`)                   |
+| `queue_id`           | `int`                         | Queue identifier (default: `0`)                           |
+| `summary`            | `EvaluationSummary \| None`   | Rich evaluation summary (see below, default: `None`)      |
+
+### EvaluationSummary Object
+
+The `summary` field contains a rich analysis of the evaluation when available.
+
+| Property              | Type                            | Description                              |
+| --------------------- | ------------------------------- | ---------------------------------------- |
+| `name`                | `str`                           | Summary title                            |
+| `goal`                | `str`                           | Goal of the evaluation                   |
+| `metrics`             | `List[EvaluationMetric]`        | Metrics used (each has `name`, `description`) |
+| `task_types`          | `List[EvaluationTaskType]`      | Task types (each has `name`, `description`)   |
+| `dataset`             | `EvaluationDataset \| None`     | Dataset info (`total_size`, `training_size`, `test_size`, `characteristics`) |
+| `model`               | `EvaluationModelInfo \| None`   | Model info (`model_name`, `performance`)  |
+| `performance_details` | `PerformanceDetails \| None`    | Strengths and challenges lists            |
+| `error_analysis`      | `ErrorAnalysis \| None`         | Common failure modes and example          |
+| `analysis_summary`    | `AnalysisSummary \| None`       | Key takeaways list                        |
 
 #### Evaluation Status