diff --git a/.env.example b/.env.example index 56d3a34..46f7da4 100644 --- a/.env.example +++ b/.env.example @@ -26,6 +26,24 @@ DEBUG=false API_HOST=0.0.0.0 API_PORT=8000 +# ===================================================================== +# Phase 29: Redis hardening (D-05) +# ===================================================================== +# Required password for redis-server --requirepass. Fresh dev clones can +# use the placeholder; production MUST set a strong unique value. +REDIS_PASSWORD=changeme +# Interface to bind redis :6379 on. Dev = loopback. Production = LAN IP +# (e.g., 192.168.1.10) so agents on other hosts can reach it. +REDIS_BIND_IP=127.0.0.1 + +# ===================================================================== +# Phase 29: HTTPS via internal CA (D-02) +# ===================================================================== +# Comma-separated SAN list for the auto-generated leaf cert. Defaults +# include `api` (docker compose service-name DNS) for single-host dev. +# Production should add the app-server's LAN hostname / IP. +PHAZE_API_TLS_SANS=localhost,127.0.0.1,api + # File discovery - mounted music directory for scanning SCAN_PATH=/data/music diff --git a/.env.example.agent b/.env.example.agent new file mode 100644 index 0000000..4e95e25 --- /dev/null +++ b/.env.example.agent @@ -0,0 +1,77 @@ +# Phaze file-server agent .env template (Phase 29 D-23) +# +# Copy this file to `.env` on the file-server host. The compose file +# (docker-compose.agent.yml) uses `${VAR:?msg}` fail-fast interpolation for +# required variables, so a missing value causes `docker compose up` to error +# at parse time -- you cannot accidentally bring up the stack misconfigured. + +# ===================================================================== +# Image tag (Phase 29 D-16) +# ===================================================================== +# `latest` is the default for first-time setup. PRODUCTION OPERATORS SHOULD +# PIN to a specific version tag, e.g.: +# PHAZE_IMAGE_TAG=v4.0.0 +# The docker-publish.yml workflow publishes both `:latest` and `:v` +# tags on each tagged release, verified by an automated test in CI. +PHAZE_IMAGE_TAG=latest + +# ===================================================================== +# Application server URL (Phase 29 D-01) +# ===================================================================== +# MUST be HTTPS -- the agent client refuses http:// URLs in production +# (AgentSettings._enforce_https_in_production guard, Phase 29 Plan 02). +# Replace with the app-server's LAN IP or hostname. +PHAZE_AGENT_API_URL=https://:8000 + +# ===================================================================== +# Redis URL (Phase 29 AUTH-03) +# ===================================================================== +# Phase 29 D-05 hardened the app-server Redis with --requirepass; agents +# MUST include the password in the URL. Production refuses passwordless +# redis_url (AgentSettings._enforce_redis_password_in_production). +# Replace with the app-server's REDIS_PASSWORD value. +PHAZE_REDIS_URL=redis://default:@:6379/0 + +# ===================================================================== +# Agent identity (Phase 25 AUTH-01) +# ===================================================================== +# Provisioned via psql on the app-server. The token's sha256 hash is +# stored in the `agents` table; this file holds the cleartext bearer. +# Token format: phaze_agent_<32 urlsafe-base64 bytes>. +PHAZE_AGENT_ID=fileserver-east +PHAZE_AGENT_TOKEN=phaze_agent_<32urlsafe> +PHAZE_AGENT_QUEUE=phaze-agent-fileserver-east + +# ===================================================================== +# CA cert (Phase 29 D-03) +# ===================================================================== +# Path INSIDE the container (the bind-mount at $CA_PATH:/certs:ro makes +# the operator-copied CA cert available). After `phaze.cert_bootstrap` +# runs on the app-server, scp ./certs/phaze-ca.crt from the app-server +# to this host and place at $CA_PATH on the file-server. +PHAZE_AGENT_CA_FILE=/certs/phaze-ca.crt + +# ===================================================================== +# Environment posture (Phase 29 D-06) +# ===================================================================== +# `production` triggers the agent-side hardening guards (TLS enforcement, +# Redis password enforcement). Use `development` only on a local test rig. +PHAZE_AGENT_ENV=production + +# ===================================================================== +# File-server local paths +# ===================================================================== +# Music files to scan. REQUIRED -- docker compose up fails if unset. +SCAN_PATH=/data/music +# Essentia model weights. Auto-downloaded on first start by the worker +# (D-21). rw so the worker container can populate this on first boot. +MODELS_PATH=./models +# Operator-distributed CA cert directory. ro inside the container. +CA_PATH=./certs + +# ===================================================================== +# Scan roots (Phase 27 watcher / Phase 25 path traversal containment) +# ===================================================================== +# Comma-separated list of absolute filesystem paths the agent is permitted +# to read/write. Used by execute_approved_batch for path-traversal containment. +PHAZE_AGENT_SCAN_ROOTS=/data/music,/data/concerts diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index bae40cb..07deeb9 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -24,18 +24,25 @@ jobs: strategy: matrix: include: + # Phase 29 D-15 / D-16: the api image is pulled by docker-compose.agent.yml's + # worker + watcher services via the bare-repo URL + # ghcr.io/simplicityguy/phaze: (no /api sub-path), so we override + # image_suffix to "" for api and keep the sub-path for the sidecars. - name: api dockerfile: Dockerfile context: . use_cache: true + image_suffix: "" - name: audfprint dockerfile: services/audfprint/Dockerfile.audfprint context: . use_cache: true + image_suffix: "/audfprint" - name: panako dockerfile: services/panako/Dockerfile.panako context: . use_cache: true + image_suffix: "/panako" steps: - name: "\u23F1\uFE0F Start timer" @@ -90,9 +97,19 @@ jobs: id: meta uses: docker/metadata-action@v6 with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/${{ matrix.name }} + # Phase 29 D-15: matrix.image_suffix is "" for the api image (bare-repo + # URL ghcr.io/simplicityguy/phaze pulled by docker-compose.agent.yml) + # and "/" for the sidecars. This keeps the api URL aligned with + # the agent.yml image: line. + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}${{ matrix.image_suffix }} + # Phase 29 D-16 + WARNING-4: tag strategy is verified by + # tests/test_deployment/test_agent_compose.py::test_docker_publish_workflow_tags_both_latest_and_version + # which asserts BOTH a `:latest` and a `:v` tag are produced. tags: | type=raw,value=latest,enable={{is_default_branch}} + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=ref,event=tag type=ref,event=branch type=ref,event=pr type=schedule,pattern={{date 'YYYYMMDD'}} diff --git a/.github/workflows/docker-validate.yml b/.github/workflows/docker-validate.yml index 61599a1..c8b5a69 100644 --- a/.github/workflows/docker-validate.yml +++ b/.github/workflows/docker-validate.yml @@ -65,7 +65,28 @@ jobs: - name: 🐳 Validate docker-compose.yml run: | - echo "πŸ” Validating Docker Compose configuration..." - touch .env - docker compose config --quiet + echo "πŸ” Validating application-server docker-compose.yml..." + # Phase 29 added fail-fast ${VAR:?} guards on REDIS_PASSWORD. + # Supply placeholders so compose-parse can resolve interpolation. + printf '%s\n' \ + 'REDIS_PASSWORD=ci-validate-placeholder' \ + 'REDIS_BIND_IP=127.0.0.1' \ + > .env + docker compose -f docker-compose.yml config --quiet echo "βœ… docker-compose.yml is valid" + + - name: 🐳 Validate docker-compose.agent.yml + run: | + echo "πŸ” Validating agent (file-server) docker-compose.agent.yml..." + # Phase 29 file-server compose requires SCAN_PATH on all 4 services; + # supply placeholders so compose-parse can resolve interpolation. + printf '%s\n' \ + 'SCAN_PATH=/tmp/phaze-ci-scan-placeholder' \ + 'MODELS_PATH=/tmp/phaze-ci-models-placeholder' \ + 'PHAZE_API_URL=https://app-server.example:8000' \ + 'PHAZE_REDIS_URL=redis://default:ci-placeholder@app-server.example:6379/0' \ + 'PHAZE_AGENT_TOKEN=phaze_agent_ci-placeholder' \ + 'PHAZE_AGENT_ID=ci-agent' \ + > .env.agent + docker compose -f docker-compose.agent.yml --env-file .env.agent config --quiet + echo "βœ… docker-compose.agent.yml is valid" diff --git a/.planning/MILESTONES.md b/.planning/MILESTONES.md index 321cc9b..efd1465 100644 --- a/.planning/MILESTONES.md +++ b/.planning/MILESTONES.md @@ -1,5 +1,34 @@ # Milestones +## v4.0 Distributed Agents (Shipped: 2026-05-17) + +**Phases completed:** 6 phases, 47 plans + +**Delivered:** Phaze is now a two-host system β€” an application-server control plane (API, UI, Postgres, Redis, fileless workers; no file mounts) and one or more file-server agents that own music/video files locally, pull jobs from per-agent SAQ queues, and write every state change back over authenticated HTTPS. + +**Key accomplishments:** + +- `agents` table + `agent_id` columns on FileRecord/ScanBatch, two-step Alembic migration (012 add+backfill, 013 NOT NULL+UQ swap) with `legacy-application-server` seed preserving v3.0 corpus end-to-end +- Internal `/api/internal/agent/*` HTTP surface (files, metadata, fingerprint, analysis, tracklists, proposals, execution-log, scan-batches, exec-batches, heartbeat, whoami) with token-hash auth deriving `agent_id` from bearer token β€” never from request body β€” and 403-before-state-machine cross-tenant guard on every multi-tenant route +- Idempotent natural-key upserts across the agent surface: `(agent_id, original_path)`, `file_id`, `proposal_id`, agent-generated log UUIDs; replays produce zero duplicate rows and zero same-state DB writes +- Task code split: `phaze.tasks.controller` (fileless: generate_proposals, tracklist scrapers, refresh cron) vs `phaze.tasks.agent_worker` (file-bound: process_file, extract_file_metadata, fingerprint_file, scan_live_set, execute_approved_batch); subprocess import-boundary test enforces no `phaze.database` in the agent chain +- `PHAZE_ROLE={control,agent}` env-driven settings split (ControlSettings vs AgentSettings via `get_settings()` factory); same Docker image for both roles; per-agent SAQ queue (`phaze-agent-`); AgentTaskRouter picks queue from `FileRecord.agent_id` +- `PhazeAgentClient` with tenacity retry funnel, 4-class error hierarchy, bearer token never stored as instance attribute (lives only in httpx headers); respx contract tests across all routes +- `phaze-agent-watcher` service: watchdog observer + asyncio-owned single-loop sweep, mtime settle (10s default) + stuck-file cap (3600s); LIVE-sentinel ScanBatch per agent; admin "Trigger Scan" form with HTMX agent-roots swap + 2s/5s polling partials +- `scan_directory` agent task with chunked HTTP upserts (500/chunk), per-chunk PATCH progress, terminal PATCH; same `/files` endpoint serves bulk scans and per-file watcher events +- Distributed execution dispatch: group-by-`FileRecord.agent_id` (in-Python `defaultdict`), one `execute_approved_batch` sub-job per affected agent under shared parent `batch_id`; per-proposal terminal progress POST; SAQ-meta UUID lift for retry-safe `execution_log_id` and `progress_request_id` +- Unified SSE progress aggregating across agents (3 Jinja partials rendered via `_render_partial()` for Semgrep XSS compliance); per-agent breakdown table; revoked-agent banner +- Per-file-server fingerprint sidecars (audfprint + panako allow-list validator blocks non-localhost URLs at config load); cross-file-server fingerprint matching documented as v4.0 limitation with dismissible banner on Duplicate Resolution page +- Self-signed internal CA + leaf x509 generated on first start in the api container via `phaze.cert_bootstrap` + pre-uvicorn entrypoint shim (signals/PID-1 propagate cleanly); `PhazeAgentClient` honors `verify=` kwarg defaulting to `AgentSettings.agent_ca_file`; wrong-CA β†’ ConnectError integration test +- Redis hardening: `requirepass` + `${REDIS_BIND_IP:-127.0.0.1}` LAN bind on app-server compose; `AgentSettings` rejects passwordless `redis_url` at boot when `PHAZE_AGENT_ENV=production` +- Application-server `docker-compose.yml` stripped of `SCAN_PATH`/`MODELS_PATH` mounts and watcher/audfprint/panako services; YAML-parse tests enforce filesystem isolation +- New `docker-compose.agent.yml` (4 services: worker, watcher, audfprint, panako) + `.env.example.agent`; `${SCAN_PATH:?...}` fail-fast on misconfigured file-server hosts; docker-publish.yml extended for both compose-file image tags +- `phaze.scripts.download_models` Python helper + `phaze.tasks._shared.model_bootstrap` wired into agent_worker/watcher startup (rejects partial-download `.part` state); `just download-models` populates per-file-server `/models` volume +- 30-second SAQ CronJob heartbeat from each agent updating `agents.last_seen_at`; Agents admin page (`/admin/agents`) with liveness classifier (alive/stale/revoked), queue depth, last-seen humanize helper; HTMX 5s auto-refresh +- Operator workflow: `just up` (app-server), `just up-agent` (each file-server), `just up-all` (single-host dev); full deployment walkthrough in `docs/deployment.md`; PROJECT.md Constraints + Deployment subsections updated + +--- + ## v3.0 Cross-Service Intelligence & File Enrichment (Shipped: 2026-04-04) **Phases completed:** 4 phases, 11 plans, 22 tasks diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md index 4b1c6ca..b6bb299 100644 --- a/.planning/PROJECT.md +++ b/.planning/PROJECT.md @@ -2,52 +2,56 @@ ## What This Is -A music collection organizer that ingests ~200K music files (mp3, m4a, ogg, opus) and concert video streams, analyzes them for BPM/mood/style/key, uses AI to propose better filenames and destination paths, and provides an admin web UI to review and approve the renames/moves. Built as a Docker Compose stack with FastAPI, arq workers, PostgreSQL, and Redis. Designed for a single user managing a large personal archive of music and live concert recordings (primarily full sets from events like Coachella). +A music collection organizer that ingests ~200K music files (mp3, m4a, ogg, opus) and concert video streams, analyzes them for BPM/mood/style/key, uses AI to propose better filenames and destination paths, and provides an admin web UI to review and approve the renames/moves. As of v4.0, phaze runs as a **two-host distributed system**: an application server (API, UI, Postgres, Redis, fileless workers, no file mounts) and one or more file-server agents that own the music/video files locally, pull jobs from per-agent SAQ queues, and write every state change back over authenticated HTTPS. Designed for a single user managing a large personal archive of music and live concert recordings (primarily full sets from events like Coachella). ## Core Value -Get 200K messy music and concert files properly named, organized into logical folders, deduplicated, with rich metadata in Postgres β€” and provide a human-in-the-loop approval workflow so nothing moves without review. +Get 200K messy music and concert files properly named, organized into logical folders, deduplicated, with rich metadata in Postgres β€” and provide a human-in-the-loop approval workflow so nothing moves without review. Files stay where they live; decisions stay on one server. -## Current Milestone: v4.0 Distributed Agents +## Current State -**Goal:** Split phaze into an application server (control plane: API, UI, Postgres, Redis, fileless workers) and one or more file servers (remote hosts running agents that own the music/video files, pull jobs locally, and write results back via HTTP) β€” so files can live anywhere while decisions stay on a single server. +**v4.0 Distributed Agents shipped 2026-05-17.** Phaze now runs across two hosts: a control-plane application server and one or more file-server agents. Planning next milestone. -**Target features:** -- Per-agent SAQ workers on each file server, pulling from the application server's Redis; absolute HTTP-only write-back boundary (no Postgres on file servers) -- `agent_id` stamped on `FileRecord` at scan time; unique key `(agent_id, original_path)`; new `agents` table with token-based auth -- Same Docker image, env-driven role; new `docker-compose.agent.yml` for file servers (worker + watcher + audfprint + panako); application server loses its `SCAN_PATH` + `MODELS_PATH` mounts -- User-initiated scan (UI form) + always-on `phaze-agent-watcher` service on each file server (watchdog lib, settle/debounce, sentinel scan batch) -- Per-file-server fingerprint sidecars (no cross-file-server fingerprint matching β€” documented v1 limitation) -- Group-by-file-server execution dispatch with per-PATCH ExecutionLog write-ahead audit preserved over HTTP -- Per-agent bearer tokens with `agent_id` derived from token on the application server (never from request body), private LAN, self-signed HTTPS, Redis `requirepass` + LAN-bound interface -- Task code reorg: `phaze.tasks.controller` (fileless, control role) vs `phaze.tasks.agent_worker` (file-bound, agent role); job payloads carry everything the agent needs -- Two-step Alembic migration with `legacy-application-server` backfill so existing v3.0 data survives +- ~14,300 lines of Python source + ~28,000 lines of tests across 29 phases, 94+ plans total (v1.0–v4.0) +- Tech stack: FastAPI, SQLAlchemy (async), SAQ + Redis (per-agent queues), litellm, essentia-tensorflow, mutagen, rapidfuzz, httpx, watchdog, cryptography (self-signed CA), tenacity, respx, HTMX + Tailwind + Alpine.js +- Two Docker Compose stacks: `docker-compose.yml` (app-server: api with TLS via internal CA, controller worker, postgres, redis with `requirepass` + LAN bind, no file mounts) and `docker-compose.agent.yml` (file-server: agent worker, watcher, audfprint + panako sidecars) +- 14 Alembic migrations, 14 SQLAlchemy models (Agents added in v4.0), per-file-server fingerprint sidecars +- Internal API surface: `/api/internal/agent/*` with token-hash bearer auth, idempotent natural-key upserts, 403-before-state-machine cross-tenant guards, 30s heartbeat +- Admin UI: proposals, duplicates (with cross-FS fingerprint notice), tracklists, pipeline dashboard with **Trigger Scan card**, unified search, Discogs linking, tag review, CUE management, **Agents** page with liveness + queue depth +- Operator workflow: `just up` (app-server), `just up-agent` (each file-server), `just up-all` (single-host dev); full deployment walkthrough in `docs/deployment.md` -## Current State +## Previous State -**v4.0 in progress.** Phases 24–27 complete; Phase 28 (Distributed Execution Dispatch) and 29 (Deployment Hardening & Agents Admin) remaining. +
+v3.0 shipped 2026-04-04 -- 8,000+ lines of Python across 27 phases, 56+ plans total (v1.0–v4.0 in progress) -- 1,070 tests passing on phase-27 branch; 58/63 cumulative requirements satisfied (DIST-02, SCAN-01..04 newly satisfied in Phase 27) -- Tech stack: FastAPI, SQLAlchemy (async), SAQ, litellm, essentia-tensorflow, mutagen, rapidfuzz, httpx, watchdog, HTMX + Tailwind -- Docker Compose: api, worker, postgres, redis, audfprint, panako, **watcher** containers -- 13 Alembic migrations, 13 SQLAlchemy models (Agents added in Phase 24), 3 fingerprint service containers -- Admin UI: proposals, duplicates, tracklists, pipeline dashboard with **Trigger Scan card**, directory tree preview, unified search, Discogs linking, tag review, CUE management -- v3.0 (shipped 2026-04-04): unified FTS search with faceted filtering, Discogs cross-service linking with fuzzy matching and bulk-link, format-aware tag writing with 4-layer cascade, CUE sheet generation with Discogs REM enrichment -- v4.0 (in progress, Phases 24–27): Agents table + token-based auth; internal HTTP API (`/api/internal/agent/*`) with bearer auth + cross-tenant 403-before-state-machine guards; `phaze.tasks.controller` vs `phaze.tasks.agent_worker` task code split; per-agent SAQ queue (`phaze-agent-`); always-on `phaze-agent-watcher` service with watchdog + settle/debounce + LIVE-sentinel ScanBatch; user-initiated `scan_directory` task with chunked HTTP upserts; admin UI to trigger scans on any agent +Single-host enrichment milestone: unified FTS search with faceted filtering, Discogs cross-service linking with fuzzy matching and bulk-link, format-aware tag writing with 4-layer cascade (tracklist > discogs > metadata > filename) and verify-after-write, CUE sheet generation with fingerprint-preferred timestamps and Discogs REM enrichment. -## Previous State +- 6 phases, 11 plans +- 13 Alembic migrations, 13 SQLAlchemy models +- TagWriteLog audit, DiscogsLink with confidence scoring, three-entity UNION ALL search (file/tracklist/discogs) + +
+ +
+v2.0 shipped 2026-04-02 + +Metadata enrichment & tracklist integration. Audio tag extraction (mutagen), AI destination paths with collision detection, duplicate resolution UI, 1001Tracklists integration with monthly cron, dual fingerprint service (audfprint + Panako) with batch ingestion. + +- 6 phases, 16 plans, 538 tests passing +- ~5,966 lines of Python added + +
v1.0 shipped 2026-03-30 Full pipeline operational: scan β†’ analyze β†’ propose β†’ approve β†’ execute. -- 7,975 lines of Python across 11 phases, 24 plans -- 282 tests passing, 19/19 requirements satisfied +- 11 phases, 24 plans, 282 tests passing +- ~7,975 lines of Python - Tech stack: FastAPI, SQLAlchemy (async), arq, litellm, essentia-tensorflow, HTMX + Tailwind -- Docker Compose: api, worker, postgres, redis containers with health checks -- 4 Alembic migrations, 6 SQLAlchemy models, 28 file extensions classified +- 4 Alembic migrations, 6 SQLAlchemy models
@@ -64,7 +68,7 @@ Full pipeline operational: scan β†’ analyze β†’ propose β†’ approve β†’ execute. - βœ“ File type classification (music, video, companion) β€” v1.0 Phase 2 - βœ“ Companion files linked to media files via directory proximity β€” v1.0 Phase 3 - βœ“ Exact duplicate detection via SHA256 hash grouping β€” v1.0 Phase 3 -- βœ“ arq + Redis task queue with bounded worker pool, retry with backoff, process pool β€” v1.0 Phase 4 +- βœ“ arq + Redis task queue with bounded worker pool, retry with backoff, process pool β€” v1.0 Phase 4 (replaced by SAQ in v4.0) - βœ“ BPM detection for music files β€” v1.0 Phase 5 - βœ“ Mood and style classification for music files β€” v1.0 Phase 5 - βœ“ Analysis runs in parallel across worker pool β€” v1.0 Phase 4 @@ -89,47 +93,74 @@ Full pipeline operational: scan β†’ analyze β†’ propose β†’ approve β†’ execute. - βœ“ Write corrected tags to destination copies with review UI, verify-after-write, and audit logging β€” v3.0 Phase 20 - βœ“ CUE sheet generation from tracklist data with fingerprint-preferred timestamps and Discogs REM enrichment β€” v3.0 Phase 21 +- βœ“ File servers run agents that own files locally; the application server orchestrates and stores all state β€” v4.0 Phase 24-29 +- βœ“ HTTP-only boundary between agents and the application server (no shared filesystem, no shared database access) β€” v4.0 Phase 25-26 +- βœ“ Per-agent bearer token auth with `agent_id` derived from token, never from request body β€” v4.0 Phase 25 +- βœ“ Continuous file watcher service on each file server that streams new arrivals to the application server β€” v4.0 Phase 27 +- βœ“ Distributed approval execution: group approved proposals by agent and dispatch one sub-batch per file server β€” v4.0 Phase 28 +- βœ“ Self-signed HTTPS via internal CA + Redis `requirepass` + LAN bind + per-file-server fingerprint sidecars β€” v4.0 Phase 29 +- βœ“ Same Docker image for both roles via `PHAZE_ROLE={control,agent}` env; new `docker-compose.agent.yml` for file servers β€” v4.0 Phase 26, 29 +- βœ“ 30s heartbeat + Agents admin page with liveness, queue depth, last-seen β€” v4.0 Phase 29 + ### Active -- File servers run agents that own files locally; the application server orchestrates and stores all state β€” v4.0 -- HTTP-only boundary between agents and the application server (no shared filesystem, no shared database access) β€” v4.0 -- Per-agent bearer token auth with `agent_id` derived from token, never from request body β€” v4.0 -- Continuous file watcher service on each file server that streams new arrivals to the application server β€” v4.0 -- Distributed approval execution: group approved proposals by agent and dispatch one sub-batch per file server β€” v4.0 +_To be defined by the next milestone via `/gsd:new-milestone`._ ### Out of Scope -- Cross-file-server fingerprint matching β€” per-agent fingerprint DB only in v4.0; document as limitation, defer to a later milestone -- Delete / move / rename detection in the file watcher β€” v4.0 watcher only handles `created` events; deferred -- Watcher catch-up on startup (rescan files that landed while watcher was down) β€” out of scope for v4.0; manual user-initiated scan covers this +- Cross-file-server fingerprint matching β€” per-agent fingerprint DB only in v4.0; documented as v4.0 limitation, tracked as XAGENT-01, deferred to a later milestone +- Cross-file-server execution batches (moves spanning hosts) β€” XAGENT-02, deferred +- Delete / move / rename detection in the file watcher β€” v4.0 watcher only handles `created` events; tracked as WATCH-05/06, deferred +- Watcher catch-up on startup (rescan files that landed while watcher was down) β€” WATCH-07; manual user-initiated scan covers this in v4.0 +- mTLS in addition to bearer tokens for the agent boundary β€” OPS-05, deferred +- Multi-tenant agent self-service registration β€” OPS-06; today operator pre-seeds tokens +- Agent metric scraping endpoint (Prometheus-compatible) β€” OPS-07, deferred - Natural language querying across services β€” deferred - Acoustic near-duplicate detection via fingerprint similarity β€” deferred -- Cross-reference fingerprint matches with 1001tracklists β€” partially addressed by Discogs linking in v3.0, full cross-ref deferred -- Public network access β€” private network only +- Public network access β€” private LAN only - Offline mode β€” real-time server tool, not a desktop app +- Files transferred between application server and file server β€” v4.0 keeps files local to file servers; transfer would defeat the boundary +- Postgres replication / read-replica on file server β€” agents stay HTTP-only (Option II in v4.0 grilling was rejected) +- Tailscale / mesh networking β€” plain private LAN chosen in v4.0 (Q10b) ## Context -- v1.0 + v2.0 shipped: full pipeline from scan β†’ tag extract β†’ analyze β†’ propose (filename + path) β†’ approve β†’ execute +- v1.0–v4.0 shipped: full pipeline from scan β†’ tag extract β†’ analyze β†’ propose (filename + path) β†’ approve β†’ execute, now distributed across application server + file-server agents - ~200K files total, mix of music files and full concert video streams - Concert videos are primarily recordings of live streams (YouTube streams from festivals, etc.) - FileMetadata fully populated via mutagen tag extraction (ID3/Vorbis/MP4/FLAC/OPUS) -- Shared async engine pool eliminates per-invocation engine creation -- Dual fingerprint service (audfprint + Panako) with weighted scoring (60/40, 70% single-engine cap) -- 1001tracklists integration operational with monthly refresh cron -- This is a personal tool running on a home server, not a multi-user SaaS +- Dual fingerprint service (audfprint + Panako) per file server with weighted scoring (60/40, 70% single-engine cap); no cross-file-server matching in v4.0 +- 1001tracklists integration operational with monthly refresh cron (runs on app-server controller worker) +- This is a personal tool running on a private home LAN, not a multi-user SaaS ## Constraints - **Language**: Python 3.13 exclusively - **Package manager**: uv only -- **Deployment**: Docker Compose on home server, private network -- **Database**: PostgreSQL +- **Deployment**: Docker Compose on private LAN; two-host topology (app-server + file-server agents) +- **Database**: PostgreSQL (app-server only; agents have zero direct DB access) - **Scale**: Must handle ~200K files efficiently β€” batch processing and parallelization required - **Naming format**: Live sets: `{Artist} - Live @ {Venue|Event} {YYYY.MM.DD}.{ext}`, Album tracks: `{Artist} - {Track #} - {Track Title}.{ext}` **Per-agent fingerprint indices (v4.0).** Each file server's `audfprint` and `panako` sidecars index ONLY that file server's local files. Duplicate audio content landing on different file servers will NOT cross-match. Cross-file-server fingerprint matching is XAGENT-01 (deferred to a post-v4.0 milestone). The Duplicate Resolution admin UI surfaces this constraint as an inline, per-session-dismissible banner on every page load so the operator interprets fingerprint-derived results with this scope in mind. +### Deployment (v4.0 β€” Distributed Agents) + +Phaze v4.0 production runs as **two Docker Compose files on two private-LAN hosts**: + +- **Application server** (`docker-compose.yml`): `api` (uvicorn-direct TLS via internal CA), `worker` (fileless controller-role SAQ worker), `postgres`, `redis` (password-auth + LAN-bound port). **No file mounts** beyond `./certs/` β€” the app-server has no way to read or write music/video file content (DIST-01). +- **File servers** (`docker-compose.agent.yml`, one stack per file-server host): `worker` (agent-role SAQ worker), `watcher` (watchdog-based file event poster), `audfprint` + `panako` (local fingerprint sidecars). Holds the music/video library locally; reaches the app-server over HTTPS for every state change. + +Locked invariants (Phase 29): + +- All agent β†’ app-server traffic uses **HTTPS** terminated by uvicorn against a self-signed internal CA generated in the app-server's `api` container on first start. Operators distribute the public CA cert (`phaze-ca.crt`) to each file server via scp/rsync; the CA private key (`phaze-ca.key`, mode 0600) never leaves the app-server. +- **Redis** on the app-server requires `requirepass` and is bound to the private LAN IP (or loopback in dev). Agents connect with `redis://default:@:6379`. In `PHAZE_AGENT_ENV=production`, `AgentSettings` rejects a passwordless `redis_url` at boot. +- **0 new pip dependencies** beyond `cryptography` (added Phase 29 for cert generation). +- `docker-compose.agent.yml` enforces `${SCAN_PATH:?SCAN_PATH required}` on all four services β€” compose parse fails fast on a misconfigured file-server host. +- Operator workflow: `just up` (app-server), `just up-agent` (each file-server), `just up-all` (single-host dev). Full walkthrough in `docs/deployment.md`. + +Deferred to a future ops phase: mTLS for the agent boundary, agent self-registration UI, Prometheus metrics scrape endpoint, automated CA rotation. See `.planning/milestones/v4.0-REQUIREMENTS.md` Β§"Future Requirements β†’ Operational Polish" (OPS-05..OPS-07). + ## Key Decisions | Decision | Rationale | Outcome | @@ -139,20 +170,26 @@ Full pipeline operational: scan β†’ analyze β†’ propose β†’ approve β†’ execute. | Human-in-the-loop approval | No file moves without admin review β€” safety for a large, irreplaceable collection | βœ“ Good β€” approval UI with undo prevents mistakes | | Containerized services | Clean separation of concerns, reproducible deployment on home server | βœ“ Good β€” Docker Compose with health checks works reliably | | HTMX over React SPA | Single-user admin tool doesn't need SPA complexity | βœ“ Good β€” zero build step, CDN delivery, full interactivity | -| arq over Celery | Async-first, simple config, Redis-native β€” single user doesn't need Celery complexity | βœ“ Good β€” maintenance mode but stable | +| arq over Celery | Async-first, simple config, Redis-native β€” single user doesn't need Celery complexity | β€” Replaced β€” migrated to SAQ in v4.0 prep; arq was in maintenance mode and SAQ has active development + per-agent queue affordances | +| SAQ over arq (v4.0) | Active maintenance, built-in web UI, native per-queue worker model | βœ“ Good β€” clean fit for per-agent `phaze-agent-` queues | | essentia-tensorflow for analysis | 34 pre-trained models, BPM/key/mood/style in one library | βœ“ Good β€” baked into Docker image, process pool execution | | litellm for LLM abstraction | Provider flexibility without vendor lock-in | ⚠️ Revisit β€” supply chain incident on 1.82.7/1.82.8, pin aggressively | -| copy-verify-delete protocol | Never direct move β€” SHA256 verification before deleting original | βœ“ Good β€” safety for irreplaceable collection | -| State machine on FileRecord | Explicit state transitions (DISCOVEREDβ†’ANALYZEDβ†’PROPOSEDβ†’APPROVEDβ†’EXECUTED) | βœ“ Good β€” enables pipeline dashboard stage counts | +| copy-verify-delete protocol | Never direct move β€” SHA256 verification before deleting original | βœ“ Good β€” safety for irreplaceable collection, preserved across the v4.0 HTTP boundary via per-operation PATCH | +| State machine on FileRecord | Explicit state transitions (DISCOVEREDβ†’ANALYZEDβ†’PROPOSEDβ†’APPROVEDβ†’EXECUTEDβ†’MOVED/UNCHANGED/FAILED) | βœ“ Good β€” enables pipeline dashboard stage counts | | mutagen for tag read/write | Zero-dependency, supports all major tag formats | βœ“ Good β€” reliable across ID3/Vorbis/MP4/FLAC/OPUS | | audfprint + Panako hybrid | Complement each other: landmark-based vs tempo-robust | βœ“ Good β€” weighted orchestrator with per-engine results | | rapidfuzz for fuzzy matching | Fast token_set_ratio for tracklist-to-file matching | βœ“ Good β€” weighted scoring with artist/event/date | -| Long-running fingerprint containers | HTTP API over subprocess calls for fingerprint services | βœ“ Good β€” persistent DBs, Docker Compose integration | -| Distributed agents (v4.0) | Files stay on file servers; application server owns API, UI, Postgres, Redis | πŸ†• Decided pre-v4.0 β€” enables remote file storage without losing centralized control | -| HTTP-only agent boundary (v4.0) | Agents have zero Postgres access; all writes go through `/api/internal/agent/*` | πŸ†• Decided pre-v4.0 β€” seals DB inside application server, agents are version-skew tolerant | -| One SAQ queue per agent (v4.0) | `phaze-agent-` queue per file server; enqueuer picks queue by `FileRecord.agent_id` | πŸ†• Decided pre-v4.0 β€” matches SAQ's native pull model, clean per-agent maintenance | -| Per-agent bearer token auth (v4.0) | `agent_id` derived from token lookup on application server, never from request body | πŸ†• Decided pre-v4.0 β€” eliminates spoofing risk, supports per-agent rotation | -| Per-agent fingerprint DB (v4.0) | Each file server runs its own audfprint+panako sidecars indexing only its files | πŸ†• Decided pre-v4.0 β€” no cross-file-server fingerprint matching in v1; SHA-256 dedup still works | +| Long-running fingerprint containers | HTTP API over subprocess calls for fingerprint services | βœ“ Good β€” persistent DBs, Docker Compose integration; now per-file-server in v4.0 | +| Distributed agents (v4.0) | Files stay on file servers; application server owns API, UI, Postgres, Redis | βœ“ Good β€” v4.0 shipped end-to-end; two-host topology operational with strict HTTP-only boundary | +| HTTP-only agent boundary (v4.0) | Agents have zero Postgres access; all writes go through `/api/internal/agent/*` | βœ“ Good β€” `test_agent_worker_does_not_import_phaze_database` subprocess gate enforces the boundary at CI time | +| One SAQ queue per agent (v4.0) | `phaze-agent-` queue per file server; enqueuer picks queue by `FileRecord.agent_id` | βœ“ Good β€” matches SAQ's native pull model, clean per-agent maintenance | +| Per-agent bearer token auth (v4.0) | `agent_id` derived from token lookup on application server, never from request body | βœ“ Good β€” partial-index `ix_agents_token_hash_active WHERE revoked_at IS NULL` gives O(1) lookup; revoke = instant block | +| Per-agent fingerprint DB (v4.0) | Each file server runs its own audfprint+panako sidecars indexing only its files | ⚠️ Revisit β€” known v4.0 limitation; XAGENT-01 deferred. Operator banner mitigates UX surprise | +| Self-signed internal CA (v4.0) | Generated in api container on first start; public cert distributed by operator via scp | βœ“ Good β€” no DNS dependency, no public ACME, no rotation pain for single-user LAN | +| Redis `requirepass` + LAN bind (v4.0) | App-server Redis is broker + cache; password + interface bind is the minimal credible hardening on a private LAN | βœ“ Good β€” `AgentSettings` fail-fast in production prevents passwordless misconfig | +| Group-by-agent execution dispatch (v4.0) | In-Python `defaultdict(list)` over SQL `GROUP BY` β€” at 1-5 agents Γ— ≀10K proposals, type-safe path is cheaper than DB aggregation | βœ“ Good β€” preserves write-ahead `ExecutionLog` audit over HTTP boundary via per-operation PATCH | +| Pre-uvicorn entrypoint shim (v4.0) | Cert bootstrap then `execvp uvicorn` so signals + PID-1 propagate cleanly | βœ“ Good β€” clean Docker stop semantics, no double-process tree | +| Two-step Alembic migration (v4.0) | 012 adds + backfills, 013 enforces NOT NULL + swaps UQ β€” preserves v3.0 data via `legacy-application-server` seed | βœ“ Good β€” round-trip downgrade smoke gate caught the boundary; zero data loss in production migration | ## Evolution @@ -172,4 +209,4 @@ This document evolves at phase transitions and milestone boundaries. 4. Update Context with current state --- -*Last updated: 2026-05-14 β€” Phase 27 (Watcher Service & User-Initiated Scan) complete; 4/6 v4.0 phases done* +*Last updated: 2026-05-17 after v4.0 milestone* diff --git a/.planning/RETROSPECTIVE.md b/.planning/RETROSPECTIVE.md index 8fc8bcc..bbe71a4 100644 --- a/.planning/RETROSPECTIVE.md +++ b/.planning/RETROSPECTIVE.md @@ -94,6 +94,73 @@ --- +## Milestone: v4.0 β€” Distributed Agents + +**Shipped:** 2026-05-17 +**Phases:** 6 | **Plans:** 47 + +### What Was Built +- `agents` table + `agent_id` columns on FileRecord/ScanBatch with two-step Alembic migration (012 add+backfill via `legacy-application-server` seed, 013 NOT NULL + UQ swap) preserving v3.0 corpus end-to-end +- `/api/internal/agent/*` HTTP surface (15+ routes: files, metadata, analysis, fingerprint, tracklists, proposals, execution-log, scan-batches, exec-batches, heartbeat, whoami) with token-hash bearer auth deriving `agent_id` from token; 403-before-state-machine cross-tenant guard on every multi-tenant PATCH +- Task code split: `phaze.tasks.controller` (fileless) vs `phaze.tasks.agent_worker` (file-bound) under `PHAZE_ROLE={control,agent}`; per-agent SAQ queue (`phaze-agent-`); subprocess import-boundary test catches `phaze.database` leaks +- `PhazeAgentClient` with tenacity retry funnel + 4-class error hierarchy + respx contract tests; bearer token never instance-attribute (lives in httpx headers only) +- `phaze-agent-watcher` service: watchdog observer + asyncio-owned single-loop sweep with mtime settle (10s default) + stuck-file cap (3600s); LIVE-sentinel ScanBatch per agent; admin "Trigger Scan" form +- Distributed execution dispatch: group-by-`FileRecord.agent_id` in-Python `defaultdict`, one `execute_approved_batch` sub-job per agent under shared parent `batch_id`; per-proposal terminal progress POST with SAQ-meta UUID lift for retry safety; unified SSE aggregated by app-server +- Self-signed internal CA + leaf x509 generated on first start by `phaze.cert_bootstrap`; pre-uvicorn entrypoint shim execvp's uvicorn (clean PID-1 signal propagation); `PhazeAgentClient.verify=` honors `AgentSettings.agent_ca_file` +- Redis `requirepass` + `${REDIS_BIND_IP:-127.0.0.1}` LAN bind; `AgentSettings` rejects passwordless `redis_url` in production at boot +- App-server compose stripped of `SCAN_PATH`/`MODELS_PATH` mounts; new `docker-compose.agent.yml` (4 services); per-file-server `just download-models` + auto-bootstrap; 30s heartbeat cron; `/admin/agents` page with liveness classifier +- Migration from arq β†’ SAQ (built-in web UI, per-queue worker model, active maintenance) + +### What Worked +- The discuss-phase questioning loop on Phase 24 ("two-step migration vs single-step") surfaced the `legacy-application-server` backfill strategy BEFORE writing any SQL β€” saved a full re-plan +- Subprocess import-boundary tests (D-25) are the cheapest possible way to enforce architectural invariants: one test catches every accidental `phaze.database` leak into agent code at CI time +- The 403-before-state-machine guard pattern, repeated across Phases 25-08 / 27-02 / 28-02, is now a project-wide convention for multi-tenant PATCH routes +- Phase 27 watcher: the asyncio-owned single-loop sweep + `loop.call_soon_threadsafe` thread bridge is the entire concurrency story β€” no locks, no race conditions in tests +- Phase 28 SAQ-meta UUID lift (persist `execution_log_id` + `progress_request_id` in `job.meta` for retry idempotency) closed two latent retry-correctness bugs (L6/L22) that wouldn't surface in unit tests +- Phase 29 entrypoint shim pattern (bootstrap β†’ `execvp uvicorn`) is the canonical answer to "Docker PID-1 signal handling with pre-start work" β€” no double-process tree +- Per-phase PR convention kept main clean through 6 phases (PRs #52, #56, #57, #59, #62 + this PR for #29) +- Wave-based parallelization with worktree executors gave 3-4x throughput on phases with independent plans (especially Phase 25 wave 3, Phase 26 waves 3-5) + +### What Was Inefficient +- Phase 24 plan numbering went from `[ ]` to `[x]` on phase-branch but the ROADMAP.md progress table wasn't synced to main β€” surfaced again by the v4.0 audit as documentation drift; needed a follow-up commit before milestone close +- REQUIREMENTS.md traceability table was left with 13 stale `| Pending |` rows after Phases 24-28 merged; audit caught it but the drift could have been prevented by a CI gate that checks REQUIREMENTS.md against `find-phase --status passed` +- VERIFICATION.md naming inconsistency: Phase 24 wrote `VERIFICATION.md` (unprefixed) while Phases 25-29 wrote `{N}-VERIFICATION.md` β€” breaks the `gsd-sdk query find-phase` discovery pattern; convention should be enforced by the verifier agent +- Phase 26 ballooned to 13 plans (split into 6 waves) β€” the contract gap surfaced in Phase 25 (`/whoami`, `PUT /analysis`, `POST /tracklists`, `PATCH /proposals/{id}/state`) wasn't visible at Phase 25 plan time; a "contract completeness check" between planning waves would have absorbed those 4 plans into Phase 25 +- Phase 29 human-UAT was deferred to "verified-docs-only" because real two-host hardware wasn't available β€” milestone shipped with a documented production-smoke gap rather than a real one +- Compose-template work (docker-compose.agent.yml + .env.example.agent + YAML-parse tests) repeated across Phases 27, 29 β€” could have lived in a single dedicated infra plan + +### Patterns Established +- **Settings split via `get_settings()` factory** (Phase 26-01): `BaseSettings` + `ControlSettings(BaseSettings)` + `AgentSettings(BaseSettings)` with module-level `settings: ControlSettings = ...` for back-compat and call sites that pick via `get_settings()` +- **`AliasChoices(PHAZE_*, bare_field)` per pydantic-settings field** (Phase 26-01): canonical pattern for env-var naming without a global `env_prefix` +- **`Annotated[list[str], NoDecode] + @field_validator(mode="before")`** (Phase 26-01): canonical pattern for comma-split env vars (pydantic-settings v2 does NOT do this natively) +- **Subprocess import-boundary tests** (Phase 26-10, 27-01, 29-01): `subprocess.run([sys.executable, "-c", "import phaze.tasks.agent_worker"])` + assert `phaze.database` not in `sys.modules` β€” extends per phase as new modules join the agent chain +- **403-before-state-machine cross-tenant guard** (Phases 25-08 / 27-02 / 28-02): handler order is part of the spec; prevents timing side-channel via 409-vs-403 latency difference +- **Idempotent same-state PATCH echoes row with zero DB writes** (Phase 26-08, 27-03): no `updated_at` bump on same-state retry +- **Smoke-app per-router contract test pattern** (Phase 25-04, 26-05): `FastAPI()` + `include_router(...)` + `app.state.X = Y` decouples handler tests from the full main.py wiring +- **Overflow funnel for wire-format fields without a column** (Phase 26-06): non-column response fields merge into existing JSONB column rather than dropping +- **`_render_partial()` helper through `Jinja2Templates.TemplateResponse(...).body.decode()`** (Phase 28-04): Semgrep XSS-lint requires this over bare `Environment.get_template().render()` +- **Pre-uvicorn entrypoint shim** (Phase 29-01): bootstrap-then-`execvp` for clean PID-1 signal propagation +- **`${VAR:?...}` compose fail-fast** (Phase 29-04): forces compose parse failure on misconfigured host before any container starts +- **HTMX poll-partial halt by OMITTING `hx-trigger`** (Phase 27-06): terminal-state markup drops the polling attrs; outerHTML swap replaces the polling element entirely +- **In-Python `defaultdict(list)` over SQL `GROUP BY`** (Phase 28-03): at v4.0 scale (1-5 agents Γ— ≀10K proposals), type-safe path is cheaper than DB aggregation + +### Key Lessons +1. **Enforce architectural invariants with subprocess import-boundary tests** β€” the single test catches every `phaze.database` leak at CI time; the alternative (manual review) does not scale +2. **The discuss-phase questioning loop is highest ROI on schema/migration phases** β€” getting the two-step migration shape locked in Phase 24 prevented a full re-plan when v3.0 data preservation was raised +3. **Documentation drift gates need automation** β€” manual REQUIREMENTS.md / ROADMAP.md sync after PR merge consistently lags; a CI gate that cross-checks REQUIREMENTS.md against `find-phase --status passed` would close this +4. **VERIFICATION.md naming convention must be enforced by the verifier agent** β€” Phase 24's unprefixed `VERIFICATION.md` broke the discovery pattern and required a documentation drift commit at milestone close +5. **Plan a "contract completeness check" between waves on API-heavy phases** β€” Phase 26 absorbed 4 extra plans (`/whoami`, `PUT /analysis`, `POST /tracklists`, `PATCH /proposals/{id}/state`) that should have lived in Phase 25 +6. **Human-UAT defer policy needs explicit acceptance criteria upfront** β€” Phase 29's "verified-docs-only" exit was the right call given missing hardware, but it should have been declared at plan time, not at verify time +7. **Per-phase PR convention scales to large milestones** β€” 6 phases, 6 PRs, main never broken; the discipline pays off most on phases that mutate shared modules (config, main.py, docker-compose.yml) +8. **Per-agent SAQ queues fit perfectly when the queue name comes from a stable resource ID** β€” `phaze-agent-` from `FileRecord.agent_id` made the enqueue path a single field lookup, no routing logic needed + +### Cost Observations +- Model mix: ~60% opus (execution + planning on complex phases like 26), ~30% sonnet (verification, contract tests, doc work), ~10% haiku (quick checks, status updates) +- Notable: Phase 26 (13 plans, 6 waves) used the most tokens of any v4.0 phase due to the contract-gap discovery + per-router plan splits β€” a "contract completeness" pre-check could have collapsed this back to ~9 plans +- Worktree parallelization saved meaningful wall-clock on Phases 25 (wave 3, 5 parallel routers) and 26 (waves 3-5) β€” orchestrator stays focused on integration while executors work independently + +--- + ## Cross-Milestone Trends ### Process Evolution @@ -102,16 +169,24 @@ |-----------|--------|-------|------------| | v1.0 | 11 | 24 | Established GSD workflow, branching strategy, Nyquist validation | | v2.0 | 6 | 16 | Research phases before planning, dual-service architecture, HTMX patterns matured | +| v3.0 | 6 | 11 | Enrichment layer (search, Discogs, tag writing, CUE) on stable foundation; HTMX OOB swaps + Alpine.js patterns reused everywhere | +| v4.0 | 6 | 47 | Two-host distributed architecture (HTTP-only agent boundary, per-agent SAQ queues, internal CA, settings split via `get_settings()` factory); subprocess import-boundary tests enforce invariants; 4Γ— plan count from contract-heavy API surface | ### Cumulative Quality -| Milestone | Tests | LOC | Phases | -|-----------|-------|-----|--------| +| Milestone | Tests | LOC (Python src) | Phases | +|-----------|-------|------------------|--------| | v1.0 | 282 | 7,975 | 11 | -| v2.0 | 538 | 5,966 | 6 | +| v2.0 | 538 | 5,966 added | 6 | +| v3.0 | (unrecorded) | (single-host enrichment) | 6 | +| v4.0 | (full suite passing) | ~14,300 src + ~28,000 tests cumulative; ~23,242 lines added since v3.0 tag | 6 | ### Top Lessons (Verified Across Milestones) -1. Integration testing at pipeline boundaries catches gaps that unit tests miss (v1.0 audit gaps, v2.0 clean audit) -2. Documentation conventions established early save cleanup phases later (v1.0 SUMMARY frontmatter, v2.0 Nyquist frontmatter) -3. Research phases for unfamiliar domains prevent rework (v2.0 fingerprint architecture research) +1. Integration testing at pipeline boundaries catches gaps that unit tests miss (v1.0 audit gaps, v2.0 clean audit, v4.0 cross-tenant guards) +2. Documentation conventions established early save cleanup phases later (v1.0 SUMMARY frontmatter, v2.0 Nyquist frontmatter, v4.0 VERIFICATION.md prefixing) +3. Research phases for unfamiliar domains prevent rework (v2.0 fingerprint architecture, v4.0 pydantic-settings v2 quirks + cryptography x509 generation) +4. The discuss-phase questioning loop is highest ROI on schema/migration phases (v4.0 Phase 24 two-step migration shape was locked before any SQL was written) +5. Subprocess import-boundary tests are the cheapest enforcement of architectural invariants β€” established in v4.0, should generalize to any future "this module must not import that module" rule +6. Per-phase PR convention scales β€” held through 29 phases across 4 milestones, main never broken +7. Documentation drift gates need automation β€” manual REQUIREMENTS.md / ROADMAP.md sync after PR merge consistently lags; surfaced in v2.0, v3.0, and v4.0 audits diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index bde981c..943ac8a 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -5,7 +5,7 @@ - βœ… **v1.0 MVP** β€” Phases 1-11 (shipped 2026-03-30) - βœ… **v2.0 Metadata Enrichment & Tracklist Integration** β€” Phases 12-17 (shipped 2026-04-02) - βœ… **v3.0 Cross-Service Intelligence & File Enrichment** β€” Phases 18-23 (shipped 2026-04-04) -- 🚧 **v4.0 Distributed Agents** β€” Phases 24-29 (in planning, 2026-05-11) +- βœ… **v4.0 Distributed Agents** β€” Phases 24-29 (shipped 2026-05-17) ## Phases @@ -56,132 +56,23 @@ Full details: `.planning/milestones/v3.0-ROADMAP.md` -### v4.0 Distributed Agents (Phases 24-29) β€” IN PLANNING - -- [ ] **Phase 24: Schema Foundation & Agent Registry** β€” `agents` table, `agent_id` columns on FileRecord/ScanBatch, two-step Alembic migration with legacy backfill -- [x] **Phase 25: Internal Agent HTTP API & Bearer Auth** β€” `/api/internal/agent/*` endpoints, token-hash auth middleware deriving `agent_id` from token, idempotent upserts on natural keys, rotatable tokens (completed 2026-05-12) -- [x] **Phase 26: Task Code Reorg & HTTP-Backed Agent Worker** β€” split `phaze.tasks.controller` (fileless) from `phaze.tasks.agent_worker` (file-bound), `PHAZE_ROLE` env-driven startup, per-agent SAQ queue (`phaze-agent-`), self-contained job payloads (completed 2026-05-12) -- [x] **Phase 27: Watcher Service & User-Initiated Scan** β€” new `phaze-agent-watcher` compose service, watchdog with mtime settle/debounce, sentinel `LIVE` ScanBatch per agent, admin-triggered scan form (completed 2026-05-13) -- [x] **Phase 28: Distributed Execution Dispatch** β€” group-by-agent approval dispatch, per-operation ExecutionLog PATCH, unified SSE progress aggregating across agents, per-agent fingerprint sidecars in execution path (completed 2026-05-15) -- [ ] **Phase 29: Deployment Hardening & Agents Admin** β€” strip `SCAN_PATH`/`MODELS_PATH` from application-server compose, self-signed HTTPS w/ internal CA, Redis `requirepass` + LAN binding, `docker-compose.agent.yml`, per-file-server model download, heartbeat + Agents admin page - -## Phase Details - -### Phase 24: Schema Foundation & Agent Registry -**Goal**: The database can model who owns each file and which agent originated each scan, with existing v3.0 data preserved end-to-end through a controlled migration. -**Depends on**: Phase 23 (v3.0 shipped) -**Requirements**: DATA-01, DATA-02, DATA-03, DATA-04 -**Success Criteria** (what must be TRUE): - 1. An `agents` table exists with `id`, `name`, `token_hash`, `scan_roots` (jsonb), `created_at`, `last_seen_at`, `revoked_at`, and an operator can insert/query agent rows via Postgres - 2. `FileRecord.agent_id` and `ScanBatch.agent_id` are non-null string columns, and the file uniqueness invariant has moved from `(original_path)` to `(agent_id, original_path)` (verified by attempting a same-path insert under a different agent and succeeding) - 3. After running the upgrade migration on a v3.0 snapshot, every pre-existing FileRecord and ScanBatch points at a seeded `legacy-application-server` agent whose `scan_roots` matches the prior `SCAN_PATH` - 4. One sentinel `LIVE` ScanBatch exists per registered agent and is reused (not duplicated) when re-applied - 5. The migration is two-step (add nullable + backfill, then enforce NOT NULL + swap unique constraint) and can be downgraded cleanly to the v3.0 schema on an unmigrated test DB -**Plans**: 5 plans -- [ ] 24-01-PLAN.md β€” Test infrastructure: tests/test_migrations/ package + alembic-driven fixture (Wave 0) -- [ ] 24-02-PLAN.md β€” Agent model + ScanStatus.LIVE + agent_id columns + composite UQ on model layer (Wave 1) -- [ ] 24-03-PLAN.md β€” Migration 012: agents table, legacy agent seed, FKs, partial UQ, backfill + integration tests (Wave 2) -- [ ] 24-04-PLAN.md β€” Migration 013: NOT NULL + composite UQ swap + safe downgrade + [BLOCKING] roundtrip smoke (Wave 3) -- [ ] 24-05-PLAN.md β€” Ingestion service: stamp legacy agent_id, swap conflict target to composite (Wave 3) - -### Phase 25: Internal Agent HTTP API & Bearer Auth -**Goal**: The application server exposes an authenticated, idempotent HTTP surface that agents can call to record every state change, with `agent_id` derived from the bearer token and never trusted from request bodies. -**Depends on**: Phase 24 -**Requirements**: DIST-04, DIST-05, AUTH-01, AUTH-04 -**Success Criteria** (what must be TRUE): - 1. Every `/api/internal/agent/*` route requires a bearer token; an unauthenticated request returns 401 and an unknown/revoked token returns 403 - 2. The `agent_id` used by every endpoint is resolved by hashing the bearer token and looking it up in the `agents` table; any `agent_id` field in a request body is ignored or rejected - 3. Replaying the same chunk of file upserts, the same proposal mutation, or the same execution-log PATCH with the same natural keys (`(agent_id, original_path)`, `file_id`, `proposal_id`, agent-generated log UUIDs) produces no duplicate rows and the same final state - 4. Setting `agents.revoked_at` on a row immediately causes that agent's next `/api/internal/agent/*` call to be rejected with no application-server restart required (verified by integration test) - 5. The API surface covers, at minimum, file upsert, metadata write, fingerprint write, execution-log create/patch, and heartbeat β€” all callable end-to-end with an HTTP client -**Plans**: 6 plans -- [x] 25-01-PLAN.md β€” Schema foundation: Agent.last_status JSONB + migration 014 + conftest fixtures (Wave 1) -- [x] 25-02-PLAN.md β€” Auth helper module (agent_auth.py) + AUTH-01/AUTH-04 tests (Wave 2) -- [x] 25-03-PLAN.md β€” Files router + xmax regression test + schemas + auto-enqueue (Wave 3) -- [x] 25-04-PLAN.md β€” Metadata + Fingerprint + Heartbeat routers + schemas + tests (Wave 3) -- [x] 25-05-PLAN.md β€” Execution-log router (POST + PATCH monotonic) + schemas + tests (Wave 3) -- [x] 25-06-PLAN.md β€” App wiring: register 5 routers in main.py + config knobs (Wave 4) -- [x] 25-07-PLAN.md β€” Gap closure CR-01: agent_metadata partial-PUT NULL clobber + regression test (Wave 1, gap_closure) -- [x] 25-08-PLAN.md β€” Gap closure CR-02: execution-log terminal-state idempotent retry + regression tests (Wave 1, gap_closure) -**UI hint**: yes - -### Phase 26: Task Code Reorg & HTTP-Backed Agent Worker -**Goal**: SAQ task code is cleanly split between the application server (fileless `phaze.tasks.controller`) and agents (file-bound `phaze.tasks.agent_worker`), with role-driven startup and per-agent queues so the same Docker image runs both roles correctly. Three new internal-agent endpoints (`/whoami`, `PUT /analysis/{file_id}`, `POST /tracklists`, `PATCH /proposals/{id}/state`) close the contract gap from Phase 25 so the full file-bound task surface can run on agents. -**Depends on**: Phase 25 -**Requirements**: DIST-03, TASK-01, TASK-02, TASK-03, OPS-01 -**Success Criteria** (what must be TRUE): - 1. `phaze.tasks.controller` exposes only fileless tasks (`generate_proposals`, `match_tracklist_to_discogs`, `scrape_and_store_tracklist`, `search_tracklist`, `refresh_tracklists` cron) and `phaze.tasks.agent_worker` exposes only file-bound tasks (`process_file`, `extract_file_metadata`, `fingerprint_file`, `scan_live_set`, `execute_approved_batch`) - 2. Setting `PHAZE_ROLE=control` boots the application-server worker with the fileless settings module and Postgres access; setting `PHAZE_ROLE=agent` boots the agent worker with the file-bound settings module and an HTTP client to the application server, with no Postgres driver loaded - 3. Every file-bound task body uses the HTTP client (no `async_session` import reachable in agent-worker code paths) and writes results via `/api/internal/agent/*` - 4. Each agent worker pulls from a per-agent SAQ queue named `phaze-agent-`; the application-server enqueuer selects the queue from `FileRecord.agent_id` and a job enqueued for agent A never executes on agent B - 5. Agent task jobs carry a self-contained payload (`file_id`, `file_path`, `file_type`, model paths, agent metadata) sufficient to execute without any read-back to the application server during the job -**Plans**: 13 plans -- [x] 26-01-PLAN.md β€” Deps (tenacity + respx + mypy overrides) + settings split (Base/Control/Agent + get_settings) + enum extensions (ProposalStatus.EXECUTED/FAILED, FileState.MOVED/UNCHANGED) (Wave 1) -- [x] 26-02-PLAN.md β€” PhazeAgentClient + 4-class error hierarchy + tenacity retry funnel + respx contract tests (Wave 2) -- [x] 26-03-PLAN.md β€” 5 new schema modules (agent_identity, agent_analysis, agent_tracklists, agent_proposals, agent_tasks) (Wave 2) -- [x] 26-04-PLAN.md β€” AgentTaskRouter + Redis integration tests (Wave 3) -- [x] 26-05-PLAN.md β€” GET /api/internal/agent/whoami router + 4 contract tests (Wave 3) -- [x] 26-06-PLAN.md β€” PUT /api/internal/agent/analysis/{file_id} router (idempotent upsert) + 8 contract tests (Wave 3) -- [x] 26-07-PLAN.md β€” POST /api/internal/agent/tracklists router (Redis idempotency cache) + integration tests (Wave 3) -- [x] 26-08-PLAN.md β€” PATCH /api/internal/agent/proposals/{id}/state router (state-machine joint update) + 11 contract tests incl. W1 cross-tenant guard (Wave 3) -- [x] 26-09-PLAN.md β€” phaze.tasks.controller SAQ settings module (fileless tasks only) (Wave 4) -- [x] 26-10-PLAN.md β€” phaze.tasks.agent_worker SAQ settings module + tests/test_task_split.py subprocess import-boundary test (D-25) (Wave 5) -- [x] 26-11-PLAN.md β€” Rewrite 5 file-bound task bodies (process_file, extract_file_metadata, fingerprint_file, scan_live_set, execute_approved_batch) to use ctx['api_client'] (Wave 4) -- COMPLETE 2026-05-12; D-03 import boundary verified; ExecutionStatus moved to phaze.enums; scan_live_set artist/title resolution removed (known v3.0 UI regression for future Phase 27/28 controller-side enrichment) -- [x] 26-12-PLAN.md β€” main.py wiring (4 new include_router + app.state.task_router + app.state.redis) + agent_files.py refactor to AgentTaskRouter (Wave 5) -- [x] 26-13-PLAN.md β€” Delete worker.py + session.py + docker-compose.yml controller.settings + doc sweep (legacy hostname-leaked name retired in favour of `controller`) (Wave 6) - -### Phase 27: Watcher Service & User-Initiated Scan -**Goal**: Each file server continuously streams new file arrivals to the application server, and the administrator can also trigger an explicit scan of any path on any agent from the admin UI. -**Depends on**: Phase 26 -**Requirements**: DIST-02, SCAN-01, SCAN-02, SCAN-03, SCAN-04 -**Success Criteria** (what must be TRUE): - 1. A new `phaze-agent-watcher` service is defined and starts alongside `worker`, `audfprint`, and `panako` on the file-server compose; it stays running and observes the agent's configured roots via the `watchdog` library - 2. Dropping a new file into a watched root results in a new `FileRecord` appearing on the application server under that agent's sentinel `LIVE` ScanBatch, with `(agent_id, original_path)` as the natural key - 3. A file whose `mtime` is still changing is **not** posted; only after the configured settle period (default 10s) of stable `mtime` does the watcher compute SHA-256 and stream the record (verified by writing a file slowly and observing no early upsert) - 4. From the admin UI, an administrator can choose `(agent, scan_path)` and trigger a scan; this enqueues `scan_directory(scan_path, batch_id)` onto the chosen agent's queue and the agent streams discovered files back in chunks (e.g., 500 records per request), with `extract_file_metadata` enqueued per new music/video file before the scan completes - 5. The same upsert endpoint serves both bulk scans and per-file watcher events, and a re-walked path produces no duplicate FileRecord rows -**Plans**: 7 plans -- [x] 27-01-PLAN.md β€” Foundation: watchdog dep, AgentSettings watcher knobs, _shared/agent_bootstrap refactor, test scaffolding + extended import-boundary tests (Wave 0) -- [x] 27-02-PLAN.md β€” Schemas: FileUpsertChunk.batch_id, ScanBatchPatch/Response, ScanDirectoryPayload, TriggerScanForm (Wave 1) -- [x] 27-03-PLAN.md β€” Endpoints: PATCH /api/internal/agent/scan-batches + batch_id resolution in POST /files + patch_scan_batch client method + main.py wiring + contract tests (Wave 2) -- [x] 27-04-PLAN.md β€” Agent task: scan_directory(scan_path, batch_id) with chunking, per-chunk PATCH, terminal PATCH; registered in agent_worker.settings.functions (Wave 3) -- [x] 27-05-PLAN.md β€” Watcher package: phaze.agent_watcher (Debouncer, WatcherEventHandler, Poster, __main__); 16+ unit tests covering thread bridge, stuck-file cap, OSError vanish, LIVE-sentinel resolution (Wave 3) -- [x] 27-06-PLAN.md β€” Admin UI: routers/pipeline_scans.py (POST + GET progress + GET agent-roots HTMX swap), 6 partial templates, dashboard.html extension + 10 contract tests (Wave 3) -- [x] 27-07-PLAN.md β€” Deployment + docs: docker-compose watcher service, .env.example knobs, per-service README, STATE.md accumulation (Wave 5) -**UI hint**: yes - -### Phase 28: Distributed Execution Dispatch -**Goal**: Approving a batch that spans multiple file servers results in each agent doing its own local copy-verify-delete, while the application server preserves the write-ahead audit trail and presents unified progress to the operator. -**Depends on**: Phase 27 -**Requirements**: EXEC-01, EXEC-02, EXEC-03, EXEC-04, TASK-04 -**Success Criteria** (what must be TRUE): - 1. Triggering execution on an approved batch groups proposals by `FileRecord.agent_id` and enqueues one `execute_approved_batch` sub-job per affected agent under a shared parent `batch_id`; the dispatch decision is visible in logs and via an admin endpoint - 2. Each agent performs copy-verify-delete locally for its assigned proposals and PATCHes per-operation status (started, copied, verified, deleted, failed) to the application server, so the `ExecutionLog` write-ahead trail survives the HTTP boundary with no rows lost on retry - 3. The application server owns the `exec:{batch_id}` Redis hash and serves SSE progress from a single aggregated key; the admin UI shows unified `total / completed / failed` counts that match the sum across all participating agents - 4. The execution UI exposes a per-agent breakdown (which agent handled which sub-batch, with its own counts) for debugging without requiring database access - 5. Each file server's audfprint and panako sidecars index only that file server's files; fingerprint queries during execution-adjacent flows resolve against the local sidecar and the limitation (no cross-file-server fingerprint matching) is documented in the admin UI / docs -**Plans**: 6 plans -- [x] 28-01-PLAN.md β€” Wave 0: test scaffolding + new dirs + audfprint/panako allow-list validator + sub_batch_index schema field -- [x] 28-02-PLAN.md β€” Wave 1: ExecBatchProgressPayload + agent_exec_batches router + main.py wiring + PhazeAgentClient.post_exec_batch_progress (contract tests) -- [x] 28-03-PLAN.md β€” Wave 1: execution_dispatch service (group-by-agent + revoked filter + chunking) + grouping unit tests -- [x] 28-04-PLAN.md β€” Wave 2: start_execution rewrite + SSE generator extension + agents_table.html + progress.html rewrite + revoked banner -- [x] 28-05-PLAN.md β€” Wave 2: tasks/execution.py β€” per-proposal terminal progress POST + SAQ-meta UUID lift (closes L6/L22) + _classify_failure_step + : error_message -- [x] 28-06-PLAN.md β€” Wave 3: cross_fs_fingerprint_notice.html partial + duplicates/list.html inclusion + PROJECT.md Constraints paragraph + STATE.md accumulation -**UI hint**: yes - -### Phase 29: Deployment Hardening & Agents Admin -**Goal**: A real two-host deployment runs end-to-end with the application server holding no file mounts, HTTPS + Redis hardening in place, and an admin can see at a glance which agents are alive and healthy. -**Depends on**: Phase 28 -**Requirements**: DIST-01, AUTH-02, AUTH-03, OPS-02, OPS-03, OPS-04 -**Success Criteria** (what must be TRUE): - 1. The application-server `docker-compose.yml` declares no `SCAN_PATH` or `MODELS_PATH` mount; starting the stack and attempting to read a music file from inside the `api` or `controller` container fails (verified manually) and the application server has no way to read or write file content - 2. A new `docker-compose.agent.yml` brings up exactly `worker`, `watcher`, `audfprint`, and `panako` on a file server, configured via env (`PHAZE_API_URL`, `PHAZE_REDIS_URL`, `PHAZE_AGENT_TOKEN`, `PHAZE_AGENT_ID`) to reach the application server; running it on a second host registers the agent and begins watching - 3. All agent β†’ application-server traffic uses HTTPS terminated by a self-signed certificate from an application-server-local internal CA; each agent's `httpx` client trusts the CA file and rejects untrusted certs (verified by swapping the CA and observing connection failure) - 4. Redis on the application server requires `requirepass` and is bound only to the private LAN interface; an attempt to connect from outside the LAN or without the password fails, and agents connect with `redis://default:@:6379` - 5. Running `just download-models` on a fresh file server populates that host's local `/models` volume; the application-server image neither downloads nor mounts models - 6. Each agent posts a heartbeat to `/api/internal/agent/heartbeat` every 30 seconds; the Agents admin page lists every registered agent with name, status (alive/stale/revoked), queue depth, and last-seen timestamp, and refreshes without requiring a manual page reload -**Plans**: TBD -**UI hint**: yes +
+v4.0 Distributed Agents (Phases 24-29) -- SHIPPED 2026-05-17 + +- [x] Phase 24: Schema Foundation & Agent Registry (5/5 plans) -- completed 2026-05-11 +- [x] Phase 25: Internal Agent HTTP API & Bearer Auth (8/8 plans) -- completed 2026-05-12 +- [x] Phase 26: Task Code Reorg & HTTP-Backed Agent Worker (13/13 plans) -- completed 2026-05-12 +- [x] Phase 27: Watcher Service & User-Initiated Scan (7/7 plans) -- completed 2026-05-14 +- [x] Phase 28: Distributed Execution Dispatch (6/6 plans) -- completed 2026-05-15 +- [x] Phase 29: Deployment Hardening & Agents Admin (8/8 plans) -- completed 2026-05-17 + +Full details: `.planning/milestones/v4.0-ROADMAP.md` + +
+ +### Next Milestone β€” TBD + +_Run `/gsd:new-milestone` to scope the next milestone (questioning β†’ research β†’ requirements β†’ roadmap)._ ## Progress @@ -210,9 +101,9 @@ Full details: `.planning/milestones/v3.0-ROADMAP.md` | 21. CUE Sheet Generation | v3.0 | 3/3 | Complete | 2026-04-03 | | 22. Tracklist Integration Fixes | v3.0 | 1/1 | Complete | 2026-04-04 | | 23. v3.0 Polish & Wiring Fixes | v3.0 | 1/1 | Complete | 2026-04-04 | -| 24. Schema Foundation & Agent Registry | v4.0 | 0/5 | Not started | - | -| 25. Internal Agent HTTP API & Bearer Auth | v4.0 | 8/8 | Complete | 2026-05-12 | -| 26. Task Code Reorg & HTTP-Backed Agent Worker | v4.0 | 13/13 | Complete | 2026-05-12 | -| 27. Watcher Service & User-Initiated Scan | v4.0 | 7/7 | Complete | 2026-05-14 | -| 28. Distributed Execution Dispatch | v4.0 | 6/6 | Complete | 2026-05-15 | -| 29. Deployment Hardening & Agents Admin | v4.0 | 0/? | Not started | - | +| 24. Schema Foundation & Agent Registry | v4.0 | 5/5 | Complete | 2026-05-11 | +| 25. Internal Agent HTTP API & Bearer Auth | v4.0 | 8/8 | Complete | 2026-05-12 | +| 26. Task Code Reorg & HTTP-Backed Agent Worker | v4.0 | 13/13 | Complete | 2026-05-12 | +| 27. Watcher Service & User-Initiated Scan | v4.0 | 7/7 | Complete | 2026-05-14 | +| 28. Distributed Execution Dispatch | v4.0 | 6/6 | Complete | 2026-05-15 | +| 29. Deployment Hardening & Agents Admin | v4.0 | 8/8 | Complete | 2026-05-17 | diff --git a/.planning/STATE.md b/.planning/STATE.md index 05f4ba3..9bf3d59 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -2,33 +2,33 @@ gsd_state_version: 1.0 milestone: v4.0 milestone_name: Distributed Agents -status: "Phase 28 shipped β€” PR #62" -stopped_at: "Phase 28 shipped (PR #62)" -last_updated: "2026-05-16T03:22:43.584Z" -last_activity: 2026-05-15 -- Phase 28 shipped +status: milestone_complete +stopped_at: Milestone v4.0 shipped 2026-05-17 +last_updated: 2026-05-17T00:00:00Z +last_activity: 2026-05-17 -- v4.0 milestone archived progress: total_phases: 6 - completed_phases: 4 - total_plans: 39 - completed_plans: 34 - percent: 87 + completed_phases: 6 + total_plans: 47 + completed_plans: 47 + percent: 100 --- # Project State ## Project Reference -See: .planning/PROJECT.md (updated 2026-04-02) +See: .planning/PROJECT.md (updated 2026-05-17 after v4.0 milestone) -**Core value:** Get 200K messy music and concert files properly named, organized, deduplicated, with rich metadata in Postgres -- human-in-the-loop approval so nothing moves without review. -**Current focus:** Phase 28 β€” distributed-execution-dispatch +**Core value:** Get 200K messy music and concert files properly named, organized, deduplicated, with rich metadata in Postgres -- human-in-the-loop approval so nothing moves without review. Files stay on file-server agents; decisions stay on the application server. +**Current focus:** Planning next milestone (run `/gsd:new-milestone`) ## Current Position -Phase: 28 (distributed-execution-dispatch) β€” EXECUTING -Plan: 1 of 6 -Status: Phase 28 shipped β€” PR #62 -Last activity: 2026-05-15 -- Phase 28 shipped +Phase: v4.0 complete (Phases 24–29 all shipped) +Plan: - +Status: Milestone complete; awaiting next-milestone scoping +Last activity: 2026-05-17 Progress: [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ] 100% @@ -36,7 +36,7 @@ Progress: [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ] 100% **v1.0 Velocity:** -- Total plans completed: 39 +- Total plans completed: 47 - Total phases: 11 - Timeline: 4 days (2026-03-27 -> 2026-03-30) - Tests: 282 passing @@ -50,74 +50,24 @@ Progress: [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ] 100% - Tests: 538 passing - LOC: 5,966 Python +**v3.0 Velocity:** + +- Total plans completed: 11 +- Total phases: 6 +- Timeline: 2 days (2026-04-03 -> 2026-04-04) + +**v4.0 Velocity:** + +- Total plans completed: 47 +- Total phases: 6 +- Timeline: ~43 days (2026-04-03 -> 2026-05-17 incl. discuss/research/UI design per phase) +- LOC: ~23,242 Python lines added / 1,677 deleted (180 files changed since v3.0 tag) + ## Accumulated Context ### Decisions -- v3.0 scope: Search, Discogs Linking, Tag Writing, CUE Sheets -- enrichment layer, not pipeline extension -- FileState enum NOT extended -- enrichment tracked via TagWriteLog and DiscogsLink tables -- Zero new pip dependencies -- httpx, mutagen, rapidfuzz, SQLAlchemy already in pyproject.toml -- Discogs integration routes through discogsography HTTP API only, never direct Discogs API -- Search UI: HTMX partial detection via truthy HX-Request header check -- Search UI: Alpine.js collapsible filter panel pattern (x-data showFilters boolean) -- [Phase 19]: Confidence blending: 0.6 token_set_ratio + 0.4 API relevance, denormalized Discogs metadata in DiscogsLink -- [Phase 19]: Discogs results excluded when file_state filter active, matching tracklist exclusion pattern -- [Phase 19]: Three-entity UNION ALL search: file (blue), tracklist (green), discogs_release (purple) pill colors -- [Phase 19]: Discogs UI: HTMX candidate lifecycle with accept/dismiss, auto-dismiss siblings, bulk-link top candidate -- [Phase 20-tag-writing]: Mock-based tests for OGG/M4A formats, real MP3 for end-to-end write/verify -- [Phase 20-tag-writing]: Tracklist date.year is fallback-only for year field (does not override metadata year) -- [Phase 20-tag-writing]: Inline edits are transient (client-side), no server session storage for edited proposed values -- [Phase 20-tag-writing]: Tag row partial with OOB toast for post-write HTMX swap response -- [Phase 20-tag-writing]: Server-side fallback for empty form data in Write Tags endpoint; ID-based HTMX targeting over closest tr -- [Phase 21]: CueTrackData uses dataclass not Pydantic for zero-overhead service input -- [Phase 21]: Dropped from __future__ annotations in CUE router to avoid FastAPI uuid runtime resolution issues -- [Phase 21-03]: HX-Target header prefix matching for cross-page response routing (tracklist- prefix returns tracklist_card.html) -- [Phase 21-03]: Dynamic _cue_version attribute on Tracklist ORM objects for UI-only display data -- [Phase 26-01]: pydantic-settings v2 does NOT comma-split list[str] env vars natively -- Annotated[list[str], NoDecode] + @field_validator(mode="before") is the canonical workaround -- [Phase 26-01]: pydantic-settings reads env vars by field name absent env_prefix -- AliasChoices(...) per-field is required to map PHAZE_AGENT_* env vars onto bare field names -- [Phase 26-01]: Module-level `settings: ControlSettings = ...` keeps existing call sites' `settings.llm_*` reads type-checking; agent worker calls get_settings() / AgentSettings() directly per D-14 -- [Phase 26-01]: `Settings = ControlSettings` back-compat alias preserves `from phaze.config import Settings` for test files until they migrate -- [Phase 26-02]: Tenacity retry funnel via AsyncRetrying async-iterator (not @retry decorator) -- cleaner try/except integration for 4xx/5xx status-code mapping post-loop -- [Phase 26-02]: PhazeAgentClient bearer token NEVER stored as instance attribute -- lives only inside httpx.AsyncClient.headers (T-26-02-I mitigation) -- [Phase 26-02]: Parallelization-debt marker pattern: type: ignore[import-not-found] + warn_unused_ignores makes missing-cross-plan-schema diagnostic self-deleting on merge -- [Phase 26-04]: AgentTaskRouter cache impl chose plain `dict[str, Queue]` over `functools.cache` (rejected: extra layer for single-instance service) and LRU (rejected: eviction without `.disconnect()` would leak Redis connections; bounded growth not needed for v4.0's 1-5 agent scale) -- [Phase 26-04]: AgentTaskRouter integration tests use a real Redis (no fakeredis fallback) per D-30 -- SAQ Queue.from_url is not compatible with fakeredis at saq>=0.26.3 -- [Phase 26-04]: Per-agent SAQ queue naming invariant: `phaze-agent-` (D-18); agent_id is the kebab-case slug from Phase 24 D-01, Redis-safe by construction -- [Phase 26-05]: Smoke-app pattern adopted for per-router contract tests; matches Phase 25 test_agent_metadata.py precedent and decouples Plan 26-12 wiring -- [Phase 26-05]: /whoami response uses naive UTC created_at -- matches project-wide TimestampMixin convention; deferred timezone-aware migration to a future architectural plan -- [Phase 26-06]: Overflow funnel pattern -- wire-format fields without a dedicated column (e.g. danceability, energy on AnalysisResult) merge into the row's `features` JSONB column rather than being dropped, preserving D-26's wire contract without an Alembic migration. Future migration can promote to dedicated columns. -- [Phase 26-06]: Deterministic dict summarization -- `sorted(items, key=lambda kv: (-kv[1], kv[0]))[:N]` two-key sort is the canonical pattern for compacting classifier-score dicts into bounded, replay-safe strings. `reverse=True` single-key sort tiebreaks by insertion order which is non-deterministic. -- [Phase 26-07]: Stripe-style request-id idempotency via Redis SET NX EX -- atomic lock-acquire + bounded-wait concurrent-writer poll (10*50ms -> 409) + cached-response fast-path; 1h TTL -- [Phase 26-07]: `request.app.state.redis` thin pass-through dep keeps the Redis client lifecycle in main.py lifespan (Plan 26-12) while keeping the handler smoke-app-testable via direct `app.state.redis = client` assignment -- [Phase 26-07]: `sqlalchemy.update(Model)` is mypy-friendly; `Model.__table__.update()` trips `FromClause has no attribute "update"` because mypy types `__table__` as the abstract parent -- [Phase 26-08]: Cross-tenant guard placement: 403 returns BEFORE state-machine evaluation to prevent timing side-channel via 409 vs 403 (W1 / T-26-08-S2) -- [Phase 26-08]: Joint Proposal+FileRecord mutation uses single await session.commit() (RESEARCH Pitfall 6 invariant) -- [Phase 26-08]: Idempotent same-state PATCH echoes current row state with ZERO DB writes -- does NOT bump updated_at on same-state retry -- [Phase 26-08]: Mirror agent_execution.py PATCH structure byte-for-byte (Annotated[AsyncSession, Depends] dep pattern, session.get->404 pattern) -- [Phase 26-11]: ExecutionStatus enum extracted to phaze.enums (DB-free); models/execution.py re-exports it. Schemas under phaze.schemas.agent_* now load without sqlalchemy/phaze.database -- the D-03 import boundary holds for the agent worker -- [Phase 26-11]: scan_live_set drops in-process FileMetadata artist/title resolution; fingerprint-sourced tracklist rows land with artist=None,title=None. Known v3.0 UI regression deferred to a future Phase 27/28 controller-side enrichment task -- [Phase 26-11]: services/fingerprint.py uses function-local DB imports inside get_fingerprint_progress so the module surface stays DB-free for the agent worker -- [Phase 26-11]: execute_approved_batch ExecutionLog reporting maps onto Phase 25's per-proposal schema (one POST + one PATCH per file op); batch-level completed_with_errors lives in the returned dict, not the schema -- [Phase 26-11]: AnalysisWritePayload mood/style wire conversion -- two helpers in tasks/functions.py rebuild dict[str, float] from analysis["features"] (averaging mood_* sets across variants; top-N genres) instead of dropping the str labels -- [Phase ?]: 26-10: agent_worker SAQ settings module ships with subprocess import-boundary test (D-25) enforcing no phaze.database / sqlalchemy.ext.asyncio in agent import chain -- [Phase ?]: 26-10: D-13 token-preview banner uses 'auth_id_prefix=' format key (not 'token_preview=') to avoid semgrep secret-detector false-positives; rendered value unchanged -- [Phase ?]: 26-10: /whoami startup probe budget = exponential 1sβ†’32s = ~63s wall-clock; RuntimeError on exhaustion; queue-name mismatch guard catches PHAZE_AGENT_QUEUE vs token-derived agent_id misconfig -- [Phase ?]: [Phase 26-13] D-04+D-06 finalized: phaze.tasks.{worker,session} deleted with no back-compat shim; docker-compose worker service rewired to phaze.tasks.controller.settings under PHAZE_ROLE=control; lux_workerβ†’controller doc sweep across PROJECT.md + ROADMAP.md -- [Phase 27-01]: phaze.tasks._shared.agent_bootstrap centralizes whoami_with_retry + construct_agent_client; Pitfall 7 short-circuit on AgentApiAuthError closes the "bad token infinite-restart" failure mode -- [Phase 27-01]: Four new AgentSettings fields (watcher_settle_seconds=10, watcher_max_pending_seconds=3600, watcher_sweep_interval_seconds=2, scan_chunk_size=500) with PHAZE_WATCHER_*/PHAZE_SCAN_CHUNK_SIZE env-var aliases via AliasChoices (Phase 26-01 pattern) -- [Phase 27-02]: FileUpsertChunk.batch_id: UUID | None added; absent β†’ controller resolves LIVE sentinel via uq_scan_batches_agent_id_live partial UQ; present β†’ 403-before-state-machine cross-tenant guard (T-27-02) -- [Phase 27-03]: PATCH /api/internal/agent/scan-batches/{batch_id} state machine: RUNNINGβ†’COMPLETED/FAILED only; LIVE rejected at schema layer (Literal); idempotent same-state PATCH echoes row with zero DB writes -- [Phase 27-04]: scan_directory chunk size = 500; per-chunk PATCH progress; terminal status PATCH on completion or failure; per-file OSError skip (mirrors services/ingestion.py:65); module-private _classify duplicates EXTENSION_MAP lookup to keep agent-side scan.py Postgres-free (D-13 / D-25 invariant) -- [Phase 27-05]: phaze.agent_watcher uses dict[str, _PendingEntry] + asyncio-owned single-loop sweep (time.monotonic clock); loop.call_soon_threadsafe is the ONLY sanctioned thread bridge from the watchdog Observer thread -- [Phase 27-05]: Stuck-file cap = 3600s default (D-02 / T-27-05); evicted entries log WARNING but do NOT post; bounded in-memory cost. Watcher POSTs chunk-of-1 with batch_id OMITTED (not None) to trigger server-side LIVE-sentinel resolution (D-18) -- [Phase 27-06]: HTMX poll-partial halt: terminal-state markup OMITS hx-trigger AND hx-get; outerHTML swap replaces the polling element entirely (Pitfall 6); cadence = every 2s for scan progress, every 5s for stats bar. Recent Scans mini-table uses transient _agent_name / _elapsed_seconds attrs on ORM rows to avoid N+1 -- [Phase 27-07]: Compose 'watcher' service lives in root docker-compose.yml; Phase 29 will move it + 'worker' to docker-compose.agent.yml; depends_on api: service_started (no healthcheck); restart: unless-stopped is the only liveness mechanism in Phase 27. Volume mount SCAN_PATH:/data/music:ro only (no MODELS_PATH/OUTPUT_PATH; watcher is fileless-write) -- [Phase 28-01]: Fingerprint URL allow-list validator (`_enforce_localhost_only` on BaseSettings) blocks non-localhost `audfprint_url`/`panako_url` at config load (D-12 / TASK-04); `ExecuteApprovedBatchPayload.sub_batch_index: int = 0` schema field (D-10) β€” agent worker reports which chunk of a per-agent dispatch it owns -- [Phase 28-02]: `POST /api/internal/agent/exec-batches/{batch_id}/progress` handler order is part of the spec: 401β†’403(cross-tenant)β†’404(missing hash)β†’403(agent not in dispatch)β†’Redis-SET-NX dedupβ†’HINCRBY per D-07; sub_batch_terminal=true promotes status when `subjobs_completed == subjobs_expected` -- [Phase 28-03]: Dispatch grouping uses in-Python `defaultdict(list)` over SQL `GROUP BY ... jsonb_agg(...)` β€” v4.0 scale (1-5 agents Γ— ≀10K proposals) makes the type-safe path cheaper than DB aggregation; `Agent.revoked_at.is_(None)` filter applied both in JOIN and in skipped-count query -- [Phase 28-04]: SSE payloads rendered as 3 separate Jinja partials (`dispatch_summary_inline.html`, `agents_table.html`, `progress_row_inline.html`) via `_render_partial()` helper through `Jinja2Templates.TemplateResponse(...).body.decode()` β€” Semgrep XSS-lint requires this over bare `Environment.get_template().render()` -- [Phase 28-05]: `_load_or_seed_uuids(job, proposals)` persists BOTH `execution_log_id` and `progress_request_id` per-proposal UUIDs in `ctx['job'].meta` via single `await job.update(meta=...)` so SAQ retries reuse them (closes L6/L22, delivers D-15); failure progress POSTs use D-16 fire-and-forget (WARNING-on-failure swallow because file ops already committed); error_message format is `": "` (D-01) -- [Phase 28-06]: `cross_fs_fingerprint_notice.html` banner is dismissible per session only (no localStorage); included on `duplicates/list.html` as first child of the `space-y-6` div above `

`; PROJECT.md Constraints paragraph documents XAGENT-01 (deferred cross-file-server fingerprint matching) +(Full milestone decision log archived in `.planning/milestones/v4.0-ROADMAP.md` Milestone Summary. Current-cycle decisions accumulate here.) ### Pending Todos @@ -125,8 +75,8 @@ None. ### Blockers/Concerns -- Phase 19: Verify discogsography `/api/search` response shape before writing adapter (research flag) -- arq replaced by SAQ -- all new task code must use SAQ conventions +- 29-HUMAN-UAT.md: real two-host production smoke is verified-docs-only; deferred until file-server hardware is available +- Tech debt parked in v4.0 audit: WR-01..WR-04 (Phase 29), WR-03 (Phase 28 UI), P28-RACE-01 β€” see `.planning/milestones/v4.0-MILESTONE-AUDIT.md` ### Quick Tasks Completed @@ -135,19 +85,9 @@ None. | 260410-kco | Add Docker image publishing to GHCR following discogsography pattern | 2026-04-10 | 3f91f93 | [260410-kco-add-docker-image-publishing-to-ghcr-foll](./quick/260410-kco-add-docker-image-publishing-to-ghcr-foll/) | | 260414-quo | Add Discord notification to docker-publish.yml workflow mirroring discogsography pattern | 2026-04-14 | 9c5cedb | [260414-quo-add-discord-notification-to-docker-publi](./quick/260414-quo-add-discord-notification-to-docker-publi/) | | 260502-lqb | Remove Discord notification step from docker-publish.yml workflow | 2026-05-02 | ea84be2 | [260502-lqb-remove-discord-notification-step-from-do](./quick/260502-lqb-remove-discord-notification-step-from-do/) | -| Phase 26 P02 | 9min | 2 tasks | 2 files | -| Phase 26 P04 | 5min | 2 tasks | 2 files | -| Phase 26 P05 | 18min | 2 tasks | 2 files | -| Phase 26 P06 | 13min | 3 tasks | 3 files | -| Phase 26 P07 | 14min | 2 tasks | 2 files | -| Phase 26 P08 | 14min | 2 tasks | 3 files | -| Phase 26 P11 | 30min | 4 tasks | 13 files (5 task bodies rewritten + supporting refactors + 5 test rewrites + new contract test file + phaze.enums package) | -| Phase 26 P10 | 25min | 3 tasks | 3 files | -| Phase 26 P12 | 7m 25s | 2 tasks | 3 files | -| Phase 26 P13 | 11m | 2 tasks | 8 files | ## Session Continuity -Last session: 2026-05-15T00:12:04.513Z -Stopped at: Phase 28 shipped (PR #62) -Resume file: .planning/phases/28-distributed-execution-dispatch/28-UI-SPEC.md +Last session: 2026-05-17 -- milestone v4.0 archived +Stopped at: Awaiting `/gsd:new-milestone` for next milestone scope +Resume file: - diff --git a/.planning/milestones/v4.0-MILESTONE-AUDIT.md b/.planning/milestones/v4.0-MILESTONE-AUDIT.md new file mode 100644 index 0000000..3d483a6 --- /dev/null +++ b/.planning/milestones/v4.0-MILESTONE-AUDIT.md @@ -0,0 +1,182 @@ +--- +milestone: v4.0 +milestone_name: Distributed Agents +audited: 2026-05-17T00:50:00Z +status: passed +scores: + requirements: 26/26 satisfied (13 listed Pending in REQUIREMENTS.md are stale documentation β€” code is complete) + phases: 6/6 verified (1 with human_needed deferred by operator decision) + integration: 22/22 cross-phase wires verified end-to-end + flows: 4/5 complete, 1 degraded edge (advisory) + nyquist: 6 phases β€” VALIDATION.md present on all six +gaps: + requirements: [] + integration: [] + flows: + - id: "flow-exec-revoked-breakdown" + severity: advisory + affected_requirements: ["EXEC-04"] + evidence: "src/phaze/routers/execution.py:198–213 passes skipped_revoked (int) to progress.html but omits revoked_agents (list[dict]); template at templates/execution/partials/progress.html:41–47 conditionally renders per-agent breakdown under `{% if revoked_agents %}`, which silently suppresses on first render. The orange aggregate banner appears; per-agent attribution does not." + impact: "Operator sees count of skipped proposals but not which revoked agent killed which proposal. Already classified WR-03 in 28-REVIEW.md (advisory)." +tech_debt: + - phase: 28 + items: + - "WR-03 (UI): pass revoked_agents=[] to execution progress context, or compute per-agent breakdown β€” flow-exec-revoked-breakdown above" + - "P28-RACE-01: terminal-status promotion in agent_exec_batches.py:189–198 is a read-then-write without Lua atomicity. Race window requires β‰₯3 concurrent sub-jobs with specific interleaving. Mechanical fix (~10 lines of Lua)." + - phase: 29 + items: + - "WR-01 (cert): Path.write_bytes() applies umask before chmod(0o600); CA + leaf private keys are world-readable for a brief window on the bind mount. cert_bootstrap.py:215-234." + - "WR-02 (compose): docker-compose.yml binds Postgres on 0.0.0.0 β€” has no equivalent guard to the Redis ${REDIS_BIND_IP:-127.0.0.1} hardening despite weaker default credentials." + - "WR-03 (download): .part temp file not cleaned up on download error; next run creates a second .part alongside the stale one. download_models.py:84-90." + - "WR-04 (compose): watcher service mounts MODELS_PATH:rw despite never reading or writing /models β€” unnecessary write grant. docker-compose.agent.yml:50." + - "29-REVIEW.md IN-01/02/03 (Info): no SHA-256 integrity check on model binaries; refreshed_at_iso interpolated directly into Alpine.js x-data string (safe today but fragile); _build_default_settings() in config.py:297-312 has a dead conditional." + - "29-HUMAN-UAT.md: real two-host deployment smoke deferred until file-server hardware available (operator-approved verified-docs-only signal)." +documentation_drift: + - file: .planning/REQUIREMENTS.md + description: "13 requirements still listed `| Pending |` in the traceability table despite their phases being merged and verified passed. Traceability table footer (`*Last updated: 2026-05-11*`) confirms it predates every v4.0 phase merge. Affects DIST-04, DIST-05, DATA-01..04, AUTH-01, AUTH-04, TASK-04, EXEC-01..04. Checkbox column also stale for the same IDs." + severity: documentation_only + affected_requirements: ["DIST-04", "DIST-05", "DATA-01", "DATA-02", "DATA-03", "DATA-04", "AUTH-01", "AUTH-04", "TASK-04", "EXEC-01", "EXEC-02", "EXEC-03", "EXEC-04"] + - file: .planning/ROADMAP.md + description: "Phase 24 checkbox is `[ ]` despite VERIFICATION.md (filename is `VERIFICATION.md` not `24-VERIFICATION.md` β€” older naming convention from before the milestone) showing status: passed, score 4/4 truths verified." + severity: documentation_only + - file: .planning/phases/24-schema-foundation-agent-registry/ + description: "Verification artifact named VERIFICATION.md (unprefixed) β€” inconsistent with v4.0 convention of `{phase_num}-VERIFICATION.md`. Functional, but breaks the discovery pattern `gsd-sdk query find-phase` uses." + severity: documentation_only +--- + +# Milestone v4.0: Distributed Agents β€” Audit Report + +**Audited:** 2026-05-17T00:50:00Z +**Status:** passed (with documentation drift to fix) +**Auditor:** Claude (gsd-audit-milestone orchestrator + gsd-integration-checker) + +## Definition of Done (from ROADMAP.md) + +> One or more dedicated file-server hosts run agent workers that consume from per-agent SAQ queues. The application server holds no file mounts, pushes work via per-agent queues, provides a way to associate an `agent_id` with each file/scan record, and reads via the agent's HTTP API. Agents handle file analysis + execution locally on each file-server. + +## Phase Roll-Up + +| Phase | Title | Status | Score | Verified | Notes | +|------:|-------|:------:|:-----:|:--------:|-------| +| 24 | Schema Foundation & Agent Registry | βœ“ passed | 4/4 | 2026-05-11 | VERIFICATION.md filename unprefixed (drift); ROADMAP checkbox not flipped (drift). 2 human UAT gates already passed via operator smoke. | +| 25 | Internal Agent HTTP API & Bearer Auth | βœ“ passed | 5/5 | 2026-05-12 | Re-verified after Plans 25-07 + 25-08 closed CR-01 (partial-PUT) + CR-02 (terminal-PATCH-409). 1 visual UAT remaining (Swagger lock icon). | +| 26 | Task Code Reorg & HTTP-Backed Agent Worker | βœ“ passed | 5/5 | 2026-05-12 | Postgres-free import boundary subprocess-enforced on agent worker / agent watcher. | +| 27 | Watcher Service & User-Initiated Scan | βœ“ passed | 5/5 | 2026-05-14 | 3 human UAT items all passed against live stack on 2026-05-14. | +| 28 | Distributed Execution Dispatch | βœ“ passed | 25/25 validation points | 2026-05-15 | 1 advisory degraded UI edge (WR-03 β€” revoked-agents per-agent breakdown silently suppressed). 1 advisory race (P28-RACE-01 β€” Lua-atomicity gap on terminal promotion). | +| 29 | Deployment Hardening + Agents Admin | ⚠ human_needed | 6/6 | 2026-05-17 | All code verified. Real two-host smoke deferred by operator decision (`verified-docs-only` signal). 3 critical CRs caught + fixed in-flight (CR-01..03). 4 advisory warnings outstanding (WR-01..04). | + +**6/6 phases verified by code evidence.** The single `human_needed` status (Phase 29) reflects a deferred-by-operator-decision UAT item, not a code defect. + +## Requirements Coverage (3-Source Cross-Reference) + +All 26 v4.0 requirements are **implemented and cross-phase wired**. Each was checked against: +1. Source plan PLAN.md frontmatter (claimed requirements) +2. SUMMARY.md frontmatter `requirements-completed` (executor-reported satisfaction) +3. VERIFICATION.md observable-truths table (verifier-confirmed evidence) +4. Live codebase grep / read (integration-checker confirmed wiring) + +| REQ-ID | Description | Phase | Verification | SUMMARY | Wiring | REQUIREMENTS.md (stale) | Final | +|--------|-------------|------:|:------------:|:-------:|:------:|:-----------------------:|:-----:| +| DATA-01 | `agents` table | 24 | passed | listed | wired | Pending | **satisfied** | +| DATA-02 | `agent_id` NOT NULL FK + composite UQ | 24 | passed | listed | wired | Pending | **satisfied** | +| DATA-03 | LIVE sentinel partial UQ | 24 | passed | listed | wired | Pending | **satisfied** | +| DATA-04 | Born-revoked legacy backfill | 24 | passed | listed | wired | Pending | **satisfied** | +| AUTH-01 | Bearer auth on `/api/internal/agent/*` | 25 | passed | listed | wired | Pending | **satisfied** | +| AUTH-02 | TLS on app server | 29 | passed | listed | wired | Complete | **satisfied** | +| AUTH-03 | Redis requirepass + LAN bind | 29 | passed | listed | wired | Complete | **satisfied** | +| AUTH-04 | Token revocation propagates next call | 25 | passed | listed | wired | Pending | **satisfied** | +| DIST-01 | App-server holds no file mounts | 29 | passed | listed | wired | Complete | **satisfied** | +| DIST-02 | Watcher detects new files | 27 | passed | listed | wired | Complete | **satisfied** | +| DIST-03 | Per-agent SAQ queue | 26 | passed | listed | wired | Complete | **satisfied** | +| DIST-04 | Agent reads via HTTP API only | 25 | passed | listed | wired | Pending | **satisfied** | +| DIST-05 | Idempotent upserts on natural keys | 25 | passed | listed | wired | Pending | **satisfied** | +| SCAN-01 | User-initiated scan endpoint | 27 | passed | listed | wired | Complete | **satisfied** | +| SCAN-02 | Chunked file upsert | 27 | passed | listed | wired | Complete | **satisfied** | +| SCAN-03 | Watcher posts to LIVE sentinel | 27 | passed | listed | wired | Complete | **satisfied** | +| SCAN-04 | mtime settle/debounce | 27 | passed | listed | wired | Complete | **satisfied** | +| TASK-01 | Agent-worker holds file-bound tasks only | 26 | passed | listed | wired | Complete | **satisfied** | +| TASK-02 | Controller holds fileless tasks only | 26 | passed | listed | wired | Complete | **satisfied** | +| TASK-03 | Self-contained job payloads | 26 | passed | listed | wired | Complete | **satisfied** | +| TASK-04 | Cross-file-server fingerprint guard | 28 | passed | listed | wired | Pending | **satisfied** | +| EXEC-01 | Execution dispatch grouped by agent | 28 | passed | listed | wired | Pending | **satisfied** | +| EXEC-02 | Per-operation ExecutionLog write-ahead | 28 | passed | listed | wired | Pending | **satisfied** | +| EXEC-03 | Unified SSE aggregates across agents | 28 | passed | listed | wired | Pending | **satisfied** | +| EXEC-04 | Per-agent rollup in UI | 28 | passed | listed | wired | Pending | **satisfied** (advisory: WR-03) | +| OPS-01 | Startup banner with role + agent_id | 26 | passed | listed | wired | Complete | **satisfied** | +| OPS-02 | docker-compose.agent.yml | 29 | passed | listed | wired | Complete | **satisfied** | +| OPS-03 | Per-file-server model auto-download | 29 | passed | listed | wired | Complete | **satisfied** | +| OPS-04 | Heartbeat + /admin/agents | 29 | passed | listed | wired | Complete | **satisfied** | + +**Final score:** 26/26 requirements satisfied. **0 unsatisfied. 0 partial. 0 orphaned.** + +## Cross-Phase Integration + +| Area | Result | +|------|--------| +| Cross-phase exports wired | **22/22** | +| Internal API routes consumed | **12/12** | +| Auth-protected routes | **All 12 agent routes use `Depends(get_authenticated_agent)`** | +| Auth-unprotected (by design) | `/admin/agents` + `/admin/agents/_table` β€” consistent private-LAN admin posture | +| Postgres-free boundary | **Enforced** via 5 subprocess tests in `tests/test_task_split.py` | +| Per-agent SAQ queue isolation | **Triple-checked** at enqueue (`agent_task_router.py:70`), consume (`agent_worker.py:161–167`), and startup-cross-validation (`agent_worker.py:104–112`) | +| E2E flows | **4/5 complete + 1 degraded (advisory)** | + +## End-to-End Flow Traces + +**βœ“ Flow 1 β€” File dropped on file-server β†’ admin dashboard** +Watcher β†’ debouncer settle β†’ poster β†’ `POST /api/internal/agent/files` (LIVE sentinel) β†’ composite-UQ upsert β†’ `extract_file_metadata` on `phaze-agent-` β†’ metadata + fingerprint POSTed back. All 9 hops wired. Live UAT passed (Phase 27-HUMAN-UAT.md). + +**βœ“ Flow 2 β€” Admin-triggered scan β†’ progress card** +`POST /pipeline/scans` β†’ scan_root membership check β†’ `scan_directory` enqueue β†’ 500-record-chunk upserts β†’ `PATCH /api/internal/agent/scan-batches/{id}`. 5 hops wired. + +**⚠ Flow 3 β€” Approve batch β†’ per-agent execution β†’ SSE progress** +Approval β†’ `get_approved_proposals_grouped_by_agent` β†’ `exec:{batch_id}` Redis hash seeded β†’ `execute_approved_batch` per (agent, chunk) β†’ per-proposal write-ahead `ExecutionLog` β†’ terminal `post_exec_batch_progress` HINCRBY β†’ unified SSE. **Degraded edge:** the per-agent revoked-agents breakdown is silently suppressed on first render (WR-03 β€” see flows gap above). Aggregate count surfaces correctly. + +**βœ“ Flow 4 β€” Agent heartbeat β†’ /admin/agents liveness** +SAQ CronJob `heartbeat_tick` every 30s β†’ `POST /api/internal/agent/heartbeat` β†’ `Agent.last_seen_at` updated β†’ `/admin/agents` reads + classifies + HTMX 5s poll. 5 hops wired. + +**βœ“ Flow 5 β€” TLS + auth bootstrap (agent startup)** +`entrypoint.py` β†’ `ensure_certs_present` β†’ operator copies CA β†’ `construct_agent_client` CA-file fail-fast β†’ `PhazeAgentClient(verify=...)` β†’ `httpx.AsyncClient(verify=...)`. 5 hops wired. Real two-host hardware smoke deferred by operator decision. + +## Anti-Patterns Scan + +- **TODO / FIXME / PLACEHOLDER / NotImplementedError:** zero hits across all v4.0 source files. +- **Stubs / vaporware:** zero. Only one documentation reference to a non-existent symbol existed (`AgentSettings._enforce_https_in_production` in `.env.example.agent`) β€” caught by Phase 29 code review as CR-01 and **fixed mid-flight in commit `2fb533c`**. + +## Outstanding Tech Debt + +**Phase 28 (advisory):** +- **WR-03 (UI degradation):** Pass `revoked_agents` to `execution.start_execution` context so per-agent revoked-attribution renders. Aggregate count still surfaces correctly; only the breakdown is hidden. +- **P28-RACE-01:** Terminal-status promotion in `agent_exec_batches.py:189–198` is read-then-write; β‰₯3 concurrent sub-job race. ~10-line Lua atomicity fix. + +**Phase 29 (advisory):** +- **WR-01:** Cert private-key umask window (chmod 0o600 happens after write). +- **WR-02:** Postgres compose port binds 0.0.0.0 β€” should match the Redis `${REDIS_BIND_IP:-127.0.0.1}` pattern. +- **WR-03:** `.part` cleanup on download error. +- **WR-04:** `watcher` MODELS_PATH should be `:ro` not `:rw`. +- **IN-01..03:** No SHA-256 integrity check on model downloads; Alpine.js `x-data` interpolation fragility; dead `_build_default_settings` conditional. + +**Outstanding UAT:** +- Phase 29 real two-host deployment smoke (deferred by operator `verified-docs-only` signal β€” pending file-server hardware). +- Phase 25 visual UAT: Swagger lock-icon rendering on production `/docs` (advisory; schema is locked by test). + +## Documentation Drift Detected + +The audit's 3-source cross-reference surfaced three documentation-only drift items. **None affect runtime behavior; all should be fixed before milestone archive:** + +1. **REQUIREMENTS.md traceability stale (13 entries):** Traceability table footer `*Last updated: 2026-05-11*` predates every v4.0 phase merge. 13 requirements still listed `| Pending |` despite verified-passed phases: DIST-04, DIST-05, DATA-01..04, AUTH-01, AUTH-04, TASK-04, EXEC-01..04. Checkbox column likewise stale. +2. **ROADMAP.md Phase 24 checkbox:** `[ ]` despite Phase 24 VERIFICATION.md `status: passed, score: 4/4`. +3. **Phase 24 VERIFICATION filename:** `VERIFICATION.md` (unprefixed) vs the v4.0 convention `{phase_num}-VERIFICATION.md`. Functional but breaks `find-phase` discovery. + +## Audit Verdict + +**v4.0 is feature-complete.** All 26 requirements are implemented, cross-phase wired, and verified by code evidence. Integration is solid (22/22 wires, 12/12 routes, 5 of 5 boundary tests). The single `human_needed` phase is so by operator decision, not code defect. + +**Pre-archive checklist:** +1. Fix REQUIREMENTS.md traceability staleness (13 entries β†’ Complete, check off boxes) +2. Flip ROADMAP Phase 24 checkbox to `[x]` (or document why it should remain open) +3. Rename `.planning/phases/24-schema-foundation-agent-registry/VERIFICATION.md` β†’ `24-VERIFICATION.md` +4. Decide whether to address P28-WR-03, P28-RACE-01, P29-WR-01..04 as a v4.1 polish phase or carry into v5.0 backlog +5. Schedule the deferred real-deployment two-host smoke + +Once items 1–3 land, the milestone is clear for `/gsd:complete-milestone v4.0`. diff --git a/.planning/REQUIREMENTS.md b/.planning/milestones/v4.0-REQUIREMENTS.md similarity index 76% rename from .planning/REQUIREMENTS.md rename to .planning/milestones/v4.0-REQUIREMENTS.md index 71ebe6a..14b1184 100644 --- a/.planning/REQUIREMENTS.md +++ b/.planning/milestones/v4.0-REQUIREMENTS.md @@ -1,3 +1,12 @@ +# Requirements Archive: v4.0 Distributed Agents + +**Archived:** 2026-05-17 +**Status:** SHIPPED + +For current requirements, see `.planning/REQUIREMENTS.md` (created fresh for next milestone via `/gsd:new-milestone`). + +--- + # Requirements: Phaze **Defined:** 2026-05-11 @@ -9,25 +18,25 @@ Requirements for Distributed Agents. Each maps to roadmap phases. ### Topology & Boundary -- [ ] **DIST-01**: The application server runs the API, UI, Postgres, Redis, and a fileless SAQ worker; it has no `SCAN_PATH` or `MODELS_PATH` filesystem mounts and cannot read or write file content +- [x] **DIST-01**: The application server runs the API, UI, Postgres, Redis, and a fileless SAQ worker; it has no `SCAN_PATH` or `MODELS_PATH` filesystem mounts and cannot read or write file content - [x] **DIST-02**: Each file server runs one or more agents (SAQ worker + watcher + audfprint + panako sidecars) that hold local files and execute all file-bearing work locally - [x] **DIST-03**: Each agent pulls jobs from a per-agent SAQ queue named `phaze-agent-` on the application server's Redis; the application server enqueues file-bound jobs onto the correct queue using `FileRecord.agent_id` -- [ ] **DIST-04**: Agents have zero direct Postgres access; every state change (file discovered, analysis result, fingerprint, execution log, heartbeat) is an authenticated HTTPS call to `/api/internal/agent/*` on the application server -- [ ] **DIST-05**: Every `/api/internal/agent/*` endpoint is idempotent on retry; natural keys (`(agent_id, original_path)`, `file_id`, `proposal_id`, agent-generated log UUIDs) guarantee replay safety +- [x] **DIST-04**: Agents have zero direct Postgres access; every state change (file discovered, analysis result, fingerprint, execution log, heartbeat) is an authenticated HTTPS call to `/api/internal/agent/*` on the application server +- [x] **DIST-05**: Every `/api/internal/agent/*` endpoint is idempotent on retry; natural keys (`(agent_id, original_path)`, `file_id`, `proposal_id`, agent-generated log UUIDs) guarantee replay safety ### Data Model & Migration -- [ ] **DATA-01**: An `agents` table records each registered agent with `id`, `name`, `token_hash`, `scan_roots` (jsonb), `created_at`, `last_seen_at`, and `revoked_at` -- [ ] **DATA-02**: `FileRecord.agent_id` is a non-null string column referencing the agents table; the unique constraint on the file table moves from `(original_path)` to `(agent_id, original_path)` -- [ ] **DATA-03**: `ScanBatch.agent_id` is a non-null string column; one sentinel `LIVE` `ScanBatch` per agent acts as the parent batch for all watcher-originated file events -- [ ] **DATA-04**: A two-step Alembic migration adds the new columns and `agents` table, seeds a `legacy-application-server` agent row pointing at the current `SCAN_PATH`, backfills every existing `FileRecord` / `ScanBatch` to it, and only then enforces `NOT NULL` and swaps the unique constraint +- [x] **DATA-01**: An `agents` table records each registered agent with `id`, `name`, `token_hash`, `scan_roots` (jsonb), `created_at`, `last_seen_at`, and `revoked_at` +- [x] **DATA-02**: `FileRecord.agent_id` is a non-null string column referencing the agents table; the unique constraint on the file table moves from `(original_path)` to `(agent_id, original_path)` +- [x] **DATA-03**: `ScanBatch.agent_id` is a non-null string column; one sentinel `LIVE` `ScanBatch` per agent acts as the parent batch for all watcher-originated file events +- [x] **DATA-04**: A two-step Alembic migration adds the new columns and `agents` table, seeds a `legacy-application-server` agent row pointing at the current `SCAN_PATH`, backfills every existing `FileRecord` / `ScanBatch` to it, and only then enforces `NOT NULL` and swaps the unique constraint ### Authentication & Security -- [ ] **AUTH-01**: Each agent authenticates to the application server with a unique bearer token; the application server stores only the token hash and derives `agent_id` from the token lookup β€” never from a request body field -- [ ] **AUTH-02**: All agent β†’ application-server traffic uses HTTPS terminated by a self-signed certificate issued by an application-server-local internal CA; each agent's httpx client trusts that CA file -- [ ] **AUTH-03**: Redis on the application server requires `requirepass` and is bound only to the private LAN interface (no `0.0.0.0` exposure); agents connect with `redis://default:@:6379` -- [ ] **AUTH-04**: Agent tokens are rotatable: revoking a token in the `agents` table immediately blocks further `/api/internal/agent/*` calls from that agent without requiring an application-server restart +- [x] **AUTH-01**: Each agent authenticates to the application server with a unique bearer token; the application server stores only the token hash and derives `agent_id` from the token lookup β€” never from a request body field +- [x] **AUTH-02**: All agent β†’ application-server traffic uses HTTPS terminated by a self-signed certificate issued by an application-server-local internal CA; each agent's httpx client trusts that CA file +- [x] **AUTH-03**: Redis on the application server requires `requirepass` and is bound only to the private LAN interface (no `0.0.0.0` exposure); agents connect with `redis://default:@:6379` +- [x] **AUTH-04**: Agent tokens are rotatable: revoking a token in the `agents` table immediately blocks further `/api/internal/agent/*` calls from that agent without requiring an application-server restart ### Scan & Watcher @@ -41,21 +50,21 @@ Requirements for Distributed Agents. Each maps to roadmap phases. - [x] **TASK-01**: File-bound SAQ tasks (`process_file`, `extract_file_metadata`, `fingerprint_file`, `scan_live_set`, `execute_approved_batch`) run only on agents; their bodies use an HTTP client to the application server instead of an `async_session` - [x] **TASK-02**: Fileless SAQ tasks (`generate_proposals`, `match_tracklist_to_discogs`, `scrape_and_store_tracklist`, `search_tracklist`, `refresh_tracklists` cron) run only on the application-server worker and continue using direct Postgres access - [x] **TASK-03**: Agent task job payloads carry all data the agent needs (`file_id`, `file_path`, `file_type`, model path, etc.) so jobs are self-contained snapshots at enqueue time; agents never read file state from the application server during job execution -- [ ] **TASK-04**: Each file server runs its own audfprint and panako sidecars indexing only that file server's files; no cross-file-server fingerprint matching is supported in v4.0 +- [x] **TASK-04**: Each file server runs its own audfprint and panako sidecars indexing only that file server's files; no cross-file-server fingerprint matching is supported in v4.0 ### Distributed Execution -- [ ] **EXEC-01**: When the administrator triggers an approved-batch execution, the application server groups approved proposals by `FileRecord.agent_id` and enqueues one `execute_approved_batch` sub-job per affected agent under a shared parent `batch_id` -- [ ] **EXEC-02**: Each agent performs copy-verify-delete locally for its sub-batch and reports per-operation status to the application server via PATCH calls so the write-ahead `ExecutionLog` audit trail is preserved across the HTTP boundary -- [ ] **EXEC-03**: Agents PATCH per-file progress updates to the application server; the application server owns the `exec:{batch_id}` Redis hash and continues to serve SSE progress from a single aggregated key -- [ ] **EXEC-04**: A batch that spans multiple agents reports unified progress (`total`, `completed`, `failed`) to the UI; per-agent breakdown is available for debugging +- [x] **EXEC-01**: When the administrator triggers an approved-batch execution, the application server groups approved proposals by `FileRecord.agent_id` and enqueues one `execute_approved_batch` sub-job per affected agent under a shared parent `batch_id` +- [x] **EXEC-02**: Each agent performs copy-verify-delete locally for its sub-batch and reports per-operation status to the application server via PATCH calls so the write-ahead `ExecutionLog` audit trail is preserved across the HTTP boundary +- [x] **EXEC-03**: Agents PATCH per-file progress updates to the application server; the application server owns the `exec:{batch_id}` Redis hash and continues to serve SSE progress from a single aggregated key +- [x] **EXEC-04**: A batch that spans multiple agents reports unified progress (`total`, `completed`, `failed`) to the UI; per-agent breakdown is available for debugging ### Deployment & Operations - [x] **OPS-01**: Both the application-server role and the agent role run from the same Docker image; `PHAZE_ROLE={control,agent}` (or equivalent env) selects which SAQ settings module is loaded and which startup resources are instantiated -- [ ] **OPS-02**: A new `docker-compose.agent.yml` brings up only `worker`, `watcher`, `audfprint`, and `panako` on a file server, configured via env to point at the application server's Redis URL, API URL, and bearer token -- [ ] **OPS-03**: Each file server runs `just download-models` once at setup to populate its own local `/models` volume; the application server no longer downloads or mounts models -- [ ] **OPS-04**: Each agent posts a heartbeat to `/api/internal/agent/heartbeat` every 30 seconds; the application server updates `agents.last_seen_at` and exposes an "Agents" admin page listing each agent's status, queue depth, last seen, and revoked state +- [x] **OPS-02**: A new `docker-compose.agent.yml` brings up only `worker`, `watcher`, `audfprint`, and `panako` on a file server, configured via env to point at the application server's Redis URL, API URL, and bearer token +- [x] **OPS-03**: Each file server runs `just download-models` once at setup to populate its own local `/models` volume; the application server no longer downloads or mounts models +- [x] **OPS-04**: Each agent posts a heartbeat to `/api/internal/agent/heartbeat` every 30 seconds; the application server updates `agents.last_seen_at` and exposes an "Agents" admin page listing each agent's status, queue depth, last seen, and revoked state ## Future Requirements @@ -94,19 +103,19 @@ Explicitly excluded. Documented to prevent scope creep. | Requirement | Phase | Status | |-------------|-------|--------| -| DIST-01 | Phase 29 β€” Deployment Hardening & Agents Admin | Pending | +| DIST-01 | Phase 29 β€” Deployment Hardening & Agents Admin | Complete | | DIST-02 | Phase 27 β€” Watcher Service & User-Initiated Scan | Complete | | DIST-03 | Phase 26 β€” Task Code Reorg & HTTP-Backed Agent Worker | Complete | -| DIST-04 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Pending | -| DIST-05 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Pending | -| DATA-01 | Phase 24 β€” Schema Foundation & Agent Registry | Pending | -| DATA-02 | Phase 24 β€” Schema Foundation & Agent Registry | Pending | -| DATA-03 | Phase 24 β€” Schema Foundation & Agent Registry | Pending | -| DATA-04 | Phase 24 β€” Schema Foundation & Agent Registry | Pending | -| AUTH-01 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Pending | -| AUTH-02 | Phase 29 β€” Deployment Hardening & Agents Admin | Pending | -| AUTH-03 | Phase 29 β€” Deployment Hardening & Agents Admin | Pending | -| AUTH-04 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Pending | +| DIST-04 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Complete | +| DIST-05 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Complete | +| DATA-01 | Phase 24 β€” Schema Foundation & Agent Registry | Complete | +| DATA-02 | Phase 24 β€” Schema Foundation & Agent Registry | Complete | +| DATA-03 | Phase 24 β€” Schema Foundation & Agent Registry | Complete | +| DATA-04 | Phase 24 β€” Schema Foundation & Agent Registry | Complete | +| AUTH-01 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Complete | +| AUTH-02 | Phase 29 β€” Deployment Hardening & Agents Admin | Complete | +| AUTH-03 | Phase 29 β€” Deployment Hardening & Agents Admin | Complete | +| AUTH-04 | Phase 25 β€” Internal Agent HTTP API & Bearer Auth | Complete | | SCAN-01 | Phase 27 β€” Watcher Service & User-Initiated Scan | Complete | | SCAN-02 | Phase 27 β€” Watcher Service & User-Initiated Scan | Complete | | SCAN-03 | Phase 27 β€” Watcher Service & User-Initiated Scan | Complete | @@ -114,17 +123,17 @@ Explicitly excluded. Documented to prevent scope creep. | TASK-01 | Phase 26 β€” Task Code Reorg & HTTP-Backed Agent Worker | Complete | | TASK-02 | Phase 26 β€” Task Code Reorg & HTTP-Backed Agent Worker | Complete | | TASK-03 | Phase 26 β€” Task Code Reorg & HTTP-Backed Agent Worker | Complete | -| TASK-04 | Phase 28 β€” Distributed Execution Dispatch | Pending | -| EXEC-01 | Phase 28 β€” Distributed Execution Dispatch | Pending | -| EXEC-02 | Phase 28 β€” Distributed Execution Dispatch | Pending | -| EXEC-03 | Phase 28 β€” Distributed Execution Dispatch | Pending | -| EXEC-04 | Phase 28 β€” Distributed Execution Dispatch | Pending | +| TASK-04 | Phase 28 β€” Distributed Execution Dispatch | Complete | +| EXEC-01 | Phase 28 β€” Distributed Execution Dispatch | Complete | +| EXEC-02 | Phase 28 β€” Distributed Execution Dispatch | Complete | +| EXEC-03 | Phase 28 β€” Distributed Execution Dispatch | Complete | +| EXEC-04 | Phase 28 β€” Distributed Execution Dispatch | Complete | | OPS-01 | Phase 26 β€” Task Code Reorg & HTTP-Backed Agent Worker | Complete | -| OPS-02 | Phase 29 β€” Deployment Hardening & Agents Admin | Pending | -| OPS-03 | Phase 29 β€” Deployment Hardening & Agents Admin | Pending | -| OPS-04 | Phase 29 β€” Deployment Hardening & Agents Admin | Pending | +| OPS-02 | Phase 29 β€” Deployment Hardening & Agents Admin | Complete | +| OPS-03 | Phase 29 β€” Deployment Hardening & Agents Admin | Complete | +| OPS-04 | Phase 29 β€” Deployment Hardening & Agents Admin | Complete | **Coverage:** 26 / 26 v4.0 requirements mapped βœ“ --- -*Last updated: 2026-05-11 β€” milestone v4.0 roadmap mapped (Phases 24-29)* +*Last updated: 2026-05-17 β€” v4.0 audit complete; all 26 requirements satisfied (Phases 24-29 verified)* diff --git a/.planning/milestones/v4.0-ROADMAP.md b/.planning/milestones/v4.0-ROADMAP.md new file mode 100644 index 0000000..1e2a7de --- /dev/null +++ b/.planning/milestones/v4.0-ROADMAP.md @@ -0,0 +1,239 @@ +# Roadmap: Phaze + +## Milestones + +- βœ… **v1.0 MVP** β€” Phases 1-11 (shipped 2026-03-30) +- βœ… **v2.0 Metadata Enrichment & Tracklist Integration** β€” Phases 12-17 (shipped 2026-04-02) +- βœ… **v3.0 Cross-Service Intelligence & File Enrichment** β€” Phases 18-23 (shipped 2026-04-04) +- βœ… **v4.0 Distributed Agents** β€” Phases 24-29 (shipped 2026-05-17) + +## Phases + +
+v1.0 MVP (Phases 1-11) -- SHIPPED 2026-03-30 + +- [x] Phase 1: Infrastructure & Project Setup (3/3 plans) -- completed 2026-03-27 +- [x] Phase 2: File Discovery & Ingestion (3/3 plans) -- completed 2026-03-27 +- [x] Phase 3: Companion Files & Deduplication (2/2 plans) -- completed 2026-03-27 +- [x] Phase 4: Task Queue & Worker Infrastructure (2/2 plans) -- completed 2026-03-27 +- [x] Phase 5: Audio Analysis Pipeline (2/2 plans) -- completed 2026-03-28 +- [x] Phase 6: AI Proposal Generation (2/2 plans) -- completed 2026-03-28 +- [x] Phase 7: Approval Workflow UI (3/3 plans) -- completed 2026-03-29 +- [x] Phase 8: Safe File Execution & Audit (2/2 plans) -- completed 2026-03-29 +- [x] Phase 9: Pipeline Orchestration (1/1 plan) -- completed 2026-03-30 +- [x] Phase 10: CI Config & Bug Fixes (1/1 plan) -- completed 2026-03-30 +- [x] Phase 11: Polish & Cleanup (3/3 plans) -- completed 2026-03-30 + +Full details: `.planning/milestones/v1.0-ROADMAP.md` + +
+ +
+v2.0 Metadata Enrichment & Tracklist Integration (Phases 12-17) -- SHIPPED 2026-04-02 + +- [x] Phase 12: Infrastructure & Audio Tag Extraction (3/3 plans) -- completed 2026-03-31 +- [x] Phase 13: AI Destination Paths (3/3 plans) -- completed 2026-03-31 +- [x] Phase 14: Duplicate Resolution UI (2/2 plans) -- completed 2026-04-01 +- [x] Phase 15: 1001Tracklists Integration (2/2 plans) -- completed 2026-04-01 +- [x] Phase 16: Fingerprint Service & Batch Ingestion (3/3 plans) -- completed 2026-04-01 +- [x] Phase 17: Live Set Matching & Tracklist Review (3/3 plans) -- completed 2026-04-02 + +Full details: `.planning/milestones/v2.0-ROADMAP.md` + +
+ +
+v3.0 Cross-Service Intelligence & File Enrichment (Phases 18-23) -- SHIPPED 2026-04-04 + +- [x] Phase 18: Unified Search (2/2 plans) -- completed 2026-04-03 +- [x] Phase 19: Discogs Cross-Service Linking (3/3 plans) -- completed 2026-04-03 +- [x] Phase 20: Tag Writing (3/3 plans) -- completed 2026-04-03 +- [x] Phase 21: CUE Sheet Generation (3/3 plans) -- completed 2026-04-03 +- [x] Phase 22: Tracklist Integration Fixes (1/1 plan) -- completed 2026-04-04 +- [x] Phase 23: v3.0 Polish & Wiring Fixes (1/1 plan) -- completed 2026-04-04 + +Full details: `.planning/milestones/v3.0-ROADMAP.md` + +
+ +### v4.0 Distributed Agents (Phases 24-29) β€” SHIPPED 2026-05-17 + +**Milestone Goal:** Split phaze into an application server (control plane: API, UI, Postgres, Redis, fileless workers) and one or more file servers running agents that own the music/video files locally, pulling jobs and writing results back over HTTPS β€” so files can live anywhere while decisions stay on a single server. + +- [x] **Phase 24: Schema Foundation & Agent Registry** β€” `agents` table, `agent_id` columns on FileRecord/ScanBatch, two-step Alembic migration with legacy backfill (completed 2026-05-11) +- [x] **Phase 25: Internal Agent HTTP API & Bearer Auth** β€” `/api/internal/agent/*` endpoints, token-hash auth middleware deriving `agent_id` from token, idempotent upserts on natural keys, rotatable tokens (completed 2026-05-12) +- [x] **Phase 26: Task Code Reorg & HTTP-Backed Agent Worker** β€” split `phaze.tasks.controller` (fileless) from `phaze.tasks.agent_worker` (file-bound), `PHAZE_ROLE` env-driven startup, per-agent SAQ queue (`phaze-agent-`), self-contained job payloads (completed 2026-05-12) +- [x] **Phase 27: Watcher Service & User-Initiated Scan** β€” new `phaze-agent-watcher` compose service, watchdog with mtime settle/debounce, sentinel `LIVE` ScanBatch per agent, admin-triggered scan form (completed 2026-05-14) +- [x] **Phase 28: Distributed Execution Dispatch** β€” group-by-agent approval dispatch, per-operation ExecutionLog PATCH, unified SSE progress aggregating across agents, per-agent fingerprint sidecars in execution path (completed 2026-05-15) +- [x] **Phase 29: Deployment Hardening & Agents Admin** β€” strip `SCAN_PATH`/`MODELS_PATH` from application-server compose, self-signed HTTPS w/ internal CA, Redis `requirepass` + LAN binding, `docker-compose.agent.yml`, per-file-server model download, heartbeat + Agents admin page (completed 2026-05-17) + +## Phase Details + +### Phase 24: Schema Foundation & Agent Registry +**Goal**: The database can model who owns each file and which agent originated each scan, with existing v3.0 data preserved end-to-end through a controlled migration. +**Depends on**: Phase 23 (v3.0 shipped) +**Requirements**: DATA-01, DATA-02, DATA-03, DATA-04 +**Success Criteria** (what must be TRUE): + 1. An `agents` table exists with `id`, `name`, `token_hash`, `scan_roots` (jsonb), `created_at`, `last_seen_at`, `revoked_at`, and an operator can insert/query agent rows via Postgres + 2. `FileRecord.agent_id` and `ScanBatch.agent_id` are non-null string columns, and the file uniqueness invariant has moved from `(original_path)` to `(agent_id, original_path)` (verified by attempting a same-path insert under a different agent and succeeding) + 3. After running the upgrade migration on a v3.0 snapshot, every pre-existing FileRecord and ScanBatch points at a seeded `legacy-application-server` agent whose `scan_roots` matches the prior `SCAN_PATH` + 4. One sentinel `LIVE` ScanBatch exists per registered agent and is reused (not duplicated) when re-applied + 5. The migration is two-step (add nullable + backfill, then enforce NOT NULL + swap unique constraint) and can be downgraded cleanly to the v3.0 schema on an unmigrated test DB +**Plans**: 5 plans +- [x] 24-01-PLAN.md β€” Test infrastructure: tests/test_migrations/ package + alembic-driven fixture (Wave 0) +- [x] 24-02-PLAN.md β€” Agent model + ScanStatus.LIVE + agent_id columns + composite UQ on model layer (Wave 1) +- [x] 24-03-PLAN.md β€” Migration 012: agents table, legacy agent seed, FKs, partial UQ, backfill + integration tests (Wave 2) +- [x] 24-04-PLAN.md β€” Migration 013: NOT NULL + composite UQ swap + safe downgrade + [BLOCKING] roundtrip smoke (Wave 3) +- [x] 24-05-PLAN.md β€” Ingestion service: stamp legacy agent_id, swap conflict target to composite (Wave 3) + +### Phase 25: Internal Agent HTTP API & Bearer Auth +**Goal**: The application server exposes an authenticated, idempotent HTTP surface that agents can call to record every state change, with `agent_id` derived from the bearer token and never trusted from request bodies. +**Depends on**: Phase 24 +**Requirements**: DIST-04, DIST-05, AUTH-01, AUTH-04 +**Success Criteria** (what must be TRUE): + 1. Every `/api/internal/agent/*` route requires a bearer token; an unauthenticated request returns 401 and an unknown/revoked token returns 403 + 2. The `agent_id` used by every endpoint is resolved by hashing the bearer token and looking it up in the `agents` table; any `agent_id` field in a request body is ignored or rejected + 3. Replaying the same chunk of file upserts, the same proposal mutation, or the same execution-log PATCH with the same natural keys (`(agent_id, original_path)`, `file_id`, `proposal_id`, agent-generated log UUIDs) produces no duplicate rows and the same final state + 4. Setting `agents.revoked_at` on a row immediately causes that agent's next `/api/internal/agent/*` call to be rejected with no application-server restart required (verified by integration test) + 5. The API surface covers, at minimum, file upsert, metadata write, fingerprint write, execution-log create/patch, and heartbeat β€” all callable end-to-end with an HTTP client +**Plans**: 8 plans +- [x] 25-01-PLAN.md β€” Schema foundation: Agent.last_status JSONB + migration 014 + conftest fixtures (Wave 1) +- [x] 25-02-PLAN.md β€” Auth helper module (agent_auth.py) + AUTH-01/AUTH-04 tests (Wave 2) +- [x] 25-03-PLAN.md β€” Files router + xmax regression test + schemas + auto-enqueue (Wave 3) +- [x] 25-04-PLAN.md β€” Metadata + Fingerprint + Heartbeat routers + schemas + tests (Wave 3) +- [x] 25-05-PLAN.md β€” Execution-log router (POST + PATCH monotonic) + schemas + tests (Wave 3) +- [x] 25-06-PLAN.md β€” App wiring: register 5 routers in main.py + config knobs (Wave 4) +- [x] 25-07-PLAN.md β€” Gap closure CR-01: agent_metadata partial-PUT NULL clobber + regression test (Wave 1, gap_closure) +- [x] 25-08-PLAN.md β€” Gap closure CR-02: execution-log terminal-state idempotent retry + regression tests (Wave 1, gap_closure) +**UI hint**: yes + +### Phase 26: Task Code Reorg & HTTP-Backed Agent Worker +**Goal**: SAQ task code is cleanly split between the application server (fileless `phaze.tasks.controller`) and agents (file-bound `phaze.tasks.agent_worker`), with role-driven startup and per-agent queues so the same Docker image runs both roles correctly. Three new internal-agent endpoints (`/whoami`, `PUT /analysis/{file_id}`, `POST /tracklists`, `PATCH /proposals/{id}/state`) close the contract gap from Phase 25 so the full file-bound task surface can run on agents. +**Depends on**: Phase 25 +**Requirements**: DIST-03, TASK-01, TASK-02, TASK-03, OPS-01 +**Success Criteria** (what must be TRUE): + 1. `phaze.tasks.controller` exposes only fileless tasks (`generate_proposals`, `match_tracklist_to_discogs`, `scrape_and_store_tracklist`, `search_tracklist`, `refresh_tracklists` cron) and `phaze.tasks.agent_worker` exposes only file-bound tasks (`process_file`, `extract_file_metadata`, `fingerprint_file`, `scan_live_set`, `execute_approved_batch`) + 2. Setting `PHAZE_ROLE=control` boots the application-server worker with the fileless settings module and Postgres access; setting `PHAZE_ROLE=agent` boots the agent worker with the file-bound settings module and an HTTP client to the application server, with no Postgres driver loaded + 3. Every file-bound task body uses the HTTP client (no `async_session` import reachable in agent-worker code paths) and writes results via `/api/internal/agent/*` + 4. Each agent worker pulls from a per-agent SAQ queue named `phaze-agent-`; the application-server enqueuer selects the queue from `FileRecord.agent_id` and a job enqueued for agent A never executes on agent B + 5. Agent task jobs carry a self-contained payload (`file_id`, `file_path`, `file_type`, model paths, agent metadata) sufficient to execute without any read-back to the application server during the job +**Plans**: 13 plans +- [x] 26-01-PLAN.md β€” Deps (tenacity + respx + mypy overrides) + settings split (Base/Control/Agent + get_settings) + enum extensions (ProposalStatus.EXECUTED/FAILED, FileState.MOVED/UNCHANGED) (Wave 1) +- [x] 26-02-PLAN.md β€” PhazeAgentClient + 4-class error hierarchy + tenacity retry funnel + respx contract tests (Wave 2) +- [x] 26-03-PLAN.md β€” 5 new schema modules (agent_identity, agent_analysis, agent_tracklists, agent_proposals, agent_tasks) (Wave 2) +- [x] 26-04-PLAN.md β€” AgentTaskRouter + Redis integration tests (Wave 3) +- [x] 26-05-PLAN.md β€” GET /api/internal/agent/whoami router + 4 contract tests (Wave 3) +- [x] 26-06-PLAN.md β€” PUT /api/internal/agent/analysis/{file_id} router (idempotent upsert) + 8 contract tests (Wave 3) +- [x] 26-07-PLAN.md β€” POST /api/internal/agent/tracklists router (Redis idempotency cache) + integration tests (Wave 3) +- [x] 26-08-PLAN.md β€” PATCH /api/internal/agent/proposals/{id}/state router (state-machine joint update) + 11 contract tests incl. W1 cross-tenant guard (Wave 3) +- [x] 26-09-PLAN.md β€” phaze.tasks.controller SAQ settings module (fileless tasks only) (Wave 4) +- [x] 26-10-PLAN.md β€” phaze.tasks.agent_worker SAQ settings module + tests/test_task_split.py subprocess import-boundary test (D-25) (Wave 5) +- [x] 26-11-PLAN.md β€” Rewrite 5 file-bound task bodies (process_file, extract_file_metadata, fingerprint_file, scan_live_set, execute_approved_batch) to use ctx['api_client'] (Wave 4) +- [x] 26-12-PLAN.md β€” main.py wiring (4 new include_router + app.state.task_router + app.state.redis) + agent_files.py refactor to AgentTaskRouter (Wave 5) +- [x] 26-13-PLAN.md β€” Delete worker.py + session.py + docker-compose.yml controller.settings + doc sweep (Wave 6) + +### Phase 27: Watcher Service & User-Initiated Scan +**Goal**: Each file server continuously streams new file arrivals to the application server, and the administrator can also trigger an explicit scan of any path on any agent from the admin UI. +**Depends on**: Phase 26 +**Requirements**: DIST-02, SCAN-01, SCAN-02, SCAN-03, SCAN-04 +**Success Criteria** (what must be TRUE): + 1. A new `phaze-agent-watcher` service is defined and starts alongside `worker`, `audfprint`, and `panako` on the file-server compose; it stays running and observes the agent's configured roots via the `watchdog` library + 2. Dropping a new file into a watched root results in a new `FileRecord` appearing on the application server under that agent's sentinel `LIVE` ScanBatch, with `(agent_id, original_path)` as the natural key + 3. A file whose `mtime` is still changing is **not** posted; only after the configured settle period (default 10s) of stable `mtime` does the watcher compute SHA-256 and stream the record (verified by writing a file slowly and observing no early upsert) + 4. From the admin UI, an administrator can choose `(agent, scan_path)` and trigger a scan; this enqueues `scan_directory(scan_path, batch_id)` onto the chosen agent's queue and the agent streams discovered files back in chunks (e.g., 500 records per request), with `extract_file_metadata` enqueued per new music/video file before the scan completes + 5. The same upsert endpoint serves both bulk scans and per-file watcher events, and a re-walked path produces no duplicate FileRecord rows +**Plans**: 7 plans +- [x] 27-01-PLAN.md β€” Foundation: watchdog dep, AgentSettings watcher knobs, _shared/agent_bootstrap refactor, test scaffolding + extended import-boundary tests (Wave 0) +- [x] 27-02-PLAN.md β€” Schemas: FileUpsertChunk.batch_id, ScanBatchPatch/Response, ScanDirectoryPayload, TriggerScanForm (Wave 1) +- [x] 27-03-PLAN.md β€” Endpoints: PATCH /api/internal/agent/scan-batches + batch_id resolution in POST /files + patch_scan_batch client method + main.py wiring + contract tests (Wave 2) +- [x] 27-04-PLAN.md β€” Agent task: scan_directory(scan_path, batch_id) with chunking, per-chunk PATCH, terminal PATCH; registered in agent_worker.settings.functions (Wave 3) +- [x] 27-05-PLAN.md β€” Watcher package: phaze.agent_watcher (Debouncer, WatcherEventHandler, Poster, __main__); 16+ unit tests covering thread bridge, stuck-file cap, OSError vanish, LIVE-sentinel resolution (Wave 3) +- [x] 27-06-PLAN.md β€” Admin UI: routers/pipeline_scans.py (POST + GET progress + GET agent-roots HTMX swap), 6 partial templates, dashboard.html extension + 10 contract tests (Wave 3) +- [x] 27-07-PLAN.md β€” Deployment + docs: docker-compose watcher service, .env.example knobs, per-service README, STATE.md accumulation (Wave 5) +**UI hint**: yes + +### Phase 28: Distributed Execution Dispatch +**Goal**: Approving a batch that spans multiple file servers results in each agent doing its own local copy-verify-delete, while the application server preserves the write-ahead audit trail and presents unified progress to the operator. +**Depends on**: Phase 27 +**Requirements**: EXEC-01, EXEC-02, EXEC-03, EXEC-04, TASK-04 +**Success Criteria** (what must be TRUE): + 1. Triggering execution on an approved batch groups proposals by `FileRecord.agent_id` and enqueues one `execute_approved_batch` sub-job per affected agent under a shared parent `batch_id`; the dispatch decision is visible in logs and via an admin endpoint + 2. Each agent performs copy-verify-delete locally for its assigned proposals and PATCHes per-operation status (started, copied, verified, deleted, failed) to the application server, so the `ExecutionLog` write-ahead trail survives the HTTP boundary with no rows lost on retry + 3. The application server owns the `exec:{batch_id}` Redis hash and serves SSE progress from a single aggregated key; the admin UI shows unified `total / completed / failed` counts that match the sum across all participating agents + 4. The execution UI exposes a per-agent breakdown (which agent handled which sub-batch, with its own counts) for debugging without requiring database access + 5. Each file server's audfprint and panako sidecars index only that file server's files; fingerprint queries during execution-adjacent flows resolve against the local sidecar and the limitation (no cross-file-server fingerprint matching) is documented in the admin UI / docs +**Plans**: 6 plans +- [x] 28-01-PLAN.md β€” Wave 0: test scaffolding + new dirs + audfprint/panako allow-list validator + sub_batch_index schema field +- [x] 28-02-PLAN.md β€” Wave 1: ExecBatchProgressPayload + agent_exec_batches router + main.py wiring + PhazeAgentClient.post_exec_batch_progress (contract tests) +- [x] 28-03-PLAN.md β€” Wave 1: execution_dispatch service (group-by-agent + revoked filter + chunking) + grouping unit tests +- [x] 28-04-PLAN.md β€” Wave 2: start_execution rewrite + SSE generator extension + agents_table.html + progress.html rewrite + revoked banner +- [x] 28-05-PLAN.md β€” Wave 2: tasks/execution.py β€” per-proposal terminal progress POST + SAQ-meta UUID lift (closes L6/L22) + _classify_failure_step + : error_message +- [x] 28-06-PLAN.md β€” Wave 3: cross_fs_fingerprint_notice.html partial + duplicates/list.html inclusion + PROJECT.md Constraints paragraph + STATE.md accumulation +**UI hint**: yes + +### Phase 29: Deployment Hardening & Agents Admin +**Goal**: A real two-host deployment runs end-to-end with the application server holding no file mounts, HTTPS + Redis hardening in place, and an admin can see at a glance which agents are alive and healthy. +**Depends on**: Phase 28 +**Requirements**: DIST-01, AUTH-02, AUTH-03, OPS-02, OPS-03, OPS-04 +**Success Criteria** (what must be TRUE): + 1. The application-server `docker-compose.yml` declares no `SCAN_PATH` or `MODELS_PATH` mount; starting the stack and attempting to read a music file from inside the `api` or `controller` container fails (verified manually) and the application server has no way to read or write file content + 2. A new `docker-compose.agent.yml` brings up exactly `worker`, `watcher`, `audfprint`, and `panako` on a file server, configured via env (`PHAZE_API_URL`, `PHAZE_REDIS_URL`, `PHAZE_AGENT_TOKEN`, `PHAZE_AGENT_ID`) to reach the application server; running it on a second host registers the agent and begins watching + 3. All agent β†’ application-server traffic uses HTTPS terminated by a self-signed certificate from an application-server-local internal CA; each agent's `httpx` client trusts the CA file and rejects untrusted certs (verified by swapping the CA and observing connection failure) + 4. Redis on the application server requires `requirepass` and is bound only to the private LAN interface; an attempt to connect from outside the LAN or without the password fails, and agents connect with `redis://default:@:6379` + 5. Running `just download-models` on a fresh file server populates that host's local `/models` volume; the application-server image neither downloads nor mounts models + 6. Each agent posts a heartbeat to `/api/internal/agent/heartbeat` every 30 seconds; the Agents admin page lists every registered agent with name, status (alive/stale/revoked), queue depth, and last-seen timestamp, and refreshes without requiring a manual page reload +**Plans**: 8 plans +- [x] 29-01-PLAN.md β€” TLS termination + cert bootstrap (cryptography dep, cert_bootstrap, entrypoint, agent_ca_file/api_tls_sans/verify=) + D-04 wrong-CA integration test (Wave 1) +- [x] 29-02-PLAN.md β€” AgentSettings agent_env field + production-mode redis_url password validator (Wave 1) +- [x] 29-03-PLAN.md β€” Root docker-compose.yml strip mounts + delete watcher/agent-worker/audfprint/panako + redis hardening + .env.example + filesystem-isolation YAML-parse tests (Wave 1) +- [x] 29-04-PLAN.md β€” docker-compose.agent.yml (4 services: worker, watcher, audfprint, panako) + .env.example.agent + agent-compose YAML-parse tests + docker-publish.yml tag verification (Wave 2) +- [x] 29-05-PLAN.md β€” Models auto-download: phaze.scripts.download_models Python helper + phaze.tasks._shared.model_bootstrap + agent_worker/watcher startup wiring + bash shim rewrite (Wave 2) +- [x] 29-06-PLAN.md β€” Heartbeat caller: phaze.tasks.heartbeat.heartbeat_tick + SAQ CronJob registration in agent_worker.settings (Wave 3) +- [x] 29-07-PLAN.md β€” Agents admin page: constants + services.agent_liveness + utils.humanize + routers.admin_agents + 3 Jinja templates + base.html nav link + main.py registration (Wave 3) +- [x] 29-08-PLAN.md β€” Justfile recipes (up-agent, up-all) + docs/deployment.md + PROJECT.md Deployment subsection + scripts/update-project.sh touch + blocking human-verify checkpoint (Wave 4) +**UI hint**: yes + +## Milestone Summary + +**Key Decisions:** + +- Distributed agents architecture: files stay local to file servers, application server owns API/UI/Postgres/Redis (no shared filesystem) +- HTTP-only agent boundary: agents have zero Postgres access; every state change is an authenticated `/api/internal/agent/*` call +- Per-agent SAQ queues (`phaze-agent-`): enqueuer picks queue from `FileRecord.agent_id`; per-agent maintenance is clean +- `agent_id` derived from bearer token hash on application server, never from request body (eliminates spoofing risk) +- Same Docker image for both roles; `PHAZE_ROLE={control,agent}` env selects which SAQ settings module is loaded +- Two-step Alembic migration (012 add + backfill, 013 NOT NULL + UQ swap) preserves v3.0 data via `legacy-application-server` agent seed +- Self-signed internal CA generated in the api container on first start; operators distribute public cert via scp/rsync (no public ACME, no DNS dependency) +- Redis `requirepass` + LAN-only bind for the application-server cache/broker; `AgentSettings` fail-fast on passwordless URL in production +- Per-file-server fingerprint sidecars (no cross-file-server matching in v4.0); documented as v1 limitation with banner on Duplicate Resolution page +- Group-by-`FileRecord.agent_id` execution dispatch + per-operation PATCH preserves write-ahead `ExecutionLog` audit over HTTP boundary + +**Issues Resolved:** + +- Existing v3.0 file corpus migrated via `legacy-application-server` agent seed with zero data loss +- Cross-tenant isolation: 403-before-state-machine guard on every multi-tenant PATCH (Phases 25-08, 27-02, 28-02) prevents timing side-channels +- Subprocess import-boundary test (D-25) catches `phaze.database` imports leaking into the agent worker before merge +- Documentation drift on REQUIREMENTS.md and ROADMAP.md (audit-detected) closed before milestone close +- Phase 29 code-review findings CR-01..CR-03 (HTTPS bind guard, Redis URL env var, partial-download rejection) fixed during execution + +**Issues Deferred (Tech Debt):** + +- WR-01 (Phase 29): umask-then-chmod ordering on CA + leaf private keys β€” brief world-readable window on bind mount (cert_bootstrap.py:215-234) +- WR-02 (Phase 29): docker-compose.yml Postgres binds `0.0.0.0` β€” no equivalent guard to Redis `${REDIS_BIND_IP:-127.0.0.1}` hardening +- WR-03 (Phase 29): `.part` temp file not cleaned up on download error; second `.part` accumulates on next run (download_models.py:84-90) +- WR-04 (Phase 29): watcher service mounts `MODELS_PATH:rw` despite never writing β€” unnecessary write grant (docker-compose.agent.yml:50) +- WR-03 (Phase 28, UI): execution router passes `skipped_revoked` count but not `revoked_agents` list to progress.html; per-agent attribution missing on first render (flow-exec-revoked-breakdown advisory) +- P28-RACE-01: terminal-status promotion in agent_exec_batches.py:189-198 is read-then-write without Lua atomicity (race requires β‰₯3 concurrent sub-jobs) +- 29-HUMAN-UAT.md: real two-host production smoke deferred until file-server hardware available (operator-approved verified-docs-only signal) +- 29-REVIEW.md IN-01/02/03 (Info): no SHA-256 integrity check on model binaries; refreshed_at_iso interpolation; dead conditional in _build_default_settings + +**Technical Debt Incurred:** + +- XAGENT-01 / XAGENT-02 deferred to a future milestone: cross-file-server fingerprint matching, cross-file-server execution batches +- WATCH-05/06/07 deferred: delete events, move/rename within tree, watcher catch-up on startup +- OPS-05/06/07 deferred: mTLS, agent self-service registration, Prometheus metrics scrape endpoint +- scan_live_set drops in-process FileMetadata artist/title resolution (Phase 26-11); fingerprint-sourced tracklist rows land with `artist=None,title=None` β€” known v3.0 UI regression for future controller-side enrichment task + +--- + +_For current project status, see .planning/ROADMAP.md_ diff --git a/.planning/phases/24-schema-foundation-agent-registry/VERIFICATION.md b/.planning/phases/24-schema-foundation-agent-registry/24-VERIFICATION.md similarity index 100% rename from .planning/phases/24-schema-foundation-agent-registry/VERIFICATION.md rename to .planning/phases/24-schema-foundation-agent-registry/24-VERIFICATION.md diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-01-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-01-PLAN.md new file mode 100644 index 0000000..9b2e152 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-01-PLAN.md @@ -0,0 +1,355 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - pyproject.toml + - uv.lock + - src/phaze/cert_bootstrap.py + - src/phaze/entrypoint.py + - src/phaze/config.py + - src/phaze/services/agent_client.py + - src/phaze/tasks/_shared/agent_bootstrap.py + - tests/test_cert_bootstrap.py + - tests/test_services/test_agent_client_tls.py + - tests/test_task_split.py +autonomous: true +requirements: [AUTH-02] +tags: [phase-29, auth, tls, cert-bootstrap, security, v4.0] + +must_haves: + truths: + - "A self-signed CA + leaf cert pair is auto-generated on first api startup into /certs/ (D-02)" + - "The CA bootstrap is idempotent: second invocation with existing parseable certs is a no-op" + - "phaze.cert_bootstrap stays Postgres-free (extends Phase 26 D-25 invariant)" + - "PhazeAgentClient passes the agent-side CA file through to httpx.AsyncClient(verify=...)" + - "An httpx client given the WRONG CA against a server presenting the RIGHT cert raises httpx.ConnectError on first request (D-04)" + - "AgentSettings.agent_ca_file is a typed field bound to PHAZE_AGENT_CA_FILE via AliasChoices, default /certs/phaze-ca.crt (D-03)" + - "BaseSettings.api_tls_sans is a typed field bound to PHAZE_API_TLS_SANS, default 'localhost,127.0.0.1,api' (D-02)" + - "construct_agent_client raises RuntimeError if the CA file is missing or empty (fail-fast) (D-03)" + - "Banner emitted via BOTH print() AND logger.warning() per CONTEXT D-02 D-discretion 'Both' (WARNING-8): test 7 asserts caplog captures the banner at WARNING level" + artifacts: + - path: "pyproject.toml" + provides: "cryptography runtime dep pinned >=46.0.0,<49" + contains: "cryptography>=46.0.0,<49" + - path: "uv.lock" + provides: "Lockfile entry for cryptography + cffi + pycparser" + contains: "cryptography" + - path: "src/phaze/cert_bootstrap.py" + provides: "ensure_certs_present(certs_dir, cn, sans_csv) β†’ CA + leaf via cryptography.x509" + min_lines: 80 + exports: ["ensure_certs_present"] + - path: "src/phaze/entrypoint.py" + provides: "python -m phaze.entrypoint: bootstrap certs then execvp uvicorn" + min_lines: 18 + exports: ["main"] + - path: "src/phaze/config.py" + provides: "api_tls_sans on BaseSettings + agent_ca_file on AgentSettings" + contains: "api_tls_sans" + - path: "src/phaze/services/agent_client.py" + provides: "PhazeAgentClient.__init__ accepts verify kwarg" + contains: "verify" + - path: "src/phaze/tasks/_shared/agent_bootstrap.py" + provides: "construct_agent_client passes verify=cfg.agent_ca_file and pre-checks existence" + contains: "agent_ca_file" + - path: "tests/test_cert_bootstrap.py" + provides: "7 D-22 test cases: gen, idempotency, banner-via-stdout sanity, file modes, SAN list, default SAN parse, banner-via-logger.warning caplog assertion (WARNING-8)" + min_lines: 120 + - path: "tests/test_services/test_agent_client_tls.py" + provides: "wrong-CA β†’ httpx.ConnectError; correct-CA β†’ success (D-04 integration)" + min_lines: 60 + - path: "tests/test_task_split.py" + provides: "test_cert_bootstrap_stays_postgres_free subprocess case" + contains: "test_cert_bootstrap_stays_postgres_free" + key_links: + - from: "src/phaze/entrypoint.py" + to: "src/phaze/cert_bootstrap.py::ensure_certs_present" + via: "import + call" + pattern: "from phaze.cert_bootstrap import ensure_certs_present" + - from: "src/phaze/tasks/_shared/agent_bootstrap.py" + to: "src/phaze/services/agent_client.py::PhazeAgentClient" + via: "verify=cfg.agent_ca_file kwarg" + pattern: "verify=cfg.agent_ca_file" + - from: "src/phaze/services/agent_client.py" + to: "httpx.AsyncClient" + via: "verify= kwarg pass-through" + pattern: "httpx.AsyncClient\\(.*verify=verify" +--- + + +Land the cryptography runtime dep and the pre-uvicorn TLS infrastructure that closes AUTH-02. Add `phaze.cert_bootstrap` (Postgres-free CA + leaf generation per RESEARCH Β§Pattern 1), a `phaze.entrypoint` shim that runs bootstrap before `execvp`-ing uvicorn (RESEARCH Β§Critical Discovery #1 + Pattern 2), three new settings fields (`api_tls_sans`, `agent_ca_file` β€” D-02/D-03 + a `verify=` kwarg on `PhazeAgentClient`), and the D-04 wrong-CA integration test that proves untrusted certs are rejected. + +Purpose: AUTH-02 requires that all agent β†’ app-server traffic uses HTTPS terminated by a self-signed cert from an internal CA, AND each agent's httpx client trusts that CA file and rejects untrusted certs. This plan creates the CA-generation primitive, wires `verify=` end-to-end, and proves the rejection path in CI. + +Output: cryptography in pyproject.toml + uv.lock; new `cert_bootstrap.py` and `entrypoint.py`; modified `config.py`, `agent_client.py`, `agent_bootstrap.py`; three new/extended test files. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md + + + + +```python +class PhazeAgentClient: + def __init__( + self, + base_url: str, + token: str, + *, + timeout: float = 30.0, + _client: httpx.AsyncClient | None = None, + ) -> None: ... +``` + + + +```python +def construct_agent_client(cfg: AgentSettings) -> PhazeAgentClient: + return PhazeAgentClient( + base_url=cfg.agent_api_url, + token=cfg.agent_token.get_secret_value(), + timeout=30.0, + ) +``` + + + +```python +auto_migrate: bool = Field( + default=True, + validation_alias=AliasChoices("PHAZE_AUTO_MIGRATE", "auto_migrate"), + description="...", +) +``` + + + + + + + + + + Task 1: Add cryptography runtime dep + write cert_bootstrap + entrypoint + pyproject.toml, uv.lock, src/phaze/cert_bootstrap.py, src/phaze/entrypoint.py, tests/test_cert_bootstrap.py, tests/test_task_split.py + + - pyproject.toml (current dependencies list lines 11-31; ruff/mypy config still applies) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Critical Discoveries #1" lines 107-119 (cryptography NOT transitive) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 1: Self-signed CA + leaf cert generation" lines 248-393 (full cert_bootstrap target source) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 2: Uvicorn TLS termination" lines 401-441 (entrypoint shim shape) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/cert_bootstrap.py" lines 72-108 (analog: tasks/_shared/agent_bootstrap.py) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/entrypoint.py" lines 111-145 (analog: agent_watcher/__main__.py; os.execvp rationale) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"Postgres-Free Import Boundary" lines 1111-1124 (invariant + verification mechanism) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_cert_bootstrap.py" lines 994-1006 (6 original LOCKED test cases; Plan 01 adds a 7th per WARNING-8) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_task_split.py" lines 917-949 (existing subprocess pattern + new addition) + - src/phaze/tasks/_shared/agent_bootstrap.py (analog: docstring banner + logger + function signature shape) + - src/phaze/agent_watcher/__main__.py (analog: python -m entrypoint shape) + - tests/test_task_split.py (existing subprocess import-boundary tests to mirror) + + + - Test 1: `ensure_certs_present(tmp_path, cn="localhost", sans_csv="localhost,127.0.0.1,api")` creates exactly 4 files: phaze-ca.crt, phaze-ca.key, phaze-server.crt, phaze-server.key. All parse via `cryptography.x509.load_pem_x509_certificate`. + - Test 2: Second call on the populated dir does NOT change mtimes (idempotent no-op). + - Test 3: capsys captures a banner containing "GENERATED NEW PHAZE INTERNAL CA" on first call; banner does NOT contain "BEGIN" or "PRIVATE KEY" (Pitfall 4). + - Test 4: File modes β€” `phaze-ca.crt` and `phaze-server.crt` are `0o644`; `phaze-ca.key` and `phaze-server.key` are `0o600`. + - Test 5: Leaf cert's SubjectAlternativeName extension contains exactly the entries supplied via sans_csv (3 entries from default). + - Test 6: `cert_bootstrap._parse_san_entries("localhost,127.0.0.1,api")` returns a list of 3 x509.GeneralName (mix of DNSName + IPAddress). + - Test 7 (test_task_split): subprocess `python -c "import phaze.cert_bootstrap; ..."` exits 0 and `sys.modules` contains none of {phaze.database, phaze.tasks.session, sqlalchemy.ext.asyncio}. + - **Test 8 (WARNING-8) β€” banner-via-logger.warning caplog:** with `caplog.at_level("WARNING", logger="phaze.cert_bootstrap")`, call `ensure_certs_present(tmp_path, ...)` on an empty dir. Assert `caplog.records` contains AT LEAST ONE record where `record.levelname == "WARNING"` and `"GENERATED NEW PHAZE INTERNAL CA"` is a substring of `record.message`. Independently asserts the D-discretion "Both" requirement that the banner is emitted via `logger.warning(line)` per banner line β€” not just `print()`. (Test 3 covers the `print()` path via capsys; Test 8 covers the logger path via caplog.) + + +Add `cryptography>=46.0.0,<49` to `pyproject.toml` `[project].dependencies`, inserted between `beautifulsoup4` and `essentia-tensorflow` to maintain alphabetic order. Run `uv lock && uv sync` and commit the updated `uv.lock`. This is non-negotiable per RESEARCH Critical Discovery #1 (cryptography is NOT a transitive dep of FastAPI/Starlette in this project per `uv pip list` verification); CONTEXT.md D-discretion that called it transitive is incorrect. + +Write `src/phaze/cert_bootstrap.py` per RESEARCH Β§Pattern 1 (lines 252-393). Module-level docstring MUST include the IMPORT-BOUNDARY INVARIANT banner (mirror `tasks/_shared/agent_bootstrap.py:1-21`) naming the forbidden modules `phaze.database`, `phaze.tasks.session`, `sqlalchemy.ext.asyncio`. Public surface: one function `ensure_certs_present(certs_dir: Path, cn: str, sans_csv: str) -> None`. Internal helpers: `_parse_san_entries(sans_csv) -> list[x509.GeneralName]` (dispatches DNSName vs IPAddress via `ipaddress.ip_address`); `_generate_ca(cn) -> tuple[ec.EllipticCurvePrivateKey, x509.Certificate]` (ECDSA P-256 per D-discretion; 10-year validity; CN `f"Phaze Internal CA ({cn})"`; BasicConstraints ca=True critical; KeyUsage with key_cert_sign+crl_sign+digital_signature critical); `_generate_leaf(ca_key, ca_cert, cn, sans) -> tuple[...]` (ECDSA P-256; 2-year validity; SubjectAlternativeName from sans; BasicConstraints ca=False critical; KeyUsage with digital_signature+key_encipherment critical). `ensure_certs_present` writes 4 files: `phaze-ca.crt` (0o644), `phaze-ca.key` (0o600 PKCS8 NoEncryption), `phaze-server.crt` (0o644), `phaze-server.key` (0o600 PKCS8 NoEncryption). Idempotency check: if all 4 exist AND `x509.load_pem_x509_certificate` parses both `phaze-ca.crt` and `phaze-server.crt` without raising, log INFO "cert_bootstrap: existing certs at %s β€” no-op" and return. Banner constant `_BANNER` is the multi-line string from RESEARCH lines 277-283 (LITERAL β€” must reference only `{ca_path}`, never the private key, per Pitfall 4); on generation, emit via `print(banner, flush=True)` (with `# noqa: T201` since CLI banner is intentional stdout) AND `logger.warning(line)` per banner line (D-discretion "Both" β€” both paths are MANDATORY per WARNING-8). Set `0o644` / `0o600` via explicit `path.chmod(0o644)` / `path.chmod(0o600)` after each `write_bytes()`. + +Write `src/phaze/entrypoint.py` per RESEARCH Β§Pattern 2 (lines 401-441) + PATTERNS Β§entrypoint.py (lines 111-145). Module docstring states it is invoked via `uv run python -m phaze.entrypoint` from compose. Public `main()` reads three env vars: `PHAZE_CERTS_DIR` (default `/certs`), `PHAZE_API_HOST` (default `localhost`), `PHAZE_API_TLS_SANS` (default `localhost,127.0.0.1,api`); calls `ensure_certs_present(Path(certs_dir), cn=cn, sans_csv=sans)`; then `os.execvp("uv", ["uv", "run", "uvicorn", "phaze.main:app", "--host", "0.0.0.0", "--port", "8000", "--ssl-keyfile", str(certs_dir/"phaze-server.key"), "--ssl-certfile", str(certs_dir/"phaze-server.crt")])`. `os.execvp` is intentional (not subprocess) so signals + PID-1 propagate cleanly. Guard: `if __name__ == "__main__": main()`. No imports from `phaze.database`, `phaze.config`, or anything Postgres-ish β€” only `os`, `sys`, `pathlib.Path`, and `phaze.cert_bootstrap`. Add `# noqa: S606` on the execvp call if ruff flags it (this is the entrypoint design intent). + +Write `tests/test_cert_bootstrap.py` with **7 LOCKED cases** (6 original + 1 WARNING-8 addition): + +(1) first-call generates all 4 files and they all parse via `x509.load_pem_x509_certificate` (the keys parse via `serialization.load_pem_private_key`); + +(2) second call: capture mtimes from the 4 files after the first call, invoke again, assert mtimes unchanged; + +(3) **banner-via-stdout (Test 3):** `capsys.readouterr().out` contains "GENERATED NEW PHAZE INTERNAL CA" and contains neither `"BEGIN"` nor `"PRIVATE KEY"`; + +(4) `(tmp_path / "phaze-ca.crt").stat().st_mode & 0o777 == 0o644` and similar for the other three with 0o600 on the keys; + +(5) parse the leaf cert, extract its SubjectAlternativeName extension, assert the entries match `sans_csv` (e.g., 3 entries for default); + +(6) `_parse_san_entries("localhost,127.0.0.1,api")` returns 3 items where item 1 is DNSName("localhost"), item 2 is IPAddress(IPv4Address("127.0.0.1")), item 3 is DNSName("api")); + +**(7) WARNING-8 β€” banner-via-logger.warning (NEW):** + +```python +def test_banner_emitted_via_logger_warning(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """CONTEXT D-02 D-discretion 'Both' (Phase 29 WARNING-8 resolution): + the banner MUST be emitted via BOTH print() AND logger.warning(). + Test 3 (capsys) covers the print() path; this test covers the + logger.warning() path independently β€” a future refactor that drops + one path while keeping the other would slip past Test 3 alone.""" + with caplog.at_level(logging.WARNING, logger="phaze.cert_bootstrap"): + ensure_certs_present(tmp_path, cn="localhost", sans_csv="localhost,127.0.0.1,api") + banner_records = [ + r for r in caplog.records + if r.levelname == "WARNING" + and r.name == "phaze.cert_bootstrap" + and "GENERATED NEW PHAZE INTERNAL CA" in r.getMessage() + ] + assert banner_records, ( + f"Expected at least one WARNING-level log record from phaze.cert_bootstrap " + f"containing 'GENERATED NEW PHAZE INTERNAL CA'; got records: " + f"{[(r.levelname, r.name, r.getMessage()) for r in caplog.records]}" + ) + # Also assert the logger never leaks the private-key blob (parity with Test 3 for the print path): + for r in banner_records: + assert "BEGIN" not in r.getMessage(), f"banner record leaked PEM marker: {r.getMessage()}" + assert "PRIVATE KEY" not in r.getMessage(), f"banner record leaked private-key string: {r.getMessage()}" +``` + +Use `pytest`'s `tmp_path` fixture. Use `capsys` for Test 3 and `caplog` for Test 7. No new test deps required (cryptography is now a runtime dep so `x509.load_pem_x509_certificate` is importable from tests). + +Extend `tests/test_task_split.py` with `test_cert_bootstrap_stays_postgres_free()` per PATTERNS lines 925-947. Mirror the existing subprocess pattern at lines 33-73 byte-for-byte except: the imported module is `phaze.cert_bootstrap` (not `phaze.tasks.agent_worker`); no env vars are required (cert_bootstrap doesn't call `get_settings()`). The forbidden module list stays the same triple: `("phaze.database", "phaze.tasks.session", "sqlalchemy.ext.asyncio")`. Use `subprocess.run([sys.executable, "-c", script], ...)` with `timeout=20`, `check=False`. Assert `result.returncode == 0` with the diagnostic stdout/stderr. + + + uv run pytest tests/test_cert_bootstrap.py tests/test_task_split.py -x -q + + +- `uv add 'cryptography>=46.0.0,<49'` succeeded; `uv pip show cryptography` returns 46.x or higher +- `pyproject.toml` line for `cryptography>=46.0.0,<49` exists in alphabetical position +- `uv.lock` contains a cryptography entry; committed alongside pyproject.toml +- `src/phaze/cert_bootstrap.py` exports `ensure_certs_present`; module has the IMPORT-BOUNDARY INVARIANT docstring banner +- `src/phaze/entrypoint.py` exports `main()`; invokable as `python -m phaze.entrypoint` +- `tests/test_cert_bootstrap.py` has **7 cases** (6 original + WARNING-8 banner-via-logger), all pass under `uv run pytest tests/test_cert_bootstrap.py` +- `tests/test_task_split.py::test_cert_bootstrap_stays_postgres_free` passes +- `uv run ruff check src/phaze/cert_bootstrap.py src/phaze/entrypoint.py` is clean +- `uv run mypy src/phaze/cert_bootstrap.py src/phaze/entrypoint.py` is clean + + + + + Task 2: Add api_tls_sans + agent_ca_file settings; wire verify= through PhazeAgentClient + construct_agent_client + src/phaze/config.py, src/phaze/services/agent_client.py, src/phaze/tasks/_shared/agent_bootstrap.py, tests/test_services/test_agent_client_tls.py + + - `AgentSettings(...)` accepts `PHAZE_AGENT_CA_FILE=/some/path` via env and exposes it as `cfg.agent_ca_file` (string). + - `BaseSettings()` exposes `cfg.api_tls_sans` with default `"localhost,127.0.0.1,api"` and accepts the `PHAZE_API_TLS_SANS` env override. + - `PhazeAgentClient(base_url=..., token=..., verify="/path/to/ca.crt")` constructs without error AND `client._client._transport` SSL context reflects the provided CA path. + - `PhazeAgentClient(base_url=..., token=...)` (no verify kwarg) still constructs cleanly (default `verify=True` preserved per Pitfall 10). + - `construct_agent_client(cfg)` with `cfg.agent_ca_file` pointing at a missing file raises `RuntimeError("CA file empty or unreadable: ...")`. + - `construct_agent_client(cfg)` with a valid CA file path passes `verify=cfg.agent_ca_file` through to `PhazeAgentClient.__init__`. + - test_agent_client_tls.py wrong-CA test passes: a real in-process uvicorn smoke server presents `tmp_path_1/phaze-server.{crt,key}`, an `httpx.AsyncClient(verify=tmp_path_2/phaze-ca.crt)` POST to it raises `httpx.ConnectError`. + - test_agent_client_tls.py correct-CA test passes: same setup but `verify=tmp_path_1/phaze-ca.crt` returns the expected response. + - Existing `tests/test_services/test_agent_client*.py` respx tests continue to pass unchanged (Pitfall 10). + - `uv run ruff check src/phaze/config.py src/phaze/services/agent_client.py src/phaze/tasks/_shared/agent_bootstrap.py tests/test_services/test_agent_client_tls.py` clean. + - `uv run mypy src/phaze/config.py src/phaze/services/agent_client.py src/phaze/tasks/_shared/agent_bootstrap.py` clean. + + + - src/phaze/config.py (existing AliasChoices pattern lines 102-122 on BaseSettings; lines 153-172 on AgentSettings; existing model_validator pattern lines 64-90 for `_enforce_localhost_only`) + - src/phaze/services/agent_client.py (existing __init__ at lines 118-131; `import httpx` already present) + - src/phaze/tasks/_shared/agent_bootstrap.py (existing construct_agent_client at lines 44-57; existing auth-hint helpers; existing logger pattern) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 3: httpx verify= against an internal CA" lines 443-497 (signature + exception path + two test-strategy options) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/services/agent_client.py" lines 544-587 (existing β†’ modified __init__ diff) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/tasks/_shared/agent_bootstrap.py" lines 590-619 (existing β†’ modified construct_agent_client diff) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/config.py" lines 706-758 (3 new fields + model_validator) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_services/test_agent_client_tls.py" lines 1010-1022 (two-CA cert generation + real uvicorn smoke server) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pitfall 10: Forgetting to add verify= to the test smoke client" lines 1016-1020 (preserve default=True) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-03 (agent_ca_file field + PhazeAgentClient passes through + empty-file RuntimeError) + - src/phaze/cert_bootstrap.py (will be created in Task 1; the new tests call ensure_certs_present(tmp_path, ...) twice with distinct dirs to get two CA bundles) + + +Modify `src/phaze/services/agent_client.py::PhazeAgentClient.__init__` per PATTERNS lines 544-587: add `verify: ssl.SSLContext | str | bool = True` as a keyword-only parameter (between `timeout` and `_client`); thread `verify=verify` into `httpx.AsyncClient(...)`. Add `import ssl` at module top (alphabetical with existing stdlib imports). Default `True` is mandatory β€” preserves the existing respx-based tests (Pitfall 10) which mock at the transport layer below TLS. + +Modify `src/phaze/tasks/_shared/agent_bootstrap.py::construct_agent_client` per PATTERNS lines 590-619: add `from pathlib import Path` at the top. Body: read `ca_path = Path(cfg.agent_ca_file)`; if not `ca_path.exists()` OR `ca_path.stat().st_size == 0`, raise `RuntimeError(f"CA file empty or unreadable: {cfg.agent_ca_file}")` (fail-fast per D-03). Pass `verify=cfg.agent_ca_file` to `PhazeAgentClient(...)` along with the existing `base_url`, `token`, `timeout=30.0` args. + +Modify `src/phaze/config.py` per PATTERNS lines 706-758: +1. Add `from typing import Literal` to the existing `typing` import line (it currently imports `Annotated`). +2. On `BaseSettings` (D-02): add field `api_tls_sans: str` with `default="localhost,127.0.0.1,api"`, `validation_alias=AliasChoices("PHAZE_API_TLS_SANS", "api_tls_sans")`, description `"Comma-separated SAN list for the auto-generated leaf cert (Phase 29 D-02)."` +3. On `AgentSettings` (D-03): add field `agent_ca_file: str` with `default="/certs/phaze-ca.crt"`, `validation_alias=AliasChoices("PHAZE_AGENT_CA_FILE", "agent_ca_file")`, description `"Path to the operator-distributed CA cert for verifying the app-server TLS endpoint (Phase 29 D-03)."` +4. NO `agent_env` field in this plan β€” that lives with the redis-password validator in Plan 02. + +Write `tests/test_services/test_agent_client_tls.py` per PATTERNS lines 1010-1022 + RESEARCH Β§Pattern 3 lines 491-497 (Option 1: real TLS smoke server). Fixture setup: invoke `ensure_certs_present(tmp_path_a, cn="localhost", sans_csv="127.0.0.1,localhost")` and `ensure_certs_present(tmp_path_b, cn="localhost", sans_csv="127.0.0.1,localhost")` to get TWO independent CA bundles. Spin up a minimal `uvicorn` server in a thread (`uvicorn.Config(app=FastAPI(... GET /test returns 200 JSON ...), host="127.0.0.1", port=, ssl_keyfile=str(tmp_path_a/"phaze-server.key"), ssl_certfile=str(tmp_path_a/"phaze-server.crt"))`); use `uvicorn.Server(config)` + `server.serve()` in a background `asyncio.Task` after `lifespan="off"`. Wait until the port accepts connections (TCP poll, 5s timeout). + +Test 1 (wrong-CA β†’ ConnectError): construct `httpx.AsyncClient(base_url=f"https://127.0.0.1:{port}", verify=str(tmp_path_b/"phaze-ca.crt"))`; `with pytest.raises(httpx.ConnectError): await client.get("/test")` per RESEARCH line 484. + +Test 2 (correct-CA β†’ success): `httpx.AsyncClient(base_url=..., verify=str(tmp_path_a/"phaze-ca.crt"))`; `response = await client.get("/test")`; `assert response.status_code == 200`. + +Test 3 (construct_agent_client empty-CA β†’ RuntimeError): point `AgentSettings(agent_ca_file=str(tmp_path / "nonexistent.crt"), ...)` at a missing path; `with pytest.raises(RuntimeError, match="CA file empty or unreadable"): construct_agent_client(cfg)`. + +Tear down: cancel the server task, await it; pytest_asyncio handles the rest. + +Mark the test with `pytest.mark.integration` (per `[tool.pytest.ini_options] markers` in pyproject.toml) so it can be skipped if needed. + + + uv run pytest tests/test_services/test_agent_client_tls.py tests/test_services/test_agent_client.py -x -q + + +- `AgentSettings(agent_ca_file="/foo")` accepts the value via env or kwarg +- `BaseSettings(api_tls_sans="x,y,z")` accepts the value +- `PhazeAgentClient(..., verify="path")` constructs cleanly; `PhazeAgentClient(...)` without verify still defaults to True +- `construct_agent_client(cfg)` raises `RuntimeError` on missing CA file +- `tests/test_services/test_agent_client_tls.py` 3 tests pass +- All existing `tests/test_services/test_agent_client*.py` respx tests still pass (default verify=True preserved) +- `uv run ruff check`/`mypy` clean on all touched files + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| operator filesystem β†’ ./certs/ bind mount | host file ops generate + persist server + CA keys | +| ./certs/ bind mount β†’ api container | uvicorn reads server cert + key at startup | +| api container TLS port (8000) β†’ agent httpx client | TLS handshake on every agent call | +| operator scp β†’ file-server ./certs/ | manual one-time CA distribution (D-03; out-of-band) | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-01-01 | Spoofing | agent β†’ app-server TLS handshake | mitigate | `httpx.AsyncClient(verify=cfg.agent_ca_file)` rejects any cert not signed by the operator-distributed CA; D-04 integration test (`tests/test_services/test_agent_client_tls.py`) proves `httpx.ConnectError` on wrong-CA | +| T-29-01-02 | Information Disclosure | CA private key (`phaze-ca.key`) leak | mitigate | `0o600 root:root`, written to bind-mount `./certs/` on app-server only; NEVER copied to file servers (only `phaze-ca.crt` is distributed); banner constant test asserts no `BEGIN`/`PRIVATE KEY` strings in stdout (Pitfall 4) | +| T-29-01-03 | Information Disclosure | banner leaking private key on regeneration via logger.warning() | mitigate | `_BANNER` is a literal constant referencing only `phaze-ca.crt` path; Test 3 (capsys) asserts stdout doesn't leak; Test 7 (WARNING-8 caplog) asserts the WARNING-level log records also do not contain `BEGIN` or `PRIVATE KEY` β€” coverage of BOTH banner channels | +| T-29-01-04 | Tampering | malicious cryptography package via supply chain | mitigate | Version-pinned `cryptography>=46.0.0,<49`; PyCA-maintained (Python Cryptographic Authority); Apache-2.0/BSD-3-Clause; pre-built abi3 wheels verified at PyPI (RESEARCH Β§Package Legitimacy Audit) | +| T-29-01-05 | Information Disclosure | bearer token leak via TLS-disabled fallback | mitigate | default `verify=True` keeps the existing respx tests' transport-mocked behavior; production agents always pass `verify=cfg.agent_ca_file` per Task 2; no `verify=False` code path anywhere | +| T-29-01-SC | Tampering | npm/pip/cargo installs | mitigate | `cryptography` is the only new dep; [OK] in RESEARCH Package Legitimacy Audit; pinned and lockfile committed | + + + +Full plan-level checks (run after both tasks complete): + +- `uv run pytest tests/test_cert_bootstrap.py tests/test_services/test_agent_client_tls.py tests/test_task_split.py -x -q` β€” all green (7 cert tests including WARNING-8 banner-via-logger + 3 TLS tests + extended task_split) +- `uv run pytest tests/test_services/test_agent_client.py tests/test_services/test_agent_bootstrap.py -x -q` β€” existing tests unaffected +- `uv run ruff check .` β€” clean (no T201/T201-line warning since cert_bootstrap uses `# noqa: T201` on the intentional banner print) +- `uv run ruff format --check .` β€” clean +- `uv run mypy src/phaze/cert_bootstrap.py src/phaze/entrypoint.py src/phaze/services/agent_client.py src/phaze/tasks/_shared/agent_bootstrap.py src/phaze/config.py` β€” clean +- `uv run python -c "import phaze.cert_bootstrap; ensure=phaze.cert_bootstrap.ensure_certs_present; print('ok')"` β€” imports cleanly +- `uv run python -c "from phaze.config import AgentSettings; cfg=AgentSettings(agent_api_url='https://x', agent_token='x'); print(cfg.agent_ca_file)"` β€” prints `/certs/phaze-ca.crt` + + + +- AUTH-02 partially closed: cert generation infrastructure exists; agent httpx clients pass `verify=` through; wrong-CA path raises `httpx.ConnectError` in CI +- AUTH-02 fully closed in Plan 03 once docker-compose.yml api command switches to `python -m phaze.entrypoint` and the cert-mounted volume is configured +- **7** new test cases in `test_cert_bootstrap.py` (6 original + WARNING-8 banner-via-logger.warning) + 3 in `test_agent_client_tls.py` + 1 in `test_task_split.py` (extension) = 11 new tests passing +- D-01, D-02, D-03, D-04, D-22 (cert-related portions) implemented β€” including the explicit dual-channel banner (print + logger.warning) per D-02 D-discretion "Both" +- Postgres-free invariant extended to `phaze.cert_bootstrap` and `phaze.entrypoint` + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-01-SUMMARY.md` when both tasks complete. Summary must list: files created, files modified, new tests added (note the WARNING-8 7th banner-via-logger test specifically), decision IDs implemented (D-01, D-02, D-03, D-04, D-22 cert portion), and any deviations from the plan. + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-01-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-01-SUMMARY.md new file mode 100644 index 0000000..45d5b87 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-01-SUMMARY.md @@ -0,0 +1,193 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 01 +subsystem: auth +tags: [phase-29, auth, tls, cert-bootstrap, security, v4.0, cryptography, httpx-verify] + +requires: + - phase: 26-task-code-reorg-http-backed-agent-worker + provides: PhazeAgentClient, AgentSettings, construct_agent_client, import-boundary invariant (D-25) + - phase: 27-watcher-service-user-initiated-scan + provides: phaze.tasks._shared.agent_bootstrap module (D-17) +provides: + - phaze.cert_bootstrap.ensure_certs_present (idempotent CA + leaf x509 generation) + - phaze.entrypoint pre-uvicorn shim (runs cert bootstrap then execvp uvicorn) + - PhazeAgentClient.__init__ verify= kwarg (default True; preserves Pitfall 10) + - AgentSettings.agent_ca_file (D-03; default /certs/phaze-ca.crt) + - BaseSettings.api_tls_sans (D-02; default localhost,127.0.0.1,api) + - construct_agent_client fail-fast on missing/empty CA file (D-03) + - tests/test_services/test_agent_client_tls.py (D-04 wrong-CA β†’ ConnectError) + - tests/test_cert_bootstrap.py (D-22; 7 LOCKED cases incl. WARNING-8 banner-via-logger) + - tests/test_task_split.py::test_cert_bootstrap_stays_postgres_free (D-22 extension of D-25) +affects: + - Phase 29 Plan 03 (docker-compose api command switches to python -m phaze.entrypoint) + - Phase 29 Plan 02 (Redis hardening reuses BaseSettings.api_tls_sans pattern) + - All Phase 30+ agent code that constructs httpx clients to the app server + +tech-stack: + added: + - cryptography>=46.0.0,<49 (resolved 48.0.0; PyCA-maintained; abi3 wheels) + - cffi v2.0.0, pycparser v3.0 (transitive via cryptography) + patterns: + - "Pre-uvicorn entrypoint shim: bootstrap-then-execvp so signals + PID-1 propagate cleanly" + - "Idempotent self-signed CA + leaf generation via cryptography.x509.CertificateBuilder (ECDSA P-256)" + - "verify= kwarg pass-through with default=True to preserve respx-mocked transport tests (Pitfall 10)" + - "Banner emission via BOTH print() AND logger.warning() (CONTEXT D-02 D-discretion 'Both')" + - "Postgres-free import boundary extended to pre-uvicorn modules (cert_bootstrap inherits Phase 26 D-25)" + +key-files: + created: + - src/phaze/cert_bootstrap.py + - src/phaze/entrypoint.py + - tests/test_cert_bootstrap.py + - tests/test_services/test_agent_client_tls.py + modified: + - pyproject.toml (added cryptography dep) + - uv.lock (cryptography 48.0.0 + cffi + pycparser) + - src/phaze/config.py (BaseSettings.api_tls_sans, AgentSettings.agent_ca_file) + - src/phaze/services/agent_client.py (PhazeAgentClient verify= kwarg) + - src/phaze/tasks/_shared/agent_bootstrap.py (construct_agent_client CA-file fail-fast + verify pass-through) + - tests/test_task_split.py (added test_cert_bootstrap_stays_postgres_free) + +key-decisions: + - "ECDSA P-256 over RSA-3072 for CA + leaf keys (CONTEXT D-discretion: faster + smaller; verified compat with httpx + Python 3.13 ssl)" + - "AuthorityKeyIdentifier + SubjectKeyIdentifier + ExtendedKeyUsage(SERVER_AUTH) added during integration testing (Rule 1 bug fix; Python 3.13 ssl rejects chain without them)" + - "verify= default True preserves all existing respx tests (Pitfall 10 confirmed in CI)" + - "Banner literal references only phaze-ca.crt path; never templates the private key (Pitfall 4 + Test 3 + Test 7)" + - "WARNING-8 7th test case added per CONTEXT D-02 D-discretion 'Both': caplog-level assertion that banner emission via logger.warning() is independently mandatory (Test 3 covers print path)" + - "cryptography is NOT a transitive dep (RESEARCH Critical Discovery #1 verified via uv pip list); explicit pyproject.toml add was non-negotiable" + +patterns-established: + - "Pre-uvicorn entrypoint shim invoked via `uv run python -m phaze.entrypoint`; reads env vars directly (no get_settings() at that layer); execvp's uvicorn with --ssl-keyfile / --ssl-certfile pointing at the freshly-generated leaf cert" + - "x509.CertificateBuilder pattern with full chain extensions (BasicConstraints, KeyUsage, SubjectKeyIdentifier, AuthorityKeyIdentifier on leaf, ExtendedKeyUsage(SERVER_AUTH))" + - "Test pattern for real-TLS integration: uvicorn.Server in background asyncio task, two independent tmp_path CA bundles to prove both wrong-CA β†’ ConnectError and correct-CA β†’ 200" + - "AgentSettings fail-fast pattern (D-03) for CA file: ca_path.exists() AND st_size > 0 at construction time; RuntimeError with operator-actionable message" + +requirements-completed: [AUTH-02] + +duration: ~45min +completed: 2026-05-16 +--- + +# Phase 29 Plan 01: Cert Bootstrap + Agent TLS Verify Summary + +**Self-signed CA + leaf cert auto-generation infrastructure (Postgres-free) plus end-to-end `verify=` plumbing on every agent's httpx client, with a CI integration test that proves untrusted certs are rejected.** + +## What Shipped + +### Cert generation primitive +- New module `src/phaze/cert_bootstrap.py` (220 lines). Exports `ensure_certs_present(certs_dir, cn, sans_csv)`. Idempotent: re-running on a populated directory parses the existing CA + leaf and returns immediately. On generation, writes 4 files: + - `phaze-ca.crt` 0o644 (public; distributed to agents) + - `phaze-ca.key` 0o600 (private CA signing key) + - `phaze-server.crt` 0o644 + - `phaze-server.key` 0o600 +- ECDSA P-256 keys. 10-year CA, 2-year leaf. Full chain extensions: BasicConstraints (critical), KeyUsage (critical), SubjectKeyIdentifier on both, AuthorityKeyIdentifier on leaf, ExtendedKeyUsage(SERVER_AUTH) on leaf, SubjectAlternativeName from the operator-supplied SAN list. +- IMPORT-BOUNDARY INVARIANT: no `phaze.database` / `phaze.tasks.session` / `sqlalchemy.ext.asyncio` imports. Verified by `tests/test_task_split.py::test_cert_bootstrap_stays_postgres_free`. + +### Pre-uvicorn entrypoint shim +- New module `src/phaze/entrypoint.py` (70 lines). Invoked as `uv run python -m phaze.entrypoint`. Reads `PHAZE_CERTS_DIR` / `PHAZE_API_HOST` / `PHAZE_API_TLS_SANS` env vars (all with safe defaults), calls `ensure_certs_present`, then `os.execvp`'s uvicorn with `--ssl-keyfile` / `--ssl-certfile`. Process replacement (not subprocess) so signals + PID-1 propagate cleanly. + +### Settings + client wiring +- `BaseSettings.api_tls_sans` (D-02): default `"localhost,127.0.0.1,api"`. Env alias `PHAZE_API_TLS_SANS`. +- `AgentSettings.agent_ca_file` (D-03): default `"/certs/phaze-ca.crt"`. Env alias `PHAZE_AGENT_CA_FILE`. +- `PhazeAgentClient.__init__` accepts `verify: ssl.SSLContext | str | bool = True` (kw-only, default `True` preserves Pitfall 10). Threaded to `httpx.AsyncClient(verify=...)`. +- `construct_agent_client(cfg)` validates `cfg.agent_ca_file` at construction time: missing OR zero-byte β†’ `RuntimeError("CA file empty or unreadable: ...")`. Passes `verify=cfg.agent_ca_file` to the client. + +### Tests +- `tests/test_cert_bootstrap.py` β€” **7 LOCKED cases**: + 1. First call generates 4 files; all parse via `x509.load_pem_x509_certificate` + `serialization.load_pem_private_key`. + 2. Second call leaves mtimes unchanged (idempotency). + 3. Banner stdout contains "GENERATED NEW PHAZE INTERNAL CA"; never "BEGIN" or "PRIVATE KEY" (Pitfall 4). + 4. File modes: 0o644 / 0o600. + 5. Leaf SubjectAlternativeName matches sans_csv (3 entries for default). + 6. `_parse_san_entries` DNSName vs IPAddress dispatch. + 7. **WARNING-8** β€” banner emitted via `logger.warning()` at level WARNING with logger name `phaze.cert_bootstrap`; banner records also never leak `BEGIN` / `PRIVATE KEY` (parity with Test 3 for the logger channel). +- `tests/test_services/test_agent_client_tls.py` β€” **4 cases**: + - `test_wrong_ca_raises_connect_error`: real uvicorn smoke server presenting one CA's cert; `httpx.AsyncClient(verify=other_ca)` β†’ `httpx.ConnectError`. **D-04 success criterion**. + - `test_correct_ca_succeeds`: same server, correct CA β†’ 200 OK. + - `test_construct_agent_client_missing_ca_raises`: D-03 fail-fast on non-existent path. + - `test_construct_agent_client_empty_ca_raises`: D-03 fail-fast on zero-byte file. +- `tests/test_task_split.py::test_cert_bootstrap_stays_postgres_free` β€” D-22 extension of D-25. + +## Verification Results + +``` +uv run pytest tests/test_cert_bootstrap.py tests/test_services/test_agent_client_tls.py tests/test_task_split.py tests/test_services/test_agent_client.py tests/test_services/test_agent_client_exec_batch_progress.py -q +36 passed, 2 warnings in 15.14s +``` + +- 7 cert_bootstrap cases pass (RED β†’ GREEN cycle: ffdbf5f β†’ 5840bfe). +- 4 TLS integration cases pass (RED β†’ GREEN: 57d9843 β†’ 25c4ca4). +- 5 task_split cases pass (incl. new cert_bootstrap-Postgres-free case). +- 20 existing respx-based `test_agent_client*` cases pass unchanged β€” **Pitfall 10 confirmed**: `verify=True` default preserves transport-layer mocking. +- `uv run ruff check` + `uv run ruff format --check` + `uv run mypy` clean on all touched modules. +- `uv run bandit -x tests -s B608` clean on `cert_bootstrap.py` + `entrypoint.py`. +- `uv run python -c "import phaze.cert_bootstrap; ensure=phaze.cert_bootstrap.ensure_certs_present; print('ok')"` β€” module imports cleanly. +- Default settings verified: `AgentSettings.agent_ca_file == "/certs/phaze-ca.crt"`, `BaseSettings.api_tls_sans == "localhost,127.0.0.1,api"`. + +## Commits + +| Hash | Type | Phase | Subject | +| ------- | ---- | ----- | ---------------------------------------------------------------------- | +| ffdbf5f | test | RED | add failing tests for cert_bootstrap + Postgres-free guard | +| 5840bfe | feat | GREEN | implement cert_bootstrap + entrypoint shim | +| 57d9843 | test | RED | add TLS integration tests + fix CA chain extensions (Rule 1 bug fix) | +| 25c4ca4 | feat | GREEN | wire verify= through PhazeAgentClient + AgentSettings | + +## Deviations from Plan + +### Auto-fixed Issues + +**1. [Rule 1 - Bug] Missing AuthorityKeyIdentifier / SubjectKeyIdentifier / ExtendedKeyUsage extensions on the leaf cert** + +- **Found during:** Task 2 (running `test_correct_ca_succeeds` against the real cert chain). +- **Issue:** Python 3.13's `ssl` module rejects the leaf cert with "Missing Authority Key Identifier" (and would similarly reject without SubjectKeyIdentifier on CA + ExtendedKeyUsage(SERVER_AUTH) on the leaf) when used in a TLS chain. The RESEARCH Β§Pattern 1 source snippet in 29-RESEARCH.md (lines 297-339) does NOT include these extensions β€” verified by re-reading the source β€” yet Python 3.13's strict TLS validation requires them. +- **Fix:** Added `SubjectKeyIdentifier.from_public_key(...)` to both CA and leaf, `AuthorityKeyIdentifier.from_issuer_public_key(...)` to the leaf, and `ExtendedKeyUsage([SERVER_AUTH])` to the leaf. All 7 cert_bootstrap unit tests still pass (the assertions don't probe these extensions specifically) and the integration test now succeeds with the cert chain validating end-to-end. +- **Files modified:** `src/phaze/cert_bootstrap.py` (lines ~85-95 CA, ~125-145 leaf). +- **Commit:** 57d9843 (folded into the Task 2 RED commit since the bug was discovered while writing the RED test for Task 2 and the fix unblocked test_correct_ca_succeeds). + +### Authentication gates + +None. + +### Architectural decisions + +None β€” the integration-test setup uses real uvicorn + asyncio.Task per the plan's `` block (RESEARCH Β§Pattern 3 Option 1). No design change. + +## Threat Flags + +None. The plan's `` already enumerates the surfaces this plan touches (T-29-01-01..T-29-01-SC). No new surface was introduced beyond what the model anticipated. + +## Known Stubs + +None. All wiring is end-to-end functional: +- `cert_bootstrap.ensure_certs_present` writes real x509 files that pass Python 3.13's ssl chain validation. +- `entrypoint.main()` invokes a real `os.execvp` (not a no-op). +- `PhazeAgentClient(verify=...)` flows to `httpx.AsyncClient(verify=...)`. +- `construct_agent_client` raises `RuntimeError` on missing CA path (tested in test 3 and test 4 of `test_agent_client_tls.py`). + +The remaining AUTH-02 work β€” switching `docker-compose.yml` api command to `python -m phaze.entrypoint` and mounting `./certs/` β€” lands in **Plan 03** per the plan's `` note: "AUTH-02 fully closed in Plan 03 once docker-compose.yml api command switches to `python -m phaze.entrypoint` and the cert-mounted volume is configured". + +## TDD Gate Compliance + +Both tasks followed the RED β†’ GREEN cycle: +- Task 1: ffdbf5f (test) β†’ 5840bfe (feat). RED state confirmed: pytest reported `ModuleNotFoundError: No module named 'phaze.cert_bootstrap'` before the GREEN commit. GREEN state confirmed: 7/7 tests pass. +- Task 2: 57d9843 (test, includes Rule 1 bug fix) β†’ 25c4ca4 (feat). RED state confirmed: 2 of 4 cases failed (`test_construct_agent_client_*`) with `TypeError: AgentSettings.__init__() got an unexpected keyword argument 'agent_ca_file'` (and prior, before the cert chain fix, `test_correct_ca_succeeds` also failed with the AKI error). GREEN state confirmed: 4/4 tests pass. + +No REFACTOR commit was needed β€” both modules landed in their final shape in the GREEN commit. + +## Self-Check: PASSED + +Files claimed to be created β€” all present: +- `src/phaze/cert_bootstrap.py` β€” FOUND +- `src/phaze/entrypoint.py` β€” FOUND +- `tests/test_cert_bootstrap.py` β€” FOUND +- `tests/test_services/test_agent_client_tls.py` β€” FOUND + +Commits claimed β€” all present in `git log --oneline`: +- ffdbf5f β€” FOUND +- 5840bfe β€” FOUND +- 57d9843 β€” FOUND +- 25c4ca4 β€” FOUND + +Test count matches plan success criteria: 7 (cert_bootstrap, incl. WARNING-8) + 4 (TLS, plan said 3 but the second fail-fast case for empty-CA was added to fully cover D-03's "missing OR empty" predicate) + 1 (task_split extension) = 12 net new tests passing. diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-02-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-02-PLAN.md new file mode 100644 index 0000000..ec925d8 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-02-PLAN.md @@ -0,0 +1,183 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 02 +type: execute +wave: 1 +depends_on: [] +files_modified: + - src/phaze/config.py + - tests/test_config/test_agent_settings_redis_password.py + - tests/test_config/__init__.py +autonomous: true +requirements: [AUTH-03] +tags: [phase-29, auth, redis, security, v4.0] + +must_haves: + truths: + - "AgentSettings has a typed agent_env field bound to PHAZE_AGENT_ENV via AliasChoices (D-06)" + - "When agent_env='production', AgentSettings refuses to construct with a passwordless redis_url (ValidationError)" + - "When agent_env='dev' (default), passwordless redis_url is accepted (dev convenience)" + - "A production redis_url with `redis://default:@host:6379` validates successfully" + artifacts: + - path: "src/phaze/config.py" + provides: "agent_env field on AgentSettings + _enforce_redis_password_in_production model_validator" + contains: "agent_env" + - path: "tests/test_config/test_agent_settings_redis_password.py" + provides: "3 test cases: prod+passwordless=fail, prod+password=ok, dev+passwordless=ok" + min_lines: 60 + exports: [] + - path: "tests/test_config/__init__.py" + provides: "test package marker so pytest discovers test_config/" + contains: "" + key_links: + - from: "src/phaze/config.py::AgentSettings._enforce_redis_password_in_production" + to: "src/phaze/config.py::AgentSettings.redis_url" + via: "model_validator(mode='after') reads self.redis_url via urlparse" + pattern: "model_validator\\(mode=\"after\"\\)" + - from: "tests/test_config/test_agent_settings_redis_password.py" + to: "phaze.config.AgentSettings" + via: "import + AgentSettings(...) ValidationError tests" + pattern: "from phaze.config import AgentSettings" +--- + + +Land the production-mode Redis password validator on `AgentSettings` (CONTEXT.md D-06). The compose-side Redis hardening (`requirepass` + LAN-bound port) lives in Plan 03 alongside the docker-compose rewrite; this plan delivers the agent-side guard that refuses passwordless Redis URLs in production. Together they close AUTH-03. + +Purpose: AUTH-03 requires Redis on the application server to require `requirepass` AND that agents connect with `redis://default:@:6379`. Per D-06, the agent-side guard belongs in the AgentSettings model_validator so a misconfigured production agent fails fast at startup rather than connecting to an unsecured Redis. + +Output: One new `agent_env: Literal["dev", "production"]` field + one `model_validator(mode="after")` that parses `self.redis_url` and refuses missing-password URLs when `agent_env == "production"`. Three test cases covering the matrix. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md + + + + + + +```python +from pydantic import AliasChoices, Field, SecretStr, field_validator, model_validator +``` + + + + + + + + + + Task 1: Add agent_env field + production-mode model_validator on AgentSettings + src/phaze/config.py, tests/test_config/__init__.py, tests/test_config/test_agent_settings_redis_password.py + + - `AgentSettings(agent_env="production", redis_url="redis://localhost:6379/0", agent_api_url="https://x", agent_token=SecretStr("x"))` raises `pydantic.ValidationError` containing the substring `"requires a password in redis_url"`. + - `AgentSettings(agent_env="production", redis_url="redis://default:secret@localhost:6379/0", agent_api_url="https://x", agent_token=SecretStr("x"))` constructs without error. + - `AgentSettings(agent_env="dev", redis_url="redis://localhost:6379/0", ...)` constructs without error (dev allows passwordless). + - `AgentSettings()` with no `agent_env` set defaults to `"dev"` (preserves existing dev workflow). + - `PHAZE_AGENT_ENV=production` env var maps onto the `agent_env` field via `AliasChoices`. + - `tests/test_config/__init__.py` exists (may be empty file). + - `uv run mypy src/phaze/config.py` clean β€” `Literal["dev","production"]` type-checks. + - `uv run ruff check src/phaze/config.py tests/test_config/test_agent_settings_redis_password.py` clean. + + + - src/phaze/config.py (full file β€” current AgentSettings class structure; existing `_enforce_localhost_only` field_validator at lines 64-90 as the analog for new model_validator; existing AliasChoices pattern at lines 153-172 for agent_token/agent_api_url; existing imports at lines 1-18) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-06 (full requirement: agent_env field + production-mode model_validator + refuses passwordless redis URLs) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 4: Redis hardening in compose" lines 499-524 (URL parsing semantics; `default` is the redis ACL default user) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/config.py" lines 706-758 (3 new fields + model_validator literal target code) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_config/test_agent_settings_redis_password.py" lines 1050-1059 (analog: tests/test_config_role_split.py) + - tests/test_config_role_split.py (existing analog β€” monkeypatch env vars + get_settings.cache_clear() pattern) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pitfall 7: Compose env-var fail-fast vs. dev convenience" lines 998-1003 (dev default `REDIS_PASSWORD=changeme` so dev `docker compose up` works; agent_env=dev is the matching agent-side default) + + +Modify `src/phaze/config.py`: + +1. Add `Literal` to the existing `typing` imports (line 1 currently has `from typing import Annotated`). After Plan 01 it may already be present; if so, no-op. The final import becomes `from typing import Annotated, Literal`. + +2. Add `from urllib.parse import urlparse` to the stdlib imports (insert alphabetically near other stdlib imports; if no stdlib imports exist yet, add it just after `from __future__ import annotations`). + +3. On `AgentSettings` class (the file-bound subclass declared after `BaseSettings`), add one new field, placed BETWEEN the existing `agent_token` field and the `watcher_settle_seconds` field (or wherever AgentSettings fields end before its validators) to keep all PHAZE_AGENT_* fields contiguous: + + - `agent_env: Literal["dev", "production"]` with `Field(default="dev", validation_alias=AliasChoices("PHAZE_AGENT_ENV", "agent_env"), description="Deployment mode. Production refuses passwordless Redis URLs (Phase 29 D-06).")`. + +4. Add a new `model_validator(mode="after")` method on `AgentSettings`, placed AFTER all field declarations and the existing field_validators. Method signature: `def _enforce_redis_password_in_production(self) -> "AgentSettings":`. Body: if `self.agent_env == "production"`, parse `urlparse(self.redis_url)`; if `parsed.password is None or parsed.password == ""`, raise `ValueError("agent_env=production requires a password in redis_url (Phase 29 D-06)")`. Return `self` at the end. Use a Phase-29-tagged docstring `"""D-06: production refuses passwordless redis_url."""`. + +5. Per CONTEXT.md D-06 and RESEARCH.md Pattern 4, do NOT add `redis_password` as a separate field β€” agents read the full URL with the password embedded (`redis://default:@host:6379/0`). The validator just parses what's already in `redis_url`. + +6. The new field uses `model_validator(mode="after")` (not `field_validator`) because the validator needs access to BOTH `redis_url` and `agent_env`. The `model_validator` import is already present at config.py line 16. + +Create `tests/test_config/__init__.py` as an empty file so pytest discovers the new sub-package. + +Write `tests/test_config/test_agent_settings_redis_password.py` with three tests + one extra defaults case = 4 cases. Mirror `tests/test_config_role_split.py` for the `monkeypatch.setenv` pattern (set the required `PHAZE_AGENT_API_URL`, `PHAZE_AGENT_TOKEN`, `PHAZE_ROLE` env vars; call `get_settings.cache_clear()` if `get_settings()` is used, OR construct `AgentSettings(...)` directly with kwargs). + +Test 1 β€” `test_production_refuses_passwordless_redis_url`: construct `AgentSettings(agent_env="production", redis_url="redis://localhost:6379/0", agent_api_url="https://api.test", agent_token=SecretStr("phaze_agent_test"))`; `with pytest.raises(ValidationError) as exc_info:` ... ; assert `"requires a password in redis_url"` is in `str(exc_info.value)`. + +Test 2 β€” `test_production_accepts_passworded_redis_url`: construct same but `redis_url="redis://default:secret@localhost:6379/0"`; should succeed; assert `cfg.agent_env == "production"`. + +Test 3 β€” `test_dev_accepts_passwordless_redis_url`: construct same but `agent_env="dev"` and `redis_url="redis://localhost:6379/0"`; should succeed; assert `cfg.agent_env == "dev"`. + +Test 4 β€” `test_default_agent_env_is_dev`: construct `AgentSettings(redis_url="redis://localhost:6379/0", agent_api_url="https://api.test", agent_token=SecretStr("phaze_agent_test"))`; assert `cfg.agent_env == "dev"` (default). + +Imports needed in the test file: `pytest`, `from pydantic import SecretStr, ValidationError`, `from phaze.config import AgentSettings`. No `monkeypatch` needed since the tests pass kwargs directly (cleaner than env-var indirection). + + + uv run pytest tests/test_config/test_agent_settings_redis_password.py -x -q + + +- `src/phaze/config.py` AgentSettings has a typed `agent_env: Literal["dev", "production"]` field +- `src/phaze/config.py` AgentSettings has `_enforce_redis_password_in_production` model_validator +- All 4 tests in `test_agent_settings_redis_password.py` pass +- `uv run mypy src/phaze/config.py` clean +- `uv run ruff check src/phaze/config.py tests/test_config/` clean +- Existing tests under `tests/test_config_role_split.py` (or wherever AgentSettings is constructed) still pass β€” no regression + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| operator env vars / .env file β†’ AgentSettings | the Redis URL (incl. password) crosses this boundary at agent startup | +| AgentSettings β†’ SAQ Queue.from_url | the validated URL flows into SAQ which delegates to redis-py | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-02-01 | Information Disclosure | passwordless Redis exposes SAQ queues to LAN attackers | mitigate | `agent_env=production` model_validator refuses passwordless `redis_url`; matched by Plan 03's `requirepass` + LAN binding on the server side | +| T-29-02-02 | Spoofing | unauthenticated agent β†’ app-server Redis queues (queue stuffing) | mitigate | same β€” both server-side `requirepass` (Plan 03) and client-side production-mode guard (this plan) close the path | +| T-29-02-03 | Operational | dev clones silently start with `agent_env=production` and break | accept | `default="dev"` is the locked CONTEXT D-06 choice; operators must explicitly opt-in to production mode. Pitfall 7 documents the matching server-side dev default (`REDIS_PASSWORD=changeme`) | +| T-29-02-04 | Tampering | misconfigured redis_url at runtime (typo, URL-encoding error) | mitigate | `urlparse(self.redis_url).password` resolves URL-encoded passwords correctly; a truly malformed URL falls through to a SAQ connection failure at queue construction time β€” surfaces fast | + + + +- `uv run pytest tests/test_config/ -x -q` β€” all green +- `uv run pytest tests/test_config_role_split.py -x -q` β€” no regression in existing config tests +- `uv run mypy src/phaze/config.py` β€” clean +- `uv run python -c "from phaze.config import AgentSettings; from pydantic import SecretStr; cfg = AgentSettings(agent_env='dev', redis_url='redis://localhost:6379/0', agent_api_url='https://x', agent_token=SecretStr('x')); print(cfg.agent_env)"` β€” prints `dev` + + + +- AUTH-03 partially closed (agent-side guard); server-side (`requirepass` + LAN binding) lands in Plan 03 +- D-06 fully implemented +- 4 new tests under `tests/test_config/` +- Pitfall 7 documented behavior preserved (dev default works for fresh clones) + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-02-SUMMARY.md` when the task completes. Summary must list: files modified, new tests added, decision IDs implemented (D-06), and any deviations. + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-02-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-02-SUMMARY.md new file mode 100644 index 0000000..6fa7157 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-02-SUMMARY.md @@ -0,0 +1,171 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 02 +subsystem: auth +tags: [phase-29, auth, redis, security, v4.0, agent-settings, model-validator] + +requires: + - phase: 26-task-code-reorg-http-backed-agent-worker + provides: AgentSettings (role-split BaseSettings subclass) +provides: + - AgentSettings.agent_env (D-06; Literal["dev","production"], default "dev", env alias PHAZE_AGENT_ENV) + - AgentSettings._enforce_redis_password_in_production model_validator (D-06) + - tests/test_config/__init__.py (pytest sub-package marker) + - tests/test_config/test_agent_settings_redis_password.py (4 cases) +affects: + - Phase 29 Plan 03 (docker-compose rewrite + Redis requirepass; server-side half of AUTH-03) + - All agent worker entrypoints that construct SAQ Queue.from_url(redis_url) in production + - .env.example.agent (Plan 03 will document `PHAZE_AGENT_ENV=production` alongside `REDIS_PASSWORD`) + +tech-stack: + added: [] + patterns: + - "Literal-typed deployment-mode selector field with AliasChoices env alias" + - "model_validator(mode='after') that reads multiple sibling fields (agent_env + redis_url) β€” the existing _enforce_required_agent_fields validator is the analog, and chains in order before this one" + - "URL-shape guard via urllib.parse.urlparse rather than regex (handles URL-encoded passwords; degenerate URLs fall through to SAQ connect-time failure)" + - "Dev-default opt-out (Pitfall 7): the strict guard only fires when the operator explicitly opts in to production mode" + +key-files: + created: + - tests/test_config/__init__.py + - tests/test_config/test_agent_settings_redis_password.py + modified: + - src/phaze/config.py (added Literal import; agent_env field; _enforce_redis_password_in_production model_validator) + +key-decisions: + - "D-06: agent_env defaults to 'dev' so fresh clones / Pitfall 7 work without ceremony; operator must explicitly set PHAZE_AGENT_ENV=production to engage the guard." + - "model_validator (not field_validator) β€” needs access to BOTH redis_url and agent_env on the same instance; field validators on redis_url cannot read agent_env in pydantic v2." + - "URL parsing via urllib.parse.urlparse rather than a regex β€” correctly handles URL-encoded passwords (`%40` for `@` in the password component, etc.). A malformed URL falls through to SAQ connect-time failure rather than a confusing pydantic validation error." + - "Validator placed AFTER _enforce_required_agent_fields so required-field checks run first; ordering matches the natural failure mode (missing api_url is a more obvious operator error than a passworded URL mismatch)." + - "NO separate `redis_password` field β€” the AUTH-03 contract is one full URL with the password embedded (`redis://default:@host:6379/0`). Adding a separate field would duplicate state and create skew risk." + - "Field placement: between agent_token and scan_roots to keep PHAZE_AGENT_* fields contiguous in the source." + +patterns-established: + - "Linked-field validation pattern on AgentSettings: model_validator(mode='after') reading self. and self. with a Phase-tagged docstring and an actionable error message that references the decision ID." + - "Test pattern for AgentSettings model contracts: pass kwargs directly to AgentSettings(...) rather than env-var monkeypatching (cleaner than the role-split tests' env-var pattern when the contract under test is the model itself, not the env-var β†’ field mapping)." + +requirements-completed: [] # AUTH-03 is partial (agent-side guard); fully closes when Plan 03 lands compose-side requirepass + LAN bind + +metrics: + duration: ~4min + tasks_complete: 1 + files_created: 2 + files_modified: 1 + tests_added: 4 + commits: 2 # 1 RED test + 1 GREEN feat (no REFACTOR needed) + +completed: 2026-05-16 +--- + +# Phase 29 Plan 02: AgentSettings Production-Mode Redis-Password Validator Summary + +**Agent-side guard that refuses passwordless Redis URLs when `agent_env=production`, closing the client half of AUTH-03. The server-side half (Redis `requirepass` + LAN-bound port) lands in Plan 03; this plan ensures a misconfigured production agent fails fast at startup rather than silently connecting to an unsecured Redis.** + +## What Shipped + +### Config: agent_env field + production-mode validator + +- **`AgentSettings.agent_env`** (D-06): `Literal["dev", "production"]`, default `"dev"`, env alias `PHAZE_AGENT_ENV` via `AliasChoices`. Placed adjacent to the other `PHAZE_AGENT_*` fields (between `agent_token` and `scan_roots`) for source-file grouping. Docstring references Phase 29 D-06 and the matching server-side hardening. +- **`AgentSettings._enforce_redis_password_in_production`** (D-06): `model_validator(mode="after")`. When `self.agent_env == "production"`, parses `urlparse(self.redis_url)`; if `parsed.password` is falsy, raises `ValueError("agent_env=production requires a password in redis_url (Phase 29 D-06)")`. Returns `self` otherwise. Placed AFTER `_enforce_required_agent_fields` so the required-field check runs first (matches the natural operator failure-mode ordering). +- **`from typing import Annotated, Literal`** β€” `Literal` added to the existing import; `urlparse` was already imported at the top of `config.py` (Phase 28 added it for the `_enforce_localhost_only` field-validator) so no new import was needed for that. + +No separate `redis_password` field. Per CONTEXT D-06 and RESEARCH Β§Pattern 4, the AUTH-03 contract is one full URL with the password embedded (`redis://default:@host:6379/0`); the validator simply parses what's already in `redis_url`. + +### Tests + +- **`tests/test_config/__init__.py`** β€” empty file; pytest sub-package marker so the new `test_config/` directory is discovered. +- **`tests/test_config/test_agent_settings_redis_password.py`** β€” **4 cases**: + 1. `test_production_refuses_passwordless_redis_url`: `agent_env="production"` + `redis://localhost:6379/0` raises `ValidationError`; error message contains `"requires a password in redis_url"`. + 2. `test_production_accepts_passworded_redis_url`: `agent_env="production"` + `redis://default:secret@localhost:6379/0` constructs successfully; `cfg.agent_env == "production"`. + 3. `test_dev_accepts_passwordless_redis_url`: `agent_env="dev"` + passwordless URL constructs OK; `cfg.agent_env == "dev"`. + 4. `test_default_agent_env_is_dev`: omitting `agent_env` yields `cfg.agent_env == "dev"` (existing-workflow guarantee). + +Tests pass kwargs directly to `AgentSettings(...)` rather than env-var monkeypatching β€” cleaner than the role-split tests' env-var pattern when the contract under test is the model itself, not the env-var β†’ field mapping. (Env-var mapping is implicitly covered by `AliasChoices` being identical to the existing patterns; a dedicated env-var test would duplicate role-split coverage.) + +## Verification Results + +``` +uv run pytest tests/test_config/test_agent_settings_redis_password.py -x -q +4 passed in 0.01s + +uv run pytest tests/test_config_role_split.py tests/test_config_worker.py \ + tests/test_config/ tests/test_task_split.py tests/test_main_lifespan.py -q +32 passed in 1.65s +``` + +- 4/4 new tests pass (RED β†’ GREEN cycle: 4b95029 β†’ a7741ff). +- 22 existing config tests pass (zero regression in `test_config_role_split.py` / `test_config_worker.py`). +- 6 task-split / main-lifespan tests pass (no import-boundary regression from the new import). +- `uv run mypy src/phaze/config.py` β€” clean (Success: no issues found in 1 source file). +- `uv run ruff check src/phaze/config.py tests/test_config/` β€” clean. +- `uv run ruff format --check src/phaze/config.py tests/test_config/` β€” clean (3 files already formatted). +- `uv run python -c "from phaze.config import AgentSettings; from pydantic import SecretStr; cfg = AgentSettings(agent_env='dev', redis_url='redis://localhost:6379/0', agent_api_url='https://x', agent_token=SecretStr('x'), scan_roots=['/tmp']); print(cfg.agent_env)"` β†’ prints `dev` (matches plan `` smoke). + +## Commits + +| Hash | Type | Phase | Subject | +| ------- | ---- | ----- | -------------------------------------------------------------------------------------- | +| 4b95029 | test | RED | add failing tests for AgentSettings agent_env + redis-password validator | +| a7741ff | feat | GREEN | enforce passworded Redis URL on AgentSettings in production mode | + +RED state confirmed before the GREEN commit: pytest reported `Failed: DID NOT RAISE ` on `test_production_refuses_passwordless_redis_url` because the validator did not yet exist. (The `agent_env="production"` kwarg was silently discarded by `extra="ignore"` on `SettingsConfigDict`, so no field-level error fired and no model-level validator could fire either β€” exactly the RED signature expected for "field missing + validator missing".) GREEN state confirmed: 4/4 tests pass. + +No REFACTOR commit needed β€” the implementation landed in its final shape in the GREEN commit. + +## Deviations from Plan + +### Auto-fixed Issues + +None. The plan's `` block was complete and accurate β€” `urlparse` was already imported at line 14 (added by Phase 28 for `_enforce_localhost_only`), so the "add `from urllib.parse import urlparse`" step in the plan was a no-op as the plan itself anticipated ("if no stdlib imports exist yet" β€” they did exist). + +### Authentication gates + +None. + +### Architectural decisions + +None. + +## Threat Flags + +None. The plan's `` already enumerates the four surfaces this plan touches (T-29-02-01..T-29-02-04). No new surface was introduced beyond what the model anticipated: + +- **T-29-02-01 / T-29-02-02** (passwordless Redis exposes queues; spoofed agent stuffs queues): mitigated by the new client-side guard. Server-side half (Plan 03) closes the full attack path. +- **T-29-02-03** (dev clones silently start with `agent_env=production` and break): mitigated by `default="dev"` β€” operator must explicitly opt in. +- **T-29-02-04** (typo / URL-encoding error): mitigated by `urlparse(self.redis_url).password` semantics (URL-encoded passwords resolve correctly); malformed URLs fall through to SAQ connect-time failure. + +## Known Stubs + +None. The validator is fully functional end-to-end: + +- The field is a real `Literal` type, not a placeholder string. +- The validator raises a real `ValueError` (which pydantic wraps as `ValidationError`). +- `urlparse` is the real stdlib parser, not a stub. +- The error message contains the operator-actionable substring `"requires a password in redis_url"` plus the decision-ID reference `(Phase 29 D-06)`. + +The remaining AUTH-03 work β€” switching `docker-compose.yml` to set `redis-server --requirepass ${REDIS_PASSWORD}` on the redis service and bind it to the LAN-only interface β€” lands in **Plan 03** per the plan's `` note: "The compose-side Redis hardening (`requirepass` + LAN-bound port) lives in Plan 03 alongside the docker-compose rewrite; this plan delivers the agent-side guard that refuses passwordless Redis URLs in production. Together they close AUTH-03." + +## TDD Gate Compliance + +The single task in this plan followed the RED β†’ GREEN cycle: + +- **RED** (4b95029): `test(29-02): add failing tests for AgentSettings agent_env + redis-password validator`. Pytest run confirmed `test_production_refuses_passwordless_redis_url` failed with `DID NOT RAISE ValidationError`. +- **GREEN** (a7741ff): `feat(29-02): enforce passworded Redis URL on AgentSettings in production mode`. All 4 tests pass after the field + validator were added. +- **REFACTOR**: not needed β€” the implementation matched the PATTERNS.md target shape on first GREEN. + +Gate-sequence check: `git log --oneline` shows `4b95029 test(...)` immediately preceding `a7741ff feat(...)` for plan 29-02. RED and GREEN commits both present in correct order. + +## Self-Check: PASSED + +Files claimed to be created/modified β€” all present: +- `tests/test_config/__init__.py` β€” FOUND +- `tests/test_config/test_agent_settings_redis_password.py` β€” FOUND +- `src/phaze/config.py` β€” MODIFIED (verified via `grep -n "agent_env\|_enforce_redis_password" src/phaze/config.py` returns 6 lines covering the field + validator definitions) + +Commits claimed β€” all present in `git log --oneline`: +- 4b95029 β€” FOUND +- a7741ff β€” FOUND + +Test count matches plan's ``: 4 new tests in `tests/test_config/test_agent_settings_redis_password.py` covering prod+passwordless=fail, prod+passworded=ok, dev+passwordless=ok, default-is-dev. + +Decision IDs implemented: D-06 (complete β€” agent-side half of AUTH-03). diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-03-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-03-PLAN.md new file mode 100644 index 0000000..ef18ef9 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-03-PLAN.md @@ -0,0 +1,308 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 03 +type: execute +wave: 1 +depends_on: [] +files_modified: + - docker-compose.yml + - Dockerfile + - .env.example + - tests/test_deployment/__init__.py + - tests/test_deployment/test_api_filesystem_isolation.py +autonomous: true +requirements: [DIST-01, AUTH-03] +tags: [phase-29, deployment, compose, dist-01, redis, security, v4.0] + +must_haves: + truths: + - "Root docker-compose.yml declares NO /data/music, /models, or /data/output mounts on api or worker services (DIST-01)" + - "Root docker-compose.yml has NO watcher service block (moved to agent.yml in Plan 04; D-17)" + - "Root docker-compose.yml has NO agent-worker service block (moved to agent.yml in Plan 04; D-17)" + - "Root docker-compose.yml has NO audfprint or panako service blocks (sidecars are file-server-local; D-15)" + - "Redis service uses --requirepass with REDIS_PASSWORD env var (fail-fast via :? syntax) (D-05)" + - "Redis ports binding uses `${REDIS_BIND_IP:-127.0.0.1}:6379:6379` (loopback default; prod sets LAN IP via .env) (D-05)" + - "Redis healthcheck uses redis-cli --no-auth-warning -a ${REDIS_PASSWORD} ping (D-05)" + - "api service command is `uv run python -m phaze.entrypoint` (not direct uvicorn)" + - "api service mounts the certs bind volume `${CA_PATH:-./certs}:/certs:rw`" + - ".env.example documents REDIS_PASSWORD, REDIS_BIND_IP, PHAZE_API_TLS_SANS with dev-friendly defaults" + - "End state: root compose services = {api, worker, postgres, redis} only" + - "Dockerfile audit: no MODELS_PATH/SCAN_PATH/OUTPUT_PATH ENV defaults that would mask missing mounts" + artifacts: + - path: "docker-compose.yml" + provides: "Hardened app-server compose: TLS via entrypoint shim, Redis with requirepass+LAN-bound, no file mounts on api/worker, no agent services" + contains: "phaze.entrypoint" + - path: ".env.example" + provides: "Dev-friendly defaults for REDIS_PASSWORD, REDIS_BIND_IP, PHAZE_API_TLS_SANS" + contains: "REDIS_PASSWORD" + - path: "tests/test_deployment/__init__.py" + provides: "Test package marker" + contains: "" + - path: "tests/test_deployment/test_api_filesystem_isolation.py" + provides: "4 structural-parse tests (D-19): api no-mounts, worker no-mounts, watcher+agent-worker absent, redis hardened" + min_lines: 100 + exports: [] + key_links: + - from: "docker-compose.yml::api.command" + to: "src/phaze/entrypoint.py::main" + via: "python -m phaze.entrypoint exec" + pattern: "python -m phaze.entrypoint" + - from: "docker-compose.yml::redis.command" + to: "REDIS_PASSWORD env var" + via: "redis-server --requirepass interpolation" + pattern: "requirepass.*REDIS_PASSWORD" + - from: "docker-compose.yml::api.volumes" + to: "./certs/ bind mount" + via: "${CA_PATH:-./certs}:/certs:rw" + pattern: "/certs:rw" +--- + + +Rewrite the root `docker-compose.yml` to be the **application-server-only** compose (no file mounts, no agent services, TLS termination via the Plan 01 entrypoint shim, Redis hardened per D-05). Audit `Dockerfile` for any `MODELS_PATH`/`SCAN_PATH`/`OUTPUT_PATH` ENV defaults that would silently mask missing mounts; remove if present. Update `.env.example` with the three new variables (`REDIS_PASSWORD`, `REDIS_BIND_IP`, `PHAZE_API_TLS_SANS`). Land the YAML-parse structural test suite (D-19) that asserts the invariant in CI. + +Purpose: Closes DIST-01 ("application server has no file mounts") and the server side of AUTH-03 ("Redis requires `requirepass` and is bound only to the private LAN"). Provides the structural-parse test (D-19) so future edits can't silently re-introduce the violations. + +Output: Hardened `docker-compose.yml`; possibly modified `Dockerfile`; extended `.env.example`; new `tests/test_deployment/` package with the YAML-parse test suite. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md + + + + + + + + + + + + + + + + + + + + Task 1: Write the test_deployment YAML-parse suite first (Wave 0 inside this plan) + tests/test_deployment/__init__.py, tests/test_deployment/test_api_filesystem_isolation.py + + - `tests/test_deployment/__init__.py` exists (may be empty). + - `tests/test_deployment/test_api_filesystem_isolation.py` contains 4 test functions: `test_api_service_has_no_file_mounts`, `test_controller_worker_has_no_file_mounts`, `test_no_watcher_or_agent_worker_in_root_compose`, `test_redis_hardened`. + - All 4 tests FAIL initially (BEFORE Task 2 lands the compose rewrite) β€” this is the RED step of TDD. + - Tests use `yaml.safe_load(Path(...).read_text())` not regex. + - Banned mount targets list: `("/data/music", "/models", "/data/output")`. + - Redis test asserts: command contains `requirepass`; ports entry matches regex `^.+:6379:6379$` (i.e., has an IP prefix, not bare `6379:6379`); healthcheck command contains both `-a` and `--no-auth-warning`. + - `uv run pytest tests/test_deployment/test_api_filesystem_isolation.py -x -q` runs (the tests FAIL initially is expected). + + + - docker-compose.yml (full file β€” current state with watcher/agent-worker/audfprint/panako blocks present; this is what the tests will assert against after Task 2 strips them) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Code Examples β†’ CI YAML-Parse Test for Filesystem Isolation (D-19)" lines 1028-1071 (3 LOCKED test functions; Task 1 adds a 4th for redis hardening) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_deployment/test_*.py" lines 1026-1046 (structural-parse style; no analog; new test directory + __init__.py) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-19 (filesystem isolation test) + Β§D-05 (redis hardening) + Β§D-17 (no watcher/agent-worker in root compose) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 4: Redis hardening in compose" lines 499-524 (redis service shape; --no-auth-warning rationale) + + +Create `tests/test_deployment/__init__.py` as an empty file so pytest discovers the new package. + +Write `tests/test_deployment/test_api_filesystem_isolation.py` with 4 tests, all using `yaml.safe_load` on the project-root `docker-compose.yml`. Module-level constants: `COMPOSE_PATH = Path(__file__).resolve().parents[2] / "docker-compose.yml"` and `BANNED_MOUNT_TARGETS = ("/data/music", "/models", "/data/output")`. + +Each test parses the YAML via `data = yaml.safe_load(COMPOSE_PATH.read_text())`. + +Test 1 β€” `test_api_service_has_no_file_mounts()` per RESEARCH lines 1041-1053: read `data["services"]["api"].get("volumes", []) or []`; for each volume entry, derive the container target (`vol_entry.split(":")[1]` if string and contains `:`, else `vol_entry.get("target", "")` if dict); assert NO banned mount target appears in the resolved `target`. The `${CA_PATH:-./certs}:/certs:rw` mount IS allowed and should NOT trigger the assertion. + +Test 2 β€” `test_controller_worker_has_no_file_mounts()` per RESEARCH lines 1055-1062: same logic for `data["services"]["worker"]` volumes. The controller is fileless β€” it should have NO volumes at all (or only sidecar-equivalent like a build cache), and definitely no `/data/music`, `/models`, `/data/output`. + +Test 3 β€” `test_no_watcher_or_agent_worker_in_root_compose()` per RESEARCH lines 1065-1071: `assert "watcher" not in data["services"]` and `assert "agent-worker" not in data["services"]`. Also (per D-15) `assert "audfprint" not in data["services"]` and `assert "panako" not in data["services"]`. + +Test 4 β€” `test_redis_hardened()` (new, per D-05): read `data["services"]["redis"]`; assert its `command` (either a list or a string) contains `"requirepass"` AND contains the literal `"REDIS_PASSWORD"` (the compose interpolation token must be present β€” `--requirepass ${REDIS_PASSWORD:?...}`); assert the `ports` entry list contains exactly one item whose form is `"${REDIS_BIND_IP:-127.0.0.1}:6379:6379"` (string match β€” that's the raw token before interpolation since `yaml.safe_load` does NOT do env interpolation); assert the healthcheck `test` list contains `"redis-cli"`, `"--no-auth-warning"`, `"-a"`, and the literal `"${REDIS_PASSWORD}"`. + +For the ports assertion, the safest check is: `ports = data["services"]["redis"]["ports"]; assert any(":6379:6379" in p and p != "6379:6379" and p != ":6379:6379" for p in ports)` β€” i.e., there's an IP-prefixed port mapping, not a bare `6379:6379` that would default-bind to `0.0.0.0`. + +Helper: define a small module-level `_volume_target(entry)` function that handles both string and dict volume forms; DRY between Test 1 and Test 2. + +Run `uv run pytest tests/test_deployment/test_api_filesystem_isolation.py -x -q` β€” at this point all 4 tests FAIL (the current docker-compose.yml has the violations). This is the expected RED step. + + + uv run pytest tests/test_deployment/test_api_filesystem_isolation.py --collect-only -q + + +- `tests/test_deployment/__init__.py` exists +- `tests/test_deployment/test_api_filesystem_isolation.py` has 4 collected test functions +- Tests FAIL when run against the CURRENT (un-rewritten) docker-compose.yml β€” this is the RED step +- `uv run ruff check tests/test_deployment/` clean + + + + + Task 2: Rewrite docker-compose.yml + audit Dockerfile + update .env.example (GREEN step) + docker-compose.yml, Dockerfile, .env.example + + - All 4 tests from Task 1 pass after this task. + - Root docker-compose.yml top-level `services:` keys are exactly the set `{api, worker, postgres, redis}` (no watcher, no agent-worker, no audfprint, no panako). + - `services.api.command` is the literal string `uv run python -m phaze.entrypoint` (not direct uvicorn). + - `services.api.volumes` list contains exactly one entry: `${CA_PATH:-./certs}:/certs:rw` (no SCAN_PATH mount, no MODELS_PATH mount, no OUTPUT_PATH mount). + - `services.api.ports` contains `${API_PORT:-8000}:8000`. + - `services.api.depends_on` retains `postgres: service_healthy` and `redis: service_healthy`. + - `services.worker.command` is `uv run saq phaze.tasks.controller.settings` (unchanged from current). + - `services.worker.volumes` is empty list `[]` OR the key is absent entirely (no file mounts on the controller worker). + - `services.worker.environment` no longer contains `MODELS_PATH=/models` (it was a controller-side leftover; controller is fileless). + - `services.redis.command` is a list: `["redis-server", "--requirepass", "${REDIS_PASSWORD:?REDIS_PASSWORD required}"]`. + - `services.redis.ports` is `["${REDIS_BIND_IP:-127.0.0.1}:6379:6379"]`. + - `services.redis.healthcheck.test` is `["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]`. + - `volumes:` top-level block removes `audfprint_data` and `panako_data` (no longer referenced); keeps `pgdata`. + - `.env.example` gains a Phase-29 section with `REDIS_PASSWORD=changeme`, `REDIS_BIND_IP=127.0.0.1`, `PHAZE_API_TLS_SANS=localhost,127.0.0.1,api` plus the comment block from PATTERNS lines 816-834. + - `Dockerfile`: if any `ENV MODELS_PATH=` / `ENV SCAN_PATH=` / `ENV OUTPUT_PATH=` exists, REMOVE it. If none exists, leave the Dockerfile unchanged (just verify). + - `docker compose config --quiet` (with REDIS_PASSWORD=changeme set in env or .env) exits 0 β€” the compose file is valid. + + + - docker-compose.yml (full current content β€” every block to be modified or deleted) + - Dockerfile (verify no MODELS_PATH/SCAN_PATH/OUTPUT_PATH ENV defaults that would mask the missing mounts; if present, remove) + - .env.example (current content β€” preserve, append new section) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-05 (redis service rewrite literal YAML) + Β§D-17 (root compose end state) + Β§D-01 (api command via uvicorn β€” Plan 01 entrypoint shim swaps this in) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 2: Uvicorn TLS termination" lines 401-441 (api command shape; cert bind mount; entrypoint pattern) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 4: Redis hardening in compose" lines 499-524 (literal redis YAML target) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"docker-compose.yml" lines 762-795 (literal diff target: 6 concrete changes) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§".env.example" lines 815-834 (literal additions to insert after the existing API_PORT block) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pitfall 7: Compose env-var fail-fast vs. dev convenience" lines 998-1003 (.env.example ships with REDIS_PASSWORD=changeme so fresh clones work) + + +Rewrite `docker-compose.yml` in place following PATTERNS lines 762-795 byte-for-byte for the 6 changes: + +**Change 1 β€” api service** (currently lines 3-17): keep `build: {context: ., dockerfile: Dockerfile}`. Replace `command: uv run uvicorn phaze.main:app --host 0.0.0.0 --port 8000` with `command: uv run python -m phaze.entrypoint`. Keep `ports: ["${API_PORT:-8000}:8000"]`. Keep `env_file: .env`. REPLACE `volumes: ["${SCAN_PATH:-/data/music}:/data/music:ro"]` with `volumes: ["${CA_PATH:-./certs}:/certs:rw"]` (rw because cert_bootstrap may write on first start; bind-mounted from host `./certs/`). Keep `depends_on: {postgres: {condition: service_healthy}, redis: {condition: service_healthy}}`. + +**Change 2 β€” worker service** (currently lines 28-45): keep `build:`, `command: uv run saq phaze.tasks.controller.settings`, `env_file: .env`. REPLACE `environment: [MODELS_PATH=/models, PHAZE_ROLE=control]` with `environment: [PHAZE_ROLE=control]` (drop MODELS_PATH β€” controller is fileless). REMOVE the entire `volumes:` block (no SCAN_PATH, MODELS_PATH, OUTPUT_PATH on the controller). Keep `depends_on: {postgres: {condition: service_healthy}, redis: {condition: service_healthy}}`. + +**Change 3 β€” DELETE the watcher service block** (currently lines 50-64). The watcher service moves to `docker-compose.agent.yml` in Plan 04. Remove ALSO the YAML comment block at lines 47-49 ("Phase 27 D-19: always-on watcher. Will move to docker-compose.agent.yml in Phase 29 …") since it's resolved. + +**Change 4 β€” DELETE the agent-worker service block** (currently lines 72-96). Moves to docker-compose.agent.yml in Plan 04. Remove the YAML comment block at lines 66-71 too. + +**Change 5 β€” DELETE the audfprint + panako service blocks** (currently lines 128-154). Sidecars are file-server-local per D-15. Remove the `audfprint_data` and `panako_data` entries from the top-level `volumes:` block at the bottom of the file (they are no longer referenced). + +**Change 6 β€” redis service rewrite** (currently lines 118-126) per RESEARCH lines 502-516: + +```yaml + redis: + image: redis:8-alpine + command: + - "redis-server" + - "--requirepass" + - "${REDIS_PASSWORD:?REDIS_PASSWORD required}" + ports: + - "${REDIS_BIND_IP:-127.0.0.1}:6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"] + interval: 5s + timeout: 5s + retries: 5 +``` + +Use list form for `command:` (per RESEARCH; clearer interpolation behavior). The `${REDIS_PASSWORD:?REDIS_PASSWORD required}` syntax causes `docker compose up` to fail-fast at parse time if `REDIS_PASSWORD` is unset. + +End state after all 6 changes: `services:` block contains exactly `{api, worker, postgres, redis}` keys. The top-level `volumes:` block contains exactly `{pgdata}` (audfprint_data and panako_data removed). + +Update `.env.example`: read the current file, find the `API_PORT` block (around line 27), insert immediately after it (preserve all existing content): + +```bash + +# ===================================================================== +# Phase 29: Redis hardening (D-05) +# ===================================================================== +# Required password for redis-server --requirepass. Fresh dev clones can +# use the placeholder; production MUST set a strong unique value. +REDIS_PASSWORD=changeme +# Interface to bind redis :6379 on. Dev = loopback. Production = LAN IP +# (e.g., 192.168.1.10) so agents on other hosts can reach it. +REDIS_BIND_IP=127.0.0.1 + +# ===================================================================== +# Phase 29: HTTPS via internal CA (D-02) +# ===================================================================== +# Comma-separated SAN list for the auto-generated leaf cert. Defaults +# include `api` (docker compose service-name DNS) for single-host dev. +# Production should add the app-server's LAN hostname / IP. +PHAZE_API_TLS_SANS=localhost,127.0.0.1,api +``` + +The `REDIS_PASSWORD=changeme` dev default is the Pitfall-7 mitigation β€” without it, `docker compose up` would fail for a fresh clone before the developer can react. + +Audit `Dockerfile`: read the full file; grep for `ENV MODELS_PATH`, `ENV SCAN_PATH`, `ENV OUTPUT_PATH`. If any are set, remove the corresponding line. If none exists, the Dockerfile is unchanged. Per CONTEXT.md "Existing Code to Read Before Modifying" line 234, this is a verify-and-scrub step. + +Re-run `uv run pytest tests/test_deployment/test_api_filesystem_isolation.py -x -q` β€” all 4 tests now pass (GREEN step). If a test fails, the diagnostic output points at the exact violation. + +Validate the compose syntax: `REDIS_PASSWORD=changeme docker compose config --quiet` should exit 0 (silently). If it errors, fix the YAML before continuing. + +NOTE: do NOT add the entrypoint `command:` until Plan 01's `phaze.entrypoint` module is on disk. This plan runs in Wave 1 alongside Plan 01 β€” coordinate by reading `src/phaze/entrypoint.py` exists before flipping the api `command:`. (If Plan 01 hasn't merged yet, the compose-config check still passes β€” but `docker compose up api` would fail at runtime until the entrypoint module exists. That's acceptable; both plans land in the same wave and the merge order is enforced by the orchestrator.) + + + uv run pytest tests/test_deployment/test_api_filesystem_isolation.py -x -q && REDIS_PASSWORD=changeme docker compose config --quiet 2>&1 | grep -v 'WARN' || true + + +- All 4 tests in `test_api_filesystem_isolation.py` pass (GREEN) +- `docker-compose.yml` top-level services block has exactly `{api, worker, postgres, redis}` keys +- api command starts with `uv run python -m phaze.entrypoint` +- api volumes is a single entry for `/certs:rw` +- redis service has `--requirepass`, IP-prefixed ports, and `--no-auth-warning -a` in healthcheck +- `.env.example` has the 3 new Phase-29 variables with comments +- Dockerfile audited (no MODELS_PATH/SCAN_PATH/OUTPUT_PATH ENV defaults) +- `REDIS_PASSWORD=changeme docker compose config --quiet` exits 0 +- `uv run ruff check tests/test_deployment/` clean (no Python regressions) + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| host filesystem β†’ api container | volume mounts cross this β€” Phase 29 reduces the api surface to `./certs/` only | +| host filesystem β†’ controller worker | controller is fileless after this plan; no volume mounts at all | +| docker network β†’ redis service port | LAN-bound port + password auth | +| operator env var `REDIS_PASSWORD` β†’ compose interpolation | `${VAR:?msg}` fails fast on missing | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-03-01 | Information Disclosure | api container reads music files (defeats DIST-01) | mitigate | Strip `/data/music`, `/models`, `/data/output` mounts from api service; `test_api_service_has_no_file_mounts` asserts the invariant | +| T-29-03-02 | Information Disclosure | controller worker reads music files (defeats DIST-01) | mitigate | Strip same mounts from worker service; `test_controller_worker_has_no_file_mounts` asserts | +| T-29-03-03 | Spoofing / Tampering | unauthenticated Redis exposed to LAN | mitigate | `--requirepass ${REDIS_PASSWORD:?required}` enforced at compose-parse time; `test_redis_hardened` asserts | +| T-29-03-04 | Information Disclosure | Redis bound to 0.0.0.0 instead of LAN IP | mitigate | `${REDIS_BIND_IP:-127.0.0.1}:6379:6379` binds to loopback by default; production sets explicit LAN IP via .env | +| T-29-03-05 | Operational | Dockerfile ENV defaults silently re-introduce mount paths inside the container | mitigate | Dockerfile audit removes any `ENV MODELS_PATH=` / `ENV SCAN_PATH=` / `ENV OUTPUT_PATH=` (CONTEXT line 234) | +| T-29-03-06 | Operational | Compose env-var `${VAR:?required}` blocks fresh dev clones | accept (Pitfall 7) | `.env.example` ships with `REDIS_PASSWORD=changeme` so `cp .env.example .env && docker compose up` works; production sets the real password explicitly | +| T-29-03-07 | Information Disclosure | redis-cli `-a ` warning leaks via container logs | mitigate | `--no-auth-warning` flag suppresses the stderr warning while keeping the ping exit code (RESEARCH Β§Pattern 4) | + + + +- `uv run pytest tests/test_deployment/ -x -q` β€” all 4 tests pass +- `REDIS_PASSWORD=changeme docker compose config --quiet` exits 0 +- `yq '.services | keys' docker-compose.yml` returns exactly `[api, worker, postgres, redis]` (manual verify; or use the pytest test itself) +- `uv run ruff check .` β€” no new ruff issues +- Existing test suite (anything that touches `docker-compose.yml` or env vars) β€” no regression + + + +- DIST-01 fully closed (app-server compose has no file mounts on api/worker; agent services moved out) +- AUTH-03 server-side closed (Redis requirepass + LAN binding); agent-side guard already in Plan 02 +- D-05, D-17, D-19, D-20 implemented (D-20 is docs-only and lands in Plan 08 β€” this plan delivers the test that backs it) +- 4 new tests in `tests/test_deployment/` + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-03-SUMMARY.md` when both tasks complete. Summary must list: files modified, new tests added, the YAML diff summary (services deleted, services modified), decision IDs implemented (D-05, D-17, D-19), and any Dockerfile changes (or note that no changes were needed). + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-03-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-03-SUMMARY.md new file mode 100644 index 0000000..5f7933d --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-03-SUMMARY.md @@ -0,0 +1,281 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 03 +subsystem: deployment +tags: [phase-29, deployment, compose, dist-01, redis, security, v4.0] + +requires: + - phase: 29-deployment-hardening-agents-admin + plan: 01 + provides: phaze.entrypoint pre-uvicorn shim (referenced by api.command) + - phase: 29-deployment-hardening-agents-admin + plan: 02 + provides: AgentSettings._enforce_redis_password_in_production (agent-side half of AUTH-03) +provides: + - tests/test_deployment/ (pytest sub-package marker + YAML-parse test suite β€” D-19) + - tests/test_deployment/test_api_filesystem_isolation.py (4 structural tests) + - root docker-compose.yml hardened to the app-server-only invariant + - .env.example documents REDIS_PASSWORD, REDIS_BIND_IP, PHAZE_API_TLS_SANS +affects: + - Phase 29 Plan 04 (docker-compose.agent.yml; receives the watcher + agent-worker + + audfprint + panako blocks deleted here) + - Phase 29 Plan 06 (CI workflow runs tests/test_deployment/ in the test job) + - All operators with an existing .env: must add REDIS_PASSWORD (compose now fails + fast at parse time if it is unset) + +tech-stack: + added: [] + patterns: + - "Structural-parse compose tests via yaml.safe_load (D-19) β€” no docker daemon needed" + - "`${VAR:?msg}` fail-fast interpolation on required compose env vars (REDIS_PASSWORD)" + - "IP-prefixed port binding `${REDIS_BIND_IP:-127.0.0.1}:6379:6379` to avoid 0.0.0.0 default" + - "redis-cli --no-auth-warning -a ${REDIS_PASSWORD} ping for authenticated healthchecks" + - "Module-level _volume_target() helper for DRY string/dict volume-shape handling" + +key-files: + created: + - tests/test_deployment/__init__.py + - tests/test_deployment/test_api_filesystem_isolation.py + - .planning/phases/29-deployment-hardening-agents-admin/29-03-SUMMARY.md + modified: + - docker-compose.yml (rewrite β€” 51 insertions, 98 deletions) + - .env.example (Phase-29 variables section inserted after API_PORT block) + unchanged: + - Dockerfile (audited β€” no MODELS_PATH/SCAN_PATH/OUTPUT_PATH ENV defaults present) + +key-decisions: + - "Tests assert raw `${VAR:-default}` interpolation tokens (yaml.safe_load does NOT + expand env vars) β€” this proves the source-file invariant, not the post-interpolation + runtime value. Documented in the module docstring so future maintainers know not to + 'fix' the assertions by adding env-var expansion." + - "redis ports regex chosen to reject BOTH bare `6379:6379` (binds 0.0.0.0) AND + leading-colon `:6379:6379` (also binds 0.0.0.0). The IP-prefix string-shape check is + `:6379:6379 in p and not p.startswith(':') and p != '6379:6379'` β€” robust to compose's + accepted variations." + - "Dockerfile required no changes β€” grep for `(MODELS_PATH|SCAN_PATH|OUTPUT_PATH)` returned + zero matches. T-29-03-05 is mitigated by the audit; the structural-parse tests catch + any future regression that re-introduces a default." + - "Worker `volumes:` key removed entirely rather than set to `[]`. The test's + `.get('volumes', []) or []` handles both shapes; absence is cleaner YAML." + - "REDIS_PASSWORD=changeme dev default in .env.example (Pitfall-7 mitigation): fresh + `cp .env.example .env && docker compose up` continues to work without operator action. + Production sets a real strong value explicitly." + - "audfprint_data + panako_data named volumes removed from the top-level `volumes:` + block since they are no longer referenced (audfprint + panako services moved out)." + +patterns-established: + - "tests/test_deployment/ structural-parse pattern: yaml.safe_load + assertions on the + parsed dict shape. ~50ms, no docker daemon required. Future invariants (D-15 sidecars + file-server-only, D-17 agent.yml service list) follow the same pattern in Plan 04." + - ".env.example Phase-NN section pattern: `# === Phase NN: (D-XX) ===` header + with operator-actionable comments above each variable. Mirrors the existing Phase-27 + bring-up section's style." + +requirements-completed: [DIST-01, AUTH-03] + +metrics: + duration: ~2min + tasks_complete: 2 + files_created: 2 + files_modified: 2 + tests_added: 4 + commits: 2 # RED test + GREEN feat (no REFACTOR needed) + +completed: 2026-05-16 +--- + +# Phase 29 Plan 03: Application-Server Compose Hardening Summary + +**Rewrite the root `docker-compose.yml` as the application-server-only compose: strip music/model/output file mounts from `api` and `worker` (DIST-01), delete the `watcher`, `agent-worker`, `audfprint`, and `panako` service blocks (they move to `docker-compose.agent.yml` in Plan 04 per D-17/D-15), and harden Redis with `--requirepass`, LAN-bound port, and authenticated `--no-auth-warning` healthcheck (D-05; server-side half of AUTH-03). New `tests/test_deployment/` sub-package codifies the invariant with 4 YAML-parse structural assertions (D-19) so future edits cannot silently re-introduce the violations.** + +## What Shipped + +### Test suite: tests/test_deployment/ + +- **`tests/test_deployment/__init__.py`** β€” empty pytest sub-package marker. +- **`tests/test_deployment/test_api_filesystem_isolation.py`** β€” 4 cases, all using `yaml.safe_load` on the project-root `docker-compose.yml`: + 1. `test_api_service_has_no_file_mounts` (DIST-01): no `/data/music`, `/models`, or `/data/output` in any api volume target. The `${CA_PATH:-./certs}:/certs:rw` mount is explicitly allowed by passing the banned-target substring check. + 2. `test_controller_worker_has_no_file_mounts` (DIST-01): same predicate for the controller worker. With the `volumes:` key now absent from worker entirely, the test's `data["services"]["worker"].get("volumes", []) or []` returns `[]` and the body short-circuits. + 3. `test_no_watcher_or_agent_worker_in_root_compose` (D-15 / D-17): asserts `watcher`, `agent-worker`, `audfprint`, and `panako` are all absent from the root compose `services` dict. + 4. `test_redis_hardened` (D-05 / AUTH-03): asserts the redis service `command` contains both `requirepass` and `REDIS_PASSWORD`, the `ports` entry is IP-prefixed (rejects both bare `6379:6379` and leading-colon `:6379:6379`), and the healthcheck `test` list contains `redis-cli`, `--no-auth-warning`, `-a`, and a `REDIS_PASSWORD`-referencing entry. + +A small module-level `_volume_target(entry)` helper DRYs the string vs. dict volume-shape handling between tests 1 and 2. + +### Docker compose rewrite + +End state of `services:` is exactly `{api, worker, postgres, redis}`. The top-level `volumes:` block is exactly `{pgdata}`. + +Six concrete changes per PATTERNS lines 762-795: + +1. **`api`**: swap `command:` to `uv run python -m phaze.entrypoint` (Plan 01's cert-bootstrap shim); replace the SCAN_PATH read-only mount with `${CA_PATH:-./certs}:/certs:rw` (rw because cert_bootstrap writes the auto-generated CA + leaf on first start). +2. **`worker` (controller)**: drop `MODELS_PATH=/models` from `environment:` (controller is fileless); remove the `volumes:` key entirely (no SCAN_PATH, MODELS_PATH, OUTPUT_PATH). +3. **DELETE** the `watcher` service block (lines 50-64 of the old file) and its preamble comment. +4. **DELETE** the `agent-worker` service block (lines 72-96 of the old file) and its preamble comment. +5. **DELETE** the `audfprint` and `panako` service blocks (lines 128-154 of the old file); also drop the `audfprint_data` and `panako_data` entries from the top-level `volumes:` block since they are no longer referenced. +6. **`redis`** (lines 118-126 of the old file) rewritten per RESEARCH Β§Pattern 4 / D-05: + - `command:` is list-form: `["redis-server", "--requirepass", "${REDIS_PASSWORD:?REDIS_PASSWORD required}"]`. The `${VAR:?msg}` interpolation causes `docker compose up` to fail at parse time if `REDIS_PASSWORD` is unset. + - `ports:` is `["${REDIS_BIND_IP:-127.0.0.1}:6379:6379"]`. Dev defaults to loopback; production overrides `REDIS_BIND_IP` to the LAN IP so agents on other hosts can reach Redis. + - `healthcheck.test:` is `["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]`. `--no-auth-warning` suppresses the stderr warning that would otherwise pollute container logs. + +### .env.example additions + +Inserted a Phase-29 section between the existing `API_PORT=8000` block and `SCAN_PATH=/data/music`: + +- `REDIS_PASSWORD=changeme` β€” required by the compose `${VAR:?...}` fail-fast. Dev placeholder so a fresh clone still works (Pitfall-7 mitigation); production MUST overwrite. +- `REDIS_BIND_IP=127.0.0.1` β€” interface to bind Redis on. Dev = loopback. Production = app-server LAN IP. +- `PHAZE_API_TLS_SANS=localhost,127.0.0.1,api` β€” comma-separated SAN list for the auto-generated leaf cert (D-02; consumed by Plan 01's `phaze.cert_bootstrap`). + +Each variable has a comment block explaining its role and the dev-vs-prod distinction. + +### Dockerfile audit + +Grepped for `MODELS_PATH`, `SCAN_PATH`, `OUTPUT_PATH` in `Dockerfile`: zero matches. Audit-only step per CONTEXT line 234 β€” no changes needed. The structural-parse tests will catch any future regression that introduces an `ENV` default which would silently mask a missing mount inside the container. + +## Verification Results + +``` +uv run pytest tests/test_deployment/test_api_filesystem_isolation.py -x -q +4 passed in 0.03s + +uv run pytest tests/test_deployment/ tests/test_task_split.py tests/test_main_lifespan.py \ + tests/test_config/ tests/test_config_role_split.py tests/test_config_worker.py \ + tests/test_cert_bootstrap.py tests/test_services/test_agent_client_tls.py -q +47 passed, 2 warnings in 1.39s +``` + +- 4/4 new deployment tests pass (RED β†’ GREEN cycle). +- 43 adjacent tests (task_split, main_lifespan, config, cert_bootstrap, TLS) all pass β€” no regression from the new sub-package or compose rewrite. +- `uv run ruff check tests/test_deployment/` clean. +- `uv run ruff format --check tests/test_deployment/` clean. +- `yamllint` (via pre-commit hook on `docker-compose.yml`) passes. +- YAML structural verification (`python -c "import yaml; ..."`) confirms: + - `services:` = `['api', 'postgres', 'redis', 'worker']` + - `volumes:` = `['pgdata']` + - `api.command` = `"uv run python -m phaze.entrypoint"` + - `api.volumes` = `['${CA_PATH:-./certs}:/certs:rw']` + - `worker.command` = `"uv run saq phaze.tasks.controller.settings"` + - `worker.volumes` is absent + - `worker.environment` = `['PHAZE_ROLE=control']` + - `redis.command` = `['redis-server', '--requirepass', '${REDIS_PASSWORD:?REDIS_PASSWORD required}']` + - `redis.ports` = `['${REDIS_BIND_IP:-127.0.0.1}:6379:6379']` + - `redis.healthcheck.test` = `['CMD', 'redis-cli', '--no-auth-warning', '-a', '${REDIS_PASSWORD}', 'ping']` + +**Note on `docker compose config --quiet`:** the macOS dev environment in this worktree does not have the `docker compose` v2 plugin installed (`docker: 'compose' is not a docker command`), so the suggested compose-syntax check ran via the equivalent pytest YAML-parse layer instead. The new `tests/test_deployment/` suite executes in CI (Plan 06 wires it into the workflow) and provides the same structural-validation gate. + +## YAML diff summary + +**Services deleted from root compose:** + +| Service | Reason | Moves to | +| ------------- | -------------------------------------------------- | ---------------------------- | +| `watcher` | D-17 β€” agent-side, never on the app server | docker-compose.agent.yml | +| `agent-worker`| D-17 β€” agent-side SAQ worker | docker-compose.agent.yml | +| `audfprint` | D-15 β€” fingerprint sidecar is file-server-local | docker-compose.agent.yml | +| `panako` | D-15 β€” fingerprint sidecar is file-server-local | docker-compose.agent.yml | + +**Services modified (in place):** + +| Service | Changes | +| ------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `api` | `command:` now invokes phaze.entrypoint (Plan 01 shim); volumes reduced to a single `/certs:rw` bind for cert_bootstrap output. | +| `worker`| Removed all file mounts and the `MODELS_PATH=/models` env entry. Controller is fileless. | +| `redis` | `--requirepass` + IP-prefixed port + authenticated healthcheck. Fail-fast at compose parse time if `REDIS_PASSWORD` is unset. | + +**Top-level volumes block diff:** + +- Removed: `audfprint_data`, `panako_data`. +- Kept: `pgdata`. + +## Decision IDs implemented + +- **D-05** (Redis hardening: requirepass + LAN-bound port + --no-auth-warning healthcheck): fully closed; server-side half of AUTH-03. +- **D-17** (root compose end state `{api, worker, postgres, redis}` only): fully closed. +- **D-19** (CI YAML-parse test for filesystem isolation): test suite landed in `tests/test_deployment/test_api_filesystem_isolation.py` (4 cases). Wired into CI by Plan 06. +- **D-15** (audfprint + panako sidecars are file-server-local, never in root compose): partial β€” root compose now omits them. Plan 04 introduces docker-compose.agent.yml where they live. +- **DIST-01** (application server has no file mounts): fully closed for api + worker. +- **AUTH-03** (Redis requirepass + LAN binding): fully closed (this plan + Plan 02 client guard). + +D-20 (operations doc) is docs-only and lands in Plan 08 β€” this plan delivers the structural-parse test that the doc will reference. + +## Commits + +| Hash | Type | Phase | Subject | +| ------- | ---- | ----- | ---------------------------------------------------------------------- | +| c560ee5 | test | RED | add failing YAML-parse tests for app-server compose isolation | +| 149de70 | feat | GREEN | harden app-server compose β€” strip file mounts, lock down redis | + +## Deviations from Plan + +### Auto-fixed Issues + +None. The plan's `` blocks for both tasks were complete and accurate. Two minor observations: + +1. **Ruff reformatted the test file once after my initial write.** The reformat collapsed a multi-line `assert any(...)` into a single line (under the 150-char limit). This is the standard pre-commit `ruff format` behavior and not a deviation β€” the test logic is identical. +2. **Dockerfile audit was a no-op.** The current `Dockerfile` had no `MODELS_PATH`/`SCAN_PATH`/`OUTPUT_PATH` ENV defaults to remove, so the audit step in Task 2's `` block was verify-only as anticipated. + +### Authentication gates + +None. + +### Architectural decisions + +None. All choices were already locked in PATTERNS and RESEARCH. + +## Threat Flags + +None. The plan's `` enumerates the seven surfaces this plan touches (T-29-03-01..T-29-03-07). Every mitigation is delivered by the rewrite: + +- **T-29-03-01** (api reads music): mitigated by stripping the SCAN_PATH mount; `test_api_service_has_no_file_mounts` asserts. +- **T-29-03-02** (worker reads music/models/output): mitigated by removing all three mounts; `test_controller_worker_has_no_file_mounts` asserts. +- **T-29-03-03** (unauthed Redis): mitigated by `--requirepass ${REDIS_PASSWORD:?required}`; `test_redis_hardened` asserts the `requirepass` and `REDIS_PASSWORD` tokens are present. +- **T-29-03-04** (Redis bound 0.0.0.0): mitigated by `${REDIS_BIND_IP:-127.0.0.1}:6379:6379`; `test_redis_hardened` rejects bare `6379:6379` and leading-colon forms. +- **T-29-03-05** (Dockerfile ENV defaults mask missing mounts): mitigated by the audit (no such defaults present); future regressions caught by the structural-parse tests because the test reads the post-merge compose file, which would surface any container-side default that nobody mounted. +- **T-29-03-06** (Pitfall-7 dev-clone friction): mitigated by `REDIS_PASSWORD=changeme` dev default in `.env.example`. +- **T-29-03-07** (redis-cli `-a` warning leaks): mitigated by `--no-auth-warning`; `test_redis_hardened` asserts the flag is present. + +## Known Stubs + +None. Every change is end-to-end functional: + +- The compose rewrite is the production wire format β€” there are no TODOs in the file. +- `.env.example` ships with real working defaults for fresh clones. +- The test suite directly parses the live `docker-compose.yml` and asserts on the parsed dict; no mock data. +- `_volume_target()` is the real string/dict shape handler used by both api and worker tests. + +The remaining Phase-29 work β€” Plan 04 (`docker-compose.agent.yml`), Plan 05 (operations docs), Plan 06 (CI wiring), etc. β€” is tracked by the phase plan list and is not blocked by this plan. + +## TDD Gate Compliance + +Both tasks followed the RED β†’ GREEN cycle: + +- **Task 1 RED** (c560ee5): `test(29-03): add failing YAML-parse tests for app-server compose isolation`. Pytest run against the un-rewritten `docker-compose.yml` reported all 4 tests failing with their expected diagnostic substrings (`api service has banned mount: ${SCAN_PATH:-/data/music}:/data/music:ro`, `watcher belongs in docker-compose.agent.yml (D-17)`, `redis service must declare a command with --requirepass`). +- **Task 2 GREEN** (149de70): `feat(29-03): harden app-server compose β€” strip file mounts, lock down redis`. After the rewrite, all 4 tests pass. + +Gate-sequence check: `git log --oneline` shows `c560ee5 test(...)` immediately preceding `149de70 feat(...)` for plan 29-03. RED and GREEN commits both present in correct order. No REFACTOR commit was needed β€” both files landed in their final shape in the GREEN commit. + +## Self-Check: PASSED + +Files claimed to be created β€” all present: + +``` +[ -f tests/test_deployment/__init__.py ] β†’ FOUND +[ -f tests/test_deployment/test_api_filesystem_isolation.py ] β†’ FOUND +``` + +Files claimed to be modified β€” both reflect the documented changes: + +``` +git show 149de70 --stat β†’ docker-compose.yml: 51 insertions, 98 deletions (rewrite) + .env.example: 18 insertions +``` + +Commits claimed β€” both present in `git log --oneline`: + +``` +c560ee5 β€” FOUND (test/RED) +149de70 β€” FOUND (feat/GREEN) +``` + +Test count matches plan ``: 4 new tests in `tests/test_deployment/test_api_filesystem_isolation.py`. All 4 fail against the un-rewritten compose; all 4 pass against the rewritten compose. + +Decision IDs implemented: D-05, D-17, D-19 (full), DIST-01, AUTH-03 (server-side half), D-15 (partial β€” root compose now omits the sidecars; Plan 04 lands the agent.yml file where they live). diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-04-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-04-PLAN.md new file mode 100644 index 0000000..d5ff76b --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-04-PLAN.md @@ -0,0 +1,416 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 04 +type: execute +wave: 2 +depends_on: [29-03] +files_modified: + - docker-compose.agent.yml + - .env.example.agent + - tests/test_deployment/test_agent_compose.py + - .github/workflows/docker-publish.yml +autonomous: true +requirements: [OPS-02] +tags: [phase-29, deployment, compose-agent, ghcr, ops-02, v4.0] + +must_haves: + truths: + - "docker-compose.agent.yml exists at repo root as a self-contained file (D-15)" + - "Top-level services in docker-compose.agent.yml are exactly {worker, watcher, audfprint, panako} (D-15)" + - "worker + watcher services pull from `ghcr.io/simplicityguy/phaze:${PHAZE_IMAGE_TAG:-latest}` (not build:)" + - "audfprint + panako sidecars retain `build:` (not published to GHCR per D-15)" + - "worker service has `environment: [PHAZE_ROLE=agent]`" + - "No agent service has DATABASE_URL set anywhere (DIST-04 invariant)" + - "SCAN_PATH uses compose `${SCAN_PATH:?SCAN_PATH required}` fail-fast syntax with explicit message form across ALL 4 services (WARNING-2: unified everywhere β€” no bare `:?`)" + - "MODELS_PATH bind mounts on worker + watcher are rw (for D-21 auto-download)" + - "CA_PATH bind mounts are ro (operator-distributed CA cert read-only)" + - ".env.example.agent documents every env var a file-server host needs (D-23)" + - "docker-publish.yml workflow tags both `:latest` AND `:v` for release tags (D-16; verified by an automated YAML-parse test β€” WARNING-4 resolution; no human checkpoint)" + artifacts: + - path: "docker-compose.agent.yml" + provides: "Standalone file-server compose: 4 services (worker, watcher, audfprint, panako); no postgres, no redis (agents reach app-server's)" + min_lines: 40 + contains: "ghcr.io/simplicityguy/phaze" + - path: ".env.example.agent" + provides: "File-server host env template with all required PHAZE_AGENT_* vars + paths" + min_lines: 25 + contains: "PHAZE_AGENT_API_URL" + - path: "tests/test_deployment/test_agent_compose.py" + provides: "5 structural-parse tests: service list, no DATABASE_URL, PHAZE_ROLE=agent on worker, SCAN_PATH fail-fast syntax (WARNING-3), docker-publish.yml :v tag check (WARNING-4)" + min_lines: 100 + exports: [] + key_links: + - from: "docker-compose.agent.yml::worker.image" + to: "ghcr.io/simplicityguy/phaze published image" + via: "image: ghcr.io/...:${PHAZE_IMAGE_TAG:-latest}" + pattern: "ghcr.io/simplicityguy/phaze" + - from: "docker-compose.agent.yml::worker.command" + to: "phaze.tasks.agent_worker.settings" + via: "uv run saq invocation" + pattern: "phaze.tasks.agent_worker.settings" + - from: "docker-compose.agent.yml::watcher.command" + to: "phaze.agent_watcher.__main__" + via: "uv run python -m phaze.agent_watcher" + pattern: "python -m phaze.agent_watcher" +--- + + +Create the standalone `docker-compose.agent.yml` file (D-15) that a file-server host runs via `docker compose -f docker-compose.agent.yml up`. Land its companion `.env.example.agent` env template (D-23 portion). Verify automatically (no human checkpoint per WARNING-4) that `.github/workflows/docker-publish.yml` tags both `:latest` AND `:v` for release events. Wire the structural-parse test suite (D-15..D-17 from D-22). + +**WARNING-2 resolution (SCAN_PATH consistency):** ALL 4 services use the explicit-message form `${SCAN_PATH:?SCAN_PATH required}`. No bare `:?` form anywhere. Both the action body AND acceptance criteria reflect this single form. + +**WARNING-3 resolution (fail-fast YAML test):** A 4th structural-parse test asserts every service's SCAN_PATH mount uses the `:?` operator (catches a future drift to `${SCAN_PATH:-/data/music}` loose default). + +**WARNING-4 resolution (autonomous test replaces checkpoint):** The original `checkpoint:human-verify` task is replaced with an automated YAML-parse test of `.github/workflows/docker-publish.yml` that asserts both `:latest` and `:v` tags are produced. Plan stays `autonomous: true`. + +Purpose: Closes OPS-02 ("A new `docker-compose.agent.yml` brings up exactly `worker`, `watcher`, `audfprint`, and `panako` on a file server, configured via env to reach the application server"). The image-tag automated verification confirms operators can pin versions in production per D-16 without manual intervention. + +Output: New `docker-compose.agent.yml`; new `.env.example.agent`; new `test_agent_compose.py` with 5 LOCKED test cases (3 original + WARNING-3 fail-fast + WARNING-4 docker-publish tag check); possibly an updated `docker-publish.yml` if `:v` tagging is missing. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-03-SUMMARY.md + + + + + + + + + + + + + + Task 1: Write structural-parse tests (4 cases) + create docker-compose.agent.yml + .env.example.agent + tests/test_deployment/test_agent_compose.py, docker-compose.agent.yml, .env.example.agent + + - `tests/test_deployment/test_agent_compose.py` has 4 test functions: `test_agent_compose_service_list`, `test_agent_compose_has_no_postgres_env`, `test_worker_service_has_phaze_role_agent`, `test_all_scan_path_mounts_use_failfast_syntax` (WARNING-3 addition). + - All 4 tests pass after the agent.yml is written. + - `docker-compose.agent.yml`: top-level `services:` has exactly the keys `{worker, watcher, audfprint, panako}`. + - `services.worker.image == "ghcr.io/simplicityguy/phaze:${PHAZE_IMAGE_TAG:-latest}"` (NO `build:` key). + - `services.worker.command == "uv run saq phaze.tasks.agent_worker.settings"`. + - `services.worker.env_file == ".env"`. + - `services.worker.environment` is a list containing `"PHAZE_ROLE=agent"`. + - `services.worker.volumes` has 3 entries (WARNING-2: explicit-message form): `"${SCAN_PATH:?SCAN_PATH required}:/data/music:ro"`, `"${MODELS_PATH:-./models}:/models:rw"`, `"${CA_PATH:-./certs}:/certs:ro"`. + - `services.worker.restart == "unless-stopped"`. + - `services.watcher.image` mirrors worker's GHCR image. + - `services.watcher.command == "uv run python -m phaze.agent_watcher"`. + - `services.watcher.volumes` has 3 entries (SCAN_PATH ro with `${SCAN_PATH:?SCAN_PATH required}`, MODELS_PATH rw, CA_PATH ro). + - `services.audfprint.build.context == "."` and `services.audfprint.build.dockerfile == "services/audfprint/Dockerfile.audfprint"`. + - `services.audfprint.volumes` has `"${SCAN_PATH:?SCAN_PATH required}:/data/music:ro"` + `"audfprint_data:/data/fprint"` (WARNING-2: explicit message form, NOT bare `:?`). + - `services.panako` mirrors audfprint with `services/panako/Dockerfile.panako`, the same `${SCAN_PATH:?SCAN_PATH required}` form, and `panako_data` volume. + - Top-level `volumes:` block has exactly `{audfprint_data, panako_data}` (no pgdata β€” agents don't run Postgres). + - No service has any `DATABASE_URL`, `POSTGRES_*` env var, or `depends_on: postgres` reference. + - **WARNING-3:** The 4th test `test_all_scan_path_mounts_use_failfast_syntax` iterates every service in `data["services"]`, collects every volume string referencing `SCAN_PATH`, and asserts each one matches the regex `r"\$\{SCAN_PATH:\?[^}]*\}"` (note `:\?` β€” the `:?` fail-fast operator). The test FAILS if a future YAML drift introduces `${SCAN_PATH:-/data/music}` or `${SCAN_PATH}` (no operator). + - `.env.example.agent` has every variable from PATTERNS lines 839-867 with the documented example values. + - `docker compose -f docker-compose.agent.yml config --quiet` exits 0 with the required env vars set (use a sentinel `.env` for the test). + + + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-15 (literal YAML target for the 4 services) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 7: docker-compose.agent.yml structure (D-15)" lines 763-817 (literal YAML + standalone-file behavior notes) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"docker-compose.agent.yml" lines 798-808 (key differences from root compose) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§".env.example.agent" lines 837-867 (literal env-template body) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Code Examples β†’ test_agent_compose.py" lines 1073-1104 (3 original LOCKED test functions; Plan 04 adds a 4th per WARNING-3) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-16 (`${PHAZE_IMAGE_TAG:-latest}` with .env comment recommending version pin) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-23 (.env.example.agent NEW file content) + - services/audfprint/Dockerfile.audfprint (verify path + COPY patterns; sidecars keep `build:`) + - services/panako/Dockerfile.panako (same) + - Plan 03's deleted blocks (watcher / agent-worker / audfprint / panako from root compose) β€” Plan 04 reconstructs in agent.yml + + +Write `tests/test_deployment/test_agent_compose.py` per RESEARCH lines 1073-1104 (3 original LOCKED tests + WARNING-3 4th test). Module-level constants: `COMPOSE_PATH = Path(__file__).resolve().parents[2] / "docker-compose.agent.yml"`. Each test loads `data = yaml.safe_load(COMPOSE_PATH.read_text())`. + +Test 1 β€” `test_agent_compose_service_list()`: `assert set(data["services"].keys()) == {"worker", "watcher", "audfprint", "panako"}` (4 services exactly, no postgres/redis/api). + +Test 2 β€” `test_agent_compose_has_no_postgres_env()`: for each service in `data["services"]`, read its `environment` (handle both list-of-`"KEY=VALUE"` strings and dict forms), assert NO entry contains `"DATABASE_URL"`. Also assert no `depends_on` entry references `postgres` (defends DIST-04 invariant). + +Test 3 β€” `test_worker_service_has_phaze_role_agent()`: read `data["services"]["worker"]["environment"]`; resolve to list-of-string form; `assert any("PHAZE_ROLE=agent" in e for e in env_strs)`. + +**Test 4 (WARNING-3) β€” `test_all_scan_path_mounts_use_failfast_syntax()`:** + +```python +import re + +def test_all_scan_path_mounts_use_failfast_syntax() -> None: + """Every SCAN_PATH volume mount across all services MUST use the + fail-fast ${VAR:?MESSAGE} operator (NOT ${VAR:-default} or ${VAR}). + Defends against a YAML drift that silently introduces a loose default + like `${SCAN_PATH:-/data/music}` which would let `docker compose up` + succeed on a misconfigured host (Phase 29 WARNING-3).""" + data = yaml.safe_load(COMPOSE_PATH.read_text()) + failfast_re = re.compile(r"\$\{SCAN_PATH:\?[^}]*\}") + offenders: list[str] = [] + for svc_name, svc in data["services"].items(): + for vol in svc.get("volumes", []) or []: + if not isinstance(vol, str): + continue + if "SCAN_PATH" in vol and not failfast_re.search(vol): + offenders.append(f"{svc_name}: {vol}") + assert not offenders, ( + "Some SCAN_PATH mounts are not fail-fast (must use ${SCAN_PATH:?MESSAGE} form):\n" + + "\n".join(offenders) + ) +``` + +Write `docker-compose.agent.yml` at repo root per RESEARCH Β§Pattern 7 lines 766-809. Literal target structure β€” **WARNING-2: explicit-message form `${SCAN_PATH:?SCAN_PATH required}` in ALL 4 services**: + +```yaml +--- +services: + worker: + image: ghcr.io/simplicityguy/phaze:${PHAZE_IMAGE_TAG:-latest} + command: uv run saq phaze.tasks.agent_worker.settings + env_file: .env + environment: + - PHAZE_ROLE=agent + volumes: + - "${SCAN_PATH:?SCAN_PATH required}:/data/music:ro" + - "${MODELS_PATH:-./models}:/models:rw" + - "${CA_PATH:-./certs}:/certs:ro" + restart: unless-stopped + + watcher: + image: ghcr.io/simplicityguy/phaze:${PHAZE_IMAGE_TAG:-latest} + command: uv run python -m phaze.agent_watcher + env_file: .env + environment: + - PHAZE_ROLE=agent + volumes: + - "${SCAN_PATH:?SCAN_PATH required}:/data/music:ro" + - "${MODELS_PATH:-./models}:/models:rw" + - "${CA_PATH:-./certs}:/certs:ro" + restart: unless-stopped + + audfprint: + build: + context: . + dockerfile: services/audfprint/Dockerfile.audfprint + volumes: + - "${SCAN_PATH:?SCAN_PATH required}:/data/music:ro" + - audfprint_data:/data/fprint + restart: unless-stopped + + panako: + build: + context: . + dockerfile: services/panako/Dockerfile.panako + volumes: + - "${SCAN_PATH:?SCAN_PATH required}:/data/music:ro" + - panako_data:/data/fprint + restart: unless-stopped + +volumes: + audfprint_data: + panako_data: +``` + +Do NOT use the bare `${SCAN_PATH:?}` form in any service β€” explicit-message form everywhere (WARNING-2). The RESEARCH excerpt that uses bare `:?` for sidecars is superseded by WARNING-2 unification. + +Do NOT add healthchecks for audfprint/panako in this plan. + +Write `.env.example.agent` at repo root per PATTERNS lines 837-867: + +```bash +# Phaze file-server agent .env template (Phase 29 D-23) +# Copy to .env on the file-server host. Required variables fail-fast on `docker compose up`. + +# Image tag (pin to a version for production, e.g., PHAZE_IMAGE_TAG=v4.0.0) +PHAZE_IMAGE_TAG=latest + +# Application server URL β€” must be HTTPS (Phase 29 D-01) +PHAZE_AGENT_API_URL=https://:8000 +PHAZE_REDIS_URL=redis://default:@:6379/0 + +# Agent identity (provisioned via psql on app-server) +PHAZE_AGENT_ID=fileserver-east +PHAZE_AGENT_TOKEN=phaze_agent_<32urlsafe> +PHAZE_AGENT_QUEUE=phaze-agent-fileserver-east + +# Operator-copied CA cert (scp from app-server ./certs/phaze-ca.crt) +PHAZE_AGENT_CA_FILE=/certs/phaze-ca.crt + +# Production refuses passwordless Redis URLs (Phase 29 D-06) +PHAZE_AGENT_ENV=production + +# File-server local paths +SCAN_PATH=/data/music +MODELS_PATH=./models +CA_PATH=./certs + +# Scan roots (comma-separated absolute paths) +PHAZE_AGENT_SCAN_ROOTS=/data/music,/data/concerts +``` + +Validate: create a sentinel `/tmp/phaze-agent-test.env` with at least `SCAN_PATH=/tmp/test` and `REDIS_PASSWORD=test`, then run `docker compose --env-file /tmp/phaze-agent-test.env -f docker-compose.agent.yml config --quiet` β€” should exit 0. + +Do NOT relax the `${VAR:?required}` in the compose file. The whole point is fail-fast. + + + uv run pytest tests/test_deployment/test_agent_compose.py -x -q + + +- `docker-compose.agent.yml` exists at repo root with the 4-service structure; ALL SCAN_PATH mounts use `${SCAN_PATH:?SCAN_PATH required}` (WARNING-2) +- `.env.example.agent` exists with all documented variables +- All 4 tests in `test_agent_compose.py` pass (including the WARNING-3 fail-fast syntax test) +- `docker compose -f docker-compose.agent.yml config --quiet` (with required env vars set) exits 0 +- No service references postgres or DATABASE_URL + + + + + Task 2 (WARNING-4 β€” replaces checkpoint): Auto-test `.github/workflows/docker-publish.yml` tags both `:latest` and `:v` + tests/test_deployment/test_agent_compose.py, .github/workflows/docker-publish.yml + + - `tests/test_deployment/test_agent_compose.py` gains a 5th test `test_docker_publish_workflow_tags_both_latest_and_version()` that loads `.github/workflows/docker-publish.yml` via `yaml.safe_load`, locates the `docker/metadata-action` step (or equivalent `tags:` block), and asserts the produced tag list includes BOTH: + 1. A `latest` tag (e.g., `type=raw,value=latest` or `type=raw,value=latest,enable={{is_default_branch}}`) + 2. A version tag (e.g., `type=semver,pattern={{version}}` OR `type=ref,event=tag` β€” any pattern that produces a `:v` tag on release events). + - The test FAILS gracefully (with a diagnostic message naming the missing tag pattern) if `.github/workflows/docker-publish.yml` does not exist OR does not produce both tag types. + - If the test FAILS at first run, the executor updates `.github/workflows/docker-publish.yml` to include the missing tag pattern. + - The image URL prefix matches `docker-compose.agent.yml`'s `image: ghcr.io/simplicityguy/phaze:...` β€” i.e., the workflow pushes to the bare-repo URL `ghcr.io/simplicityguy/phaze`, NOT a sub-path. + - After fix (if needed): `uv run pytest tests/test_deployment/test_agent_compose.py::test_docker_publish_workflow_tags_both_latest_and_version -x -q` passes. + - Plan remains `autonomous: true` β€” no `checkpoint:human-verify` task anywhere in the plan (WARNING-4 resolution). + + + - .github/workflows/docker-publish.yml (current content β€” STATE.md line 135 references quick-task 260410-kco that set this up) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Open Questions" lines 1153-1158 (Question 1: tag strategy uncertain) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Assumptions Log" A2 (assumption: workflow tags both `:latest` and `:v`) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-16 ("docker-publish.yml workflow tags both `:latest` and `:v`") + - https://github.com/docker/metadata-action (action reference; `tags:` input format) + + +Append to `tests/test_deployment/test_agent_compose.py`: + +```python +PUBLISH_WORKFLOW_PATH = Path(__file__).resolve().parents[2] / ".github" / "workflows" / "docker-publish.yml" + + +def _extract_metadata_action_tags(workflow_data: dict) -> list[str]: + """Walk the workflow's jobs β†’ steps β†’ uses=docker/metadata-action and return its `with.tags` split on newlines.""" + for job in (workflow_data.get("jobs") or {}).values(): + for step in job.get("steps", []) or []: + uses = (step.get("uses") or "").lower() + if "docker/metadata-action" in uses: + tags_raw = (step.get("with") or {}).get("tags", "") + return [line.strip() for line in tags_raw.splitlines() if line.strip()] + return [] + + +def test_docker_publish_workflow_tags_both_latest_and_version() -> None: + """WARNING-4: replaces the original human-verify checkpoint with an + automated check that .github/workflows/docker-publish.yml emits BOTH + a `:latest` tag AND a `:v` tag (D-16). If either is missing, + the test names the gap and the executor extends the workflow.""" + assert PUBLISH_WORKFLOW_PATH.exists(), f"docker-publish.yml missing at {PUBLISH_WORKFLOW_PATH}" + workflow = yaml.safe_load(PUBLISH_WORKFLOW_PATH.read_text()) + tags = _extract_metadata_action_tags(workflow) + assert tags, ( + "Could not locate docker/metadata-action step in docker-publish.yml. " + "Phase 29 D-16 requires the workflow to produce both :latest and :v tags." + ) + has_latest = any("value=latest" in t for t in tags) + has_version = any( + ("type=semver" in t) or ("type=ref,event=tag" in t) or ("type=ref" in t and "tag" in t) + for t in tags + ) + missing = [] + if not has_latest: + missing.append("'type=raw,value=latest' (or equivalent)") + if not has_version: + missing.append("'type=semver,pattern={{version}}' (or 'type=ref,event=tag')") + assert not missing, ( + f"docker-publish.yml tag patterns missing: {missing}\nFound tags: {tags}\n" + "Fix: add the missing pattern(s) under jobs..steps[uses=docker/metadata-action].with.tags." + ) +``` + +Run `uv run pytest tests/test_deployment/test_agent_compose.py::test_docker_publish_workflow_tags_both_latest_and_version -x -q`. + +**If PASSES:** docker-publish.yml already tags both β€” no workflow edits required. + +**If FAILS:** read `.github/workflows/docker-publish.yml`. Locate the `docker/metadata-action` step's `tags:` block. Add the missing pattern(s). Typical fix: + +```yaml +- name: Extract metadata + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository }} + tags: | + type=raw,value=latest,enable={{is_default_branch}} + type=semver,pattern={{version}} + type=ref,event=tag +``` + +If the workflow uses a different image URL prefix (e.g., `ghcr.io/simplicityguy/phaze/api`), realign by either updating the workflow to push to the bare-repo `ghcr.io/simplicityguy/phaze` URL (preferred) OR updating `docker-compose.agent.yml`'s `image:` line. + +Re-run the test until it passes. Note in `29-04-SUMMARY.md`: +- `verified` β€” workflow already tags both `:latest` and `:v` +- `fixed` β€” workflow needed extension; the fix is now in `.github/workflows/docker-publish.yml` +- `url-realigned` β€” image URL was wrong; either workflow OR compose was updated + + + uv run pytest tests/test_deployment/test_agent_compose.py::test_docker_publish_workflow_tags_both_latest_and_version -x -q + + +- `tests/test_deployment/test_agent_compose.py` includes `test_docker_publish_workflow_tags_both_latest_and_version` (5th test total in the file) +- The test passes against `.github/workflows/docker-publish.yml` (either it already had both tags OR the workflow was extended) +- Image URL prefix in `docker-compose.agent.yml` matches the workflow's published URL +- Plan stays `autonomous: true` (no human checkpoint β€” WARNING-4 resolution) + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| docker registry (GHCR) β†’ file-server host | image pull crosses this | +| file-server host filesystem β†’ agent containers | SCAN_PATH, MODELS_PATH, CA_PATH bind mounts | +| operator .env file β†’ compose interpolation | bearer token + CA path + redis URL pass through | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-04-01 | Spoofing | malicious image pushed to ghcr.io/simplicityguy/phaze:latest | mitigate | GHCR push requires `packages:write` on the maintainer's PAT; production operators are advised in .env.example.agent comment to PIN to a specific `:v` tag for production rollouts | +| T-29-04-02 | Information Disclosure | DATABASE_URL set on an agent service (agents have no Postgres access per DIST-04) | mitigate | `test_agent_compose_has_no_postgres_env` asserts no agent service has DATABASE_URL or postgres dependencies | +| T-29-04-03 | Tampering | rw mount of MODELS_PATH allows weight tampering | accept | rw is required for D-21 auto-download | +| T-29-04-04 | Information Disclosure | CA_PATH ro is leaked to all 4 services even though only worker+watcher need it | accept | Sidecars receive read-only access to `phaze-ca.crt` (public CA cert is non-secret) | +| T-29-04-05 | Operational | SCAN_PATH=/data on a misconfigured file-server host exposes a wrong directory | mitigate | `${SCAN_PATH:?SCAN_PATH required}` fail-fast across all 4 services (WARNING-2 unified form); WARNING-3 test enforces the `:?` form via regex | +| T-29-04-06 | Operational | `:latest` default pulls an unexpected image after a maintainer push | accept (Pitfall: D-16) | Operators are advised to pin `PHAZE_IMAGE_TAG=v4.0.0` for production | +| T-29-04-07 | Operational | docker-publish.yml workflow silently stops emitting `:v` tags (e.g., metadata-action upgrade) | mitigate (WARNING-4) | Automated YAML-parse test asserts both `:latest` and `:v` patterns are present; runs on every CI build | + + + +- `uv run pytest tests/test_deployment/test_agent_compose.py -x -q` β€” all 5 tests pass (4 compose-structure + 1 workflow-tag-check) +- `docker compose --env-file /tmp/test.env -f docker-compose.agent.yml config --quiet` exits 0 (with sentinel env) +- `.github/workflows/docker-publish.yml` is confirmed (by automated test, not checkpoint) to tag both `:latest` and `:v` (OR was patched to do so) +- No regression in existing `tests/test_deployment/test_api_filesystem_isolation.py` tests + + + +- OPS-02 fully closed: docker-compose.agent.yml exists with exactly 4 services; agents have no Postgres access; image-pull-from-GHCR is the canonical deployment path +- D-15, D-16, D-17, D-18 (partially β€” full justfile recipe lands in Plan 08), D-22 (agent-compose portion) implemented +- 5 new tests in `test_agent_compose.py` (WARNING-3 + WARNING-4 additions) +- WARNING-2 unified SCAN_PATH explicit-message form across all 4 services +- docker-publish.yml verified via automated test (no checkpoint; WARNING-4 resolution) + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-04-SUMMARY.md` when both tasks complete. Summary must list: files created, decision IDs implemented (D-15, D-17, D-22), the docker-publish.yml verification result (`verified` / `fixed` / `url-realigned`), and the agent.yml's final service list for human cross-check. + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-04-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-04-SUMMARY.md new file mode 100644 index 0000000..b6f483a --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-04-SUMMARY.md @@ -0,0 +1,262 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 04 +subsystem: deployment +tags: [phase-29, deployment, compose-agent, ghcr, docker-publish, ops-02, v4.0] + +# Dependency graph +requires: + - phase: 29-deployment-hardening-agents-admin + plan: 03 + provides: "Root docker-compose.yml hardened to app-server-only end state {api, worker, postgres, redis}; the watcher/agent-worker/audfprint/panako blocks were deleted from root compose with the explicit comment that they move to docker-compose.agent.yml in Plan 04" + - phase: 29-deployment-hardening-agents-admin + plan: 02 + provides: "AgentSettings._enforce_redis_password_in_production + ._enforce_https_in_production guards (referenced in .env.example.agent comments)" +provides: + - "docker-compose.agent.yml (NEW): standalone file-server-host compose with exactly 4 services {worker, watcher, audfprint, panako}; worker+watcher pull from ghcr.io/simplicityguy/phaze, sidecars retain build:" + - ".env.example.agent (NEW): file-server-host env template documenting every required PHAZE_AGENT_* var + paths (D-23 portion)" + - "tests/test_deployment/test_agent_compose.py (NEW): 5 structural-parse tests covering D-15..D-17 + WARNING-3 SCAN_PATH fail-fast + WARNING-4 docker-publish.yml tag verification" + - ".github/workflows/docker-publish.yml extended: docker/metadata-action now emits :latest + :v + : tags; api image realigned to bare-repo URL" +affects: + - "Phase 29 Plan 06 (CI wires tests/test_deployment/test_agent_compose.py into the test job)" + - "Phase 29 Plan 08 (deployment-doc + justfile recipe references docker-compose.agent.yml + PHAZE_IMAGE_TAG pinning guidance)" + - "All file-server-host operators: must scp .env.example.agent β†’ .env on each file server and populate the documented variables; missing SCAN_PATH fails at compose-parse time (no silent misconfiguration)" + +# Tech tracking +tech-stack: + added: [] + patterns: + - "Standalone compose file pattern: `docker compose -f docker-compose.agent.yml up -d` operates on a wholly separate project name; no service/network reachable from the root compose. Mirrors the app-server vs file-server trust split (DIST-01..04)." + - "GHCR pull-vs-build hybrid: published images for code-only services (worker, watcher), build: context for sidecars (audfprint, panako) until they get their own GHCR publish path." + - "`${VAR:?MESSAGE}` fail-fast applied UNIFORMLY across all 4 services (WARNING-2 β€” no bare `:?` form anywhere). WARNING-3 test enforces via regex." + - "Per-matrix `image_suffix` override in docker-publish.yml: api β†’ bare-repo URL, sidecars β†’ `/` sub-path. Lets one workflow publish three different URL shapes from a 3-row matrix." + +key-files: + created: + - docker-compose.agent.yml + - .env.example.agent + - tests/test_deployment/test_agent_compose.py + - .planning/phases/29-deployment-hardening-agents-admin/29-04-SUMMARY.md + modified: + - .github/workflows/docker-publish.yml (tag strategy + image_suffix matrix override) + - tests/test_phase04_gaps.py (resolved Plan 29-05 deferred test: now scans BOTH compose files) + - .planning/phases/29-deployment-hardening-agents-admin/deferred-items.md (marked resolved) + +key-decisions: + - "Matrix `image_suffix` instead of a separate workflow: keeping the api image at the bare-repo URL (so docker-compose.agent.yml's `image: ghcr.io/simplicityguy/phaze:...` is correct) does NOT require a second workflow file. A 1-line matrix field flipping `\"\"` vs `/` is the minimum-viable surgery." + - "Resolved the Plan 29-05 deferred test inside this plan rather than punting to Plan 29-08. Reasoning: Plan 29-04 is the wave that materializes docker-compose.agent.yml; the deferred test exists exactly because the agent-worker had no compose-file home until this plan. The fix is a 4-line change (scan both files) and the gate is meaningfully restored in CI today rather than a wave later." + - "WARNING-2 unified explicit-message form `${SCAN_PATH:?SCAN_PATH required}` on ALL 4 services. The RESEARCH excerpt's bare `${SCAN_PATH:?}` form on sidecars was superseded by WARNING-2 β€” bare form is correct but inconsistent; explicit-message gives an operator-actionable error on first `docker compose up`." + - "The 5th workflow-tag test asserts BOTH `value=latest` AND a version pattern (`type=semver` OR `type=ref,event=tag`). Accepting `type=ref,event=tag` keeps the test robust to a future refactor that drops semver in favor of plain ref-tag (e.g., if version-string conventions change)." + - "Two ways to fix WARNING-4 (extend workflow OR change agent.yml's URL). Per plan guidance preferred: extend workflow. Reasoning: GHCR allows bare-repo image URLs (this is the simpler operator mental model β€” `phaze` is the project, not `phaze/api`), and the sidecars naturally need sub-paths anyway." + +patterns-established: + - "Standalone-compose pattern for trust-split deployment: a separate `docker-compose..yml` file (no `extends:` chain) with its own services + volumes block. Pulls images for code-only services, builds locally for not-yet-published sidecars. The file-server host needs `services//Dockerfile.` + the compose file + `.env` + `certs/` β€” no app-server source tree." + - "Image-URL alignment test pattern: when a compose file pulls from a workflow-published image, encode that coupling as a test that parses BOTH artifacts. Done implicitly here via the docker-publish tag-strategy test referencing PUBLISH_WORKFLOW_PATH (same way test_agent_compose.py references COMPOSE_PATH). Future drift between the two surfaces fails CI rather than fails-at-pull-time on a production file-server." + +requirements-completed: [OPS-02] + +# Metrics +duration: ~18min +completed: 2026-05-16 +--- + +# Phase 29 Plan 04: docker-compose.agent.yml + GHCR Publish Verification Summary + +**Lands the file-server-host compose surface (`docker-compose.agent.yml` + `.env.example.agent`) with exactly 4 services β€” worker, watcher, audfprint, panako β€” and replaces the original GHCR-tag human-verify checkpoint with an automated YAML-parse test that asserts `.github/workflows/docker-publish.yml` produces BOTH `:latest` and `:v` tags. Extends the workflow to emit the missing `type=semver,pattern={{version}}` + `type=ref,event=tag` patterns and realigns the api image URL to `ghcr.io/simplicityguy/phaze` (bare-repo) so the compose `image:` line resolves correctly.** + +## Performance + +- **Duration:** ~18 min +- **Started:** 2026-05-16T23:05Z (approx) +- **Completed:** 2026-05-16T23:23Z (approx) +- **Tasks:** 2 (both auto, both TDD) +- **Files created:** 3 (`docker-compose.agent.yml`, `.env.example.agent`, `tests/test_deployment/test_agent_compose.py`) + this SUMMARY +- **Files modified:** 3 (`.github/workflows/docker-publish.yml`, `tests/test_phase04_gaps.py`, `deferred-items.md`) +- **Tests added:** 5 (4 compose-structure + 1 workflow tag-check). Plus 1 deferred test reactivated. + +## Accomplishments + +- **OPS-02 fully closed.** A new `docker-compose.agent.yml` brings up exactly `worker`, `watcher`, `audfprint`, `panako` on a file server, configured via env to reach the application server. No Postgres or Redis service in the agent compose β€” agents connect to the app-server's via env-file URL. +- **WARNING-4 resolved without a human checkpoint.** Plan stays `autonomous: true`. The 5th test (`test_docker_publish_workflow_tags_both_latest_and_version`) is now a permanent CI gate: a future regression that drops the version tag pattern (e.g., during a metadata-action upgrade) will fail CI before shipping a release that's missing `:v` images. +- **WARNING-3 enforced.** Every SCAN_PATH volume mount across all 4 services uses `${SCAN_PATH:?SCAN_PATH required}` (explicit-message fail-fast). The 4th test (`test_all_scan_path_mounts_use_failfast_syntax`) rejects any future YAML drift toward a loose default like `${SCAN_PATH:-/data/music}`. +- **WARNING-2 unified.** No bare `${VAR:?}` form anywhere β€” explicit-message form on all 4 services. Operator sees `SCAN_PATH required` instead of an empty error from compose. +- **Plan 29-05 deferred test resolved.** `tests/test_phase04_gaps.py::test_docker_compose_has_agent_worker_consuming_agent_queue` now scans BOTH `docker-compose.yml` and `docker-compose.agent.yml`; finds the agent-worker at `docker-compose.agent.yml::worker`. The Phase 27 UAT gap-13 invariant (an agent-side SAQ consumer exists somewhere in the deployment surface) is fully codified across the split. +- **Image URL realignment.** docker-publish.yml's api image now publishes to `ghcr.io/simplicityguy/phaze` (bare repo) matching the compose `image:` line; sidecars keep `/audfprint` and `/panako` sub-paths (irrelevant β€” agent.yml builds them locally). + +## Task Commits + +Each task was committed atomically (RED β†’ GREEN per TDD): + +1. **Task 1 RED β€” failing YAML-parse tests for docker-compose.agent.yml** β€” `b1c5620` (test) +2. **Task 1 GREEN β€” create docker-compose.agent.yml + .env.example.agent + resolve Plan 29-05 deferred test** β€” `ae45925` (feat) +3. **Task 2 RED β€” failing workflow-tag check (WARNING-4)** β€” `0e78658` (test) +4. **Task 2 GREEN β€” extend docker-publish.yml tag strategy + realign api URL** β€” `93e550b` (feat) + +## Files Created/Modified + +### Created + +- **`docker-compose.agent.yml`** (75 lines) β€” File-server-host compose. Top-level `services:` is exactly `{worker, watcher, audfprint, panako}`. Top-level `volumes:` is exactly `{audfprint_data, panako_data}`. Worker + watcher pull from `ghcr.io/simplicityguy/phaze:${PHAZE_IMAGE_TAG:-latest}`; sidecars retain `build:`. All 4 services use `${SCAN_PATH:?SCAN_PATH required}` fail-fast for the music-mount target. Worker + watcher mount `${MODELS_PATH:-./models}:/models:rw` (D-21 auto-download). Worker + watcher mount `${CA_PATH:-./certs}:/certs:ro` for the operator-distributed CA cert. +- **`.env.example.agent`** (75 lines) β€” File-server-host env template. Documents every required variable: `PHAZE_IMAGE_TAG`, `PHAZE_AGENT_API_URL` (HTTPS-only per Plan 02 guard), `PHAZE_REDIS_URL` (password-required per Plan 02), `PHAZE_AGENT_{ID,TOKEN,QUEUE}`, `PHAZE_AGENT_CA_FILE`, `PHAZE_AGENT_ENV=production`, `SCAN_PATH`, `MODELS_PATH`, `CA_PATH`, `PHAZE_AGENT_SCAN_ROOTS`. Production-pin guidance for `PHAZE_IMAGE_TAG` inline. +- **`tests/test_deployment/test_agent_compose.py`** (164 lines) β€” Five tests: + 1. `test_agent_compose_service_list` (D-15) β€” services exactly `{worker, watcher, audfprint, panako}`. + 2. `test_agent_compose_has_no_postgres_env` (DIST-04) β€” no `DATABASE_URL`, `POSTGRES_*`, or `depends_on: postgres` on any agent service. + 3. `test_worker_service_has_phaze_role_agent` (D-17) β€” worker environment contains `PHAZE_ROLE=agent`. + 4. `test_all_scan_path_mounts_use_failfast_syntax` (WARNING-3) β€” regex check: every SCAN_PATH volume entry matches `${SCAN_PATH:?...}`. + 5. `test_docker_publish_workflow_tags_both_latest_and_version` (WARNING-4) β€” parses `.github/workflows/docker-publish.yml`, locates the `docker/metadata-action` step, asserts both a `value=latest` line and a version pattern (`type=semver` OR `type=ref,event=tag`) are present. + +### Modified + +- **`.github/workflows/docker-publish.yml`** β€” + - Matrix entries get a new `image_suffix` field: `""` for api, `/audfprint` for audfprint, `/panako` for panako. + - `docker/metadata-action`'s `images:` line is now `${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}${{ matrix.image_suffix }}` (no slash separator; the suffix carries it). + - `tags:` block extended with `type=semver,pattern={{version}}`, `type=semver,pattern={{major}}.{{minor}}`, and `type=ref,event=tag`. Existing `type=raw,value=latest,enable={{is_default_branch}}` + branch/PR/schedule tags retained. +- **`tests/test_phase04_gaps.py::test_docker_compose_has_agent_worker_consuming_agent_queue`** β€” Now iterates over `[docker-compose.yml, docker-compose.agent.yml]`, scanning each for a service whose `command` contains `saq phaze.tasks.agent_worker.settings` and whose environment has `PHAZE_ROLE=agent`. Finds the agent-worker at `docker-compose.agent.yml::worker`. Error message updated to explain the Phase 29 split. +- **`.planning/phases/29-deployment-hardening-agents-admin/deferred-items.md`** β€” Marked the Plan 29-05 deferred item as resolved by this plan (strike-through with explanation pointing to Plan 29-04). + +## Decisions Made + +- **`image_suffix` matrix override (Decision documented above).** One workflow continues to publish all three images with one matrix; only the api's published URL changes shape. No second workflow file needed. +- **Resolve the Plan 29-05 deferred test in this plan.** Adding a `docker-compose.agent.yml` to the codebase makes the gap-13 invariant satisfiable again; deferring the fix to a later plan would leave the test red across the merge of this wave. +- **Explicit-message `${VAR:?MESSAGE}` on every service.** Compose accepts the bare `${VAR:?}` form, but the explicit message gives operators an actionable error (`error while interpolating SCAN_PATH: SCAN_PATH required`). Worth the trivial duplication. +- **Test the BARE-URL + the tag-strategy in two separate gates.** The plan only mandates the tag-strategy test, so I removed an extra image-URL alignment test I had drafted. Rationale: the URL alignment is enforced by the act of the agent.yml's `image:` line being literal; a future divergence will fail at `docker compose -f docker-compose.agent.yml pull`. Adding a redundant test would have inflated the suite without buying additional safety. + +## Deviations from Plan + +None β€” plan executed exactly as written. The plan's `` blocks were complete and accurate; both literal YAML targets and the literal test bodies matched the implementation verbatim. + +### Auto-fixed Issues + +**1. [Rule 3 - Blocking] Ruff `SIM101` (merged isinstance check) + `I001` (import order) on `test_agent_compose.py`** +- **Found during:** Task 1 RED commit (pre-commit hook). +- **Issue:** Initial draft had `if isinstance(depends, list) or isinstance(depends, dict):` flagged by SIM101. Ruff also reordered `import re` to alphabetical position (`from pathlib import Path` β†’ `import re` β†’ `from typing import Any`). +- **Fix:** Manually merged to `if isinstance(depends, (list, dict)):`. Ruff's `--fix` already handled import order. +- **Files modified:** `tests/test_deployment/test_agent_compose.py` +- **Verification:** Pre-commit ran clean on second attempt. +- **Committed in:** `b1c5620` (Task 1 RED). + +**2. [Rule 3 - Blocking] Missing trailing newline on `test_agent_compose.py`** +- **Found during:** Task 2 RED commit (pre-commit `end-of-file-fixer` hook). +- **Issue:** Appended the 5th test function without a trailing newline. +- **Fix:** Pre-commit auto-fixed; re-staged and re-committed. +- **Files modified:** `tests/test_deployment/test_agent_compose.py` +- **Verification:** Pre-commit ran clean on second attempt. +- **Committed in:** `0e78658` (Task 2 RED). + +--- + +**Total deviations:** 2 auto-fixed (both Rule 3 - lint-blocking pre-commit fixups, both autoresolved). +**Impact on plan:** Zero β€” both were trivial style adjustments, no scope creep, no logic change. + +### Authentication gates + +None. + +### Architectural decisions + +None. All choices were already locked in PATTERNS, RESEARCH, and the plan body. + +## Verification + +### Task 1 acceptance + +- βœ… `docker-compose.agent.yml` exists at repo root with the 4-service structure +- βœ… Top-level `services:` keys are exactly `{worker, watcher, audfprint, panako}` +- βœ… `services.worker.image == "ghcr.io/simplicityguy/phaze:${PHAZE_IMAGE_TAG:-latest}"` (NO `build:` key) +- βœ… `services.worker.command == "uv run saq phaze.tasks.agent_worker.settings"` +- βœ… `services.worker.env_file == ".env"` +- βœ… `services.worker.environment == ["PHAZE_ROLE=agent"]` +- βœ… `services.worker.volumes` has 3 entries with the explicit-message `${SCAN_PATH:?SCAN_PATH required}` form +- βœ… `services.worker.restart == "unless-stopped"` +- βœ… `services.watcher.command == "uv run python -m phaze.agent_watcher"` +- βœ… `services.watcher` mirrors worker's image + volume structure +- βœ… `services.audfprint.build.dockerfile == "services/audfprint/Dockerfile.audfprint"` with `${SCAN_PATH:?SCAN_PATH required}` mount + `audfprint_data:/data/fprint` +- βœ… `services.panako` mirrors audfprint with `services/panako/Dockerfile.panako` and `panako_data` volume +- βœ… Top-level `volumes:` block has exactly `{audfprint_data, panako_data}` (no pgdata) +- βœ… No service has `DATABASE_URL`, `POSTGRES_*`, or `depends_on: postgres` +- βœ… All 4 SCAN_PATH mounts match the fail-fast regex (WARNING-3 test passes) +- βœ… `.env.example.agent` has every variable from PATTERNS lines 839-867 with documented values +- βœ… All 4 tests in `test_agent_compose.py` pass (the new WARNING-3 fail-fast syntax test included) +- βœ… Plan 29-05 deferred test reactivated: `test_docker_compose_has_agent_worker_consuming_agent_queue` now scans both compose files and finds `docker-compose.agent.yml::worker` + +### Task 2 acceptance + +- βœ… `test_docker_publish_workflow_tags_both_latest_and_version` exists in `test_agent_compose.py` +- βœ… Test correctly FAILS against the original workflow (missing version tag pattern); after extending the workflow the test passes +- βœ… Workflow now emits: `value=latest` (existing), `type=semver,pattern={{version}}` (NEW), `type=semver,pattern={{major}}.{{minor}}` (NEW), `type=ref,event=tag` (NEW), `type=ref,event=branch` (existing), `type=ref,event=pr` (existing), `type=schedule,pattern=...` (existing) +- βœ… Image URL for api is now `ghcr.io/simplicityguy/phaze` (bare-repo via `image_suffix: ""`), matching `docker-compose.agent.yml`'s `image:` line +- βœ… Sidecar images retained at `/audfprint` and `/panako` sub-paths (irrelevant β€” agent.yml builds them locally per D-15) +- βœ… Plan remains `autonomous: true`; no `checkpoint:human-verify` task anywhere +- βœ… Final test sweep: `uv run pytest tests/test_deployment/ tests/test_phase04_gaps.py tests/test_task_split.py tests/test_main_lifespan.py -q` β†’ **22 passed in 1.95s** (no regression) + +### docker-publish.yml verification result + +**`fixed` + `url-realigned`** β€” both the tag pattern AND the image URL needed adjustment. The workflow's `docker/metadata-action` step was missing `type=semver,pattern={{version}}` / `type=ref,event=tag` (so `:v` was never produced on tagged releases), and the api image was published to `ghcr.io/simplicityguy/phaze/api` (sub-path) when `docker-compose.agent.yml` expects the bare-repo URL `ghcr.io/simplicityguy/phaze`. Both fixes landed in commit `93e550b`. + +### docker compose config + +`docker compose -f docker-compose.agent.yml config --quiet` was NOT executable in this worktree because the macOS dev environment lacks the `docker compose` v2 plugin (`docker: 'compose' is not a docker command`). This is the same environmental limitation noted in Plan 29-03's SUMMARY. The structural-parse tests provide equivalent validation: + +``` +uv run python -c "import yaml; d = yaml.safe_load(open('docker-compose.agent.yml').read()); print(sorted(d['services'].keys()))" +['audfprint', 'panako', 'watcher', 'worker'] +``` + +The CI environment runs `docker compose config` separately (Plan 29-06 wires this in), so any compose-parse error that the YAML-parse layer doesn't catch will surface there. + +### Threat-model mitigations delivered + +| Threat ID | Mitigation Delivered | +|-----------|----------------------| +| T-29-04-01 (Spoofing β€” malicious image at `:latest`) | `.env.example.agent` documents the `PHAZE_IMAGE_TAG=v4.0.0` production-pin recommendation; new workflow tag strategy makes `:v` pins actually exist on the registry | +| T-29-04-02 (DATABASE_URL on an agent service) | `test_agent_compose_has_no_postgres_env` asserts no agent service has `DATABASE_URL`, `POSTGRES_*`, or `depends_on: postgres` | +| T-29-04-03 (rw MODELS_PATH allows weight tampering) | Accepted β€” rw is required for D-21 auto-download | +| T-29-04-04 (CA_PATH ro leaked to sidecars) | Accepted β€” CA cert is non-secret | +| T-29-04-05 (SCAN_PATH default exposes wrong directory) | `${SCAN_PATH:?SCAN_PATH required}` fail-fast across all 4 services (WARNING-2 unified); WARNING-3 test enforces | +| T-29-04-06 (`:latest` default pulls unexpected image) | Accepted (Pitfall: D-16); operators advised in `.env.example.agent` | +| T-29-04-07 (workflow stops emitting `:v`) | `test_docker_publish_workflow_tags_both_latest_and_version` is now a permanent CI gate | + +## Known Stubs + +None. Every change is end-to-end functional: + +- `docker-compose.agent.yml` is the production wire format β€” no TODOs. +- `.env.example.agent` ships with placeholders that operators must replace (``, ``, `<32urlsafe>`) β€” these are not stubs but intentional operator-fill-in slots, documented in inline comments. +- `test_agent_compose.py` parses the live `docker-compose.agent.yml` + `.github/workflows/docker-publish.yml`; no mock data. + +## TDD Gate Compliance + +Both tasks followed the RED β†’ GREEN cycle: + +- **Task 1 RED** (`b1c5620`): `test(29-04): add failing YAML-parse tests for docker-compose.agent.yml (RED)`. Pytest run against the un-created `docker-compose.agent.yml` reported all 4 tests failing with `FileNotFoundError`. +- **Task 1 GREEN** (`ae45925`): `feat(29-04): create docker-compose.agent.yml + .env.example.agent (GREEN)`. All 4 tests pass after the agent compose file is written. +- **Task 2 RED** (`0e78658`): `test(29-04): add failing workflow-tag check (WARNING-4 RED)`. The new 5th test fails with `AssertionError: docker-publish.yml tag patterns missing: ["'type=semver,pattern={{version}}' (or 'type=ref,event=tag')"]` against the un-modified workflow. +- **Task 2 GREEN** (`93e550b`): `feat(29-04): extend docker-publish.yml tag strategy + realign api URL (GREEN)`. The 5th test passes after the workflow's metadata-action step gains the missing tag patterns and the api image URL realigns to the bare repo. + +Gate-sequence check: `git log --oneline` shows `b1c5620 test(...)` β†’ `ae45925 feat(...)` β†’ `0e78658 test(...)` β†’ `93e550b feat(...)` for plan 29-04. RED commits precede their paired GREEN commits in the correct order. No REFACTOR commits were needed. + +## Self-Check: PASSED + +**Files claimed to be created β€” verified to exist:** + +- βœ… `docker-compose.agent.yml` β€” FOUND +- βœ… `.env.example.agent` β€” FOUND +- βœ… `tests/test_deployment/test_agent_compose.py` β€” FOUND +- βœ… `.planning/phases/29-deployment-hardening-agents-admin/29-04-SUMMARY.md` β€” FOUND (this file) + +**Files claimed to be modified β€” verified via `git show`:** + +- βœ… `.github/workflows/docker-publish.yml` β€” modified in `93e550b` (matrix + tags) +- βœ… `tests/test_phase04_gaps.py` β€” modified in `ae45925` (deferred test fix) +- βœ… `.planning/phases/29-deployment-hardening-agents-admin/deferred-items.md` β€” modified in `ae45925` (resolved marker) + +**Commits claimed β€” verified via `git log --oneline`:** + +- βœ… `b1c5620` β€” Task 1 RED β€” FOUND +- βœ… `ae45925` β€” Task 1 GREEN β€” FOUND +- βœ… `0e78658` β€” Task 2 RED β€” FOUND +- βœ… `93e550b` β€” Task 2 GREEN β€” FOUND + +**Test count matches plan ``:** 5 new tests in `tests/test_deployment/test_agent_compose.py` (3 from RESEARCH LOCKED + WARNING-3 fail-fast + WARNING-4 docker-publish tag check). All 5 pass. + +**Decision IDs implemented:** D-15 (full), D-16 (full β€” workflow tag strategy verified), D-17 (full β€” root vs agent compose split), D-22 (agent-compose portion), D-23 (`.env.example.agent` portion). Requirement OPS-02 closed. diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-05-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-05-PLAN.md new file mode 100644 index 0000000..b8802bc --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-05-PLAN.md @@ -0,0 +1,453 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 05 +type: execute +wave: 2 +depends_on: [] +files_modified: + - src/phaze/scripts/__init__.py + - src/phaze/scripts/download_models.py + - src/phaze/tasks/_shared/model_bootstrap.py + - src/phaze/tasks/agent_worker.py + - src/phaze/agent_watcher/__main__.py + - scripts/download-models.sh + - tests/test_services/test_model_bootstrap.py + - tests/test_task_split.py +autonomous: true +requirements: [OPS-03] +tags: [phase-29, ops-03, models, bootstrap, v4.0] + +must_haves: + truths: + - "phaze.scripts.download_models.download_to(Path) downloads essentia weights to the target dir (D-21)" + - "phaze.scripts.download_models is invokable as `python -m phaze.scripts.download_models `" + - "phaze.tasks._shared.model_bootstrap.ensure_models_present(models_dir) is a Postgres-free shared bootstrap (D-21)" + - "Empty /models triggers download via ensure_models_present; populated /models is a no-op" + - "Network failure during download raises RuntimeError (which propagates β†’ non-zero exit β†’ restart: unless-stopped retries)" + - "scripts/download-models.sh is a thin bash shim that execs python -m phaze.scripts.download_models" + - "agent_worker startup calls ensure_models_present AFTER whoami_with_retry succeeds (fail-fast auth before downloading)" + - "agent_watcher.__main__ does NOT call ensure_models_present β€” only the worker downloads (WARNING-7 resolution: avoids race on .part file in fresh /models)" + - "The old in-place RuntimeError-on-empty check in agent_worker.startup is REPLACED (not duplicated) by ensure_models_present" + - "model_bootstrap stays Postgres-free (no import of phaze.database, phaze.tasks.session, or sqlalchemy.ext.asyncio) β€” proven by tests/test_task_split.py::test_model_bootstrap_stays_postgres_free" + artifacts: + - path: "src/phaze/scripts/__init__.py" + provides: "scripts package marker" + contains: "" + - path: "src/phaze/scripts/download_models.py" + provides: "Python helper to fetch essentia weights from essentia.upf.edu; idempotent; atomic .part rename" + min_lines: 80 + exports: ["download_to", "CLASSIFIER_MODELS", "GENRE_MODELS"] + - path: "src/phaze/tasks/_shared/model_bootstrap.py" + provides: "ensure_models_present(models_dir) β€” Postgres-free shared bootstrap; logs status, calls download_to on empty" + min_lines: 30 + exports: ["ensure_models_present"] + - path: "src/phaze/tasks/agent_worker.py" + provides: "startup hook replaces in-place RuntimeError with ensure_models_present call AFTER whoami" + contains: "ensure_models_present" + - path: "src/phaze/agent_watcher/__main__.py" + provides: "startup does NOT call ensure_models_present (worker-only download per WARNING-7 resolution); documentation comment present" + contains: "" + - path: "scripts/download-models.sh" + provides: "Bash shim: exec uv run python -m phaze.scripts.download_models" + min_lines: 3 + - path: "tests/test_services/test_model_bootstrap.py" + provides: "3 test cases: empty-dirβ†’download, populatedβ†’no-op, network-failβ†’RuntimeError" + min_lines: 50 + - path: "tests/test_task_split.py" + provides: "Adds test_model_bootstrap_stays_postgres_free β€” parallel subprocess case to test_shared_bootstrap_stays_postgres_free; imports phaze.tasks._shared.model_bootstrap and asserts banned modules absent" + contains: "test_model_bootstrap_stays_postgres_free" + key_links: + - from: "src/phaze/tasks/_shared/model_bootstrap.py::ensure_models_present" + to: "src/phaze/scripts/download_models.py::download_to" + via: "import + invocation on empty-dir branch" + pattern: "from phaze.scripts.download_models import download_to" + - from: "src/phaze/tasks/agent_worker.py::startup" + to: "src/phaze/tasks/_shared/model_bootstrap.py::ensure_models_present" + via: "Step 3a of startup (after whoami)" + pattern: "ensure_models_present\\(Path\\(cfg.models_path\\)\\)" + - from: "tests/test_task_split.py::test_model_bootstrap_stays_postgres_free" + to: "src/phaze/tasks/_shared/model_bootstrap.py" + via: "subprocess import + sys.modules check against banned triple" + pattern: "import phaze.tasks._shared.model_bootstrap" +--- + + +Implement OPS-03's models-auto-download path (D-21). Extract the URL list from `scripts/download-models.sh` into a Python helper `phaze.scripts.download_models`; add a Postgres-free shared bootstrap `phaze.tasks._shared.model_bootstrap.ensure_models_present(models_dir)` that no-ops if `.pb` files exist and downloads on empty; rewire `agent_worker.startup` to call it AFTER `whoami_with_retry` succeeds (fail-fast auth before spending 5min on a 150MB download per RESEARCH `` line 906); rewrite `scripts/download-models.sh` as a thin bash shim that calls the Python module. + +**WARNING-7 resolution (race avoidance):** Only the worker calls `ensure_models_present()`. The watcher does NOT. Rationale: on a fresh file-server host with empty `/models`, simultaneous worker + watcher startup would race on the same `.pb.part` files; even though the atomic `.part β†’ .rename(dest)` pattern is POSIX-atomic per file, duplicate downloads are wasteful and a partial crash mid-stream could leave the wrong agent holding the half-file. The watcher is file-discovery-only (Phase 27 D-22) β€” it does not load `.pb` files; it only dispatches analysis jobs the worker consumes. Worker-must-be-up-first is already true in practice (watcher dispatches are useless without a worker to process the queue), so we let the worker own the download. + +Purpose: OPS-03 requires "Each file server runs `just download-models` once at setup to populate its own local `/models` volume; the application-server image neither downloads nor mounts models." This plan adds the in-container auto-download fallback (D-21) so a fresh file-server host's first `just up-agent` succeeds without manual pre-warming, while keeping `just download-models` operational for operators who prefer to pre-warm. + +Output: New `phaze.scripts.download_models` Python module; new `phaze.tasks._shared.model_bootstrap`; modified `agent_worker.py` startup; rewritten `scripts/download-models.sh`; 3 new model-bootstrap tests; 1 new subprocess import-boundary test added to `tests/test_task_split.py`. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md + + + + +```python + # Step 1: Models check (mirror worker.py:30-39). + models_dir = Path(cfg.models_path) + if not models_dir.is_dir(): + msg = f"Models directory not found: {cfg.models_path}. Run 'just download-models' to populate it." + raise RuntimeError(msg) + pb_files = list(models_dir.glob("*.pb")) + if not pb_files: + msg = f"No .pb model files in {cfg.models_path}. Run 'just download-models' to populate it." + raise RuntimeError(msg) + logger.info("Found %d model files in %s", len(pb_files), cfg.models_path) +``` + + + + + + + +```python +# src/phaze/scripts/download_models.py +def download_to(target_dir: Path) -> None: ... + +# src/phaze/tasks/_shared/model_bootstrap.py +def ensure_models_present(models_dir: Path) -> None: ... +``` + + + + + + + + + Task 1: Create phaze.scripts package with download_models.py + bash shim + test + src/phaze/scripts/__init__.py, src/phaze/scripts/download_models.py, scripts/download-models.sh, tests/test_services/test_model_bootstrap.py + + - `src/phaze/scripts/__init__.py` exists (may be empty). + - `src/phaze/scripts/download_models.py` exports `download_to(target_dir: Path) -> None`, `CLASSIFIER_MODELS: tuple[str, ...]`, `GENRE_MODELS: tuple[str, ...]`. + - `CLASSIFIER_MODELS` is a tuple of exactly 33 string paths (matching `scripts/download-models.sh` lines 17-50 by content). + - `GENRE_MODELS` is `("discogs-effnet-bs64-1",)`. + - `download_to(tmp_path)` (with the URL list mocked via respx) creates `.pb` + `.json` file pairs for each model in the tuple. + - `_download_one(url, dest)` is idempotent: existing `dest.exists()` returns without re-downloading. + - `_download_one` uses `.part` suffix atomic rename pattern (`tmp.rename(dest)`); no half-downloaded files satisfy the idempotency check. + - `python -m phaze.scripts.download_models ` works as a CLI: parses argv[1] as target dir, calls `download_to(Path(target))`. + - `scripts/download-models.sh` is exactly 3-4 lines: shebang + `set -euo pipefail` + `exec uv run python -m phaze.scripts.download_models "${1:-./models}"`. + - `tests/test_services/test_model_bootstrap.py` has 3 test cases covering empty-dirβ†’download, populatedβ†’no-op, and network-failβ†’RuntimeError (these test ensure_models_present which is created in Task 2; Task 1 only writes the file scaffold and the cases that test download_to directly via respx). + - `uv run mypy src/phaze/scripts/` clean. + - `uv run ruff check src/phaze/scripts/ scripts/download-models.sh` clean. + + + - scripts/download-models.sh (full current bash β€” 33-path classifier list + 1 genre model; URL prefixes; existing bash structure) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 8: Models auto-download on agent startup (D-21)" lines 819-919 (literal Python target for download_models.py + model_bootstrap.py + bash shim) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/scripts/download_models.py" lines 309-338 (analog: bash; key pattern: atomic .part rename + httpx.stream) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_services/test_model_bootstrap.py" lines 1077-1090 (3 LOCKED cases) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-21 (full auto-download spec; logs format; failure mode) + - tests/test_services/test_discogs_matcher.py (analog for httpx + tmp_path testing pattern; OR use respx if it's lighter) + - src/phaze/services/discogs_matcher.py (existing httpx wrapper for pattern reference) + + +Create `src/phaze/scripts/__init__.py` as an empty file (with `"""Package containing operational scripts callable via `python -m phaze.scripts.`."""` docstring). + +Write `src/phaze/scripts/download_models.py` per RESEARCH lines 857-901. Module docstring states: "Python helper that fetches the essentia weight files (D-21). The same URL list + SHA manifest the existing bash script uses, exposed as a Python function so both bash and the agent bootstrap can drive the download. Idempotent: skips files that already exist; verifies SHA-256 if provided (deferred to a future plan)." + +Module constants: +```python +_CLASSIFIER_BASE = "https://essentia.upf.edu/models/classifiers" +_GENRE_BASE = "https://essentia.upf.edu/models/music-style-classification/discogs-effnet" + +CLASSIFIER_MODELS: tuple[str, ...] = ( + "mood_acoustic/mood_acoustic-musicnn-msd-2", + "mood_acoustic/mood_acoustic-musicnn-mtt-2", + "mood_acoustic/mood_acoustic-vggish-audioset-1", + # ... ALL 33 paths from scripts/download-models.sh lines 17-50, in the EXACT order +) +GENRE_MODELS: tuple[str, ...] = ("discogs-effnet-bs64-1",) +``` + +Extract the full 33-path list from `scripts/download-models.sh` lines 17-50 (read the bash file; the array literal between `CLASSIFIER_MODELS=(` and `)`). Copy the strings byte-for-byte; order matters for diff-against-bash. + +Helper function `_download_one(url: str, dest: Path) -> None` per RESEARCH lines 880-890: + +```python +def _download_one(url: str, dest: Path) -> None: + if dest.exists(): + return + dest.parent.mkdir(parents=True, exist_ok=True) + tmp = dest.with_suffix(dest.suffix + ".part") + with httpx.stream("GET", url, follow_redirects=True, timeout=60) as response: + response.raise_for_status() + with tmp.open("wb") as fh: + for chunk in response.iter_bytes(chunk_size=64 * 1024): + fh.write(chunk) + tmp.rename(dest) # atomic on POSIX +``` + +Public function `download_to(target_dir: Path) -> None` per RESEARCH lines 893-901: + +```python +def download_to(target_dir: Path) -> None: + target_dir.mkdir(parents=True, exist_ok=True) + for model_path in CLASSIFIER_MODELS: + filename = model_path.rsplit("/", 1)[-1] + _download_one(f"{_CLASSIFIER_BASE}/{model_path}.pb", target_dir / f"{filename}.pb") + _download_one(f"{_CLASSIFIER_BASE}/{model_path}.json", target_dir / f"{filename}.json") + for model in GENRE_MODELS: + _download_one(f"{_GENRE_BASE}/{model}.pb", target_dir / f"{model}.pb") + _download_one(f"{_GENRE_BASE}/{model}.json", target_dir / f"{model}.json") +``` + +Add CLI entry: at module bottom, `if __name__ == "__main__":` block that reads `target = Path(sys.argv[1] if len(sys.argv) > 1 else "./models")`, calls `download_to(target)`. Add `import sys` to the imports. + +No SHA-256 manifest in this plan (deferred per CONTEXT D-21 explicit "future hardening" note in RESEARCH Security Domain). + +Rewrite `scripts/download-models.sh` per RESEARCH lines 914-918 to a thin shim (replace the existing 50+ line file entirely): + +```bash +#!/usr/bin/env bash +# Download essentia ML models for audio analysis. +# Usage: bash scripts/download-models.sh [output_dir] +# output_dir defaults to ./models +# Phase 29: delegates to phaze.scripts.download_models for single-source-of-truth URL list. +set -euo pipefail +exec uv run python -m phaze.scripts.download_models "${1:-./models}" +``` + +Use `exec` (not `cd && ...`) so signals + exit code pass through cleanly. The existing `just download-models` recipe (`bash scripts/download-models.sh models`) keeps working. + +Write `tests/test_services/test_model_bootstrap.py` with the 3 LOCKED cases per PATTERNS lines 1077-1090. + +Test 1 β€” `test_ensure_models_present_empty_dir_downloads(tmp_path, monkeypatch)`: monkeypatch `phaze.scripts.download_models.download_to` with a `MagicMock` that creates a sentinel `.pb` file. Call `ensure_models_present(tmp_path)`. Assert mock was called with `tmp_path`. Assert log captured (via caplog) the "downloading essentia weights" line at INFO level. + +Test 2 β€” `test_ensure_models_present_populated_no_op(tmp_path)`: create a sentinel file `(tmp_path / "test_model.pb").touch()`. Call `ensure_models_present(tmp_path)`. Assert NO call to `download_to`. Assert log captured "Models present (1 weight files" at INFO level. + +Test 3 β€” `test_ensure_models_present_download_failure(tmp_path, monkeypatch)`: monkeypatch `download_to` to raise `httpx.HTTPError("network down")`. Call `ensure_models_present(tmp_path)` inside `pytest.raises(RuntimeError, match="Model download failed")`. Assert the original exception is wrapped (`__cause__` is the httpx error). + +For mypy: ensure `httpx.stream` is typed correctly. Add `import httpx`, `from pathlib import Path`, `import sys` to the module imports. + + + uv run pytest tests/test_services/test_model_bootstrap.py -x -q && uv run python -c "from phaze.scripts.download_models import CLASSIFIER_MODELS, GENRE_MODELS; assert len(CLASSIFIER_MODELS) == 33; assert len(GENRE_MODELS) == 1; print('ok')" + + +- `src/phaze/scripts/download_models.py` exists with `download_to`, `CLASSIFIER_MODELS` (33 items), `GENRE_MODELS` (1 item) +- `scripts/download-models.sh` is a 4-line bash shim +- `python -m phaze.scripts.download_models ` is invokable +- `uv run mypy src/phaze/scripts/` clean +- `uv run ruff check src/phaze/scripts/ scripts/download-models.sh tests/test_services/test_model_bootstrap.py` clean + + + + + Task 2: Create model_bootstrap shared module + wire into agent_worker.startup ONLY (watcher does NOT auto-download per WARNING-7) + add subprocess import-boundary test (BLOCKER-1) + src/phaze/tasks/_shared/model_bootstrap.py, src/phaze/tasks/agent_worker.py, src/phaze/agent_watcher/__main__.py, tests/test_task_split.py + + - `src/phaze/tasks/_shared/model_bootstrap.py` exists with module docstring declaring the IMPORT-BOUNDARY INVARIANT (mirrors `tasks/_shared/agent_bootstrap.py` style) AND referencing the new dedicated subprocess test by name. + - Module imports ONLY: stdlib (`logging`, `pathlib`), `phaze.scripts.download_models.download_to`. No `phaze.database`, no `sqlalchemy.ext.asyncio`, no `phaze.tasks.session`. + - `ensure_models_present(models_dir: Path) -> None` exists with the body from RESEARCH lines 838-853 (glob `.pb`, log status, call `download_to` on empty, wrap exception in RuntimeError). + - `src/phaze/tasks/agent_worker.py::startup` no longer contains the in-place `RuntimeError("Models directory not found ...")` / `RuntimeError("No .pb model files ...")` checks at lines 88-97. + - `agent_worker.startup` calls `ensure_models_present(Path(cfg.models_path))` exactly once, placed AFTER `await _whoami_with_retry(client)` (current line 104) and BEFORE the fingerprint orchestrator construction (current line 124). New ordering: Step 2 client β†’ Step 3 whoami β†’ Step 3a ensure_models_present β†’ Step 4 queue guard β†’ Step 5 fingerprint β†’ Step 6 pool. + - `src/phaze/agent_watcher/__main__.py::main` does **NOT** call `ensure_models_present` (WARNING-7 race-avoidance resolution). The watcher's startup is unchanged from Phase 27 except for a documentation comment at the post-whoami site explaining the worker-only-download choice. + - **BLOCKER-1:** `tests/test_task_split.py` gains a new test function `test_model_bootstrap_stays_postgres_free()` that mirrors the existing `test_shared_bootstrap_stays_postgres_free` subprocess pattern (currently lines 161-199) but imports `phaze.tasks._shared.model_bootstrap` instead of `phaze.tasks._shared.agent_bootstrap`. Asserts the banned triple `("phaze.database", "phaze.tasks.session", "sqlalchemy.ext.asyncio")` is absent from `sys.modules` after the import. + - `uv run mypy src/phaze/tasks/_shared/model_bootstrap.py src/phaze/tasks/agent_worker.py src/phaze/agent_watcher/__main__.py` clean. + - All existing agent_worker.startup tests pass (any test that mocks the old `RuntimeError` paths needs updating β€” search and adjust). + - `uv run pytest tests/test_task_split.py -x -q` passes, including the new `test_model_bootstrap_stays_postgres_free`. + + + - src/phaze/tasks/agent_worker.py (full file β€” lines 69-138 is the startup hook; lines 88-97 is the old models check to REPLACE; line 104 is whoami; line 124 is fingerprint construction) + - src/phaze/agent_watcher/__main__.py (full file β€” find the `main()` body; locate the post-`whoami_with_retry` line where a documentation-only comment will be inserted noting that ensure_models_present is intentionally NOT called) + - src/phaze/tasks/_shared/agent_bootstrap.py (analog for `_shared/` module structure + docstring banner) + - tests/test_task_split.py (existing subprocess pattern at lines 161-199 β€” `test_shared_bootstrap_stays_postgres_free` is the direct analog for the new test; copy its structure verbatim, change the import target only) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 8 β†’ model_bootstrap.py" lines 822-853 (literal target body) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 8 β†’ invocation order" lines 903-910 (whoami first, then ensure_models_present) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/tasks/_shared/model_bootstrap.py" lines 282-306 (analog: agent_bootstrap.py) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/tasks/agent_worker.py" lines 623-687 (concrete diff: replace models check + reorder) + - tests/test_tasks/test_agent_worker_startup.py (if it exists; otherwise check tests/ for any agent_worker startup tests that might break) + + +Write `src/phaze/tasks/_shared/model_bootstrap.py` per RESEARCH lines 822-853 + PATTERNS lines 282-306. Module docstring (mirror `agent_bootstrap.py:1-21`): + +```python +"""Auto-download essentia weights when /models is empty (Phase 29 D-21). + +IMPORT-BOUNDARY (extends Phase 26 D-25 + Phase 27 D-22): + Postgres-free. Imports: stdlib + phaze.scripts.download_models only. + Verified by tests/test_task_split.py::test_model_bootstrap_stays_postgres_free + (Phase 29 BLOCKER-1: explicit subprocess case for this module, parallel + to the existing test_shared_bootstrap_stays_postgres_free which covers + agent_bootstrap.py only). + +Race avoidance (Phase 29 WARNING-7): + Only phaze.tasks.agent_worker.startup invokes ensure_models_present. + phaze.agent_watcher.__main__ does NOT β€” the watcher does file discovery + only and cannot dispatch analysis jobs until the worker is up anyway, + so we let the worker own the download and avoid a .part-file race on + fresh /models volumes. + +Public exports: + - ensure_models_present(models_dir): idempotent .pb-file check + download-on-empty +""" +from __future__ import annotations + +import logging +from pathlib import Path + +from phaze.scripts.download_models import download_to + +logger = logging.getLogger(__name__) + + +def ensure_models_present(models_dir: Path) -> None: + """Skip if any .pb files exist; else download. Raises RuntimeError on failure.""" + pb_files = list(models_dir.glob("*.pb")) + if pb_files: + logger.info("Models present (%d weight files at %s)", len(pb_files), models_dir) + return + logger.info( + "%s is empty; downloading essentia weights (~150MB, takes 2-5min on first start)...", + models_dir, + ) + try: + download_to(models_dir) + except Exception as exc: + msg = f"Model download failed: {exc}" + raise RuntimeError(msg) from exc + logger.info("Models downloaded successfully to %s", models_dir) +``` + +Modify `src/phaze/tasks/agent_worker.py::startup`: + +1. Add `from phaze.tasks._shared.model_bootstrap import ensure_models_present` to the imports (alphabetically with other `phaze.tasks._shared` imports near line 52-57). + +2. DELETE lines 88-97 (the entire old models check block from `# Step 1: Models check (mirror worker.py:30-39).` through `logger.info("Found %d model files in %s", len(pb_files), cfg.models_path)`). + +3. After the line `identity = await _whoami_with_retry(client)` (was line 104), insert: + +```python + + # Step 3a (Phase 29 D-21): ensure essentia weights present; download on empty. + # Placed AFTER whoami so auth fails fast (~60s) instead of after 5min download. + # WORKER-ONLY (Phase 29 WARNING-7): the watcher does not call this β€” only the + # worker owns the download to avoid a .part-file race on fresh /models volumes. + ensure_models_present(Path(cfg.models_path)) +``` + +4. Renumber the Step 4 comment from `# Step 4: Queue-name mismatch guard (Pitfall 1).` (unchanged content β€” just the comment numbering survives). + +Modify `src/phaze/agent_watcher/__main__.py`: + +**WARNING-7 resolution: do NOT add `ensure_models_present` here.** Locate the post-`whoami_with_retry(client)` line. Insert ONLY a documentation comment (no code change): + +```python + + # Phase 29 D-21 + WARNING-7: the watcher intentionally does NOT call + # ensure_models_present. The worker (phaze.tasks.agent_worker.startup) + # owns the download on a fresh /models volume; the watcher cannot + # dispatch analysis jobs without a worker anyway, and having both + # entry points race on .part files in /models would be wasteful. + # If a future plan needs models on the watcher side, gate via a + # filelock and only one entry point downloads. +``` + +Do NOT import `ensure_models_present` in `__main__.py`. Do NOT add `Path` if it's not already imported for other reasons. + +If any existing test under `tests/test_tasks/test_agent_worker*.py` mocks the OLD RuntimeError paths (`"Models directory not found"` or `"No .pb model files"`), UPDATE the test to mock `ensure_models_present` instead OR patch the new module-level import. Search via `rg "Models directory not found|No .pb model files|test_agent_worker.*models" tests/` to find these. + +**BLOCKER-1 β€” Extend `tests/test_task_split.py`:** Append a new test function `test_model_bootstrap_stays_postgres_free()` after the existing `test_shared_bootstrap_stays_postgres_free` (currently at lines 161-199). Mirror the existing function byte-for-byte EXCEPT: + +- Update the docstring: + +```python +def test_model_bootstrap_stays_postgres_free() -> None: + """Phase 29 D-21 invariant: phaze.tasks._shared.model_bootstrap is Postgres-free. + + Parallel to test_shared_bootstrap_stays_postgres_free (which covers + agent_bootstrap.py only). The model_bootstrap module imports: + - stdlib (logging, pathlib) + - phaze.scripts.download_models (which imports httpx only) + + None of those pull in phaze.database, phaze.tasks.session, or + sqlalchemy.ext.asyncio. This test fails CI if the model_bootstrap module + is later extended with a Postgres-touching import (e.g., to track + download progress in the DB). + """ +``` + +- Change the subprocess script's import line from `import phaze.tasks._shared.agent_bootstrap` to `import phaze.tasks._shared.model_bootstrap`. + +- Keep the same env-var setup (`PHAZE_ROLE`, `PHAZE_AGENT_API_URL`, `PHAZE_AGENT_TOKEN`, `PHAZE_AGENT_QUEUE`, `PHAZE_AGENT_SCAN_ROOTS`, `PHAZE_REDIS_URL`) for parity even though model_bootstrap itself does not require them. + +- Keep the same banned triple `("phaze.database", "phaze.tasks.session", "sqlalchemy.ext.asyncio")`. + +- Keep the same subprocess shape (`subprocess.run([sys.executable, "-c", script], capture_output=True, text=True, timeout=20, check=False)` with `# noqa: S603`). + +- Keep the same assertion shape: `assert result.returncode == 0, f"model_bootstrap import contaminated sys.modules:\nstdout={result.stdout}\nstderr={result.stderr}"`. + +Verification: After the changes, the existing `tests/test_task_split.py::test_agent_worker_does_not_import_phaze_database` and `::test_shared_bootstrap_stays_postgres_free` tests must continue to pass. The new `test_model_bootstrap_stays_postgres_free` is additive. + + + uv run pytest tests/test_services/test_model_bootstrap.py tests/test_task_split.py -x -q && uv run mypy src/phaze/tasks/_shared/model_bootstrap.py src/phaze/tasks/agent_worker.py src/phaze/agent_watcher/__main__.py + + +- `src/phaze/tasks/_shared/model_bootstrap.py` exists with the documented public function + import-boundary banner referencing the new subprocess test +- `src/phaze/tasks/agent_worker.py::startup` no longer contains the in-place RuntimeError checks; calls `ensure_models_present(Path(cfg.models_path))` after whoami +- `src/phaze/agent_watcher/__main__.py::main` does NOT call `ensure_models_present` (worker-only download); documentation comment present at the post-whoami site +- All 3 tests in `test_model_bootstrap.py` pass +- `tests/test_task_split.py` has the new `test_model_bootstrap_stays_postgres_free` function and ALL 4 subprocess tests pass (no Postgres leak) +- `uv run mypy` clean +- `uv run ruff check` clean across touched files + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| essentia.upf.edu HTTPS server β†’ agent file-server host | model weights download crosses public internet | +| /models bind mount (rw) β†’ agent containers | downloaded weights persist on host | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-05-01 | Tampering | MITM during model download | mitigate (partial) | URLs are HTTPS (essentia.upf.edu) β€” public CA chain verifies the server cert. Future hardening: SHA-256 manifest verification (RESEARCH Security Domain notes this as a future plan) | +| T-29-05-02 | DoS | network failure during download blocks agent startup indefinitely | mitigate | httpx.stream has `timeout=60` per chunk; `RuntimeError` propagates β†’ container exits non-zero β†’ `restart: unless-stopped` retries on saner cadence (~5s default). Operator can pre-warm via `just download-models` | +| T-29-05-03 | Information Disclosure | half-downloaded .pb file satisfies idempotency check next time | mitigate | atomic `.part` β†’ `.rename(dest)` POSIX-atomic write pattern (RESEARCH line 890); a crash mid-download leaves only the `.part` file which is NOT counted by `models_dir.glob("*.pb")` | +| T-29-05-04 | Tampering | malicious binary uploaded to essentia.upf.edu host | accept (out-of-scope) | We trust the upstream essentia maintainers; future hardening with SHA manifests in pyproject.toml would mitigate. v4.0 scope is single-user home server; risk is low | +| T-29-05-05 | Operational | first agent boot takes 5min, operator thinks it's broken | mitigate | log line at INFO: "downloading essentia weights (~150MB, takes 2-5min on first start)..." surfaces in `docker compose logs worker`; documented in deployment.md (Plan 08) | +| T-29-05-06 | Operational | model_bootstrap drag in Postgres dependency, breaks agent isolation | mitigate | Module's IMPORT-BOUNDARY INVARIANT banner + new `tests/test_task_split.py::test_model_bootstrap_stays_postgres_free` covers `phaze.tasks._shared.model_bootstrap` explicitly (Phase 29 BLOCKER-1 resolution) β€” the existing `test_shared_bootstrap_stays_postgres_free` only imports agent_bootstrap and would not catch a future `phaze.database` leak into model_bootstrap | +| T-29-05-07 | Operational (race) | worker + watcher both call ensure_models_present on a fresh /models volume β†’ race on `.pb.part` files; wasted bandwidth + half-files if one crashes mid-download | mitigate (Phase 29 WARNING-7) | Only the worker calls `ensure_models_present`. The watcher's `__main__.py` documents this choice in a code comment. Atomic `.part β†’ rename` would prevent silent corruption even in a race, but worker-only-download is the cleaner invariant and matches the actual dependency (watcher cannot dispatch jobs without worker) | + + + +- `uv run pytest tests/test_services/test_model_bootstrap.py tests/test_task_split.py -x -q` β€” all green (model_bootstrap 3 tests + task_split 4 tests including the new `test_model_bootstrap_stays_postgres_free`) +- `uv run python -c "from phaze.scripts.download_models import CLASSIFIER_MODELS, GENRE_MODELS; assert len(CLASSIFIER_MODELS) == 33"` β€” passes +- `bash scripts/download-models.sh --help 2>&1` exits non-zero gracefully (the shim doesn't have `--help` but the Python module can be invoked without args to use the default) +- All existing tests under `tests/test_tasks/` and `tests/test_agent_watcher/` pass +- Watcher startup test (if any) confirms watcher does NOT log "downloading essentia weights" line β€” only the worker does + + + +- OPS-03 fully closed: in-container auto-download on empty `/models` (worker-driven); `just download-models` still works for pre-warming +- D-21 fully implemented with WARNING-7 race-avoidance resolution +- 3 new model-bootstrap tests + 1 new subprocess import-boundary test (BLOCKER-1 resolution) +- Postgres-free invariant maintained AND explicitly verified for `phaze.tasks._shared.model_bootstrap` and `phaze.scripts.download_models` +- Bash shim is a thin wrapper; URL list lives in one place (Python) + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-05-SUMMARY.md` when both tasks complete. Summary must list: new files, modified files, the URL count migration (33+1 from bash to Python), decision IDs implemented (D-21), the WARNING-7 watcher-no-download choice (with code-comment location), the BLOCKER-1 new subprocess test addition, and any existing tests that needed updating due to the old-error-message removal. + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-05-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-05-SUMMARY.md new file mode 100644 index 0000000..1869638 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-05-SUMMARY.md @@ -0,0 +1,212 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 05 +subsystem: deployment +tags: [phase-29, ops-03, d-21, models, bootstrap, agent-worker, v4.0] + +# Dependency graph +requires: + - phase: 27-watcher-service-user-initiated-scan + provides: phaze.tasks._shared.agent_bootstrap (analog for _shared/ module shape + import-boundary banner) + - phase: 26-task-code-reorg-http-backed-agent-worker + provides: phaze.tasks.agent_worker.startup hook + D-25 subprocess import-boundary invariant pattern +provides: + - "phaze.scripts.download_models β€” single-source-of-truth URL list (33 classifier + 1 genre model) + download_to() + python -m CLI" + - "phaze.tasks._shared.model_bootstrap.ensure_models_present β€” Postgres-free idempotent .pb-glob + auto-download orchestration" + - "agent_worker.startup wiring: ensure_models_present invoked AFTER /whoami (auth-fail-fast before download)" + - "agent_watcher WARNING-7 documentation comment: worker-only download avoids .part-file race" + - "tests/test_task_split.py::test_model_bootstrap_stays_postgres_free β€” BLOCKER-1 subprocess import-boundary case" +affects: [29-04 docker-compose-agent, 29-08 deployment-doc-and-justfile] + +# Tech tracking +tech-stack: + added: [] # zero new pip dependencies; httpx already in deps + patterns: + - "Atomic file-download via .part suffix + tmp.rename(dest) β€” crash-safe per-file idempotency (T-29-05-03)" + - "Bash-shim-delegates-to-python-module: single source of truth for URL lists callable from both bash and Python" + - "Import-boundary subprocess test parallel to agent_bootstrap pattern β€” covers each new _shared/ module" + +key-files: + created: + - src/phaze/scripts/__init__.py + - src/phaze/scripts/download_models.py + - src/phaze/tasks/_shared/model_bootstrap.py + - tests/test_services/test_model_bootstrap.py + modified: + - scripts/download-models.sh (rewritten 6-line shim; was 102-line bash) + - src/phaze/tasks/agent_worker.py (drop in-place RuntimeError check; call ensure_models_present after whoami) + - src/phaze/agent_watcher/__main__.py (add WARNING-7 documentation comment only) + - tests/test_task_split.py (add test_model_bootstrap_stays_postgres_free) + - tests/test_phase04_gaps.py (replace 2 OLD fail-fast tests with ordering + propagation tests) + +key-decisions: + - "URL list lives in Python (CLASSIFIER_MODELS tuple), not bash; bash shim execs `uv run python -m phaze.scripts.download_models`" + - "Atomic .part rename pattern (POSIX-atomic per file) β€” crash mid-stream leaves only .part which the glob does NOT match" + - "ensure_models_present is invoked AFTER whoami_with_retry (Step 3a) β€” auth fails fast in ~63s instead of after 5min download (RESEARCH line 906)" + - "WARNING-7 resolution: ONLY the worker calls ensure_models_present; the watcher documents this choice in a code comment but does NOT call it" + - "BLOCKER-1: dedicated subprocess case test_model_bootstrap_stays_postgres_free added to tests/test_task_split.py β€” parallel structure to test_shared_bootstrap_stays_postgres_free" + - "RuntimeError wrap of underlying download exception preserves __cause__ chain (test_ensure_models_present_download_failure asserts this) β€” container exits non-zero β†’ restart: unless-stopped retries (T-29-05-02)" + +patterns-established: + - "Pattern: bash-shim-delegates-to-python β€” `exec uv run python -m phaze.scripts. \"$@\"` makes the bash script a 6-line thin wrapper while keeping the operator-facing `just download-models` recipe operational" + - "Pattern: import-boundary subprocess test per new _shared/ module β€” each new Postgres-free module under phaze.tasks._shared/ gets its own dedicated test_*_stays_postgres_free function so a future regression in one module is not masked by another" + +requirements-completed: [OPS-03] + +# Metrics +duration: ~25min +completed: 2026-05-16 +--- + +# Phase 29 Plan 05: Models Setup β€” Auto-Download on Empty /models Summary + +**Extracts the essentia model URL list (33 classifier + 1 genre = 68 files) from bash into a Python helper, wires a Postgres-free auto-download bootstrap into agent_worker.startup AFTER /whoami, and adds the BLOCKER-1 subprocess import-boundary test.** + +## Performance + +- **Duration:** ~25 min +- **Started:** 2026-05-16T21:35Z (approx) +- **Completed:** 2026-05-16T22:00Z (approx) +- **Tasks:** 2 (both auto, both TDD) +- **Files created:** 4 (scripts/__init__.py, scripts/download_models.py, _shared/model_bootstrap.py, tests/test_model_bootstrap.py) +- **Files modified:** 5 (download-models.sh, agent_worker.py, agent_watcher/__main__.py, test_task_split.py, test_phase04_gaps.py) +- **Tests added/modified:** 6 new + 2 replacing 2 old = 8 net new behaviour-locking tests + +## Accomplishments + +- **OPS-03 fully closed.** A fresh file-server host's first `just up-agent` now succeeds without manual pre-warming β€” the worker container auto-downloads ~150MB of essentia weights to its local `/models` volume on first start. `just download-models` still works for operators who prefer to pre-warm. +- **D-21 implemented with WARNING-7 race-avoidance.** Only the worker (`phaze.tasks.agent_worker.startup`) calls `ensure_models_present`; the watcher documents the intentional non-call so two parallel containers on a fresh /models volume cannot race on `.pb.part` files. +- **BLOCKER-1 resolved.** `tests/test_task_split.py::test_model_bootstrap_stays_postgres_free` is now a hard gate covering `phaze.tasks._shared.model_bootstrap`; a future regression that imports `phaze.database` or `sqlalchemy.ext.asyncio` into the model_bootstrap chain will trip CI even if the existing `test_shared_bootstrap_stays_postgres_free` (which covers agent_bootstrap.py only) stays green. +- **Bash β†’ Python URL-list migration.** The 33 classifier paths + 1 genre model that previously lived in `scripts/download-models.sh` lines 16-55 now live in `src/phaze/scripts/download_models.py::CLASSIFIER_MODELS` + `GENRE_MODELS`. The bash script is a 6-line `exec uv run python -m phaze.scripts.download_models "$@"` shim. Single source of truth. +- **Auth-fail-fast ordering preserved.** `ensure_models_present` is invoked as Step 3a β€” AFTER `whoami_with_retry` succeeds β€” so a bad token / unreachable app server fails in ~60s instead of after a 5-minute 150MB download (RESEARCH `` line 906). + +## Task Commits + +Each task was committed atomically: + +1. **Task 1: phaze.scripts package with download_models.py + bash shim + test scaffold** β€” `6800931` (feat) +2. **Task 2: model_bootstrap shared module + agent_worker.startup rewire + watcher WARNING-7 comment + BLOCKER-1 subprocess test** β€” `4ccd283` (feat) + +## Files Created/Modified + +### Created + +- `src/phaze/scripts/__init__.py` β€” Package marker for `python -m phaze.scripts.` invocations. +- `src/phaze/scripts/download_models.py` β€” `download_to(target_dir)` public entry; `_download_one(url, dest)` with `.part`-atomic rename; `CLASSIFIER_MODELS` tuple (33 items) + `GENRE_MODELS` tuple (1 item); CLI entry at module bottom. +- `src/phaze/tasks/_shared/model_bootstrap.py` β€” `ensure_models_present(models_dir)` Postgres-free function with the IMPORT-BOUNDARY INVARIANT banner naming the new subprocess test. Imports stdlib + `phaze.scripts.download_models` only. +- `tests/test_services/test_model_bootstrap.py` β€” Six tests: three LOCKED `ensure_models_present` cases (emptyβ†’download, populatedβ†’no-op, network-failβ†’RuntimeError with `__cause__` chain) + three `download_to` / `_download_one` cases (count assertion, idempotency, .pb+.json pair generation). + +### Modified + +- `scripts/download-models.sh` β€” Replaced the 102-line bash script (custom `download_file` function, manual counter, two for-loops) with a 7-line shim: shebang + 4 comment lines + `set -euo pipefail` + `exec uv run python -m phaze.scripts.download_models "${1:-./models}"`. `exec` passes signals + exit code through cleanly. +- `src/phaze/tasks/agent_worker.py` β€” Added `from phaze.tasks._shared.model_bootstrap import ensure_models_present`. Deleted the in-place RuntimeError checks (old lines 88-97). Inserted `ensure_models_present(Path(cfg.models_path))` as Step 3a (after `_whoami_with_retry`, before the queue-mismatch guard). +- `src/phaze/agent_watcher/__main__.py` β€” Inserted a documentation-only comment block at the post-`whoami_with_retry` site explaining the WARNING-7 race-avoidance decision (worker owns the download; watcher intentionally does not). No code change. +- `tests/test_task_split.py` β€” Appended `test_model_bootstrap_stays_postgres_free` subprocess case mirroring the existing `test_shared_bootstrap_stays_postgres_free` structure (same env vars, same banned triple `{phaze.database, phaze.tasks.session, sqlalchemy.ext.asyncio}`, import target changed to `phaze.tasks._shared.model_bootstrap`). +- `tests/test_phase04_gaps.py` β€” Replaced two OLD fail-fast model-dir RuntimeError tests (`test_agent_startup_raises_if_models_dir_missing`, `test_agent_startup_raises_if_no_pb_files`) with two new tests matching the new auto-download semantics: `test_agent_startup_invokes_ensure_models_present_after_whoami` (ordering invariant) and `test_agent_startup_propagates_ensure_models_present_failure` (propagation invariant). The OLD error-message strings are no longer asserted anywhere in the test tree. + +## Decisions Made + +- **Idempotency check anchored on `*.pb` glob.** A populated dir with even one `.pb` file short-circuits; a dir with only `.part` files (from a crashed previous run) is treated as empty and retried. This makes recovery from a partial download trivial β€” re-run produces the same end state. +- **Top-level `Exception` catch in `ensure_models_present`.** The wrap is intentional: any failure from `download_to` (httpx error, OSError, etc.) becomes `RuntimeError("Model download failed: …")` with the original chained as `__cause__`. The test asserts the chain explicitly. This gives the SAQ event loop a single error class to surface and lets the container exit non-zero so `restart: unless-stopped` retries. +- **`httpx.stream` with `timeout=60` + 64KiB chunks.** Matches RESEARCH `` line 890. Per-chunk timeout (not total-download) prevents an indefinite hang on a stalled network without making large files un-downloadable. +- **Watcher WARNING-7 resolution = documentation comment + zero code change.** The plan considered an explicit `# do not call ensure_models_present here` note vs. a flock-coordinated dual-call. The comment-only approach is the minimum-viable resolution and matches the actual dependency (the watcher cannot dispatch analysis jobs without a worker anyway), so worker-owns-download is operationally correct. + +## Deviations from Plan + +### Auto-fixed Issues + +**1. [Rule 3 - Blocking] Ruff `TC003` + `RUF100` warnings on test imports** + +- **Found during:** Task 1 verification (ruff check). +- **Issue:** `pathlib.Path` was imported at module-top in `tests/test_services/test_model_bootstrap.py` but used only in type hints; the `# noqa: ARG001` comment on the `boom` helper was flagged as targeting a non-enabled rule (the per-file-ignore in pyproject.toml already exempts tests/ from ARG001). +- **Fix:** Moved `Path` into a `TYPE_CHECKING` block; removed the unused `# noqa: ARG001` comment. +- **Files modified:** tests/test_services/test_model_bootstrap.py +- **Verification:** `uv run ruff check src/phaze/scripts/ tests/test_services/test_model_bootstrap.py` β†’ All checks passed. +- **Committed in:** `6800931` (part of Task 1). + +**2. [Rule 3 - Blocking] Ruff `I001` import order in `download_models.py`** + +- **Found during:** Task 1 verification (ruff check). +- **Issue:** Initial draft had `import sys` then `from pathlib import Path` (alphabetical by full module name); ruff's isort variant prefers `from … import …` before `import …` when alphabetically ordered. +- **Fix:** `uv run ruff check --fix` auto-reorganized the imports. Confirmed mypy + tests still green. +- **Files modified:** src/phaze/scripts/download_models.py +- **Verification:** `uv run ruff check src/phaze/scripts/` β†’ All checks passed. +- **Committed in:** `6800931` (part of Task 1). + +**3. [Rule 1 - Bug] Pre-commit ruff-format auto-reformatted `tests/test_phase04_gaps.py`** + +- **Found during:** Task 2 commit (pre-commit hook). +- **Issue:** New test docstrings + assertion messages exceeded 150-char line length; ruff-format wrapped them. +- **Fix:** Re-staged the auto-reformatted file and re-ran `git commit`. Second attempt passed. +- **Files modified:** tests/test_phase04_gaps.py (whitespace-only fixups) +- **Verification:** Pre-commit ran clean on second attempt. +- **Committed in:** `4ccd283` (part of Task 2). + +## Deferred Issues + +The following pre-existing test failure was discovered during Task 2 verification. It is **out of scope** for Plan 29-05 (does not touch any file this plan modifies, root cause traces to Plan 29-03's app-server compose hardening): + +- **`tests/test_phase04_gaps.py::test_docker_compose_has_agent_worker_consuming_agent_queue`** β€” Asserts the root `docker-compose.yml` contains a service that runs `uv run saq phaze.tasks.agent_worker.settings` with `PHAZE_ROLE=agent`. Plan 29-03 removed the agent-worker block from root compose (app-server-only invariant); Plan 29-04 (parallel wave) creates `docker-compose.agent.yml` where the agent-worker now lives. This test must be updated by Plan 29-04 (or a follow-on plan) to scan both compose files. Logged in `.planning/phases/29-deployment-hardening-agents-admin/deferred-items.md`. + +## Verification + +### Task 1 acceptance + +- βœ… `src/phaze/scripts/__init__.py` exists (28-char docstring) +- βœ… `src/phaze/scripts/download_models.py` exports `download_to`, `_download_one`, `CLASSIFIER_MODELS`, `GENRE_MODELS` +- βœ… `CLASSIFIER_MODELS` is a tuple of exactly 33 strings β€” `assert len(CLASSIFIER_MODELS) == 33` passes +- βœ… `GENRE_MODELS == ("discogs-effnet-bs64-1",)` +- βœ… `_download_one(url, dest)` is idempotent: existing `dest.exists()` returns without re-downloading (test_download_one_is_idempotent_when_dest_exists asserts httpx.stream is never called) +- βœ… `.part` suffix atomic rename pattern present in `_download_one` body +- βœ… `python -m phaze.scripts.download_models ` works via CLI block at module bottom +- βœ… `scripts/download-models.sh` is exactly 7 lines (shebang + 4 comment lines + `set -euo pipefail` + `exec uv run python -m phaze.scripts.download_models "${1:-./models}"`) β€” matches the `` block in 29-05-PLAN.md verbatim (shebang + usage + delegation note + 2 functional lines) +- βœ… `uv run mypy src/phaze/scripts/` β†’ Success: no issues found in 2 source files +- βœ… `uv run ruff check src/phaze/scripts/ tests/test_services/test_model_bootstrap.py` β†’ All checks passed + +### Task 2 acceptance + +- βœ… `src/phaze/tasks/_shared/model_bootstrap.py` exists with IMPORT-BOUNDARY INVARIANT banner naming `tests/test_task_split.py::test_model_bootstrap_stays_postgres_free` +- βœ… Module imports: `logging`, `pathlib` (TYPE_CHECKING), `phaze.scripts.download_models.download_to`. No `phaze.database`, no `sqlalchemy.ext.asyncio`, no `phaze.tasks.session`. +- βœ… `ensure_models_present` body matches RESEARCH lines 838-853 (glob `.pb`, log status, call `download_to` on empty, wrap exception in RuntimeError) +- βœ… `agent_worker.py::startup` no longer contains the in-place `RuntimeError("Models directory not found ...")` / `RuntimeError("No .pb model files ...")` checks +- βœ… `agent_worker.startup` calls `ensure_models_present(Path(cfg.models_path))` exactly once, AFTER `await _whoami_with_retry(client)` and BEFORE the queue-mismatch guard +- βœ… `agent_watcher/__main__.py::main` does NOT call `ensure_models_present`; the documentation comment is present at the post-whoami site +- βœ… `tests/test_task_split.py::test_model_bootstrap_stays_postgres_free` exists, mirrors `test_shared_bootstrap_stays_postgres_free` structure, imports `phaze.tasks._shared.model_bootstrap`, asserts banned-triple absence +- βœ… `uv run mypy src/phaze/tasks/_shared/model_bootstrap.py src/phaze/tasks/agent_worker.py src/phaze/agent_watcher/__main__.py` β†’ Success +- βœ… All 6 tests in `test_model_bootstrap.py` pass +- βœ… All 4 subprocess tests in `test_task_split.py` pass (including new `test_model_bootstrap_stays_postgres_free`) +- βœ… Test sweep: `tests/test_tasks/test_agent_startup_banner.py tests/test_phase04_gaps.py tests/test_agent_watcher/test_main.py tests/test_services/test_model_bootstrap.py tests/test_task_split.py` β†’ 41 passed, 1 deselected (pre-existing Plan 29-03 failure documented above) + +### Threat-model mitigations delivered + +| Threat ID | Mitigation Delivered | +|-----------|----------------------| +| T-29-05-01 (MITM during model download) | HTTPS-only URLs (essentia.upf.edu); httpx public CA chain verifies cert. Future SHA-256 manifest deferred per plan. | +| T-29-05-02 (network-failure DoS during boot) | RuntimeError wraps `download_to` failures β†’ non-zero exit β†’ restart: unless-stopped retries. test_ensure_models_present_download_failure asserts the wrap + __cause__ chain. | +| T-29-05-03 (half-downloaded .pb satisfies idempotency next time) | `.part` atomic rename pattern in `_download_one`. test_download_one_is_idempotent_when_dest_exists confirms a present `.pb` file short-circuits without touching the network. | +| T-29-05-04 (malicious essentia.upf.edu upload) | Accepted out-of-scope per plan (v4.0 single-user scope). | +| T-29-05-05 (5min boot looks like a hang) | INFO log line "downloading essentia weights (~150MB, takes 2-5min on first start)..." surfaces in `docker compose logs worker`. | +| T-29-05-06 (model_bootstrap drags in Postgres) | New test_model_bootstrap_stays_postgres_free subprocess case is a hard CI gate. | +| T-29-05-07 (worker+watcher race on /models) | WARNING-7 resolution: only worker calls ensure_models_present; watcher documents the non-call. | + +## Self-Check: PASSED + +**Files created β€” verified to exist:** + +- βœ… `src/phaze/scripts/__init__.py` β€” FOUND +- βœ… `src/phaze/scripts/download_models.py` β€” FOUND +- βœ… `src/phaze/tasks/_shared/model_bootstrap.py` β€” FOUND +- βœ… `tests/test_services/test_model_bootstrap.py` β€” FOUND + +**Files modified β€” verified `git log --follow` reachable:** + +- βœ… `scripts/download-models.sh` β€” modified in `6800931` +- βœ… `src/phaze/tasks/agent_worker.py` β€” modified in `4ccd283` +- βœ… `src/phaze/agent_watcher/__main__.py` β€” modified in `4ccd283` +- βœ… `tests/test_task_split.py` β€” modified in `4ccd283` +- βœ… `tests/test_phase04_gaps.py` β€” modified in `4ccd283` + +**Commits β€” verified `git log --all` reachable:** + +- βœ… `6800931` β€” Task 1 +- βœ… `4ccd283` β€” Task 2 diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-06-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-06-PLAN.md new file mode 100644 index 0000000..0a70165 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-06-PLAN.md @@ -0,0 +1,465 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 06 +type: execute +wave: 3 +depends_on: [29-01, 29-05] +files_modified: + - src/phaze/tasks/heartbeat.py + - src/phaze/tasks/agent_worker.py + - tests/test_tasks/test_heartbeat_cron.py + - tests/test_tasks/test_heartbeat_failure.py +autonomous: true +requirements: [OPS-04] +tags: [phase-29, ops-04, heartbeat, saq-cron, v4.0] + +must_haves: + truths: + - "A SAQ CronJob registers `heartbeat_tick` to fire every 30s using the trailing-seconds cron form `* * * * * */30` (RESEARCH Critical Discovery #2)" + - "`heartbeat_tick(ctx)` reads `ctx[\"api_client\"]`, `ctx[\"agent_identity\"]`, `ctx[\"worker\"].queue` (NOT `ctx[\"queue\"]` per Pitfall 8)" + - "Payload constructs `HeartbeatRequest(agent_version=importlib.metadata.version(\"phaze\"), worker_pid=os.getpid(), queue_depth=)` (D-10)" + - "On `AgentApiError` (any subclass), logs WARNING and returns (no re-raise; fire-and-forget per D-09 + Phase 28 D-16)" + - "On `queue.info()` exception, logs WARNING and defaults `queue_depth=0` (defensive) (D-10)" + - "On missing ctx keys (`api_client` or `agent_identity` is None), logs WARNING and returns gracefully (test-friendly)" + - "agent_worker.py settings dict adds `cron_jobs=[CronJob(heartbeat_tick, cron=\"* * * * * */30\", unique=True, timeout=10)]` and adds `heartbeat_tick` to the `functions` list (D-08)" + - "agent_worker.py is NOT converted to a package (Pitfall 9 β€” stays a single .py file); SAQ worker process owns the heartbeat β€” watcher does NOT emit (D-07)" + - "tests cover: success path; ctx-missing path; queue.info-failure path; AgentApiError WARNING-and-continue path" + artifacts: + - path: "src/phaze/tasks/heartbeat.py" + provides: "heartbeat_tick(ctx) β€” async SAQ cron handler; fire-and-forget POST to /api/internal/agent/heartbeat" + min_lines: 50 + exports: ["heartbeat_tick"] + - path: "src/phaze/tasks/agent_worker.py" + provides: "cron_jobs entry + heartbeat_tick added to functions list" + contains: "CronJob" + - path: "tests/test_tasks/test_heartbeat_cron.py" + provides: "4 happy-path tests: success call, ctx-missing skip, queue.info-fail defaults to 0, agent_version sourced from importlib.metadata" + min_lines: 80 + - path: "tests/test_tasks/test_heartbeat_failure.py" + provides: "1 failure test: AgentApiServerError logs WARNING, no exception escapes" + min_lines: 30 + key_links: + - from: "src/phaze/tasks/agent_worker.py::settings.cron_jobs" + to: "src/phaze/tasks/heartbeat.py::heartbeat_tick" + via: "CronJob(heartbeat_tick, cron=\"* * * * * */30\", ...)" + pattern: "CronJob\\(heartbeat_tick" + - from: "src/phaze/tasks/agent_worker.py::settings.functions" + to: "src/phaze/tasks/heartbeat.py::heartbeat_tick" + via: "heartbeat_tick included in the `functions` list so SAQ can dispatch the cron-enqueued job" + pattern: "heartbeat_tick," + - from: "src/phaze/tasks/heartbeat.py::heartbeat_tick" + to: "src/phaze/services/agent_client.py::PhazeAgentClient.heartbeat" + via: "await ctx[\"api_client\"].heartbeat(payload)" + pattern: "client.heartbeat" + - from: "src/phaze/tasks/heartbeat.py::heartbeat_tick" + to: "phaze.schemas.agent_heartbeat.HeartbeatRequest" + via: "import + construct (extra=\"forbid\")" + pattern: "HeartbeatRequest" +--- + + +Land the heartbeat caller half of OPS-04 (D-07..D-10). Create a new `src/phaze/tasks/heartbeat.py` module hosting `heartbeat_tick(ctx)` β€” an async SAQ cron handler that POSTs `HeartbeatRequest(agent_version, worker_pid, queue_depth)` to `/api/internal/agent/heartbeat` via the existing `ctx["api_client"].heartbeat()` method. Register it as a CronJob in `agent_worker.settings.cron_jobs` using the **trailing-seconds 6-field cron form** `"* * * * * */30"` (RESEARCH Critical Discovery #2 β€” CONTEXT.md D-08 example uses the WRONG leading-seconds form which would fire every second). Add the function to `agent_worker.settings.functions` too so SAQ can dispatch it. + +Purpose: OPS-04 success criterion #6 mandates "Each agent posts a heartbeat to `/api/internal/agent/heartbeat` every 30 seconds; the application server updates `agents.last_seen_at` and exposes an 'Agents' admin page listing each agent's status, queue depth, last seen, and revoked state." The endpoint already exists from Phase 25; this plan adds the caller. The UI half lands in Plan 07. + +Output: New `phaze/tasks/heartbeat.py`; modified `agent_worker.py` (settings dict adds cron + functions entry); two new test files (`test_heartbeat_cron.py`, `test_heartbeat_failure.py`). + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-01-SUMMARY.md +@.planning/phases/29-deployment-hardening-agents-admin/29-05-SUMMARY.md + + + + +```python +async def heartbeat(self, payload: HeartbeatRequest) -> None: ... +``` + + + +```python +class HeartbeatRequest(BaseModel): + model_config = ConfigDict(extra="forbid") + agent_version: str + worker_pid: int + queue_depth: int +``` + + + + + + + + + + +```python +cron_jobs = [ + CronJob(refresh_tracklists, cron="0 3 1 * *"), +], +``` + + + + + + + + + + +```python +class QueueInfo(TypedDict): + queued: int # pending count β€” this is the heartbeat queue_depth + active: int + scheduled: int + name: str + workers: dict + jobs: list +``` + + + +```python +@dataclasses.dataclass +class CronJob(t.Generic[CtxType]): + function: Function[CtxType] + cron: str # croniter 6-field with TRAILING seconds β€” RESEARCH Critical Discovery #2 + unique: bool = True + timeout: int | None = None + ... +``` + + + + + + + + + Task 1: Write tests/test_heartbeat_cron.py + tests/test_heartbeat_failure.py (RED step) + tests/test_tasks/test_heartbeat_cron.py, tests/test_tasks/test_heartbeat_failure.py + + - `tests/test_tasks/test_heartbeat_cron.py` contains AT LEAST 4 test functions covering: success path, ctx-missing skip, queue.info-failure default, importlib.metadata.version source. + - `tests/test_tasks/test_heartbeat_failure.py` contains 1 test function covering AgentApiServerError WARNING-and-continue. + - All tests are collectable via `uv run pytest --collect-only tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py`. + - All tests FAIL initially (since `phaze.tasks.heartbeat` does not exist yet) β€” this is the RED step. + - Tests use `AsyncMock` for `client.heartbeat` and `worker.queue.info`. + - Tests use `caplog` to assert WARNING / DEBUG / INFO log lines. + - Tests use `unittest.mock.patch` for `os.getpid` to make `worker_pid` deterministic. + - `AgentApiServerError` is constructed **positional-only** (e.g., `AgentApiServerError("server error")`); the class has no custom `__init__` and accepts no `status_code=` kwarg (verified at `src/phaze/services/agent_client.py:86-87`). Any `status_code=` kwarg would `TypeError` at test setup time. + - `uv run ruff check tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py` clean. + + + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 5: SAQ cron entry for 30s heartbeat" lines 526-616 (literal target for heartbeat_tick body) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Critical Discoveries #2" lines 121-129 (trailing-seconds cron is `"* * * * * */30"`, NOT `"*/30 * * * * *"`) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Critical Discoveries #3" lines 131-137 (agent_worker stays a single .py file; heartbeat_tick lives in a sibling phaze/tasks/heartbeat.py) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pitfall 8: SAQ cron handler reading wrong ctx shape" lines 1004-1008 (`ctx["worker"].queue`, NOT `ctx["queue"]`) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/tasks/heartbeat.py" lines 341-399 (literal target including imports + body) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_tasks/test_heartbeat_cron.py + test_heartbeat_failure.py" lines 1063-1074 (4 LOCKED success-path test cases + 1 failure case) + - .planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md Β§D-07..D-10 (full heartbeat caller spec) + - tests/test_tasks/test_execute_approved_batch_progress.py (analog: SAQ ctx + AsyncMock pattern for PhazeAgentClient methods) + - src/phaze/schemas/agent_heartbeat.py (HeartbeatRequest definition β€” verify field names) + - src/phaze/services/agent_client.py lines 74-87 (verify AgentApiError + subclasses have NO custom __init__ β€” positional args only) + + +Write `tests/test_tasks/test_heartbeat_cron.py` with 4 happy-path tests. Imports: + +```python +from __future__ import annotations + +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from phaze.schemas.agent_heartbeat import HeartbeatRequest +from phaze.schemas.agent_identity import AgentIdentity +from phaze.tasks.heartbeat import heartbeat_tick +``` + +Fixture `make_ctx` (module-level pytest fixture or plain helper function) that returns a `dict[str, Any]` populated with: + +```python +def make_ctx(*, queued: int = 5, raise_info: bool = False) -> dict[str, Any]: + client = AsyncMock() + identity = AgentIdentity( + agent_id="test-agent", + name="Test", + scan_roots=["/data"], + created_at=datetime(2026, 1, 1, tzinfo=UTC), + ) + worker = MagicMock() + queue = AsyncMock() + if raise_info: + queue.info = AsyncMock(side_effect=RuntimeError("redis down")) + else: + queue.info = AsyncMock(return_value={"queued": queued, "active": 0, "scheduled": 0, "name": "phaze-agent-test-agent", "workers": {}, "jobs": []}) + worker.queue = queue + return {"api_client": client, "agent_identity": identity, "worker": worker, "job": MagicMock()} +``` + +Test 1 β€” `test_heartbeat_success(caplog)`: ctx = make_ctx(queued=7); patch `os.getpid` to return 12345; patch `importlib.metadata.version` to return `"0.1.0"`. `await heartbeat_tick(ctx)`. Assert `ctx["api_client"].heartbeat.await_count == 1`; `call_args = ctx["api_client"].heartbeat.await_args; payload = call_args.args[0]` (or `call_args.kwargs["payload"]` if it's kwarg-passed; signature is `heartbeat(payload)`). Assert `isinstance(payload, HeartbeatRequest)`, `payload.agent_version == "0.1.0"`, `payload.worker_pid == 12345`, `payload.queue_depth == 7`. + +Test 2 β€” `test_heartbeat_skips_when_ctx_missing(caplog)`: ctx = `{"worker": MagicMock(), "job": MagicMock()}` (no `api_client`, no `agent_identity`). With `caplog.at_level("WARNING")`, `await heartbeat_tick(ctx)`. Assert NO exception raised; assert `"heartbeat_tick: ctx not initialized"` substring is in caplog.text. The ctx test mirrors the implementation's defensive guard. + +Test 3 β€” `test_heartbeat_queue_info_failure_defaults_to_zero(caplog)`: ctx = make_ctx(raise_info=True). With `caplog.at_level("WARNING")`, `await heartbeat_tick(ctx)`. Assert `ctx["api_client"].heartbeat.await_count == 1`; assert the payload's `queue_depth == 0`; assert caplog.text contains `"queue.info() failed"`. The heartbeat still goes out even when queue depth is unknown. + +Test 4 β€” `test_heartbeat_agent_version_from_importlib(caplog)`: ctx = make_ctx(); do NOT patch `importlib.metadata.version`; `await heartbeat_tick(ctx)`. Read the payload `payload.agent_version`; assert it matches the real `importlib.metadata.version("phaze")` value (`"0.1.0"` per pyproject.toml). + +Write `tests/test_tasks/test_heartbeat_failure.py`: + +Test 1 β€” `test_heartbeat_agentapierror_warning(caplog)`: + +```python +from phaze.services.agent_client import AgentApiServerError + +ctx = make_ctx() +# AgentApiServerError has no custom __init__ β€” POSITIONAL ARGS ONLY. +# Verified at src/phaze/services/agent_client.py:86-87. Do NOT pass status_code= or any kwarg. +ctx["api_client"].heartbeat = AsyncMock(side_effect=AgentApiServerError("server error")) + +with caplog.at_level("WARNING"): + # Must not raise + await heartbeat_tick(ctx) + +assert "heartbeat failed" in caplog.text +assert any(r.levelname == "WARNING" for r in caplog.records) +``` + +Make `make_ctx` importable from both test files via a shared `tests/test_tasks/_heartbeat_fixtures.py` module OR a `conftest.py`-level fixture. Simpler: define the helper inline in each test file (two copies; ~15 lines each). Choose the simpler path. + +Wrap each test with `@pytest.mark.asyncio` (pytest-asyncio is in `asyncio_mode = "auto"` per pyproject.toml line 38 β€” so `async def test_X(...)` works without the decorator. Verify by reading pyproject.toml; auto mode means no decorator needed). + +Run `uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py --collect-only -q` β€” tests are discovered. Running them now FAILS with `ModuleNotFoundError: No module named 'phaze.tasks.heartbeat'` (RED step). That's expected. + + + uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py --collect-only -q + + +- 4 tests in `test_heartbeat_cron.py` + 1 test in `test_heartbeat_failure.py` collected by pytest +- Tests FAIL with `ModuleNotFoundError: No module named 'phaze.tasks.heartbeat'` (expected RED) +- `AgentApiServerError` is constructed positional-only in the failure test (no `status_code=` kwarg) +- `uv run ruff check tests/test_tasks/test_heartbeat*.py` clean + + + + + Task 2: Create phaze/tasks/heartbeat.py + register cron in agent_worker.settings (GREEN step) + src/phaze/tasks/heartbeat.py, src/phaze/tasks/agent_worker.py + + - `src/phaze/tasks/heartbeat.py` exists with `heartbeat_tick(ctx: dict[str, Any]) -> None` as the sole public export. + - Module imports ONLY: stdlib (`logging`, `os`, `importlib.metadata`, `typing.Any`) + `phaze.schemas.agent_heartbeat.HeartbeatRequest` + `phaze.services.agent_client.AgentApiError`. + - Body matches RESEARCH Pattern 5 lines 553-580 + PATTERNS lines 371-397 byte-for-byte (modulo cosmetic whitespace). + - `src/phaze/tasks/agent_worker.py` imports `CronJob` from saq (currently only `Queue` is imported at line 48; add CronJob to the same import). + - `src/phaze/tasks/agent_worker.py` imports `heartbeat_tick` from `phaze.tasks.heartbeat`. + - `src/phaze/tasks/agent_worker.py::settings` dict has `cron_jobs=[CronJob(heartbeat_tick, cron="* * * * * */30", unique=True, timeout=10)]` (TRAILING-seconds form per RESEARCH Critical Discovery #2 β€” NOT leading-seconds `*/30 * * * * *`). + - `src/phaze/tasks/agent_worker.py::settings["functions"]` list includes `heartbeat_tick` (so SAQ can dispatch the cron-enqueued job). + - `agent_worker.py` is STILL a single `.py` file (not converted to a package; Pitfall 9). + - All 5 tests from Task 1 PASS after this task (GREEN step). + - `uv run mypy src/phaze/tasks/heartbeat.py src/phaze/tasks/agent_worker.py` clean. + - `tests/test_task_split.py` still passes (heartbeat.py import doesn't drag in phaze.database). + - Additional smoke test: `uv run python -c "from croniter import croniter; assert list(croniter('* * * * * */30', start_time=0))[:3] == [30.0, 60.0, 90.0], 'cron string fires wrong cadence'"` exits 0. + + + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 5: SAQ cron entry for 30s heartbeat" lines 526-616 (literal target) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/tasks/heartbeat.py" lines 341-399 (analog: SAQ task handlers; full target body) + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/tasks/agent_worker.py" lines 623-687 (concrete diff: add CronJob import + heartbeat_tick to functions + cron_jobs entry) + - src/phaze/tasks/agent_worker.py (full current file β€” lines 48 SAQ imports, lines 179-192 settings dict) + - src/phaze/tasks/controller.py (existing cron_jobs precedent for syntax) + - src/phaze/schemas/agent_heartbeat.py (HeartbeatRequest field names + extra="forbid") + - src/phaze/services/agent_client.py (AgentApiError base class + subclasses) + + +Write `src/phaze/tasks/heartbeat.py` per PATTERNS lines 341-397 + RESEARCH lines 526-580. Module docstring: + +```python +"""30-second cron handler that POSTs an agent heartbeat (Phase 29 D-07..D-10). + +Reads from SAQ ctx (populated by phaze.tasks.agent_worker.startup): + - ctx["api_client"]: PhazeAgentClient + - ctx["agent_identity"]: AgentIdentity + - ctx["worker"]: SAQ Worker (gives .queue for Queue.info()) + +Failure policy (D-09): catch AgentApiError, log WARNING, return. SAQ retries +on next tick. Mirrors Phase 28 D-16 fire-and-forget posture. + +Cron schedule (D-08 + RESEARCH Critical Discovery #2): + "* * * * * */30" -- trailing-seconds 6-field form; croniter 6.x default. + NOT "*/30 * * * * *" (leading-seconds form fires every second). +""" +from __future__ import annotations + +import importlib.metadata +import logging +import os +from typing import Any + +from phaze.schemas.agent_heartbeat import HeartbeatRequest +from phaze.services.agent_client import AgentApiError + +logger = logging.getLogger(__name__) + + +async def heartbeat_tick(ctx: dict[str, Any]) -> None: + """SAQ cron handler. ctx is the worker context dict from startup hook.""" + client = ctx.get("api_client") + identity = ctx.get("agent_identity") + if client is None or identity is None: + logger.warning("heartbeat_tick: ctx not initialized; skipping") + return + + # Queue depth from SAQ Queue.info()["queued"] β€” via ctx["worker"].queue (Pitfall 8). + queue = ctx["worker"].queue + try: + info = await queue.info() + queue_depth = int(info.get("queued", 0)) + except Exception: + logger.warning("heartbeat_tick: queue.info() failed; defaulting to 0", exc_info=True) + queue_depth = 0 + + payload = HeartbeatRequest( + agent_version=importlib.metadata.version("phaze"), + worker_pid=os.getpid(), + queue_depth=queue_depth, + ) + try: + await client.heartbeat(payload) + logger.debug("heartbeat sent agent=%s queue_depth=%d", identity.agent_id, queue_depth) + except AgentApiError as exc: + logger.warning("heartbeat failed: %s", exc) +``` + +Notes: +- The `except Exception:` around `queue.info()` is intentionally broad β€” any SAQ internal change or Redis blip should not crash the heartbeat; default to 0 and report. Ruff may flag this with `BLE001` (blind-except); use `# noqa: BLE001` if needed (ruff config in pyproject.toml has `B` enabled which includes BLE; check). +- The `# noqa: BLE001` is acceptable here per Pitfall-8 defensive design intent. Document inline if added. +- `AgentApiError` is the BASE class; this catches all three subclasses (`AgentApiAuthError`, `AgentApiClientError`, `AgentApiServerError`) per D-09 ("any subclass"). + +Modify `src/phaze/tasks/agent_worker.py`: + +1. Line 48: `from saq import Queue` β†’ `from saq import CronJob, Queue`. + +2. After the existing `phaze.tasks.scan` import (line 63 area), add: `from phaze.tasks.heartbeat import heartbeat_tick`. + +3. Lines 181-188 (settings dict `functions` list): add `heartbeat_tick` as the last item: + +```python + "functions": [ + process_file, + extract_file_metadata, + fingerprint_file, + scan_live_set, + scan_directory, + execute_approved_batch, + heartbeat_tick, # Phase 29 D-08 + ], +``` + +4. After the `functions` list (between `"functions": [...]` and `"concurrency":`), insert the `cron_jobs` entry. The full updated settings dict: + +```python +settings = { + "queue": queue, + "functions": [ + process_file, + extract_file_metadata, + fingerprint_file, + scan_live_set, + scan_directory, + execute_approved_batch, + heartbeat_tick, + ], + "cron_jobs": [ + CronJob(heartbeat_tick, cron="* * * * * */30", unique=True, timeout=10), # type: ignore[type-var] + ], + "concurrency": get_settings().worker_max_jobs, + "startup": startup, + "shutdown": shutdown, +} +``` + +The `# type: ignore[type-var]` mirrors the controller.py precedent at line 117 (SAQ's CronJob is a Generic that mypy can't infer cleanly from the function reference). + +DO NOT convert agent_worker.py to a package (Pitfall 9). Keep it a single `.py` file. The dotted name `phaze.tasks.agent_worker.settings` works because `settings` is a module-level dict attribute, not a package. + +Run `uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py -x -q` β€” all 5 tests should now pass (GREEN). + +Sanity-check the cron string empirically: +```bash +uv run python -c "from croniter import croniter; assert list(croniter('* * * * * */30', start_time=0))[:3] == [30.0, 60.0, 90.0], 'wrong cadence'; print('cron ok')" +``` +This should print `cron ok`. If it prints any non-30/60/90 sequence (e.g., `[1, 2, 3]`), the cron string is wrong β€” fix to trailing-seconds form per RESEARCH Critical Discovery #2. + + + uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py tests/test_task_split.py -x -q && uv run python -c "from croniter import croniter; assert list(croniter('* * * * * */30', start_time=0))[:3] == [30.0, 60.0, 90.0]" + + +- `src/phaze/tasks/heartbeat.py` exists with `heartbeat_tick(ctx)` matching the locked behavior +- `src/phaze/tasks/agent_worker.py::settings` has `cron_jobs=[CronJob(heartbeat_tick, cron="* * * * * */30", unique=True, timeout=10)]` +- `agent_worker.py` is STILL a single .py file (NOT a package) +- All 5 heartbeat tests pass (GREEN) +- `tests/test_task_split.py` still passes (heartbeat.py doesn't leak phaze.database) +- `uv run mypy src/phaze/tasks/heartbeat.py src/phaze/tasks/agent_worker.py` clean +- `uv run ruff check src/phaze/tasks/` clean +- The croniter sanity assertion (`* * * * * */30` β†’ [30, 60, 90]) passes + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| agent worker process β†’ app-server HTTPS endpoint | every 30s heartbeat POST crosses this | +| Redis (queue.info) β†’ agent worker process | local read of queue state | +| importlib.metadata β†’ agent worker | reads `phaze` package version | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-06-01 | Spoofing | rogue agent sends heartbeat impersonating another agent | mitigate | Bearer-token auth at `/api/internal/agent/heartbeat` (Phase 25 unchanged); `agent_id` resolved from token-hash lookup, NEVER from request body; HeartbeatRequest `extra="forbid"` rejects any agent_id field smuggled in | +| T-29-06-02 | DoS | agent floods heartbeat endpoint via misconfigured cron | mitigate | RESEARCH Critical Discovery #2: trailing-seconds form `"* * * * * */30"` fires at 30s cadence, NOT 1s. Smoke test enforces 30/60/90 timestamps. SAQ `unique=True` prevents duplicate enqueues within the 30s window | +| T-29-06-03 | Information Disclosure | heartbeat failure logs include bearer token | mitigate | `logger.warning("heartbeat failed: %s", exc)` interpolates the AgentApiError instance's __str__ β€” exception class does NOT include the bearer token in its message (Phase 26 D-13 invariant). Tests can additionally assert `"phaze_agent_"` is NOT in caplog.text | +| T-29-06-04 | Tampering | malicious queue.info() return value crashes cron handler | mitigate | broad `except Exception` defaults to `queue_depth=0`; the cron continues firing | +| T-29-06-05 | DoS | heartbeat blocks SAQ event loop | mitigate | `await client.heartbeat(payload)` uses httpx async; tenacity retry funnel (Phase 26 D-11) bounds wall-time to ~4s; SAQ's CronJob `timeout=10` upper-bounds the handler. The 30s cadence > timeout means the next tick fires cleanly | +| T-29-06-06 | Operational | heartbeat silently fails forever (network partition) | accept (D-09 fire-and-forget) | The app-server's last_seen_at stops advancing β†’ admin page surfaces "stale" / "dead" β†’ operator notices. Mirrors Phase 28 D-16 posture | +| T-29-06-07 | Operational | agent_worker.py refactored to package, breaks `from phaze.tasks.agent_worker import ...` everywhere | mitigate | Pitfall 9 documents the trap; this plan adds cron_jobs to the existing settings dict IN-PLACE and puts heartbeat_tick in a sibling file `phaze/tasks/heartbeat.py`. agent_worker.py stays a single .py | + + + +- `uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py -x -q` β€” all 5 pass +- `uv run pytest tests/test_task_split.py -x -q` β€” agent_worker import boundary still holds +- `uv run pytest tests/ -x -q` β€” no regression in the broader suite (agent_worker startup tests should still pass; if they don't, the test needed updating β€” see Plan 05's note about mocking ensure_models_present) +- croniter empirical assertion: `[30.0, 60.0, 90.0]` for first 3 fire times β€” passes +- `uv run mypy src/phaze/tasks/heartbeat.py src/phaze/tasks/agent_worker.py` β€” clean +- `uv run ruff check .` β€” clean + + + +- OPS-04 caller half closed (the UI half is Plan 07) +- D-07, D-08, D-09, D-10 implemented with the corrected trailing-seconds cron form +- 5 new tests (4 happy-path + 1 failure) +- `agent_worker.py` remains a single .py file (Pitfall 9 avoided) + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-06-SUMMARY.md` when both tasks complete. Summary must list: new file `src/phaze/tasks/heartbeat.py`, modifications to `agent_worker.py` settings dict (the 3 edits: import CronJob, import heartbeat_tick, add cron_jobs + functions entry), decision IDs implemented (D-07, D-08, D-09, D-10), and a one-line note that the trailing-seconds cron form was used (NOT the leading-seconds form from CONTEXT D-08 example) per RESEARCH Critical Discovery #2. + + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-06-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-06-SUMMARY.md new file mode 100644 index 0000000..666cfe7 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-06-SUMMARY.md @@ -0,0 +1,195 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 06 +subsystem: agent-worker +tags: [phase-29, ops-04, heartbeat, saq-cron, agent-worker, v4.0] + +# Dependency graph +requires: + - phase: 26-task-code-reorg-http-backed-agent-worker + provides: phaze.tasks.agent_worker.settings hook + PhazeAgentClient.heartbeat method (Phase 25, refactored Phase 26) + - phase: 25-internal-agent-http-api-bearer-auth + provides: POST /api/internal/agent/heartbeat endpoint + HeartbeatRequest schema (extra="forbid") + - phase: 29-deployment-hardening-agents-admin/29-01 + provides: 29-01 phase scaffolding + frontmatter conventions + - phase: 29-deployment-hardening-agents-admin/29-05 + provides: agent_worker.startup ctx shape (api_client, agent_identity already populated by Plan 29-05's startup wiring) +provides: + - "phaze.tasks.heartbeat.heartbeat_tick β€” async SAQ cron handler emitting 30s heartbeats fire-and-forget" + - "agent_worker.settings.cron_jobs[0] β€” registered CronJob(heartbeat_tick, cron='* * * * * */30', unique=True, timeout=10) on the agent-role SAQ Worker" + - "tests/test_tasks/test_heartbeat_cron.py β€” 4 happy-path tests (success, ctx-missing, queue.info-fail, importlib metadata source)" + - "tests/test_tasks/test_heartbeat_failure.py β€” 1 failure test (AgentApiServerError -> WARNING + swallow)" +affects: [29-07 admin-agents-page (consumes last_seen_at populated by these heartbeats)] + +# Tech tracking +tech-stack: + added: [] # zero new pip dependencies; saq.CronJob already in deps via Phase 26 controller.py + patterns: + - "SAQ CronJob with trailing-seconds 6-field cron form (`* * * * * */30`) for sub-minute cadence β€” croniter 6.x default convention" + - "Defensive ctx-key guarding in SAQ cron handlers: missing api_client/agent_identity -> WARNING + return (no exception escapes during worker startup races)" + - "Fire-and-forget HTTP POST posture inside async SAQ jobs: catch domain-specific exception base class, log WARNING, swallow; SAQ retries via next cron tick (mirrors Phase 28 D-16)" + +key-files: + created: + - src/phaze/tasks/heartbeat.py + - tests/test_tasks/test_heartbeat_cron.py + - tests/test_tasks/test_heartbeat_failure.py + modified: + - src/phaze/tasks/agent_worker.py (added CronJob + heartbeat_tick imports; added heartbeat_tick to settings.functions; added settings.cron_jobs entry) + +key-decisions: + - "Trailing-seconds 6-field cron form `* * * * * */30` (NOT the leading-seconds form `*/30 * * * * *` shown in CONTEXT.md D-08) β€” empirically verified: trailing produces 30s gaps, leading produces 1s gaps (croniter 6.x default convention places seconds as field 6)" + - "agent_worker.py stays a single .py file (Pitfall 9 avoided) β€” settings dict mutation in place; heartbeat_tick lives in sibling phaze/tasks/heartbeat.py" + - "Defensive queue.info() failure handling: any exception -> queue_depth=0 + WARNING log + still POST (heartbeat is more valuable than queue-depth accuracy in the failure mode)" + - "AgentApiServerError tests construct positional-only per src/phaze/services/agent_client.py:86-87 β€” no custom __init__, no status_code= kwarg" + - "BLE001 ruff rule is NOT enabled in this project's config; the `# noqa: BLE001` directive from PATTERNS.md was unused β€” removed and replaced with an inline comment documenting the broad-except intent" + - "ctx['worker'].queue is the correct access path (NOT ctx['queue']) per RESEARCH Pitfall 8 β€” SAQ pre-populates `self.context = {'worker': self}` in `Worker.__init__`" + +requirements-completed: [OPS-04 (caller half β€” UI half lands in Plan 29-07)] + +# Metrics +duration: ~15min +completed: 2026-05-16 +--- + +# Phase 29 Plan 06: Agent Heartbeat Caller (OPS-04 Caller Half) Summary + +**Lands the agent-side SAQ cron handler that POSTs a heartbeat every 30 seconds to `/api/internal/agent/heartbeat`, populating `agents.last_seen_at` and `last_status` for the admin page (Plan 07). Uses the trailing-seconds 6-field cron form (`* * * * * */30`) β€” NOT the leading-seconds example from CONTEXT.md D-08 β€” per RESEARCH Critical Discovery #2.** + +## Performance + +- **Duration:** ~15 min (estimate) +- **Started:** 2026-05-16T~16:14Z +- **Completed:** 2026-05-16T~16:30Z +- **Tasks:** 2 (both auto, both TDD) +- **Files created:** 3 (heartbeat.py, test_heartbeat_cron.py, test_heartbeat_failure.py) +- **Files modified:** 1 (agent_worker.py β€” 3 edits: CronJob import, heartbeat_tick import, settings dict cron_jobs + functions entry) +- **Tests added:** 5 (4 happy-path + 1 failure) +- **Lines added:** ~140 source + ~165 tests = ~305 total + +## Accomplishments + +- **OPS-04 caller half closed.** Each agent's SAQ worker now POSTs `HeartbeatRequest(agent_version, worker_pid, queue_depth)` to `/api/internal/agent/heartbeat` every 30 seconds. The app-server endpoint (Phase 25) updates `agents.last_seen_at` and `last_status` JSONB; the admin page (Plan 07) will read both columns for the alive/stale/dead pill computation. +- **Trailing-seconds 6-field cron form locked in.** The PLAN explicitly called out the CONTEXT.md D-08 example bug (`*/30 * * * * *` fires every second under croniter 6.x default config). The correct form `* * * * * */30` was verified empirically with `croniter(...)` returning gaps of 30.0 seconds vs 1.0 seconds for the wrong form. The smoke command from the PLAN (`croniter('* * * * * */30', start_time=0)`) hangs in an infinite loop on this version of croniter β€” but the cron string itself is verified-correct via datetime-baseline invocation. +- **D-07 routing preserved.** Only the agent_worker SAQ process emits heartbeats; the watcher does NOT (Pattern 5 + Phase 29 D-07 β€” if the worker is down but the watcher is up, the agent looks "stale" in the admin UI, which is the correct operator signal). +- **D-08 cron registered.** `CronJob(heartbeat_tick, cron="* * * * * */30", unique=True, timeout=10)` lives at `phaze.tasks.agent_worker.settings.cron_jobs[0]`. `heartbeat_tick` is also in `settings.functions` so SAQ can dispatch the cron-enqueued job. +- **D-09 fire-and-forget failure handling.** `try: client.heartbeat(payload) except AgentApiError as exc: logger.warning("heartbeat failed: %s", exc)` β€” any 4xx/5xx/timeout (all subclasses of `AgentApiError`) is logged and swallowed. The SAQ cron fires again 30 seconds later; the operator sees `last_seen_at` stop advancing and the admin UI surfaces "stale" naturally. +- **D-10 payload shape locked.** `agent_version` reads from `importlib.metadata.version("phaze")` (pyproject.toml [project].version); `worker_pid` from `os.getpid()` inside the SAQ Worker subprocess; `queue_depth` from `ctx["worker"].queue.info()["queued"]` β€” typed `int` cast guards against `None` slipping in. +- **Pitfall 8 avoided.** Queue access is `ctx["worker"].queue` (NOT `ctx["queue"]`). SAQ pre-populates `self.context = {"worker": self}` in `Worker.__init__`; only the startup hook adds the keys (api_client, agent_identity). The cron handler sees `{**self.context, "job": job}` per tick, so ctx["queue"] would `KeyError`. +- **Pitfall 9 avoided.** `agent_worker.py` remains a single `.py` file. The PLAN explicitly added cron_jobs to the existing settings dict IN-PLACE (not converted to a package); heartbeat_tick lives in a sibling module `phaze/tasks/heartbeat.py`. + +## Task Commits + +Each task was committed atomically: + +1. **Task 1: Write failing tests for SAQ heartbeat cron handler (RED step)** β€” `48ad8c1` (test) +2. **Task 2: Create phaze/tasks/heartbeat.py + register cron in agent_worker.settings (GREEN step)** β€” `afbf048` (feat) + +## Files Created/Modified + +### Created + +- `src/phaze/tasks/heartbeat.py` β€” 79 lines. Module docstring documents D-07..D-10 contract + IMPORT-BOUNDARY INVARIANT (Postgres-free) + the trailing-seconds-cron rationale. Imports stdlib (`importlib.metadata`, `logging`, `os`, `typing.Any`) + `phaze.schemas.agent_heartbeat.HeartbeatRequest` + `phaze.services.agent_client.AgentApiError`. Body follows RESEARCH Pattern 5 lines 553-580 byte-for-byte (modulo the BLE001 noqa removal documented under Deviations). +- `tests/test_tasks/test_heartbeat_cron.py` β€” 4 async test functions: `test_heartbeat_success`, `test_heartbeat_skips_when_ctx_missing`, `test_heartbeat_queue_info_failure_defaults_to_zero`, `test_heartbeat_agent_version_from_importlib`. Module-level `_make_ctx` helper builds the SAQ ctx shape (api_client AsyncMock, agent_identity AgentIdentity, worker MagicMock with worker.queue AsyncMock returning the SAQ QueueInfo TypedDict). Uses `unittest.mock.patch("phaze.tasks.heartbeat.os.getpid", return_value=12345)` + `patch("phaze.tasks.heartbeat.importlib.metadata.version", ...)` for deterministic `worker_pid` / `agent_version` in the success test. +- `tests/test_tasks/test_heartbeat_failure.py` β€” 1 async test `test_heartbeat_agentapierror_warning`. Constructs `AgentApiServerError("server error")` positional-only per verified `agent_client.py:86-87` (no custom `__init__`). Asserts `caplog.text` contains `"heartbeat failed"` and at least one WARNING-level record. + +### Modified + +- `src/phaze/tasks/agent_worker.py` β€” 3 edits: + 1. `from saq import Queue` β†’ `from saq import CronJob, Queue` (line 48). + 2. Added `from phaze.tasks.heartbeat import heartbeat_tick` after the existing `phaze.tasks.functions` import (alphabetical placement per ruff isort config). + 3. Added `heartbeat_tick` to `settings["functions"]` list (end position) AND added `"cron_jobs": [CronJob(heartbeat_tick, cron="* * * * * */30", unique=True, timeout=10)], # type: ignore[type-var]` (mirroring the `controller.py:117` precedent shape). The `# type: ignore[type-var]` is necessary because SAQ's `CronJob` is a `Generic[CtxType]` that mypy cannot infer from a function reference; this matches the existing controller.py pattern. + +## Decisions Made + +- **Trailing-seconds 6-field cron form.** The CONTEXT.md D-08 example shows `*/30 * * * * *` (leading seconds, would fire every second under croniter 6.x default). RESEARCH Critical Discovery #2 verified the correct form is `* * * * * */30` (trailing seconds, 30s cadence). Implemented the corrected form; reverify command in the PLAN (`croniter('* * * * * */30', start_time=0)`) hangs in an infinite loop in croniter 6.2.2 β€” but a datetime-baseline reverify (`croniter('* * * * * */30', datetime(2026, 1, 1))`) confirms 30s gaps. +- **Defensive `except Exception` around `queue.info()`.** Broad β€” but documented inline as intentional. Any SAQ-internal change, Redis blip, or `None` return must NOT crash the cron handler; default to `queue_depth=0` and still POST the heartbeat. The heartbeat presence is more valuable than queue-depth accuracy when the queue is unreliable. +- **`AgentApiError` (base class) catch β€” NOT bare `Exception`.** D-09 specifies "any subclass" of AgentApiError; bare `Exception` would swallow programming errors. The narrow base-class catch lets `TypeError`, `ValueError`, etc. bubble up where they belong. +- **`# type: ignore[type-var]` on the CronJob entry.** Matches the existing `controller.py:117` pattern. SAQ's `CronJob` is `Generic[CtxType]` and mypy cannot infer the type-var from a function reference; the only alternatives are extensive explicit annotations or upstream SAQ changes β€” neither in scope. +- **AgentApiServerError tests positional-only.** Verified at `src/phaze/services/agent_client.py:86-87` β€” `AgentApiServerError` (and all `AgentApiError` subclasses) have no custom `__init__`. They accept positional args ONLY. Passing `status_code=` would `TypeError` at test setup time. The failure-mode test constructs `AgentApiServerError("server error")` positional-only. + +## Deviations from Plan + +### Auto-fixed Issues + +**1. [Rule 3 - Blocking] Ruff `TC002` warning on `import pytest` in new test files** + +- **Found during:** Task 1 verification (ruff check after writing test files). +- **Issue:** Both `test_heartbeat_cron.py` and `test_heartbeat_failure.py` had `import pytest` at module top. The `pytest` symbol is used ONLY as a type annotation (`pytest.LogCaptureFixture`). Ruff's `TC002` rule (move third-party type-only imports into a `TYPE_CHECKING` block) flagged both. +- **Fix:** Moved `import pytest` into an `if TYPE_CHECKING:` block at module bottom, added `TYPE_CHECKING` to the existing `from typing import ...` import. +- **Files modified:** tests/test_tasks/test_heartbeat_cron.py, tests/test_tasks/test_heartbeat_failure.py +- **Verification:** `uv run ruff check tests/test_tasks/test_heartbeat_*.py` β†’ All checks passed. +- **Committed in:** `48ad8c1` (part of Task 1 RED commit β€” fix applied before initial commit). + +**2. [Rule 3 - Blocking] Unused `# noqa: BLE001` directive in heartbeat.py** + +- **Found during:** Task 2 verification (ruff check after writing heartbeat.py). +- **Issue:** PATTERNS.md line 348 + the PLAN's Notes section suggested adding `# noqa: BLE001` to the `except Exception:` around `queue.info()`. However, this project's `pyproject.toml` enables ruff rule sets `ARG, B, C4, E, F, I, PLC, PTH, RUF, S, SIM, T20, TCH, UP, W, W191` β€” `BLE` is NOT enabled (only `B`, which doesn't include `BLE001`). Ruff flagged the noqa as targeting an unused rule. +- **Fix:** Removed `# noqa: BLE001` from the `except Exception:` line; replaced the inline rationale with a two-line comment block above the catch documenting the broad-except intent. +- **Files modified:** src/phaze/tasks/heartbeat.py +- **Verification:** `uv run ruff check src/phaze/tasks/heartbeat.py` β†’ All checks passed. +- **Committed in:** `afbf048` (part of Task 2 GREEN commit β€” fix applied before initial commit). + +### Notable Deferrals (NOT auto-fixed β€” out of scope) + +**3. `tests/test_phase04_gaps.py::test_docker_compose_has_agent_worker_consuming_agent_queue`** β€” This pre-existing failure was already documented in `.planning/phases/29-deployment-hardening-agents-admin/deferred-items.md` by Plan 29-05. Plan 29-03 removed the agent-worker block from root `docker-compose.yml` (app-server-only invariant); Plan 29-04 (parallel wave) created `docker-compose.agent.yml` where the agent-worker now lives. The test must be updated to scan both compose files by Plan 29-04 (or a follow-on plan). Out of scope for Plan 06; no new failure introduced by this plan. + +## Verification + +### Task 1 acceptance (RED) + +- βœ… `tests/test_tasks/test_heartbeat_cron.py` contains 4 test functions +- βœ… `tests/test_tasks/test_heartbeat_failure.py` contains 1 test function +- βœ… `uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py --collect-only -q` β†’ tests discovered, collection fails with `ModuleNotFoundError: No module named 'phaze.tasks.heartbeat'` (expected RED) +- βœ… `AgentApiServerError` constructed positional-only (no `status_code=` kwarg) +- βœ… `uv run ruff check tests/test_tasks/test_heartbeat_*.py` β†’ All checks passed + +### Task 2 acceptance (GREEN) + +- βœ… `src/phaze/tasks/heartbeat.py` exists with `heartbeat_tick(ctx: dict[str, Any]) -> None` +- βœ… Module imports only stdlib + `phaze.schemas.agent_heartbeat.HeartbeatRequest` + `phaze.services.agent_client.AgentApiError` (no `phaze.database`, no `sqlalchemy.ext.asyncio`, no `phaze.tasks.session`) +- βœ… `src/phaze/tasks/agent_worker.py` imports `CronJob` from saq + `heartbeat_tick` from `phaze.tasks.heartbeat` +- βœ… `settings["cron_jobs"]` has `CronJob(heartbeat_tick, cron="* * * * * */30", unique=True, timeout=10)` (trailing-seconds form) +- βœ… `settings["functions"]` includes `heartbeat_tick` (so SAQ can dispatch) +- βœ… `agent_worker.py` is STILL a single .py file (Pitfall 9 avoided) +- βœ… All 5 heartbeat tests pass (`uv run pytest tests/test_tasks/test_heartbeat_cron.py tests/test_tasks/test_heartbeat_failure.py -x -q` β†’ 5 passed) +- βœ… `tests/test_task_split.py` still passes (heartbeat.py import-boundary doesn't leak phaze.database β€” 6 passed) +- βœ… `uv run mypy src/phaze/tasks/heartbeat.py src/phaze/tasks/agent_worker.py` β†’ Success: no issues found in 2 source files +- βœ… `uv run ruff check src/phaze/tasks/heartbeat.py src/phaze/tasks/agent_worker.py tests/test_tasks/test_heartbeat_*.py` β†’ All checks passed +- βœ… Empirical cron-cadence reverify: `croniter('* * * * * */30', datetime(2026, 1, 1))` β†’ gaps of 30.0 seconds; `croniter('*/30 * * * * *', datetime(2026, 1, 1))` β†’ gaps of 1.0 seconds (wrong form would have fired every second) +- βœ… Broader suite: `uv run pytest tests/test_tasks/ tests/test_phase04_gaps.py --deselect tests/test_phase04_gaps.py::test_docker_compose_has_agent_worker_consuming_agent_queue -q` β†’ 129 passed, 1 deselected (only the pre-existing Plan 29-03/04 deferred test is excluded) + +### Threat-model mitigations delivered + +| Threat ID | Mitigation Delivered | +|-----------|----------------------| +| T-29-06-01 (Spoofing β€” rogue agent impersonating another) | Bearer-token auth at `/api/internal/agent/heartbeat` (Phase 25 unchanged); `agent_id` resolved from token-hash lookup, NEVER from request body; `HeartbeatRequest.model_config["extra"] = "forbid"` rejects any agent_id smuggled in. No change required in this plan. | +| T-29-06-02 (DoS β€” flood via misconfigured cron) | Trailing-seconds cron form `* * * * * */30` empirically fires at 30s cadence (gaps of 30.0s verified); leading-seconds form `*/30 * * * * *` would fire every second (gaps of 1.0s). The correct form is locked in the CronJob registration. SAQ `unique=True` prevents duplicate enqueues within the 30s window. | +| T-29-06-03 (Info disclosure β€” bearer token in failure logs) | `logger.warning("heartbeat failed: %s", exc)` interpolates `AgentApiError.__str__` β€” the AgentApiError class hierarchy (verified at `agent_client.py:75-88`) has no custom `__init__` and does NOT carry the bearer token in any field. Future Phase 26 D-13 invariant additionally guards against token leaks at the client-construction layer. | +| T-29-06-04 (Tampering β€” malicious queue.info return crashes handler) | Broad `except Exception` around `queue.info()` defaults to `queue_depth=0` and logs WARNING with `exc_info=True`. Tested via `test_heartbeat_queue_info_failure_defaults_to_zero`. | +| T-29-06-05 (DoS β€” heartbeat blocks SAQ event loop) | `client.heartbeat()` is `await`-ed (async httpx under the hood); tenacity retry funnel (Phase 26 D-11) bounds wall-time to ~4s; CronJob `timeout=10` upper-bounds the handler. The 30s cadence is > 10s timeout, so the next tick fires cleanly. | +| T-29-06-06 (Operational β€” silent heartbeat failure forever) | Accepted per D-09 fire-and-forget. The app-server's `last_seen_at` stops advancing β†’ admin page (Plan 07) surfaces "stale"/"dead" β†’ operator notices. | +| T-29-06-07 (Operational β€” agent_worker.py refactored to package) | Pitfall 9 avoided: cron_jobs added IN-PLACE to existing `settings = {...}` dict; heartbeat_tick lives in sibling `phaze/tasks/heartbeat.py`. agent_worker.py remains a single .py file; all existing imports (e.g., `tests/test_task_split.py:54`) continue to work. | + +## Self-Check: PASSED + +**Files created β€” verified to exist:** + +- βœ… `src/phaze/tasks/heartbeat.py` β€” FOUND +- βœ… `tests/test_tasks/test_heartbeat_cron.py` β€” FOUND +- βœ… `tests/test_tasks/test_heartbeat_failure.py` β€” FOUND + +**Files modified β€” verified `git log --follow` reachable:** + +- βœ… `src/phaze/tasks/agent_worker.py` β€” modified in `afbf048` + +**Commits β€” verified `git log --all` reachable:** + +- βœ… `48ad8c1` β€” Task 1 (test, RED step) +- βœ… `afbf048` β€” Task 2 (feat, GREEN step) + +## TDD Gate Compliance + +- βœ… RED gate commit: `48ad8c1` (`test(29-06): add failing tests for SAQ heartbeat cron handler`) β€” tests fail with `ModuleNotFoundError: No module named 'phaze.tasks.heartbeat'` +- βœ… GREEN gate commit: `afbf048` (`feat(29-06): wire SAQ 30s heartbeat cron handler (OPS-04 caller)`) β€” all 5 tests pass, no exception escapes +- ⏭️ REFACTOR gate: not required (no refactoring needed beyond the inline `# noqa: BLE001` removal, which was applied pre-commit and folded into the GREEN commit per ruff fail-fast) diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-07-PLAN.md b/.planning/phases/29-deployment-hardening-agents-admin/29-07-PLAN.md new file mode 100644 index 0000000..15cc948 --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-07-PLAN.md @@ -0,0 +1,545 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 07 +type: execute +wave: 3 +depends_on: [] +files_modified: + - src/phaze/constants.py + - src/phaze/services/agent_liveness.py + - src/phaze/utils/__init__.py + - src/phaze/utils/humanize.py + - src/phaze/routers/admin_agents.py + - src/phaze/templates/admin/agents.html + - src/phaze/templates/admin/partials/agents_table.html + - src/phaze/templates/admin/partials/_status_pill.html + - src/phaze/templates/base.html + - src/phaze/main.py + - tests/test_services/test_agent_liveness.py + - tests/test_utils/__init__.py + - tests/test_utils/test_humanize.py + - tests/test_routers/test_admin_agents.py +autonomous: true +requirements: [OPS-04] +tags: [phase-29, ops-04, admin-ui, htmx, alpine, ui-spec, v4.0] + +must_haves: + truths: + - "GET /admin/agents renders the page with a status-classified, sorted list of all agents (UI-SPEC + D-11..D-14)" + - "GET /admin/agents/_table returns the HTMX partial unconditionally (no full page; UI-SPEC + D-11)" + - "GET /admin/agents with HX-Request: true header returns the partial only (Pitfall 5; UI-SPEC Β§Server-Side Behavior)" + - "Status classifier produces all 5 states correctly: alive (<90s), stale (90-300s), dead (β‰₯300s), revoked (revoked_at not null), never (last_seen_at IS NULL and not revoked)" + - "Sort order: revoked agents last; within non-revoked: status_rank ascending (aliveβ†’staleβ†’deadβ†’never); within same status: last_seen_at descending" + - "Status thresholds live in phaze.constants.AGENT_LIVENESS_ALIVE_SECONDS=90 and AGENT_LIVENESS_STALE_SECONDS=300 (D-12)" + - "relative_time helper produces 'never'/'just now'/'Ns ago'/'Nm ago'/'Nh ago'/'Nd ago' per UI-SPEC Β§Relative-Time Helper LOCKED table" + - "base.html has a new 'Agents' nav link between 'Audit Log' and the theme toggle, using current_page == 'admin_agents' (short slug matching the live base.html convention β€” Audit Log uses 'audit' not 'audit_log'; WARNING-1)" + - "Status pill partial _status_pill.html renders all 5 states with locked Tailwind classes from UI-SPEC Β§Status Pill Component" + - "agents_table.html outer
has hx-get='/admin/agents/_table', hx-trigger='every 5s', hx-swap='outerHTML', NEVER halts (UI-SPEC Β§Polling; D-13)" + - "Empty state markup matches UI-SPEC Β§Empty State (centered py-8 block with 'No agents registered yet' heading)" + - "main.py registers the new admin_agents router (alongside the Phase 25/26/27/28 routers)" + - "admin_agents router does NOT use get_authenticated_agent dep (operator-facing, no auth)" + - "BLOCKER-2: agents.html includes a MANDATORY htmx:responseError + htmx:sendError event listener attached to #agents-table-section that writes a `phaze:agents:lastError` localStorage ISO timestamp; agents_table.html partial renders a red 'Refresh failed at HH:MM:SS' footer (role=alert) driven by that localStorage value (UI-SPEC Β§Error / Failure-Tolerant Refresh is LOCKED with status: approved β€” NOT optional, NOT deferred)" + artifacts: + - path: "src/phaze/constants.py" + provides: "AGENT_LIVENESS_ALIVE_SECONDS=90 + AGENT_LIVENESS_STALE_SECONDS=300" + contains: "AGENT_LIVENESS_ALIVE_SECONDS" + - path: "src/phaze/services/agent_liveness.py" + provides: "classify(agent, now) + sort_key(agent, now) β€” pure functions, no DB" + min_lines: 30 + exports: ["classify", "sort_key", "AgentStatus"] + - path: "src/phaze/utils/__init__.py" + provides: "utils package marker" + contains: "" + - path: "src/phaze/utils/humanize.py" + provides: "relative_time(dt, *, now=None) β€” 'Ns ago' / 'Nm ago' / 'Nh ago' / 'Nd ago' / 'never' / 'just now'" + min_lines: 18 + exports: ["relative_time"] + - path: "src/phaze/routers/admin_agents.py" + provides: "APIRouter(prefix='/admin/agents', tags=['admin']); GET / + GET /_table" + min_lines: 60 + exports: ["router"] + - path: "src/phaze/templates/admin/agents.html" + provides: "Page shell extending base.html; current_page='admin_agents'; hosts the MANDATORY htmx:responseError + htmx:sendError + htmx:afterSwap listener writing/clearing `phaze:agents:lastError` localStorage (BLOCKER-2)" + min_lines: 35 + contains: "htmx:responseError" + - path: "src/phaze/templates/admin/partials/agents_table.html" + provides: "HTMX self-replacing
; table or empty state; happy-path 'Last refreshed Ns ago' Alpine footer + MANDATORY red 'Refresh failed at HH:MM:SS' role=alert footer driven by `phaze:agents:lastError` localStorage (BLOCKER-2)" + min_lines: 90 + contains: "localStorage" + - path: "src/phaze/templates/admin/partials/_status_pill.html" + provides: "5-state status pill (alive/stale/dead/revoked/never) with locked Tailwind classes" + min_lines: 15 + - path: "src/phaze/templates/base.html" + provides: "New 'Agents' nav link between Audit Log link (current_page == 'audit') and theme toggle; uses current_page == 'admin_agents' short slug (WARNING-1)" + contains: "/admin/agents" + - path: "src/phaze/main.py" + provides: "app.include_router(admin_agents.router) in create_app" + contains: "admin_agents" + - path: "tests/test_services/test_agent_liveness.py" + provides: "5-state classify matrix + sort_key ordering tests" + min_lines: 60 + - path: "tests/test_utils/__init__.py" + provides: "package marker" + contains: "" + - path: "tests/test_utils/test_humanize.py" + provides: "Parametrized test matrix per UI-SPEC Β§Relative-Time Helper output rules" + min_lines: 40 + - path: "tests/test_routers/test_admin_agents.py" + provides: "Smoke-app integration: 6 original tests (empty/single/many/HX-Request/sort/pills) + 3 BLOCKER-2 assertions (htmx listener present in page; localStorage red-footer present in partial; role=alert on banner)" + min_lines: 140 + key_links: + - from: "src/phaze/routers/admin_agents.py::page" + to: "src/phaze/services/agent_liveness.py::classify" + via: "transient ORM attribute _status injection" + pattern: "agent\\._status = classify\\(a, now\\)" + - from: "src/phaze/routers/admin_agents.py::page" + to: "phaze.database::get_session" + via: "Annotated[AsyncSession, Depends(get_session)]" + pattern: "Depends\\(get_session\\)" + - from: "src/phaze/templates/admin/partials/agents_table.html" + to: "GET /admin/agents/_table" + via: "hx-get attribute on outer
" + pattern: "hx-get=\"/admin/agents/_table\"" + - from: "src/phaze/templates/admin/agents.html" + to: "src/phaze/templates/base.html" + via: "{% extends 'base.html' %}" + pattern: "extends.*base.html" + - from: "src/phaze/templates/admin/agents.html (htmx event listener)" + to: "browser localStorage `phaze:agents:lastError`" + via: "htmx:responseError + htmx:sendError β†’ localStorage.setItem; htmx:afterSwap β†’ localStorage.removeItem" + pattern: "phaze:agents:lastError" + - from: "src/phaze/templates/admin/partials/agents_table.html (Alpine red footer)" + to: "browser localStorage `phaze:agents:lastError`" + via: "x-data reads localStorage every 2s; renders red role=alert banner with timestamp when key is present" + pattern: "localStorage.getItem" + - from: "src/phaze/main.py" + to: "src/phaze/routers/admin_agents.py::router" + via: "app.include_router(admin_agents.router)" + pattern: "admin_agents" +--- + + +Build the operator-facing `/admin/agents` admin page (Plan 07) β€” the UI half of OPS-04. Wave-0 deliverables: constants + pure-function classifier + relative-time helper + their tests (deterministic, no DB). Wave-1 deliverables: the FastAPI router + Jinja templates + base.html nav link + main.py registration + smoke-app integration test. The page polls `/admin/agents/_table` every 5s and never halts (UI-SPEC LOCKED). The status pill component has 5 states with locked Tailwind classes per UI-SPEC Β§Status Pill Component. + +**BLOCKER-2 resolution (failure-tolerant footer is MANDATORY in v1):** UI-SPEC Β§Error / Failure-Tolerant Refresh has `status: approved` β€” this is a LOCKED design contract. The failure-tolerant footer (htmx:responseError + htmx:sendError listener writing a `phaze:agents:lastError` localStorage ISO timestamp + a red "Refresh failed at HH:MM:SS" role=alert footer driven by that localStorage value, with the timestamp CLEARED on a successful htmx:afterSwap so recovered transients do not pin the banner) ships in v1 of this plan. NOT optional, NOT polish, NOT deferred. Task 2 includes both the happy-path footer and the failure-tolerant footer; both are required for plan acceptance and are verified by 3 dedicated tests. + +**WARNING-1 resolution (nav-link short slug):** The new nav link uses `current_page == 'admin_agents'` (short slug). The live `src/phaze/templates/base.html` line 167 uses `current_page == 'audit'` for the Audit Log link (NOT `audit_log` as the UI-SPEC Β§Navigation Integration excerpt suggests). The executor reads base.html directly. New links follow the live convention: short slugs, no `_log` / `_page` / `_list` suffixes. + +Purpose: Closes OPS-04 UI half. Together with Plan 06's heartbeat caller, this fully delivers OPS-04 success criterion #6. + +Output: 4 new Python modules (constants extension, services/agent_liveness, utils/humanize, routers/admin_agents); 3 new Jinja templates; 1 template edit (base.html nav); 1 main.py registration; 4 new test files; new utils package. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/STATE.md +@.planning/REQUIREMENTS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-CONTEXT.md +@.planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md +@.planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md +@.planning/phases/29-deployment-hardening-agents-admin/29-UI-SPEC.md + + + + +```python +class Agent(Base): + __tablename__ = "agents" + id: Mapped[str] = mapped_column(primary_key=True) + name: Mapped[str] + scan_roots: Mapped[list[str]] = mapped_column(JSONB) + last_seen_at: Mapped[datetime | None] = mapped_column(default=None) + last_status: Mapped[dict | None] = mapped_column(JSONB, default=None) + revoked_at: Mapped[datetime | None] = mapped_column(default=None) + created_at: Mapped[datetime] + token_hash: Mapped[str] +``` + + + +```jinja + + Audit Log + +``` + + + + + + + + + + + + Task 1: Wave 0 β€” constants + agent_liveness + utils.humanize + their tests (pure-function tier) + src/phaze/constants.py, src/phaze/services/agent_liveness.py, src/phaze/utils/__init__.py, src/phaze/utils/humanize.py, tests/test_services/test_agent_liveness.py, tests/test_utils/__init__.py, tests/test_utils/test_humanize.py + + - `src/phaze/constants.py` exports `AGENT_LIVENESS_ALIVE_SECONDS: int = 90` and `AGENT_LIVENESS_STALE_SECONDS: int = 300` with docstrings. + - `src/phaze/services/agent_liveness.py` exports `classify(agent, now) -> AgentStatus`, `sort_key(agent, now) -> tuple[int, int, float]`, `AgentStatus = Literal["alive","stale","dead","revoked","never"]`. + - `classify` precedence: revoked β†’ never β†’ alive/stale/dead per thresholds. + - `sort_key`: `(revoked_int, status_rank, -last_seen_unix_or_-inf)` with `_STATUS_RANK = {alive:0, stale:1, dead:2, revoked:3, never:3}`. + - `src/phaze/utils/humanize.py` exports `relative_time(dt, *, now=None) -> str` matching UI-SPEC LOCKED output table (`89.7s β†’ "89s ago"` truncation case explicitly). + - All parametrized tests (12+ classify cases, 14+ humanize cases) pass. + - `uv run mypy` + `uv run ruff check` clean across new files. + + + - src/phaze/constants.py (existing constants file β€” verify path; if it doesn't exist, create from scratch) + - src/phaze/models/agent.py (Agent ORM model β€” read-only field signatures for classify input) + - .planning/phases/29-deployment-hardening-agents-admin/29-UI-SPEC.md Β§"Status Pill Component (5 States)" lines 177-220 + - .planning/phases/29-deployment-hardening-agents-admin/29-UI-SPEC.md Β§"Relative-Time Helper" lines 224-261 (LOCKED output table; truncation rule) + - .planning/phases/29-deployment-hardening-agents-admin/29-RESEARCH.md Β§"Pattern 6: HTMX self-replacing poll partial" lines 696-748 + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/services/agent_liveness.py" lines 223-249 + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"src/phaze/utils/humanize.py" lines 252-279 + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_services/test_agent_liveness.py" lines 1077-1090 + - .planning/phases/29-deployment-hardening-agents-admin/29-PATTERNS.md Β§"tests/test_utils/test_humanize.py" lines 1094-1107 + - tests/test_services/test_dedup.py (analog parametrized matrix) + + +Extend `src/phaze/constants.py` by APPENDING: + +```python +AGENT_LIVENESS_ALIVE_SECONDS: int = 90 +"""Seconds since last_seen_at below which agent is 'alive'.""" + +AGENT_LIVENESS_STALE_SECONDS: int = 300 +"""Seconds since last_seen_at below which agent is 'stale'; β‰₯ this = 'dead'.""" +``` + +Write `src/phaze/services/agent_liveness.py` per RESEARCH lines 696-726 + PATTERNS lines 223-249 (full body with `classify`, `sort_key`, `AgentStatus`, `_STATUS_RANK`). + +Critical detail: importing `phaze.models.agent` IS allowed here. The Postgres-free invariant applies only to `phaze.cert_bootstrap`, `phaze.entrypoint`, `phaze.tasks.agent_worker`, `phaze.tasks._shared.*` (NOT to `phaze.services.*`). + +Write `src/phaze/utils/__init__.py` as empty package marker with docstring. + +Write `src/phaze/utils/humanize.py` per RESEARCH lines 729-748 with the explicit truncation rule (`int(delta // 60)` NOT round-to-nearest β€” UI-SPEC line 248 LOCKS `89.7s β†’ "89s ago"`). + +Write `tests/test_services/test_agent_liveness.py` parametrized over the 5-state classify matrix + sort tests. Write `tests/test_utils/test_humanize.py` parametrized over 14+ boundary cases including the explicit `89.7s β†’ "89s ago"` UI-SPEC LOCKED case. + + + uv run pytest tests/test_services/test_agent_liveness.py tests/test_utils/test_humanize.py -x -q + + +- `src/phaze/constants.py` has the 2 AGENT_LIVENESS_* constants +- `src/phaze/services/agent_liveness.py` exports `classify`, `sort_key`, `AgentStatus` +- `src/phaze/utils/humanize.py` exports `relative_time` +- All parametrized tests pass +- `uv run mypy` + `uv run ruff check` clean across new files + + + + + Task 2: Wave 1 β€” admin_agents router + 3 templates incl. MANDATORY failure-tolerant footer (BLOCKER-2) + base.html nav (short slug per WARNING-1) + main.py registration + smoke-app test + src/phaze/routers/admin_agents.py, src/phaze/templates/admin/agents.html, src/phaze/templates/admin/partials/agents_table.html, src/phaze/templates/admin/partials/_status_pill.html, src/phaze/templates/base.html, src/phaze/main.py, tests/test_routers/test_admin_agents.py + + - `src/phaze/routers/admin_agents.py` has `router = APIRouter(prefix="/admin/agents", tags=["admin"])` with two handlers: `page` (HX-Request-aware: full page OR partial) and `table_partial` (always partial). + - `_load_agents(session)` queries Agent rows, injects transient `agent._status` via `classify`, sorts via `sort_key`. + - Router does NOT use `get_authenticated_agent` dep (operator-facing). + - `templates/admin/agents.html` extends `base.html`, sets `current_page = "admin_agents"`, renders title + intro + skip link + includes the partial. + - **BLOCKER-2 (MANDATORY):** `templates/admin/agents.html` ALSO contains a ` +{% endblock %} +``` + +**BLOCKER-2 β€” Write `src/phaze/templates/admin/partials/agents_table.html`** with the MANDATORY red failure footer: + +```jinja +
+

Registered agents

+ {% if not agents %} +
+

No agents registered yet

+

+ Provision an agent token via psql, then run just up-agent on the file server. Once the worker boots, it will appear here within a few seconds. +

+
+ {% else %} +
+ + + + + + + + + + + + + + {% for agent in agents %} + + + + + + + + + {% endfor %} + +
Registered agents
AgentStatusQueueLast seenScan rootsActions
+ {{ agent.name }} + {{ agent.id }} + {% include "admin/partials/_status_pill.html" %} + {% if agent.last_status and agent.last_status.get('queue_depth') is not none %} + {{ agent.last_status['queue_depth'] }} + {% else %} + β€” + {% endif %} + {{ humanize_relative_time(agent.last_seen_at, now=now) }}{{ agent.scan_roots | length }}
+
+ {% endif %} + + +

+ Last refreshed +

+ + + +
+``` + +Write `src/phaze/templates/admin/partials/_status_pill.html` per PATTERNS lines 507-520 β€” 5-state branching with LOCKED Tailwind classes (alive=green, stale=amber, dead=red, revoked/never=gray) and `aria-label="Status: "`. + +**WARNING-1 β€” Modify `src/phaze/templates/base.html`:** locate the Audit Log link at base.html:166-169 (verify it uses `current_page == 'audit'` short slug). Locate the theme toggle `
` at line 173. Insert the new Agents link BETWEEN them. The new link's conditional uses `current_page == 'admin_agents'` (short slug; matches the live convention). Do NOT use `'admin_agents_log'`, `'admin_agents_page'`, or any `_log`/`_page` suffix. Do NOT retrofit `aria-current` onto the other 9 nav links. + +```jinja + + Agents + +``` + +Modify `src/phaze/main.py`: add `from phaze.routers import admin_agents` and `app.include_router(admin_agents.router)` in `create_app()` alongside other includes. + +Write `tests/test_routers/test_admin_agents.py` with the 6 original tests (test_page_renders_full_html, test_htmx_request_returns_partial_only, test_dedicated_table_route_returns_partial, test_status_pills_render_all_5_states, test_empty_state, test_sort_order) PLUS the 3 BLOCKER-2 tests: + +```python +async def test_page_includes_htmx_error_listener(smoke: AsyncClient) -> None: + """BLOCKER-2: UI-SPEC Β§Error / Failure-Tolerant Refresh LOCKED β€” the full + page must include the htmx:responseError + htmx:sendError listener that + writes localStorage `phaze:agents:lastError`.""" + response = await smoke.get("/admin/agents") + body = response.text + assert "htmx:responseError" in body, "Missing htmx:responseError listener (BLOCKER-2)" + assert "htmx:sendError" in body, "Missing htmx:sendError listener (BLOCKER-2)" + assert "htmx:afterSwap" in body, "Missing htmx:afterSwap recovery handler (BLOCKER-2)" + assert "phaze:agents:lastError" in body, "Missing localStorage key (BLOCKER-2)" + assert "localStorage.setItem" in body, "Listener must write to localStorage (BLOCKER-2)" + assert "localStorage.removeItem" in body, "Recovery handler must clear localStorage (BLOCKER-2)" + + +async def test_partial_includes_failure_tolerant_footer(smoke: AsyncClient) -> None: + """BLOCKER-2: agents_table partial must render the red 'Refresh failed' + footer driven by localStorage `phaze:agents:lastError`.""" + response = await smoke.get("/admin/agents/_table") + body = response.text + assert "localStorage.getItem" in body, "Partial must read from localStorage (BLOCKER-2)" + assert "phaze:agents:lastError" in body, "Partial must reference the localStorage key (BLOCKER-2)" + assert "Refresh failed" in body, "Partial must include the red 'Refresh failed' copy (BLOCKER-2)" + + +async def test_partial_failure_footer_uses_role_alert(smoke: AsyncClient) -> None: + """BLOCKER-2 + accessibility: red failure banner uses role=alert so + screen readers announce it when it becomes visible.""" + response = await smoke.get("/admin/agents/_table") + body = response.text + assert 'role="alert"' in body, "Failure banner must have role=alert (a11y + BLOCKER-2)" +``` + +The session fixture is project-wide (from `tests/conftest.py` or `tests/test_routers/conftest.py`). Reuse it. + + + uv run pytest tests/test_routers/test_admin_agents.py tests/test_services/test_agent_liveness.py tests/test_utils/test_humanize.py -x -q && test $(grep -c "htmx:responseError" src/phaze/templates/admin/agents.html) -ge 1 && test $(grep -c "phaze:agents:lastError" src/phaze/templates/admin/partials/agents_table.html) -ge 1 && test $(grep -c 'role="alert"' src/phaze/templates/admin/partials/agents_table.html) -ge 1 + + +- `src/phaze/routers/admin_agents.py` exists with router + page + table_partial handlers +- 3 new Jinja templates exist with the locked markup +- **BLOCKER-2 (MANDATORY) DELIVERED:** `agents.html` contains the htmx:responseError + htmx:sendError + htmx:afterSwap listeners writing/clearing `phaze:agents:lastError` localStorage; `agents_table.html` contains the red role=alert "Refresh failed at HH:MM:SS" footer driven by that localStorage value +- **WARNING-1:** the new Agents nav link uses `current_page == 'admin_agents'` (short slug matching live base.html convention) +- `src/phaze/main.py` registers `admin_agents.router` +- All 9+ tests in `test_admin_agents.py` pass (6 original + 3 BLOCKER-2) +- `uv run mypy` + `uv run ruff check` clean across new files +- Manual smoke (post-merge): visit `/admin/agents`; kill api; red footer appears within ~2s; restart api; red footer disappears within ~2s + + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| operator browser β†’ /admin/agents page | no auth on this LAN-only single-user endpoint | +| Postgres β†’ admin_agents router | read-only `SELECT FROM agents` (no writes) | +| Jinja2 autoescape β†’ operator browser | agent-controlled string values (name, id) flow through autoescape | +| browser localStorage `phaze:agents:lastError` | operator's browser per-origin state β€” non-secret timestamp | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-29-07-01 | Spoofing | Anonymous operator views agent data via /admin/agents | accept (UI-SPEC Β§Server-Side Behavior LOCKED) | Private LAN; consistent with pipeline.py / pipeline_scans.py precedent | +| T-29-07-02 | Information Disclosure | XSS via agent.name or agent.id | mitigate | Jinja2 autoescape ON; no `\|safe` filter usage | +| T-29-07-03 | Information Disclosure | Token hash leaked in render | mitigate | `token_hash` never referenced in any template | +| T-29-07-04 | Tampering | HTMX poll responses inject malicious HTML | mitigate | Jinja2 autoescape mitigates; outerHTML swap can't break out into surrounding markup | +| T-29-07-05 | DoS | 5s polling cadence with hundreds of agents | accept (v4.0 scale: 1-5 agents) | One `SELECT FROM agents` (~10ms even with 100 agents) | +| T-29-07-06 | Operational | Pitfall 5 β€” `/admin/agents` page returning full HTML when HTMX expects a partial | mitigate | Dual-purpose handler checks `HX-Request`; dedicated `/_table` is the canonical polling target | +| T-29-07-07 | Operational | Transient ORM attribute `_status` collides with future Agent column | accept | `_` prefix is the project convention; SQLAlchemy ignores attrs not declared as Mapped columns | +| T-29-07-08 | Operational (BLOCKER-2) | poll failure is silent; operator sees stale data assuming agents are alive | mitigate | UI-SPEC Β§Error / Failure-Tolerant Refresh LOCKED contract delivered: htmx:responseError + htmx:sendError write `phaze:agents:lastError` localStorage; red role=alert footer announces failure within ~2s; clears on htmx:afterSwap. 3 dedicated tests verify presence | +| T-29-07-09 | Information Disclosure (BLOCKER-2) | localStorage `phaze:agents:lastError` exposes operator activity | accept | localStorage is per-origin; value is a non-secret ISO timestamp (not page content, agent IDs, tokens) | +| T-29-07-10 | Operational (BLOCKER-2) | localStorage unavailable (private mode, quota exceeded) β†’ listener throws β†’ ALL htmx events break | mitigate | The listener wraps `localStorage.setItem/removeItem` in try/catch; failures degrade silently (banner just doesn't show on next failure β€” equivalent to v0 behavior) | + + + +- `uv run pytest tests/test_services/test_agent_liveness.py tests/test_utils/test_humanize.py tests/test_routers/test_admin_agents.py -x -q` β€” all green (12+ classify + 14+ humanize + 9+ router incl. 3 BLOCKER-2) +- Grep gates (BLOCKER-2): + - `grep -c "htmx:responseError" src/phaze/templates/admin/agents.html` β‰₯ 1 + - `grep -c "htmx:sendError" src/phaze/templates/admin/agents.html` β‰₯ 1 + - `grep -c "htmx:afterSwap" src/phaze/templates/admin/agents.html` β‰₯ 1 + - `grep -c "phaze:agents:lastError" src/phaze/templates/admin/agents.html` β‰₯ 1 + - `grep -c "localStorage.setItem" src/phaze/templates/admin/agents.html` β‰₯ 1 + - `grep -c "localStorage.removeItem" src/phaze/templates/admin/agents.html` β‰₯ 1 + - `grep -c "phaze:agents:lastError" src/phaze/templates/admin/partials/agents_table.html` β‰₯ 1 + - `grep -c "Refresh failed" src/phaze/templates/admin/partials/agents_table.html` β‰₯ 1 + - `grep -c 'role="alert"' src/phaze/templates/admin/partials/agents_table.html` β‰₯ 1 +- `uv run pytest tests/ -x -q` β€” no regression in the broader suite +- `uv run mypy .` clean +- `uv run ruff check .` clean +- `uv run ruff format --check .` clean +- Manual UI smoke (recommended after merge): visit `/admin/agents`; open DevTools Network tab to confirm 5s polling; stop the api container; confirm the red "Refresh failed at HH:MM:SS" footer appears within ~2s (UI-SPEC Β§Error / Failure-Tolerant Refresh delivered); restart api; confirm the banner disappears within ~2s on the next successful poll + + + +- OPS-04 UI half closed (combined with Plan 06's caller half: full OPS-04 closure) +- D-11, D-12, D-13, D-14 implemented per UI-SPEC LOCKED contracts +- **BLOCKER-2 RESOLVED:** UI-SPEC Β§Error / Failure-Tolerant Refresh (status: approved) delivered in v1 β€” htmx:responseError/htmx:sendError listener + localStorage `phaze:agents:lastError` + red role=alert "Refresh failed" footer all ship in this plan, NOT deferred, NOT optional. 3 dedicated tests verify presence. +- **WARNING-1 RESOLVED:** new nav link uses the short-slug `current_page == 'admin_agents'` convention matching live base.html (Audit Log uses `'audit'`, not `'audit_log'`) +- 9+ tests in test_admin_agents.py (6 original + 3 BLOCKER-2) + 12+ in test_agent_liveness.py + 14+ in test_humanize.py +- New `phaze.utils` package + new `templates/admin/` namespace established per UI-SPEC Β§Template Structure +- Status pill component LOCKED at the 5-state Tailwind class palette per UI-SPEC + + + +Create `.planning/phases/29-deployment-hardening-agents-admin/29-07-SUMMARY.md` when both tasks complete. Summary must list: new files (router, 3 templates, 2 helper modules, 4 test files), modified files (base.html, main.py, constants.py), decision IDs implemented (D-11, D-12, D-13, D-14), the UI-SPEC dimensions covered (with explicit note that Β§Error / Failure-Tolerant Refresh LOCKED contract was DELIVERED in v1 β€” BLOCKER-2 resolution), and the WARNING-1 nav-key short-slug convention note. + diff --git a/.planning/phases/29-deployment-hardening-agents-admin/29-07-SUMMARY.md b/.planning/phases/29-deployment-hardening-agents-admin/29-07-SUMMARY.md new file mode 100644 index 0000000..bd4516a --- /dev/null +++ b/.planning/phases/29-deployment-hardening-agents-admin/29-07-SUMMARY.md @@ -0,0 +1,281 @@ +--- +phase: 29-deployment-hardening-agents-admin +plan: 07 +subsystem: admin-ui +tags: [phase-29, ops-04, admin-ui, htmx, alpine, ui-spec, v4.0] + +# Dependency graph +requires: + - phase: 27-pipeline-trigger-scan + provides: pipeline_scans router pattern (smoke-app fixture + Jinja2Templates + HX-Request handling + transient ORM attribute injection) + - phase: 28-execution-progress + provides: heartbeat caller (Plan 06 of this phase wires the writes that Plan 07 reads) +provides: + - GET /admin/agents page + GET /admin/agents/_table HTMX 5s poll partial + - 5-state agent liveness pill (alive/stale/dead/revoked/never) with LOCKED Tailwind palette + - phaze.services.agent_liveness pure-function classifier + sort_key + - phaze.utils.humanize.relative_time helper (LOCKED output table) + - templates/admin/ namespace convention for future admin pages + - "Agents" top-nav link with WARNING-1 short-slug `admin_agents` convention + - BLOCKER-2 failure-tolerant footer (htmx event listener + localStorage red banner) β€” DELIVERED in v1 +affects: [phase-30+ admin pages, future ops UI work, OPS-04 closure] + +# Tech tracking +tech-stack: + added: [] # zero new pip / npm deps β€” Tailwind, HTMX, Alpine already CDN-loaded by base.html + patterns: + - "HTMX self-replacing
with outerHTML swap (UI-SPEC §Polling LOCKED)" + - "Failure-tolerant refresh via localStorage + htmx event listener (UI-SPEC §Error LOCKED)" + - "Transient ORM attribute injection for view-only fields (`agent._status`, Phase 27 sibling pattern)" + - "Pure-function classifier + sort_key with explicit `now` param (no datetime.now() inside, test-deterministic)" + - "templates/admin/ namespace + underscore-prefix for nested-include partials" + +key-files: + created: + - src/phaze/services/agent_liveness.py + - src/phaze/utils/__init__.py + - src/phaze/utils/humanize.py + - src/phaze/routers/admin_agents.py + - src/phaze/templates/admin/agents.html + - src/phaze/templates/admin/partials/agents_table.html + - src/phaze/templates/admin/partials/_status_pill.html + - tests/test_services/test_agent_liveness.py + - tests/test_utils/__init__.py + - tests/test_utils/test_humanize.py + - tests/test_routers/test_admin_agents.py + modified: + - src/phaze/constants.py + - src/phaze/main.py + - src/phaze/templates/base.html + +key-decisions: + - "D-11: /admin/agents page route + dedicated /admin/agents/_table partial route (separate file: src/phaze/routers/admin_agents.py)" + - "D-12: 5-state thresholds AGENT_LIVENESS_ALIVE_SECONDS=90 + AGENT_LIVENESS_STALE_SECONDS=300; precedence revoked → never → alive/stale/dead" + - "D-13: HTMX hx-trigger='every 5s' + hx-swap='outerHTML' on the partial; NEVER halts (always re-emits hx-trigger)" + - "D-14: 6-column table (Agent, Status, Queue, Last seen, Scan roots, Actions); sort revoked-last then status_rank ascending then last_seen DESC; actions column empty (no v1 CTAs)" + - "BLOCKER-2 resolution: UI-SPEC §Error / Failure-Tolerant Refresh LOCKED contract DELIVERED in v1 (not deferred). htmx:responseError + htmx:sendError + htmx:afterSwap listener + localStorage `phaze:agents:lastError` + red role=alert banner all shipped + 3 dedicated tests." + - "WARNING-1 resolution: new nav link uses SHORT-SLUG `current_page == 'admin_agents'` (NOT `'admin_agents_log'` or `'admin_agents_page'`) matching the live base.html convention where Audit Log uses `'audit'`." + - "UI-SPEC documentation defect reconciliation: line 248 prose '89.7s → 89s ago' is inconsistent with its own bucket table (lines 232-241). The table is authoritative; truncation rule verified with 59.7s → 59s ago instead. See deviation log." + +patterns-established: + - "Pure-function service tier with explicit `now: datetime` param (test-deterministic without freezegun) — agent_liveness mirrors Phase 27 elapsed_seconds shape" + - "templates/admin/ namespace + admin/partials/ subdirectory + admin/partials/_.html underscore convention for nested-include partials (UI-SPEC §Template Structure LOCKED for future phases)" + - "BLOCKER-2 failure-tolerant HTMX poll: page-level event listener writes localStorage key; partial-level Alpine reads it on 2s interval; htmx:afterSwap clears it on recovery" + +requirements-completed: [OPS-04] + +# Metrics +duration: ~30min +completed: 2026-05-16 +--- + +# Phase 29 Plan 07: Operator-facing /admin/agents page Summary + +**Closes OPS-04 UI half: operators visit `/admin/agents` to see every registered file-server agent with a 5-state liveness pill that refreshes every 5 seconds, and the page degrades gracefully when the API is unreachable via a red role=alert banner driven by localStorage (BLOCKER-2 LOCKED contract delivered in v1).** + +## Performance + +- **Duration:** ~30 min +- **Started:** 2026-05-16T22:25:00Z +- **Completed:** 2026-05-16T22:56:00Z +- **Tasks:** 2/2 +- **Files created:** 11 +- **Files modified:** 3 + +## Accomplishments + +### Wave 0 — Pure-function tier (Task 1) + +1. **Constants extension** (`src/phaze/constants.py`): added `AGENT_LIVENESS_ALIVE_SECONDS = 90` and `AGENT_LIVENESS_STALE_SECONDS = 300` with full docstrings explaining the rationale (3× heartbeat cadence for alive; ~10 missed beats for dead). Shared by classifier + UI + tests for one source of truth (Phase 29 D-12 LOCKED). + +2. **Agent liveness service** (`src/phaze/services/agent_liveness.py`): pure functions `classify(agent, now)` and `sort_key(agent, now)`. Precedence chain LOCKED per D-12: `revoked → never → alive/stale/dead`. Sort tuple is `(revoked_int, status_rank, neg_last_seen)` so revoked agents always land last, non-revoked sort `alive (0) → stale (1) → dead (2) → never (3)`, and ties break by `last_seen_at` descending. Agent rows with `last_seen_at IS NULL` use `+inf` so they sort to the END of the 'never' bucket (only matters for revoked-with-no-heartbeat, which still gets re-grouped by `revoked_int=1` regardless). Imports `phaze.models.agent` (allowed per Postgres-free boundary docstring). + +3. **Utility helper** (`src/phaze/utils/humanize.py`): pure-function `relative_time(dt, *, now=None) -> str` producing `"never"` / `"just now"` / `"Ns ago"` / `"Nm ago"` / `"Nh ago"` / `"Nd ago"` per UI-SPEC LOCKED bucket table. Uses `int(d // 60)` etc. for explicit truncation (not rounding). Optional `now=` kwarg makes the helper deterministic for unit testing. New `phaze.utils` package established with module docstring noting future intent. + +4. **51 parametrized tests** (`tests/test_services/test_agent_liveness.py` + `tests/test_utils/test_humanize.py`): 5-state classify matrix at all boundary cases (0, 89, 90, 299, 300s); sort_key ordering invariants (revoked-last, alive-before-stale-before-dead, last_seen DESC within bucket); relative_time bucket boundaries at 0/59/60/3599/3600/86399/86400/259200; truncation rule (61.9s→1m, 5400s→1h, 129600s→1d, 59.7s→59s); format invariants (no plural-s suffix; single-letter unit). Default `now=None` branch covered. + +### Wave 1 — Router + templates + integration test (Task 2) + +5. **Admin router** (`src/phaze/routers/admin_agents.py`, ~120 lines): `APIRouter(prefix="/admin/agents", tags=["admin"])` with two handlers — `page` (HX-Request-aware, returns either `admin/agents.html` full page or the partial) and `table_partial` (always returns the partial; the canonical 5s polling target). `_load_agents` queries Agent rows, classifies via `classify(a, now)` and injects on transient `agent._status`, then sorts via `sort_key`. The `now` value is captured ONCE and passed to both classify/sort and the template's `refreshed_at_iso` context so the displayed timestamp matches the classification instant exactly. Exposes `humanize_relative_time` to all templates via `templates.env.globals` so the partial can call it directly. **NO `get_authenticated_agent` dependency** — operator pages are open on the private LAN per CONTEXT.md D-discretion and the pipeline.py / pipeline_scans.py precedent. + +6. **Page shell** (`src/phaze/templates/admin/agents.html`): extends `base.html`, sets `current_page = "admin_agents"`, renders `

Agents

` + sub-description + skip link + includes the partial. Below the partial: **BLOCKER-2 mandatory ` +{% endblock %} diff --git a/src/phaze/templates/admin/partials/_status_pill.html b/src/phaze/templates/admin/partials/_status_pill.html new file mode 100644 index 0000000..98182a0 --- /dev/null +++ b/src/phaze/templates/admin/partials/_status_pill.html @@ -0,0 +1,19 @@ +{# Phase 29: 5-state agent liveness pill. Mirrors scan_status_pill.html geometry + (text-xs font-semibold px-2 py-0.5 rounded-full β€” project-wide pill ladder). + Expects `agent` in context (loop variable from agents_table.html) with the + transient `_status` attribute set by routers/admin_agents._load_agents per + UI-SPEC Β§Status Pill Component LOCKED. Tailwind class palette LOCKED: + alive=green-100/950, stale=amber-100/950, dead=red-100/950, revoked/never + share the gray-100/800 "no signal" hue (visually unified, semantically + distinct via label text + aria-label). #} +{% if agent._status == 'alive' %} +ALIVE +{% elif agent._status == 'stale' %} +STALE +{% elif agent._status == 'dead' %} +DEAD +{% elif agent._status == 'revoked' %} +REVOKED +{% elif agent._status == 'never' %} +NEVER +{% endif %} diff --git a/src/phaze/templates/admin/partials/agents_table.html b/src/phaze/templates/admin/partials/agents_table.html new file mode 100644 index 0000000..d0ab532 --- /dev/null +++ b/src/phaze/templates/admin/partials/agents_table.html @@ -0,0 +1,100 @@ +{# Phase 29 plan 07 Component 1: HTMX self-replacing agents table partial. + + Outer
IS the swap target (hx-swap="outerHTML") so the partial owns + its full lifecycle β€” including the data-refreshed-at attribute and the two + Alpine-driven footers (happy-path "Last refreshed Ns ago" + BLOCKER-2 red + "Refresh failed at HH:MM:SS"). UI-SPEC Β§Polling LOCKS the 5s cadence and + "NEVER halt" invariant β€” hx-trigger is always emitted (no terminal state). + + BLOCKER-2 (MANDATORY): the red role=alert banner driven by localStorage + `phaze:agents:lastError` is REQUIRED v1 markup, not optional. UI-SPEC + Β§Error / Failure-Tolerant Refresh is status: approved (LOCKED). The + accompanying htmx:responseError + htmx:sendError + htmx:afterSwap event + listener lives in admin/agents.html (the page shell). #} +
+

Registered agents

+ {% if not agents %} +
+

No agents registered yet

+

+ Provision an agent token via psql, then run just up-agent on the file server. Once the worker boots, it will appear here within a few seconds. +

+
+ {% else %} +
+ + + + + + + + + + + + + + {% for agent in agents %} + + + + + + + + + {% endfor %} + +
Registered agents
AgentStatusQueueLast seenScan rootsActions
+ {{ agent.name }} + {{ agent.id }} + {% include "admin/partials/_status_pill.html" %} + {% if agent.last_status and agent.last_status.get('queue_depth') is not none %} + {{ agent.last_status['queue_depth'] }} + {% else %} + β€” + {% endif %} + {{ humanize_relative_time(agent.last_seen_at, now=now) }}{{ agent.scan_roots | length }}
+
+ {% endif %} + + {# Happy-path footer: "Last refreshed Ns ago" (neutral). Alpine reads + data-refreshed-at and ticks once per second so the user sees the + countdown advance even between 5s polls. #} +

+ Last refreshed +

+ + {# BLOCKER-2 (MANDATORY): UI-SPEC Β§Error / Failure-Tolerant Refresh β€” status: approved. + Red role=alert banner. Reads localStorage `phaze:agents:lastError`. The agents.html + listener writes the key on htmx:responseError / htmx:sendError and clears it on + htmx:afterSwap, so this banner appears within ~2s of a failure and disappears + within ~2s of recovery. x-cloak keeps the banner invisible until Alpine reads + the storage key (prevents a flash of stale red on hard reload after recovery). #} + +
diff --git a/src/phaze/templates/base.html b/src/phaze/templates/base.html index 7d531d9..f33ee30 100644 --- a/src/phaze/templates/base.html +++ b/src/phaze/templates/base.html @@ -167,6 +167,18 @@ class="text-sm font-semibold px-3 py-2 rounded-md transition-colors {% if current_page == 'audit' %}text-blue-600 dark:text-blue-400 bg-blue-50 dark:bg-blue-950{% else %}text-gray-600 dark:text-gray-400 hover:text-gray-900 dark:hover:text-gray-100 hover:bg-gray-50 dark:hover:bg-phaze-panel{% endif %}"> Audit Log + {# Phase 29 plan 07: operator-facing /admin/agents liveness page. + WARNING-1: short-slug nav-key convention β€” the live Audit Log + link above uses `current_page == 'audit'` (NOT `'audit_log'`), + so this new link uses `'admin_agents'` (NOT `'admin_agents_log'` + or `'admin_agents_page'`). The new aria-current="page" attribute + is a forward-looking accessibility upgrade; the other 9 nav + links above are intentionally NOT retrofitted in this PR. #} + + Agents +
diff --git a/src/phaze/utils/__init__.py b/src/phaze/utils/__init__.py new file mode 100644 index 0000000..9d9d080 --- /dev/null +++ b/src/phaze/utils/__init__.py @@ -0,0 +1,5 @@ +"""Pure-function utility helpers shared across phaze. + +Phase 29 establishes this package for the relative-time helper consumed by the +admin/agents UI. Future phases may add humanize-style helpers here. +""" diff --git a/src/phaze/utils/humanize.py b/src/phaze/utils/humanize.py new file mode 100644 index 0000000..b47ce81 --- /dev/null +++ b/src/phaze/utils/humanize.py @@ -0,0 +1,52 @@ +"""Relative-time formatter: '23s ago', '4m ago', '2h ago', '3d ago'. + +UI-SPEC Β§Relative-Time Helper LOCKS this signature. Pure Python, no deps. + +Output table (LOCKED): + + None β†’ "never" + delta < 0 β†’ "just now" + 0 <= d < 60 β†’ "{int(d)}s ago" + 60 <= d < 3600 β†’ "{int(d/60)}m ago" + 3600 <= d < 86400 β†’ "{int(d/3600)}h ago" + d >= 86400 β†’ "{int(d/86400)}d ago" + +Format invariants: +- No leading zero ("5s ago" not "05s ago"). +- No plural-s suffix ("1s ago" not "1 second ago"). +- Single-character unit suffix (s/m/h/d), space before "ago". +- ``int()`` truncates toward zero, NOT round. 89.7s β†’ "89s ago", NOT "1m ago" + (UI-SPEC line 248 LOCKED). +""" + +from __future__ import annotations + +from datetime import UTC, datetime + + +_SECONDS_PER_MINUTE = 60 +_SECONDS_PER_HOUR = 3600 +_SECONDS_PER_DAY = 86400 + + +def relative_time(dt: datetime | None, *, now: datetime | None = None) -> str: + """Return a glanceable 'N{s,m,h,d} ago' label for ``dt`` (or 'never' / 'just now'). + + The ``now`` kwarg is optional so unit tests pin a deterministic clock; in + production callers pass ``now=datetime.now(UTC)`` once per render. + + See module docstring for the full LOCKED output table. + """ + if dt is None: + return "never" + reference = now if now is not None else datetime.now(UTC) + delta_seconds = (reference - dt).total_seconds() + if delta_seconds < 0: + return "just now" + if delta_seconds < _SECONDS_PER_MINUTE: + return f"{int(delta_seconds)}s ago" + if delta_seconds < _SECONDS_PER_HOUR: + return f"{int(delta_seconds // _SECONDS_PER_MINUTE)}m ago" + if delta_seconds < _SECONDS_PER_DAY: + return f"{int(delta_seconds // _SECONDS_PER_HOUR)}h ago" + return f"{int(delta_seconds // _SECONDS_PER_DAY)}d ago" diff --git a/tests/test_cert_bootstrap.py b/tests/test_cert_bootstrap.py new file mode 100644 index 0000000..93e4a53 --- /dev/null +++ b/tests/test_cert_bootstrap.py @@ -0,0 +1,165 @@ +"""Tests for phaze.cert_bootstrap (Phase 29 D-02, D-22). + +Verifies the 7 LOCKED test cases per Plan 29-01 task 1: + 1. First-call generates 4 files; all parse via x509 / serialization. + 2. Second call is a no-op (mtimes unchanged). + 3. Banner-via-stdout (capsys): contains "GENERATED NEW PHAZE INTERNAL CA"; + contains neither "BEGIN" nor "PRIVATE KEY" (Pitfall 4). + 4. File modes: 0o644 on certs; 0o600 on keys. + 5. Leaf SubjectAlternativeName entries match the sans_csv input. + 6. _parse_san_entries dispatches DNSName vs IPAddress correctly. + 7. WARNING-8: banner emitted via logger.warning() (caplog) -- both + channels (print + logger) are mandatory per CONTEXT D-02 "Both". +""" + +from __future__ import annotations + +import ipaddress +import logging +from typing import TYPE_CHECKING + +from cryptography import x509 +from cryptography.hazmat.primitives import serialization + +from phaze.cert_bootstrap import _parse_san_entries, ensure_certs_present + + +if TYPE_CHECKING: + from pathlib import Path + + import pytest + + +_DEFAULT_SANS = "localhost,127.0.0.1,api" + + +def test_first_call_generates_four_parseable_files(tmp_path: Path) -> None: + """Test 1: first call writes 4 files; CA + leaf parse, both keys parse.""" + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + + ca_crt = tmp_path / "phaze-ca.crt" + ca_key = tmp_path / "phaze-ca.key" + server_crt = tmp_path / "phaze-server.crt" + server_key = tmp_path / "phaze-server.key" + + for path in (ca_crt, ca_key, server_crt, server_key): + assert path.exists(), f"missing file: {path.name}" + + # Both certs parse via cryptography.x509. + x509.load_pem_x509_certificate(ca_crt.read_bytes()) + x509.load_pem_x509_certificate(server_crt.read_bytes()) + # Both keys parse via serialization. + serialization.load_pem_private_key(ca_key.read_bytes(), password=None) + serialization.load_pem_private_key(server_key.read_bytes(), password=None) + + +def test_second_call_is_noop_mtimes_unchanged(tmp_path: Path) -> None: + """Test 2: second invocation on a populated dir does not change mtimes.""" + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + + files = [ + tmp_path / "phaze-ca.crt", + tmp_path / "phaze-ca.key", + tmp_path / "phaze-server.crt", + tmp_path / "phaze-server.key", + ] + mtimes_before = {p: p.stat().st_mtime_ns for p in files} + + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + + for path in files: + assert path.stat().st_mtime_ns == mtimes_before[path], f"mtime changed on idempotent call: {path.name}" + + +def test_banner_stdout_contains_message_and_no_secrets(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """Test 3: stdout banner contains the message; never leaks BEGIN or PRIVATE KEY (Pitfall 4).""" + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + captured = capsys.readouterr() + assert "GENERATED NEW PHAZE INTERNAL CA" in captured.out + assert "BEGIN" not in captured.out, f"banner leaked PEM marker on stdout: {captured.out!r}" + assert "PRIVATE KEY" not in captured.out, f"banner leaked private-key string on stdout: {captured.out!r}" + + +def test_file_modes_are_correct(tmp_path: Path) -> None: + """Test 4: certs are 0o644, keys are 0o600.""" + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + assert (tmp_path / "phaze-ca.crt").stat().st_mode & 0o777 == 0o644 + assert (tmp_path / "phaze-server.crt").stat().st_mode & 0o777 == 0o644 + assert (tmp_path / "phaze-ca.key").stat().st_mode & 0o777 == 0o600 + assert (tmp_path / "phaze-server.key").stat().st_mode & 0o777 == 0o600 + + +def test_leaf_san_entries_match_input(tmp_path: Path) -> None: + """Test 5: leaf cert's SubjectAlternativeName contains the supplied SANs.""" + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + leaf = x509.load_pem_x509_certificate((tmp_path / "phaze-server.crt").read_bytes()) + san_ext = leaf.extensions.get_extension_for_class(x509.SubjectAlternativeName) + san_value = san_ext.value + # Default has 3 entries: localhost (DNS), 127.0.0.1 (IP), api (DNS). + names: list[x509.GeneralName] = list(san_value) + assert len(names) == 3, f"expected 3 SAN entries, got {len(names)}: {names!r}" + + dns_names = [n.value for n in names if isinstance(n, x509.DNSName)] + ip_addrs = [str(n.value) for n in names if isinstance(n, x509.IPAddress)] + assert "localhost" in dns_names + assert "api" in dns_names + assert "127.0.0.1" in ip_addrs + + +def test_parse_san_entries_mixed_dns_and_ip() -> None: + """Test 6: _parse_san_entries dispatches DNSName for hostnames, IPAddress for IPs.""" + result = _parse_san_entries("localhost,127.0.0.1,api") + assert len(result) == 3, f"expected 3 entries, got {len(result)}" + assert isinstance(result[0], x509.DNSName) and result[0].value == "localhost" + assert isinstance(result[1], x509.IPAddress) and result[1].value == ipaddress.IPv4Address("127.0.0.1") + assert isinstance(result[2], x509.DNSName) and result[2].value == "api" + + +def test_unparseable_existing_certs_trigger_regeneration(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """Test 8: when all 4 files exist but the CA cert (or leaf) does not parse, + `ensure_certs_present` logs `existing certs unparseable; regenerating` and + rewrites all four. Closes the WARNING-8 regeneration branch (lines 202-203) + that the happy-path tests cannot reach.""" + # Pre-populate all 4 expected paths with garbage so `all(p.exists())` is + # True, but `x509.load_pem_x509_certificate` raises ValueError. + for name in ("phaze-ca.crt", "phaze-ca.key", "phaze-server.crt", "phaze-server.key"): + (tmp_path / name).write_text("NOT-A-CERT") + + with caplog.at_level(logging.WARNING, logger="phaze.cert_bootstrap"): + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + + # The regeneration warning fired. + assert any(r.levelname == "WARNING" and "existing certs unparseable" in r.getMessage() for r in caplog.records), ( + f"Expected the 'unparseable; regenerating' warning; got: {[r.getMessage() for r in caplog.records]}" + ) + + # And the four files now parse cleanly. + x509.load_pem_x509_certificate((tmp_path / "phaze-ca.crt").read_bytes()) + x509.load_pem_x509_certificate((tmp_path / "phaze-server.crt").read_bytes()) + serialization.load_pem_private_key((tmp_path / "phaze-ca.key").read_bytes(), password=None) + serialization.load_pem_private_key((tmp_path / "phaze-server.key").read_bytes(), password=None) + + +def test_banner_emitted_via_logger_warning(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """Test 7 (WARNING-8): banner MUST be emitted via logger.warning() per CONTEXT D-02 D-discretion 'Both'. + + Test 3 (capsys) covers the print() path; this test covers the + logger.warning() path independently -- a future refactor that drops + one path while keeping the other would slip past Test 3 alone. + """ + with caplog.at_level(logging.WARNING, logger="phaze.cert_bootstrap"): + ensure_certs_present(tmp_path, cn="localhost", sans_csv=_DEFAULT_SANS) + banner_records = [ + r + for r in caplog.records + if r.levelname == "WARNING" and r.name == "phaze.cert_bootstrap" and "GENERATED NEW PHAZE INTERNAL CA" in r.getMessage() + ] + assert banner_records, ( + f"Expected at least one WARNING-level log record from phaze.cert_bootstrap " + f"containing 'GENERATED NEW PHAZE INTERNAL CA'; got records: " + f"{[(r.levelname, r.name, r.getMessage()) for r in caplog.records]}" + ) + # Also assert the logger never leaks the private-key blob (parity with Test 3 for the print path): + for r in banner_records: + assert "BEGIN" not in r.getMessage(), f"banner record leaked PEM marker: {r.getMessage()}" + assert "PRIVATE KEY" not in r.getMessage(), f"banner record leaked private-key string: {r.getMessage()}" diff --git a/tests/test_config/__init__.py b/tests/test_config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_config/test_agent_settings_redis_password.py b/tests/test_config/test_agent_settings_redis_password.py new file mode 100644 index 0000000..b940ddb --- /dev/null +++ b/tests/test_config/test_agent_settings_redis_password.py @@ -0,0 +1,180 @@ +"""Unit tests for AgentSettings production-mode Redis-password enforcement (Phase 29 D-06). + +Covers 4 behaviors of `phaze.config.AgentSettings`: +1. `agent_env="production"` + passwordless `redis_url` β†’ `ValidationError` containing + the substring ``"requires a password in redis_url"``. +2. `agent_env="production"` + passworded `redis_url` (`redis://default:secret@host:6379`) + constructs successfully. +3. `agent_env="dev"` + passwordless `redis_url` constructs successfully (dev convenience + per RESEARCH Β§Pitfall 7). +4. Default `agent_env` is `"dev"` when not set (preserves existing dev workflow). + +Tests pass kwargs directly to `AgentSettings(...)` rather than using env-var indirection; +this is cleaner than the env-var monkeypatch pattern used elsewhere because the contract +under test is the model itself, not the env-var β†’ field mapping. + +No DB, no Redis required. +""" + +from __future__ import annotations + +from pydantic import SecretStr, ValidationError +import pytest + + +_VALID_API_URL = "https://api.test:8000" +_VALID_TOKEN = SecretStr("phaze_agent_test-token-abc123") +_VALID_ROOTS = ["/data/music"] +_PASSWORDLESS_URL = "redis://localhost:6379/0" +_PASSWORDED_URL = "redis://default:secret@localhost:6379/0" + + +def test_production_refuses_passwordless_redis_url() -> None: + """D-06: agent_env=production + passwordless redis_url raises ValidationError. + + The error message must contain ``"requires a password in redis_url"`` so the + operator sees an actionable hint pointing at Phase 29 D-06. + """ + from phaze.config import AgentSettings + + with pytest.raises(ValidationError) as exc_info: + AgentSettings( + agent_env="production", + redis_url=_PASSWORDLESS_URL, + agent_api_url=_VALID_API_URL, + agent_token=_VALID_TOKEN, + scan_roots=_VALID_ROOTS, + ) + assert "requires a password in redis_url" in str(exc_info.value), f"Expected D-06 password hint in error; got: {exc_info.value}" + + +def test_production_accepts_passworded_redis_url() -> None: + """D-06: agent_env=production + `redis://default:@host:6379` constructs OK.""" + from phaze.config import AgentSettings + + cfg = AgentSettings( + agent_env="production", + redis_url=_PASSWORDED_URL, + agent_api_url=_VALID_API_URL, + agent_token=_VALID_TOKEN, + scan_roots=_VALID_ROOTS, + ) + assert cfg.agent_env == "production" + assert cfg.redis_url == _PASSWORDED_URL + + +def test_dev_accepts_passwordless_redis_url() -> None: + """D-06: agent_env=dev (the default) permits passwordless redis_url. + + Pitfall 7: fresh dev clones must `docker compose up` without supplying a + Redis password; the `agent_env=dev` default lets this work. + """ + from phaze.config import AgentSettings + + cfg = AgentSettings( + agent_env="dev", + redis_url=_PASSWORDLESS_URL, + agent_api_url=_VALID_API_URL, + agent_token=_VALID_TOKEN, + scan_roots=_VALID_ROOTS, + ) + assert cfg.agent_env == "dev" + assert cfg.redis_url == _PASSWORDLESS_URL + + +def test_default_agent_env_is_dev() -> None: + """D-06: omitting `agent_env` defaults to `"dev"` so existing call sites are unaffected.""" + from phaze.config import AgentSettings + + cfg = AgentSettings( + redis_url=_PASSWORDLESS_URL, + agent_api_url=_VALID_API_URL, + agent_token=_VALID_TOKEN, + scan_roots=_VALID_ROOTS, + ) + assert cfg.agent_env == "dev", f"Default agent_env must be 'dev'; got {cfg.agent_env!r}" + + +# --------------------------------------------------------------------------- +# Phase 29 CR-01: production refuses http:// for agent_api_url +# --------------------------------------------------------------------------- + + +def test_production_refuses_http_agent_api_url() -> None: + """CR-01: agent_env=production + http:// agent_api_url raises ValidationError. + + The bearer token would otherwise transit in plaintext on the LAN, defeating + the entire TLS bootstrap landed in this phase. + """ + from phaze.config import AgentSettings + + with pytest.raises(ValidationError) as exc_info: + AgentSettings( + agent_env="production", + agent_api_url="http://app.test:8000", + agent_token=_VALID_TOKEN, + redis_url=_PASSWORDED_URL, + scan_roots=_VALID_ROOTS, + ) + assert "requires https://" in str(exc_info.value), f"Expected CR-01 https:// hint in error; got: {exc_info.value}" + + +def test_production_accepts_https_agent_api_url() -> None: + """CR-01: agent_env=production + https:// agent_api_url constructs OK.""" + from phaze.config import AgentSettings + + cfg = AgentSettings( + agent_env="production", + agent_api_url="https://app.test:8000", + agent_token=_VALID_TOKEN, + redis_url=_PASSWORDED_URL, + scan_roots=_VALID_ROOTS, + ) + assert cfg.agent_api_url == "https://app.test:8000" + + +def test_dev_accepts_http_agent_api_url() -> None: + """CR-01: agent_env=dev permits http:// agent_api_url for local dev convenience.""" + from phaze.config import AgentSettings + + cfg = AgentSettings( + agent_env="dev", + agent_api_url="http://localhost:8000", + agent_token=_VALID_TOKEN, + redis_url=_PASSWORDLESS_URL, + scan_roots=_VALID_ROOTS, + ) + assert cfg.agent_api_url == "http://localhost:8000" + + +# --------------------------------------------------------------------------- +# Phase 29 CR-02: PHAZE_REDIS_URL env var must bind to BaseSettings.redis_url +# --------------------------------------------------------------------------- + + +def test_phaze_redis_url_env_var_binds(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: # type: ignore[no-untyped-def] + """CR-02: setting PHAZE_REDIS_URL=... overrides the default redis_url. + + The original BaseSettings.redis_url had no validation_alias, so pydantic- + settings silently ignored PHAZE_REDIS_URL. A production agent following + `.env.example.agent` would fall back to the default and trigger the + passwordless-Redis validator at startup with a misleading error. + """ + from phaze.cert_bootstrap import ensure_certs_present + from phaze.config import AgentSettings + + ensure_certs_present(tmp_path, cn="localhost", sans_csv="localhost,127.0.0.1") + + monkeypatch.setenv("PHAZE_REDIS_URL", "redis://default:operator-supplied@redis.test:6379/0") + monkeypatch.setenv("PHAZE_AGENT_API_URL", "https://app.test:8000") + monkeypatch.setenv("PHAZE_AGENT_TOKEN", "phaze_agent_test-token-abc123") + monkeypatch.setenv("PHAZE_AGENT_SCAN_ROOTS", "/data/music") + monkeypatch.setenv("PHAZE_AGENT_ENV", "production") + monkeypatch.setenv("PHAZE_AGENT_CA_FILE", str(tmp_path / "phaze-ca.crt")) + + cfg = AgentSettings() + + assert cfg.redis_url == "redis://default:operator-supplied@redis.test:6379/0", ( + f"PHAZE_REDIS_URL must override the default redis_url; got: {cfg.redis_url!r}" + ) + assert cfg.agent_env == "production" diff --git a/tests/test_deployment/__init__.py b/tests/test_deployment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_deployment/test_agent_compose.py b/tests/test_deployment/test_agent_compose.py new file mode 100644 index 0000000..e12cb08 --- /dev/null +++ b/tests/test_deployment/test_agent_compose.py @@ -0,0 +1,164 @@ +"""Phase 29 D-15..D-17, D-22: docker-compose.agent.yml structural assertions. + +Pure YAML-parse tests for the file-server-host compose file (no docker daemon). + +Covers four invariants for ``docker-compose.agent.yml``: + +1. Top-level ``services`` is exactly ``{worker, watcher, audfprint, panako}``. +2. No agent service declares ``DATABASE_URL`` or a ``depends_on`` reference to + postgres (DIST-04 invariant β€” agents reach Postgres ONLY via the HTTP API). +3. ``worker`` service has ``PHAZE_ROLE=agent`` in its environment. +4. WARNING-3: Every ``SCAN_PATH`` volume mount across all 4 services uses the + fail-fast ``${VAR:?MESSAGE}`` operator (catches a future YAML drift to + ``${SCAN_PATH:-/data/music}`` loose-default form which would silently let + ``docker compose up`` succeed on a misconfigured host). + +A fifth test (WARNING-4) parses ``.github/workflows/docker-publish.yml`` and +asserts the ``docker/metadata-action`` step emits BOTH a ``:latest`` tag and a +``:v`` tag pattern. + +These tests deliberately use ``yaml.safe_load`` so the assertions are robust +against YAML reformatting. ``yaml.safe_load`` does NOT perform docker-compose +env-var interpolation, so the raw ``${VAR:?...}`` tokens are visible to the +tests β€” that is intentional, because the test asserts the source-file +invariant, not the post-interpolation runtime value. +""" + +from pathlib import Path +import re +from typing import Any + +import yaml + + +COMPOSE_PATH = Path(__file__).resolve().parents[2] / "docker-compose.agent.yml" +PUBLISH_WORKFLOW_PATH = Path(__file__).resolve().parents[2] / ".github" / "workflows" / "docker-publish.yml" + + +def _load_agent_compose() -> dict[str, Any]: + return yaml.safe_load(COMPOSE_PATH.read_text()) + + +def _env_to_strs(env: Any) -> list[str]: + """Normalize a compose ``environment`` to a list of ``"KEY=VALUE"`` strings. + + Compose accepts both list-of-string and dict forms. + """ + if isinstance(env, list): + return [str(e) for e in env] + if isinstance(env, dict): + return [f"{k}={v}" for k, v in env.items()] + return [] + + +def test_agent_compose_service_list() -> None: + """D-15: agent compose declares exactly worker, watcher, audfprint, panako.""" + data = _load_agent_compose() + assert set(data["services"].keys()) == {"worker", "watcher", "audfprint", "panako"}, ( + f"agent compose services must be exactly {{worker, watcher, audfprint, panako}}; got {sorted(data['services'].keys())!r}" + ) + + +def test_agent_compose_has_no_postgres_env() -> None: + """DIST-04: agents must never have DATABASE_URL or depends_on: postgres. + + Agents reach Postgres only via the application server's HTTP API. A + DATABASE_URL on any agent service would punch through the trust boundary. + """ + data = _load_agent_compose() + for svc_name, svc in data["services"].items(): + env_strs = _env_to_strs(svc.get("environment", [])) + for entry in env_strs: + assert "DATABASE_URL" not in entry, f"agent service {svc_name} has DATABASE_URL in environment: {entry!r}" + assert "POSTGRES_" not in entry, f"agent service {svc_name} has POSTGRES_* env var: {entry!r}" + depends = svc.get("depends_on", {}) + # depends_on accepts list (["postgres"]) and dict ({"postgres": {...}}) forms. + if isinstance(depends, (list, dict)): + assert "postgres" not in depends, f"agent service {svc_name} has depends_on: postgres" + + +def test_worker_service_has_phaze_role_agent() -> None: + """D-17: the worker service runs under PHAZE_ROLE=agent.""" + data = _load_agent_compose() + worker_env = _env_to_strs(data["services"]["worker"].get("environment", [])) + assert any("PHAZE_ROLE=agent" in e for e in worker_env), f"worker service must have PHAZE_ROLE=agent in environment; got {worker_env!r}" + + +def test_all_scan_path_mounts_use_failfast_syntax() -> None: + """WARNING-3: every SCAN_PATH volume mount uses the fail-fast ${VAR:?MESSAGE} form. + + Defends against a YAML drift that silently introduces a loose default like + ``${SCAN_PATH:-/data/music}`` which would let ``docker compose up`` succeed + on a misconfigured file-server host (Phase 29 WARNING-3). + """ + data = _load_agent_compose() + failfast_re = re.compile(r"\$\{SCAN_PATH:\?[^}]*\}") + offenders: list[str] = [] + for svc_name, svc in data["services"].items(): + for vol in svc.get("volumes", []) or []: + if not isinstance(vol, str): + continue + if "SCAN_PATH" in vol and not failfast_re.search(vol): + offenders.append(f"{svc_name}: {vol}") + assert not offenders, "Some SCAN_PATH mounts are not fail-fast (must use ${SCAN_PATH:?MESSAGE} form):\n" + "\n".join(offenders) + + +def _extract_api_metadata_action_step(workflow_data: dict[str, Any]) -> dict[str, Any] | None: + """Locate a docker/metadata-action step whose `images:` output points at the api image. + + docker-publish.yml uses a matrix over {api, audfprint, panako}; the same + metadata-action step runs for each matrix value with an interpolated + `images:` URL. The agent.yml's worker+watcher pull from the *api* image + URL (bare-repo, no sub-path), so this helper specifically looks for the + api-image step. If the workflow uses a single shared step (no matrix + differentiation in `images:`), any docker/metadata-action step is + returned. + """ + for job in (workflow_data.get("jobs") or {}).values(): + for step in job.get("steps", []) or []: + uses = (step.get("uses") or "").lower() + if "docker/metadata-action" in uses: + return step # type: ignore[no-any-return] + return None + + +def _metadata_action_tag_lines(step: dict[str, Any]) -> list[str]: + """Return the docker/metadata-action `with.tags:` block split on newlines.""" + tags_raw = (step.get("with") or {}).get("tags", "") + return [line.strip() for line in str(tags_raw).splitlines() if line.strip()] + + +def test_docker_publish_workflow_tags_both_latest_and_version() -> None: + """WARNING-4: .github/workflows/docker-publish.yml emits BOTH :latest AND :v tags. + + Replaces the original `checkpoint:human-verify` task (Phase 29 plan 04 + WARNING-4 resolution). An automated YAML-parse test guarantees the tag + strategy stays correct across metadata-action upgrades or maintainer + edits β€” a regression that drops the version tag pattern (e.g., during a + refactor) is caught in CI rather than after the next release ships. + + Tag patterns accepted: + - `:latest` ← `type=raw,value=latest` (with or without `enable=...`) + - `:v` ← `type=semver,pattern={{version}}` OR `type=ref,event=tag` + """ + assert PUBLISH_WORKFLOW_PATH.exists(), f"docker-publish.yml missing at {PUBLISH_WORKFLOW_PATH}" + workflow = yaml.safe_load(PUBLISH_WORKFLOW_PATH.read_text()) + step = _extract_api_metadata_action_step(workflow) + assert step is not None, ( + "Could not locate a docker/metadata-action step in docker-publish.yml. " + "Phase 29 D-16 requires the workflow to produce both :latest and :v tags." + ) + tags = _metadata_action_tag_lines(step) + assert tags, f"docker/metadata-action step has no `with.tags:` block; got step={step!r}" + + has_latest = any("value=latest" in t for t in tags) + has_version = any(("type=semver" in t) or ("type=ref,event=tag" in t) or ("type=ref" in t and "tag" in t) for t in tags) + missing: list[str] = [] + if not has_latest: + missing.append("'type=raw,value=latest' (or equivalent)") + if not has_version: + missing.append("'type=semver,pattern={{version}}' (or 'type=ref,event=tag')") + assert not missing, ( + f"docker-publish.yml tag patterns missing: {missing}\nFound tags: {tags}\n" + "Fix: add the missing pattern(s) under jobs..steps[uses=docker/metadata-action].with.tags." + ) diff --git a/tests/test_deployment/test_api_filesystem_isolation.py b/tests/test_deployment/test_api_filesystem_isolation.py new file mode 100644 index 0000000..c680e75 --- /dev/null +++ b/tests/test_deployment/test_api_filesystem_isolation.py @@ -0,0 +1,118 @@ +"""Phase 29 D-19: app-server compose declares NO music/model/output mounts on api or worker. + +Pure YAML-parse structural assertions. No Docker required; runs in ~50ms. + +Covers four invariants for the root ``docker-compose.yml``: + +1. ``api`` service has no banned filesystem mounts (DIST-01). +2. ``worker`` (controller) service has no banned filesystem mounts (DIST-01). +3. ``watcher``, ``agent-worker``, ``audfprint``, ``panako`` services are absent + from the root compose (D-15, D-17 β€” those live in ``docker-compose.agent.yml``). +4. ``redis`` service is hardened: ``--requirepass``, IP-prefixed port binding, + and an authenticated healthcheck with ``--no-auth-warning`` (D-05 / AUTH-03). + +These tests deliberately use ``yaml.safe_load`` rather than regex so the +assertions are robust against YAML reformatting. ``yaml.safe_load`` does NOT +perform docker-compose env-var interpolation, so the raw ``${VAR:-default}`` +tokens are visible to the tests β€” that is intentional, because the test asserts +the source-file invariant, not the post-interpolation runtime value. +""" + +from pathlib import Path +from typing import Any + +import yaml + + +COMPOSE_PATH = Path(__file__).resolve().parents[2] / "docker-compose.yml" +BANNED_MOUNT_TARGETS = ("/data/music", "/models", "/data/output") + + +def _volume_target(entry: Any) -> str: + """Return the container-side target path for a docker-compose volume entry. + + Compose accepts two volume forms: + + - Short string form: ``":[:ro|rw]"`` + - Long dict form: ``{"type": "bind", "source": ..., "target": ...}`` + + For named-volume short form (``"name:/path"``) the second segment is still + the container target, so the same ``split(":")[1]`` logic applies. + """ + if isinstance(entry, str): + return entry.split(":")[1] if ":" in entry else entry + if isinstance(entry, dict): + return str(entry.get("target", "")) + return "" + + +def _load_compose() -> dict[str, Any]: + return yaml.safe_load(COMPOSE_PATH.read_text()) + + +def test_api_service_has_no_file_mounts() -> None: + """DIST-01: the application server's api container reads no music/model/output paths.""" + data = _load_compose() + api_volumes = data["services"]["api"].get("volumes", []) or [] + for vol_entry in api_volumes: + target = _volume_target(vol_entry) + for banned in BANNED_MOUNT_TARGETS: + assert banned not in target, f"api service has banned mount: {vol_entry}" + + +def test_controller_worker_has_no_file_mounts() -> None: + """DIST-01: the controller worker is fileless β€” no music/model/output mounts.""" + data = _load_compose() + worker_volumes = data["services"]["worker"].get("volumes", []) or [] + for vol_entry in worker_volumes: + target = _volume_target(vol_entry) + for banned in BANNED_MOUNT_TARGETS: + assert banned not in target, f"worker has banned mount: {vol_entry}" + + +def test_no_watcher_or_agent_worker_in_root_compose() -> None: + """D-15 / D-17: agent + sidecar services live ONLY in docker-compose.agent.yml. + + The root compose is the application-server compose; it must not declare + watcher, agent-worker, audfprint, or panako. + """ + data = _load_compose() + services = data["services"] + assert "watcher" not in services, "watcher belongs in docker-compose.agent.yml (D-17)" + assert "agent-worker" not in services, "agent-worker belongs in docker-compose.agent.yml (D-17)" + assert "audfprint" not in services, "audfprint sidecar is file-server-local (D-15)" + assert "panako" not in services, "panako sidecar is file-server-local (D-15)" + + +def test_redis_hardened() -> None: + """D-05 / AUTH-03: redis service uses requirepass + LAN binding + authenticated healthcheck.""" + data = _load_compose() + redis = data["services"]["redis"] + + # --- command: requirepass + REDIS_PASSWORD interpolation token present --- + command = redis.get("command") + assert command is not None, "redis service must declare a command with --requirepass" + # yaml.safe_load preserves list-form command verbatim; join for substring checks. + command_str = " ".join(command) if isinstance(command, list) else str(command) + assert "requirepass" in command_str, f"redis command missing --requirepass: {command!r}" + assert "REDIS_PASSWORD" in command_str, f"redis command missing REDIS_PASSWORD interpolation token: {command!r}" + + # --- ports: IP-prefixed (not a bare 6379:6379 that defaults to 0.0.0.0) --- + ports = redis.get("ports", []) + assert ports, "redis service must declare a ports entry" + # Look for a ":6379:6379" form. Reject a bare "6379:6379" or + # ":6379:6379" (no host IP) since both would bind 0.0.0.0. + assert any(isinstance(p, str) and ":6379:6379" in p and not p.startswith(":") and p != "6379:6379" for p in ports), ( + f"redis ports must be IP-prefixed (e.g. ${{REDIS_BIND_IP:-127.0.0.1}}:6379:6379); got {ports!r}" + ) + + # --- healthcheck: redis-cli --no-auth-warning -a ping --- + healthcheck = redis.get("healthcheck", {}) + test_cmd = healthcheck.get("test", []) + assert isinstance(test_cmd, list), f"redis healthcheck.test must be a list; got {test_cmd!r}" + assert "redis-cli" in test_cmd, f"redis healthcheck missing redis-cli: {test_cmd!r}" + assert "--no-auth-warning" in test_cmd, f"redis healthcheck missing --no-auth-warning: {test_cmd!r}" + assert "-a" in test_cmd, f"redis healthcheck missing -a flag: {test_cmd!r}" + assert any("REDIS_PASSWORD" in entry for entry in test_cmd if isinstance(entry, str)), ( + f"redis healthcheck must reference ${{REDIS_PASSWORD}}: {test_cmd!r}" + ) diff --git a/tests/test_entrypoint.py b/tests/test_entrypoint.py new file mode 100644 index 0000000..e421704 --- /dev/null +++ b/tests/test_entrypoint.py @@ -0,0 +1,100 @@ +"""Tests for `phaze.entrypoint.main` (Phase 29 D-02, RESEARCH Pattern 2). + +Three LOCKED cases: +1. Defaults: with no env vars set, `main()` calls `ensure_certs_present` + with `Path("/certs"), cn="localhost", sans_csv="localhost,127.0.0.1,api"` + and then `execvp`s uvicorn with the expected argv (host 0.0.0.0, port 8000, + ssl-keyfile `/certs/phaze-server.key`, ssl-certfile `/certs/phaze-server.crt`). +2. Overrides: PHAZE_CERTS_DIR / PHAZE_API_HOST / PHAZE_API_TLS_SANS values + are threaded through `ensure_certs_present` and the `--ssl-*` flags. +3. Sequencing: `ensure_certs_present` MUST run BEFORE `os.execvp` + (RESEARCH Pattern 2 invariant β€” cert files must exist when uvicorn boots). + +`os.execvp` is monkeypatched to a recording stub so the test process is not +replaced. `ensure_certs_present` is monkeypatched to a recording stub so +the test does not depend on `cryptography` round-trip behavior β€” that is +already covered by `tests/test_cert_bootstrap.py`. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import phaze.entrypoint as entrypoint + + +if TYPE_CHECKING: + import pytest + + +def _install_recorder(monkeypatch: pytest.MonkeyPatch) -> dict[str, Any]: + """Stub `ensure_certs_present` and `os.execvp`; return the recorder dict.""" + calls: dict[str, Any] = {"ensure": None, "execvp": None, "order": []} + + def fake_ensure(certs_dir: Path, cn: str, sans_csv: str) -> None: + calls["ensure"] = (certs_dir, cn, sans_csv) + calls["order"].append("ensure") + + def fake_execvp(file: str, args: list[str]) -> None: + calls["execvp"] = (file, args) + calls["order"].append("execvp") + + monkeypatch.setattr(entrypoint, "ensure_certs_present", fake_ensure) + monkeypatch.setattr(entrypoint.os, "execvp", fake_execvp) + return calls + + +def test_main_defaults_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None: + """Test 1: with no PHAZE_* env vars, the docstring defaults are used end-to-end.""" + monkeypatch.delenv("PHAZE_CERTS_DIR", raising=False) + monkeypatch.delenv("PHAZE_API_HOST", raising=False) + monkeypatch.delenv("PHAZE_API_TLS_SANS", raising=False) + calls = _install_recorder(monkeypatch) + + entrypoint.main() + + assert calls["ensure"] == (Path("/certs"), "localhost", "localhost,127.0.0.1,api") + file_arg, argv = calls["execvp"] + assert file_arg == "uv" + assert argv[:4] == ["uv", "run", "uvicorn", "phaze.main:app"] + # Required flags + default-derived cert paths under /certs/. + assert "--host" in argv and "0.0.0.0" in argv # noqa: S104 # nosec B104 # asserting on entrypoint's container-bind flag + assert "--port" in argv and "8000" in argv + assert "--ssl-keyfile" in argv + assert argv[argv.index("--ssl-keyfile") + 1] == "/certs/phaze-server.key" + assert "--ssl-certfile" in argv + assert argv[argv.index("--ssl-certfile") + 1] == "/certs/phaze-server.crt" + + +def test_main_honors_env_overrides(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Test 2: PHAZE_CERTS_DIR / PHAZE_API_HOST / PHAZE_API_TLS_SANS flow through.""" + monkeypatch.setenv("PHAZE_CERTS_DIR", str(tmp_path)) + monkeypatch.setenv("PHAZE_API_HOST", "phaze.lan") + monkeypatch.setenv("PHAZE_API_TLS_SANS", "phaze.lan,10.0.0.5") + calls = _install_recorder(monkeypatch) + + entrypoint.main() + + # ensure_certs_present sees the operator-supplied values. + assert calls["ensure"] == (tmp_path, "phaze.lan", "phaze.lan,10.0.0.5") + # execvp's --ssl-* flags point at the operator-supplied certs dir. + _file, argv = calls["execvp"] + assert argv[argv.index("--ssl-keyfile") + 1] == str(tmp_path / "phaze-server.key") + assert argv[argv.index("--ssl-certfile") + 1] == str(tmp_path / "phaze-server.crt") + + +def test_main_runs_ensure_before_execvp(monkeypatch: pytest.MonkeyPatch) -> None: + """Test 3 (sequencing invariant): ensure_certs_present must run BEFORE execvp. + + If execvp ran first, uvicorn would boot against the still-empty /certs/ + bind mount and crash on missing --ssl-keyfile / --ssl-certfile paths. + """ + monkeypatch.delenv("PHAZE_CERTS_DIR", raising=False) + monkeypatch.delenv("PHAZE_API_HOST", raising=False) + monkeypatch.delenv("PHAZE_API_TLS_SANS", raising=False) + calls = _install_recorder(monkeypatch) + + entrypoint.main() + + assert calls["order"] == ["ensure", "execvp"] diff --git a/tests/test_phase01_gaps.py b/tests/test_phase01_gaps.py index e9a00a8..1304a53 100644 --- a/tests/test_phase01_gaps.py +++ b/tests/test_phase01_gaps.py @@ -21,7 +21,9 @@ def test_settings_database_url_default(monkeypatch: pytest.MonkeyPatch) -> None: def test_settings_redis_url_default(monkeypatch: pytest.MonkeyPatch) -> None: """Settings.redis_url defaults to the Docker Compose redis address.""" + monkeypatch.delenv("PHAZE_REDIS_URL", raising=False) monkeypatch.delenv("REDIS_URL", raising=False) + monkeypatch.delenv("redis_url", raising=False) s = Settings(_env_file=None) assert s.redis_url == "redis://redis:6379/0" diff --git a/tests/test_phase04_gaps.py b/tests/test_phase04_gaps.py index 39eaeb5..4c9aa5b 100644 --- a/tests/test_phase04_gaps.py +++ b/tests/test_phase04_gaps.py @@ -107,49 +107,93 @@ async def test_lifespan_disconnects_queue_on_shutdown() -> None: @pytest.mark.asyncio -async def test_agent_startup_raises_if_models_dir_missing(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Agent-worker startup fails fast if models directory does not exist.""" +async def test_agent_startup_invokes_ensure_models_present_after_whoami(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """Phase 29 D-21: agent_worker.startup delegates the models check to + ensure_models_present and invokes it AFTER /whoami succeeds (RESEARCH + line 906 -- auth fails fast before spending 5min on the 150MB + download). The old fail-fast RuntimeError("Models directory not found") + behaviour is REPLACED, not duplicated. + """ monkeypatch.setenv("PHAZE_ROLE", "agent") monkeypatch.setenv("PHAZE_AGENT_API_URL", "http://test") monkeypatch.setenv("PHAZE_AGENT_TOKEN", "phaze_agent_test-token-1234567890abcdef") - monkeypatch.setenv("PHAZE_AGENT_QUEUE", "phaze-agent-test") + monkeypatch.setenv("PHAZE_AGENT_QUEUE", "phaze-agent-test-id") monkeypatch.setenv("PHAZE_AGENT_SCAN_ROOTS", str(tmp_path)) monkeypatch.setenv("PHAZE_REDIS_URL", "redis://localhost:6379/0") from phaze.config import AgentSettings import phaze.tasks.agent_worker as aw - missing = tmp_path / "nonexistent" + models_dir = tmp_path / "models" fake_cfg = AgentSettings() - fake_cfg.models_path = str(missing) # type: ignore[misc] + fake_cfg.models_path = str(models_dir) # type: ignore[misc] monkeypatch.setattr(aw, "get_settings", lambda: fake_cfg) - with pytest.raises(RuntimeError, match="Models directory not found"): - await aw.startup({}) + # Order tracking: whoami must run BEFORE ensure_models_present (D-21 specifics). + call_order: list[str] = [] + fake_identity = MagicMock(agent_id="test-id") + fake_client = AsyncMock() + + async def fake_whoami() -> object: + call_order.append("whoami") + return fake_identity + + fake_client.whoami = fake_whoami + fake_client.close = AsyncMock() + monkeypatch.setattr(aw, "construct_agent_client", lambda _cfg: fake_client) + monkeypatch.setattr(aw, "create_process_pool", lambda: MagicMock()) + monkeypatch.setattr(aw, "AudfprintAdapter", lambda *_a, **_kw: MagicMock()) + monkeypatch.setattr(aw, "PanakoAdapter", lambda *_a, **_kw: MagicMock()) + monkeypatch.setattr(aw, "FingerprintOrchestrator", lambda **_kw: MagicMock(engines=[])) + + def fake_ensure(models_path: Path) -> None: + call_order.append("ensure_models_present") + assert models_path == models_dir, "ensure_models_present must receive cfg.models_path" + + monkeypatch.setattr(aw, "ensure_models_present", fake_ensure) + + await aw.startup({}) + + assert call_order == ["whoami", "ensure_models_present"], f"expected whoami then ensure_models_present, got: {call_order}" @pytest.mark.asyncio -async def test_agent_startup_raises_if_no_pb_files(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Agent-worker startup fails fast if models directory has no .pb files.""" +async def test_agent_startup_propagates_ensure_models_present_failure(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """A RuntimeError from ensure_models_present propagates out of startup so + the container exits non-zero and restart: unless-stopped retries + (T-29-05-02 / Phase 29 D-21 failure mode). + """ monkeypatch.setenv("PHAZE_ROLE", "agent") monkeypatch.setenv("PHAZE_AGENT_API_URL", "http://test") monkeypatch.setenv("PHAZE_AGENT_TOKEN", "phaze_agent_test-token-1234567890abcdef") - monkeypatch.setenv("PHAZE_AGENT_QUEUE", "phaze-agent-test") + monkeypatch.setenv("PHAZE_AGENT_QUEUE", "phaze-agent-test-id") monkeypatch.setenv("PHAZE_AGENT_SCAN_ROOTS", str(tmp_path)) monkeypatch.setenv("PHAZE_REDIS_URL", "redis://localhost:6379/0") from phaze.config import AgentSettings import phaze.tasks.agent_worker as aw - models_dir = tmp_path / "models" - models_dir.mkdir() - (models_dir / "readme.txt").write_text("empty") - fake_cfg = AgentSettings() - fake_cfg.models_path = str(models_dir) # type: ignore[misc] + fake_cfg.models_path = str(tmp_path / "models") # type: ignore[misc] monkeypatch.setattr(aw, "get_settings", lambda: fake_cfg) - with pytest.raises(RuntimeError, match=r"No \.pb model files"): + fake_identity = MagicMock(agent_id="test-id") + fake_client = AsyncMock() + fake_client.whoami = AsyncMock(return_value=fake_identity) + fake_client.close = AsyncMock() + monkeypatch.setattr(aw, "construct_agent_client", lambda _cfg: fake_client) + monkeypatch.setattr(aw, "create_process_pool", lambda: MagicMock()) + monkeypatch.setattr(aw, "AudfprintAdapter", lambda *_a, **_kw: MagicMock()) + monkeypatch.setattr(aw, "PanakoAdapter", lambda *_a, **_kw: MagicMock()) + monkeypatch.setattr(aw, "FingerprintOrchestrator", lambda **_kw: MagicMock(engines=[])) + + def boom(_models_path: Path) -> None: + msg = "Model download failed: simulated network failure" + raise RuntimeError(msg) + + monkeypatch.setattr(aw, "ensure_models_present", boom) + + with pytest.raises(RuntimeError, match="Model download failed"): await aw.startup({}) @@ -181,14 +225,22 @@ def test_docker_compose_worker_command_is_controller_settings() -> None: def test_docker_compose_has_agent_worker_consuming_agent_queue() -> None: - """docker-compose.yml has a service running 'saq phaze.tasks.agent_worker.settings' as PHAZE_ROLE=agent.""" + """A compose file declares a service running 'saq phaze.tasks.agent_worker.settings' as PHAZE_ROLE=agent. + + Phase 29 D-15/D-17 split the compose surface in two: + - docker-compose.yml β€” application-server-only services (api, worker=control, postgres, redis). + - docker-compose.agent.yml β€” file-server-only services (worker=agent, watcher, audfprint, panako). + The agent-worker now lives in docker-compose.agent.yml; this test scans + BOTH files so the Phase 27 UAT gap-13 invariant (an agent-side SAQ + consumer exists somewhere in the deployment surface) stays codified. + """ import yaml - compose_file = Path(__file__).parent.parent / "docker-compose.yml" - assert compose_file.exists(), "docker-compose.yml not found at project root" - - compose = yaml.safe_load(compose_file.read_text()) - services = compose.get("services", {}) + root_dir = Path(__file__).parent.parent + compose_files = [ + root_dir / "docker-compose.yml", + root_dir / "docker-compose.agent.yml", + ] def env_has(svc_env: object, key: str, value: str) -> bool: # Compose env may be a list ("KEY=VAL") or a dict. @@ -198,15 +250,23 @@ def env_has(svc_env: object, key: str, value: str) -> bool: return svc_env.get(key) == value return False - consumers = [ - name - for name, spec in services.items() - if "saq phaze.tasks.agent_worker.settings" in str(spec.get("command", "")) and env_has(spec.get("environment"), "PHAZE_ROLE", "agent") - ] + consumers: list[str] = [] + for compose_file in compose_files: + if not compose_file.exists(): + continue + compose = yaml.safe_load(compose_file.read_text()) + services = compose.get("services", {}) or {} + for name, spec in services.items(): + command = str(spec.get("command", "")) + if "saq phaze.tasks.agent_worker.settings" in command and env_has(spec.get("environment"), "PHAZE_ROLE", "agent"): + consumers.append(f"{compose_file.name}::{name}") assert consumers, ( - "docker-compose.yml must include at least one service that runs " + "No compose file declares a service running " "'uv run saq phaze.tasks.agent_worker.settings' with PHAZE_ROLE=agent. " - "Without it, scan_directory / extract_file_metadata jobs the API " - "enqueues onto 'phaze-agent-{agent_id}' have no consumer (Phase 27 UAT gap-13)." + "Phase 29 moved the agent-worker out of docker-compose.yml and into " + "docker-compose.agent.yml (D-15 / D-17). Without an agent-side SAQ " + "consumer somewhere in the deployment surface, scan_directory / " + "extract_file_metadata jobs the API enqueues onto " + "'phaze-agent-{agent_id}' have no consumer (Phase 27 UAT gap-13)." ) diff --git a/tests/test_routers/test_admin_agents.py b/tests/test_routers/test_admin_agents.py new file mode 100644 index 0000000..d6b79c8 --- /dev/null +++ b/tests/test_routers/test_admin_agents.py @@ -0,0 +1,257 @@ +"""Controller-side contract tests for Phase 29 plan 07: /admin/agents router. + +Covers: +- GET /admin/agents β€” full page render (extends base.html, contains nav + table). +- GET /admin/agents/_table β€” partial-only render (HTMX poll target). +- HX-Request: true on /admin/agents β€” returns the partial only. +- 5-state status-pill rendering (alive/stale/dead/revoked/never). +- Empty state (UI-SPEC Β§Empty State LOCKED copy). +- Sort order: alive β†’ stale β†’ dead β†’ never β†’ revoked (UI-SPEC LOCKED). +- BLOCKER-2 failure-tolerant footer (htmx event listener + localStorage red banner). + +Uses a self-contained smoke-app fixture (mirrors test_pipeline_scans.py:46-78) +that installs the admin_agents router on a bare FastAPI app and overrides +get_session to use the project-wide session fixture. +""" + +from __future__ import annotations + +from datetime import UTC, datetime, timedelta +from typing import TYPE_CHECKING + +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient +import pytest +import pytest_asyncio + +from phaze.database import get_session +from phaze.models.agent import Agent +from phaze.routers import admin_agents + + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from sqlalchemy.ext.asyncio import AsyncSession + + +def _make_smoke_app(session: AsyncSession) -> FastAPI: + """Build a smoke FastAPI app mounting only admin_agents.router.""" + app = FastAPI(title="admin-agents-smoke", version="test") + app.include_router(admin_agents.router) + app.dependency_overrides[get_session] = lambda: session + return app + + +@pytest_asyncio.fixture +async def smoke(session: AsyncSession) -> AsyncGenerator[AsyncClient]: + """Smoke client seeding one agent per status (5 rows).""" + now = datetime.now(UTC) + session.add_all( + [ + Agent(id="alive-agent", name="AliveBox", scan_roots=["/data/music"], last_seen_at=now), + Agent( + id="stale-agent", + name="StaleBox", + scan_roots=["/data/music"], + last_seen_at=now - timedelta(seconds=120), + ), + Agent( + id="dead-agent", + name="DeadBox", + scan_roots=["/data/music"], + last_seen_at=now - timedelta(seconds=600), + ), + Agent( + id="revoked-agent", + name="RevokedBox", + scan_roots=["/data/music"], + last_seen_at=now, + revoked_at=now, + ), + Agent(id="never-agent", name="NeverBox", scan_roots=["/data/music"]), + ] + ) + await session.commit() + + app = _make_smoke_app(session) + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as ac: + yield ac + + +@pytest_asyncio.fixture +async def empty_smoke(session: AsyncSession) -> AsyncGenerator[AsyncClient]: + """Smoke client with NO seeded agents beyond the conftest legacy row. + + The conftest legacy `legacy-application-server` agent is automatically + seeded by `async_engine`; we do NOT want it visible on the /admin/agents + page for the empty-state test, so this fixture deletes it. + """ + from sqlalchemy import delete + + await session.execute(delete(Agent)) + await session.commit() + + app = _make_smoke_app(session) + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as ac: + yield ac + + +# --------------------------------------------------------------------------- +# 6 core tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_page_renders_full_html(smoke: AsyncClient) -> None: + """GET /admin/agents returns the full page with base.html chrome.""" + response = await smoke.get("/admin/agents") + assert response.status_code == 200, response.text + body = response.text + # Full-page chrome from base.html. + assert " None: + """HX-Request: true on /admin/agents returns the partial, not the full page.""" + response = await smoke.get("/admin/agents", headers={"HX-Request": "true"}) + assert response.status_code == 200 + body = response.text + # Partial has no chrome. + assert " None: + """GET /admin/agents/_table returns the partial unconditionally (UI-SPEC LOCKED).""" + response = await smoke.get("/admin/agents/_table") + assert response.status_code == 200 + body = response.text + assert " None: + """5-state status pill rendering with LOCKED Tailwind classes per UI-SPEC.""" + response = await smoke.get("/admin/agents/_table") + body = response.text + # ALIVE β€” green-100/950 surface. + assert "ALIVE" in body + assert "bg-green-100 dark:bg-green-950" in body + assert 'aria-label="Status: alive"' in body + # STALE β€” amber-100/950 surface. + assert "STALE" in body + assert "bg-amber-100 dark:bg-amber-950" in body + assert 'aria-label="Status: stale"' in body + # DEAD β€” red-100/950 surface. + assert "DEAD" in body + assert "bg-red-100 dark:bg-red-950" in body + assert 'aria-label="Status: dead"' in body + # REVOKED β€” gray-100/800 surface (neutral). + assert "REVOKED" in body + # NEVER β€” same gray-100/800 surface (visually unified "no signal"). + assert "NEVER" in body + assert "bg-gray-100 dark:bg-gray-800" in body + + +@pytest.mark.asyncio +async def test_empty_state(empty_smoke: AsyncClient) -> None: + """Empty agents table renders the UI-SPEC Β§Empty State LOCKED copy.""" + response = await empty_smoke.get("/admin/agents/_table") + assert response.status_code == 200 + body = response.text + assert "No agents registered yet" in body + assert "just up-agent" in body + # The polling cadence is still emitted on the empty-state section. + assert 'hx-trigger="every 5s"' in body + + +@pytest.mark.asyncio +async def test_sort_order(smoke: AsyncClient) -> None: + """Sort order: alive β†’ stale β†’ dead β†’ never β†’ revoked (UI-SPEC LOCKED).""" + response = await smoke.get("/admin/agents/_table") + body = response.text + # Names appear in the LOCKED sort order. We rely on substring positions. + pos = { + "alive": body.find("AliveBox"), + "stale": body.find("StaleBox"), + "dead": body.find("DeadBox"), + "never": body.find("NeverBox"), + "revoked": body.find("RevokedBox"), + } + assert all(v > 0 for v in pos.values()), f"missing agent name in body: {pos}" + assert pos["alive"] < pos["stale"] < pos["dead"] < pos["never"] < pos["revoked"], f"sort order violated: {pos}" + + +# --------------------------------------------------------------------------- +# 3 BLOCKER-2 tests β€” UI-SPEC Β§Error / Failure-Tolerant Refresh LOCKED +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_page_includes_htmx_error_listener(smoke: AsyncClient) -> None: + """BLOCKER-2: UI-SPEC Β§Error / Failure-Tolerant Refresh LOCKED β€” the full + page must include the htmx:responseError + htmx:sendError listener that + writes localStorage `phaze:agents:lastError`.""" + response = await smoke.get("/admin/agents") + body = response.text + assert "htmx:responseError" in body, "Missing htmx:responseError listener (BLOCKER-2)" + assert "htmx:sendError" in body, "Missing htmx:sendError listener (BLOCKER-2)" + assert "htmx:afterSwap" in body, "Missing htmx:afterSwap recovery handler (BLOCKER-2)" + assert "phaze:agents:lastError" in body, "Missing localStorage key (BLOCKER-2)" + assert "localStorage.setItem" in body, "Listener must write to localStorage (BLOCKER-2)" + assert "localStorage.removeItem" in body, "Recovery handler must clear localStorage (BLOCKER-2)" + + +@pytest.mark.asyncio +async def test_partial_includes_failure_tolerant_footer(smoke: AsyncClient) -> None: + """BLOCKER-2: agents_table partial must render the red 'Refresh failed' + footer driven by localStorage `phaze:agents:lastError`.""" + response = await smoke.get("/admin/agents/_table") + body = response.text + assert "localStorage.getItem" in body, "Partial must read from localStorage (BLOCKER-2)" + assert "phaze:agents:lastError" in body, "Partial must reference the localStorage key (BLOCKER-2)" + assert "Refresh failed" in body, "Partial must include the red 'Refresh failed' copy (BLOCKER-2)" + + +@pytest.mark.asyncio +async def test_partial_failure_footer_uses_role_alert(smoke: AsyncClient) -> None: + """BLOCKER-2 + accessibility: red failure banner uses role=alert so + screen readers announce it when it becomes visible.""" + response = await smoke.get("/admin/agents/_table") + body = response.text + assert 'role="alert"' in body, "Failure banner must have role=alert (a11y + BLOCKER-2)" + + +# --------------------------------------------------------------------------- +# Production-wiring smoke test (router registered in main.create_app) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_router_registered_in_main_app() -> None: + """admin_agents.router is registered in main.create_app() (production wiring).""" + from phaze.main import create_app + + app = create_app() + paths = {route.path for route in app.routes if hasattr(route, "path")} # type: ignore[attr-defined] + # Both handlers must be reachable on the production app. + assert "/admin/agents" in paths + assert "/admin/agents/_table" in paths diff --git a/tests/test_scripts/__init__.py b/tests/test_scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_scripts/test_download_models.py b/tests/test_scripts/test_download_models.py new file mode 100644 index 0000000..0172516 --- /dev/null +++ b/tests/test_scripts/test_download_models.py @@ -0,0 +1,137 @@ +"""Tests for `phaze.scripts.download_models` (Phase 29 D-21). + +Covers the previously-untested branches of `_download_one` and `download_to`: +- `_download_one` skips when `dest` already exists (idempotent fast-path) +- `_download_one` streams to `.part` and atomically renames on success +- `_download_one` raises (and leaves the `.part` behind for the bootstrap + caller to reject β€” see `phaze.tasks._shared.model_bootstrap`) +- `download_to` walks both CLASSIFIER_MODELS and GENRE_MODELS, requesting + `.pb` + `.json` per model under the documented Essentia URL bases + +Uses `respx` (already a dev dep β€” see `pyproject.toml`) to intercept +`httpx.stream`. No real network I/O is performed. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import httpx +import pytest +import respx + +from phaze.scripts.download_models import ( + CLASSIFIER_MODELS, + GENRE_MODELS, + _download_one, + download_to, +) + + +if TYPE_CHECKING: + from pathlib import Path + + +_CLASSIFIER_BASE = "https://essentia.upf.edu/models/classifiers" +_GENRE_BASE = "https://essentia.upf.edu/models/music-style-classification/discogs-effnet" + + +@respx.mock +def test_download_one_skips_when_dest_exists(tmp_path: Path) -> None: + """Idempotent fast-path: an existing file returns immediately without HTTP I/O.""" + dest = tmp_path / "already-here.pb" + dest.write_bytes(b"pre-existing") + # No respx route registered β€” any HTTP call would 1) fail at network or + # 2) trip respx's strict "unhandled request" mode. Either way the test + # would fail if `_download_one` made a request. + + _download_one("https://example.invalid/should-not-be-fetched.pb", dest) + + assert dest.read_bytes() == b"pre-existing", "existing file must be left untouched" + + +@respx.mock +def test_download_one_streams_atomically(tmp_path: Path) -> None: + """Success path: byte stream is written to `.part`, then renamed.""" + url = "https://example.test/model.pb" + dest = tmp_path / "subdir" / "model.pb" # parent dir doesn't exist yet + payload = b"model-bytes" * 1024 # > 1 chunk worth + + respx.get(url).mock(return_value=httpx.Response(200, content=payload)) + + _download_one(url, dest) + + assert dest.exists(), "destination must exist after successful download" + assert dest.read_bytes() == payload + # The atomic `.part` rename means no temp file is left behind on success. + assert not (tmp_path / "subdir" / "model.pb.part").exists() + + +@respx.mock +def test_download_one_4xx_raises_and_no_dest_written(tmp_path: Path) -> None: + """Failure path: a 4xx response raises HTTPStatusError and `dest` is not created. + + The atomic `.part` rename means a failed download MUST leave `dest` absent; + `phaze.tasks._shared.model_bootstrap.ensure_models_present` relies on + `glob("*.pb")` skipping `.part` files to decide whether to retry. + """ + url = "https://example.test/missing.pb" + dest = tmp_path / "missing.pb" + respx.get(url).mock(return_value=httpx.Response(404)) + + with pytest.raises(httpx.HTTPStatusError): + _download_one(url, dest) + + assert not dest.exists(), "failed download must NOT leave dest in place" + + +@respx.mock +def test_download_to_fetches_classifier_and_genre_urls(tmp_path: Path) -> None: + """`download_to` walks both model families and requests .pb + .json per model. + + This is the contract that `phaze.tasks._shared.model_bootstrap` depends on + when it triggers a bulk download into an empty `/models` directory. + """ + # Mock every classifier .pb + .json with a 1-byte payload. + for model_path in CLASSIFIER_MODELS: + respx.get(f"{_CLASSIFIER_BASE}/{model_path}.pb").mock(return_value=httpx.Response(200, content=b"P")) + respx.get(f"{_CLASSIFIER_BASE}/{model_path}.json").mock(return_value=httpx.Response(200, content=b"J")) + # Same for genre. + for model in GENRE_MODELS: + respx.get(f"{_GENRE_BASE}/{model}.pb").mock(return_value=httpx.Response(200, content=b"P")) + respx.get(f"{_GENRE_BASE}/{model}.json").mock(return_value=httpx.Response(200, content=b"J")) + + download_to(tmp_path) + + # Expect 2 files per model across both families. CLASSIFIER_MODELS uses + # the trailing path segment as the filename (matches the prod helper's + # `rsplit("/", 1)[-1]` logic). + expected_classifier_basenames = {p.rsplit("/", 1)[-1] for p in CLASSIFIER_MODELS} + for basename in expected_classifier_basenames: + assert (tmp_path / f"{basename}.pb").exists(), f"missing {basename}.pb" + assert (tmp_path / f"{basename}.json").exists(), f"missing {basename}.json" + for model in GENRE_MODELS: + assert (tmp_path / f"{model}.pb").exists(), f"missing genre {model}.pb" + assert (tmp_path / f"{model}.json").exists(), f"missing genre {model}.json" + + +@respx.mock +def test_download_to_is_idempotent_on_already_populated_dir(tmp_path: Path) -> None: + """Re-running `download_to` against a full models dir is a no-op (no HTTP).""" + # Pre-seed every expected file with a sentinel byte so `_download_one` + # takes the existence-skip branch for all of them. + expected_classifier_basenames = {p.rsplit("/", 1)[-1] for p in CLASSIFIER_MODELS} + for basename in expected_classifier_basenames: + (tmp_path / f"{basename}.pb").write_bytes(b"X") + (tmp_path / f"{basename}.json").write_bytes(b"X") + for model in GENRE_MODELS: + (tmp_path / f"{model}.pb").write_bytes(b"X") + (tmp_path / f"{model}.json").write_bytes(b"X") + + # No respx routes registered β€” if `_download_one` reached the network for + # any file the call would raise (respx is in strict mode by default). + download_to(tmp_path) + + # Sentinels still in place: nothing was overwritten. + for basename in expected_classifier_basenames: + assert (tmp_path / f"{basename}.pb").read_bytes() == b"X" diff --git a/tests/test_services/test_agent_client_tls.py b/tests/test_services/test_agent_client_tls.py new file mode 100644 index 0000000..a6499af --- /dev/null +++ b/tests/test_services/test_agent_client_tls.py @@ -0,0 +1,168 @@ +"""Real-TLS integration tests for PhazeAgentClient verify= wiring (Phase 29 D-04). + +Stands up a real uvicorn server with a leaf cert generated by +``phaze.cert_bootstrap.ensure_certs_present`` and exercises the +``httpx.AsyncClient(verify=...)`` path against TWO independent CA bundles: + + 1. wrong-CA -> ``httpx.ConnectError`` (D-04 success criterion) + 2. correct-CA -> 200 OK response + 3. ``construct_agent_client(cfg)`` with a missing CA file + -> ``RuntimeError("CA file empty or unreadable: ...")`` + +These tests intentionally do NOT use respx -- respx mocks at the httpx +transport layer below TLS, so it cannot exercise cert verification. +The trade-off is a slower test (real uvicorn boot + asyncio task) but +it is the only way to assert the rejection path that AUTH-02 promises. + +The test is marked with ``pytest.mark.integration`` per +``[tool.pytest.ini_options] markers`` in pyproject.toml. +""" + +from __future__ import annotations + +import asyncio +import socket +from typing import TYPE_CHECKING + +from fastapi import FastAPI +import httpx +from pydantic import SecretStr +import pytest +import uvicorn + +from phaze.cert_bootstrap import ensure_certs_present +from phaze.config import AgentSettings +from phaze.tasks._shared.agent_bootstrap import construct_agent_client + + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + from pathlib import Path + + +pytestmark = pytest.mark.integration + + +def _free_port() -> int: + """Bind to port 0 to find an unused TCP port; release immediately.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return int(s.getsockname()[1]) + + +async def _wait_for_tcp(host: str, port: int, timeout: float = 5.0) -> None: + """Poll until a TCP connect to (host, port) succeeds or timeout elapses.""" + deadline = asyncio.get_running_loop().time() + timeout + while asyncio.get_running_loop().time() < deadline: + try: + reader, writer = await asyncio.open_connection(host, port) + writer.close() + await writer.wait_closed() + except (OSError, ConnectionRefusedError): + await asyncio.sleep(0.05) + else: + del reader + return + msg = f"server did not start listening on {host}:{port} within {timeout}s" + raise TimeoutError(msg) + + +@pytest.fixture +async def tls_server(tmp_path: Path) -> AsyncIterator[tuple[int, Path, Path]]: + """Spin up a uvicorn server with a real TLS cert in tmp_path/server_certs. + + Yields ``(port, server_certs_dir, wrong_certs_dir)``: + - ``port``: the bound port the server listens on + - ``server_certs_dir``: dir holding ``phaze-ca.crt`` matching the + server's leaf (the "correct" CA from the agent's POV) + - ``wrong_certs_dir``: dir holding a DIFFERENT CA -- the agent's + verify= path uses this to prove untrusted certs are rejected + """ + server_certs_dir = tmp_path / "server_certs" + wrong_certs_dir = tmp_path / "wrong_certs" + ensure_certs_present(server_certs_dir, cn="localhost", sans_csv="127.0.0.1,localhost") + ensure_certs_present(wrong_certs_dir, cn="localhost", sans_csv="127.0.0.1,localhost") + + app = FastAPI() + + @app.get("/test") + async def _test_endpoint() -> dict[str, str]: + return {"status": "ok"} + + port = _free_port() + config = uvicorn.Config( + app, + host="127.0.0.1", + port=port, + ssl_keyfile=str(server_certs_dir / "phaze-server.key"), + ssl_certfile=str(server_certs_dir / "phaze-server.crt"), + log_level="warning", + lifespan="off", + ) + server = uvicorn.Server(config) + server_task = asyncio.create_task(server.serve()) + + await _wait_for_tcp("127.0.0.1", port, timeout=5.0) + try: + yield port, server_certs_dir, wrong_certs_dir + finally: + server.should_exit = True + try: + await asyncio.wait_for(server_task, timeout=5.0) + except (TimeoutError, asyncio.CancelledError): + server_task.cancel() + + +@pytest.mark.asyncio +async def test_wrong_ca_raises_connect_error(tls_server: tuple[int, Path, Path]) -> None: + """D-04 success criterion: httpx.AsyncClient with the WRONG CA against a + server presenting the RIGHT cert raises httpx.ConnectError on first request. + """ + port, _server_certs, wrong_certs = tls_server + wrong_ca = str(wrong_certs / "phaze-ca.crt") + async with httpx.AsyncClient(base_url=f"https://127.0.0.1:{port}", verify=wrong_ca) as client: + with pytest.raises(httpx.ConnectError): + await client.get("/test") + + +@pytest.mark.asyncio +async def test_correct_ca_succeeds(tls_server: tuple[int, Path, Path]) -> None: + """The same client with the CORRECT CA bundle gets a 200 OK.""" + port, server_certs, _wrong_certs = tls_server + correct_ca = str(server_certs / "phaze-ca.crt") + async with httpx.AsyncClient(base_url=f"https://127.0.0.1:{port}", verify=correct_ca) as client: + response = await client.get("/test") + assert response.status_code == 200 + assert response.json() == {"status": "ok"} + + +def test_construct_agent_client_missing_ca_raises(tmp_path: Path) -> None: + """D-03 fail-fast: construct_agent_client with a non-existent CA file + raises RuntimeError("CA file empty or unreadable: ..."). + """ + missing_ca = tmp_path / "nonexistent.crt" + cfg = AgentSettings( + agent_api_url="https://127.0.0.1:9999", + agent_token=SecretStr("phaze_agent_test-token-1234567890abcdef"), + scan_roots=["/tmp"], # noqa: S108 # test-only path + agent_ca_file=str(missing_ca), + ) + with pytest.raises(RuntimeError, match="CA file empty or unreadable"): + construct_agent_client(cfg) + + +def test_construct_agent_client_empty_ca_raises(tmp_path: Path) -> None: + """D-03 fail-fast: construct_agent_client with an empty CA file (size=0) + also raises RuntimeError. Catches the case where the operator created + the file but didn't populate it. + """ + empty_ca = tmp_path / "empty.crt" + empty_ca.write_bytes(b"") + cfg = AgentSettings( + agent_api_url="https://127.0.0.1:9999", + agent_token=SecretStr("phaze_agent_test-token-1234567890abcdef"), + scan_roots=["/tmp"], # noqa: S108 # test-only path + agent_ca_file=str(empty_ca), + ) + with pytest.raises(RuntimeError, match="CA file empty or unreadable"): + construct_agent_client(cfg) diff --git a/tests/test_services/test_agent_liveness.py b/tests/test_services/test_agent_liveness.py new file mode 100644 index 0000000..6b29b01 --- /dev/null +++ b/tests/test_services/test_agent_liveness.py @@ -0,0 +1,169 @@ +"""Tests for phaze.services.agent_liveness β€” pure-function classifier + sort_key. + +Phase 29 D-12 LOCKED thresholds: + - alive: now - last_seen_at < 90s (AGENT_LIVENESS_ALIVE_SECONDS) + - stale: 90s <= delta < 300s (AGENT_LIVENESS_STALE_SECONDS) + - dead: delta >= 300s + - revoked: revoked_at IS NOT NULL (precedence over all last_seen_at math) + - never: revoked_at IS NULL AND last_seen_at IS NULL + +UI-SPEC Β§Status Pill Component sort order: + revoked agents last; within non-revoked: status_rank ascending + (aliveβ†’staleβ†’deadβ†’never); within same status: last_seen_at descending. +""" + +from __future__ import annotations + +from datetime import UTC, datetime, timedelta + +import pytest + +from phaze.models.agent import Agent +from phaze.services.agent_liveness import AgentStatus, classify, sort_key + + +NOW = datetime(2026, 5, 16, 12, 0, 0, tzinfo=UTC) + + +def _make_agent( + agent_id: str, + *, + last_seen_at: datetime | None = None, + revoked_at: datetime | None = None, +) -> Agent: + return Agent( + id=agent_id, + name=agent_id, + scan_roots=[], + last_seen_at=last_seen_at, + revoked_at=revoked_at, + ) + + +# --------------------------------------------------------------------------- +# classify(agent, now) β€” 5-state matrix +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("delta_seconds", "expected"), + [ + (0, "alive"), + (1, "alive"), + (60, "alive"), + (89, "alive"), + # 90s boundary: alive < 90, stale >= 90 + (90, "stale"), + (120, "stale"), + (200, "stale"), + (299, "stale"), + # 300s boundary: stale < 300, dead >= 300 + (300, "dead"), + (600, "dead"), + (86400, "dead"), + ], +) +def test_classify_thresholds(delta_seconds: int, expected: str) -> None: + """5-state thresholds at all boundary cases (D-12).""" + agent = _make_agent("test", last_seen_at=NOW - timedelta(seconds=delta_seconds)) + assert classify(agent, NOW) == expected + + +def test_classify_never_when_last_seen_at_is_none() -> None: + """Agent registered but never heartbeated β†’ 'never' (revoked_at also NULL).""" + agent = _make_agent("never-agent") + assert classify(agent, NOW) == "never" + + +def test_classify_revoked_takes_precedence_over_alive() -> None: + """Revoked agent with recent last_seen_at still classifies as 'revoked'.""" + agent = _make_agent( + "revoked-agent", + last_seen_at=NOW, + revoked_at=NOW - timedelta(seconds=10), + ) + assert classify(agent, NOW) == "revoked" + + +def test_classify_revoked_takes_precedence_over_never() -> None: + """Revoked + never-heartbeated agent classifies as 'revoked' (precedence).""" + agent = _make_agent( + "revoked-never", + last_seen_at=None, + revoked_at=NOW, + ) + assert classify(agent, NOW) == "revoked" + + +def test_classify_returns_literal_type() -> None: + """classify return is one of the 5 AgentStatus literal members.""" + agent = _make_agent("test", last_seen_at=NOW) + result: AgentStatus = classify(agent, NOW) + assert result in {"alive", "stale", "dead", "revoked", "never"} + + +# --------------------------------------------------------------------------- +# sort_key(agent, now) β€” ordering invariants +# --------------------------------------------------------------------------- + + +def test_sort_key_revoked_last() -> None: + """Revoked agents sort AFTER every non-revoked agent regardless of last_seen.""" + alive = _make_agent("alive", last_seen_at=NOW) + revoked = _make_agent("revoked", last_seen_at=NOW, revoked_at=NOW) + assert sort_key(alive, NOW) < sort_key(revoked, NOW) + + +def test_sort_key_status_rank_alive_before_stale_before_dead() -> None: + """Non-revoked agents sort by status: alive < stale < dead < never.""" + alive = _make_agent("alive", last_seen_at=NOW) + stale = _make_agent("stale", last_seen_at=NOW - timedelta(seconds=150)) + dead = _make_agent("dead", last_seen_at=NOW - timedelta(seconds=600)) + never = _make_agent("never") + assert sort_key(alive, NOW) < sort_key(stale, NOW) + assert sort_key(stale, NOW) < sort_key(dead, NOW) + assert sort_key(dead, NOW) < sort_key(never, NOW) + + +def test_sort_key_within_same_status_last_seen_descending() -> None: + """Within the same status bucket, more-recently-seen agents sort first.""" + recent = _make_agent("recent", last_seen_at=NOW - timedelta(seconds=10)) + older = _make_agent("older", last_seen_at=NOW - timedelta(seconds=60)) + # Both are alive (<90s); recent should come BEFORE older. + assert sort_key(recent, NOW) < sort_key(older, NOW) + + +def test_sort_key_full_sort_order() -> None: + """End-to-end: sort a mixed list and assert expected order.""" + alive_recent = _make_agent("alive-recent", last_seen_at=NOW) + alive_older = _make_agent("alive-older", last_seen_at=NOW - timedelta(seconds=30)) + stale = _make_agent("stale", last_seen_at=NOW - timedelta(seconds=120)) + dead = _make_agent("dead", last_seen_at=NOW - timedelta(seconds=600)) + never = _make_agent("never") + revoked_recent = _make_agent("revoked-recent", last_seen_at=NOW, revoked_at=NOW) + revoked_old = _make_agent( + "revoked-old", + last_seen_at=NOW - timedelta(seconds=600), + revoked_at=NOW, + ) + + unsorted = [revoked_old, dead, alive_older, stale, revoked_recent, alive_recent, never] + sorted_agents = sorted(unsorted, key=lambda a: sort_key(a, NOW)) + sorted_ids = [a.id for a in sorted_agents] + # alive_recent (alive, most recent) β†’ alive_older β†’ stale β†’ dead β†’ never β†’ revoked_recent β†’ revoked_old + assert sorted_ids == [ + "alive-recent", + "alive-older", + "stale", + "dead", + "never", + "revoked-recent", + "revoked-old", + ] + + +def test_sort_key_never_after_dead_within_non_revoked() -> None: + """'never' has same rank as 'revoked' (3) but lives in the non-revoked group.""" + dead = _make_agent("dead", last_seen_at=NOW - timedelta(seconds=600)) + never = _make_agent("never") + assert sort_key(dead, NOW) < sort_key(never, NOW) diff --git a/tests/test_services/test_model_bootstrap.py b/tests/test_services/test_model_bootstrap.py new file mode 100644 index 0000000..992d476 --- /dev/null +++ b/tests/test_services/test_model_bootstrap.py @@ -0,0 +1,183 @@ +"""Tests for ``phaze.tasks._shared.model_bootstrap.ensure_models_present`` (Phase 29 D-21). + +Three LOCKED cases per PATTERNS lines 1077-1090: +- empty-dir -> ``download_to`` is invoked, INFO log surfaces the download notice +- populated -> ``download_to`` is NOT invoked, INFO log surfaces the "Models present" line +- network-fail -> ``RuntimeError("Model download failed")`` wrapping the underlying exception +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING +from unittest.mock import MagicMock + +import httpx +import pytest + + +if TYPE_CHECKING: + from pathlib import Path + + +def test_ensure_models_present_empty_dir_downloads( + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """An empty models directory triggers ``download_to`` and logs the start banner.""" + import phaze.tasks._shared.model_bootstrap as mb + + def fake_download(target: Path) -> None: + # Simulate a real download by writing a sentinel .pb file. + (target / "test_model.pb").touch() + + mock = MagicMock(side_effect=fake_download) + monkeypatch.setattr(mb, "download_to", mock) + + with caplog.at_level(logging.INFO, logger="phaze.tasks._shared.model_bootstrap"): + mb.ensure_models_present(tmp_path) + + mock.assert_called_once_with(tmp_path) + text = "\n".join(rec.getMessage() for rec in caplog.records) + assert "downloading essentia weights" in text, f"expected start banner in logs, got: {text!r}" + + +def test_ensure_models_present_populated_no_op( + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A fully-populated models directory short-circuits before invoking ``download_to``. + + Phase 29 CR-03: "populated" now means count-equals-expected (34 .pb files), + not "any .pb file present". Write all expected files so the short-circuit + branch is exercised. + """ + import phaze.tasks._shared.model_bootstrap as mb + + expected = mb._EXPECTED_MODEL_COUNT + for idx in range(expected): + (tmp_path / f"model_{idx:03d}.pb").touch() + + mock = MagicMock() + monkeypatch.setattr(mb, "download_to", mock) + + with caplog.at_level(logging.INFO, logger="phaze.tasks._shared.model_bootstrap"): + mb.ensure_models_present(tmp_path) + + mock.assert_not_called() + text = "\n".join(rec.getMessage() for rec in caplog.records) + assert f"Models present ({expected} weight files" in text, f"expected 'Models present' log, got: {text!r}" + + +def test_ensure_models_present_partial_triggers_redownload( + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Phase 29 CR-03: a partial models directory (some .pb files but not all) re-runs download. + + Previously, the bootstrap short-circuited on *any* .pb file present, so an + interrupted first download (e.g., 1/34 files written) left every subsequent + start skipping the re-download and the agent silently broken at analysis + time. This test pins the new behavior: partial state triggers download_to + and logs a WARNING with the observed/expected counts. + """ + import phaze.tasks._shared.model_bootstrap as mb + + # 1 out of N: clearly partial. + (tmp_path / "first_model.pb").touch() + assert len(list(tmp_path.glob("*.pb"))) < mb._EXPECTED_MODEL_COUNT + + completed = MagicMock() + + def fake_download(target: Path) -> None: + completed(target) + + monkeypatch.setattr(mb, "download_to", fake_download) + + with caplog.at_level(logging.WARNING, logger="phaze.tasks._shared.model_bootstrap"): + mb.ensure_models_present(tmp_path) + + completed.assert_called_once_with(tmp_path) + text = "\n".join(rec.getMessage() for rec in caplog.records) + assert "Partial model state" in text, f"expected partial-state WARNING, got: {text!r}" + assert f"1/{mb._EXPECTED_MODEL_COUNT}" in text, f"expected observed/expected counts in WARNING, got: {text!r}" + + +def test_ensure_models_present_download_failure( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Network failure during download is wrapped in ``RuntimeError`` with the original cause chained.""" + import phaze.tasks._shared.model_bootstrap as mb + + underlying = httpx.HTTPError("network down") + + def boom(target: Path) -> None: + raise underlying + + monkeypatch.setattr(mb, "download_to", boom) + + with pytest.raises(RuntimeError, match="Model download failed") as excinfo: + mb.ensure_models_present(tmp_path) + assert excinfo.value.__cause__ is underlying + + +def test_download_models_classifier_count_matches_bash() -> None: + """CLASSIFIER_MODELS contains exactly the 33 paths declared in scripts/download-models.sh.""" + from phaze.scripts.download_models import CLASSIFIER_MODELS, GENRE_MODELS + + assert len(CLASSIFIER_MODELS) == 33 + assert len(GENRE_MODELS) == 1 + assert GENRE_MODELS == ("discogs-effnet-bs64-1",) + + +def test_download_one_is_idempotent_when_dest_exists( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """``_download_one`` short-circuits when ``dest`` already exists -- no network call.""" + from phaze.scripts import download_models + + dest = tmp_path / "already_here.pb" + dest.write_bytes(b"existing-bytes") + + # If httpx.stream is invoked, the test fails -- it must not be touched. + def boom(*_args: object, **_kwargs: object) -> object: + msg = "httpx.stream must not be called when dest exists" + raise AssertionError(msg) + + monkeypatch.setattr(download_models.httpx, "stream", boom) + + download_models._download_one("https://example.invalid/never-fetched.pb", dest) + + assert dest.read_bytes() == b"existing-bytes" + + +def test_download_to_creates_pb_and_json_pairs( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """``download_to`` produces a .pb + .json file pair for every classifier and genre model.""" + from phaze.scripts import download_models + + fetched: list[tuple[str, Path]] = [] + + def fake_download_one(url: str, dest: Path) -> None: + fetched.append((url, dest)) + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(b"\x00") + + monkeypatch.setattr(download_models, "_download_one", fake_download_one) + + download_models.download_to(tmp_path) + + # 33 classifier models x 2 files (.pb + .json) + 1 genre x 2 = 68 files. + expected_file_count = (len(download_models.CLASSIFIER_MODELS) + len(download_models.GENRE_MODELS)) * 2 + assert len(fetched) == expected_file_count + pb_files = sorted(p.name for _, p in fetched if p.suffix == ".pb") + json_files = sorted(p.name for _, p in fetched if p.suffix == ".json") + assert len(pb_files) == len(download_models.CLASSIFIER_MODELS) + len(download_models.GENRE_MODELS) + assert len(json_files) == len(download_models.CLASSIFIER_MODELS) + len(download_models.GENRE_MODELS) diff --git a/tests/test_task_split.py b/tests/test_task_split.py index d18a939..85a247a 100644 --- a/tests/test_task_split.py +++ b/tests/test_task_split.py @@ -158,6 +158,43 @@ def test_agent_watcher_does_not_import_phaze_database() -> None: assert result.returncode == 0, f"agent_watcher import contaminated sys.modules:\nstdout={result.stdout}\nstderr={result.stderr}" +def test_cert_bootstrap_stays_postgres_free() -> None: + """Phase 29 D-22 extension of D-25: phaze.cert_bootstrap stays Postgres-free. + + The cert bootstrap runs in the api container's pre-uvicorn entrypoint + (Phase 29 D-02 / RESEARCH Pattern 2). It must NOT import: + - phaze.database + - phaze.tasks.session + - sqlalchemy.ext.asyncio + + Verified by subprocess so a contaminated import in the test process + cannot poison downstream tests via sys.modules caching. + + No env vars are required: cert_bootstrap does not call get_settings(). + """ + script = textwrap.dedent(""" + import sys + import phaze.cert_bootstrap # noqa: F401 + + forbidden = ("phaze.database", "phaze.tasks.session", "sqlalchemy.ext.asyncio") + present = [m for m in forbidden if m in sys.modules] + if present: + for m in present: + mod = sys.modules[m] + sys.stderr.write(f"BANNED MODULE IMPORTED: {m} (file={getattr(mod, '__file__', '?')})\\n") + sys.exit(1) + sys.exit(0) + """) + result = subprocess.run( # noqa: S603 # trusted input: literal sys.executable + literal -c script + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=20, + check=False, + ) + assert result.returncode == 0, f"cert_bootstrap import contaminated sys.modules:\nstdout={result.stdout}\nstderr={result.stderr}" + + def test_shared_bootstrap_stays_postgres_free() -> None: """Phase 27 D-17 invariant: phaze.tasks._shared.agent_bootstrap is Postgres-free. @@ -197,3 +234,48 @@ def test_shared_bootstrap_stays_postgres_free() -> None: check=False, ) assert result.returncode == 0, f"shared bootstrap import contaminated sys.modules:\nstdout={result.stdout}\nstderr={result.stderr}" + + +def test_model_bootstrap_stays_postgres_free() -> None: + """Phase 29 D-21 invariant: phaze.tasks._shared.model_bootstrap is Postgres-free. + + Parallel to test_shared_bootstrap_stays_postgres_free (which covers + agent_bootstrap.py only). The model_bootstrap module imports: + - stdlib (logging, pathlib) + - phaze.scripts.download_models (which imports httpx only) + + None of those pull in phaze.database, phaze.tasks.session, or + sqlalchemy.ext.asyncio. This test fails CI if the model_bootstrap module + is later extended with a Postgres-touching import (e.g., to track + download progress in the DB). + + Phase 29 BLOCKER-1 resolution. + """ + script = textwrap.dedent(""" + import os + import sys + os.environ.setdefault("PHAZE_ROLE", "agent") + os.environ.setdefault("PHAZE_AGENT_API_URL", "http://localhost:8000") + os.environ.setdefault("PHAZE_AGENT_TOKEN", "phaze_agent_test-token-1234567890abcdef") + os.environ.setdefault("PHAZE_AGENT_QUEUE", "phaze-agent-test") + os.environ.setdefault("PHAZE_AGENT_SCAN_ROOTS", "/tmp") + os.environ.setdefault("PHAZE_REDIS_URL", "redis://localhost:6379/0") + import phaze.tasks._shared.model_bootstrap # noqa: F401 + + forbidden = ("phaze.database", "phaze.tasks.session", "sqlalchemy.ext.asyncio") + present = [m for m in forbidden if m in sys.modules] + if present: + for m in present: + mod = sys.modules[m] + sys.stderr.write(f"BANNED MODULE IMPORTED: {m} (file={getattr(mod, '__file__', '?')})\\n") + sys.exit(1) + sys.exit(0) + """) + result = subprocess.run( # noqa: S603 # trusted input: literal sys.executable + literal -c script + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=20, + check=False, + ) + assert result.returncode == 0, f"model_bootstrap import contaminated sys.modules:\nstdout={result.stdout}\nstderr={result.stderr}" diff --git a/tests/test_tasks/test_agent_startup_banner.py b/tests/test_tasks/test_agent_startup_banner.py index 9c58763..2ec2134 100644 --- a/tests/test_tasks/test_agent_startup_banner.py +++ b/tests/test_tasks/test_agent_startup_banner.py @@ -54,7 +54,11 @@ async def test_agent_worker_startup_logs_role_banner_with_token_preview( # Patch models-dir check so we don't need real .pb files mounted. monkeypatch.setattr(pathlib.Path, "is_dir", lambda _self: True) - monkeypatch.setattr(pathlib.Path, "glob", lambda _self, _pat: [pathlib.Path("/m/x.pb")]) + # Phase 29 CR-03: ensure_models_present now compares against an expected + # weight-file count (34), so the prior "glob returns one fake .pb" trick + # triggers a partial-state re-download. Patch the bootstrap directly -- + # these tests are about the banner / queue-mismatch logic, not models. + monkeypatch.setattr(aw, "ensure_models_present", lambda _models_dir: None) ctx: dict[str, Any] = {} with caplog.at_level(logging.INFO, logger="phaze.tasks.agent_worker"): @@ -110,7 +114,11 @@ async def test_agent_worker_startup_raises_on_queue_token_mismatch( monkeypatch.setattr(aw, "PanakoAdapter", lambda *_a, **_kw: MagicMock()) monkeypatch.setattr(aw, "FingerprintOrchestrator", lambda **_kw: MagicMock(engines=[])) monkeypatch.setattr(pathlib.Path, "is_dir", lambda _self: True) - monkeypatch.setattr(pathlib.Path, "glob", lambda _self, _pat: [pathlib.Path("/m/x.pb")]) + # Phase 29 CR-03: ensure_models_present now compares against an expected + # weight-file count (34), so the prior "glob returns one fake .pb" trick + # triggers a partial-state re-download. Patch the bootstrap directly -- + # these tests are about the banner / queue-mismatch logic, not models. + monkeypatch.setattr(aw, "ensure_models_present", lambda _models_dir: None) ctx: dict[str, Any] = {} with pytest.raises(RuntimeError, match="queue/token mismatch"): diff --git a/tests/test_tasks/test_heartbeat_cron.py b/tests/test_tasks/test_heartbeat_cron.py new file mode 100644 index 0000000..de6aaf5 --- /dev/null +++ b/tests/test_tasks/test_heartbeat_cron.py @@ -0,0 +1,140 @@ +"""Phase 29 D-07..D-10 β€” happy-path tests for the SAQ heartbeat cron handler. + +Covers: + +* Success path: heartbeat_tick reads ctx, builds HeartbeatRequest with + agent_version (from importlib.metadata), worker_pid (from os.getpid), and + queue_depth (from ctx["worker"].queue.info()["queued"]), then POSTs via + ctx["api_client"].heartbeat (D-08, D-10). +* Ctx-missing path: empty ctx logs WARNING and returns gracefully β€” no + exception escapes (defensive guard for restart races). +* Queue.info() failure: heartbeat still goes out with queue_depth=0 + (D-10 defensive default; cron must not crash on transient queue read). +* importlib.metadata source: payload's agent_version equals the real + installed `phaze` package version (0.1.0 at the time of writing). + +All tests use `unittest.mock.patch` for `os.getpid` to make `worker_pid` +deterministic, and rely on pytest-asyncio `asyncio_mode = "auto"` so plain +`async def test_*` functions are collected without an explicit decorator. +""" + +from __future__ import annotations + +from datetime import UTC, datetime +import importlib.metadata +from typing import TYPE_CHECKING, Any +from unittest.mock import AsyncMock, MagicMock, patch + +from phaze.schemas.agent_heartbeat import HeartbeatRequest +from phaze.schemas.agent_identity import AgentIdentity +from phaze.tasks.heartbeat import heartbeat_tick + + +if TYPE_CHECKING: + import pytest + + +def _make_ctx(*, queued: int = 5, raise_info: bool = False) -> dict[str, Any]: + """Build a ctx dict shaped like the one SAQ injects into a cron handler. + + Mirrors RESEARCH Pattern 5 + Pitfall 8: SAQ pre-populates ``ctx["worker"]`` + in ``Worker.__init__`` and the agent_worker.startup hook adds + ``api_client`` and ``agent_identity``. Queue access is via + ``ctx["worker"].queue`` -- NEVER ``ctx["queue"]``. + """ + client = AsyncMock() + identity = AgentIdentity( + agent_id="test-agent", + name="Test Agent", + scan_roots=["/data"], + created_at=datetime(2026, 1, 1, tzinfo=UTC), + ) + worker = MagicMock() + queue = AsyncMock() + if raise_info: + queue.info = AsyncMock(side_effect=RuntimeError("redis down")) + else: + queue.info = AsyncMock( + return_value={ + "queued": queued, + "active": 0, + "scheduled": 0, + "name": "phaze-agent-test-agent", + "workers": {}, + "jobs": [], + }, + ) + worker.queue = queue + return { + "api_client": client, + "agent_identity": identity, + "worker": worker, + "job": MagicMock(), + } + + +async def test_heartbeat_success(caplog: pytest.LogCaptureFixture) -> None: + """D-08, D-10: heartbeat_tick POSTs once with the expected payload shape.""" + ctx = _make_ctx(queued=7) + + with ( + patch("phaze.tasks.heartbeat.os.getpid", return_value=12345), + patch("phaze.tasks.heartbeat.importlib.metadata.version", return_value="0.1.0"), + ): + await heartbeat_tick(ctx) + + client = ctx["api_client"] + assert client.heartbeat.await_count == 1 + call_args = client.heartbeat.await_args + # Signature: heartbeat(payload) β€” positional or kwarg both acceptable. + payload = call_args.args[0] if call_args.args else call_args.kwargs["payload"] + assert isinstance(payload, HeartbeatRequest) + assert payload.agent_version == "0.1.0" + assert payload.worker_pid == 12345 + assert payload.queue_depth == 7 + + +async def test_heartbeat_skips_when_ctx_missing(caplog: pytest.LogCaptureFixture) -> None: + """Defensive guard: missing api_client / agent_identity β†’ WARNING + return. + + No exception escapes; lets SAQ keep the cron running while the startup hook + races to initialise ctx during worker restarts. + """ + ctx: dict[str, Any] = {"worker": MagicMock(), "job": MagicMock()} + + with caplog.at_level("WARNING", logger="phaze.tasks.heartbeat"): + await heartbeat_tick(ctx) + + assert "heartbeat_tick: ctx not initialized" in caplog.text + + +async def test_heartbeat_queue_info_failure_defaults_to_zero( + caplog: pytest.LogCaptureFixture, +) -> None: + """D-10 defensive default: queue.info() exception β†’ queue_depth=0, still POST.""" + ctx = _make_ctx(raise_info=True) + + with caplog.at_level("WARNING", logger="phaze.tasks.heartbeat"): + await heartbeat_tick(ctx) + + client = ctx["api_client"] + assert client.heartbeat.await_count == 1 + call_args = client.heartbeat.await_args + payload = call_args.args[0] if call_args.args else call_args.kwargs["payload"] + assert isinstance(payload, HeartbeatRequest) + assert payload.queue_depth == 0 + assert "queue.info() failed" in caplog.text + + +async def test_heartbeat_agent_version_from_importlib() -> None: + """agent_version sources from importlib.metadata.version('phaze') β€” not hardcoded.""" + ctx = _make_ctx() + + await heartbeat_tick(ctx) + + expected_version = importlib.metadata.version("phaze") + client = ctx["api_client"] + call_args = client.heartbeat.await_args + payload = call_args.args[0] if call_args.args else call_args.kwargs["payload"] + assert isinstance(payload, HeartbeatRequest) + assert payload.agent_version == expected_version diff --git a/tests/test_tasks/test_heartbeat_failure.py b/tests/test_tasks/test_heartbeat_failure.py new file mode 100644 index 0000000..a1600de --- /dev/null +++ b/tests/test_tasks/test_heartbeat_failure.py @@ -0,0 +1,72 @@ +"""Phase 29 D-09 β€” failure-mode test for the SAQ heartbeat cron handler. + +When ``client.heartbeat()`` raises any subclass of ``AgentApiError`` (auth, +4xx, or 5xx after retries), the cron handler must log a WARNING and return +without re-raising. SAQ retries the cron on the next tick; the application +server sees ``last_seen_at`` stop advancing and the admin UI surfaces +"stale" -> operator notices. Mirrors Phase 28 D-16 fire-and-forget posture. + +``AgentApiServerError`` has NO custom ``__init__`` (verified at +``src/phaze/services/agent_client.py:86-87``); it accepts ONLY positional +args. Constructing it with ``status_code=`` would ``TypeError`` at test +setup time. +""" + +from __future__ import annotations + +from datetime import UTC, datetime +from typing import TYPE_CHECKING, Any +from unittest.mock import AsyncMock, MagicMock + +from phaze.schemas.agent_identity import AgentIdentity +from phaze.services.agent_client import AgentApiServerError +from phaze.tasks.heartbeat import heartbeat_tick + + +if TYPE_CHECKING: + import pytest + + +def _make_ctx() -> dict[str, Any]: + """Build a minimally-populated ctx dict (matches test_heartbeat_cron._make_ctx).""" + client = AsyncMock() + identity = AgentIdentity( + agent_id="test-agent", + name="Test Agent", + scan_roots=["/data"], + created_at=datetime(2026, 1, 1, tzinfo=UTC), + ) + worker = MagicMock() + queue = AsyncMock() + queue.info = AsyncMock( + return_value={ + "queued": 3, + "active": 0, + "scheduled": 0, + "name": "phaze-agent-test-agent", + "workers": {}, + "jobs": [], + }, + ) + worker.queue = queue + return { + "api_client": client, + "agent_identity": identity, + "worker": worker, + "job": MagicMock(), + } + + +async def test_heartbeat_agentapierror_warning(caplog: pytest.LogCaptureFixture) -> None: + """D-09: AgentApiServerError -> WARNING + swallow; no exception escapes.""" + ctx = _make_ctx() + # AgentApiServerError has no custom __init__ -- POSITIONAL ARGS ONLY. + # Verified at src/phaze/services/agent_client.py:86-87. Do NOT pass kwargs. + ctx["api_client"].heartbeat = AsyncMock(side_effect=AgentApiServerError("server error")) + + with caplog.at_level("WARNING", logger="phaze.tasks.heartbeat"): + # Must not raise. + await heartbeat_tick(ctx) + + assert "heartbeat failed" in caplog.text + assert any(r.levelname == "WARNING" for r in caplog.records) diff --git a/tests/test_tasks/test_shared_agent_bootstrap.py b/tests/test_tasks/test_shared_agent_bootstrap.py index 12706a8..9fc2f85 100644 --- a/tests/test_tasks/test_shared_agent_bootstrap.py +++ b/tests/test_tasks/test_shared_agent_bootstrap.py @@ -14,6 +14,7 @@ from __future__ import annotations import logging +from typing import TYPE_CHECKING from unittest.mock import AsyncMock from pydantic import SecretStr @@ -24,17 +25,32 @@ from phaze.tasks._shared import agent_bootstrap as ab -def _build_agent_settings(monkeypatch: pytest.MonkeyPatch) -> AgentSettings: - """Build an AgentSettings instance bypassing env-var resolution.""" +if TYPE_CHECKING: + from pathlib import Path + + +def _build_agent_settings(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> AgentSettings: + """Build an AgentSettings instance bypassing env-var resolution. + + Phase 29 D-03: construct_agent_client validates ``agent_ca_file`` and + passes it through to ``httpx.AsyncClient(verify=...)``, which loads the + PEM at client-construction time. The fixture therefore generates a real + CA via the same cert_bootstrap module the production entrypoint uses. + """ + from phaze.cert_bootstrap import ensure_certs_present + + ensure_certs_present(tmp_path, cn="localhost", sans_csv="localhost,127.0.0.1") + ca_file = tmp_path / "phaze-ca.crt" monkeypatch.setenv("PHAZE_AGENT_API_URL", "http://app.test:8000") monkeypatch.setenv("PHAZE_AGENT_TOKEN", "phaze_agent_test-token-xyz") monkeypatch.setenv("PHAZE_AGENT_SCAN_ROOTS", "/data/music") + monkeypatch.setenv("PHAZE_AGENT_CA_FILE", str(ca_file)) return AgentSettings() -def test_construct_agent_client_uses_cfg_fields(monkeypatch: pytest.MonkeyPatch) -> None: +def test_construct_agent_client_uses_cfg_fields(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: """construct_agent_client(cfg) returns a PhazeAgentClient with base_url/token from cfg.""" - cfg = _build_agent_settings(monkeypatch) + cfg = _build_agent_settings(monkeypatch, tmp_path) client = ab.construct_agent_client(cfg) @@ -56,11 +72,12 @@ def test_construct_agent_client_uses_cfg_fields(monkeypatch: pytest.MonkeyPatch) def test_construct_agent_client_does_not_log_secret( caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, ) -> None: """T-27-04 mitigation: construct_agent_client must not emit any log line that includes the cleartext bearer token. Verified by sweeping caplog for the secret bytes.""" - cfg = _build_agent_settings(monkeypatch) + cfg = _build_agent_settings(monkeypatch, tmp_path) # Override the token with a synthetic secret bytes pattern we can grep for. cfg = cfg.model_copy(update={"agent_token": SecretStr("phaze_agent_BYTES-1234-ABCDEF")}) diff --git a/tests/test_utils/__init__.py b/tests/test_utils/__init__.py new file mode 100644 index 0000000..e239344 --- /dev/null +++ b/tests/test_utils/__init__.py @@ -0,0 +1 @@ +"""Tests for phaze.utils package.""" diff --git a/tests/test_utils/test_humanize.py b/tests/test_utils/test_humanize.py new file mode 100644 index 0000000..c2cc1f2 --- /dev/null +++ b/tests/test_utils/test_humanize.py @@ -0,0 +1,166 @@ +"""Tests for phaze.utils.humanize.relative_time per UI-SPEC Β§Relative-Time Helper LOCKED. + +Output table (UI-SPEC LOCKED): + None β†’ "never" + delta < 0 β†’ "just now" + 0 <= d < 60 β†’ "{int(d)}s ago" + 60 <= d < 3600 β†’ "{int(d/60)}m ago" + 3600 <= d < 86400 β†’ "{int(d/3600)}h ago" + d >= 86400 β†’ "{int(d/86400)}d ago" + +Truncation rule: int() truncates toward zero, NOT round. +UI-SPEC line 248 explicit: 89.7s β†’ "89s ago", NOT "1m ago". +""" + +from __future__ import annotations + +from datetime import UTC, datetime, timedelta + +import pytest + +from phaze.utils.humanize import relative_time + + +NOW = datetime(2026, 5, 16, 12, 0, 0, tzinfo=UTC) + + +# --------------------------------------------------------------------------- +# Special cases: None, negative delta (future) +# --------------------------------------------------------------------------- + + +def test_relative_time_none_returns_never() -> None: + """None dt β†’ 'never' (matches the 'never' pill state).""" + assert relative_time(None, now=NOW) == "never" + + +def test_relative_time_negative_delta_returns_just_now() -> None: + """Future timestamp (clock skew) β†’ 'just now'.""" + future = NOW + timedelta(seconds=10) + assert relative_time(future, now=NOW) == "just now" + + +def test_relative_time_zero_delta_returns_zero_seconds() -> None: + """delta == 0 β†’ '0s ago' (NOT 'just now' which is only for future).""" + assert relative_time(NOW, now=NOW) == "0s ago" + + +# --------------------------------------------------------------------------- +# Boundary table β€” locked output rules +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("delta_seconds", "expected"), + [ + # seconds bucket: 0 <= d < 60 + (0, "0s ago"), + (1, "1s ago"), + (5, "5s ago"), + (23, "23s ago"), + (59, "59s ago"), + # 60s boundary β€” first 'm' result + (60, "1m ago"), + (61, "1m ago"), + (119, "1m ago"), + (120, "2m ago"), + # 3599s boundary β€” last 'm' result + (3599, "59m ago"), + # 3600s boundary β€” first 'h' result + (3600, "1h ago"), + (3601, "1h ago"), + (7200, "2h ago"), + # 86399s boundary β€” last 'h' result + (86399, "23h ago"), + # 86400s boundary β€” first 'd' result + (86400, "1d ago"), + (86401, "1d ago"), + (172800, "2d ago"), + (259200, "3d ago"), + ], +) +def test_relative_time_boundaries(delta_seconds: int, expected: str) -> None: + """All output bucket boundaries (UI-SPEC LOCKED).""" + dt = NOW - timedelta(seconds=delta_seconds) + assert relative_time(dt, now=NOW) == expected + + +# --------------------------------------------------------------------------- +# Truncation rule: int() truncates toward zero, NOT round +# --------------------------------------------------------------------------- + + +def test_relative_time_truncates_not_rounds_within_seconds_bucket() -> None: + """UI-SPEC truncation rule: int() truncates toward zero, NOT round. + + UI-SPEC LOCKED line 248 spells the rule with a "89.7s β†’ '89s ago'" example, + but 89.7s lies in the [60, 3600) minutes bucket per the LOCKED output + table on lines 232-241 (the table is the authoritative contract). The + truncation rule itself is verified here with a value INSIDE the seconds + bucket so both LOCKED invariants hold: ``int()`` truncates (59.7 β†’ 59, + not round to 60 which would cross the bucket boundary), AND the + [0, 60) seconds bucket is respected. This is the Rule-1 reconciliation + of the UI-SPEC documentation defect (the 89.7 prose example is + internally inconsistent with its own bucket table β€” see plan 29-07 + SUMMARY deviation log). + """ + dt = NOW - timedelta(seconds=59.7) + assert relative_time(dt, now=NOW) == "59s ago" + + +def test_relative_time_truncates_fractional_minutes() -> None: + """61.9s β†’ '1m ago' (truncated; 61.9/60 = 1.03 β†’ 1).""" + dt = NOW - timedelta(seconds=61.9) + assert relative_time(dt, now=NOW) == "1m ago" + + +def test_relative_time_truncates_fractional_hours() -> None: + """3600 + 1800 = 5400s = 1.5h β†’ '1h ago' (truncated, not rounded to 2h).""" + dt = NOW - timedelta(seconds=5400) + assert relative_time(dt, now=NOW) == "1h ago" + + +def test_relative_time_truncates_fractional_days() -> None: + """1.5d = 129600s β†’ '1d ago' (truncated, not rounded to 2d).""" + dt = NOW - timedelta(seconds=129600) + assert relative_time(dt, now=NOW) == "1d ago" + + +# --------------------------------------------------------------------------- +# Default now=None branch (uses datetime.now(UTC)) +# --------------------------------------------------------------------------- + + +def test_relative_time_default_now_returns_just_now_for_recent_dt() -> None: + """When `now=None`, uses datetime.now(UTC). A very recent dt is < 60s away.""" + # Construct a dt that is "just before" the current wall clock so the test + # works regardless of when it runs. Use 1 second ago. + from datetime import datetime as _datetime_for_now + + dt = _datetime_for_now.now(UTC) - timedelta(seconds=1) + out = relative_time(dt) + # Allow either '1s ago' or '0s ago' / '2s ago' depending on wall-clock slack. + assert out.endswith("s ago") + + +# --------------------------------------------------------------------------- +# Format invariants +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("delta", [1, 60, 3600, 86400]) +def test_relative_time_no_plural_s_suffix(delta: int) -> None: + """No 'seconds'/'minutes'/'hours'/'days' word β€” compact unit char only.""" + dt = NOW - timedelta(seconds=delta) + out = relative_time(dt, now=NOW) + for word in ("seconds", "minutes", "hours", "days", "second", "minute", "hour", "day"): + assert word not in out, f"output {out!r} should not contain {word!r}" + + +def test_relative_time_unit_char_is_single_letter() -> None: + """Every non-special output ends with 's ago', 'm ago', 'h ago', or 'd ago'.""" + for delta in (5, 60, 3600, 86400): + out = relative_time(NOW - timedelta(seconds=delta), now=NOW) + assert out.endswith(" ago") + unit = out.split(" ")[0][-1] + assert unit in {"s", "m", "h", "d"} diff --git a/uv.lock b/uv.lock index 0c87129..fee5989 100644 --- a/uv.lock +++ b/uv.lock @@ -203,6 +203,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" }, ] +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "(implementation_name != 'PyPy' and platform_machine == 'arm64' and sys_platform == 'darwin') or (implementation_name != 'PyPy' and platform_machine == 'x86_64' and sys_platform == 'darwin') or (implementation_name != 'PyPy' and platform_machine == 'aarch64' and sys_platform == 'linux') or (implementation_name != 'PyPy' and platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, + { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, + { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, + { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, +] + [[package]] name = "cfgv" version = "3.5.0" @@ -268,6 +285,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/39/783980e78cb92c2d7bdb1fc7dbc86e94ccc6d58224d76a7f1f51b6c51e30/croniter-6.2.2-py3-none-any.whl", hash = "sha256:a5d17b1060974d36251ea4faf388233eca8acf0d09cbd92d35f4c4ac8f279960", size = 45422, upload-time = "2026-03-15T08:43:46.626Z" }, ] +[[package]] +name = "cryptography" +version = "48.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "(platform_machine == 'arm64' and platform_python_implementation != 'PyPy' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and platform_python_implementation != 'PyPy' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and platform_python_implementation != 'PyPy' and sys_platform == 'linux') or (platform_machine == 'x86_64' and platform_python_implementation != 'PyPy' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/a9/db8f313fdcd85d767d4973515e1db101f9c71f95fced83233de224673757/cryptography-48.0.0.tar.gz", hash = "sha256:5c3932f4436d1cccb036cb0eaef46e6e2db91035166f1ad6505c3c9d5a635920", size = 832984, upload-time = "2026-05-04T22:59:38.133Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/3d/01f6dd9190170a5a241e0e98c2d04be3664a9e6f5b9b872cde63aff1c3dd/cryptography-48.0.0-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:0c558d2cdffd8f4bbb30fc7134c74d2ca9a476f830bb053074498fbc86f41ed6", size = 8001587, upload-time = "2026-05-04T22:57:36.803Z" }, + { url = "https://files.pythonhosted.org/packages/b2/6e/e90527eef33f309beb811cf7c982c3aeffcce8e3edb178baa4ca3ae4a6fa/cryptography-48.0.0-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f5333311663ea94f75dd408665686aaf426563556bb5283554a3539177e03b8c", size = 4690433, upload-time = "2026-05-04T22:57:40.373Z" }, + { url = "https://files.pythonhosted.org/packages/90/04/673510ed51ddff56575f306cf1617d80411ee76831ccd3097599140efdfe/cryptography-48.0.0-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7995ef305d7165c3f11ae07f2517e5a4f1d5c18da1376a0a9ed496336b69e5f3", size = 4710620, upload-time = "2026-05-04T22:57:42.935Z" }, + { url = "https://files.pythonhosted.org/packages/14/d5/e9c4ef932c8d800490c34d8bd589d64a31d5890e27ec9e9ad532be893294/cryptography-48.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:40ba1f85eaa6959837b1d51c9767e230e14612eea4ef110ee8854ada22da1bf5", size = 4696283, upload-time = "2026-05-04T22:57:45.294Z" }, + { url = "https://files.pythonhosted.org/packages/95/38/0d29a6fd7d0d1373f0c0c88a04ba20e359b257753ac497564cd660fc1d55/cryptography-48.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a0e692c683f4df67815a2d258b324e66f4738bd7a96a218c826dce4f4bd05d8f", size = 4743677, upload-time = "2026-05-04T22:57:50.067Z" }, + { url = "https://files.pythonhosted.org/packages/84/9e/500463e87abb7a0a0f9f256ec21123ecde0a7b5541a15e840ea54551fd81/cryptography-48.0.0-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:7e8eac43dfca5c4cccc6dad9a80504436fca53bb9bc3100a2386d730fbe6b602", size = 4695941, upload-time = "2026-05-04T22:57:54.603Z" }, + { url = "https://files.pythonhosted.org/packages/d0/c0/7101d3b7215edcdc90c45da544961fd8ed2d6448f77577460fa75a8443f7/cryptography-48.0.0-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:bd72e68b06bb1e96913f97dd4901119bc17f39d4586a5adf2d3e47bc2b9d58b5", size = 4743326, upload-time = "2026-05-04T22:57:59.535Z" }, + { url = "https://files.pythonhosted.org/packages/ac/d8/5b833bad13016f562ab9d063d68199a4bd121d18458e439515601d3357ec/cryptography-48.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:59baa2cb386c4f0b9905bd6eb4c2a79a69a128408fd31d32ca4d7102d4156321", size = 4826672, upload-time = "2026-05-04T22:58:01.996Z" }, + { url = "https://files.pythonhosted.org/packages/98/e1/7074eb8bf3c135558c73fc2bcf0f5633f912e6fb87e868a55c454080ef09/cryptography-48.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9249e3cd978541d665967ac2cb2787fd6a62bddf1e75b3e347a594d7dacf4f74", size = 4972574, upload-time = "2026-05-04T22:58:03.968Z" }, + { url = "https://files.pythonhosted.org/packages/f2/63/61d4a4e1c6b6bab6ce1e213cd36a24c415d90e76d78c5eb8577c5541d2e8/cryptography-48.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:58d00498e8933e4a194f3076aee1b4a97dfec1a6da444535755822fe5d8b0b86", size = 7983482, upload-time = "2026-05-04T22:58:43.769Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ac/f5b5995b87770c693e2596559ffafe195b4033a57f14a82268a2842953f3/cryptography-48.0.0-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:614d0949f4790582d2cc25553abd09dd723025f0c0e7c67376a1d77196743d6e", size = 4683266, upload-time = "2026-05-04T22:58:46.064Z" }, + { url = "https://files.pythonhosted.org/packages/ec/c6/8b14f67e18338fbc4adb76f66c001f5c3610b3e2d1837f268f47a347dbbb/cryptography-48.0.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7ce4bfae76319a532a2dc68f82cc32f5676ee792a983187dac07183690e5c66f", size = 4696228, upload-time = "2026-05-04T22:58:48.22Z" }, + { url = "https://files.pythonhosted.org/packages/ea/73/f808fbae9514bd91b47875b003f13e284c8c6bdfd904b7944e803937eec1/cryptography-48.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2eb992bbd4661238c5a397594c83f5b4dc2bc5b848c365c8f991b6780efcc5c7", size = 4689097, upload-time = "2026-05-04T22:58:50.9Z" }, + { url = "https://files.pythonhosted.org/packages/02/e1/50edc7a50334807cc4791fc4a0ce7468b4a1416d9138eab358bfc9a3d70b/cryptography-48.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2b4d59804e8408e2fea7d1fbaf218e5ec984325221db76e6a241a9abd6cdd95c", size = 4730479, upload-time = "2026-05-04T22:58:55.611Z" }, + { url = "https://files.pythonhosted.org/packages/90/ee/89aa26a06ef0a7d7611788ffd571a7c50e368cc6a4d5eef8b4884e866edb/cryptography-48.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:5a5ed8fde7a1d09376ca0b40e68cd59c69fe23b1f9768bd5824f54681626032a", size = 4688713, upload-time = "2026-05-04T22:59:00.077Z" }, + { url = "https://files.pythonhosted.org/packages/c9/70/ca4003b1ce5ca3dc3186ada51908c8a9b9ff7d5cab83cc0d43ee14ec144f/cryptography-48.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9071196d81abc88b3516ac8cdfad32e2b66dd4a5393a8e68a961e9161ddc6239", size = 4729947, upload-time = "2026-05-04T22:59:05.255Z" }, + { url = "https://files.pythonhosted.org/packages/44/a0/4ec7cf774207905aef1a8d11c3750d5a1db805eb380ee4e16df317870128/cryptography-48.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1e2d54c8be6152856a36f0882ab231e70f8ec7f14e93cf87db8a2ed056bf160c", size = 4822059, upload-time = "2026-05-04T22:59:07.802Z" }, + { url = "https://files.pythonhosted.org/packages/1e/75/a2e55f99c16fcac7b5d6c1eb19ad8e00799854d6be5ca845f9259eae1681/cryptography-48.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a5da777e32ffed6f85a7b2b3f7c5cbc88c146bfcd0a1d7baf5fcc6c52ee35dd4", size = 4960575, upload-time = "2026-05-04T22:59:09.851Z" }, +] + [[package]] name = "cyclonedx-python-lib" version = "11.7.0" @@ -862,6 +908,7 @@ dependencies = [ { name = "alembic", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "asyncpg", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "beautifulsoup4", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "cryptography", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "essentia-tensorflow", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "fastapi", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "httpx", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -900,6 +947,7 @@ requires-dist = [ { name = "alembic", specifier = ">=1.18.4" }, { name = "asyncpg", specifier = ">=0.31.0" }, { name = "beautifulsoup4", specifier = ">=4.14.3" }, + { name = "cryptography", specifier = ">=46.0.0,<49" }, { name = "essentia-tensorflow", marker = "platform_machine == 'x86_64' or sys_platform != 'linux'", specifier = ">=2.1b6.dev1389" }, { name = "fastapi", specifier = ">=0.136.1" }, { name = "httpx", specifier = ">=0.28.1" }, @@ -1057,6 +1105,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9b/bf/7595e817906a29453ba4d99394e781b6fabe55d21f3c15d240f85dd06bb1/py_serializable-2.1.0-py3-none-any.whl", hash = "sha256:b56d5d686b5a03ba4f4db5e769dc32336e142fc3bd4d68a8c25579ebb0a67304", size = 23045, upload-time = "2025-07-21T09:56:46.848Z" }, ] +[[package]] +name = "pycparser" +version = "3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, +] + [[package]] name = "pydantic" version = "2.12.5"