From 2d9d4f8a5fbd4429a74ca911e0205d168da05c62 Mon Sep 17 00:00:00 2001 From: Rockford Lhotka Date: Wed, 22 Apr 2026 23:57:21 -0500 Subject: [PATCH 1/4] Step 9: deprecate subsumed specialists (v0.2 complete) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove fetch-page-title and extract-structured-data from the advertised skill set. Both are reachable via browser-task — page-title as a trivial intent, structured extraction via a "return JSON: {...}" instruction carried in the planner's done(result=...) channel. Cost delta is 2-3x tokens per call, acceptable given zero deterministic high-volume callers today. extract-structured-data was also out of spec on §7.1 — it called the no-argument CreateSessionAsync overload and accepted any host. The generalist enforces allowlists by design. Advertised v0.2 surface lands at three skills: browser-task, learn-form-schema, execute-form-batch. - Delete FetchPageTitleCapability, ExtractStructuredDataCapability, and the shared CapabilityInput URL/description parser (no other consumers). browser-task has its own BrowserTaskInput; form capabilities have their own input classes. - Delete the session-level one-shot helpers that only the removed specialists used: IBrowserSession.FetchPageTitleAsync, CapturePageSnapshotAsync, PageSnapshot, PageSnapshotSource. - Delete the corresponding tests — 7 unit tests for the capabilities, the PlaywrightBrowserSessionTests + PageSnapshotTests integration suites, and the ExtractStructuredDataIntegrationTests real-LLM benchmark. BrowserTaskIntegrationTests remains the real-LLM surface. - Trim StubBrowserSessionFactory + FakeAgentBrowserSession to match the pruned IBrowserSession. Update metadata: deploy/rockbot-seed/*.json, docker-compose.yml description + curl smoke example, .env.example comments, Program.cs comment, docs/capabilities.md, spec §5.2 capability table and §9.1 step-9 description, CLAUDE.md Status + Browser + Capabilities sections, framework-feedback step-9 section. Version bumped 0.2.0-alpha.8 → 0.2.0-alpha.9. Tests: 48 passed (46 Agent unit + 1 Forms integration + 1 placeholder), 3 real-LLM BrowserTaskIntegrationTests skipped as expected. Build clean on Release. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 11 +- CLAUDE.md | 6 +- Directory.Build.props | 2 +- deploy/rockbot-seed/agent-trust.json | 2 +- deploy/rockbot-seed/well-known-agents.json | 12 +- docker-compose.yml | 8 +- docs/capabilities.md | 30 ++- docs/foragent-specification.md | 29 ++- docs/framework-feedback.md | 59 +++++ src/Foragent.Agent/Program.cs | 6 +- src/Foragent.Browser/IBrowserSession.cs | 32 --- .../PlaywrightBrowserSessionFactory.cs | 60 ----- src/Foragent.Capabilities/CapabilityInput.cs | 124 ----------- .../ExtractStructuredDataCapability.cs | 112 ---------- .../FetchPageTitleCapability.cs | 44 ---- ...CapabilitiesServiceCollectionExtensions.cs | 6 +- .../BrowserTask/FakeBrowserAgentPage.cs | 6 - .../ExtractStructuredDataCapabilityTests.cs | 210 ------------------ .../FetchPageTitleCapabilityTests.cs | 197 ---------------- tests/Foragent.Agent.Tests/TestDoubles.cs | 12 - .../ExtractStructuredDataIntegrationTests.cs | 157 ------------- .../PageSnapshotTests.cs | 41 ---- .../PlaywrightBrowserSessionTests.cs | 83 ------- 23 files changed, 122 insertions(+), 1127 deletions(-) delete mode 100644 src/Foragent.Capabilities/CapabilityInput.cs delete mode 100644 src/Foragent.Capabilities/ExtractStructuredDataCapability.cs delete mode 100644 src/Foragent.Capabilities/FetchPageTitleCapability.cs delete mode 100644 tests/Foragent.Agent.Tests/ExtractStructuredDataCapabilityTests.cs delete mode 100644 tests/Foragent.Agent.Tests/FetchPageTitleCapabilityTests.cs delete mode 100644 tests/Foragent.Browser.Tests/ExtractStructuredDataIntegrationTests.cs delete mode 100644 tests/Foragent.Browser.Tests/PageSnapshotTests.cs delete mode 100644 tests/Foragent.Browser.Tests/PlaywrightBrowserSessionTests.cs diff --git a/.env.example b/.env.example index 7c0c4c8..e03e339 100644 --- a/.env.example +++ b/.env.example @@ -2,14 +2,13 @@ # # These are consumed by docker-compose.yml. Foragent and the RockBot agent can # be pointed at different models — Foragent uses FORAGENT_LLM_* for its own -# LLM-backed capabilities (extract-structured-data and beyond), the RockBot -# agent uses LLM_* for its own reasoning. +# LLM-backed capabilities (browser-task planner, form-schema enrichment), the +# RockBot agent uses LLM_* for its own reasoning. # -# Direct curl tests of capabilities that don't need an LLM (fetch-page-title) -# still work without either set; anything LLM-backed will fail until Foragent's -# config is populated. +# Every Foragent capability needs the LLM wired; without it, the host fails +# fast at startup. -# ── Foragent's LLM (REQUIRED for extract-structured-data) ──────────────────── +# ── Foragent's LLM (REQUIRED — browser-task + form-schema enrichment) ──────── # Azure AI Foundry / OpenAI-compatible endpoints are both fine. # Foundry endpoint shape: https://-.cognitiveservices.azure.com/openai/v1/ FORAGENT_LLM_ENDPOINT=https://your-resource-region.cognitiveservices.azure.com/openai/v1/ diff --git a/CLAUDE.md b/CLAUDE.md index bf6d232..2122f27 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Status -Foragent is at **milestone 8 shipped, step 9 next**. Five capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`, `fetch-page-title`, `extract-structured-data`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.11`, which brings the structured-data `invoke_agent` surface from PR #291 so RockBot can consume Foragent's FormSchema JSON natively rather than as prose). RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. +Foragent is at **milestone 9 shipped — v0.2 complete**. Three capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.11`, which brings the structured-data `invoke_agent` surface from PR #291 so RockBot can consume Foragent's FormSchema JSON natively rather than as prose). Step 9 removed the v0.1 `fetch-page-title` and `extract-structured-data` specialists: both are reachable via `browser-task` (the planner's `done(result=...)` channel carries JSON when the intent asks for it), at roughly 2–3× the tokens per call — acceptable given zero deterministic high-volume callers today. `extract-structured-data` was also out of spec on §7.1 mandatory allowlists, which the generalist enforces by design. Session-level `FetchPageTitleAsync` / `CapturePageSnapshotAsync` were removed from `IBrowserSession` alongside the capabilities that used them. RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. ## Build / test @@ -74,7 +74,7 @@ Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any ## Browser -`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read / `SelectOptionAsync` / `SetCheckedAsync` / `ScanFormAsync`) for multi-step flows like login + post and form submission, and `OpenAgentPageAsync` → `IBrowserAgentPage` for LLM-in-the-loop planners (ref-annotated aria snapshots + `aria-ref=eN` locator resolution). `ScanFormAsync` (added in step 8) runs a single JS pass in the page to collect inputs/selects/textareas with labels, validation attributes, and select/radio options — returns a typed `FormScan` that the form capabilities lift into `FormSchema`. The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`; `Mode = AriaSnapshotMode.Ai` gets the ref-annotated form) and falls back to `` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public. +`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read / `SelectOptionAsync` / `SetCheckedAsync` / `ScanFormAsync`) for multi-step flows like login + post and form submission, and `OpenAgentPageAsync` → `IBrowserAgentPage` for LLM-in-the-loop planners (ref-annotated aria snapshots + `aria-ref=eN` locator resolution). The one-shot `FetchPageTitleAsync` / `CapturePageSnapshotAsync` + `PageSnapshot` record were removed in step 9 along with the specialists that used them — `browser-task`'s `snapshot` tool covers the same ground. `ScanFormAsync` (added in step 8) runs a single JS pass in the page to collect inputs/selects/textareas with labels, validation attributes, and select/radio options — returns a typed `FormScan` that the form capabilities lift into `FormSchema`. The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`; `Mode = AriaSnapshotMode.Ai` gets the ref-annotated form) and falls back to `` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public. `CreateSessionAsync(Func allowedHost, ...)` is the step-6 entry point for allowlist-scoped sessions. The factory installs a context-wide `RouteAsync("**/*", ...)` that aborts off-list document/subframe navigations before Playwright issues the request (spec §7.1). The no-argument overload accepts any host and stays available for specialists that enforce narrower rules elsewhere. @@ -85,7 +85,7 @@ Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any - Each capability implements `ICapability` — owns its own `AgentSkill` metadata (exposed as a static `SkillDefinition`) and its own `ExecuteAsync` logic. - `ForagentTaskHandler` is a pure dispatcher that resolves `IEnumerable` from DI and routes on `SkillId`. **Do not add skill-specific logic to the handler.** New capabilities go in new `ICapability` classes. - `ForagentCapabilities.Skills` (static array) is the single source of truth for advertised skills — both the bus-side `AgentCard.Skills` and the HTTP gateway's `opts.Skills` read from it. -- `CapabilityInput.Parse` is the shared URL + description shim used by `fetch-page-title` and `extract-structured-data`. Capabilities with different input shapes parse their own input near the capability (e.g. `BrowserTaskInput` in `BrowserTask/`). Don't overload `CapabilityInput` for unrelated shapes. +- Capabilities parse their own input near the capability (`BrowserTaskInput` in `BrowserTask/`, the `*Input` classes in `Forms/`). The `CapabilityInput.Parse` shared URL+description shim was removed in step 9 with its only consumers. - `browser-task` (in `BrowserTask/`) is the generalist planner (spec §5.2). `BrowserTaskInput` parses intent + mandatory `allowedHosts` + optional `url` / `credentialId` / `maxSteps` (default 60, ceiling 150) / `maxSeconds` (default 120, ceiling 600). `BrowserTaskTools` wraps `snapshot` / `navigate` / `click` / `type` / `wait_for` / `done` / `fail` as `AIFunction`s via `AIFunctionFactory.Create` and passes them in `ChatOptions.Tools`; the RockBot-wrapped function-invoking `IChatClient` runs the full model ↔ tool loop inside one `GetResponseAsync` call. Budget is enforced tool-side (each tool checks `BrowserTaskState.BudgetExhausted`) because Microsoft.Extensions.AI does not surface per-request iteration caps through `ChatOptions`; wall-clock is a linked `CancellationTokenSource`. **Never log tool arguments verbatim** — `type` carries user-supplied values that may be sensitive (log length only). Refs from a snapshot are valid only until the next mutating call; the system prompt and tool descriptions both state this, but don't code anything that assumes cross-snapshot ref stability. - `learn-form-schema` and `execute-form-batch` (both in `Forms/`) are the step-8 phase-1 / phase-3 pair (spec §5.5). `FormSchema` / `FormField` are the wire contract — stable JSON shape, stored via `FormSchema.SerializerOptions`. `LearnFormSchemaCapability` navigates, calls `IBrowserPage.ScanFormAsync`, maps the raw scan to `FormSchema` via the deterministic `FormSchemaMapper` (pure — no LLM), then runs `FormSchemaEnricher` for one LLM turn to infer `dependsOn` and a note (skipped when there are no select/radio fields). The schema is persisted as a `Skill` bundle at `sites/{host}/forms/{slug}` with a `SkillResourceType.JsonSchema` resource named `schema.json`. Only the enricher can add `dependsOn` / `notes` — structural fields (type, selector, required, options) come only from the DOM scan, so the LLM cannot fabricate fields or rewrite selectors. `ExecuteFormBatchCapability` accepts `schemaRef` (resolves via `ISkillStore.GetResourceAsync(name, "schema.json")`) or an inline `schema`, and submits each row with `FillAsync` / `SelectOptionAsync` / `SetCheckedAsync` per field type. Per-row progress is published via `AgentTaskContext.PublishStatus(new AgentTaskStatusUpdate { State = Working, Message = … })`. Default `mode` is `"abort-on-first"` (spec open-question #8 resolution: a failed submit on a mutating form usually indicates a real problem, so continuing would just generate more bad submissions); callers opt into `"continue"` for known-messy batches. Success detection: an optional `successIndicator` CSS selector is the preferred signal; the fallback is URL change after submit, which fails (correctly) for forms that submit in place. File uploads and multi-step wizards are **out of scope** for v0.2 — `ScanFormAsync` explicitly skips `type=file`, and there's no flow control for wizards. diff --git a/Directory.Build.props b/Directory.Build.props index 74c6bcf..a1616ee 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -6,7 +6,7 @@ enable true true - 0.2.0-alpha.8 + 0.2.0-alpha.9 Marimer LLC Marimer LLC Copyright (c) Marimer LLC diff --git a/deploy/rockbot-seed/agent-trust.json b/deploy/rockbot-seed/agent-trust.json index 05a69b7..1deca74 100644 --- a/deploy/rockbot-seed/agent-trust.json +++ b/deploy/rockbot-seed/agent-trust.json @@ -2,7 +2,7 @@ { "agentId": "Foragent", "level": 4, - "approvedSkills": ["browser-task", "fetch-page-title", "extract-structured-data"], + "approvedSkills": ["browser-task", "learn-form-schema", "execute-form-batch"], "firstSeen": "2026-04-21T00:00:00+00:00", "lastInteraction": "2026-04-21T00:00:00+00:00", "interactionCount": 0 diff --git a/deploy/rockbot-seed/well-known-agents.json b/deploy/rockbot-seed/well-known-agents.json index c973a20..83a1532 100644 --- a/deploy/rockbot-seed/well-known-agents.json +++ b/deploy/rockbot-seed/well-known-agents.json @@ -14,14 +14,14 @@ "description": "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. Input JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. allowedHosts is required and empty rejects. Returns a structured JSON result with status (done/failed/incomplete), summary, optional result, step count, and navigations." }, { - "id": "fetch-page-title", - "name": "Fetch Page Title", - "description": "Navigate to a URL with a real browser and return the contents of its element." + "id": "learn-form-schema", + "name": "Learn Form Schema", + "description": "Navigate to a URL, introspect the first form, and return a typed FormSchema (fields, types, required, options, dependencies). Persists the schema as a skill at sites/{host}/forms/{slug} for later reuse by execute-form-batch." }, { - "id": "extract-structured-data", - "name": "Extract Structured Data", - "description": "Navigate to a URL and extract data matching a natural-language description, returning JSON. Input the target URL and a description of what to extract." + "id": "execute-form-batch", + "name": "Execute Form Batch", + "description": "Given a learned FormSchema (by ref or inline) and a batch of row data, submit the form once per row. Streams per-row progress. Default mode abort-on-first; opt into 'continue' for known-messy batches." } ] } diff --git a/docker-compose.yml b/docker-compose.yml index 4c8a7dc..b9b9181 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,7 +21,7 @@ # curl -X POST http://localhost:5210/ \ # -H "X-Api-Key: rockbot-calls-foragent" \ # -H "Content-Type: application/json" \ -# -d '{"jsonrpc":"2.0","id":1,"method":"message/send","params":{"message":{"role":"ROLE_USER","messageId":"m1","parts":[{"text":"https://example.com"}]},"metadata":{"skill":"fetch-page-title"}}}' +# -d '{"jsonrpc":"2.0","id":1,"method":"message/send","params":{"message":{"role":"ROLE_USER","messageId":"m1","parts":[{"text":"{\"intent\":\"fetch the page title\",\"url\":\"https://example.com\",\"allowedHosts\":[\"example.com\"]}"}]},"metadata":{"skill":"browser-task"}}}' # Note: the A2A v1-preview schema uses protobuf-style enum values (ROLE_USER, not "user") # and parts are bare {"text":"..."} objects — no "kind" field. # @@ -64,12 +64,12 @@ services: RabbitMq__VirtualHost: / Gateway__AgentName: Foragent Gateway__InternalAgentName: Foragent - Gateway__Description: "Browser agent — browser-task (generalist), learn-form-schema, execute-form-batch, fetch-page-title, extract-structured-data" + Gateway__Description: "Browser agent — browser-task (generalist), learn-form-schema, execute-form-batch" # RockBot will call Foragent with header X-Api-Key: rockbot-calls-foragent ApiKeys__rockbot-calls-foragent__AgentId: RockBot ApiKeys__rockbot-calls-foragent__DisplayName: RockBot - # LLM required for the extract-structured-data capability. Namespaced so - # Foragent can point at a different model than the RockBot side. + # LLM required for the browser-task planner and form-schema enrichment. + # Namespaced so Foragent can point at a different model than the RockBot side. ForagentLlm__Endpoint: ${FORAGENT_LLM_ENDPOINT:?FORAGENT_LLM_ENDPOINT is required} ForagentLlm__ModelId: ${FORAGENT_LLM_MODEL_ID:?FORAGENT_LLM_MODEL_ID is required} ForagentLlm__ApiKey: ${FORAGENT_LLM_API_KEY:?FORAGENT_LLM_API_KEY is required} diff --git a/docs/capabilities.md b/docs/capabilities.md index e8ac2e3..48769cd 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -8,13 +8,22 @@ invoke capabilities by name; Foragent handles the browser mechanics. - `browser-task` — **generalist**, spec §5.2. LLM-in-the-loop planner that drives a real browser to accomplish a free-form intent. Shipped in step 6; step 7 added skills + memory priming (spec §5.6). -- `fetch-page-title` — specialist. Inherited from step 1/2. -- `extract-structured-data` — specialist. Inherited from step 3. - -The step-4 `post-to-site` capability was removed in step 7 — the -generalist `browser-task` plus the seeded `sites/bsky.app/login` skill -subsume its function, and the project is still pre-public so no consumer -needed a deprecation path. +- `learn-form-schema` — specialist (phase-1, spec §5.5). Introspects a + form and returns a typed `FormSchema` persisted at `sites/{host}/forms/{slug}`. +- `execute-form-batch` — specialist (phase-3, spec §5.5). Submits rows + against a learned schema, streaming per-row progress over A2A. + +Three v0.1/v0.2 specialists have been removed as `browser-task` subsumes +them. The project is pre-public so no deprecation path was required: + +- `post-to-site` — removed in step 7. `browser-task` + the seeded + `sites/bsky.app/login` skill cover the use case. +- `fetch-page-title` — removed in step 9. Was a milestone-1 smoke + target; `browser-task` with a simple intent produces the same result. +- `extract-structured-data` — removed in step 9. `browser-task` with a + "return JSON: {…}" intent produces the same result. Its typed input + shape also lacked the mandatory allowlist required by spec §7.1; the + generalist enforces that by design. ## `browser-task` input shape @@ -60,7 +69,12 @@ A JSON object in a single text part: ``` `incomplete` means the budget was exhausted before `done`/`fail` was -called. +called. For extraction-style tasks, instruct the planner to return JSON +via the `result` field — e.g. intent `"Open https://shop.example/p/42 +and return {\"name\":..., \"price_usd\":...} as JSON in the result +field."`. The planner is not schema-enforced the way +`extract-structured-data` used to be, so keep the target shape explicit +in the intent. ## `browser-task` tool surface diff --git a/docs/foragent-specification.md b/docs/foragent-specification.md index 4d105d2..bc21bd2 100644 --- a/docs/foragent-specification.md +++ b/docs/foragent-specification.md @@ -311,15 +311,15 @@ callers cheap — not to proliferate. | `browser-task` | Generalist | Given intent + optional URL, credential id, and allowed-hosts list, plan and drive the browser to fulfill the intent. Uses RockBot skills + memory as priming. Returns a result or a structured intermediate artifact (e.g. a learned form schema). | | `learn-form-schema` | Specialist (phase-1) | Given a URL and optional credential, introspect a form and return its schema — fields, types, dropdown dependencies, validation rules. Persists the schema as a skill (§5.6). Returns the schema to the caller for review. | | `execute-form-batch` | Specialist (phase-2) | Given a learned schema (by id or inline) and a batch of row data, submit the form once per row. Streams A2A progress updates. Handles partial failure. | -| `fetch-page-title` | Specialist | Return the `<title>` of a URL. Inherited from milestone 2. | -| `extract-structured-data` | Specialist | Extract structured data from a page matching a natural-language description. Inherited from milestone 3. | -The v0.1 `post-to-site` capability ships in the main codebase as a -regression test for credential handling. After step 7 it is removed -from the advertised skill list; `browser-task` subsumes its function. - -The v0.1 `monitor-page` and `fill-form` capabilities fold into -`browser-task` and do not ship as separate advertised skills. +After step 9 the v0.2 surface is three skills. The v0.1 `post-to-site` +capability was removed in step 7 once the seeded `bsky.app` skill + +`browser-task` covered it; the v0.1 `fetch-page-title` and v0.1 +`extract-structured-data` specialists were removed in step 9 — the +generalist subsumes both at the cost of 2–3× tokens per call, which +is acceptable given zero deterministic high-volume callers today. The +v0.1 `monitor-page` and `fill-form` capabilities fold into `browser-task` +and do not ship as separate advertised skills either. ### 5.3 Capabilities explicitly out of scope (v1) @@ -714,10 +714,15 @@ hard design questions until usage forces them. Resolve open question #6 (how to persist typed JSON alongside markdown skills) in the deliverable. -9. **Deprecate subsumed specialists.** Review whether `fetch-page-title` - / `extract-structured-data` still pay their way or fold into - `browser-task` with equivalent cost. Land on the minimum advertised - capability set v0.2 actually needs. +9. **Deprecate subsumed specialists.** Reviewed whether `fetch-page-title` + / `extract-structured-data` still paid their way vs. `browser-task` at + equivalent cost. Both removed: `fetch-page-title` was a milestone-1 + smoke-test relic that `browser-task` subsumes trivially; + `extract-structured-data` was functionally equivalent to a `browser-task` + intent that asks for JSON in the `done.result` channel (cost delta + ~2–3× tokens per call, zero deterministic high-volume callers today), + and was out of spec on §7.1 mandatory allowlists. Advertised surface + lands at `browser-task` + `learn-form-schema` + `execute-form-batch`. Each milestone produces framework feedback. Capture it in `docs/framework-feedback.md` — some will be small ergonomic fixes; some diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md index 0481e9c..f584fd0 100644 --- a/docs/framework-feedback.md +++ b/docs/framework-feedback.md @@ -573,3 +573,62 @@ the operator turns the harness on for a sustained session. `FORAGENT_LLM_*`. - Existing step-6 benchmark still 3/3 — framework bump didn't regress anything else. + +## Step 9 — Deprecate subsumed specialists + +Step 9 is a decision milestone, not a feature ship, so the framework +observations are few. Advertised surface lands at three skills: +`browser-task`, `learn-form-schema`, `execute-form-batch`. + +### What was deleted and why + +- **`fetch-page-title`** — milestone-1 smoke-test relic. Pure + specialist wrapping `<title>` reads. `browser-task` with intent + `"fetch the page title of <url>"` and `done.result` produces the + same value for ~2× the tokens, which is fine given no deterministic + high-volume caller actually exists. +- **`extract-structured-data`** — single-turn typed extraction with + `ResponseFormat = Json`. Two deciding factors on top of the "no + caller" argument: (1) it was out of spec on §7.1 — it called the + no-argument `CreateSessionAsync` overload and accepted any host, + which the generalist refuses by design; (2) bringing it into spec + would have added mandatory `allowedHosts` to its input shape and + erased its simplicity advantage. +- **`IBrowserSession.FetchPageTitleAsync` / + `CapturePageSnapshotAsync` / `PageSnapshot` / `PageSnapshotSource`** + — orphaned once the two capabilities went. Deleted from the + `Foragent.Browser` surface rather than left as dead code. The + `snapshot` tool inside `browser-task` uses `IBrowserAgentPage` and + never touched this API path. +- **`CapabilityInput.Parse`** — the shared URL+description shim had + only the deleted specialists as consumers. `BrowserTaskInput` + handles its own shape; `Forms/*Input.cs` handles theirs. Deleted. +- **Integration test `ExtractStructuredDataIntegrationTests`** — the + only `[SkippableFact]` in the browser-tests project that needed + `FORAGENT_LLM_*`. Removed; `BrowserTaskIntegrationTests` remains the + real-LLM benchmark. + +### Framework observations + +- **Capability-surface evolution is painless.** Removing + `ICapability` implementations is a three-line edit to + `ForagentCapabilitiesServiceCollectionExtensions` + a one-line edit + to the static `Skills` array. No framework API made the deletion + harder than it had to be — `IAgentTaskHandler.HandleTaskAsync` + + DI-resolved capabilities remain the right shape for a fast-moving + pre-1.0 product surface. Confirms that foragent#5 / + [rockbot#283](https://github.com/MarimerLLC/rockbot/issues/283) (per-skill + handler registration) is a quality-of-life improvement, not a + blocker. +- **`AgentCard` and `Gateway:Skills` share one source of truth now.** + Post-step-1 refactor landed the `ForagentCapabilities.Skills` static + that both the A2A card and the HTTP gateway read from — step 9 + touched exactly that one array and both sides updated. That worked + exactly as intended; the step-1 feedback entry about duplicate card + declarations is effectively closed on the Foragent side via the + local static. An upstream generalization — framework reads + `A2AOptions.Card.Skills` directly in the gateway — would remove the + small local static but isn't urgent. +- **No spec open-questions closed or opened.** Open items #3, #4, #5, + #7 remain as written; #6 and #8 closed in step 8; #1 and #2 closed + in v0.2 spec adoption. v0.2 is the shipped minimum surface. diff --git a/src/Foragent.Agent/Program.cs b/src/Foragent.Agent/Program.cs index a0fceef..2815c56 100644 --- a/src/Foragent.Agent/Program.cs +++ b/src/Foragent.Agent/Program.cs @@ -16,9 +16,9 @@ builder.Configuration.AddUserSecrets<Program>(optional: true); -// ── LLM (required — capabilities like extract-structured-data reason over -// page content via Microsoft.Extensions.AI). Config is namespaced under -// ForagentLlm so it can differ from the host's RockBot LLM config. ─────── +// ── LLM (required — browser-task's planner and form-schema enrichment reason +// over page content via Microsoft.Extensions.AI). Config is namespaced +// under ForagentLlm so it can differ from the host's RockBot LLM config. ─ var llmSection = builder.Configuration.GetSection("ForagentLlm"); var llmEndpoint = llmSection["Endpoint"] diff --git a/src/Foragent.Browser/IBrowserSession.cs b/src/Foragent.Browser/IBrowserSession.cs index 65629a4..bceca44 100644 --- a/src/Foragent.Browser/IBrowserSession.cs +++ b/src/Foragent.Browser/IBrowserSession.cs @@ -8,23 +8,6 @@ namespace Foragent.Browser; /// </summary> public interface IBrowserSession : IAsyncDisposable { - /// <summary> - /// Navigates to <paramref name="url"/> and returns the contents of the - /// rendered <c><title></c> element, or <c>null</c> if the page - /// does not expose one. - /// </summary> - Task<string?> FetchPageTitleAsync(Uri url, CancellationToken cancellationToken = default); - - /// <summary> - /// Navigates to <paramref name="url"/> and returns a compact, - /// LLM-consumable representation of the page. Uses the Chromium - /// accessibility tree when available (spec §3.2 — the right grain for an - /// extraction prompt); falls back to <c><body></c> inner text when - /// the accessibility snapshot is empty or unavailable. Includes the page - /// title at the top so the LLM has enough context to reason. - /// </summary> - Task<PageSnapshot> CapturePageSnapshotAsync(Uri url, CancellationToken cancellationToken = default); - /// <summary> /// Opens a page for a multi-step flow (login, fill form, navigate, read /// back confirmation). The caller drives the page with the methods on @@ -210,21 +193,6 @@ public sealed record FormScanField( /// <summary>An option entry for a <c><select></c> or radio group.</summary> public sealed record FormScanOption(string Value, string? Label); -/// <summary> -/// A compact rendering of a page suitable for LLM prompting. -/// </summary> -/// <param name="Url">The final URL after any redirects.</param> -/// <param name="Title">Page title, or <c>null</c>.</param> -/// <param name="Content">Accessibility-tree rendering or inner text.</param> -/// <param name="Source">Whether the content came from the accessibility tree or from a text fallback.</param> -public sealed record PageSnapshot(Uri Url, string? Title, string Content, PageSnapshotSource Source); - -public enum PageSnapshotSource -{ - Accessibility, - InnerText -} - /// <summary> /// Creates fresh <see cref="IBrowserSession"/> instances against the shared /// long-lived browser. Each session wraps a new browser context; disposing diff --git a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs index 7dcfb30..703f429 100644 --- a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs +++ b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs @@ -49,66 +49,6 @@ internal sealed class PlaywrightBrowserSession( IBrowserContext context, Func<Uri, bool> allowedHost) : IBrowserSession { - public async Task<string?> FetchPageTitleAsync(Uri url, CancellationToken cancellationToken = default) - { - cancellationToken.ThrowIfCancellationRequested(); - EnsureAllowed(url); - var page = await context.NewPageAsync(); - try - { - var response = await page.GotoAsync(url.ToString(), new PageGotoOptions - { - WaitUntil = WaitUntilState.DOMContentLoaded - }); - if (response is null || !response.Ok) - throw new InvalidOperationException( - $"Navigation to {url} returned status {response?.Status.ToString() ?? "no response"}."); - - var title = await page.TitleAsync(); - return string.IsNullOrEmpty(title) ? null : title; - } - finally - { - await page.CloseAsync(); - } - } - - public async Task<PageSnapshot> CapturePageSnapshotAsync(Uri url, CancellationToken cancellationToken = default) - { - cancellationToken.ThrowIfCancellationRequested(); - EnsureAllowed(url); - var page = await context.NewPageAsync(); - try - { - var response = await page.GotoAsync(url.ToString(), new PageGotoOptions - { - WaitUntil = WaitUntilState.DOMContentLoaded - }); - if (response is null || !response.Ok) - throw new InvalidOperationException( - $"Navigation to {url} returned status {response?.Status.ToString() ?? "no response"}."); - - var finalUrl = new Uri(page.Url); - var title = await page.TitleAsync(); - title = string.IsNullOrEmpty(title) ? null : title; - - // Locator.AriaSnapshotAsync returns a compact YAML-like rendering of the - // accessibility tree (roles + accessible names + values), which is what - // Playwright's own assertion helpers use. Replaces the deprecated - // IPage.Accessibility.SnapshotAsync() API. - var ariaSnapshot = await page.Locator("body").AriaSnapshotAsync(); - if (!string.IsNullOrWhiteSpace(ariaSnapshot)) - return new PageSnapshot(finalUrl, title, ariaSnapshot, PageSnapshotSource.Accessibility); - - var body = await page.Locator("body").InnerTextAsync(); - return new PageSnapshot(finalUrl, title, body ?? string.Empty, PageSnapshotSource.InnerText); - } - finally - { - await page.CloseAsync(); - } - } - public async Task<IBrowserPage> OpenPageAsync(Uri url, CancellationToken cancellationToken = default) { cancellationToken.ThrowIfCancellationRequested(); diff --git a/src/Foragent.Capabilities/CapabilityInput.cs b/src/Foragent.Capabilities/CapabilityInput.cs deleted file mode 100644 index c4bcdaf..0000000 --- a/src/Foragent.Capabilities/CapabilityInput.cs +++ /dev/null @@ -1,124 +0,0 @@ -using System.Text.Json; -using System.Text.RegularExpressions; -using RockBot.A2A; - -namespace Foragent.Capabilities; - -/// <summary> -/// Helpers for pulling structured input out of an <see cref="AgentTaskRequest"/>. -/// Foragent's capability contract (spec §5.3 — capabilities own their input shape): -/// <list type="bullet"> -/// <item>Target URL in <c>url</c> metadata</item> -/// <item>Natural-language description in the first text part, or <c>description</c> metadata</item> -/// </list> -/// Parse order, most-structured-first: -/// <list type="number"> -/// <item>Message or request metadata <c>url</c>/<c>description</c> (rockbot 0.8.5+)</item> -/// <item>JSON blob in a text part: <c>{"url":"...","description":"..."}</c></item> -/// <item>Bare URL text part (step 1/2 callers)</item> -/// <item>First <c>http(s)://…</c> URL embedded in free-form text (handles -/// LLM-wrapped prompts like "please fetch the title of https://example.com")</item> -/// </list> -/// </summary> -internal static partial class CapabilityInput -{ - // Matches the first http(s) URL inside a larger string. Stops at whitespace - // or common terminators so trailing punctuation doesn't poison the URL. - [GeneratedRegex(@"https?://[^\s<>""'\)\]\}]+", RegexOptions.IgnoreCase)] - private static partial Regex EmbeddedUrlRegex(); - - /// <summary> - /// Parses a capability request into (url, description). Either field may be - /// null if the caller didn't supply it — capabilities decide which are - /// required. Parse order: (1) JSON shim, (2) bare URL, (3) first embedded - /// URL in free-form text — so LLM-authored requests like "please fetch - /// the title of https://example.com" are accepted. If (3) is used and the - /// text carries more than just the URL, the surrounding text becomes the - /// description. - /// </summary> - public static (Uri? Url, string? Description) Parse(AgentTaskRequest request) - { - // 1. Metadata (rockbot 0.8.5+). Message-level wins over request-level - // when both are present — the URL is about the message content. - var metaUrl = ReadMetadata(request, "url"); - var metaDescription = ReadMetadata(request, "description"); - var urlFromMetadata = TryParseAbsoluteHttpUri(metaUrl); - if (urlFromMetadata is not null) - return (urlFromMetadata, string.IsNullOrWhiteSpace(metaDescription) ? null : metaDescription); - - var text = request.Message.Parts - .Where(p => p.Kind == "text") - .Select(p => p.Text) - .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t)) - ?.Trim(); - - if (string.IsNullOrEmpty(text)) - return (null, string.IsNullOrWhiteSpace(metaDescription) ? null : metaDescription); - - // JSON shim: {"url": "...", "description": "..."} - if (text.StartsWith('{')) - { - try - { - using var doc = JsonDocument.Parse(text); - var root = doc.RootElement; - var urlStr = root.TryGetProperty("url", out var u) ? u.GetString() : null; - var description = root.TryGetProperty("description", out var d) ? d.GetString() : null; - return (TryParseAbsoluteHttpUri(urlStr), description); - } - catch (JsonException) - { - // Not valid JSON — fall through to bare-URL handling. - } - } - - // Bare URL — covers step 1/2 callers and simple single-input skills. - var bare = TryParseAbsoluteHttpUri(text); - if (bare is not null) - return (bare, null); - - // Fall back to extracting the first http(s) URL embedded in free-form - // text. Anything around it becomes the description for capabilities - // that want one. - var match = EmbeddedUrlRegex().Match(text); - if (match.Success) - { - var embedded = TryParseAbsoluteHttpUri(TrimUrlTerminators(match.Value)); - if (embedded is not null) - { - var remainder = (text[..match.Index] + text[(match.Index + match.Length)..]).Trim(); - return (embedded, string.IsNullOrWhiteSpace(remainder) ? null : remainder); - } - } - - return (null, null); - } - - private static string TrimUrlTerminators(string value) => - value.TrimEnd('.', ',', ';', ':', '!', '?'); - - private static string? ReadMetadata(AgentTaskRequest request, string key) - { - if (request.Message.Metadata is not null - && request.Message.Metadata.TryGetValue(key, out var msgValue) - && !string.IsNullOrWhiteSpace(msgValue)) - { - return msgValue; - } - if (request.Metadata is not null - && request.Metadata.TryGetValue(key, out var reqValue) - && !string.IsNullOrWhiteSpace(reqValue)) - { - return reqValue; - } - return null; - } - - private static Uri? TryParseAbsoluteHttpUri(string? value) - { - if (string.IsNullOrWhiteSpace(value)) return null; - if (!Uri.TryCreate(value.Trim(), UriKind.Absolute, out var uri)) return null; - if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps) return null; - return uri; - } -} diff --git a/src/Foragent.Capabilities/ExtractStructuredDataCapability.cs b/src/Foragent.Capabilities/ExtractStructuredDataCapability.cs deleted file mode 100644 index 68e5872..0000000 --- a/src/Foragent.Capabilities/ExtractStructuredDataCapability.cs +++ /dev/null @@ -1,112 +0,0 @@ -using System.Text; -using Foragent.Browser; -using Microsoft.Extensions.AI; -using Microsoft.Extensions.Logging; -using RockBot.A2A; - -namespace Foragent.Capabilities; - -public sealed class ExtractStructuredDataCapability( - IBrowserSessionFactory browserFactory, - IChatClient chatClient, - ILogger<ExtractStructuredDataCapability> logger) : ICapability -{ - public static AgentSkill SkillDefinition { get; } = new() - { - Id = "extract-structured-data", - Name = "Extract Structured Data", - Description = "Navigate to a URL and extract data matching a natural-language description, returning JSON." - }; - - // Keep the prompt short — the page content already dominates the token budget. - private const string SystemPrompt = """ - You extract structured data from web pages on behalf of other agents. - The user gives you (1) a description of what to extract and (2) a - compact accessibility-tree or text rendering of a page. Respond with a - single JSON object containing only the fields that answer the request. - Use null for fields that are not present on the page. Do not wrap the - JSON in code fences or prose. - """; - - // Extraction calls are usually a few seconds. Cap snapshot size so we don't - // blow past the model's context window on a page dump. - private const int MaxSnapshotChars = 40_000; - - public string SkillId => SkillDefinition.Id; - public AgentSkill Skill => SkillDefinition; - - public async Task<AgentTaskResult> ExecuteAsync(AgentTaskRequest request, AgentTaskContext context) - { - var ct = context.MessageContext.CancellationToken; - var (url, description) = CapabilityInput.Parse(request); - - if (url is null) - return CapabilityResult.Error( - request, - "Provide the target URL (as 'url' in the request payload)."); - - if (string.IsNullOrWhiteSpace(description)) - return CapabilityResult.Error( - request, - "Provide a natural-language description of the data to extract."); - - PageSnapshot snapshot; - try - { - await using var session = await browserFactory.CreateSessionAsync(ct); - snapshot = await session.CapturePageSnapshotAsync(url, ct); - logger.LogInformation( - "Captured {Source} snapshot of {Url} ({Chars} chars)", - snapshot.Source, snapshot.Url, snapshot.Content.Length); - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - logger.LogWarning(ex, "Snapshot failed for {Url}", url); - return CapabilityResult.Error(request, $"Fetch failed: {ex.Message}"); - } - - var prompt = BuildPrompt(snapshot, description!); - - try - { - var response = await chatClient.GetResponseAsync( - [ - new ChatMessage(ChatRole.System, SystemPrompt), - new ChatMessage(ChatRole.User, prompt) - ], - new ChatOptions { ResponseFormat = ChatResponseFormat.Json }, - ct); - - var text = response.Text?.Trim() ?? string.Empty; - if (string.IsNullOrEmpty(text)) - return CapabilityResult.Error(request, "LLM returned an empty response."); - - return CapabilityResult.Completed(request, text); - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - logger.LogWarning(ex, "LLM extraction failed for {Url}", url); - return CapabilityResult.Error(request, $"Extraction failed: {ex.Message}"); - } - } - - private static string BuildPrompt(PageSnapshot snapshot, string description) - { - var content = snapshot.Content.Length > MaxSnapshotChars - ? snapshot.Content[..MaxSnapshotChars] + "\n…(truncated)" - : snapshot.Content; - - var sb = new StringBuilder(); - sb.Append("Page URL: ").AppendLine(snapshot.Url.ToString()); - if (snapshot.Title is not null) - sb.Append("Page title: ").AppendLine(snapshot.Title); - sb.Append("Snapshot source: ").AppendLine(snapshot.Source.ToString()); - sb.AppendLine(); - sb.AppendLine("Description of data to extract:"); - sb.AppendLine(description); - sb.AppendLine(); - sb.AppendLine("Page content:"); - sb.AppendLine(content); - return sb.ToString(); - } -} diff --git a/src/Foragent.Capabilities/FetchPageTitleCapability.cs b/src/Foragent.Capabilities/FetchPageTitleCapability.cs deleted file mode 100644 index 150258e..0000000 --- a/src/Foragent.Capabilities/FetchPageTitleCapability.cs +++ /dev/null @@ -1,44 +0,0 @@ -using Foragent.Browser; -using Microsoft.Extensions.Logging; -using RockBot.A2A; - -namespace Foragent.Capabilities; - -public sealed class FetchPageTitleCapability( - IBrowserSessionFactory browserFactory, - ILogger<FetchPageTitleCapability> logger) : ICapability -{ - public static AgentSkill SkillDefinition { get; } = new() - { - Id = "fetch-page-title", - Name = "Fetch Page Title", - Description = "Navigate to a URL with a real browser and return the contents of its <title> element." - }; - - public string SkillId => SkillDefinition.Id; - public AgentSkill Skill => SkillDefinition; - - public async Task<AgentTaskResult> ExecuteAsync(AgentTaskRequest request, AgentTaskContext context) - { - var ct = context.MessageContext.CancellationToken; - var (url, _) = CapabilityInput.Parse(request); - - if (url is null) - return CapabilityResult.Error(request, "Provide an absolute http(s) URL as the task message."); - - try - { - await using var session = await browserFactory.CreateSessionAsync(ct); - var title = await session.FetchPageTitleAsync(url, ct); - var text = string.IsNullOrEmpty(title) ? "(no title)" : title; - - logger.LogInformation("Fetched title from {Url}: {Title}", url, text); - return CapabilityResult.Completed(request, text); - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - logger.LogWarning(ex, "Fetch failed for {Url}", url); - return CapabilityResult.Error(request, $"Fetch failed: {ex.Message}"); - } - } -} diff --git a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs index 169de19..5d06dde 100644 --- a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs +++ b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs @@ -14,8 +14,6 @@ public static class ForagentCapabilitiesServiceCollectionExtensions /// </summary> public static IServiceCollection AddForagentCapabilities(this IServiceCollection services) { - services.AddScoped<ICapability, FetchPageTitleCapability>(); - services.AddScoped<ICapability, ExtractStructuredDataCapability>(); services.AddScoped<ICapability, BrowserTaskCapability>(); services.AddScoped<ICapability, LearnFormSchemaCapability>(); services.AddScoped<ICapability, ExecuteFormBatchCapability>(); @@ -38,8 +36,6 @@ public static class ForagentCapabilities [ BrowserTaskCapability.SkillDefinition, LearnFormSchemaCapability.SkillDefinition, - ExecuteFormBatchCapability.SkillDefinition, - FetchPageTitleCapability.SkillDefinition, - ExtractStructuredDataCapability.SkillDefinition + ExecuteFormBatchCapability.SkillDefinition ]; } diff --git a/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs b/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs index 8027593..327abe4 100644 --- a/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs +++ b/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs @@ -52,12 +52,6 @@ public ValueTask DisposeAsync() internal sealed class FakeAgentBrowserSession(FakeBrowserAgentPage page) : IBrowserSession { - public Task<string?> FetchPageTitleAsync(Uri url, CancellationToken ct = default) => - throw new NotSupportedException(); - - public Task<PageSnapshot> CapturePageSnapshotAsync(Uri url, CancellationToken ct = default) => - throw new NotSupportedException(); - public Task<IBrowserPage> OpenPageAsync(Uri url, CancellationToken ct = default) => throw new NotSupportedException(); diff --git a/tests/Foragent.Agent.Tests/ExtractStructuredDataCapabilityTests.cs b/tests/Foragent.Agent.Tests/ExtractStructuredDataCapabilityTests.cs deleted file mode 100644 index da38a95..0000000 --- a/tests/Foragent.Agent.Tests/ExtractStructuredDataCapabilityTests.cs +++ /dev/null @@ -1,210 +0,0 @@ -using Foragent.Browser; -using Foragent.Capabilities; -using Microsoft.Extensions.AI; -using Microsoft.Extensions.Logging.Abstractions; -using RockBot.A2A; -using Xunit; - -namespace Foragent.Agent.Tests; - -public class ExtractStructuredDataCapabilityTests -{ - private const string ValidInput = """ - {"url":"https://example.com","description":"the main heading"} - """; - - [Fact] - public async Task ReturnsLlmText_OnSuccess() - { - var factory = new StubBrowserSessionFactory(); - var chat = new StubChatClient((_, _) => - Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, """{"heading":"Example"}""")))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", ValidInput), - context); - - Assert.Equal(AgentTaskState.Completed, result.State); - Assert.Equal("""{"heading":"Example"}""", TestContext.TextOf(result)); - } - - [Fact] - public async Task PromptIncludesUrl_Description_AndSnapshot() - { - var factory = new StubBrowserSessionFactory - { - SnapshotResponder = (url, _) => - Task.FromResult(new PageSnapshot(url, "Page Title", "body - Hello", PageSnapshotSource.Accessibility)) - }; - var chat = new StubChatClient((_, _) => - Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "{}")))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", ValidInput), - context); - - Assert.Equal(1, chat.CallCount); - var userMessage = chat.LastMessages.First(m => m.Role == ChatRole.User).Text ?? string.Empty; - Assert.Contains("https://example.com", userMessage); - Assert.Contains("the main heading", userMessage); - Assert.Contains("body - Hello", userMessage); - Assert.Contains("Page Title", userMessage); - } - - [Fact] - public async Task RequestsJsonResponseFormat() - { - ChatOptions? capturedOptions = null; - var factory = new StubBrowserSessionFactory(); - var chat = new StubChatClient((_, opts) => - { - capturedOptions = opts; - return Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "{}"))); - }); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", ValidInput), - context); - - Assert.NotNull(capturedOptions?.ResponseFormat); - Assert.IsType<ChatResponseFormatJson>(capturedOptions!.ResponseFormat); - } - - [Fact] - public async Task RejectsMissingUrl() - { - var factory = new StubBrowserSessionFactory(); - var chat = new StubChatClient((_, _) => - Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "{}")))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", """{"description":"no url"}"""), - context); - - Assert.Equal(0, factory.SessionsCreated); - Assert.Equal(0, chat.CallCount); - Assert.Contains("URL", TestContext.TextOf(result)); - } - - [Fact] - public async Task RejectsMissingDescription() - { - var factory = new StubBrowserSessionFactory(); - var chat = new StubChatClient((_, _) => - Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "{}")))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", """{"url":"https://example.com"}"""), - context); - - Assert.Equal(0, factory.SessionsCreated); - Assert.Equal(0, chat.CallCount); - Assert.Contains("description", TestContext.TextOf(result)); - } - - [Fact] - public async Task ReportsError_WhenBrowserThrows() - { - var factory = new StubBrowserSessionFactory - { - SnapshotResponder = (_, _) => Task.FromException<PageSnapshot>(new InvalidOperationException("nav failed")) - }; - var chat = new StubChatClient((_, _) => - Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "{}")))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", ValidInput), - context); - - Assert.Contains("Fetch failed: nav failed", TestContext.TextOf(result)); - Assert.Equal(0, chat.CallCount); - } - - [Fact] - public async Task ReportsError_WhenLlmThrows() - { - var factory = new StubBrowserSessionFactory(); - var chat = new StubChatClient((_, _) => - Task.FromException<ChatResponse>(new InvalidOperationException("llm boom"))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", ValidInput), - context); - - Assert.Contains("Extraction failed: llm boom", TestContext.TextOf(result)); - } - - [Fact] - public async Task AcceptsInputs_FromMetadata() - { - Uri? capturedUrl = null; - var factory = new StubBrowserSessionFactory - { - SnapshotResponder = (url, _) => - { - capturedUrl = url; - return Task.FromResult(new PageSnapshot(url, null, "content", PageSnapshotSource.Accessibility)); - } - }; - string? descriptionInPrompt = null; - var chat = new StubChatClient((msgs, _) => - { - descriptionInPrompt = msgs.First(m => m.Role == ChatRole.User).Text; - return Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "{}"))); - }); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - var request = TestContext.RequestWithMetadata( - "extract-structured-data", - messageMetadata: new Dictionary<string, string> - { - ["url"] = "https://metadata.example", - ["description"] = "the shipping address" - }); - - var result = await capability.ExecuteAsync(request, context); - - Assert.Equal(AgentTaskState.Completed, result.State); - Assert.Equal(new Uri("https://metadata.example"), capturedUrl); - Assert.Contains("the shipping address", descriptionInPrompt); - } - - [Fact] - public async Task ReportsError_OnEmptyLlmResponse() - { - var factory = new StubBrowserSessionFactory(); - var chat = new StubChatClient((_, _) => - Task.FromResult(new ChatResponse(new ChatMessage(ChatRole.Assistant, "")))); - var capability = new ExtractStructuredDataCapability( - factory, chat, NullLogger<ExtractStructuredDataCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("extract-structured-data", ValidInput), - context); - - Assert.Contains("empty response", TestContext.TextOf(result)); - } -} diff --git a/tests/Foragent.Agent.Tests/FetchPageTitleCapabilityTests.cs b/tests/Foragent.Agent.Tests/FetchPageTitleCapabilityTests.cs deleted file mode 100644 index 83931bc..0000000 --- a/tests/Foragent.Agent.Tests/FetchPageTitleCapabilityTests.cs +++ /dev/null @@ -1,197 +0,0 @@ -using Foragent.Capabilities; -using Microsoft.Extensions.Logging.Abstractions; -using RockBot.A2A; -using Xunit; - -namespace Foragent.Agent.Tests; - -public class FetchPageTitleCapabilityTests -{ - [Fact] - public async Task ReturnsTitle_FromBrowser() - { - var factory = new StubBrowserSessionFactory - { - TitleResponder = (_, _) => Task.FromResult<string?>("Hello World") - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", "https://example.com"), - context); - - Assert.Equal(AgentTaskState.Completed, result.State); - Assert.Equal("Hello World", TestContext.TextOf(result)); - } - - [Fact] - public async Task ReportsNoTitle_WhenBrowserReturnsNull() - { - var factory = new StubBrowserSessionFactory - { - TitleResponder = (_, _) => Task.FromResult<string?>(null) - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", "https://example.com"), - context); - - Assert.Equal("(no title)", TestContext.TextOf(result)); - } - - [Fact] - public async Task ReportsError_WhenBrowserThrows() - { - var factory = new StubBrowserSessionFactory - { - TitleResponder = (_, _) => Task.FromException<string?>(new InvalidOperationException("boom")) - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", "https://example.com"), - context); - - Assert.Contains("Fetch failed: boom", TestContext.TextOf(result)); - } - - [Fact] - public async Task RejectsNonAbsoluteUrl_WithoutCreatingSession() - { - var factory = new StubBrowserSessionFactory(); - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", "not a url"), - context); - - Assert.Equal(0, factory.SessionsCreated); - Assert.Contains("absolute http(s) URL", TestContext.TextOf(result)); - } - - [Fact] - public async Task DisposesSession_AfterUse() - { - var factory = new StubBrowserSessionFactory - { - TitleResponder = (_, _) => Task.FromResult<string?>("t") - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", "https://example.com"), - context); - - Assert.Equal(1, factory.SessionsCreated); - Assert.Equal(1, factory.SessionsDisposed); - } - - [Fact] - public async Task ExtractsUrlFromFreeFormText() - { - // Simulates what RockBot's LLM actually sends — conversational wrapping - // around a URL. The capability should still find it. - var factory = new StubBrowserSessionFactory - { - TitleResponder = (url, _) => - Task.FromResult<string?>(url.ToString() == "https://lhotka.net/" ? "Rockford Lhotka" : null) - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", "Please fetch the title of https://lhotka.net."), - context); - - Assert.Equal("Rockford Lhotka", TestContext.TextOf(result)); - } - - [Fact] - public async Task AcceptsUrlFromJsonShim() - { - var factory = new StubBrowserSessionFactory - { - TitleResponder = (_, _) => Task.FromResult<string?>("Hello") - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - - var result = await capability.ExecuteAsync( - TestContext.Request("fetch-page-title", """{"url":"https://example.com"}"""), - context); - - Assert.Equal("Hello", TestContext.TextOf(result)); - } - - [Fact] - public async Task AcceptsUrl_FromMessageMetadata() - { - Uri? capturedUrl = null; - var factory = new StubBrowserSessionFactory - { - TitleResponder = (url, _) => - { - capturedUrl = url; - return Task.FromResult<string?>("From Metadata"); - } - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - var request = TestContext.RequestWithMetadata( - "fetch-page-title", - messageMetadata: new Dictionary<string, string> { ["url"] = "https://example.com" }); - - var result = await capability.ExecuteAsync(request, context); - - Assert.Equal("From Metadata", TestContext.TextOf(result)); - Assert.Equal(new Uri("https://example.com"), capturedUrl); - } - - [Fact] - public async Task AcceptsUrl_FromRequestMetadata() - { - var factory = new StubBrowserSessionFactory - { - TitleResponder = (_, _) => Task.FromResult<string?>("Request Metadata") - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - var request = TestContext.RequestWithMetadata( - "fetch-page-title", - requestMetadata: new Dictionary<string, string> { ["url"] = "https://example.com" }); - - var result = await capability.ExecuteAsync(request, context); - - Assert.Equal("Request Metadata", TestContext.TextOf(result)); - } - - [Fact] - public async Task MessageMetadata_WinsOverTextPart() - { - Uri? capturedUrl = null; - var factory = new StubBrowserSessionFactory - { - TitleResponder = (url, _) => - { - capturedUrl = url; - return Task.FromResult<string?>("ok"); - } - }; - var capability = new FetchPageTitleCapability(factory, NullLogger<FetchPageTitleCapability>.Instance); - var (context, _) = TestContext.Build(); - var request = TestContext.RequestWithMetadata( - "fetch-page-title", - text: "some conversational text with https://ignore-me.example", - messageMetadata: new Dictionary<string, string> { ["url"] = "https://authoritative.example" }); - - await capability.ExecuteAsync(request, context); - - Assert.Equal(new Uri("https://authoritative.example"), capturedUrl); - } -} diff --git a/tests/Foragent.Agent.Tests/TestDoubles.cs b/tests/Foragent.Agent.Tests/TestDoubles.cs index 67c7d2f..e09b5cf 100644 --- a/tests/Foragent.Agent.Tests/TestDoubles.cs +++ b/tests/Foragent.Agent.Tests/TestDoubles.cs @@ -88,12 +88,6 @@ internal sealed class StatusCapture internal sealed class StubBrowserSessionFactory : IBrowserSessionFactory { - public Func<Uri, CancellationToken, Task<string?>> TitleResponder { get; set; } = - (_, _) => Task.FromResult<string?>(null); - - public Func<Uri, CancellationToken, Task<PageSnapshot>> SnapshotResponder { get; set; } = - (url, _) => Task.FromResult(new PageSnapshot(url, "stub", "stub content", PageSnapshotSource.Accessibility)); - public Func<Uri, CancellationToken, Task<IBrowserPage>> PageResponder { get; set; } = (_, _) => Task.FromResult<IBrowserPage>(new StubBrowserPage()); @@ -114,12 +108,6 @@ public Task<IBrowserSession> CreateSessionAsync(Func<Uri, bool> allowedHost, Can private sealed class StubSession(StubBrowserSessionFactory owner) : IBrowserSession { - public Task<string?> FetchPageTitleAsync(Uri url, CancellationToken ct = default) => - owner.TitleResponder(url, ct); - - public Task<PageSnapshot> CapturePageSnapshotAsync(Uri url, CancellationToken ct = default) => - owner.SnapshotResponder(url, ct); - public Task<IBrowserPage> OpenPageAsync(Uri url, CancellationToken ct = default) => owner.PageResponder(url, ct); diff --git a/tests/Foragent.Browser.Tests/ExtractStructuredDataIntegrationTests.cs b/tests/Foragent.Browser.Tests/ExtractStructuredDataIntegrationTests.cs deleted file mode 100644 index 4f60dd2..0000000 --- a/tests/Foragent.Browser.Tests/ExtractStructuredDataIntegrationTests.cs +++ /dev/null @@ -1,157 +0,0 @@ -using System.ClientModel; -using System.Text.Json; -using Foragent.Capabilities; -using Microsoft.AspNetCore.Builder; -using Microsoft.AspNetCore.Http; -using Microsoft.Extensions.AI; -using Microsoft.Extensions.Logging.Abstractions; -using OpenAI; -using RockBot.A2A; -using RockBot.Host; -using RockBot.Messaging; -using Microsoft.Extensions.DependencyInjection; -using Xunit; - -namespace Foragent.Browser.Tests; - -/// <summary> -/// End-to-end test that runs the real <see cref="ExtractStructuredDataCapability"/> -/// against a local Kestrel page, driving real Playwright Chromium and a real -/// LLM. Skipped automatically when <c>FORAGENT_LLM_*</c> env vars aren't set — -/// so the unit + browser test suites still run in CI without credentials. -/// </summary> -[Collection("Playwright")] -public class ExtractStructuredDataIntegrationTests(TestPageServerFixture fixture) -{ - [SkippableFact] - public async Task ExtractsProductPrice_FromFakeShopPage() - { - var config = LlmConfig.FromEnvironment(); - Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping real-LLM test."); - - await using var shop = await StartFakeShopAsync(); - var capability = BuildCapability(config!); - var request = BuildRequest( - $"{shop.BaseUrl}/product", - "the product name and price as fields called 'name' and 'price_usd'"); - - var result = await capability.ExecuteAsync(request, BuildContext()); - - Assert.Equal(AgentTaskState.Completed, result.State); - var text = result.Message?.Parts.FirstOrDefault(p => p.Kind == "text")?.Text ?? string.Empty; - - using var doc = JsonDocument.Parse(text); - var name = doc.RootElement.GetProperty("name").GetString(); - Assert.Equal("Premium Widget", name, ignoreCase: true); - - // price_usd may come back as a number or string — accept either. - var priceElement = doc.RootElement.GetProperty("price_usd"); - var price = priceElement.ValueKind == JsonValueKind.Number - ? priceElement.GetDecimal() - : decimal.Parse(priceElement.GetString() ?? "0", System.Globalization.CultureInfo.InvariantCulture); - Assert.Equal(49.99m, price); - } - - // ── helpers ────────────────────────────────────────────────────────────── - - private ExtractStructuredDataCapability BuildCapability(LlmConfig config) - { - var openAi = new OpenAIClient( - new ApiKeyCredential(config.ApiKey), - new OpenAIClientOptions { Endpoint = new Uri(config.Endpoint) }); - var chatClient = openAi.GetChatClient(config.ModelId).AsIChatClient(); - return new ExtractStructuredDataCapability( - fixture.Factory, chatClient, NullLogger<ExtractStructuredDataCapability>.Instance); - } - - private static AgentTaskRequest BuildRequest(string url, string description) => new() - { - TaskId = Guid.NewGuid().ToString(), - Skill = ExtractStructuredDataCapability.SkillDefinition.Id, - Message = new AgentMessage - { - Role = "user", - Parts = - [ - new AgentMessagePart - { - Kind = "text", - Text = JsonSerializer.Serialize(new { url, description }) - } - ] - } - }; - - private static AgentTaskContext BuildContext() - { - var envelope = MessageEnvelope.Create( - messageType: typeof(AgentTaskRequest).FullName!, - body: ReadOnlyMemory<byte>.Empty, - source: "test"); - var messageContext = new MessageHandlerContext - { - Envelope = envelope, - Agent = new AgentIdentity("Foragent"), - Services = new ServiceCollection().BuildServiceProvider(), - CancellationToken = CancellationToken.None - }; - return new AgentTaskContext - { - MessageContext = messageContext, - PublishStatus = (_, _) => Task.CompletedTask - }; - } - - private static async Task<FakeShop> StartFakeShopAsync() - { - var builder = WebApplication.CreateEmptyBuilder(new WebApplicationOptions()); - builder.WebHost.UseKestrelCore(); - builder.WebHost.UseUrls("http://127.0.0.1:0"); - builder.Services.AddRoutingCore(); - builder.Logging.ClearProviders(); - - var app = builder.Build(); - app.UseRouting(); - app.MapGet("/product", () => Results.Content(""" - <!doctype html> - <html> - <head><title>Premium Widget | The Shop - -

Premium Widget

-

A top-of-the-line widget for demanding applications.

-

USD 49.99

- - - - """, "text/html")); - - await app.StartAsync(); - var addresses = app.Services - .GetRequiredService() - .Features.Get()! - .Addresses; - return new FakeShop(app, addresses.First().TrimEnd('/')); - } - - private sealed record FakeShop(WebApplication App, string BaseUrl) : IAsyncDisposable - { - public async ValueTask DisposeAsync() => await App.DisposeAsync(); - } - - private sealed record LlmConfig(string Endpoint, string ModelId, string ApiKey) - { - public static LlmConfig? FromEnvironment() - { - var endpoint = Environment.GetEnvironmentVariable("FORAGENT_LLM_ENDPOINT"); - var model = Environment.GetEnvironmentVariable("FORAGENT_LLM_MODEL_ID"); - var key = Environment.GetEnvironmentVariable("FORAGENT_LLM_API_KEY"); - if (string.IsNullOrWhiteSpace(endpoint) - || string.IsNullOrWhiteSpace(model) - || string.IsNullOrWhiteSpace(key)) - { - return null; - } - return new LlmConfig(endpoint, model, key); - } - } -} diff --git a/tests/Foragent.Browser.Tests/PageSnapshotTests.cs b/tests/Foragent.Browser.Tests/PageSnapshotTests.cs deleted file mode 100644 index 914a0ad..0000000 --- a/tests/Foragent.Browser.Tests/PageSnapshotTests.cs +++ /dev/null @@ -1,41 +0,0 @@ -using Xunit; - -namespace Foragent.Browser.Tests; - -[Collection("Playwright")] -public class PageSnapshotTests(TestPageServerFixture fixture) -{ - [Fact] - public async Task CapturePageSnapshotAsync_ReturnsAccessibilityRendering_ForPlainHtml() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var snapshot = await session.CapturePageSnapshotAsync(fixture.Url("/plain")); - - Assert.Equal("Hello World", snapshot.Title); - Assert.Equal(PageSnapshotSource.Accessibility, snapshot.Source); - Assert.False(string.IsNullOrWhiteSpace(snapshot.Content)); - // The body text "hi" should appear somewhere in the accessibility tree. - Assert.Contains("hi", snapshot.Content); - } - - [Fact] - public async Task CapturePageSnapshotAsync_FollowsRedirect_AndReportsFinalUrl() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var snapshot = await session.CapturePageSnapshotAsync(fixture.Url("/redirect")); - - Assert.EndsWith("/plain", snapshot.Url.AbsolutePath); - Assert.Equal("Hello World", snapshot.Title); - } - - [Fact] - public async Task CapturePageSnapshotAsync_ThrowsOn404() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - await Assert.ThrowsAsync(() => - session.CapturePageSnapshotAsync(fixture.Url("/not-found"))); - } -} diff --git a/tests/Foragent.Browser.Tests/PlaywrightBrowserSessionTests.cs b/tests/Foragent.Browser.Tests/PlaywrightBrowserSessionTests.cs deleted file mode 100644 index 2450d2a..0000000 --- a/tests/Foragent.Browser.Tests/PlaywrightBrowserSessionTests.cs +++ /dev/null @@ -1,83 +0,0 @@ -using Xunit; - -namespace Foragent.Browser.Tests; - -[Collection("Playwright")] -public class PlaywrightBrowserSessionTests(TestPageServerFixture fixture) -{ - [Fact] - public async Task FetchPageTitleAsync_ReturnsTitle_FromPlainHtml() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var title = await session.FetchPageTitleAsync(fixture.Url("/plain")); - - Assert.Equal("Hello World", title); - } - - [Fact] - public async Task FetchPageTitleAsync_ReturnsNull_WhenTitleMissing() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var title = await session.FetchPageTitleAsync(fixture.Url("/no-title")); - - Assert.Null(title); - } - - [Fact] - public async Task FetchPageTitleAsync_DecodesHtmlEntities() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var title = await session.FetchPageTitleAsync(fixture.Url("/entities")); - - Assert.Equal("AT&T \u2014 Home", title); - } - - [Fact] - public async Task FetchPageTitleAsync_ReturnsJsUpdatedTitle() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var title = await session.FetchPageTitleAsync(fixture.Url("/js-title")); - - Assert.Equal("updated by js", title); - } - - [Fact] - public async Task FetchPageTitleAsync_FollowsRedirects() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - var title = await session.FetchPageTitleAsync(fixture.Url("/redirect")); - - Assert.Equal("Hello World", title); - } - - [Fact] - public async Task FetchPageTitleAsync_ThrowsOn404() - { - await using var session = await fixture.Factory.CreateSessionAsync(); - - await Assert.ThrowsAsync(() => - session.FetchPageTitleAsync(fixture.Url("/not-found"))); - } - - [Fact] - public async Task Sessions_AreIndependent() - { - // Each session should get a fresh context per spec §3.5. A smoke check that - // the factory doesn't hand back the same session twice or leak state. - await using var sessionA = await fixture.Factory.CreateSessionAsync(); - await using var sessionB = await fixture.Factory.CreateSessionAsync(); - - Assert.NotSame(sessionA, sessionB); - - var titleA = await sessionA.FetchPageTitleAsync(fixture.Url("/plain")); - var titleB = await sessionB.FetchPageTitleAsync(fixture.Url("/entities")); - - Assert.Equal("Hello World", titleA); - Assert.Equal("AT&T \u2014 Home", titleB); - } -} From 0bb7b8d0a4289362c2280210e1a9a7e146c06194 Mon Sep 17 00:00:00 2001 From: Rockford Lhotka Date: Thu, 23 Apr 2026 00:33:08 -0500 Subject: [PATCH 2/4] Step 9 follow-ups: fixes surfaced during end-to-end validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running RockBot → Foragent with a real browser-task (MacBook-price search across apple.com + bestbuy.com) surfaced three pre-existing issues that blocked step 9's claimed end-to-end validation. Fixing them here on the step-9 branch keeps the PR's test plan honest. 1. BrowserTaskPriming required IEmbeddingGenerator (DI resolution bug) The primary-constructor parameter was annotated nullable (IEmbeddingGenerator>?), but MSDI ignores C# nullable annotations — it only honors default parameter values. Reordered to put embeddingGenerator last with = null so MSDI treats it as optional. Spec §5.6 says missing embeddings should downgrade to BM25-only retrieval; that claim is now actually true. Two test callers updated to drop the explicit embeddingGenerator: null arg. 2. Skill names with dotted hosts failed silently RockBot 0.9's FileSkillStore.ValidateName rejects '.' — every real host (bsky.app, apple.com, example.com) threw ArgumentException on save. BskySeedSkillService swallowed the throw as a startup warning, TryWriteLearnedSkillAsync swallowed it on the error path, and form schemas just never persisted. Added SkillNaming.SanitizeHost that replaces '.' → '-' (bsky.app → bsky-app) and applied it at three call sites: BskySeedSkillService, BrowserTaskCapability. TryWriteLearnedSkillAsync, LearnFormSchemaCapability.DeriveSkillName. Allowlist matching and memory-search categories keep the original dotted host — only skill names need sanitization. Test assertions (BrowserTaskCapabilityTests, BskySeedSkillServiceTests, LearnFormSchemaCapabilityTests) updated to the sanitized names; skill-optimize.md directive examples updated so the dream loop produces valid names. 3. Fresh named volume masks Dockerfile chown The Foragent Dockerfile chowns /data to the non-root foragent user (uid 1655) at image-build time, but Docker mounts a fresh named volume root-owned, masking the build-time chown. Added a foragent-init busybox one-shot (mirroring rockbot-init) that chmod -R 777 /data/foragent on volume creation. Docs updated: CLAUDE.md Status + Learning-substrate sections, docs/capabilities.md, spec §5.6 skill-naming paragraph (calls out the sanitization rule), framework-feedback step-9 follow-up section with three framework observations (MSDI nullable footgun, validator's dot rejection making real hosts fail, named-volume permissions pattern). Tests: 48 passed / 3 LLM-gated skipped. End-to-end smoke: RockBot dispatches browser-task to Foragent over the bus; Foragent plans 2 steps (navigate + snapshot), emits done with JSON result, reply lands on user.response.RockBot. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 4 +- docker-compose.yml | 15 +++++++ docs/capabilities.md | 2 +- docs/foragent-specification.md | 9 ++-- docs/framework-feedback.md | 44 +++++++++++++++++++ src/Foragent.Agent/BskySeedSkillService.cs | 13 ++++-- .../directives/skill-optimize.md | 8 ++-- .../BrowserTask/BrowserTaskCapability.cs | 5 ++- .../BrowserTask/BrowserTaskPriming.cs | 11 ++++- .../Forms/LearnFormSchemaCapability.cs | 2 +- src/Foragent.Capabilities/SkillNaming.cs | 19 ++++++++ .../BrowserTask/BrowserTaskCapabilityTests.cs | 9 ++-- .../BskySeedSkillServiceTests.cs | 14 +++--- .../Forms/LearnFormSchemaCapabilityTests.cs | 2 +- .../BrowserTaskIntegrationTests.cs | 1 - 15 files changed, 125 insertions(+), 33 deletions(-) create mode 100644 src/Foragent.Capabilities/SkillNaming.cs diff --git a/CLAUDE.md b/CLAUDE.md index 2122f27..325b500 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -102,7 +102,7 @@ Embeddings are optional. `ForagentEmbeddings:Endpoint` / `ModelId` / `ApiKey` ar On successful completion (`state.IsDone`), `BrowserTaskCapability.TryWriteLearnedSkillAsync` runs one extra synthesizer LLM turn (same `IChatClient`, no tools) to author a reusable skill at `sites/{primaryHost}/learned/{intent-slug}`. The synthesizer prompt forbids including credential values or typed field contents. Writes are skipped when the task was trivial (≤1 navigation) or the primary host can't be determined. Errors are logged but never fail the completed task. -`BskySeedSkillService` (IHostedService) seeds `sites/bsky.app/login` on first start by calling `ISkillStore.GetAsync` and only writing if absent — docker volume recreation reseeds cleanly; operator edits to the skill through other channels are preserved. +`BskySeedSkillService` (IHostedService) seeds `sites/bsky-app/login` on first start by calling `ISkillStore.GetAsync` and only writing if absent — docker volume recreation reseeds cleanly; operator edits to the skill through other channels are preserved. **Skill names sanitize host segments via `SkillNaming.SanitizeHost` (replace `.` with `-`) because RockBot 0.9's `FileSkillStore.ValidateName` rejects dots.** `bsky.app` → `bsky-app`, `apple.com` → `apple-com`. Allowlist patterns and memory categories stay unsanitized — only skill names need this. Skill naming follows spec §5.6: `sites/{host}/{intent}` for human-authored primers, `sites/{host}/learned/{slug}` for agent-generated, `sites/{host}/forms/{slug}` for learned form schemas (step 8). `Skill.SeeAlso` cross-references related skills to surface clusters rather than single entries. **Note:** `Skill` (from `RockBot.Host 0.9.x`) carries `Manifest: IReadOnlyList?` for multi-file bundles but still doesn't carry tags, metadata, or importance — the `agent-learned` vs `human-authored` vs `form-schema` distinction is encoded in the name prefix only. The dream loop (below) keeps the distinction from mattering at retrieval time: skills get improved, merged, and deduped across origins on a daily cadence; `SaveAsync(skill)` with null resources preserves the existing manifest so dream-loop markdown edits don't orphan attached resource files. @@ -113,7 +113,7 @@ Foragent runs a daily RockBot dream pass to consolidate accumulated skills and m - **Enabled:** main orchestrator (`dream.md`), skill-optimize (merge/dedup), skill-gap (detect missing coverage), sequence-skill (detect repeated tool patterns), memory-mining (promote durable observations to `ILongTermMemory`). - **Disabled:** preference inference, episode extraction, tier-routing review, entity extraction, graph consolidation, identity reflection, DLQ review, Wisp failure analysis. All personality-agent territory. -`ProtectedSkillPrefixes = []` — empty on purpose. Operator primers like `sites/bsky.app/login` are *improved in place* by the dream, not frozen; the seed service only writes on a cold boot, so later dream-authored improvements survive restarts. Operators who need to reset a primer can delete the stored skill file and bounce the host. +`ProtectedSkillPrefixes = []` — empty on purpose. Operator primers like `sites/bsky-app/login` are *improved in place* by the dream, not frozen; the seed service only writes on a cold boot, so later dream-authored improvements survive restarts. Operators who need to reset a primer can delete the stored skill file and bounce the host. Directive files live at `src/Foragent.Agent/directives/*.md` and ship with the binary via ``. `DreamService` resolves each `DreamOptions.*DirectivePath` relative to `AgentProfileOptions.BasePath` (confirmed by IL inspection — relative base paths combine against `AppContext.BaseDirectory`, which is the binary output dir). Program.cs configures `AgentProfileOptions.BasePath = "directives"`; no `WithProfile()` call, Foragent doesn't need the personality-profile doc set. diff --git a/docker-compose.yml b/docker-compose.yml index b9b9181..4cef4d1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,12 +47,27 @@ services: timeout: 3s retries: 15 + foragent-init: + # One-shot ownership fix for the foragent-data named volume. The Foragent + # Dockerfile chowns /data to the non-root `foragent` user at build time, but + # Docker mounts a fresh named volume root-owned and masks the chown — + # FileSkillStore then hits UnauthorizedAccessException on first boot. This + # init container runs as root once per volume creation and mirrors the + # rockbot-init pattern. Subsequent boots skip the mkdir+chown if already set. + image: busybox:latest + user: root + command: ["sh", "-c", "mkdir -p /data/foragent/skills /data/foragent/memory && chmod -R 777 /data/foragent"] + volumes: + - foragent-data:/data/foragent + foragent: build: context: . depends_on: rabbitmq: condition: service_healthy + foragent-init: + condition: service_completed_successfully ports: - "5210:8080" environment: diff --git a/docs/capabilities.md b/docs/capabilities.md index 48769cd..768eee7 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -17,7 +17,7 @@ Three v0.1/v0.2 specialists have been removed as `browser-task` subsumes them. The project is pre-public so no deprecation path was required: - `post-to-site` — removed in step 7. `browser-task` + the seeded - `sites/bsky.app/login` skill cover the use case. + `sites/bsky-app/login` skill cover the use case. - `fetch-page-title` — removed in step 9. Was a milestone-1 smoke target; `browser-task` with a simple intent produces the same result. - `extract-structured-data` — removed in step 9. `browser-task` with a diff --git a/docs/foragent-specification.md b/docs/foragent-specification.md index bc21bd2..33082d2 100644 --- a/docs/foragent-specification.md +++ b/docs/foragent-specification.md @@ -393,10 +393,10 @@ site knowledge, rather than building a Foragent-local store. `RockBot.Host.Abstractions` + `RockBot.Host.AgentMemoryExtensions.WithSkills()`). Stores site knowledge as markdown skills. Two origin categories: - **Human-authored skills** — operator-written primers for a site - (e.g. `sites/bsky.app/overview`). Treated as priming hints for the + (e.g. `sites/bsky-app/overview`). Treated as priming hints for the generalist planner. - **Agent-learned skills** — written by the generalist on successful - task completion (e.g. `sites/bsky.app/learned/login-flow`). Tagged + task completion (e.g. `sites/bsky-app/learned/login-flow`). Tagged with `metadata.source = "agent-learned"` and an importance score. - **`ILongTermMemory`** (file-backed, BM25 + semantic — `WithLongTermMemory()`). Declarative observations that don't fit the @@ -404,7 +404,10 @@ site knowledge, rather than building a Foragent-local store. facts. **Skill naming:** `sites/{host}/{phase-or-intent}` — e.g. -`sites/bsky.app/login`, `sites/bsky.app/compose-post`. Hierarchical `/` +`sites/bsky-app/login`, `sites/bsky-app/compose-post`. Host segments are +sanitized (`.` → `-`) because RockBot 0.9's `FileSkillStore.ValidateName` +rejects dots; `bsky.app` becomes `bsky-app`. Allowlists and memory +categories keep the original dotted host. Hierarchical `/` nesting is supported by the store. `seeAlso` links cross-reference skills for the same site so retrieval surfaces a small knowledge cluster, not one skill at a time. diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md index f584fd0..036530e 100644 --- a/docs/framework-feedback.md +++ b/docs/framework-feedback.md @@ -632,3 +632,47 @@ observations are few. Advertised surface lands at three skills: - **No spec open-questions closed or opened.** Open items #3, #4, #5, #7 remain as written; #6 and #8 closed in step 8; #1 and #2 closed in v0.2 spec adoption. v0.2 is the shipped minimum surface. + +### Follow-up fixes surfaced while validating step 9 + +Running RockBot → Foragent end-to-end (MacBook-price search across +apple.com + bestbuy.com) surfaced three pre-existing issues. All fixed +on the step-9 branch since step 9's test plan claims end-to-end +validation that didn't actually work without them. + +- **`BrowserTaskPriming` required `IEmbeddingGenerator`** — the + primary-constructor parameter was already annotated nullable + (`IEmbeddingGenerator>?`), but MSDI ignores + C# nullable annotations; it only honors default parameter values. + Reordered to put `embeddingGenerator` last with `= null` so MSDI + treats it as optional. Spec §5.6 says missing embeddings should + downgrade to BM25-only — this made that claim actually true. + Framework observation: MSDI's "nullable means optional" footgun is + well-documented but still catches people; worth a sentence in the + RockBot host-wiring docs if they exist. +- **Skill names with dotted hosts fail silently** — RockBot 0.9's + `FileSkillStore.ValidateName` rejects `.` (only alphanumeric, + hyphens, underscores, `/`). `sites/bsky.app/login`, + `sites/apple.com/learned/…`, `sites/example.com/forms/…` — all + common real hosts — threw `ArgumentException` on save, which + `BskySeedSkillService` swallowed as a warning and + `TryWriteLearnedSkillAsync` swallowed on the error path. Added + `SkillNaming.SanitizeHost(host)` that replaces `.` → `-` + (`bsky.app` → `bsky-app`). Three call sites updated: + `BskySeedSkillService`, `BrowserTaskCapability.TryWriteLearnedSkillAsync`, + `LearnFormSchemaCapability.DeriveSkillName`. Framework observation: + the validator's error is informative but the fact that *every real + host fails validation* suggests either `.` should be allowed in + skill names (it's fine on filesystems) or the framework should + offer a canonical sanitizer so every consumer doesn't reinvent one. + Allowlist matching and memory-search categories keep the original + dotted host. +- **Named-volume permissions on fresh compose boot** — the Foragent + Dockerfile chowns `/data` to the non-root `foragent` user (uid 1655) + at image-build time, but Docker mounts a fresh named volume + root-owned and masks the build-time chown. Added a `foragent-init` + busybox one-shot (mirroring the `rockbot-init` pattern) that + `chmod -R 777 /data/foragent` on volume creation. Harness issue, + not framework — noting here because it's the kind of thing that + could bite any RockBot consumer that mounts persistent state as a + named volume. diff --git a/src/Foragent.Agent/BskySeedSkillService.cs b/src/Foragent.Agent/BskySeedSkillService.cs index 4b2947c..8995064 100644 --- a/src/Foragent.Agent/BskySeedSkillService.cs +++ b/src/Foragent.Agent/BskySeedSkillService.cs @@ -5,7 +5,7 @@ namespace Foragent.Agent; /// -/// Seeds the operator-authored sites/bsky.app/login skill into the +/// Seeds the operator-authored sites/bsky-app/login skill into the /// configured on startup if it is not already /// present. Idempotent — subsequent starts are no-ops, so recreating the /// persistence volume does not wipe human-authored priming, but editing the @@ -14,12 +14,17 @@ namespace Foragent.Agent; /// Human-authored skills seed the generalist planner with site-specific /// primers (spec §5.6). Agent-learned skills are written by the capability /// itself on successful task completion. +/// +/// Note: the host segment is bsky-app (not bsky.app) because +/// RockBot 0.9's FileSkillStore.ValidateName rejects dots — all +/// skill names with hosts use the +/// substitution. /// internal sealed class BskySeedSkillService( ISkillStore skillStore, ILogger logger) : IHostedService { - private const string SkillName = "sites/bsky.app/login"; + private const string SkillName = "sites/bsky-app/login"; private const string SkillSummary = "Log in to bsky.app with an app password; look for 2FA challenges."; @@ -80,7 +85,7 @@ before assuming it worked. ## See also - - `sites/bsky.app/compose-post` once that skill is learned. + - `sites/bsky-app/compose-post` once that skill is learned. """; public async Task StartAsync(CancellationToken cancellationToken) @@ -101,7 +106,7 @@ public async Task StartAsync(CancellationToken cancellationToken) CreatedAt: DateTimeOffset.UtcNow, UpdatedAt: null, LastUsedAt: null, - SeeAlso: ["sites/bsky.app/compose-post"]); + SeeAlso: ["sites/bsky-app/compose-post"]); await skillStore.SaveAsync(skill); logger.LogInformation("Seeded human-authored skill '{Name}'.", SkillName); diff --git a/src/Foragent.Agent/directives/skill-optimize.md b/src/Foragent.Agent/directives/skill-optimize.md index 079aaf7..e0eda6b 100644 --- a/src/Foragent.Agent/directives/skill-optimize.md +++ b/src/Foragent.Agent/directives/skill-optimize.md @@ -34,7 +34,7 @@ Merge two skills when **all three** are true: - The combined skill would still fit comfortably inside ~400 words. Do not merge skills that happen to target the same site but different -intents (e.g. `sites/bsky.app/login` vs `sites/bsky.app/compose-post`). +intents (e.g. `sites/bsky-app/login` vs `sites/bsky-app/compose-post`). Those stay separate — different retrieval contexts. ## Output format @@ -48,9 +48,9 @@ For each change you want to make, emit one of: Example: ``` -DELETE sites/bsky.app/learned/sign-in-with-app-password -DELETE sites/bsky.app/learned/log-into-bluesky -UPSERT sites/bsky.app/login | Log in to bsky.app with an app password; watch for 2FA challenges. +DELETE sites/bsky-app/learned/sign-in-with-app-password +DELETE sites/bsky-app/learned/log-into-bluesky +UPSERT sites/bsky-app/login | Log in to bsky.app with an app password; watch for 2FA challenges. Bluesky's public web app is at https://bsky.app… (full markdown body) --- diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs index c1ca8a9..292d86a 100644 --- a/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs @@ -185,8 +185,9 @@ private async Task TryWriteLearnedSkillAsync( if (string.IsNullOrEmpty(primaryHost)) return; + var hostSegment = SkillNaming.SanitizeHost(primaryHost); var slug = Slugify(input.Intent!); - var name = $"sites/{primaryHost}/learned/{slug}"; + var name = $"sites/{hostSegment}/learned/{slug}"; try { @@ -213,7 +214,7 @@ private async Task TryWriteLearnedSkillAsync( CreatedAt: existing?.CreatedAt ?? DateTimeOffset.UtcNow, UpdatedAt: existing is null ? null : DateTimeOffset.UtcNow, LastUsedAt: DateTimeOffset.UtcNow, - SeeAlso: [$"sites/{primaryHost}/login"]); + SeeAlso: [$"sites/{hostSegment}/login"]); await skillStore.SaveAsync(skill); logger.LogInformation("Wrote learned skill '{Name}' ({Nav} navigations, {Steps} steps).", name, state.Navigations.Count, state.Steps); diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskPriming.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskPriming.cs index c47bda6..aa99be3 100644 --- a/src/Foragent.Capabilities/BrowserTask/BrowserTaskPriming.cs +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskPriming.cs @@ -14,11 +14,18 @@ namespace Foragent.Capabilities.BrowserTask; /// Isolated from so tests can inject /// fake stores without going through the capability's full execute path. /// +/// +/// is placed last with a default of +/// null so MSDI resolves the type even when no embedding generator is +/// registered — the C# nullable annotation alone is not enough; MSDI only +/// honors optional parameters via default values (spec §5.6 — "missing +/// embeddings downgrade retrieval to BM25-only"). +/// public sealed class BrowserTaskPriming( ISkillStore skillStore, ILongTermMemory longTermMemory, - IEmbeddingGenerator>? embeddingGenerator, - ILogger logger) + ILogger logger, + IEmbeddingGenerator>? embeddingGenerator = null) { public const int MaxSkills = 5; public const int MaxMemories = 5; diff --git a/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs b/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs index 83769cd..da37079 100644 --- a/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs +++ b/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs @@ -197,7 +197,7 @@ private static string BuildPrimer(FormSchema schema, string? intent) private static string DeriveSkillName(Uri url, string? intent) { - var host = url.Host.ToLowerInvariant(); + var host = SkillNaming.SanitizeHost(url.Host.ToLowerInvariant()); var slug = string.IsNullOrWhiteSpace(intent) ? SlugFromPath(url.AbsolutePath) : Slugify(intent!); diff --git a/src/Foragent.Capabilities/SkillNaming.cs b/src/Foragent.Capabilities/SkillNaming.cs new file mode 100644 index 0000000..7fc4b6b --- /dev/null +++ b/src/Foragent.Capabilities/SkillNaming.cs @@ -0,0 +1,19 @@ +namespace Foragent.Capabilities; + +/// +/// Shared helpers for constructing skill names that pass RockBot 0.9's +/// FileSkillStore.ValidateName — only alphanumeric, hyphens, +/// underscores, and / are allowed. Real hosts (bsky.app, +/// apple.com) contain dots, so anything that embeds a host inside a +/// skill name must go through . +/// +internal static class SkillNaming +{ + /// + /// Replaces characters the skill-store validator rejects. Dots become + /// hyphens (bsky.appbsky-app) — readable, reversible + /// by humans, and keeps the host recognizable in the stored skill path. + /// + public static string SanitizeHost(string host) => + host.Replace('.', '-'); +} diff --git a/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs index e93b3b0..623ab31 100644 --- a/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs +++ b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs @@ -171,7 +171,6 @@ internal static ( var priming = new BrowserTaskPriming( skillStore, memory, - embeddingGenerator: null, NullLogger.Instance); var capability = new BrowserTaskCapability( @@ -192,7 +191,7 @@ public async Task Priming_InjectsRetrievedSkillIntoUserPrompt() { var skills = new FakeSkillStore(); await skills.SaveAsync(new Skill( - Name: "sites/example.com/login", + Name: "sites/example-com/login", Summary: "Use the app password, not the account password.", Content: "Click 'Sign in', enter handle, then password.", CreatedAt: DateTimeOffset.UtcNow, @@ -214,7 +213,7 @@ await capability.ExecuteAsync( var userMessage = scripted.FirstMessages.Single(m => m.Role == ChatRole.User).Text ?? string.Empty; Assert.Contains("Known site knowledge", userMessage); - Assert.Contains("sites/example.com/login", userMessage); + Assert.Contains("sites/example-com/login", userMessage); Assert.Contains("app password", userMessage); } @@ -243,7 +242,7 @@ public async Task LearnedSkill_IsWrittenOnSuccess_WhenMultipleNavigations() ctx); Assert.Equal(AgentTaskState.Completed, result.State); - var learned = skills.Saved.Keys.SingleOrDefault(k => k.StartsWith("sites/example.com/learned/")); + var learned = skills.Saved.Keys.SingleOrDefault(k => k.StartsWith("sites/example-com/learned/")); Assert.NotNull(learned); var skill = skills.Saved[learned!]; Assert.Equal("Navigate home then click the details link.", skill.Summary); @@ -268,7 +267,7 @@ await capability.ExecuteAsync( """{"intent":"read one page","url":"https://example.com/","allowedHosts":["example.com"]}"""), ctx); - Assert.DoesNotContain(skills.Saved.Keys, k => k.StartsWith("sites/example.com/learned/")); + Assert.DoesNotContain(skills.Saved.Keys, k => k.StartsWith("sites/example-com/learned/")); } private sealed class StubCredentialBroker : ICredentialBroker diff --git a/tests/Foragent.Agent.Tests/BskySeedSkillServiceTests.cs b/tests/Foragent.Agent.Tests/BskySeedSkillServiceTests.cs index 5a8c542..7719cb3 100644 --- a/tests/Foragent.Agent.Tests/BskySeedSkillServiceTests.cs +++ b/tests/Foragent.Agent.Tests/BskySeedSkillServiceTests.cs @@ -18,10 +18,10 @@ public async Task Seed_IsWritten_WhenSkillMissing() await service.StartAsync(CancellationToken.None); - Assert.True(store.Saved.ContainsKey("sites/bsky.app/login")); - var skill = store.Saved["sites/bsky.app/login"]; + Assert.True(store.Saved.ContainsKey("sites/bsky-app/login")); + var skill = store.Saved["sites/bsky-app/login"]; Assert.Contains("app password", skill.Summary, StringComparison.OrdinalIgnoreCase); - Assert.Contains("sites/bsky.app/compose-post", skill.SeeAlso!); + Assert.Contains("sites/bsky-app/compose-post", skill.SeeAlso!); } [Fact] @@ -29,7 +29,7 @@ public async Task Seed_LeavesExistingSkillIntact() { var store = new FakeSkillStore(); var existing = new Skill( - Name: "sites/bsky.app/login", + Name: "sites/bsky-app/login", Summary: "operator-edited summary", Content: "operator-edited content", CreatedAt: DateTimeOffset.UtcNow.AddDays(-7), @@ -41,7 +41,7 @@ public async Task Seed_LeavesExistingSkillIntact() var service = new BskySeedSkillService(store, NullLogger.Instance); await service.StartAsync(CancellationToken.None); - var after = store.Saved["sites/bsky.app/login"]; + var after = store.Saved["sites/bsky-app/login"]; Assert.Equal("operator-edited summary", after.Summary); Assert.Equal("operator-edited content", after.Content); } @@ -53,9 +53,9 @@ public async Task Seed_IsNoop_OnSecondStart() var service = new BskySeedSkillService(store, NullLogger.Instance); await service.StartAsync(CancellationToken.None); - var firstContent = store.Saved["sites/bsky.app/login"].Content; + var firstContent = store.Saved["sites/bsky-app/login"].Content; await service.StartAsync(CancellationToken.None); - var secondContent = store.Saved["sites/bsky.app/login"].Content; + var secondContent = store.Saved["sites/bsky-app/login"].Content; Assert.Same(firstContent, secondContent); } diff --git a/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs b/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs index 489b798..868f5f3 100644 --- a/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs +++ b/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs @@ -70,7 +70,7 @@ public async Task ReturnsSchema_AndPersistsSkillWithResource() using var doc = JsonDocument.Parse(TestContext.TextOf(result)); Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); var skillName = doc.RootElement.GetProperty("skillName").GetString()!; - Assert.StartsWith("sites/example.com/forms/", skillName); + Assert.StartsWith("sites/example-com/forms/", skillName); var saved = skills.Saved[skillName]; Assert.NotNull(saved.Manifest); diff --git a/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs b/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs index 82c6bb1..1f11674 100644 --- a/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs +++ b/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs @@ -172,7 +172,6 @@ private BrowserTaskCapability BuildCapability(LlmConfig config) var priming = new BrowserTaskPriming( skillStore, memory, - embeddingGenerator: null, NullLogger.Instance); return new BrowserTaskCapability( From 4c2ac7f7bc4d6828f56bce029a7b86ec88776f8c Mon Sep 17 00:00:00 2001 From: Rockford Lhotka Date: Thu, 23 Apr 2026 01:29:34 -0500 Subject: [PATCH 3/4] Step 9 follow-ups: A2A DataPart input + agent-card hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When RockBot's LLM called invoke_agent with free-form prose (message= "...allowedHosts: ['*']..."), Foragent's three input parsers kept rejecting with 'Missing allowedHosts' — they only consumed text parts and expected a JSON object. RockBot 0.9.11+ supports structured input via an A2A DataPart (AgentMessagePart{Kind="data", Data=}), but Foragent never advertised that it consumed data parts and the invoke_agent tool description steered the LLM to omit 'data' unless the target "is known to consume data." Result: loop. Fix spans three surfaces: 1. Parsers accept DataPart. BrowserTaskInput, LearnFormSchemaInput, and ExecuteFormBatchInput now look for a Kind="data" part first and use its Data string as the JSON source. Text-JSON fallback stays (curl callers), and for browser-task, a prose text part serves as the intent fallback when the data part doesn't supply one. Metadata overrides remain. 2. Skill descriptions explicitly direct callers to use the data parameter. Each SkillDefinition.Description now leads with "PASS INPUT AS AN A2A DATA PART (a structured JSON object), not as prose inside the text message. When calling via RockBot's invoke_agent, populate the 'data' parameter with this object." Matching entries in deploy/rockbot-seed/well-known-agents.json updated so the LLM sees the same guidance through list_known_agents. 3. Tests. Four new unit tests: one per input parser verifying a DataPart with JSON is consumed; one for browser-task's text-as- intent fallback when the data part omits intent. TestContext gained RequestWithData(...) to build the dual-part shape RockBot's invoke_agent produces. Image bumped to rockylhotka/rockbot-agent:0.9.14 — softens the invoke_agent 'data' tool description upstream, complementing the skill-description hints on the Foragent side. CLAUDE.md Status paragraph updated. Docs: CLAUDE.md Capabilities section gains a note on the DataPart contract. framework-feedback step-9 follow-up section extended with the three-surface lesson (sender tool description ↔ target skill description ↔ target parser all need to agree on the canonical shape). Tests: 52 passed / 3 LLM-gated skipped. Build clean. Curl smoke (text-JSON path) returns valid JSON via browser-task unchanged. Live Blazor end-to-end test is next, against the updated 0.9.14 rockbot image. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 3 +- deploy/rockbot-seed/well-known-agents.json | 8 +-- docker-compose.yml | 6 +-- docs/framework-feedback.md | 27 ++++++++++ .../BrowserTask/BrowserTaskCapability.cs | 6 ++- .../BrowserTask/BrowserTaskInput.cs | 37 ++++++++++++-- .../Forms/ExecuteFormBatchCapability.cs | 5 +- .../Forms/ExecuteFormBatchInput.cs | 25 ++++++++-- .../Forms/LearnFormSchemaCapability.cs | 5 +- .../Forms/LearnFormSchemaInput.cs | 21 ++++++-- .../BrowserTask/BrowserTaskCapabilityTests.cs | 49 +++++++++++++++++++ .../Forms/ExecuteFormBatchCapabilityTests.cs | 34 +++++++++++++ .../Forms/LearnFormSchemaCapabilityTests.cs | 23 +++++++++ tests/Foragent.Agent.Tests/TestDoubles.cs | 29 +++++++++++ 14 files changed, 254 insertions(+), 24 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 325b500..2988001 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Status -Foragent is at **milestone 9 shipped — v0.2 complete**. Three capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.11`, which brings the structured-data `invoke_agent` surface from PR #291 so RockBot can consume Foragent's FormSchema JSON natively rather than as prose). Step 9 removed the v0.1 `fetch-page-title` and `extract-structured-data` specialists: both are reachable via `browser-task` (the planner's `done(result=...)` channel carries JSON when the intent asks for it), at roughly 2–3× the tokens per call — acceptable given zero deterministic high-volume callers today. `extract-structured-data` was also out of spec on §7.1 mandatory allowlists, which the generalist enforces by design. Session-level `FetchPageTitleAsync` / `CapturePageSnapshotAsync` were removed from `IBrowserSession` alongside the capabilities that used them. RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. +Foragent is at **milestone 9 shipped — v0.2 complete**. Three capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.14`, which brings the structured-data `invoke_agent` surface from PR #291 (0.9.11) so RockBot can consume Foragent's FormSchema JSON natively rather than as prose, plus a softened `data`-parameter description in 0.9.14 that encourages the caller LLM to use DataParts when the target skill description advertises them). Step 9 removed the v0.1 `fetch-page-title` and `extract-structured-data` specialists: both are reachable via `browser-task` (the planner's `done(result=...)` channel carries JSON when the intent asks for it), at roughly 2–3× the tokens per call — acceptable given zero deterministic high-volume callers today. `extract-structured-data` was also out of spec on §7.1 mandatory allowlists, which the generalist enforces by design. Session-level `FetchPageTitleAsync` / `CapturePageSnapshotAsync` were removed from `IBrowserSession` alongside the capabilities that used them. RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. ## Build / test @@ -86,6 +86,7 @@ Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any - `ForagentTaskHandler` is a pure dispatcher that resolves `IEnumerable` from DI and routes on `SkillId`. **Do not add skill-specific logic to the handler.** New capabilities go in new `ICapability` classes. - `ForagentCapabilities.Skills` (static array) is the single source of truth for advertised skills — both the bus-side `AgentCard.Skills` and the HTTP gateway's `opts.Skills` read from it. - Capabilities parse their own input near the capability (`BrowserTaskInput` in `BrowserTask/`, the `*Input` classes in `Forms/`). The `CapabilityInput.Parse` shared URL+description shim was removed in step 9 with its only consumers. +- **Structured input arrives on an A2A DataPart** (`AgentMessagePart { Kind = "data", Data = , MimeType = "application/json" }`) — the shape RockBot's `invoke_agent` tool produces when the caller populates its `data` parameter. All three Foragent parsers look for a data part first, then fall back to JSON in the text part (for curl callers), then metadata overrides either source. Every `SkillDefinition.Description` tells the LLM to pass inputs via the `data` parameter, not in the text message — otherwise the LLM writes prose like `allowedHosts: ['*']` that strict JSON parsers reject. Text parts remain the home for human-readable prose (used as `intent` fallback in `browser-task` when the data part doesn't carry one). - `browser-task` (in `BrowserTask/`) is the generalist planner (spec §5.2). `BrowserTaskInput` parses intent + mandatory `allowedHosts` + optional `url` / `credentialId` / `maxSteps` (default 60, ceiling 150) / `maxSeconds` (default 120, ceiling 600). `BrowserTaskTools` wraps `snapshot` / `navigate` / `click` / `type` / `wait_for` / `done` / `fail` as `AIFunction`s via `AIFunctionFactory.Create` and passes them in `ChatOptions.Tools`; the RockBot-wrapped function-invoking `IChatClient` runs the full model ↔ tool loop inside one `GetResponseAsync` call. Budget is enforced tool-side (each tool checks `BrowserTaskState.BudgetExhausted`) because Microsoft.Extensions.AI does not surface per-request iteration caps through `ChatOptions`; wall-clock is a linked `CancellationTokenSource`. **Never log tool arguments verbatim** — `type` carries user-supplied values that may be sensitive (log length only). Refs from a snapshot are valid only until the next mutating call; the system prompt and tool descriptions both state this, but don't code anything that assumes cross-snapshot ref stability. - `learn-form-schema` and `execute-form-batch` (both in `Forms/`) are the step-8 phase-1 / phase-3 pair (spec §5.5). `FormSchema` / `FormField` are the wire contract — stable JSON shape, stored via `FormSchema.SerializerOptions`. `LearnFormSchemaCapability` navigates, calls `IBrowserPage.ScanFormAsync`, maps the raw scan to `FormSchema` via the deterministic `FormSchemaMapper` (pure — no LLM), then runs `FormSchemaEnricher` for one LLM turn to infer `dependsOn` and a note (skipped when there are no select/radio fields). The schema is persisted as a `Skill` bundle at `sites/{host}/forms/{slug}` with a `SkillResourceType.JsonSchema` resource named `schema.json`. Only the enricher can add `dependsOn` / `notes` — structural fields (type, selector, required, options) come only from the DOM scan, so the LLM cannot fabricate fields or rewrite selectors. `ExecuteFormBatchCapability` accepts `schemaRef` (resolves via `ISkillStore.GetResourceAsync(name, "schema.json")`) or an inline `schema`, and submits each row with `FillAsync` / `SelectOptionAsync` / `SetCheckedAsync` per field type. Per-row progress is published via `AgentTaskContext.PublishStatus(new AgentTaskStatusUpdate { State = Working, Message = … })`. Default `mode` is `"abort-on-first"` (spec open-question #8 resolution: a failed submit on a mutating form usually indicates a real problem, so continuing would just generate more bad submissions); callers opt into `"continue"` for known-messy batches. Success detection: an optional `successIndicator` CSS selector is the preferred signal; the fallback is URL change after submit, which fails (correctly) for forms that submit in place. File uploads and multi-step wizards are **out of scope** for v0.2 — `ScanFormAsync` explicitly skips `type=file`, and there's no flow control for wizards. diff --git a/deploy/rockbot-seed/well-known-agents.json b/deploy/rockbot-seed/well-known-agents.json index 83a1532..2f5f05d 100644 --- a/deploy/rockbot-seed/well-known-agents.json +++ b/deploy/rockbot-seed/well-known-agents.json @@ -1,7 +1,7 @@ [ { "agentName": "Foragent", - "description": "Browser agent — navigates pages with Chromium and exposes task-level skills over HTTP A2A.", + "description": "Browser agent — navigates pages with Chromium and exposes task-level skills over HTTP A2A. All skills consume structured input on the A2A DataPart (invoke_agent 'data' parameter); the text 'message' is only for human-readable summaries.", "version": "1.0", "url": "http://foragent:8080", "protocolVersion": "1.0", @@ -11,17 +11,17 @@ { "id": "browser-task", "name": "Browser Task (generalist)", - "description": "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. Input JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. allowedHosts is required and empty rejects. Returns a structured JSON result with status (done/failed/incomplete), summary, optional result, step count, and navigations." + "description": "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. PASS INPUT AS AN A2A DATA PART — populate invoke_agent's 'data' parameter with {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. 'intent' and 'allowedHosts' are required; an empty allowlist rejects the task. Use [\"*\"] explicitly when any host is acceptable. Returns a structured JSON result with status (done/failed/incomplete), summary, optional result, step count, and navigations." }, { "id": "learn-form-schema", "name": "Learn Form Schema", - "description": "Navigate to a URL, introspect the first form, and return a typed FormSchema (fields, types, required, options, dependencies). Persists the schema as a skill at sites/{host}/forms/{slug} for later reuse by execute-form-batch." + "description": "Navigate to a web form, extract its structure (fields, types, options, validation), and persist it as a reusable skill. PASS INPUT AS AN A2A DATA PART — populate invoke_agent's 'data' parameter with {\"url\":\"https://...\",\"allowedHosts\":[\"host\"],\"formSelector\":\"optional\",\"credentialId\":\"optional\",\"skillName\":\"optional override\",\"intent\":\"optional prose\"}. 'url' and 'allowedHosts' are required. Returns the typed form schema plus the skill name it was persisted under." }, { "id": "execute-form-batch", "name": "Execute Form Batch", - "description": "Given a learned FormSchema (by ref or inline) and a batch of row data, submit the form once per row. Streams per-row progress. Default mode abort-on-first; opt into 'continue' for known-messy batches." + "description": "Submit a batch of rows against a learned form schema. PASS INPUT AS AN A2A DATA PART — populate invoke_agent's 'data' parameter with {\"schemaRef\":\"sites/host/forms/name\" OR \"schema\":{...FormSchema...},\"rows\":[{fieldName:value,...}],\"allowedHosts\":[\"host\"],\"credentialId\":\"optional\",\"mode\":\"abort-on-first\"|\"continue\",\"successIndicator\":\"optional CSS selector\"}. 'rows', 'allowedHosts', and exactly one of schemaRef/schema are required. Streams per-row progress. Default mode aborts on first failure." } ] } diff --git a/docker-compose.yml b/docker-compose.yml index 4cef4d1..94a2c1b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ # - foragent — this project; exposes HTTP A2A on port 5210 # - rockbot-init — seeds /data/agent with RockBot profile + well-known-agents.json # pointing at foragent -# - rockbot — rockylhotka/rockbot-agent:0.9.11, configured to know Foragent +# - rockbot — rockylhotka/rockbot-agent:0.9.14, configured to know Foragent # as an A2A peer it can delegate tasks to. 0.9.11 brings # the structured-data invoke_agent surface (PR #291) so # RockBot can consume Foragent's FormSchema JSON results @@ -115,7 +115,7 @@ services: - foragent-data:/data/foragent rockbot-init: - image: rockylhotka/rockbot-agent:0.9.11 + image: rockylhotka/rockbot-agent:0.9.14 user: root entrypoint: ["/bin/sh", "-c"] command: @@ -151,7 +151,7 @@ services: - ./deploy/rockbot-seed:/seed:ro rockbot: - image: rockylhotka/rockbot-agent:0.9.11 + image: rockylhotka/rockbot-agent:0.9.14 depends_on: rockbot-init: condition: service_completed_successfully diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md index 036530e..fdbf2de 100644 --- a/docs/framework-feedback.md +++ b/docs/framework-feedback.md @@ -676,3 +676,30 @@ validation that didn't actually work without them. not framework — noting here because it's the kind of thing that could bite any RockBot consumer that mounts persistent state as a named volume. +- **Structured A2A input requires a DataPart, not JSON-in-text** — + running RockBot → Foragent through the Blazor UI, the caller LLM + kept looping on `"Missing 'allowedHosts'"` rejections. Root cause: + Foragent's three input parsers (`BrowserTaskInput`, + `LearnFormSchemaInput`, `ExecuteFormBatchInput`) only consumed the + first `Kind = "text"` part, requiring JSON-as-a-string. RockBot + 0.9.11's `invoke_agent` tool instead sends structured fields as a + separate `AgentMessagePart { Kind = "data", Data = , + MimeType = "application/json" }` when its `data` parameter is + filled — the proper A2A DataPart shape. But Foragent never + advertised that it consumed data parts, and `invoke_agent`'s tool + description steered the LLM away from sending them unless the + target was "known to consume data." The LLM saw Foragent's skill + description say "Input: JSON {…}" and interpreted that as + "stringify the JSON into message," which strict parsers reject, + which looped. **Fix**: taught all three parsers to read a + `Kind = "data"` part first; taught the skill descriptions to + explicitly say "PASS INPUT AS AN A2A DATA PART — populate + invoke_agent's 'data' parameter with …"; updated the seed card + identically. Framework-level observation: the A2A protocol cleanly + supports typed payloads via DataPart, but the interoperability + chain (sender's tool description ↔ target's skill description ↔ + target's parser) all need to agree that the data part is the + canonical shape. RockBot's `invoke_agent` tool description was + softened upstream in 0.9.14 (removing the "most agents only + understand text" steer) to help with this — consumers should + upgrade when available. diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs index 292d86a..483831c 100644 --- a/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs @@ -34,7 +34,11 @@ public sealed class BrowserTaskCapability( Id = "browser-task", Name = "Browser Task (generalist)", Description = "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. " - + "Input: JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. " + + "PASS INPUT AS AN A2A DATA PART (a structured JSON object), not as prose inside the text message. " + + "When calling via RockBot's invoke_agent, populate the 'data' parameter with this object; " + + "the text 'message' should only be a human-readable summary. " + + "Fields: {\"intent\":\"what to accomplish\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. " + + "'intent' and 'allowedHosts' are REQUIRED — an empty allowlist rejects the task (spec §7.1). Use [\"*\"] explicitly when any host is acceptable. " + "Returns a short summary plus optional structured result string." }; diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs index cb49508..12e3157 100644 --- a/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs @@ -6,9 +6,18 @@ namespace Foragent.Capabilities.BrowserTask; /// /// Parses the browser-task input shape (spec §5.2). /// -/// Accepts either a JSON object in the first text part or field-by-field -/// metadata on the message/request. Metadata overrides JSON when both are -/// present. Shape: +/// Accepts structured input in three forms, checked in order: +/// +/// An A2A DataPart (message part with Kind = "data", +/// MimeType = "application/json") carrying the JSON object. This +/// is the preferred shape and the one RockBot's invoke_agent tool +/// produces when the caller fills its data parameter. +/// A JSON object in the first text part (curl callers, agents that +/// don't use data parts). +/// Field-by-field metadata on the message/request. Metadata +/// overrides fields from the JSON source when both are present. +/// +/// Shape: /// /// intent — required. Free-form description of what to do. /// allowedHosts — required. Array of host patterns. Empty rejects. @@ -41,18 +50,30 @@ public static BrowserTaskInput Parse(AgentTaskRequest request) int? maxSteps = null; int? maxSeconds = null; + // Preferred shape: RockBot's invoke_agent sends structured fields as + // a DataPart alongside the prose text part. Prefer it when present so + // the text part can stay human-readable. + var dataJson = request.Message.Parts + .Where(p => p.Kind == "data" && !string.IsNullOrWhiteSpace(p.Data)) + .Select(p => p.Data) + .FirstOrDefault(); + var text = request.Message.Parts .Where(p => p.Kind == "text") .Select(p => p.Text) .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t)) ?.Trim(); - if (!string.IsNullOrEmpty(text) && text.StartsWith('{')) + string? jsonPayload = dataJson ?? (!string.IsNullOrEmpty(text) && text.StartsWith('{') ? text : null); + + if (jsonPayload is not null) { try { - using var doc = JsonDocument.Parse(text); + using var doc = JsonDocument.Parse(jsonPayload); var root = doc.RootElement; + if (root.ValueKind != JsonValueKind.Object) + return Fail("Structured input must be a JSON object."); if (root.TryGetProperty("intent", out var i)) intent = i.GetString(); if (root.TryGetProperty("url", out var u)) url = u.GetString(); if (root.TryGetProperty("credentialId", out var c)) credentialId = c.GetString(); @@ -67,6 +88,12 @@ public static BrowserTaskInput Parse(AgentTaskRequest request) { return Fail("Input must be a JSON object with intent, allowedHosts, and optional url/credentialId/maxSteps/maxSeconds."); } + + // When a DataPart carried the structured fields, the text part is + // the human-readable prose from the caller — use it as a fallback + // intent if the data part didn't supply one. + if (string.IsNullOrWhiteSpace(intent) && !string.IsNullOrEmpty(text) && !text.StartsWith('{')) + intent = text; } else if (!string.IsNullOrEmpty(text)) { diff --git a/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs b/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs index 3b83aee..08df635 100644 --- a/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs +++ b/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs @@ -30,7 +30,10 @@ public sealed class ExecuteFormBatchCapability( Id = "execute-form-batch", Name = "Execute Form Batch", Description = "Submit a batch of rows against a learned form schema. " - + "Input: JSON {\"schemaRef\":\"sites/host/forms/name\" OR \"schema\":{...FormSchema...},\"rows\":[{field:value,...}],\"allowedHosts\":[\"host\"],\"credentialId\":\"optional\",\"mode\":\"abort-on-first\"|\"continue\",\"successIndicator\":\"optional CSS selector\"}. " + + "PASS INPUT AS AN A2A DATA PART (a structured JSON object), not as prose inside the text message. " + + "When calling via RockBot's invoke_agent, populate the 'data' parameter with this object — the multi-row shape doesn't fit in plain text. " + + "Fields: {\"schemaRef\":\"sites/host/forms/name\" OR \"schema\":{...FormSchema...},\"rows\":[{fieldName:value,...}],\"allowedHosts\":[\"host\"],\"credentialId\":\"optional\",\"mode\":\"abort-on-first\"|\"continue\",\"successIndicator\":\"optional CSS selector\"}. " + + "'rows', 'allowedHosts', and exactly one of 'schemaRef'/'schema' are REQUIRED. " + "Streams per-row progress. Default mode aborts on first failure." }; diff --git a/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs b/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs index 82d9fdd..d55d8cc 100644 --- a/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs +++ b/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs @@ -7,8 +7,13 @@ namespace Foragent.Capabilities.Forms; /// /// Parses input for execute-form-batch (spec §5.2, phase-3 of §5.5). /// -/// Shape (JSON in the first text part — metadata pass-through doesn't fit -/// multi-row shapes): +/// Accepts input on either an A2A DataPart (preferred — the shape +/// RockBot's invoke_agent produces when its data parameter is +/// filled) or a JSON object in the first text part (curl callers). Metadata +/// pass-through is not supported — the multi-row shape doesn't fit in +/// string-valued metadata. +/// +/// Shape: /// /// schemaRef — optional. Skill name produced by learn-form-schema. /// schema — optional. Inline JSON. @@ -32,14 +37,22 @@ internal readonly record struct ExecuteFormBatchInput( { public static ExecuteFormBatchInput Parse(AgentTaskRequest request) { + var dataJson = request.Message.Parts + .Where(p => p.Kind == "data" && !string.IsNullOrWhiteSpace(p.Data)) + .Select(p => p.Data) + .FirstOrDefault(); + var text = request.Message.Parts .Where(p => p.Kind == "text") .Select(p => p.Text) .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t)) ?.Trim(); - if (string.IsNullOrEmpty(text) || !text.StartsWith('{')) - return Fail("Input must be a JSON object with rows, allowedHosts, and either schemaRef or schema."); + string? jsonPayload = dataJson ?? (!string.IsNullOrEmpty(text) && text.StartsWith('{') ? text : null); + + if (jsonPayload is null) + return Fail("Input must be a JSON object with rows, allowedHosts, and either schemaRef or schema. " + + "Pass it as the A2A DataPart (invoke_agent's 'data' parameter) or as JSON in the text message."); string? schemaRef; FormSchema? inlineSchema = null; @@ -51,8 +64,10 @@ public static ExecuteFormBatchInput Parse(AgentTaskRequest request) try { - using var doc = JsonDocument.Parse(text); + using var doc = JsonDocument.Parse(jsonPayload); var root = doc.RootElement; + if (root.ValueKind != JsonValueKind.Object) + return Fail("Structured input must be a JSON object."); schemaRef = root.TryGetProperty("schemaRef", out var sr) ? sr.GetString() : null; if (root.TryGetProperty("schema", out var s) && s.ValueKind == JsonValueKind.Object) { diff --git a/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs b/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs index da37079..db918f1 100644 --- a/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs +++ b/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs @@ -29,7 +29,10 @@ public sealed class LearnFormSchemaCapability( Id = "learn-form-schema", Name = "Learn Form Schema", Description = "Navigate to a web form, extract its structure (fields, types, options, validation), and persist it as a reusable skill. " - + "Input: JSON {\"url\":\"https://...\",\"allowedHosts\":[\"host\"],\"formSelector\":\"optional\",\"credentialId\":\"optional\",\"skillName\":\"optional override\",\"intent\":\"optional prose\"}. " + + "PASS INPUT AS AN A2A DATA PART (a structured JSON object), not as prose inside the text message. " + + "When calling via RockBot's invoke_agent, populate the 'data' parameter with this object. " + + "Fields: {\"url\":\"https://...\",\"allowedHosts\":[\"host\"],\"formSelector\":\"optional CSS selector\",\"credentialId\":\"optional\",\"skillName\":\"optional override\",\"intent\":\"optional prose\"}. " + + "'url' and 'allowedHosts' are REQUIRED (spec §7.1). " + "Returns the typed form schema plus the skill name it was persisted under." }; diff --git a/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs b/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs index 1a487ec..d9f800d 100644 --- a/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs +++ b/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs @@ -8,7 +8,13 @@ namespace Foragent.Capabilities.Forms; /// Parses input for the learn-form-schema capability (spec §5.2, /// phase-1 of the learn→review→execute pattern in §5.5). /// -/// Shape (JSON in the first text part, field-by-field metadata overrides): +/// Accepts the structured input in three forms: preferred is an A2A +/// DataPart (Kind = "data", MimeType = "application/json") +/// — the shape RockBot's invoke_agent produces when the caller fills +/// its data parameter. Falls back to a JSON object in the first text +/// part for curl callers. Field-by-field metadata overrides either source. +/// +/// Shape: /// /// url — required. Absolute http(s) URL of the page hosting the form. /// allowedHosts — required. Host allowlist (spec §7.1). Empty rejects. @@ -36,18 +42,27 @@ public static LearnFormSchemaInput Parse(AgentTaskRequest request) string? intent = null; List? allowedHosts = null; + var dataJson = request.Message.Parts + .Where(p => p.Kind == "data" && !string.IsNullOrWhiteSpace(p.Data)) + .Select(p => p.Data) + .FirstOrDefault(); + var text = request.Message.Parts .Where(p => p.Kind == "text") .Select(p => p.Text) .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t)) ?.Trim(); - if (!string.IsNullOrEmpty(text) && text.StartsWith('{')) + string? jsonPayload = dataJson ?? (!string.IsNullOrEmpty(text) && text.StartsWith('{') ? text : null); + + if (jsonPayload is not null) { try { - using var doc = JsonDocument.Parse(text); + using var doc = JsonDocument.Parse(jsonPayload); var root = doc.RootElement; + if (root.ValueKind != JsonValueKind.Object) + return Fail("Structured input must be a JSON object."); if (root.TryGetProperty("url", out var u)) url = u.GetString(); if (root.TryGetProperty("formSelector", out var fs)) formSelector = fs.GetString(); if (root.TryGetProperty("credentialId", out var c)) credentialId = c.GetString(); diff --git a/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs index 623ab31..6b91eb0 100644 --- a/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs +++ b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs @@ -64,6 +64,55 @@ public async Task DonePayload_ReturnsStructuredJson() Assert.Contains("snapshot", page.Actions); } + [Fact] + public async Task AcceptsStructuredInput_OnDataPart() + { + // Simulates the shape RockBot's invoke_agent produces when the caller + // fills its 'data' parameter: a text part with human prose plus a + // DataPart with the structured JSON. The data part wins over any + // JSON embedded in text. + var (capability, page, _) = Build( + ScriptedChatClient.ToolCall("snapshot"), + ScriptedChatClient.ToolCall("done", new { summary = "picked up from DataPart" }), + ScriptedChatClient.Text("stopping")); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.RequestWithData( + "browser-task", + dataJson: """{"intent":"read the page title","url":"https://example.com/","allowedHosts":["example.com"]}""", + text: "Please read the page title for me."), + ctx); + + Assert.Equal(AgentTaskState.Completed, result.State); + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + Assert.Contains("snapshot", page.Actions); + } + + [Fact] + public async Task DataPart_UsesProseTextAsIntentFallback() + { + // If the DataPart doesn't carry 'intent' but the text part is prose, + // the text fills in for intent. Lets callers keep 'intent' in the + // data payload without duplicating it in the message. + var (capability, _, _) = Build( + ScriptedChatClient.ToolCall("done", new { summary = "ok" }), + ScriptedChatClient.Text("stopping")); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.RequestWithData( + "browser-task", + dataJson: """{"allowedHosts":["example.com"],"url":"https://example.com/"}""", + text: "Find the title of the page."), + ctx); + + Assert.Equal(AgentTaskState.Completed, result.State); + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + } + [Fact] public async Task Fail_ReturnsFailedStatus() { diff --git a/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs b/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs index 828f773..b4237d8 100644 --- a/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs +++ b/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs @@ -39,6 +39,40 @@ public async Task RejectsInput_WhenRowsMissing() Assert.Contains("rows", TestContext.TextOf(result)); } + [Fact] + public async Task AcceptsStructuredInput_OnDataPart() + { + var (cap, factory, skills) = Build(); + await SeedSchemaAsync(skills, "sites/example.com/forms/contact", SampleSchema()); + var page = SubmittingPage(); + factory.PageResponder = (_, _) => + { + page.CurrentUrl = new Uri("https://example.com/form"); + return Task.FromResult(page); + }; + + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.RequestWithData( + "execute-form-batch", + dataJson: """ + { + "schemaRef": "sites/example.com/forms/contact", + "allowedHosts": ["example.com"], + "rows": [ + {"email":"a@example.com","message":"hi"} + ] + } + """, + text: "Submit one contact form row."), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + Assert.Equal(1, doc.RootElement.GetProperty("successCount").GetInt32()); + } + [Fact] public async Task ResolvesSchemaFromRef_AndSubmitsRows() { diff --git a/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs b/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs index 868f5f3..abe76a9 100644 --- a/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs +++ b/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs @@ -38,6 +38,29 @@ public async Task RejectsInput_WhenAllowlistMissing() Assert.Contains("allowedHosts", TestContext.TextOf(result)); } + [Fact] + public async Task AcceptsStructuredInput_OnDataPart() + { + var scan = SampleScan(); + var (cap, factory, skills) = Build(); + var page = new StubBrowserPage { FormScan = scan }; + factory.PageResponder = (_, _) => Task.FromResult(page); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.RequestWithData( + "learn-form-schema", + dataJson: """{"url":"https://example.com/contact","allowedHosts":["example.com"]}""", + text: "Learn the schema of the contact form."), + ctx); + + Assert.Equal(AgentTaskState.Completed, result.State); + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + var skillName = doc.RootElement.GetProperty("skillName").GetString()!; + Assert.StartsWith("sites/example-com/forms/", skillName); + } + [Fact] public async Task RejectsInput_WhenUrlOffAllowlist() { diff --git a/tests/Foragent.Agent.Tests/TestDoubles.cs b/tests/Foragent.Agent.Tests/TestDoubles.cs index e09b5cf..5c1cc8e 100644 --- a/tests/Foragent.Agent.Tests/TestDoubles.cs +++ b/tests/Foragent.Agent.Tests/TestDoubles.cs @@ -27,6 +27,35 @@ public static AgentTaskRequest Request(string skill, string text) }; } + /// + /// Builds a request that carries a structured payload on an A2A DataPart + /// (Kind = "data"), matching what RockBot's invoke_agent tool + /// produces when the caller populates its data parameter. An + /// optional prose text part can be supplied for the human-readable + /// message; omit for pure data-only requests. + /// + public static AgentTaskRequest RequestWithData( + string skill, + string dataJson, + string? text = null) + { + var parts = new List(); + if (text is not null) + parts.Add(new AgentMessagePart { Kind = "text", Text = text }); + parts.Add(new AgentMessagePart { Kind = "data", Data = dataJson, MimeType = "application/json" }); + return new AgentTaskRequest + { + TaskId = Guid.NewGuid().ToString(), + ContextId = "ctx", + Skill = skill, + Message = new AgentMessage + { + Role = "user", + Parts = parts + } + }; + } + public static AgentTaskRequest RequestWithMetadata( string skill, string? text = null, From 6fc13c6333b16ef245e905bd8b028a985136fd92 Mon Sep 17 00:00:00 2001 From: Rockford Lhotka Date: Thu, 23 Apr 2026 02:03:10 -0500 Subject: [PATCH 4/4] Step 9 follow-ups: A2A task cancellation + self-teaching errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RockBot 0.9.15 now publishes agent.task.cancel.{agentName} messages when a wisp's local state fails after dispatching an A2A task (the duplicate- dispatch scenario observed during step-9 validation). Foragent's previous behavior inherited the framework's default AgentTaskCancelHandler, which always replies TaskNotCancelable because it assumes stateless agents. Foragent is stateful — a browser task is potentially minutes long — so leaving the default would orphan browser runs. Implementation: - InFlightTaskRegistry (singleton): ConcurrentDictionary with Register/TryCancel/Remove. Register returns a linked CT that fires on either external cancel or the parent message CT. Redelivered task ids cancel the prior registration before replacing it, so stale work unwinds. - ForagentTaskHandler wraps the capability's AgentTaskContext so the CT observed via context.MessageContext.CancellationToken is the linked one from the registry. Capabilities observe cancellation without any signature change. - ForagentCancelHandler (IMessageHandler): on match calls TryCancel and publishes nothing (the running task's own terminal reply is the acknowledgment); on miss publishes AgentTaskError{Code=TaskNotFound}. Registered via agent.HandleMessage() after AddA2A — last AddScoped wins, overriding the default. - 11 new unit tests across registry, cancel handler, and task-handler integration (parent-cancel → linked CT fires, external cancel → linked CT fires, register/remove ties to finally, Remove drops registration even on thrown capability). Also in this commit, incorporating earlier step-9 follow-ups for the same RockBot 0.9.x interop round: - Self-teaching errors. When a parser rejects for a missing required field, the response now tells the LLM exactly how to fix the call: "Pass inputs as a JSON object on the A2A DataPart — in RockBot's invoke_agent tool, that means filling the 'data' parameter, NOT adding fields to the 'message' text. Example data: {...}." Observed behavior: LLMs that ignore skill descriptions do read error replies and adjust subsequent calls. Applied to all three parsers. - Docker image bumped to rockylhotka/rockbot-agent:0.9.15 — brings (a) invoke_agent's structured 'data' parameter (0.9.11), (b) softened tool description encouraging DataPart usage (0.9.14), and (c) the cancel-publisher that this commit consumes (0.9.15). CLAUDE.md Status section updated accordingly. Framework-feedback step-9 follow-up section extended with the cancel- handler-override pattern as a candidate for upstream WithTaskCancellation ergonomics (non-blocking — ~50 LOC across consumers isn't unbearable). Tests: 63 passed / 3 LLM-gated skipped. Build clean. Foragent starts cleanly on fresh volumes; agent.task.cancel.Foragent subscription verified active at boot. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 3 +- docker-compose.yml | 6 +- docs/framework-feedback.md | 19 +++ src/Foragent.Agent/Program.cs | 7 ++ .../BrowserTask/BrowserTaskInput.cs | 16 ++- .../ForagentCancelHandler.cs | 54 +++++++++ ...CapabilitiesServiceCollectionExtensions.cs | 4 + .../ForagentTaskHandler.cs | 63 +++++++--- .../Forms/ExecuteFormBatchInput.cs | 13 +- .../Forms/LearnFormSchemaInput.cs | 8 +- .../InFlightTaskRegistry.cs | 78 ++++++++++++ .../ForagentCancelHandlerTests.cs | 112 ++++++++++++++++++ .../ForagentTaskHandlerTests.cs | 87 +++++++++++++- .../InFlightTaskRegistryTests.cs | 71 +++++++++++ 14 files changed, 508 insertions(+), 33 deletions(-) create mode 100644 src/Foragent.Capabilities/ForagentCancelHandler.cs create mode 100644 src/Foragent.Capabilities/InFlightTaskRegistry.cs create mode 100644 tests/Foragent.Agent.Tests/ForagentCancelHandlerTests.cs create mode 100644 tests/Foragent.Agent.Tests/InFlightTaskRegistryTests.cs diff --git a/CLAUDE.md b/CLAUDE.md index 2988001..70bd521 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Status -Foragent is at **milestone 9 shipped — v0.2 complete**. Three capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.14`, which brings the structured-data `invoke_agent` surface from PR #291 (0.9.11) so RockBot can consume Foragent's FormSchema JSON natively rather than as prose, plus a softened `data`-parameter description in 0.9.14 that encourages the caller LLM to use DataParts when the target skill description advertises them). Step 9 removed the v0.1 `fetch-page-title` and `extract-structured-data` specialists: both are reachable via `browser-task` (the planner's `done(result=...)` channel carries JSON when the intent asks for it), at roughly 2–3× the tokens per call — acceptable given zero deterministic high-volume callers today. `extract-structured-data` was also out of spec on §7.1 mandatory allowlists, which the generalist enforces by design. Session-level `FetchPageTitleAsync` / `CapturePageSnapshotAsync` were removed from `IBrowserSession` alongside the capabilities that used them. RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. +Foragent is at **milestone 9 shipped — v0.2 complete**. Three capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.15`, which brings the structured-data `invoke_agent` surface from PR #291 (0.9.11), a softened `data`-parameter description in 0.9.14 that encourages the caller LLM to use DataParts when the target skill description advertises them, and the A2A cancel-capability publisher in 0.9.15 that actually sends `agent.task.cancel.{agentName}` messages when a wisp's local state goes wrong after an A2A dispatch — the message Foragent's `ForagentCancelHandler` now consumes). Step 9 removed the v0.1 `fetch-page-title` and `extract-structured-data` specialists: both are reachable via `browser-task` (the planner's `done(result=...)` channel carries JSON when the intent asks for it), at roughly 2–3× the tokens per call — acceptable given zero deterministic high-volume callers today. `extract-structured-data` was also out of spec on §7.1 mandatory allowlists, which the generalist enforces by design. Session-level `FetchPageTitleAsync` / `CapturePageSnapshotAsync` were removed from `IBrowserSession` alongside the capabilities that used them. RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. ## Build / test @@ -86,6 +86,7 @@ Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any - `ForagentTaskHandler` is a pure dispatcher that resolves `IEnumerable` from DI and routes on `SkillId`. **Do not add skill-specific logic to the handler.** New capabilities go in new `ICapability` classes. - `ForagentCapabilities.Skills` (static array) is the single source of truth for advertised skills — both the bus-side `AgentCard.Skills` and the HTTP gateway's `opts.Skills` read from it. - Capabilities parse their own input near the capability (`BrowserTaskInput` in `BrowserTask/`, the `*Input` classes in `Forms/`). The `CapabilityInput.Parse` shared URL+description shim was removed in step 9 with its only consumers. +- **Cancel messages are honored.** `ForagentTaskHandler` registers every in-flight task with `InFlightTaskRegistry` (singleton, a `ConcurrentDictionary`) and wraps the capability's `AgentTaskContext` so the CT it observes is linked to both the framework's message CT and an external cancel trigger. `ForagentCancelHandler` (registered via `agent.HandleMessage()` in Program.cs — last `AddScoped` wins, overriding the framework's default stateless handler) looks up the task by id and calls `TryCancel`. Successful cancel publishes nothing extra (the running task's own terminal result is the acknowledgment); a cancel for an unknown task publishes `AgentTaskError{Code=TaskNotFound}`. This matters because RockBot's wisp/A2A coordination can publish cancel messages when a local wisp fails after already dispatching an A2A task — without this handler, Foragent would orphan the browser run. - **Structured input arrives on an A2A DataPart** (`AgentMessagePart { Kind = "data", Data = , MimeType = "application/json" }`) — the shape RockBot's `invoke_agent` tool produces when the caller populates its `data` parameter. All three Foragent parsers look for a data part first, then fall back to JSON in the text part (for curl callers), then metadata overrides either source. Every `SkillDefinition.Description` tells the LLM to pass inputs via the `data` parameter, not in the text message — otherwise the LLM writes prose like `allowedHosts: ['*']` that strict JSON parsers reject. Text parts remain the home for human-readable prose (used as `intent` fallback in `browser-task` when the data part doesn't carry one). - `browser-task` (in `BrowserTask/`) is the generalist planner (spec §5.2). `BrowserTaskInput` parses intent + mandatory `allowedHosts` + optional `url` / `credentialId` / `maxSteps` (default 60, ceiling 150) / `maxSeconds` (default 120, ceiling 600). `BrowserTaskTools` wraps `snapshot` / `navigate` / `click` / `type` / `wait_for` / `done` / `fail` as `AIFunction`s via `AIFunctionFactory.Create` and passes them in `ChatOptions.Tools`; the RockBot-wrapped function-invoking `IChatClient` runs the full model ↔ tool loop inside one `GetResponseAsync` call. Budget is enforced tool-side (each tool checks `BrowserTaskState.BudgetExhausted`) because Microsoft.Extensions.AI does not surface per-request iteration caps through `ChatOptions`; wall-clock is a linked `CancellationTokenSource`. **Never log tool arguments verbatim** — `type` carries user-supplied values that may be sensitive (log length only). Refs from a snapshot are valid only until the next mutating call; the system prompt and tool descriptions both state this, but don't code anything that assumes cross-snapshot ref stability. - `learn-form-schema` and `execute-form-batch` (both in `Forms/`) are the step-8 phase-1 / phase-3 pair (spec §5.5). `FormSchema` / `FormField` are the wire contract — stable JSON shape, stored via `FormSchema.SerializerOptions`. `LearnFormSchemaCapability` navigates, calls `IBrowserPage.ScanFormAsync`, maps the raw scan to `FormSchema` via the deterministic `FormSchemaMapper` (pure — no LLM), then runs `FormSchemaEnricher` for one LLM turn to infer `dependsOn` and a note (skipped when there are no select/radio fields). The schema is persisted as a `Skill` bundle at `sites/{host}/forms/{slug}` with a `SkillResourceType.JsonSchema` resource named `schema.json`. Only the enricher can add `dependsOn` / `notes` — structural fields (type, selector, required, options) come only from the DOM scan, so the LLM cannot fabricate fields or rewrite selectors. `ExecuteFormBatchCapability` accepts `schemaRef` (resolves via `ISkillStore.GetResourceAsync(name, "schema.json")`) or an inline `schema`, and submits each row with `FillAsync` / `SelectOptionAsync` / `SetCheckedAsync` per field type. Per-row progress is published via `AgentTaskContext.PublishStatus(new AgentTaskStatusUpdate { State = Working, Message = … })`. Default `mode` is `"abort-on-first"` (spec open-question #8 resolution: a failed submit on a mutating form usually indicates a real problem, so continuing would just generate more bad submissions); callers opt into `"continue"` for known-messy batches. Success detection: an optional `successIndicator` CSS selector is the preferred signal; the fallback is URL change after submit, which fails (correctly) for forms that submit in place. File uploads and multi-step wizards are **out of scope** for v0.2 — `ScanFormAsync` explicitly skips `type=file`, and there's no flow control for wizards. diff --git a/docker-compose.yml b/docker-compose.yml index 94a2c1b..ae4c62b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ # - foragent — this project; exposes HTTP A2A on port 5210 # - rockbot-init — seeds /data/agent with RockBot profile + well-known-agents.json # pointing at foragent -# - rockbot — rockylhotka/rockbot-agent:0.9.14, configured to know Foragent +# - rockbot — rockylhotka/rockbot-agent:0.9.15, configured to know Foragent # as an A2A peer it can delegate tasks to. 0.9.11 brings # the structured-data invoke_agent surface (PR #291) so # RockBot can consume Foragent's FormSchema JSON results @@ -115,7 +115,7 @@ services: - foragent-data:/data/foragent rockbot-init: - image: rockylhotka/rockbot-agent:0.9.14 + image: rockylhotka/rockbot-agent:0.9.15 user: root entrypoint: ["/bin/sh", "-c"] command: @@ -151,7 +151,7 @@ services: - ./deploy/rockbot-seed:/seed:ro rockbot: - image: rockylhotka/rockbot-agent:0.9.14 + image: rockylhotka/rockbot-agent:0.9.15 depends_on: rockbot-init: condition: service_completed_successfully diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md index fdbf2de..0e3a8e2 100644 --- a/docs/framework-feedback.md +++ b/docs/framework-feedback.md @@ -676,6 +676,25 @@ validation that didn't actually work without them. not framework — noting here because it's the kind of thing that could bite any RockBot consumer that mounts persistent state as a named volume. +- **Cancel handler override pattern** — RockBot ships a default + `AgentTaskCancelHandler` inside `AddA2A` that assumes stateless agents + and always replies `TaskNotCancelable`. Foragent is stateful (browser + task per request, potentially minutes long), and RockBot's + wisp/A2A coordination can send cancel messages when a local wisp + fails after already dispatching an A2A task. Replacing the default + is simple — call `agent.HandleMessage()` after `AddA2A` and the later `AddScoped` + wins — but worth calling out as the canonical pattern for any + long-running agent. Foragent's impl uses a singleton + `InFlightTaskRegistry` (concurrent dict of taskId → linked CTS); + the task handler registers on entry and removes in `finally`, the + cancel handler calls `TryCancel`. Publishing nothing on successful + cancel is deliberate — the task's own terminal reply is the + acknowledgment. Framework-level observation: a `WithTaskCancellation()` + opt-in on `AgentHostBuilder` (or a non-internal registry type the + framework ships so consumers don't roll their own concurrent dict) + would save every stateful-agent consumer ~50 lines. Filed as a + candidate enhancement; non-blocking. - **Structured A2A input requires a DataPart, not JSON-in-text** — running RockBot → Foragent through the Blazor UI, the caller LLM kept looping on `"Missing 'allowedHosts'"` rejections. Root cause: diff --git a/src/Foragent.Agent/Program.cs b/src/Foragent.Agent/Program.cs index 2815c56..60e6779 100644 --- a/src/Foragent.Agent/Program.cs +++ b/src/Foragent.Agent/Program.cs @@ -155,6 +155,13 @@ agent.Services.AddForagentCapabilities(); agent.Services.AddHostedService(); + + // Replace the framework's default stateless AgentTaskCancelHandler + // (which always returns TaskNotCancelable) with Foragent's stateful + // one that actually cancels the running browser task via + // InFlightTaskRegistry. Last AddScoped wins, so this call after + // AddA2A above shadows the framework default. + agent.HandleMessage(); }); builder.Services.AddForagentBrowser(); diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs index 12e3157..e81e066 100644 --- a/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs @@ -108,10 +108,12 @@ public static BrowserTaskInput Parse(AgentTaskRequest request) allowedHosts = [.. hostsCsv.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)]; if (string.IsNullOrWhiteSpace(intent)) - return Fail("Missing 'intent' — a natural-language description of what to do."); + return Fail(HintedError("Missing 'intent' — a natural-language description of what to do.")); if (allowedHosts is null || allowedHosts.Count == 0) - return Fail("Missing 'allowedHosts' — browser-task requires an explicit allowlist (spec §7.1). Use ['*'] to accept any host."); + return Fail(HintedError( + "Missing 'allowedHosts' — browser-task requires an explicit host allowlist (spec §7.1). " + + "Use [\"*\"] to accept any host.")); HostAllowlist allowlist; try @@ -142,6 +144,16 @@ public static BrowserTaskInput Parse(AgentTaskRequest request) private static BrowserTaskInput Fail(string message) => new(null, null, null, null, DefaultMaxSteps, DefaultMaxSeconds, message); + /// + /// Appends the DataPart usage hint to an error message so the caller LLM + /// (which likely never read the skill description) learns the correct + /// invocation pattern from the rejection itself. The hint tells it *how* + /// to fix the call, not just *what* was missing. + /// + private static string HintedError(string reason) => reason + " " + + "Pass inputs as a JSON object on the A2A DataPart — in RockBot's invoke_agent tool, that means filling the 'data' parameter, NOT adding fields to the 'message' text. " + + "Example data: {\"intent\":\"...\",\"allowedHosts\":[\"*\"],\"url\":\"optional\"}."; + private static int Clamp(int value, int min, int max) => value < min ? min : value > max ? max : value; diff --git a/src/Foragent.Capabilities/ForagentCancelHandler.cs b/src/Foragent.Capabilities/ForagentCancelHandler.cs new file mode 100644 index 0000000..6f6b768 --- /dev/null +++ b/src/Foragent.Capabilities/ForagentCancelHandler.cs @@ -0,0 +1,54 @@ +using Microsoft.Extensions.Logging; +using RockBot.A2A; +using RockBot.Host; +using RockBot.Messaging; + +namespace Foragent.Capabilities; + +/// +/// Replaces the framework's default AgentTaskCancelHandler (which +/// always replies +/// because stateless agents can't do anything useful). Foragent is stateful +/// — a browser task is long-running and must respond to cancel — so this +/// handler cancels the linked held by +/// . +/// +/// Reply semantics: on a successful cancel we publish nothing here. The +/// running task's own terminal result (an in +/// an error/cancelled shape) will reach the caller's ReplyTo topic shortly +/// afterward as the task unwinds. On a missed cancel (no matching task), +/// we publish so the caller +/// knows the cancel was a no-op. +/// +public sealed class ForagentCancelHandler( + InFlightTaskRegistry inFlight, + IMessagePublisher publisher, + A2AOptions options, + AgentIdentity agent, + ILogger logger) : IMessageHandler +{ + public async Task HandleAsync(AgentTaskCancelRequest request, MessageHandlerContext context) + { + if (inFlight.TryCancel(request.TaskId)) + { + logger.LogInformation("Cancel requested for in-flight task {TaskId} — cancellation issued.", request.TaskId); + return; + } + + logger.LogDebug("Cancel requested for task {TaskId}, but no in-flight registration exists.", request.TaskId); + + var replyTo = context.Envelope.ReplyTo ?? options.DefaultResultTopic; + var error = new AgentTaskError + { + TaskId = request.TaskId, + ContextId = request.ContextId, + Code = AgentTaskError.Codes.TaskNotFound, + Message = "No in-flight task with that id; nothing to cancel.", + IsRetryable = false + }; + var envelope = error.ToEnvelope( + source: agent.Name, + correlationId: context.Envelope.CorrelationId); + await publisher.PublishAsync(replyTo, envelope, context.CancellationToken); + } +} diff --git a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs index 5d06dde..0149038 100644 --- a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs +++ b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs @@ -20,6 +20,10 @@ public static IServiceCollection AddForagentCapabilities(this IServiceCollection services.AddScoped(); services.AddScoped(); services.AddScoped(); + // Process-wide registry: the cancel message is dispatched in a + // different DI scope than the task request, so the registry must + // outlive scopes. + services.AddSingleton(); return services; } } diff --git a/src/Foragent.Capabilities/ForagentTaskHandler.cs b/src/Foragent.Capabilities/ForagentTaskHandler.cs index ff524f1..590b3bb 100644 --- a/src/Foragent.Capabilities/ForagentTaskHandler.cs +++ b/src/Foragent.Capabilities/ForagentTaskHandler.cs @@ -1,5 +1,6 @@ using Microsoft.Extensions.Logging; using RockBot.A2A; +using RockBot.Host; namespace Foragent.Capabilities; @@ -8,9 +9,16 @@ namespace Foragent.Capabilities; /// instances and routes an incoming task to the one whose /// matches . Unknown skills return a user- /// facing error rather than throwing, so the A2A bridge still sees a result. +/// +/// Registers each in-flight task with and +/// wraps the context so the capability's is +/// linked to both the framework's message CT and the registry's external +/// cancel trigger — letting agent.task.cancel.Foragent messages stop +/// long-running browser work mid-flight. /// public sealed class ForagentTaskHandler( IEnumerable capabilities, + InFlightTaskRegistry inFlight, ILogger logger) : IAgentTaskHandler { private readonly Dictionary _bySkill = @@ -19,26 +27,49 @@ public sealed class ForagentTaskHandler( public async Task HandleTaskAsync( AgentTaskRequest request, AgentTaskContext context) { - var ct = context.MessageContext.CancellationToken; + var parentCt = context.MessageContext.CancellationToken; + var linkedCt = inFlight.Register(request.TaskId, parentCt); - logger.LogInformation("Handling task {TaskId} (skill={Skill})", - request.TaskId, request.Skill); - - await context.PublishStatus(new AgentTaskStatusUpdate + try { - TaskId = request.TaskId, - ContextId = request.ContextId, - State = AgentTaskState.Working - }, ct); + logger.LogInformation("Handling task {TaskId} (skill={Skill})", + request.TaskId, request.Skill); + + await context.PublishStatus(new AgentTaskStatusUpdate + { + TaskId = request.TaskId, + ContextId = request.ContextId, + State = AgentTaskState.Working + }, linkedCt); + + if (!_bySkill.TryGetValue(request.Skill, out var capability)) + { + var known = string.Join(", ", _bySkill.Keys.OrderBy(k => k)); + return CapabilityResult.Error( + request, + $"Unknown skill '{request.Skill}'. Known skills: {known}"); + } - if (!_bySkill.TryGetValue(request.Skill, out var capability)) + // Wrap the context so the capability observes the linked token + // instead of the raw message CT. Capabilities keep reading + // context.MessageContext.CancellationToken — no signature change. + var cancellableContext = new AgentTaskContext + { + MessageContext = new MessageHandlerContext + { + Envelope = context.MessageContext.Envelope, + Agent = context.MessageContext.Agent, + Services = context.MessageContext.Services, + CancellationToken = linkedCt + }, + PublishStatus = context.PublishStatus + }; + + return await capability.ExecuteAsync(request, cancellableContext); + } + finally { - var known = string.Join(", ", _bySkill.Keys.OrderBy(k => k)); - return CapabilityResult.Error( - request, - $"Unknown skill '{request.Skill}'. Known skills: {known}"); + inFlight.Remove(request.TaskId); } - - return await capability.ExecuteAsync(request, context); } } diff --git a/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs b/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs index d55d8cc..9fdf225 100644 --- a/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs +++ b/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs @@ -51,8 +51,7 @@ public static ExecuteFormBatchInput Parse(AgentTaskRequest request) string? jsonPayload = dataJson ?? (!string.IsNullOrEmpty(text) && text.StartsWith('{') ? text : null); if (jsonPayload is null) - return Fail("Input must be a JSON object with rows, allowedHosts, and either schemaRef or schema. " - + "Pass it as the A2A DataPart (invoke_agent's 'data' parameter) or as JSON in the text message."); + return Fail(HintedError("No structured input found.")); string? schemaRef; FormSchema? inlineSchema = null; @@ -119,13 +118,13 @@ public static ExecuteFormBatchInput Parse(AgentTaskRequest request) } if (schemaRef is null && inlineSchema is null) - return Fail("Provide either 'schemaRef' (a skill name) or inline 'schema'."); + return Fail(HintedError("Provide either 'schemaRef' (a skill name) or inline 'schema'.")); if (schemaRef is not null && inlineSchema is not null) return Fail("Provide only one of 'schemaRef' or 'schema', not both."); if (rows is null || rows.Count == 0) - return Fail("Missing 'rows' — must be a non-empty array of objects."); + return Fail(HintedError("Missing 'rows' — must be a non-empty array of objects.")); if (allowedHosts is null || allowedHosts.Count == 0) - return Fail("Missing 'allowedHosts' — execute-form-batch requires an explicit allowlist (spec §7.1)."); + return Fail(HintedError("Missing 'allowedHosts' — execute-form-batch requires an explicit allowlist (spec §7.1).")); HostAllowlist allowlist; try @@ -158,6 +157,10 @@ public static ExecuteFormBatchInput Parse(AgentTaskRequest request) private static ExecuteFormBatchInput Fail(string message) => new(null, null, null, null, null, ExecuteFormBatchMode.AbortOnFirst, null, message); + + private static string HintedError(string reason) => reason + " " + + "Pass inputs as a JSON object on the A2A DataPart — in RockBot's invoke_agent tool, that means filling the 'data' parameter with the object, NOT adding fields to the 'message' text. " + + "Example data: {\"schemaRef\":\"sites/host/forms/name\",\"rows\":[{\"field\":\"value\"}],\"allowedHosts\":[\"host\"]}."; } internal enum ExecuteFormBatchMode diff --git a/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs b/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs index d9f800d..1ec7598 100644 --- a/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs +++ b/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs @@ -87,10 +87,10 @@ public static LearnFormSchemaInput Parse(AgentTaskRequest request) allowedHosts = [.. hostsCsv.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)]; if (string.IsNullOrWhiteSpace(url)) - return Fail("Missing 'url' — the page hosting the form."); + return Fail(HintedError("Missing 'url' — the page hosting the form.")); if (allowedHosts is null || allowedHosts.Count == 0) - return Fail("Missing 'allowedHosts' — learn-form-schema requires an explicit allowlist (spec §7.1)."); + return Fail(HintedError("Missing 'allowedHosts' — learn-form-schema requires an explicit allowlist (spec §7.1).")); if (!Uri.TryCreate(url, UriKind.Absolute, out var parsedUrl) || (parsedUrl.Scheme != Uri.UriSchemeHttp && parsedUrl.Scheme != Uri.UriSchemeHttps)) @@ -115,6 +115,10 @@ public static LearnFormSchemaInput Parse(AgentTaskRequest request) private static LearnFormSchemaInput Fail(string message) => new(null, null, null, null, null, null, message); + private static string HintedError(string reason) => reason + " " + + "Pass inputs as a JSON object on the A2A DataPart — in RockBot's invoke_agent tool, that means filling the 'data' parameter, NOT adding fields to the 'message' text. " + + "Example data: {\"url\":\"https://...\",\"allowedHosts\":[\"host\"]}."; + private static string? ReadMetadata(AgentTaskRequest request, string key) { if (request.Message.Metadata is not null diff --git a/src/Foragent.Capabilities/InFlightTaskRegistry.cs b/src/Foragent.Capabilities/InFlightTaskRegistry.cs new file mode 100644 index 0000000..aa3a313 --- /dev/null +++ b/src/Foragent.Capabilities/InFlightTaskRegistry.cs @@ -0,0 +1,78 @@ +using System.Collections.Concurrent; + +namespace Foragent.Capabilities; + +/// +/// Tracks in-flight A2A tasks so an out-of-band cancel message can reach the +/// running task's . The framework's default +/// AgentTaskCancelHandler assumes stateless agents and always replies +/// TaskNotCancelable; Foragent is stateful (long-running browser work +/// per task), so we replace it with a registry-aware handler. +/// +/// One instance per process — the registry crosses DI scopes because the +/// cancel message is handled in a different scope than the task request. +/// +public sealed class InFlightTaskRegistry +{ + private readonly ConcurrentDictionary _tokens = new(StringComparer.Ordinal); + + /// + /// Creates a linked for + /// , registers it, and returns the linked token + /// that the running task should observe. If the task id is already + /// registered — the framework redelivered a message — the prior + /// registration is cancelled first so the stale task unwinds before the + /// new one starts. + /// + public CancellationToken Register(string taskId, CancellationToken parentToken) + { + var cts = CancellationTokenSource.CreateLinkedTokenSource(parentToken); + _tokens.AddOrUpdate( + taskId, + _ => cts, + (_, existing) => + { + // A redelivered task id should not keep the prior CTS alive — + // cancel it so any residual work stops, then replace. + try { existing.Cancel(); } catch (ObjectDisposedException) { } + existing.Dispose(); + return cts; + }); + return cts.Token; + } + + /// + /// Cancels the task if registered. Returns true when a cancel was issued, + /// false when no task with that id is in flight. + /// + public bool TryCancel(string taskId) + { + if (!_tokens.TryGetValue(taskId, out var cts)) + return false; + try + { + cts.Cancel(); + return true; + } + catch (ObjectDisposedException) + { + // Race: the task finished and disposed its CTS between our + // TryGetValue and Cancel. Equivalent to not-found for the caller. + return false; + } + } + + /// + /// Removes and disposes the registration for . + /// Called from the task handler's finally so both normal completion + /// and failure clean up. + /// + public void Remove(string taskId) + { + if (_tokens.TryRemove(taskId, out var cts)) + cts.Dispose(); + } + + /// Count of in-flight registrations. Exposed for tests and diagnostics. + public int Count => _tokens.Count; +} diff --git a/tests/Foragent.Agent.Tests/ForagentCancelHandlerTests.cs b/tests/Foragent.Agent.Tests/ForagentCancelHandlerTests.cs new file mode 100644 index 0000000..526a718 --- /dev/null +++ b/tests/Foragent.Agent.Tests/ForagentCancelHandlerTests.cs @@ -0,0 +1,112 @@ +using System.Text.Json; +using Foragent.Capabilities; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using RockBot.A2A; +using RockBot.Host; +using RockBot.Messaging; +using Xunit; + +namespace Foragent.Agent.Tests; + +public class ForagentCancelHandlerTests +{ + [Fact] + public async Task CancelsInFlightTask_WithoutPublishingReply() + { + var registry = new InFlightTaskRegistry(); + using var parent = new CancellationTokenSource(); + var token = registry.Register("task-1", parent.Token); + + var publisher = new CapturingPublisher(); + var handler = BuildHandler(registry, publisher); + + await handler.HandleAsync( + new AgentTaskCancelRequest { TaskId = "task-1" }, + BuildContext()); + + Assert.True(token.IsCancellationRequested); + // On a successful cancel we deliberately don't publish a reply — + // the task's own terminal result is the acknowledgment. + Assert.Empty(publisher.Published); + } + + [Fact] + public async Task PublishesTaskNotFound_WhenNoInFlightRegistration() + { + var registry = new InFlightTaskRegistry(); + var publisher = new CapturingPublisher(); + var handler = BuildHandler(registry, publisher); + + await handler.HandleAsync( + new AgentTaskCancelRequest { TaskId = "nope", ContextId = "ctx-7" }, + BuildContext(replyTo: "caller.replies")); + + var (topic, envelope) = Assert.Single(publisher.Published); + Assert.Equal("caller.replies", topic); + // Use the framework's GetPayload extension so the camelCase naming + // policy matches what ToEnvelope used on the write side. + var error = envelope.GetPayload()!; + Assert.Equal("nope", error.TaskId); + Assert.Equal("ctx-7", error.ContextId); + Assert.Equal(AgentTaskError.Codes.TaskNotFound, error.Code); + } + + [Fact] + public async Task FallsBackToDefaultResultTopic_WhenEnvelopeHasNoReplyTo() + { + var registry = new InFlightTaskRegistry(); + var publisher = new CapturingPublisher(); + var handler = BuildHandler(registry, publisher); + + await handler.HandleAsync( + new AgentTaskCancelRequest { TaskId = "missing" }, + BuildContext(replyTo: null)); + + var (topic, _) = Assert.Single(publisher.Published); + Assert.Equal("agent.response", topic); // A2AOptions.DefaultResultTopic default + } + + // ── helpers ───────────────────────────────────────────────────────────── + + private static ForagentCancelHandler BuildHandler( + InFlightTaskRegistry registry, + IMessagePublisher publisher) + { + var options = new A2AOptions(); + var identity = new AgentIdentity("Foragent"); + return new ForagentCancelHandler( + registry, publisher, options, identity, + NullLogger.Instance); + } + + private static MessageHandlerContext BuildContext(string? replyTo = null) + { + var envelope = MessageEnvelope.Create( + messageType: typeof(AgentTaskCancelRequest).FullName!, + body: ReadOnlyMemory.Empty, + source: "test", + replyTo: replyTo); + return new MessageHandlerContext + { + Envelope = envelope, + Agent = new AgentIdentity("Foragent"), + Services = new ServiceCollection().BuildServiceProvider(), + CancellationToken = CancellationToken.None + }; + } + + private sealed class CapturingPublisher : IMessagePublisher + { + public List<(string Topic, MessageEnvelope Envelope)> Published { get; } = []; + + public Task PublishAsync(string topic, MessageEnvelope envelope, CancellationToken cancellationToken = default) + { + Published.Add((topic, envelope)); + return Task.CompletedTask; + } + + public ValueTask DisposeAsync() => ValueTask.CompletedTask; + } +} diff --git a/tests/Foragent.Agent.Tests/ForagentTaskHandlerTests.cs b/tests/Foragent.Agent.Tests/ForagentTaskHandlerTests.cs index 970fc16..d33be92 100644 --- a/tests/Foragent.Agent.Tests/ForagentTaskHandlerTests.cs +++ b/tests/Foragent.Agent.Tests/ForagentTaskHandlerTests.cs @@ -13,7 +13,7 @@ public async Task DispatchesToMatchingCapability() var executed = new List(); ICapability alpha = new TrackingCapability("alpha", executed); ICapability beta = new TrackingCapability("beta", executed); - var handler = new ForagentTaskHandler([alpha, beta], NullLogger.Instance); + var handler = new ForagentTaskHandler([alpha, beta], new InFlightTaskRegistry(), NullLogger.Instance); var (context, _) = TestContext.Build(); await handler.HandleTaskAsync(TestContext.Request("beta", "in"), context); @@ -25,7 +25,7 @@ public async Task DispatchesToMatchingCapability() public async Task PublishesWorkingStatus_BeforeDispatch() { ICapability alpha = new TrackingCapability("alpha", []); - var handler = new ForagentTaskHandler([alpha], NullLogger.Instance); + var handler = new ForagentTaskHandler([alpha], new InFlightTaskRegistry(), NullLogger.Instance); var (context, capture) = TestContext.Build(); await handler.HandleTaskAsync(TestContext.Request("alpha", "in"), context); @@ -38,7 +38,7 @@ public async Task PublishesWorkingStatus_BeforeDispatch() public async Task ReturnsErrorResult_ForUnknownSkill() { ICapability alpha = new TrackingCapability("alpha", []); - var handler = new ForagentTaskHandler([alpha], NullLogger.Instance); + var handler = new ForagentTaskHandler([alpha], new InFlightTaskRegistry(), NullLogger.Instance); var (context, _) = TestContext.Build(); var result = await handler.HandleTaskAsync(TestContext.Request("nope", "in"), context); @@ -48,12 +48,80 @@ public async Task ReturnsErrorResult_ForUnknownSkill() Assert.Contains("alpha", TestContext.TextOf(result)); } + [Fact] + public async Task RegistersTask_ForDurationOfExecution_ThenRemovesOnCompletion() + { + var registry = new InFlightTaskRegistry(); + CancellationToken? observedCt = null; + int countDuringExecute = 0; + var capability = new CallbackCapability("probe", (req, ctx) => + { + countDuringExecute = registry.Count; + observedCt = ctx.MessageContext.CancellationToken; + return Task.FromResult(CapabilityResultFactory.Completed(req, "ok")); + }); + var handler = new ForagentTaskHandler([capability], registry, NullLogger.Instance); + var (context, _) = TestContext.Build(); + + await handler.HandleTaskAsync(TestContext.Request("probe", "in"), context); + + Assert.Equal(1, countDuringExecute); + Assert.Equal(0, registry.Count); + Assert.NotNull(observedCt); + } + + [Fact] + public async Task ExternalCancel_CancelsTheTaskToken() + { + // Simulates the cancel handler firing mid-execution: the capability's + // observed CT should be cancelled when TryCancel is called, even + // though the parent message CT is still live. + var registry = new InFlightTaskRegistry(); + var observedCt = new TaskCompletionSource(); + var released = new TaskCompletionSource(); + var capability = new CallbackCapability("probe", async (req, ctx) => + { + observedCt.TrySetResult(ctx.MessageContext.CancellationToken); + await released.Task; + return CapabilityResultFactory.Completed(req, "ok"); + }); + var handler = new ForagentTaskHandler([capability], registry, NullLogger.Instance); + var (context, _) = TestContext.Build(); + var request = TestContext.Request("probe", "in"); + + var task = handler.HandleTaskAsync(request, context); + var ct = await observedCt.Task; + + Assert.False(ct.IsCancellationRequested); + Assert.True(registry.TryCancel(request.TaskId)); + Assert.True(ct.IsCancellationRequested); + + released.SetResult(); + await task; + Assert.Equal(0, registry.Count); + } + + [Fact] + public async Task Registry_IsClearedEven_WhenCapabilityThrows() + { + var registry = new InFlightTaskRegistry(); + var capability = new CallbackCapability("probe", (_, _) => + throw new InvalidOperationException("boom")); + var handler = new ForagentTaskHandler([capability], registry, NullLogger.Instance); + var (context, _) = TestContext.Build(); + + await Assert.ThrowsAsync(() => + handler.HandleTaskAsync(TestContext.Request("probe", "in"), context)); + + Assert.Equal(0, registry.Count); + } + [Fact] public async Task SkillLookup_IsCaseInsensitive() { var executed = new List(); ICapability alpha = new TrackingCapability("alpha", executed); - var handler = new ForagentTaskHandler([alpha], NullLogger.Instance); + var handler = new ForagentTaskHandler([alpha], new InFlightTaskRegistry(), NullLogger.Instance); var (context, _) = TestContext.Build(); await handler.HandleTaskAsync(TestContext.Request("Alpha", "in"), context); @@ -61,6 +129,17 @@ public async Task SkillLookup_IsCaseInsensitive() Assert.Equal(["alpha"], executed); } + private sealed class CallbackCapability( + string skillId, + Func> callback) : ICapability + { + public string SkillId => skillId; + public AgentSkill Skill => new() { Id = skillId, Name = skillId, Description = skillId }; + + public Task ExecuteAsync(AgentTaskRequest request, AgentTaskContext context) => + callback(request, context); + } + private sealed class TrackingCapability(string skillId, List executed) : ICapability { public string SkillId => skillId; diff --git a/tests/Foragent.Agent.Tests/InFlightTaskRegistryTests.cs b/tests/Foragent.Agent.Tests/InFlightTaskRegistryTests.cs new file mode 100644 index 0000000..76cbe5d --- /dev/null +++ b/tests/Foragent.Agent.Tests/InFlightTaskRegistryTests.cs @@ -0,0 +1,71 @@ +using Foragent.Capabilities; +using Xunit; + +namespace Foragent.Agent.Tests; + +public class InFlightTaskRegistryTests +{ + [Fact] + public void Register_ReturnsLinkedToken_ThatFiresWhenCancelled() + { + var registry = new InFlightTaskRegistry(); + using var parent = new CancellationTokenSource(); + + var token = registry.Register("task-1", parent.Token); + + Assert.False(token.IsCancellationRequested); + Assert.True(registry.TryCancel("task-1")); + Assert.True(token.IsCancellationRequested); + } + + [Fact] + public void Register_ReturnsLinkedToken_ThatFiresWhenParentCancels() + { + var registry = new InFlightTaskRegistry(); + using var parent = new CancellationTokenSource(); + + var token = registry.Register("task-1", parent.Token); + + parent.Cancel(); + Assert.True(token.IsCancellationRequested); + } + + [Fact] + public void TryCancel_ReturnsFalse_ForUnknownTask() + { + var registry = new InFlightTaskRegistry(); + + Assert.False(registry.TryCancel("nope")); + } + + [Fact] + public void Remove_DropsRegistration() + { + var registry = new InFlightTaskRegistry(); + using var parent = new CancellationTokenSource(); + registry.Register("task-1", parent.Token); + Assert.Equal(1, registry.Count); + + registry.Remove("task-1"); + + Assert.Equal(0, registry.Count); + Assert.False(registry.TryCancel("task-1")); + } + + [Fact] + public void Register_ReplacesPriorRegistration_AndCancelsIt() + { + // A redelivered message with the same task id should not orphan the + // previous registration's CTS. The first CTS is cancelled so any + // residual work unwinds before the new execution begins. + var registry = new InFlightTaskRegistry(); + using var parent = new CancellationTokenSource(); + + var firstToken = registry.Register("task-1", parent.Token); + var secondToken = registry.Register("task-1", parent.Token); + + Assert.True(firstToken.IsCancellationRequested); + Assert.False(secondToken.IsCancellationRequested); + Assert.Equal(1, registry.Count); + } +}