diff --git a/packages/agent-core/src/tools/builtin/state/todo-list.md b/packages/agent-core/src/tools/builtin/state/todo-list.md index 3dc3c08dc..aeec3986d 100644 --- a/packages/agent-core/src/tools/builtin/state/todo-list.md +++ b/packages/agent-core/src/tools/builtin/state/todo-list.md @@ -19,9 +19,10 @@ Use this tool to maintain a structured TODO list as you work through a multi-ste - If no available tool can move any task forward, tell the user where you are stuck instead of repeatedly re-ordering the same todos. **How to use:** -- Call with `todos: [...]` to replace the full list. Statuses: pending / in_progress / done. +- Call with `todos: [...]` to replace the full list. Statuses: `pending` / `in_progress` / `done`. - Call with no `todos` argument to retrieve the current list without changing it. - Call with `todos: []` to clear the list. +- **Important:** the status must be exactly `done`, not `completed` or `finished`. - Keep titles short and actionable (e.g. "Read session-control.ts", "Add planMode flag to TurnManager"). - Update statuses as you make progress. - When work is underway, keep exactly one task `in_progress`. diff --git a/packages/agent-core/src/tools/builtin/state/todo-list.ts b/packages/agent-core/src/tools/builtin/state/todo-list.ts index 852042e19..9a5ffdb20 100644 --- a/packages/agent-core/src/tools/builtin/state/todo-list.ts +++ b/packages/agent-core/src/tools/builtin/state/todo-list.ts @@ -28,7 +28,7 @@ export const TODO_STORE_KEY = 'todo'; const TODO_LIST_WRITE_REMINDER = 'Ensure that you continue to use the todo list to track progress. 
Mark tasks done immediately after finishing them, and keep exactly one task in_progress when work is underway.'; -export type TodoStatus = 'pending' | 'in_progress' | 'done'; +export type TodoStatus = 'pending' | 'in_progress' | 'done' | 'completed'; export interface TodoItem { readonly title: string; @@ -45,7 +45,9 @@ declare module '../../store' { const TodoItemSchema = z.object({ title: z.string().min(1).describe('Short, actionable title for the todo.'), - status: z.enum(['pending', 'in_progress', 'done']).describe('Current status of the todo.'), + status: z + .preprocess((val) => (val === 'completed' ? 'done' : val), z.enum(['pending', 'in_progress', 'done'])) + .describe('Current status of the todo. Must be exactly one of: pending, in_progress, done. Do NOT use completed or finished.'), }); export interface TodoListInput { @@ -81,6 +83,7 @@ function statusMarker(status: TodoStatus): string { case 'in_progress': return '[in_progress]'; case 'done': + case 'completed': return '[done]'; default: { const _exhaustive: never = status; @@ -133,7 +136,10 @@ export class TodoListTool implements BuiltinTool { private setTodos(todos: readonly TodoItem[]): void { this.store.set( TODO_STORE_KEY, - todos.map((todo) => ({ title: todo.title, status: todo.status })), + todos.map((todo) => ({ + title: todo.title, + status: todo.status === 'completed' ? 'done' : todo.status, + })), ); } } diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.md b/packages/agent-core/src/tools/builtin/web/fetch-url.md index 79cc0ead5..10f37506b 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.md +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.md @@ -1,3 +1,3 @@ -Fetch content from a URL. For an HTML page the main article text is extracted; for a plain-text or markdown response the full body is returned verbatim. The result states which of the two you received, so you can judge how complete it is. Use this when you need to read a specific web page. 
+Fetch content from a URL. For an HTML page the main article text is extracted; for a plain-text or markdown response the full body is returned verbatim; for an image the image is returned directly so the model can view it. The result states which of the three you received, so you can judge how complete it is. Use this when you need to read a specific web page or view an image from a URL. -Only fully-formed public `http`/`https` URLs are supported; other schemes and private or loopback addresses are not fetched. Very large pages may be truncated or refused. +Only fully-formed public `http`/`https` URLs are supported; other schemes and private or loopback addresses are not fetched. Very large pages or images may be truncated or refused. diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.ts b/packages/agent-core/src/tools/builtin/web/fetch-url.ts index 38631a9a4..e2a8cc342 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.ts +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.ts @@ -7,6 +7,7 @@ */ import { z } from 'zod'; +import type { ContentPart } from '@moonshot-ai/kosong'; import type { BuiltinTool } from '../../../agent/tool'; import { ToolAccesses } from '../../../loop/tool-access'; @@ -26,13 +27,23 @@ import DESCRIPTION from './fetch-url.md?raw'; * - `extracted` — the body was an HTML page; only the main article text * was extracted and returned. + * - `image` — the body was an image; its base64-encoded bytes are returned in `imageData`. */ -export type UrlFetchKind = 'passthrough' | 'extracted'; +export type UrlFetchKind = 'passthrough' | 'extracted' | 'image'; + +export interface UrlFetchImageData { + /** Base64-encoded image bytes. */ + base64: string; + /** MIME type of the image (e.g. image/png). */ + mimeType: string; +} export interface UrlFetchResult { - /** The text handed to the LLM. */ + /** The text handed to the LLM, or an empty string when imageData is present. */ content: string; - /** Whether `content` is a verbatim passthrough or extracted main text. 
*/ + /** Whether `content` is a verbatim passthrough, extracted main text, or an image. */ kind: UrlFetchKind; + /** When `kind` is 'image', the image data to be rendered as an image_url content part. */ + imageData?: UrlFetchImageData; } export interface UrlFetcher { @@ -89,7 +99,20 @@ export class FetchURLTool implements BuiltinTool { }: ExecutableToolContext, ): Promise { try { - const { content, kind } = await this.fetcher.fetch(args.url, { toolCallId }); + const { content, kind, imageData } = await this.fetcher.fetch(args.url, { toolCallId }); + + // If the provider returned an image, render it as an image_url content part + // so the model can see it directly. + if (imageData) { + const output: ContentPart[] = [ + { type: 'text', text: `Fetched image from ${args.url}` }, + { + type: 'image_url', + imageUrl: { url: `data:${imageData.mimeType};base64,${imageData.base64}` }, + }, + ]; + return { output, isError: false }; + } if (!content) { return { diff --git a/packages/agent-core/src/tools/providers/local-fetch-url.ts b/packages/agent-core/src/tools/providers/local-fetch-url.ts index af10a8ca3..5efd6f07b 100644 --- a/packages/agent-core/src/tools/providers/local-fetch-url.ts +++ b/packages/agent-core/src/tools/providers/local-fetch-url.ts @@ -161,6 +161,35 @@ export class LocalFetchURLProvider implements UrlFetcher { ); } + const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); + + // Image responses: read the full body into memory as binary and return it as base64-encoded image data + // so the tool can render them directly as image_url content parts. 
+ if (contentType.startsWith('image/')) { + const contentLengthRaw = response.headers.get('content-length'); + if (contentLengthRaw !== null) { + const cl = Number(contentLengthRaw); + if (Number.isFinite(cl) && cl > this.maxBytes) { + throw new Error( + `Response body too large: ${String(cl)} bytes exceeds maxBytes (${String(this.maxBytes)}).`, + ); + } + } + const buffer = await response.arrayBuffer(); + const bytes = Buffer.byteLength(buffer); + if (bytes > this.maxBytes) { + throw new Error( + `Response body too large: ${String(bytes)} bytes exceeds maxBytes (${String(this.maxBytes)}).`, + ); + } + const base64 = Buffer.from(buffer).toString('base64'); + return { + content: '', + kind: 'image', + imageData: { base64, mimeType: contentType.split(';')[0].trim() }, + }; + } + // Reject oversized responses before buffering the full body. const contentLengthRaw = response.headers.get('content-length'); if (contentLengthRaw !== null) { @@ -182,7 +211,6 @@ export class LocalFetchURLProvider implements UrlFetcher { ); } - const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); if (contentType.startsWith('text/plain') || contentType.startsWith('text/markdown')) { return { content: body, kind: 'passthrough' }; } diff --git a/packages/agent-core/src/tools/providers/moonshot-fetch-url.ts b/packages/agent-core/src/tools/providers/moonshot-fetch-url.ts index 825781da4..7c37981de 100644 --- a/packages/agent-core/src/tools/providers/moonshot-fetch-url.ts +++ b/packages/agent-core/src/tools/providers/moonshot-fetch-url.ts @@ -50,9 +50,7 @@ export class MoonshotFetchURLProvider implements UrlFetcher { async fetch(url: string, options?: { toolCallId?: string }): Promise { try { - const content = await this.fetchViaMoonshot(url, options?.toolCallId); - // The service returns text it has already extracted from the page. 
- return { content, kind: 'extracted' }; + return await this.fetchViaMoonshot(url, options?.toolCallId); } catch { // Forward an explicit options object even when the caller passed // none, so downstream consumers always see a defined second arg. @@ -63,7 +61,7 @@ export class MoonshotFetchURLProvider implements UrlFetcher { private async fetchViaMoonshot( url: string, toolCallId: string | undefined, - ): Promise { + ): Promise { const bodyJson = JSON.stringify({ url }); const response = await this.post(bodyJson, toolCallId); @@ -82,7 +80,19 @@ export class MoonshotFetchURLProvider implements UrlFetcher { ); } - return response.text(); + const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); + if (contentType.startsWith('image/')) { + const buffer = await response.arrayBuffer(); + const base64 = Buffer.from(buffer).toString('base64'); + return { + content: '', + kind: 'image', + imageData: { base64, mimeType: contentType.split(';')[0].trim() }, + }; + } + + // The service returns text it has already extracted from the page. 
+ return { content: await response.text(), kind: 'extracted' }; } private async post(bodyJson: string, toolCallId: string | undefined): Promise { diff --git a/packages/agent-core/test/tools/fetch-url.test.ts b/packages/agent-core/test/tools/fetch-url.test.ts index 19d80d168..bca78a471 100644 --- a/packages/agent-core/test/tools/fetch-url.test.ts +++ b/packages/agent-core/test/tools/fetch-url.test.ts @@ -249,8 +249,34 @@ describe('FetchURLTool', () => { expect(toolContentString(result)).toMatch(/due to network error/i); }); + it('returns image data as image_url content parts when fetcher returns an image', async () => { + const fetcher: UrlFetcher = { + fetch: vi.fn().mockResolvedValue({ + content: '', + kind: 'image' as const, + imageData: { base64: 'abc123', mimeType: 'image/png' }, + }), + }; + const tool = new FetchURLTool(fetcher); + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c-img', + args: { url: 'https://example.com/chart.png' }, + signal, + }); + + expect(result.isError).toBe(false); + expect(Array.isArray(result.output)).toBe(true); + const parts = result.output as Array<{ type: string; text?: string; imageUrl?: { url: string } }>; + expect(parts).toHaveLength(2); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('Fetched image'); + expect(parts[1].type).toBe('image_url'); + expect(parts[1].imageUrl?.url).toBe('data:image/png;base64,abc123'); + }); + it('passes through markdown content verbatim instead of running text extraction', async () => { - // py: when the server returns text/markdown, extraction is skipped and + // When the server returns text/markdown, extraction is skipped and // the body is returned as-is with a different status message. The // fetcher signals the bypass via UrlFetchResult.kind = 'passthrough'. 
const markdown = '# Title\n\nThis is a markdown document.\n'; diff --git a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts index 2c0ce931f..4b1661dd6 100644 --- a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts +++ b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts @@ -40,6 +40,24 @@ describe('LocalFetchURLProvider content kind', () => { expect(result).toEqual({ content: '# Title\n\nbody', kind: 'passthrough' }); }); + it('returns image data as base64 for image/* content types', async () => { + const fetchImpl = vi.fn().mockResolvedValue( + new Response(Buffer.from('fake-png-bytes'), { + status: 200, + headers: { 'content-type': 'image/png' }, + }), + ); + const provider = new LocalFetchURLProvider({ fetchImpl }); + + const result = await provider.fetch('https://example.com/chart.png'); + + expect(result.kind).toBe('image'); + expect(result.content).toBe(''); + expect(result.imageData).toBeDefined(); + expect(result.imageData?.mimeType).toBe('image/png'); + expect(result.imageData?.base64).toBe(Buffer.from('fake-png-bytes').toString('base64')); + }); + it('reports HTML bodies as extracted main content', async () => { const html = 'Doc
' + diff --git a/packages/agent-core/test/tools/todo-list.test.ts b/packages/agent-core/test/tools/todo-list.test.ts index 003e14a2e..3d8f4aad3 100644 --- a/packages/agent-core/test/tools/todo-list.test.ts +++ b/packages/agent-core/test/tools/todo-list.test.ts @@ -142,6 +142,23 @@ describe('TodoListTool', () => { ]); }); + it('accepts "completed" as a status and maps it to "done"', async () => { + const { tool, getTodos } = makeTool(); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'call_1', + args: { + todos: [{ title: 'done task', status: 'completed' }], + }, + signal, + }); + + expect(result).toMatchObject({ isError: false }); + expect(result.output).toContain('[done] done task'); + expect(getTodos()).toEqual([{ title: 'done task', status: 'done' }]); + }); + it('renders a done todo with a marker matching the status enum value', async () => { const { tool } = makeTool([{ title: 'shipped', status: 'done' }]);