diff --git a/.changeset/select-tools-progressive-disclosure.md b/.changeset/select-tools-progressive-disclosure.md new file mode 100644 index 000000000..cd457bb12 --- /dev/null +++ b/.changeset/select-tools-progressive-disclosure.md @@ -0,0 +1,6 @@ +--- +"@moonshot-ai/kimi-code": minor +"@moonshot-ai/kimi-code-sdk": minor +--- + +Add experimental progressive tool disclosure (`select_tools`). When the `tool-select` experimental flag is on and the active model declares the `select_tools` capability, MCP tool schemas stay out of the request's top-level `tools[]` (preserving the provider prompt cache); the model loads tools on demand by exact name via the new built-in `select_tools` tool, guided by `<tools_added>/<tools_removed>` announcements. Off by default and inert on models without the capability — behavior is unchanged until a supporting model is catalogued. The SDK additionally maps the `select_tools` capability when building model aliases from a catalog and reports the new flag through `getExperimentalFeatures()`. 
diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts index a310fd905..4b536358b 100644 --- a/packages/agent-core/src/agent/compaction/full.ts +++ b/packages/agent-core/src/agent/compaction/full.ts @@ -11,12 +11,19 @@ import { type GenerateResult, type Message, type TokenUsage, + type Tool, APIContextOverflowError, APIStatusError, createUserMessage, } from '@moonshot-ai/kosong'; import type { Agent } from '..'; +import type { ContextMessage } from '../context/types'; +import { + collectLoadedDynamicToolNames, + DYNAMIC_TOOL_SCHEMA_VARIANT, + stripDynamicToolContext, +} from '../context/dynamic-tools'; import { isAbortError } from '../../loop/errors'; import { retryBackoffDelays, @@ -218,7 +225,9 @@ export class FullCompaction { private estimateRequestTokens(messages: readonly Message[]): number { return ( estimateTokens(this.agent.config.systemPrompt) + - estimateTokensForTools(this.agent.tools.loopTools) + + // Deferred tools never reach the outbound top-level tools[] (kosong + // generate() strips them); keep the estimate aligned with the wire. + estimateTokensForTools(this.agent.tools.loopTools.filter((t) => t.deferred !== true)) + estimateTokensForMessages(messages) ); } @@ -357,6 +366,65 @@ export class FullCompaction { }).trimEnd(); } + /** + * Keep-all rebuild (Phase 1): after compaction folded the history — and the + * dynamic tool schema messages with it — append ONE merged schema message so + * the model keeps calling its loaded tools without re-selecting. Schemas are + * read from the live registry, never copied from the old history, so a + * schema that changed since load self-heals here. Names whose server is + * currently disconnected have no registry schema and are not rebuilt (the + * model re-selects after reconnect); names that survived into the + * post-compaction history (none under today's users+summary rebuild, but + * guarded) are not duplicated. 
The message goes through the normal + * injection-origin append, so estimation and records pick it up as usual. + * + * Budget guard: the rebuilt floor (users + summary + schemas) is the one + * part of the post-compaction context that compaction itself can never + * shrink — if it lands inside the auto-compaction trigger band, every + * following step re-compacts and rebuilds in a loop. Admit schemas (in name + * order) only while the projected context stays within HALF the compaction + * trigger, so normal turn content still fits before the next compaction; + * anything dropped is simply re-selectable on demand, the same degradation + * as a disconnected server. + */ + private rebuildDynamicToolSchemas(activeBefore: ReadonlySet): void { + if (!this.agent.toolSelectEnabled) return; + if (activeBefore.size === 0) return; + const surviving = collectLoadedDynamicToolNames(this.agent.context.history); + const names = [...activeBefore] + .filter((name) => !surviving.has(name)) + .toSorted((a, b) => a.localeCompare(b)); + const candidates = names + .map((name) => this.agent.tools.getMcpToolSchema(name)) + .filter((tool): tool is NonNullable => tool !== undefined); + if (candidates.length === 0) return; + const tools: Tool[] = []; + let projected = this.tokenCountWithPending; + for (const tool of candidates) { + const toolTokens = estimateTokensForTools([tool]); + // shouldCompact is monotonic in usedSize, so doubling the projected + // size checks "within half the trigger" for both trigger branches + // (ratio and reserved-context). 
+ if (this.strategy.shouldCompact((projected + toolTokens) * 2)) break; + tools.push(tool); + projected += toolTokens; + } + if (tools.length < candidates.length) { + this.agent.log.info('trimmed dynamic tool schema rebuild to stay clear of the compaction trigger', { + kept: tools.length, + dropped: candidates.slice(tools.length).map((tool) => tool.name), + }); + } + if (tools.length === 0) return; + this.agent.context.appendMessage({ + role: 'system', + content: [], + toolCalls: [], + tools, + origin: { kind: 'injection', variant: DYNAMIC_TOOL_SCHEMA_VARIANT }, + }); + } + private postProcessSummary(summary: string): string { const storeData = this.agent.tools.storeData(); const todos = (storeData[TODO_STORE_KEY] as readonly TodoItem[] | undefined) ?? []; @@ -374,6 +442,11 @@ export class FullCompaction { const startedAt = Date.now(); const originalHistory = [...this.agent.context.history]; const tokensBefore = estimateTokensForMessages(originalHistory); + // Loaded-tools snapshot BEFORE the rebuild below folds the history away; + // read here so the keep-all schema rebuild after applyCompaction knows + // what was active. (The ledger scans history, which applyCompaction + // replaces.) + const activeDynamicToolsBefore = new Set(this.agent.tools.loadedDynamicToolNames()); let retryCount = 0; try { await this.triggerPreCompactHook(data, tokensBefore, signal); @@ -407,7 +480,15 @@ export class FullCompaction { // Compact the whole history, trimming old messages only when the // summarizer request itself cannot fit. Any trimmed messages are not // covered by the produced summary; `droppedCount` reports that blind spot. - let historyForModel = originalHistory; + // Dynamic-tool protocol context (schema messages, loadable-tools + // announcements) is excluded from the summarizer input entirely: it is + // protocol state, not conversation — summarizing it wastes tokens and + // risks schema text leaking into the summary. 
Zero information loss: + // the post-compaction boundary re-announces the manifest and the + // keep-all rebuild re-carries the schemas. Must happen before project() + // (which strips the origin anchor). `originalHistory` itself stays + // untouched for the prefix-race check and `compactedCount`. + let historyForModel: readonly ContextMessage[] = stripDynamicToolContext(originalHistory); let droppedCount = 0; let overflowShrinkCount = 0; let emptyOrTruncatedShrinkCount = 0; @@ -525,6 +606,7 @@ export class FullCompaction { tokensBefore, droppedCount: droppedCount === 0 ? undefined : droppedCount, }); + this.rebuildDynamicToolSchemas(activeDynamicToolsBefore); // Telemetry keys are snake_case, but the `context.apply_compaction` // record written below keeps its persisted camelCase field names @@ -544,7 +626,15 @@ export class FullCompaction { ? {} : { input_tokens: inputTotal(usage), output_tokens: usage.output }), }); - this.lastCompactedTokenCount = result.tokensAfter; + // Baseline the "nothing new since compaction" guard on the counter + // that includes the schema rebuild appended above. `result.tokensAfter` + // predates the rebuild (and deliberately keeps its persisted + // users+summary semantics — the rebuild message is accounted through + // the pending-estimate tail, so folding it into tokensAfter would + // double-count on both the live and the restore path). A baseline + // below the actual post-compaction floor would let checkAutoCompaction + // re-trigger even though the compacted shape cannot shrink further. 
+ this.lastCompactedTokenCount = this.tokenCountWithPending; return result; } catch (error) { if (isAbortError(error)) return undefined; diff --git a/packages/agent-core/src/agent/context/dynamic-tools.ts b/packages/agent-core/src/agent/context/dynamic-tools.ts new file mode 100644 index 000000000..03c0fba81 --- /dev/null +++ b/packages/agent-core/src/agent/context/dynamic-tools.ts @@ -0,0 +1,151 @@ +/** + * Shared predicates and shaping helpers for select_tools progressive + * disclosure protocol context. + * + * Two kinds of messages carry that protocol state in the history: + * - dynamic tool schema messages: `role: 'system'` messages whose `tools` + * field holds full tool definitions (origin + * `{kind: 'injection', variant: 'dynamic_tool_schema'}` so undo keeps + * them — tool loading is protocol context, not conversation); + * - loadable-tools announcements: `<tools_added>/<tools_removed>` system + * reminders (origin `{kind: 'system_trigger', name: 'loadable-tools'}` so + * undo removes them and the next turn-boundary diff self-heals). + * + * Everything here anchors on `origin` or the `tools` field, so callers that + * need to filter MUST run before `project()` — projection strips `origin`. + */ + +import type { Tool } from '@moonshot-ai/kosong'; + +import type { ContextMessage } from './types'; + +/** Origin variant of an injected dynamic tool schema message (undo keeps it). */ +export const DYNAMIC_TOOL_SCHEMA_VARIANT = 'dynamic_tool_schema'; + +/** Origin name of the loadable-tools diff announcements (undo removes them). */ +export const LOADABLE_TOOLS_TRIGGER = 'loadable-tools'; + +/** True for a message that loads tool definitions (`message.tools` present). */ +export function isDynamicToolSchemaMessage(message: ContextMessage): boolean { + return message.tools !== undefined && message.tools.length > 0; +} + +/** True for a `<tools_added>/<tools_removed>` announcement reminder. 
*/ +export function isLoadableToolsAnnouncement(message: ContextMessage): boolean { + return ( + message.origin?.kind === 'system_trigger' && message.origin.name === LOADABLE_TOOLS_TRIGGER + ); +} + +/** + * Shape a history for a consumer that must not see dynamic-tool protocol + * context: drop the loadable-tools announcements and strip `message.tools` + * (dropping the message entirely when nothing else remains). Two callers: + * - projection for a model without the `select_tools` capability (mid-session + * model switch — the canonical history keeps its shape, only the outgoing + * view changes; announcements would be noise and even reference a + * select_tools tool the model does not have); + * - the compaction summarizer input (schemas and announcements are protocol + * context, not conversation — summarizing them wastes tokens and risks + * leaking schema text into the summary). + * Returns the input array unchanged when there is nothing to strip, so the + * common no-dynamic-tools path costs one scan and no allocation. + */ +export function stripDynamicToolContext( + history: readonly ContextMessage[], +): readonly ContextMessage[] { + if (!history.some((m) => isDynamicToolSchemaMessage(m) || isLoadableToolsAnnouncement(m))) { + return history; + } + const out: ContextMessage[] = []; + for (const message of history) { + if (isLoadableToolsAnnouncement(message)) continue; + if (isDynamicToolSchemaMessage(message)) { + const { tools: _tools, ...rest } = message; + void _tools; + if (rest.content.length === 0 && rest.toolCalls.length === 0) continue; + out.push(rest); + continue; + } + out.push(message); + } + return out; +} + +/** Union of tool names loaded by dynamic tool schema messages in `history`. 
*/ +export function collectLoadedDynamicToolNames( + history: readonly ContextMessage[], +): Set<string> { + const names = new Set<string>(); + for (const message of history) { + if (message.tools === undefined) continue; + for (const tool of message.tools) { + names.add(tool.name); + } + } + return names; +} + +const TOOLS_ADDED_BLOCK = /<tools_added>\n?([\s\S]*?)\n?<\/tools_added>/g; +const TOOLS_REMOVED_BLOCK = /<tools_removed>\n?([\s\S]*?)\n?<\/tools_removed>/g; + +/** + * Fold every loadable-tools announcement in `history`, in order, into the + * currently-announced name set (`tools_removed` deletes, then `tools_added` + * adds — last wins). The announcements are the context's own record of what + * the model has been told is loadable; there is deliberately no separate + * persisted ledger, so undo/compaction/resume all self-heal by re-folding. + */ +export function foldAnnouncedToolNames(history: readonly ContextMessage[]): Set<string> { + const announced = new Set<string>(); + for (const message of history) { + if (!isLoadableToolsAnnouncement(message)) continue; + const text = message.content + .map((part) => (part.type === 'text' ? part.text : '')) + .join(''); + for (const name of matchToolNameBlocks(text, TOOLS_REMOVED_BLOCK)) { + announced.delete(name); + } + for (const name of matchToolNameBlocks(text, TOOLS_ADDED_BLOCK)) { + announced.add(name); + } + } + return announced; +} + +function matchToolNameBlocks(text: string, pattern: RegExp): string[] { + const names: string[] = []; + pattern.lastIndex = 0; + for (const match of text.matchAll(pattern)) { + const body = match[1] ?? ''; + for (const line of body.split('\n')) { + const name = line.trim(); + if (name.length > 0) names.push(name); + } + } + return names; +} + +/** + * Render one diff announcement. Only the blocks with content are emitted; the + * guidance sentence never contains a literal block tag, so `foldAnnouncedToolNames` + * can anchor on the tags without tripping over prose. 
+ */ +export function renderLoadableToolsAnnouncement( + added: readonly string[], + removed: readonly string[], +): string { + const sections: string[] = []; + if (added.length > 0) { + sections.push(`<tools_added>\n${added.join('\n')}\n</tools_added>`); + } + if (removed.length > 0) { + sections.push(`<tools_removed>\n${removed.join('\n')}\n</tools_removed>`); + } + sections.push( + 'Use the select_tools tool with exact names to load full tool definitions before calling them. ' + + 'Names listed as removed are no longer loadable — do not select them. ' + + 'Fold all announcements in this conversation in order to get the current list.', + ); + return sections.join('\n\n'); +} diff --git a/packages/agent-core/src/agent/context/index.ts b/packages/agent-core/src/agent/context/index.ts index 5f6abc2f2..10247c9f5 100644 --- a/packages/agent-core/src/agent/context/index.ts +++ b/packages/agent-core/src/agent/context/index.ts @@ -23,6 +23,7 @@ import { type ProjectOptions, trimTrailingOpenToolExchange, } from './projector'; +import { stripDynamicToolContext } from './dynamic-tools'; import { USER_PROMPT_ORIGIN, type AgentContextData, @@ -31,6 +32,7 @@ import { } from './types'; export * from './types'; +export * from './dynamic-tools'; const TOOL_ERROR_STATUS = 'ERROR: Tool execution failed.'; const TOOL_EMPTY_STATUS = 'Tool output is empty.'; @@ -175,6 +177,7 @@ export class ContextMemory { this._lastAssistantAt = null; this.agent.microCompaction.reset(); this.agent.injection.onContextClear(); + this.agent.tools.onContextCleared(); this.agent.emitStatusUpdated(); } @@ -351,6 +354,7 @@ export class ContextMemory { this.tokenCountCoveredMessageCount = this._history.length; this.agent.microCompaction.reset(); this.agent.injection.onContextCompacted(); + this.agent.tools.onContextCompacted(); this.agent.emitStatusUpdated(); return result; } @@ -376,8 +380,15 @@ export class ContextMemory { } project(messages: readonly ContextMessage[], options?: ProjectOptions): Message[] { + // Shape for the current model BEFORE projecting: a 
model without the + // select_tools capability must not see dynamic-tool schema messages or + // loadable-tools announcements (the canonical history keeps them; only + // this outgoing view is shaped). Must run pre-projection — project() + // strips `origin`, the only anchor for the announcements. setModel never + // rewrites history, so a mid-session switch degrades/upgrades losslessly. + const shaped = this.agent.toolSelectEnabled ? messages : stripDynamicToolContext(messages); const anomalies: ProjectionAnomaly[] = []; - const result = project(this.agent.microCompaction.compact(messages), { + const result = project(this.agent.microCompaction.compact(shaped), { ...options, onAnomaly: (anomaly) => { anomalies.push(anomaly); diff --git a/packages/agent-core/src/agent/context/projector.ts b/packages/agent-core/src/agent/context/projector.ts index 75ee0fbcb..cf77fd6a1 100644 --- a/packages/agent-core/src/agent/context/projector.ts +++ b/packages/agent-core/src/agent/context/projector.ts @@ -390,6 +390,10 @@ function prepareMessageForProjection( }, ); } + // A message that loads tool definitions (`tools` present) is intentionally + // content-free — it must survive the empty-message cleanup or the loaded + // schemas silently vanish from every outgoing request. + if (next.tools !== undefined && next.tools.length > 0) return next; return next.content.length === 0 && next.toolCalls.length === 0 ? 
null : next; } @@ -429,6 +433,7 @@ function stripContextMetadata(message: ContextMessage): Message { toolCalls: message.toolCalls.map((tc) => ({ ...tc })), toolCallId: message.toolCallId, partial: message.partial, + tools: message.tools?.map((tool) => ({ ...tool })), }; } diff --git a/packages/agent-core/src/agent/index.ts b/packages/agent-core/src/agent/index.ts index fb759289d..095647c58 100644 --- a/packages/agent-core/src/agent/index.ts +++ b/packages/agent-core/src/agent/index.ts @@ -222,6 +222,25 @@ export class Agent { } } + /** + * Single decision point for select_tools progressive disclosure. All three + * gates must be open: the model declares the `select_tools` capability, the + * model declares `tool_use` (a model without tool use registering + * select_tools is a contradiction), and the `tool-select` experimental flag + * is on. Every consumer — top-level tools[] convergence, select_tools + * registration, manifest announcements, projection shaping — reads this + * instead of re-deriving the conditions, so degradation is lossless: any + * closed gate reproduces the inline behavior byte-for-byte. 
+ */ + get toolSelectEnabled(): boolean { + const capability = this.config.modelCapabilities; + return ( + capability.select_tools === true && + capability.tool_use && + this.experimentalFlags.enabled('tool-select') + ); + } + get generate(): typeof generate { return async (provider, systemPrompt, tools, history, callbacks, options) => { const { requestLogFields, generateOptions } = splitGenerateOptions(options); diff --git a/packages/agent-core/src/agent/injection/manager.ts b/packages/agent-core/src/agent/injection/manager.ts index 812aa6188..823603ac6 100644 --- a/packages/agent-core/src/agent/injection/manager.ts +++ b/packages/agent-core/src/agent/injection/manager.ts @@ -7,6 +7,7 @@ import { PermissionModeInjector } from './permission-mode'; import { PluginSessionStartInjector } from './plugin-session-start'; import { PlanModeInjector } from './plan-mode'; import { TodoListReminderInjector } from './todo-list'; +import { ToolsDiffInjector } from './tools-diff'; const ACTIVE_BACKGROUND_TASK_GUIDANCE = 'The conversation was compacted, so the earlier messages that started these background tasks are gone — but the tasks are still running from before. Do not start duplicates. Use TaskOutput to fetch a task’s result, TaskList to list them, and TaskStop to cancel one.'; @@ -19,6 +20,9 @@ export class InjectionManager { // near the tail without mutating the prefix, so prompt caching is preserved and // the context does not grow O(n^2) the way per-step injection did. private readonly goalInjector: GoalInjector | null; + // Same boundary cadence, but NOT main-only: subagents announce their own + // loadable tool set. See ToolsDiffInjector for why it also diverges on origin. + private readonly toolsDiffInjector: ToolsDiffInjector; constructor(protected readonly agent: Agent) { this.injectors = [ @@ -28,6 +32,7 @@ export class InjectionManager { new PermissionModeInjector(agent), ]; this.goalInjector = agent.type === 'main' ? 
new GoalInjector(agent) : null; + this.toolsDiffInjector = new ToolsDiffInjector(agent); } async inject(): Promise { @@ -45,8 +50,18 @@ export class InjectionManager { await this.activeGoalInjector()?.inject(); } + /** + * Appends a loadable-tools diff announcement when the loadable set changed. + * Boundary cadence (turn start + post-compaction); no-op when the disclosure + * gate is closed or nothing changed. + */ + injectToolsDiff(): void { + this.toolsDiffInjector.inject(); + } + async injectAfterCompaction(): Promise { await this.injectGoal(); + this.injectToolsDiff(); this.injectActiveBackgroundTasks(); await this.inject(); } diff --git a/packages/agent-core/src/agent/injection/tools-diff.ts b/packages/agent-core/src/agent/injection/tools-diff.ts new file mode 100644 index 000000000..d42e2832f --- /dev/null +++ b/packages/agent-core/src/agent/injection/tools-diff.ts @@ -0,0 +1,51 @@ +/** + * ToolsDiffInjector — maintains the loadable-tools manifest in context via + * turn-boundary diffs (`/` announcements). + * + * Three deliberate departures from the DynamicInjector defaults: + * - Boundary cadence, not per-step: invoked next to injectGoal at turn start + * and after full compaction, never from the per-step inject() loop. Server + * connects/disconnects mid-turn are simply observed at the next boundary + * (a same-turn drop+reconnect nets out — natural debouncing). + * - Not main-only: subagents run their own disclosure and need the manifest. + * - `system_trigger` origin, not `injection`: undo must REMOVE announcements + * (the folded state rolls back with the conversation) and the next + * boundary diff self-heals; `injection` origin would survive undo. + * + * There is no in-memory "announced" ledger: the announcements in history ARE + * the ledger, re-folded on every boundary. Undo, compaction, and resume all + * self-heal for free, at the cost of one cheap origin-anchored scan per turn. 
+ */ + +import type { Agent } from '..'; +import { + foldAnnouncedToolNames, + LOADABLE_TOOLS_TRIGGER, + renderLoadableToolsAnnouncement, +} from '../context/dynamic-tools'; + +export class ToolsDiffInjector { + constructor(protected readonly agent: Agent) {} + + /** + * Recompute the loadable set, fold the announced set from history, and + * append one diff announcement iff they differ. Most turns append nothing, + * keeping the prompt cache warm; the first announcement after session start + * (or after compaction folded the history) is naturally the full list. + */ + inject(): void { + if (!this.agent.toolSelectEnabled) return; + const loadable = this.agent.tools.loadableDynamicToolNames(); + const loadableSet = new Set(loadable); + const announced = foldAnnouncedToolNames(this.agent.context.history); + const added = loadable.filter((name) => !announced.has(name)); + const removed = [...announced] + .filter((name) => !loadableSet.has(name)) + .toSorted((a, b) => a.localeCompare(b)); + if (added.length === 0 && removed.length === 0) return; + this.agent.context.appendSystemReminder( + renderLoadableToolsAnnouncement(added, removed), + { kind: 'system_trigger', name: LOADABLE_TOOLS_TRIGGER }, + ); + } +} diff --git a/packages/agent-core/src/agent/llm-request-logger.ts b/packages/agent-core/src/agent/llm-request-logger.ts index b9ff6b665..1474546c0 100644 --- a/packages/agent-core/src/agent/llm-request-logger.ts +++ b/packages/agent-core/src/agent/llm-request-logger.ts @@ -24,18 +24,22 @@ export class LlmRequestLogger { }): void { const { provider, modelAlias, systemPrompt, tools, messages, fields } = input; const requestLogFields = fields ?? {}; + // This logs the outbound request; deferred tools are stripped by kosong + // generate() before the provider sees them, so mirror that here or the + // toolCount/toolsHash would describe a request that never hits the wire. 
+ const wireTools = tools.filter((tool) => tool.deferred !== true); const config = { provider: provider.name, model: provider.modelName, modelAlias, thinkingEffort: provider.thinkingEffort ?? undefined, systemPromptChars: systemPrompt.length, - toolCount: tools.length, + toolCount: wireTools.length, }; const signature = JSON.stringify({ ...config, systemPromptHash: fingerprint(systemPrompt), - toolsHash: fingerprint(JSON.stringify(toolSignature(tools))), + toolsHash: fingerprint(JSON.stringify(toolSignature(wireTools))), }); if (signature !== this.lastConfigLogSignature) { this.lastConfigLogSignature = signature; diff --git a/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts b/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts index 2f8355ce0..bedbf5815 100644 --- a/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts +++ b/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts @@ -20,6 +20,9 @@ const DEFAULT_APPROVE_TOOLS = new Set([ 'GetGoal', 'SetGoalBudget', 'UpdateGoal', + // Loading a tool definition into context has no side effects on the world; + // executing the loaded tool still goes through its own approval. 
+ 'select_tools', ]); export class DefaultToolApprovePermissionPolicy implements PermissionPolicy { diff --git a/packages/agent-core/src/agent/tool/index.ts b/packages/agent-core/src/agent/tool/index.ts index f0320b7ee..9219713d2 100644 --- a/packages/agent-core/src/agent/tool/index.ts +++ b/packages/agent-core/src/agent/tool/index.ts @@ -3,6 +3,9 @@ import type { ChatProvider, Tool } from '@moonshot-ai/kosong'; import picomatch from 'picomatch'; import type { Agent } from '..'; +import { + collectLoadedDynamicToolNames, +} from '../context/dynamic-tools'; import { makeErrorPayload } from '../../errors'; import type { ExecutableTool, ToolUpdate } from '../../loop'; import { createMcpAuthTool } from '../../mcp/auth-tool'; @@ -42,6 +45,14 @@ export class ToolManager { protected enabledTools: Set = new Set(); /** Glob patterns (e.g. `mcp__*`, `mcp__github__*`) gating which MCP tools the profile exposes. */ private mcpAccessPatterns: string[] = []; + /** + * Defer-window lead for the loaded-tools ledger: names marked loaded whose + * schema message may still sit in the context's deferred queue (an open tool + * exchange). The history itself is the source of truth — + * `loadedDynamicToolNames()` unions this set with a history scan — so + * undo/compaction/resume never need to roll this back. + */ + private readonly pendingLoadedDynamicTools = new Set(); protected readonly store: Partial = {}; private mcpToolStatusUnsubscribe: (() => void) | undefined; @@ -423,12 +434,119 @@ export class ToolManager { return this.mcpAccessPatterns.some((pattern) => picomatch.isMatch(name, pattern)); } + /** + * Whether MCP tools are disclosed progressively: kept out of the top-level + * `tools[]` and loaded on demand via select_tools. Reads the agent's single + * three-gate decision point. 
+ */ + private get progressiveDisclosure(): boolean { + return this.agent.toolSelectEnabled; + } + + /** + * Names the model may select right now: registered MCP tools that pass the + * profile's `mcp__*` access patterns, sorted for byte-stable announcements. + * In disclosure mode the patterns keep their permission-filter role but stop + * feeding the top-level `tools[]`. + */ + loadableDynamicToolNames(): string[] { + return [...this.mcpTools.keys()] + .filter((name) => this.isMcpToolEnabled(name)) + .toSorted((a, b) => a.localeCompare(b)); + } + + /** + * The loaded-tools ledger: every name whose full definition has been + * delivered to the conversation via a `tools`-carrying message, plus the + * defer-window pending set. History is the single source of truth, so the + * ledger survives resume (records replay rebuilds the history), keeps its + * state across undo (schema messages have `injection` origin and are not + * undone), and self-heals after compaction (the rebuild message re-carries + * the schemas). + */ + loadedDynamicToolNames(): ReadonlySet { + const names = collectLoadedDynamicToolNames(this.agent.context.history); + for (const name of this.pendingLoadedDynamicTools) names.add(name); + return names; + } + + /** Mark names loaded ahead of their schema message landing in history. */ + markDynamicToolsLoaded(names: Iterable): void { + for (const name of names) this.pendingLoadedDynamicTools.add(name); + } + + /** + * Context was cleared (`/clear`): every schema message is gone, so the + * defer-window lead must not keep reporting its names as loaded — a stale + * entry would make select_tools answer "Already available" for a tool whose + * definition the model can no longer see. 
+ */ + onContextCleared(): void { + this.pendingLoadedDynamicTools.clear(); + } + + /** + * Compaction rebuilt the history: from here on the keep-all rebuild message + * (which may have trimmed or skipped schemas — budget guard, disconnected + * servers) is the sole truth about what is still loaded. A pending entry + * surviving past this boundary would report a schema the context no longer + * carries as loaded, and re-selecting it would wrongly answer + * "Already available" instead of injecting. + */ + onContextCompacted(): void { + this.pendingLoadedDynamicTools.clear(); + } + + /** + * Plain schema snapshot of a registered MCP tool, read from the live + * registry (never from history) at injection time. + */ + getMcpToolSchema(name: string): Tool | undefined { + const entry = this.mcpTools.get(name); + if (entry === undefined) return undefined; + return { + name: entry.tool.name, + description: entry.tool.description, + parameters: entry.tool.parameters, + }; + } + + /** + * Disclosure-mode wording for a tool-call preflight miss. A loaded tool + * whose server dropped is a different situation from a never-announced name; + * telling them apart stops the model from re-selecting a disconnected tool + * in a loop or treating a transient disconnect as a permanent removal. + */ + missingToolMessage(name: string): string | undefined { + if (!this.progressiveDisclosure) return undefined; + if (!isMcpToolName(name)) return undefined; + const registered = this.mcpTools.has(name) && this.isMcpToolEnabled(name); + const loaded = this.loadedDynamicToolNames().has(name); + if (registered && !loaded) { + return ( + `Tool "${name}" is available but not loaded. ` + + `Call select_tools with ["${name}"] first, then call the tool.` + ); + } + if (!registered && loaded) { + return ( + `Tool "${name}" was loaded but its MCP server is currently disconnected. ` + + 'It may become available again when the server reconnects; do not retry immediately.' 
+ ); + } + return undefined; + } + *toolInfos(): Iterable { for (const tool of this.builtinTools.values()) { yield { name: tool.name, description: tool.description, - active: this.enabledTools.has(tool.name), + // select_tools is always registered but only offered while the + // disclosure gate is open (see loopTools); report that live state. + active: + this.enabledTools.has(tool.name) || + (tool.name === b.SELECT_TOOLS_TOOL_NAME && this.agent.toolSelectEnabled), source: 'builtin', }; } @@ -492,6 +610,13 @@ export class ToolManager { new b.ReadMediaFileTool(kaos, workspace, modelCapabilities, videoUploader), new b.EnterPlanModeTool(this.agent), new b.ExitPlanModeTool(this.agent), + // Registered unconditionally: the tool-select flag can flip at runtime + // (config reload calls setConfigOverrides) without this method + // re-running, so registration must not depend on the gate — exposure + // is decided per step in loopTools instead. Deliberately not + // main-only: subagents run their own disclosure and need select_tools + // just as much. + new b.SelectToolsTool(this.agent), // Goal tools are main-agent-only. goalToolsEnabled && new b.CreateGoalTool(this.agent), goalToolsEnabled && new b.GetGoalTool(this.agent), @@ -595,21 +720,45 @@ export class ToolManager { get loopTools(): readonly ExecutableTool[] { if (this.loopToolsOverride !== undefined) return this.loopToolsOverride; - const mcpNames = [...this.mcpTools.keys()].filter((name) => this.isMcpToolEnabled(name)); + const disclosure = this.progressiveDisclosure; + const enabledMcpNames = [...this.mcpTools.keys()].filter((name) => + this.isMcpToolEnabled(name), + ); + // Progressive disclosure splits "the model can see this tool" from "the + // core can execute it": the top-level request view stays the immutable + // core set + select_tools, while loaded MCP tools join the executable + // table as deferred extras — dispatchable, but stripped from the outbound + // top-level tools[] by kosong generate(). 
With disclosure off this is the + // inline behavior, byte for byte. + const loadedSet = disclosure ? this.loadedDynamicToolNames() : undefined; + const mcpNames = + loadedSet === undefined + ? enabledMcpNames + : enabledMcpNames.filter((name) => loadedSet.has(name)); + const selectToolsName = disclosure ? [b.SELECT_TOOLS_TOOL_NAME] : []; // Mutation goal tools are only offered to the model while a goal exists. const hideGoalMutationTools = this.agent.goal.getGoal().goal === null; - return uniq([...this.enabledTools, ...mcpNames]) + return uniq([...this.enabledTools, ...selectToolsName, ...mcpNames]) .toSorted((a, b) => a.localeCompare(b)) .filter( (name) => !(hideGoalMutationTools && (name === 'SetGoalBudget' || name === 'UpdateGoal')), ) - .map( - (name) => + // select_tools is exposed exclusively through the disclosure gate — a + // profile or setActiveTools listing the name explicitly must not + // surface it in inline mode (it was silently dropped back when + // registration itself was gated; keep that contract). + .filter((name) => disclosure || name !== b.SELECT_TOOLS_TOOL_NAME) + .map((name) => { + const tool = this.userTools.get(name) ?? this.mcpTools.get(name)?.tool ?? - this.builtinTools.get(name), - ) + this.builtinTools.get(name); + if (tool === undefined) return undefined; + // MCP entries are plain object literals, so the spread keeps the + // execution closure intact while adding the wire-strip marker. + return disclosure && this.mcpTools.has(name) ? { ...tool, deferred: true as const } : tool; + }) .filter((tool) => !!tool); } } diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index b79b6f83b..d370774c2 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -676,6 +676,10 @@ export class TurnFlow { // there is no active goal). 
Each goal continuation is its own turn, so this // re-injects the reminder once per turn rather than per step, preserving prompt caching. await this.agent.injection.injectGoal(); + // Announce loadable-tool changes at the same boundary cadence: a diff is + // appended only when the loadable set actually changed, so quiet turns + // keep the prompt cache fully warm. + this.agent.injection.injectToolsDiff(); while (true) { signal.throwIfAborted(); const model = this.agent.config.model; @@ -689,7 +693,10 @@ export class TurnFlow { buildMessages: () => this.agent.context.messages, buildMessagesStrict: () => this.agent.context.strictMessages, dispatchEvent: this.buildDispatchEvent(turnId), - tools: this.agent.tools.loopTools, + // Re-read per step (not snapshotted per turn) so a select_tools load + // is dispatchable on the very next step of the same turn. + buildTools: () => this.agent.tools.loopTools, + describeMissingTool: (name) => this.agent.tools.missingToolMessage(name), log: this.agent.log, maxSteps: loopControl?.maxStepsPerTurn, maxRetryAttempts: loopControl?.maxRetriesPerStep, diff --git a/packages/agent-core/src/flags/registry.ts b/packages/agent-core/src/flags/registry.ts index a8e815e91..8a5fd1b3e 100644 --- a/packages/agent-core/src/flags/registry.ts +++ b/packages/agent-core/src/flags/registry.ts @@ -23,6 +23,15 @@ export const FLAG_DEFINITIONS = [ // default: false, // surface: 'core', // }, + { + id: 'tool-select', + title: 'Tool select (progressive tool disclosure)', + description: + 'Keep MCP tool schemas out of the immutable top-level tools[]; the model loads them on demand via the select_tools tool. Only takes effect on models whose capability catalog declares select_tools.', + env: 'KIMI_CODE_EXPERIMENTAL_TOOL_SELECT', + default: false, + surface: 'core', + }, ] as const satisfies readonly FlagDefinitionInput[]; /** Literal union of registered flag ids. 
*/ diff --git a/packages/agent-core/src/loop/run-turn.ts b/packages/agent-core/src/loop/run-turn.ts index 3ee74cbcd..d6646ca1e 100644 --- a/packages/agent-core/src/loop/run-turn.ts +++ b/packages/agent-core/src/loop/run-turn.ts @@ -42,6 +42,21 @@ export interface RunTurnInput { readonly buildMessagesStrict?: LoopMessageBuilder | undefined; readonly dispatchEvent: LoopEventDispatcher; readonly tools?: readonly ExecutableTool[] | undefined; + /** + * Per-step tool table builder. When present it wins over `tools` and is + * re-invoked before every step, so a tool loaded mid-turn (select_tools + * schema injection) is dispatchable on the very next step and state-driven + * visibility (e.g. goal mutation tools) stays fresh. `tools` remains as the + * static per-turn snapshot for hosts without dynamic tool tables. + */ + readonly buildTools?: (() => readonly ExecutableTool[]) | undefined; + /** + * Optional wording override for a tool call whose name resolves to no + * executable tool. Lets the host distinguish "loaded but its server is + * disconnected" from a plain unknown name under progressive disclosure. + * Returning `undefined` keeps the default "not found" message. + */ + readonly describeMissingTool?: ((name: string) => string | undefined) | undefined; readonly hooks?: LoopHooks | undefined; readonly log?: Logger | undefined; readonly maxSteps?: number | undefined; @@ -60,6 +75,8 @@ export async function runTurn(input: RunTurnInput): Promise { buildMessagesStrict, dispatchEvent, tools, + buildTools, + describeMissingTool, hooks, log, maxSteps, @@ -96,6 +113,12 @@ export async function runTurn(input: RunTurnInput): Promise { dispatchEvent, llm, tools, + // Passed through unresolved: the step evaluates it AFTER beforeStep, + // next to buildMessages, so the tool table and the request messages + // come from the same state (beforeStep can run compaction, which + // trims loaded schemas and rewrites the ledger). 
+ buildTools, + describeMissingTool, hooks, log, currentStep: steps, diff --git a/packages/agent-core/src/loop/tool-call.ts b/packages/agent-core/src/loop/tool-call.ts index 801609df9..396a9110c 100644 --- a/packages/agent-core/src/loop/tool-call.ts +++ b/packages/agent-core/src/loop/tool-call.ts @@ -63,6 +63,8 @@ function abortedToolOutput(toolName: string, signal: AbortSignal): string { export interface ToolCallStepContext { readonly tools?: readonly ExecutableTool[] | undefined; + /** See RunTurnInput.describeMissingTool. */ + readonly describeMissingTool?: ((name: string) => string | undefined) | undefined; readonly hooks?: LoopHooks | undefined; readonly log?: Logger | undefined; readonly dispatchEvent: LoopEventDispatcher; @@ -174,7 +176,7 @@ export async function runToolCallBatch( * events. Validator compilation may populate the local cache. */ function preflightToolCall( - step: Pick, + step: Pick, toolCall: ToolCall, ): PreflightedToolCall { const toolName = toolCall.name; @@ -186,7 +188,7 @@ function preflightToolCall( toolCall, toolName, args: parsedArgs.data, - output: `Tool "${toolName}" not found`, + output: step.describeMissingTool?.(toolName) ?? `Tool "${toolName}" not found`, }; } diff --git a/packages/agent-core/src/loop/turn-step.ts b/packages/agent-core/src/loop/turn-step.ts index 1de563354..dc81ca53a 100644 --- a/packages/agent-core/src/loop/turn-step.ts +++ b/packages/agent-core/src/loop/turn-step.ts @@ -38,6 +38,15 @@ export interface ExecuteLoopStepDeps { readonly dispatchEvent: LoopEventDispatcher; readonly llm: LLM; readonly tools?: readonly ExecutableTool[] | undefined; + /** + * Per-step tool table builder; wins over the static `tools` snapshot. + * Evaluated after `beforeStep`, next to `buildMessages`, so the executable + * table and the request messages reflect the same state — `beforeStep` can + * run compaction, which trims loaded dynamic tool schemas. 
+ */ + readonly buildTools?: (() => readonly ExecutableTool[]) | undefined; + /** See RunTurnInput.describeMissingTool. */ + readonly describeMissingTool?: ((name: string) => string | undefined) | undefined; readonly hooks?: LoopHooks | undefined; readonly log?: Logger | undefined; readonly currentStep: number; @@ -57,6 +66,8 @@ export async function executeLoopStep(deps: ExecuteLoopStepDeps): Promise<{ dispatchEvent, llm, tools, + buildTools, + describeMissingTool, hooks, log, currentStep, @@ -78,13 +89,19 @@ export async function executeLoopStep(deps: ExecuteLoopStepDeps): Promise<{ signal.throwIfAborted(); + // Resolve the tool table AFTER beforeStep so it reflects the same state as + // the messages built below (beforeStep can run compaction, which trims + // loaded dynamic tool schemas out of the context and the ledger — a table + // captured earlier would still dispatch a tool the model no longer has). + const stepTools = buildTools !== undefined ? buildTools() : tools; const messages = await buildMessages(); signal.throwIfAborted(); const stepUuid = randomUUID(); const step: ToolCallStepContext = { - tools, + tools: stepTools, + describeMissingTool, hooks, log, dispatchEvent, @@ -104,7 +121,7 @@ export async function executeLoopStep(deps: ExecuteLoopStepDeps): Promise<{ const chatParams: LLMChatParams = { messages, - tools: tools ?? [], + tools: stepTools ?? 
[], signal, ...createChatStreamingCallbacks({ dispatchEvent, diff --git a/packages/agent-core/src/session/provider-manager.ts b/packages/agent-core/src/session/provider-manager.ts index 245ec3275..4c93fbe3f 100644 --- a/packages/agent-core/src/session/provider-manager.ts +++ b/packages/agent-core/src/session/provider-manager.ts @@ -238,6 +238,10 @@ function resolveModelCapabilities( thinking: declared.has('thinking') || declared.has('always_thinking') || detected.thinking, tool_use: declared.has('tool_use') || detected.tool_use, max_context_tokens: alias.maxContextSize, + // Message-level tool declarations (select_tools progressive disclosure). + // Every field here must be merged explicitly — a capability registered in + // kosong that is not forwarded here never reaches the agent. + select_tools: declared.has('select_tools') || detected.select_tools === true, }; } diff --git a/packages/agent-core/src/tools/builtin/index.ts b/packages/agent-core/src/tools/builtin/index.ts index 744f90c6f..b5c8e3bbd 100644 --- a/packages/agent-core/src/tools/builtin/index.ts +++ b/packages/agent-core/src/tools/builtin/index.ts @@ -20,6 +20,7 @@ export * from './goal/set-goal-budget'; export * from './goal/update-goal'; export * from './planning/enter-plan-mode'; export * from './planning/exit-plan-mode'; +export * from './select-tools'; export * from './shell/bash'; export * from './state/todo-list'; export * from './web/fetch-url'; diff --git a/packages/agent-core/src/tools/builtin/select-tools.ts b/packages/agent-core/src/tools/builtin/select-tools.ts new file mode 100644 index 000000000..f4a610b40 --- /dev/null +++ b/packages/agent-core/src/tools/builtin/select-tools.ts @@ -0,0 +1,129 @@ +/** + * select_tools — the load-by-exact-name primitive of progressive tool + * disclosure. 
MCP tool schemas stay out of the immutable top-level `tools[]`; + * the model reads the `/` announcements, calls + * this tool with exact names, and the full definitions are appended to the + * conversation as a `role: 'system'` message carrying `tools` (the + * `messages[].tools` wire contract). Loaded tools become executable the very + * next step: the loop re-reads the executable tool table per step. + * + * Always registered, but offered only while `agent.toolSelectEnabled` holds + * (capability × flag gate); deliberately NOT main-agent-only — subagents too. + * + * Concurrency: no `accesses` is declared, so the execution defaults to + * `ToolAccesses.all()` and is serialized against every other tool in the same + * batch. That is a design constraint, not an accident — two select_tools + * calls settling concurrently could double-inject the same schema message. + */ + +import { z } from 'zod'; + +import type { Agent } from '#/agent'; +import { DYNAMIC_TOOL_SCHEMA_VARIANT } from '../../agent/context/dynamic-tools'; +import type { BuiltinTool } from '../../agent/tool/types'; +import type { ToolExecution } from '../../loop/types'; +import { toInputJsonSchema } from '../support/input-schema'; + +export const SELECT_TOOLS_TOOL_NAME = 'select_tools'; + +export const SelectToolsInputSchema = z + .object({ + names: z + .array(z.string()) + .min(1) + .describe('Exact tool names to load, taken from the latest announced tool list.'), + }) + .strict(); + +export type SelectToolsInput = z.infer; + +// The description sits inside the immutable top-level tools[] — it must stay +// byte-stable across the session. Anything that varies with the tool set +// (names, counts) belongs in the announcements, never here. +const DESCRIPTION = + 'Load one or more tools by name so you can call them. ' + + 'All available tool names are listed in the / announcements ' + + 'in the system context — fold them in order to get the current list. 
' + + 'Pass the exact name(s) you need; their full definitions become available immediately, ' + + 'so you can call them directly in your next tool call.'; + +export class SelectToolsTool implements BuiltinTool { + readonly name = SELECT_TOOLS_TOOL_NAME; + readonly description: string = DESCRIPTION; + readonly parameters: Record = toInputJsonSchema(SelectToolsInputSchema); + + constructor(private readonly agent: Agent) {} + + resolveExecution(args: SelectToolsInput): ToolExecution { + return { + description: `Loading ${args.names.join(', ')}`, + approvalRule: this.name, + execute: async () => { + // The tool is registered unconditionally (the flag can flip at + // runtime without a builtin refresh) but only offered while the + // disclosure gate is open; guard the tiny window where the gate + // closed between table build and execution. + if (!this.agent.toolSelectEnabled) { + return { + output: 'select_tools is not available for the current model.', + isError: true, + }; + } + const manager = this.agent.tools; + const loadable = new Set(manager.loadableDynamicToolNames()); + const loaded = manager.loadedDynamicToolNames(); + + // Mixed input settles per name: hits load, known-loaded report, and + // unknowns error individually — never all-or-nothing, so the model + // does not re-request the whole batch over one typo. + const toLoad: string[] = []; + const alreadyAvailable: string[] = []; + const unknown: string[] = []; + for (const name of new Set(args.names)) { + if (loaded.has(name)) { + alreadyAvailable.push(name); + } else if (loadable.has(name)) { + toLoad.push(name); + } else { + unknown.push(name); + } + } + + if (toLoad.length > 0) { + // Schemas are read from the live registry at injection time and + // sorted by name for byte-stable output. 
History is never used as a + // schema source; an already-loaded name whose registry schema has + // since changed is NOT re-injected (no runtime last-wins reliance) — + // the next compaction rebuild, or a re-select once the name is no + // longer loaded, picks up the new schema. + toLoad.sort((a, b) => a.localeCompare(b)); + const tools = toLoad + .map((name) => manager.getMcpToolSchema(name)) + .filter((tool): tool is NonNullable => tool !== undefined); + this.agent.context.appendMessage({ + role: 'system', + content: [], + toolCalls: [], + tools, + origin: { kind: 'injection', variant: DYNAMIC_TOOL_SCHEMA_VARIANT }, + }); + // The schema message may sit in the deferred queue until this tool + // exchange closes; the pending mark keeps the ledger ahead of the + // history inside that window so a same-step re-select is a no-op. + manager.markDynamicToolsLoaded(toLoad); + } + + const lines: string[] = []; + if (toLoad.length > 0) lines.push(`Loaded: ${toLoad.join(', ')}`); + if (alreadyAvailable.length > 0) { + lines.push(`Already available: ${alreadyAvailable.join(', ')}`); + } + for (const name of unknown) { + lines.push(`Unknown tool: ${name}. Pick from the latest announced tools list.`); + } + const isError = toLoad.length === 0 && alreadyAvailable.length === 0; + return isError ? 
{ output: lines.join('\n'), isError } : { output: lines.join('\n') }; + }, + }; + } +} diff --git a/packages/agent-core/src/utils/tokens.ts b/packages/agent-core/src/utils/tokens.ts index 845e2024b..cb4a36662 100644 --- a/packages/agent-core/src/utils/tokens.ts +++ b/packages/agent-core/src/utils/tokens.ts @@ -11,6 +11,7 @@ interface TokenEstimatableMessage { readonly role: string; readonly content: readonly ContentPart[]; readonly toolCalls?: readonly { readonly name: string; readonly arguments: unknown }[]; + readonly tools?: readonly Tool[] | undefined; } const messageTokenEstimateCache = new WeakMap(); @@ -68,6 +69,12 @@ export function estimateTokensForMessage(message: TokenEstimatableMessage): numb total += estimateTokens(JSON.stringify(call.arguments)); } } + // Dynamic tool schema messages carry full tool definitions; without this the + // injected schemas are invisible to every compaction budget and the context + // overflows before compaction ever triggers. + if (message.tools !== undefined) { + total += estimateTokensForTools(message.tools); + } messageTokenEstimateCache.set(message, total); return total; } diff --git a/packages/agent-core/test/agent/config.test.ts b/packages/agent-core/test/agent/config.test.ts index e4e562489..71a4ad165 100644 --- a/packages/agent-core/test/agent/config.test.ts +++ b/packages/agent-core/test/agent/config.test.ts @@ -186,7 +186,12 @@ describe('Agent config', () => { [emit] agent.status.updated { "model": "changed-model", "contextTokens": 50, "maxContextTokens": 1000000, "contextUsage": 0.00005, "planMode": false, "swarmMode": false, "permission": "manual", "usage": { "byModel": { "mock-model": { "inputOther": 46, "output": 36, "inputCacheRead": 0, "inputCacheCreation": 0 } }, "total": { "inputOther": 46, "output": 36, "inputCacheRead": 0, "inputCacheCreation": 0 }, "currentTurn": { "inputOther": 46, "output": 36, "inputCacheRead": 0, "inputCacheCreation": 0 } } } [emit] turn.ended { "turnId": 0, "reason": "completed" } 
`); + // Model and system prompt keep the turn-start snapshot for the rest of the + // turn. The tool table is deliberately different: it is re-read per step + // (so select_tools loads and goal-state visibility apply mid-turn), which + // makes the mid-turn setActiveTools([]) visible from step 2 on. expect(ctx.lastLlmInput()).toMatchInlineSnapshot(` + tools: [] messages: assistant: text "I will run Bash." calls call_bash:Bash { "command": "printf original-result", "timeout": 60 } @@ -212,7 +217,6 @@ describe('Agent config', () => { `); expect(ctx.lastLlmInput()).toMatchInlineSnapshot(` system: "Changed system prompt." - tools: [] messages: assistant: text "Still using the original turn config." diff --git a/packages/agent-core/test/agent/dynamic-tools.test.ts b/packages/agent-core/test/agent/dynamic-tools.test.ts new file mode 100644 index 000000000..76abdc3de --- /dev/null +++ b/packages/agent-core/test/agent/dynamic-tools.test.ts @@ -0,0 +1,125 @@ +import { describe, expect, it } from 'vitest'; + +import { + collectLoadedDynamicToolNames, + foldAnnouncedToolNames, + isDynamicToolSchemaMessage, + isLoadableToolsAnnouncement, + LOADABLE_TOOLS_TRIGGER, + renderLoadableToolsAnnouncement, + stripDynamicToolContext, +} from '../../src/agent/context/dynamic-tools'; +import type { ContextMessage } from '../../src/agent/context/types'; + +function announcement(added: readonly string[], removed: readonly string[]): ContextMessage { + // Mirrors ContextMemory.appendSystemReminder: reminder text wrapped in + // tags, origin anchored on system_trigger/loadable-tools. 
+ const text = `\n${renderLoadableToolsAnnouncement(added, removed).trim()}\n`; + return { + role: 'user', + content: [{ type: 'text', text }], + toolCalls: [], + origin: { kind: 'system_trigger', name: LOADABLE_TOOLS_TRIGGER }, + }; +} + +function schemaMessage(names: readonly string[]): ContextMessage { + return { + role: 'system', + content: [], + toolCalls: [], + tools: names.map((name) => ({ name, description: `${name} desc`, parameters: {} })), + origin: { kind: 'injection', variant: 'dynamic_tool_schema' }, + }; +} + +function userMessage(text: string): ContextMessage { + return { role: 'user', content: [{ type: 'text', text }], toolCalls: [] }; +} + +describe('foldAnnouncedToolNames', () => { + it('folds added and removed blocks in order (removed first within a message)', () => { + const history = [ + announcement(['a', 'b'], []), + userMessage('hello'), + announcement(['c'], ['a']), + ]; + expect([...foldAnnouncedToolNames(history)].toSorted()).toEqual(['b', 'c']); + }); + + it('re-adding a removed name wins (last announcement wins)', () => { + const history = [announcement(['a'], []), announcement([], ['a']), announcement(['a'], [])]; + expect([...foldAnnouncedToolNames(history)]).toEqual(['a']); + }); + + it('ignores messages without the loadable-tools origin, even with matching text', () => { + const impostor: ContextMessage = { + role: 'user', + content: [{ type: 'text', text: '\nmallory\n' }], + toolCalls: [], + }; + expect(foldAnnouncedToolNames([impostor]).size).toBe(0); + }); + + it('is not confused by the guidance sentence in the same message', () => { + // The rendered guidance mentions select_tools and removal semantics in + // prose; folding must only read the tagged blocks. 
+ const history = [announcement(['x'], ['y'])]; + expect([...foldAnnouncedToolNames(history)]).toEqual(['x']); + }); +}); + +describe('renderLoadableToolsAnnouncement', () => { + it('emits only the non-empty blocks', () => { + const addedOnly = renderLoadableToolsAnnouncement(['a'], []); + expect(addedOnly).toContain('\na\n'); + expect(addedOnly).not.toContain(''); + + const removedOnly = renderLoadableToolsAnnouncement([], ['b']); + expect(removedOnly).toContain('\nb\n'); + expect(removedOnly).not.toContain(''); + }); +}); + +describe('stripDynamicToolContext', () => { + it('returns the identical array when there is nothing to strip', () => { + const history = [userMessage('a'), userMessage('b')]; + expect(stripDynamicToolContext(history)).toBe(history); + }); + + it('drops announcements and content-free schema messages, keeps everything else', () => { + const history = [ + userMessage('a'), + announcement(['t'], []), + schemaMessage(['t']), + userMessage('b'), + ]; + const stripped = stripDynamicToolContext(history); + expect(stripped.map((m) => m.role)).toEqual(['user', 'user']); + }); + + it('strips only the tools field from a message that also has content', () => { + const mixed: ContextMessage = { + ...schemaMessage(['t']), + content: [{ type: 'text', text: 'note' }], + }; + const stripped = stripDynamicToolContext([mixed]); + expect(stripped).toHaveLength(1); + expect(stripped[0]!.tools).toBeUndefined(); + expect(stripped[0]!.content).toEqual([{ type: 'text', text: 'note' }]); + }); +}); + +describe('predicates and ledger scan', () => { + it('classifies schema messages and announcements by their anchors', () => { + expect(isDynamicToolSchemaMessage(schemaMessage(['t']))).toBe(true); + expect(isDynamicToolSchemaMessage(userMessage('x'))).toBe(false); + expect(isLoadableToolsAnnouncement(announcement(['t'], []))).toBe(true); + expect(isLoadableToolsAnnouncement(userMessage('x'))).toBe(false); + }); + + it('collects the union of loaded names across schema 
messages', () => { + const history = [schemaMessage(['a', 'b']), userMessage('x'), schemaMessage(['b', 'c'])]; + expect([...collectLoadedDynamicToolNames(history)].toSorted()).toEqual(['a', 'b', 'c']); + }); +}); diff --git a/packages/agent-core/test/agent/harness/agent.ts b/packages/agent-core/test/agent/harness/agent.ts index df33507a8..f3ccef4c3 100644 --- a/packages/agent-core/test/agent/harness/agent.ts +++ b/packages/agent-core/test/agent/harness/agent.ts @@ -1094,6 +1094,7 @@ function capabilityNames(capabilities: ModelCapability | undefined): string[] { capabilities.audio_in ? 'audio_in' : undefined, capabilities.thinking ? 'thinking' : undefined, capabilities.tool_use ? 'tool_use' : undefined, + capabilities.select_tools === true ? 'select_tools' : undefined, ].filter((capability): capability is string => capability !== undefined); } diff --git a/packages/agent-core/test/agent/harness/scripted-generate.ts b/packages/agent-core/test/agent/harness/scripted-generate.ts index fe0aab0c1..08432dd11 100644 --- a/packages/agent-core/test/agent/harness/scripted-generate.ts +++ b/packages/agent-core/test/agent/harness/scripted-generate.ts @@ -55,11 +55,16 @@ export function createScriptedGenerate() { const input = normalizeGenerateInput({ systemPrompt, - tools: tools.map(({ name, description, parameters }) => ({ - name, - description, - parameters, - })), + // Mirror kosong generate(): deferred tools are stripped before the + // provider builds the request, so the recorded "wire" tools must not + // contain them either. 
+ tools: tools + .filter((tool) => tool.deferred !== true) + .map(({ name, description, parameters }) => ({ + name, + description, + parameters, + })), history: structuredClone(history), }); calls.push(input); diff --git a/packages/agent-core/test/agent/tool-select.e2e.test.ts b/packages/agent-core/test/agent/tool-select.e2e.test.ts new file mode 100644 index 000000000..2736313e5 --- /dev/null +++ b/packages/agent-core/test/agent/tool-select.e2e.test.ts @@ -0,0 +1,633 @@ +/** + * select_tools progressive disclosure — end-to-end agent tests. + * + * Uses the scripted-generate harness: real ToolManager/turn loop/context, fake + * LLM. The three-condition gate (model capability.select_tools × + * capability.tool_use × `tool-select` flag) is driven through the alias + * capability declarations and an injected FlagResolver. + * + * The first block pins the gate-closed regression baseline (S0): with any + * gate closed, the outbound request keeps the inline shape byte-for-byte. + */ + +import { describe, expect, it } from 'vitest'; + +import type { ToolCall } from '@moonshot-ai/kosong'; + +import { + foldAnnouncedToolNames, + isLoadableToolsAnnouncement, +} from '../../src/agent/context/dynamic-tools'; +import { ToolManager } from '../../src/agent/tool'; +import type { Agent } from '../../src/agent'; +import { FLAG_DEFINITIONS, FlagResolver } from '../../src/flags'; +import type { MCPClient } from '../../src/mcp/types'; +import { estimateTokensForMessage } from '../../src/utils/tokens'; +import { testAgent, type TestAgentContext } from './harness/agent'; + +const DISCLOSURE_PROVIDER = { type: 'kimi', apiKey: 'test-key', model: 'select-capable-model' } as const; +const DISCLOSURE_CAPABILITIES = { + image_in: false, + video_in: false, + audio_in: false, + thinking: false, + tool_use: true, + max_context_tokens: 256_000, + select_tools: true, +} as const; + +const INLINE_PROVIDER = { type: 'kimi', apiKey: 'test-key', model: 'inline-model' } as const; +const 
INLINE_CAPABILITIES = { + image_in: false, + video_in: false, + audio_in: false, + thinking: false, + tool_use: true, + max_context_tokens: 256_000, +} as const; + +const GRAFANA_TOOL = 'mcp__grafana__query_range'; + +function toolSelectFlagOn(): FlagResolver { + return new FlagResolver({}, FLAG_DEFINITIONS, { 'tool-select': true }); +} + +/** Empty env so an ambient KIMI_CODE_EXPERIMENTAL_FLAG cannot force flags on. */ +function toolSelectFlagOff(): FlagResolver { + return new FlagResolver({}, FLAG_DEFINITIONS, {}); +} + +function grafanaClient(callLog: Array<[string, unknown]> = []): MCPClient { + return { + async listTools() { + return [ + { + name: 'query_range', + description: 'Query a metrics range', + inputSchema: { + type: 'object', + properties: { query: { type: 'string' } }, + required: ['query'], + }, + }, + ]; + }, + async callTool(name, args) { + callLog.push([name, args]); + return { content: [{ type: 'text', text: 'error_rate=0.02' }], isError: false }; + }, + }; +} + +async function registerGrafana( + ctx: TestAgentContext, + callLog: Array<[string, unknown]> = [], +): Promise { + const client = grafanaClient(callLog); + const defs = await client.listTools(); + ctx.agent.tools.registerMcpServer( + 'grafana', + client, + defs.map((d) => ({ + name: d.name, + description: d.description, + parameters: d.inputSchema as Record, + })), + ); +} + +async function disclosureAgent( + callLog: Array<[string, unknown]> = [], +): Promise { + const ctx = testAgent({ experimentalFlags: toolSelectFlagOn() }); + ctx.configure({ + tools: ['Read', 'mcp__*'], + provider: DISCLOSURE_PROVIDER, + modelCapabilities: DISCLOSURE_CAPABILITIES, + }); + await registerGrafana(ctx, callLog); + return ctx; +} + +function selectCall(id: string, names: readonly string[]): ToolCall { + return { + type: 'function', + id, + name: 'select_tools', + arguments: JSON.stringify({ names }), + }; +} + +function mcpCall(id: string, query: string): ToolCall { + return { + type: 'function', + id, 
+ name: GRAFANA_TOOL, + arguments: JSON.stringify({ query }), + }; +} + +async function runTurn(ctx: TestAgentContext, prompt: string): Promise { + await ctx.rpc.prompt({ input: [{ type: 'text', text: prompt }] }); + await ctx.untilTurnEnd(); +} + +function historyText(ctx: TestAgentContext): string { + return ctx.agent.context.history + .flatMap((m) => m.content) + .map((part) => (part.type === 'text' ? part.text : '')) + .join('\n'); +} + +function toolResultTexts(ctx: TestAgentContext): string[] { + return ctx.agent.context.history + .filter((m) => m.role === 'tool') + .map((m) => m.content.map((p) => (p.type === 'text' ? p.text : '')).join('')); +} + +function schemaMessages(ctx: TestAgentContext) { + return ctx.agent.context.history.filter((m) => m.tools !== undefined && m.tools.length > 0); +} + +describe('gate closed — inline regression baseline (S0)', () => { + it('without the flag, MCP tools stay inline and nothing about disclosure appears', async () => { + const ctx = testAgent({ experimentalFlags: toolSelectFlagOff() }); + ctx.configure({ + tools: ['Read', 'mcp__*'], + provider: DISCLOSURE_PROVIDER, + modelCapabilities: DISCLOSURE_CAPABILITIES, + }); + await registerGrafana(ctx); + + const loopNames = ctx.agent.tools.loopTools.map((t) => t.name); + expect(loopNames).toContain(GRAFANA_TOOL); + expect(loopNames).not.toContain('select_tools'); + expect(ctx.agent.tools.loopTools.every((t) => t.deferred !== true)).toBe(true); + + ctx.mockNextResponse({ type: 'text', text: 'hello' }); + await runTurn(ctx, 'hi'); + + const call = ctx.llmCalls[0]!; + expect(call.tools.map((t) => t.name)).toContain(GRAFANA_TOOL); + expect(call.tools.map((t) => t.name)).not.toContain('select_tools'); + expect(historyText(ctx)).not.toContain(''); + }); + + it('without the model capability, the flag alone changes nothing', async () => { + const ctx = testAgent({ experimentalFlags: toolSelectFlagOn() }); + ctx.configure({ + tools: ['Read', 'mcp__*'], + provider: INLINE_PROVIDER, + 
modelCapabilities: INLINE_CAPABILITIES, + }); + await registerGrafana(ctx); + + const loopNames = ctx.agent.tools.loopTools.map((t) => t.name); + expect(loopNames).toContain(GRAFANA_TOOL); + expect(loopNames).not.toContain('select_tools'); + + ctx.mockNextResponse({ type: 'text', text: 'hello' }); + await runTurn(ctx, 'hi'); + expect(ctx.llmCalls[0]!.tools.map((t) => t.name)).toContain(GRAFANA_TOOL); + expect(historyText(ctx)).not.toContain(''); + }); +}); + +describe('disclosure mode — top-level convergence and announcements', () => { + it('keeps MCP tools out of the top level, registers select_tools, and announces the manifest', async () => { + const ctx = await disclosureAgent(); + + // Executable table before any select: core + select_tools, no MCP names. + const loopNames = ctx.agent.tools.loopTools.map((t) => t.name); + expect(loopNames).toContain('select_tools'); + expect(loopNames).toContain('Read'); + expect(loopNames.some((n) => n.startsWith('mcp__'))).toBe(false); + + ctx.mockNextResponse({ type: 'text', text: 'hello' }); + await runTurn(ctx, 'hi'); + + // Wire top-level: no MCP schema, select_tools present. + const call = ctx.llmCalls[0]!; + const wireNames = call.tools.map((t) => t.name); + expect(wireNames).toContain('select_tools'); + expect(wireNames.some((n) => n.startsWith('mcp__'))).toBe(false); + + // First boundary announces the full loadable list; the request saw it. 
+ const announcements = ctx.agent.context.history.filter(isLoadableToolsAnnouncement); + expect(announcements).toHaveLength(1); + expect(historyText(ctx)).toContain(`\n${GRAFANA_TOOL}\n`); + expect(JSON.stringify(call.history)).toContain(GRAFANA_TOOL); + }); + + it('does not re-announce when the loadable set is unchanged', async () => { + const ctx = await disclosureAgent(); + ctx.mockNextResponse({ type: 'text', text: 'one' }); + await runTurn(ctx, 'first'); + ctx.mockNextResponse({ type: 'text', text: 'two' }); + await runTurn(ctx, 'second'); + + expect(ctx.agent.context.history.filter(isLoadableToolsAnnouncement)).toHaveLength(1); + }); + + it('announces tools_removed at the next boundary after a server disconnects', async () => { + const ctx = await disclosureAgent(); + ctx.mockNextResponse({ type: 'text', text: 'one' }); + await runTurn(ctx, 'first'); + + ctx.agent.tools.unregisterMcpServer('grafana'); + ctx.mockNextResponse({ type: 'text', text: 'two' }); + await runTurn(ctx, 'second'); + + expect(historyText(ctx)).toContain(`\n${GRAFANA_TOOL}\n`); + expect(foldAnnouncedToolNames(ctx.agent.context.history).size).toBe(0); + }); +}); + +describe('disclosure mode — select_tools three branches and dispatch', () => { + it('loads a schema, makes it dispatchable on the next step of the same turn', async () => { + const callLog: Array<[string, unknown]> = []; + const ctx = await disclosureAgent(callLog); + await ctx.rpc.setPermission({ mode: 'yolo' }); + + ctx.mockNextResponse({ type: 'text', text: 'loading' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'querying' }, mcpCall('call-2', 'errors')); + ctx.mockNextResponse({ type: 'text', text: 'done' }); + await runTurn(ctx, 'check the error rate'); + + // Three-branch result: loaded. + expect(toolResultTexts(ctx)).toContainEqual(`Loaded: ${GRAFANA_TOOL}`); + + // The schema message landed after the closed select exchange, carrying the + // registry schema. 
+ const schemas = schemaMessages(ctx); + expect(schemas).toHaveLength(1); + expect(schemas[0]!.tools!.map((t) => t.name)).toEqual([GRAFANA_TOOL]); + expect(schemas[0]!.content).toEqual([]); + + // Step 2 dispatched the freshly loaded tool through the real MCP client. + expect(callLog).toEqual([['query_range', { query: 'errors' }]]); + expect(toolResultTexts(ctx)).toContainEqual('error_rate=0.02'); + + // The step-2 request carried the schema message but kept the top level clean. + const step2 = ctx.llmCalls[1]!; + expect(step2.tools.map((t) => t.name).some((n) => n.startsWith('mcp__'))).toBe(false); + expect(step2.history.some((m) => m.tools !== undefined && m.tools.length > 0)).toBe(true); + }); + + it('reports Already available without re-injecting, and Unknown per name', async () => { + const ctx = await disclosureAgent(); + + ctx.mockNextResponse({ type: 'text', text: 'loading' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load it'); + + // Mixed input: one already loaded, one unknown — settled per name. + ctx.mockNextResponse( + { type: 'text', text: 'again' }, + selectCall('call-2', [GRAFANA_TOOL, 'mcp__nope__missing']), + ); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load again'); + + const results = toolResultTexts(ctx); + expect(results).toContainEqual( + `Already available: ${GRAFANA_TOOL}\n` + + 'Unknown tool: mcp__nope__missing. Pick from the latest announced tools list.', + ); + // No duplicate schema injection. 
+ expect(schemaMessages(ctx)).toHaveLength(1); + }); + + it('errors when every requested name is unknown', async () => { + const ctx = await disclosureAgent(); + ctx.mockNextResponse( + { type: 'text', text: 'try' }, + selectCall('call-1', ['mcp__ghost__tool']), + ); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load ghost'); + + const errorResult = ctx.agent.context.history.find( + (m) => m.role === 'tool' && m.isError === true, + ); + expect(errorResult).toBeDefined(); + expect(schemaMessages(ctx)).toHaveLength(0); + }); +}); + +describe('disclosure mode — preflight wording', () => { + it('distinguishes not-loaded from loaded-but-disconnected', async () => { + const ctx = await disclosureAgent(); + await ctx.rpc.setPermission({ mode: 'yolo' }); + + // Call without selecting first → guidance to select. + ctx.mockNextResponse({ type: 'text', text: 'call' }, mcpCall('call-1', 'errors')); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'query directly'); + expect(toolResultTexts(ctx).join('\n')).toContain( + `Tool "${GRAFANA_TOOL}" is available but not loaded.`, + ); + + // Load it, then disconnect the server → disconnected wording, not "not found". 
+ ctx.mockNextResponse({ type: 'text', text: 'load' }, selectCall('call-2', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load it'); + ctx.agent.tools.unregisterMcpServer('grafana'); + + ctx.mockNextResponse({ type: 'text', text: 'call again' }, mcpCall('call-3', 'errors')); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'query again'); + expect(toolResultTexts(ctx).join('\n')).toContain( + `Tool "${GRAFANA_TOOL}" was loaded but its MCP server is currently disconnected.`, + ); + }); +}); + +describe('disclosure mode — undo semantics', () => { + it('keeps schema messages across undo, drops announcements, and self-heals', async () => { + const ctx = await disclosureAgent(); + + ctx.mockNextResponse({ type: 'text', text: 'load' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load it'); + expect(schemaMessages(ctx)).toHaveLength(1); + expect(ctx.agent.context.history.filter(isLoadableToolsAnnouncement)).toHaveLength(1); + + await ctx.rpc.undoHistory({ count: 1 }); + + // Schema injection survives (injection origin); the announcement and the + // select exchange are gone. + expect(schemaMessages(ctx)).toHaveLength(1); + expect(ctx.agent.context.history.filter(isLoadableToolsAnnouncement)).toHaveLength(0); + expect(ctx.agent.tools.loadedDynamicToolNames().has(GRAFANA_TOOL)).toBe(true); + + // Next turn re-announces (diff against the rolled-back fold) and a + // re-select reports Already available instead of re-injecting. 
+ ctx.mockNextResponse({ type: 'text', text: 'again' }, selectCall('call-2', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load again'); + expect(ctx.agent.context.history.filter(isLoadableToolsAnnouncement)).toHaveLength(1); + expect(toolResultTexts(ctx)).toContainEqual(`Already available: ${GRAFANA_TOOL}`); + expect(schemaMessages(ctx)).toHaveLength(1); + }); +}); + +describe('disclosure mode — model switch projection', () => { + it('strips dynamic-tool context for a non-supporting model and restores it on switch-back', async () => { + const ctx = await disclosureAgent(); + + ctx.mockNextResponse({ type: 'text', text: 'load' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load it'); + + // Canonical history holds the protocol context. + expect(schemaMessages(ctx)).toHaveLength(1); + + // Switch to a model without select_tools: the outgoing view drops the + // schema message and the announcements; the tool table inlines MCP again. + ctx.configureRuntimeModel(INLINE_PROVIDER, INLINE_CAPABILITIES); + expect(ctx.agent.toolSelectEnabled).toBe(false); + const projected = ctx.agent.context.messages; + expect(projected.some((m) => m.tools !== undefined)).toBe(false); + expect( + projected.some((m) => + m.content.some((p) => p.type === 'text' && p.text.includes('')), + ), + ).toBe(false); + const inlineNames = ctx.agent.tools.loopTools.map((t) => t.name); + expect(inlineNames).toContain(GRAFANA_TOOL); + expect(inlineNames).not.toContain('select_tools'); + expect(ctx.agent.tools.loopTools.every((t) => t.deferred !== true)).toBe(true); + + // Switch back: history was never rewritten, the ledger re-scan picks the + // loaded tool back up as a deferred extra and projection restores. 
+ ctx.configureRuntimeModel(DISCLOSURE_PROVIDER, DISCLOSURE_CAPABILITIES); + expect(ctx.agent.toolSelectEnabled).toBe(true); + expect(ctx.agent.context.messages.some((m) => m.tools !== undefined)).toBe(true); + const backNames = ctx.agent.tools.loopTools.map((t) => t.name); + expect(backNames).toContain('select_tools'); + expect(backNames).toContain(GRAFANA_TOOL); + const extra = ctx.agent.tools.loopTools.find((t) => t.name === GRAFANA_TOOL); + expect(extra?.deferred).toBe(true); + }); +}); + +describe('disclosure mode — executable table freshness', () => { + it('reflects goal-state tool visibility without waiting for a new turn snapshot', async () => { + // The loop re-reads loopTools per step (buildTools); the same mechanism + // that makes a selected tool dispatchable mid-turn also makes goal-gated + // mutation tools appear as soon as a goal exists. + const ctx = await disclosureAgent(); + ctx.configure({ + tools: ['Read', 'UpdateGoal', 'SetGoalBudget', 'mcp__*'], + provider: DISCLOSURE_PROVIDER, + modelCapabilities: DISCLOSURE_CAPABILITIES, + }); + expect(ctx.agent.tools.loopTools.map((t) => t.name)).not.toContain('UpdateGoal'); + await ctx.agent.goal.createGoal({ objective: 'ship the feature' }); + expect(ctx.agent.tools.loopTools.map((t) => t.name)).toContain('UpdateGoal'); + }); + + it('rebuilds the ledger from a replayed history with no in-memory state (resume path)', () => { + // Resume replays records into the context history; the ledger must come + // back from the history scan alone — there is no persisted ledger state. 
+ const schemaMessage = { + role: 'system', + content: [], + toolCalls: [], + tools: [{ name: GRAFANA_TOOL, description: 'replayed', parameters: {} }], + origin: { kind: 'injection', variant: 'dynamic_tool_schema' }, + } as const; + const agent = { + toolSelectEnabled: true, + context: { history: [schemaMessage] }, + config: { hasProvider: false }, + goal: { getGoal: () => ({ goal: null }) }, + } as unknown as Agent; + const manager = new ToolManager(agent); + expect(manager.loadedDynamicToolNames().has(GRAFANA_TOOL)).toBe(true); + }); +}); + +describe('disclosure mode — compaction', () => { + it('filters protocol context from the summarizer input and rebuilds schemas after compaction', async () => { + const ctx = await disclosureAgent(); + + ctx.mockNextResponse({ type: 'text', text: 'load' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'ok' }); + await runTurn(ctx, 'load it'); + + const compacted = new Promise<{ tokensAfter: number }>((resolve) => { + ctx.emitter.once('context.apply_compaction', (entry: { args: { tokensAfter: number } }) => { + resolve({ tokensAfter: entry.args.tokensAfter }); + }); + }); + const completed = ctx.once('compaction.completed'); + ctx.mockNextResponse({ type: 'text', text: 'Compacted summary.' }); + await ctx.rpc.beginCompaction({}); + const { tokensAfter } = await compacted; + await completed; + + // Summarizer input: no schema messages, no announcements. + const summarizerCall = ctx.llmCalls.at(-1)!; + expect(summarizerCall.history.some((m) => m.tools !== undefined)).toBe(false); + expect(JSON.stringify(summarizerCall.history)).not.toContain(''); + + // Post-compaction context: one rebuild message with the registry schema, + // plus a fresh full announcement — no re-select needed. 
+ const rebuilt = schemaMessages(ctx); + expect(rebuilt).toHaveLength(1); + expect(rebuilt[0]!.tools!.map((t) => t.name)).toEqual([GRAFANA_TOOL]); + expect(rebuilt[0]!.origin).toEqual({ kind: 'injection', variant: 'dynamic_tool_schema' }); + expect(ctx.agent.context.history.filter(isLoadableToolsAnnouncement)).toHaveLength(1); + expect(ctx.agent.tools.loadedDynamicToolNames().has(GRAFANA_TOOL)).toBe(true); + expect(ctx.agent.tools.loopTools.map((t) => t.name)).toContain(GRAFANA_TOOL); + + // The "nothing new since compaction" guard must be baselined on the + // counter that includes the rebuild message — result.tokensAfter predates + // it, and a lower baseline would let auto-compaction re-trigger against a + // floor that cannot shrink. + const internals = ctx.agent.fullCompaction as unknown as { + lastCompactedTokenCount: number | null; + }; + expect(internals.lastCompactedTokenCount).toBe( + tokensAfter + estimateTokensForMessage(rebuilt[0]!), + ); + + // The baseline lives strictly within one turn: runOneTurn re-arms it at + // every turn boundary, which is what makes cross-turn staleness (undo, + // model switches, /clear while idle) structurally impossible. If this + // reset ever moves, the guard's staleness analysis must be redone. + ctx.mockNextResponse({ type: 'text', text: 'next turn' }); + await runTurn(ctx, 'anything new'); + expect(internals.lastCompactedTokenCount).toBeNull(); + }); + + it('survives a runtime tool-select flag flip without a builtin refresh', async () => { + // Config reload calls FlagResolver.setConfigOverrides on the live + // resolver; initializeBuiltinTools does NOT re-run. select_tools must + // still be fully usable the moment the gate opens (it is registered + // unconditionally; only its exposure is gated), and flipping back off + // must restore the inline shape. 
+ const callLog: Array<[string, unknown]> = []; + const resolver = toolSelectFlagOff(); + const ctx = testAgent({ experimentalFlags: resolver }); + ctx.configure({ + tools: ['Read', 'mcp__*'], + provider: DISCLOSURE_PROVIDER, + modelCapabilities: DISCLOSURE_CAPABILITIES, + }); + await registerGrafana(ctx, callLog); + await ctx.rpc.setPermission({ mode: 'yolo' }); + + // Flag off: inline. + ctx.mockNextResponse({ type: 'text', text: 'inline' }); + await runTurn(ctx, 'first'); + const inlineCall = ctx.llmCalls.at(-1)!; + expect(inlineCall.tools.map((t) => t.name)).toContain(GRAFANA_TOOL); + expect(inlineCall.tools.map((t) => t.name)).not.toContain('select_tools'); + + // Flip on at runtime: the full select → dispatch chain must work. + resolver.setConfigOverrides({ 'tool-select': true }); + ctx.mockNextResponse({ type: 'text', text: 'loading' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'querying' }, mcpCall('call-2', 'errors')); + ctx.mockNextResponse({ type: 'text', text: 'done' }); + await runTurn(ctx, 'now use the tool'); + const disclosureCall = ctx.llmCalls.at(-3)!; + expect(disclosureCall.tools.map((t) => t.name)).toContain('select_tools'); + expect(disclosureCall.tools.map((t) => t.name).some((n) => n.startsWith('mcp__'))).toBe(false); + expect(callLog).toEqual([['query_range', { query: 'errors' }]]); + + // Flip back off: inline again, select_tools gone from the wire. 
+ resolver.setConfigOverrides({}); + ctx.mockNextResponse({ type: 'text', text: 'inline again' }); + await runTurn(ctx, 'back'); + const backCall = ctx.llmCalls.at(-1)!; + expect(backCall.tools.map((t) => t.name)).toContain(GRAFANA_TOOL); + expect(backCall.tools.map((t) => t.name)).not.toContain('select_tools'); + }); + + it('trims the schema rebuild instead of re-entering the compaction trigger band', async () => { + // A trigger far below one fat schema: without the rebuild budget guard the + // post-compaction floor (users + summary + schema) would sit permanently + // above the trigger, and every later step would re-compact and rebuild in + // a loop (with the default Infinity per-turn cap, forever). + const trigger = 2_000; + const ctx = testAgent({ + experimentalFlags: toolSelectFlagOn(), + compactionStrategy: { + shouldCompact: (used: number) => used >= trigger, + shouldBlock: (used: number) => used >= trigger, + checkAfterStep: false, + maxCompactionPerTurn: 3, + maxOverflowCompactionAttempts: 3, + }, + }); + ctx.configure({ + tools: ['Read', 'mcp__*'], + provider: DISCLOSURE_PROVIDER, + modelCapabilities: DISCLOSURE_CAPABILITIES, + }); + const fatClient: MCPClient = { + async listTools() { + return [ + { + name: 'query_range', + // ~3k estimated tokens — alone far past the 2k trigger budget. + description: 'x'.repeat(12_000), + inputSchema: { type: 'object', properties: {} }, + }, + ]; + }, + async callTool() { + return { content: [{ type: 'text', text: 'ok' }], isError: false }; + }, + }; + ctx.agent.tools.registerMcpServer( + 'grafana', + fatClient, + (await fatClient.listTools()).map((d) => ({ + name: d.name, + description: d.description, + parameters: d.inputSchema as Record<string, unknown>, + })), + ); + await ctx.rpc.setPermission({ mode: 'yolo' }); + + // Step 1 loads the fat schema; step 2's boundary trips the trigger and + // blocks on auto-compaction (consuming the summary mock), which trims the + // rebuild.
Step 2 then calls the MCP tool directly — the executable table + // is resolved AFTER the compaction (same state as the messages), so the + // now-unloaded tool must be rejected by preflight, not dispatched. + const fatCallLog: unknown[] = []; + (fatClient as { callTool: unknown }).callTool = async (...args: unknown[]) => { + fatCallLog.push(args); + return { content: [{ type: 'text', text: 'ok' }], isError: false }; + }; + ctx.mockNextResponse({ type: 'text', text: 'loading' }, selectCall('call-1', [GRAFANA_TOOL])); + ctx.mockNextResponse({ type: 'text', text: 'Compacted summary.' }); + ctx.mockNextResponse({ type: 'text', text: 'querying' }, mcpCall('call-2', 'errors')); + ctx.mockNextResponse({ type: 'text', text: 'done' }); + await runTurn(ctx, 'load the fat tool'); + + // The rebuild was trimmed away: no schema message survives, the ledger is + // empty again, and the tool is simply re-selectable on demand. + expect(schemaMessages(ctx)).toHaveLength(0); + expect(ctx.agent.tools.loadedDynamicToolNames().has(GRAFANA_TOOL)).toBe(false); + + // The direct call after the trim was rejected with select guidance and + // never reached the MCP client. + expect(fatCallLog).toHaveLength(0); + expect(toolResultTexts(ctx).join('\n')).toContain( + `Tool "${GRAFANA_TOOL}" is available but not loaded.`, + ); + + // Regression: the next turn must not re-compact. (No summary mock is + // queued — an unexpected compaction would fail the scripted generate.) 
+ const started: unknown[] = []; + ctx.emitter.on('compaction.started', (event) => started.push(event)); + ctx.mockNextResponse({ type: 'text', text: 'quiet turn' }); + await runTurn(ctx, 'still fine?'); + expect(started).toHaveLength(0); + }); +}); diff --git a/packages/agent-core/test/utils/tokens.test.ts b/packages/agent-core/test/utils/tokens.test.ts index 4b22d78e8..ce27e2392 100644 --- a/packages/agent-core/test/utils/tokens.test.ts +++ b/packages/agent-core/test/utils/tokens.test.ts @@ -2,8 +2,10 @@ import type { ContentPart } from '@moonshot-ai/kosong'; import { describe, expect, it } from 'vitest'; import { + estimateTokens, estimateTokensForContentPart, estimateTokensForMessage, + estimateTokensForTools, MEDIA_TOKEN_ESTIMATE, } from '../../src/utils/tokens'; @@ -59,3 +61,41 @@ describe('estimateTokensForContentPart — media parts', () => { expect(estimateTokensForMessage(message)).toBeGreaterThan(100); }); }); + +// Dynamic tool schema messages (select_tools progressive disclosure) carry +// full tool definitions in `message.tools`. If the estimator ignores them, +// injected schemas are invisible to every compaction budget and the context +// overflows before compaction triggers. 
+describe('estimateTokensForMessage — message.tools', () => { + const tool = { + name: 'mcp__grafana__query_range', + description: 'Query a Prometheus-compatible range endpoint.', + parameters: { + type: 'object', + properties: { query: { type: 'string' }, minutes: { type: 'number' } }, + required: ['query'], + }, + }; + + it('counts injected tool schemas', () => { + const bare = { role: 'system', content: [] } as const; + const withTools = { role: 'system', content: [], tools: [tool] } as const; + expect(estimateTokensForMessage(withTools)).toBe( + estimateTokensForMessage(bare) + estimateTokensForTools([tool]), + ); + }); + + it('leaves messages without tools byte-identical to the old estimate', () => { + const message = { + role: 'user', + content: [{ type: 'text', text: 'hello world' }] satisfies ContentPart[], + toolCalls: [{ name: 'Read', arguments: { file: 'a.ts' } }], + }; + const expected = + estimateTokens('user') + + estimateTokens('hello world') + + estimateTokens('Read') + + estimateTokens(JSON.stringify({ file: 'a.ts' })); + expect(estimateTokensForMessage(message)).toBe(expected); + }); +}); diff --git a/packages/kosong/src/capability.ts b/packages/kosong/src/capability.ts index 08eb7b3eb..27d004860 100644 --- a/packages/kosong/src/capability.ts +++ b/packages/kosong/src/capability.ts @@ -15,6 +15,13 @@ export interface ModelCapability { readonly thinking: boolean; readonly tool_use: boolean; readonly max_context_tokens: number; + /** + * Model accepts message-level tool declarations (`messages[].tools`), the + * primitive behind select_tools progressive disclosure. Absent means + * unsupported: only models explicitly catalogued or declared with this + * capability may ever receive a message carrying `tools`. 
+ */ + readonly select_tools?: boolean; } const UNKNOWN_CAPABILITY_MARKER = Symbol.for('moonshot-ai.kosong.UNKNOWN_CAPABILITY'); @@ -33,6 +40,7 @@ export const UNKNOWN_CAPABILITY: ModelCapability = Object.freeze( thinking: false, tool_use: false, max_context_tokens: 0, + select_tools: false, }, UNKNOWN_CAPABILITY_MARKER, { value: true }, @@ -50,6 +58,7 @@ export function isUnknownCapability(capability: ModelCapability): boolean { !capability.audio_in && !capability.thinking && !capability.tool_use && + capability.select_tools !== true && capability.max_context_tokens === 0 ); } diff --git a/packages/kosong/src/catalog.ts b/packages/kosong/src/catalog.ts index 40975430c..ff1fb9437 100644 --- a/packages/kosong/src/catalog.ts +++ b/packages/kosong/src/catalog.ts @@ -13,6 +13,8 @@ export interface CatalogModelEntry { readonly limit?: { readonly context?: number; readonly output?: number }; readonly tool_call?: boolean; readonly reasoning?: boolean; + /** Accepts message-level tool declarations (`messages[].tools`). Defaults to false. */ + readonly select_tools?: boolean; readonly interleaved?: boolean | { readonly field?: string }; readonly modalities?: { readonly input?: readonly string[]; @@ -136,6 +138,7 @@ export function catalogModelToCapability(model: CatalogModelEntry): CatalogModel thinking: Boolean(model.reasoning), tool_use: model.tool_call ?? true, max_context_tokens: context, + select_tools: model.select_tools === true, }, }; } diff --git a/packages/kosong/src/generate.ts b/packages/kosong/src/generate.ts index ad475ed52..5d6a90608 100644 --- a/packages/kosong/src/generate.ts +++ b/packages/kosong/src/generate.ts @@ -103,8 +103,16 @@ export async function generate( throwAbortError(); } + // Deferred tools are executable client-side but must not appear in the + // request's top-level `tools[]` (their schemas travel via message-level + // `tools` declarations; the top-level list stays byte-stable for prompt + // caching). 
This is the single strip point for every provider call. + const wireTools = tools.some((tool) => tool.deferred === true) + ? tools.filter((tool) => tool.deferred !== true) + : tools; + options?.onRequestStart?.(); - const stream = await provider.generate(systemPrompt, tools, history, options); + const stream = await provider.generate(systemPrompt, wireTools, history, options); // Post-await abort check: `provider.generate()` may have resolved before // noticing a mid-flight abort. Reject immediately rather than draining diff --git a/packages/kosong/src/message.ts b/packages/kosong/src/message.ts index 611cfb8c0..12db25c84 100644 --- a/packages/kosong/src/message.ts +++ b/packages/kosong/src/message.ts @@ -1,3 +1,5 @@ +import type { Tool } from './tool'; + export type Role = 'system' | 'user' | 'assistant' | 'tool'; export interface TextPart { @@ -100,6 +102,16 @@ export interface Message { readonly toolCallId?: string; /** When `true`, indicates the message was not fully received (e.g. stream interrupted). */ readonly partial?: boolean; + /** + * Full tool definitions carried by this message. Meaningful only on + * `role: 'system'` messages: it is the append-only primitive for loading a + * tool mid-conversation without touching the request's top-level `tools[]` + * (which must stay byte-stable to preserve the provider's prompt cache). + * Providers that support message-level tool declarations (Kimi + * `messages[].tools`) serialize it; callers must not send such a message to + * a provider without that capability. + */ + readonly tools?: readonly Tool[] | undefined; } /** Check if a streamed part is a ContentPart (text, think, image_url, audio_url, video_url). */ @@ -110,6 +122,24 @@ export function isContentPart(part: StreamedMessagePart): part is ContentPart { ); } +/** + * True for a message whose only payload is `tools` — the dynamic tool-loading + * primitive (see {@link Message.tools}). 
Message-level tool declarations are a + * Kimi wire feature; every other provider must skip such a message entirely: + * their explicit field construction already keeps the `tools` field off the + * wire, but the leftover empty message would be rejected (OpenAI: system + * message without content) or serialized as a garbage `` + * turn (Anthropic/Google system-to-user wrapping). + */ +export function isToolDeclarationOnlyMessage(message: Message): boolean { + return ( + message.tools !== undefined && + message.tools.length > 0 && + message.content.length === 0 && + message.toolCalls.length === 0 + ); +} + /** Check if a streamed part is a ToolCall. */ export function isToolCall(part: StreamedMessagePart): part is ToolCall { return part.type === 'function'; diff --git a/packages/kosong/src/providers/anthropic.ts b/packages/kosong/src/providers/anthropic.ts index a031b75ee..74c99702e 100644 --- a/packages/kosong/src/providers/anthropic.ts +++ b/packages/kosong/src/providers/anthropic.ts @@ -6,6 +6,7 @@ import { normalizeAPIStatusError, } from '#/errors'; import type { ContentPart, Message, StreamedMessagePart, ToolCall } from '#/message'; +import { isToolDeclarationOnlyMessage } from '#/message'; import type { ChatProvider, FinishReason, @@ -1031,7 +1032,13 @@ export class AnthropicChatProvider implements ChatProvider { // mergeConsecutiveUserMessages) so this provider and Gemini/Vertex stay in // step. const messages = mergeConsecutiveUserMessages( - normalizeToolCallIdsForProvider(history, ANTHROPIC_TOOL_CALL_ID_POLICY).map((msg) => + normalizeToolCallIdsForProvider( + // Message-level tool declarations are a Kimi wire feature; here the + // whole message is skipped (an empty leftover would serialize as a + // garbage `` user turn). See isToolDeclarationOnlyMessage. 
+ history.filter((msg) => !isToolDeclarationOnlyMessage(msg)), + ANTHROPIC_TOOL_CALL_ID_POLICY, + ).map((msg) => convertMessage(msg, this._model), ), { diff --git a/packages/kosong/src/providers/google-genai.ts b/packages/kosong/src/providers/google-genai.ts index 8c28cf215..1adec11d2 100644 --- a/packages/kosong/src/providers/google-genai.ts +++ b/packages/kosong/src/providers/google-genai.ts @@ -5,6 +5,7 @@ import { normalizeAPIStatusError, } from '#/errors'; import type { Message, StreamedMessagePart, ToolCall } from '#/message'; +import { isToolDeclarationOnlyMessage } from '#/message'; import type { ChatProvider, FinishReason, @@ -354,6 +355,15 @@ export function messagesToGoogleGenAIContents(messages: Message[]): GoogleConten const message = messages[i]; if (message === undefined) break; + // Message-level tool declarations are a Kimi wire feature. The system + // branch below would already drop the empty leftover via its text-length + // check, but skip explicitly so the behavior does not hinge on that + // coincidence (and covers a non-system carrier defensively). + if (isToolDeclarationOnlyMessage(message)) { + i += 1; + continue; + } + if (message.role === 'system') { // Google GenAI's `Content.role` only accepts "user" or "model", so a // system message in the history (e.g. from session restore or diff --git a/packages/kosong/src/providers/kimi.ts b/packages/kosong/src/providers/kimi.ts index e3a5a05e9..67d1d9616 100644 --- a/packages/kosong/src/providers/kimi.ts +++ b/packages/kosong/src/providers/kimi.ts @@ -96,6 +96,8 @@ interface OpenAIMessage { tool_call_id?: string | undefined; name?: string | undefined; reasoning_content?: string | undefined; + /** Message-level tool declarations (`messages[].tools`), see convertMessage. 
*/ + tools?: OpenAIToolParam[] | undefined; } interface OpenAIToolCallOut { @@ -169,6 +171,16 @@ function convertMessage(message: Message): OpenAIMessage { result.reasoning_content = reasoningContent; } + // Message-level tool declarations: a system message carrying `tools` loads + // those definitions mid-conversation (`messages[].tools` in the Kimi + // contract; each entry is a full OpenAI-compatible tool param). Reusing + // convertTool keeps schema normalization and the `$` builtin_function + // branch identical to the top-level `tools[]` path. Such a message carries + // no `content` — the empty-content branch above already omits the field. + if (message.tools !== undefined && message.tools.length > 0) { + result.tools = message.tools.map((tool) => convertTool(tool)); + } + return result; } function convertTool(tool: Tool): OpenAIToolParam { diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index 50187fd7a..ee7376d5b 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -1,4 +1,5 @@ import type { ContentPart, Message, StreamedMessagePart, ToolCall } from '#/message'; +import { isToolDeclarationOnlyMessage } from '#/message'; import type { ChatProvider, FinishReason, @@ -286,6 +287,10 @@ function convertHistoryMessages( const pendingToolResultMedia: OpenAIContentPart[] = []; for (const msg of history) { + // Message-level tool declarations are a Kimi wire feature; skipped here + // because the leftover `{role:"system"}` without content is rejected by + // the Chat Completions API. See isToolDeclarationOnlyMessage. 
+ if (isToolDeclarationOnlyMessage(msg)) continue; if (msg.role !== 'tool') { appendToolResultMediaMessage(messages, pendingToolResultMedia); } diff --git a/packages/kosong/src/providers/openai-responses.ts b/packages/kosong/src/providers/openai-responses.ts index 48544b81b..307a51e65 100644 --- a/packages/kosong/src/providers/openai-responses.ts +++ b/packages/kosong/src/providers/openai-responses.ts @@ -5,7 +5,7 @@ import { isContextOverflowErrorCode, } from '#/errors'; import type { ContentPart, Message, StreamedMessagePart, ToolCall } from '#/message'; -import { extractText } from '#/message'; +import { extractText, isToolDeclarationOnlyMessage } from '#/message'; import type { ChatProvider, FinishReason, @@ -614,6 +614,10 @@ function convertHistoryMessages( }; for (const msg of history) { + // Message-level tool declarations are a Kimi wire feature; skipped here + // because the leftover content-free message item is rejected by the + // Responses API. See isToolDeclarationOnlyMessage. + if (isToolDeclarationOnlyMessage(msg)) continue; if (msg.role !== 'tool') { flushPendingMedia(); } diff --git a/packages/kosong/src/tool.ts b/packages/kosong/src/tool.ts index 470b1f937..8ebc77897 100644 --- a/packages/kosong/src/tool.ts +++ b/packages/kosong/src/tool.ts @@ -12,4 +12,13 @@ export interface Tool { description: string; /** JSON Schema describing the tool's parameters. */ parameters: Record<string, unknown>; + /** + * Client-internal marker: the tool is executable but its schema must not be + * serialized into the request's top-level `tools[]` — it was (or will be) + * delivered through a message-level `tools` declaration instead, and the + * top-level list must stay byte-stable for prompt caching. `generate()` + * strips marked tools before the provider builds the request; the marker + * itself never reaches the wire.
*/
+  deferred?: true;
 }
diff --git a/packages/kosong/test/catalog.test.ts b/packages/kosong/test/catalog.test.ts
index 5780e5502..5afe77823 100644
--- a/packages/kosong/test/catalog.test.ts
+++ b/packages/kosong/test/catalog.test.ts
@@ -82,6 +82,7 @@ describe('catalogModelToCapability', () => {
         thinking: true,
         tool_use: true,
         max_context_tokens: 200000,
+        select_tools: false,
       },
     });
   });
diff --git a/packages/kosong/test/select-tools.test.ts b/packages/kosong/test/select-tools.test.ts
new file mode 100644
index 000000000..1ce7ff3af
--- /dev/null
+++ b/packages/kosong/test/select-tools.test.ts
@@ -0,0 +1,353 @@
+/**
+ * select_tools progressive disclosure — kosong-side contract tests.
+ *
+ * Covers the three primitives this package contributes:
+ *   - `Message.tools` serialization on the Kimi wire (`messages[].tools`,
+ *     `{type:'function', function:{...}}` wrapping, no `content`, schema
+ *     normalization and the `$` builtin branch shared with top-level tools);
+ *   - `Tool.deferred` stripping in `generate()` (single strip point for every
+ *     provider call — the marker itself must never reach the wire);
+ *   - the `select_tools` capability bit (unknown/default-off semantics).
+ */
+
+import { UNKNOWN_CAPABILITY, isUnknownCapability } from '#/capability';
+import { catalogModelToCapability } from '#/catalog';
+import { generate } from '#/generate';
+import { isToolDeclarationOnlyMessage } from '#/message';
+import type { Message, StreamedMessagePart } from '#/message';
+import { AnthropicChatProvider } from '#/providers/anthropic';
+import { messagesToGoogleGenAIContents } from '#/providers/google-genai';
+import { KimiChatProvider } from '#/providers/kimi';
+import { OpenAILegacyChatProvider } from '#/providers/openai-legacy';
+import { OpenAIResponsesChatProvider } from '#/providers/openai-responses';
+import type { ChatProvider, StreamedMessage, ThinkingEffort } from '#/provider';
+import type { Tool } from '#/tool';
+import { describe, expect, it, vi } from 'vitest';
+
+const ADD_TOOL: Tool = {
+  name: 'add',
+  description: 'Add two integers.',
+  parameters: {
+    type: 'object',
+    properties: {
+      a: { type: 'integer', description: 'First number' },
+      b: { type: 'integer', description: 'Second number' },
+    },
+    required: ['a', 'b'],
+  },
+};
+
+const BUILTIN_TOOL: Tool = {
+  name: '$web_search',
+  description: 'Search the web',
+  parameters: { type: 'object', properties: {} },
+};
+
+function makeChatCompletionResponse() {
+  return {
+    id: 'chatcmpl-test123',
+    object: 'chat.completion',
+    created: 1234567890,
+    model: 'kimi-test',
+    choices: [
+      {
+        index: 0,
+        message: { role: 'assistant', content: 'Hello' },
+        finish_reason: 'stop',
+      },
+    ],
+    usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
+  };
+}
+
+async function captureRequestBody(
+  tools: Tool[],
+  history: Message[],
+): Promise<Record<string, unknown>> {
+  const provider = new KimiChatProvider({
+    model: 'kimi-test',
+    apiKey: 'test-key',
+    stream: false,
+  });
+  let capturedBody: Record<string, unknown> | undefined;
+  (provider as any)._client.chat.completions.create = vi
+    .fn()
+    .mockImplementation((params: unknown) => {
+      capturedBody = params as Record<string, unknown>;
+      return Promise.resolve(makeChatCompletionResponse());
+    });
+  const stream = await provider.generate('system prompt', tools, history);
+  for await (const part of stream) {
+    void part;
+  }
+  if (capturedBody === undefined) {
+    throw new Error('Expected provider.generate() to call chat.completions.create');
+  }
+  return capturedBody;
+}
+
+describe('Kimi messages[].tools serialization', () => {
+  it('serializes a system message carrying tools with function wrapping and no content', async () => {
+    const history: Message[] = [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+      { role: 'system', content: [], toolCalls: [], tools: [ADD_TOOL] },
+    ];
+    const body = await captureRequestBody([], history);
+    const messages = body['messages'] as Array<Record<string, unknown>>;
+    // [system prompt, user, system+tools]
+    expect(messages).toHaveLength(3);
+    const toolsMessage = messages[2]!;
+    expect(toolsMessage['role']).toBe('system');
+    expect('content' in toolsMessage).toBe(false);
+    expect(toolsMessage['tools']).toEqual([
+      {
+        type: 'function',
+        function: {
+          name: 'add',
+          description: 'Add two integers.',
+          parameters: ADD_TOOL.parameters,
+        },
+      },
+    ]);
+  });
+
+  it('routes $-prefixed names through the builtin_function branch, same as top-level tools', async () => {
+    const history: Message[] = [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+      { role: 'system', content: [], toolCalls: [], tools: [BUILTIN_TOOL] },
+    ];
+    const body = await captureRequestBody([], history);
+    const messages = body['messages'] as Array<Record<string, unknown>>;
+    expect(messages[2]!['tools']).toEqual([
+      { type: 'builtin_function', function: { name: '$web_search' } },
+    ]);
+  });
+
+  it('leaves messages without tools untouched (no tools key)', async () => {
+    const history: Message[] = [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+    ];
+    const body = await captureRequestBody([ADD_TOOL], history);
+    const messages = body['messages'] as Array<Record<string, unknown>>;
+    for (const message of messages) {
+      expect('tools' in message).toBe(false);
+    }
+    // Top-level tools[] unchanged by the feature.
+    expect(body['tools']).toEqual([
+      {
+        type: 'function',
+        function: {
+          name: 'add',
+          description: 'Add two integers.',
+          parameters: ADD_TOOL.parameters,
+        },
+      },
+    ]);
+  });
+
+  it('does not serialize the deferred marker even if a marked tool reaches convertMessage', async () => {
+    const history: Message[] = [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+      {
+        role: 'system',
+        content: [],
+        toolCalls: [],
+        tools: [{ ...ADD_TOOL, deferred: true }],
+      },
+    ];
+    const body = await captureRequestBody([], history);
+    const messages = body['messages'] as Array<Record<string, unknown>>;
+    const serialized = JSON.stringify(messages[2]!['tools']);
+    expect(serialized).not.toContain('deferred');
+  });
+});
+
+describe('generate() deferred tool stripping', () => {
+  function createCapturingProvider(): { provider: ChatProvider; seenTools: () => Tool[] } {
+    let captured: Tool[] = [];
+    const stream: StreamedMessage = {
+      id: null,
+      usage: null,
+      finishReason: 'completed',
+      rawFinishReason: 'stop',
+      async *[Symbol.asyncIterator](): AsyncIterator<StreamedMessagePart> {
+        yield { type: 'text', text: 'ok' };
+      },
+    };
+    const provider: ChatProvider = {
+      name: 'mock',
+      modelName: 'mock-model',
+      thinkingEffort: null as ThinkingEffort | null,
+      generate: async (_systemPrompt, tools, _history) => {
+        captured = tools;
+        return stream;
+      },
+      withThinking(_effort: ThinkingEffort): ChatProvider {
+        return this;
+      },
+    };
+    return { provider, seenTools: () => captured };
+  }
+
+  it('strips deferred tools before the provider builds the request', async () => {
+    const { provider, seenTools } = createCapturingProvider();
+    await generate(provider, 'sys', [ADD_TOOL, { ...BUILTIN_TOOL, deferred: true }], [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+    ]);
+    expect(seenTools()).toEqual([ADD_TOOL]);
+  });
+
+  it('passes the identical array through when nothing is deferred', async () => {
+    const { provider, seenTools } = createCapturingProvider();
+    const tools = [ADD_TOOL, BUILTIN_TOOL];
+    await generate(provider, 'sys', tools, [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+    ]);
+    expect(seenTools()).toBe(tools);
+  });
+});
+
+describe('providers without message-level tool declarations', () => {
+  const TOOLS_ONLY_MESSAGE: Message = {
+    role: 'system',
+    content: [],
+    toolCalls: [],
+    tools: [ADD_TOOL],
+  };
+  const HISTORY: Message[] = [
+    { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+    TOOLS_ONLY_MESSAGE,
+  ];
+
+  it('classifies tool-declaration-only messages', () => {
+    expect(isToolDeclarationOnlyMessage(TOOLS_ONLY_MESSAGE)).toBe(true);
+    expect(isToolDeclarationOnlyMessage(HISTORY[0]!)).toBe(false);
+    // A message that also carries content is NOT skipped wholesale (only the
+    // tools field stays off the wire via explicit field construction).
+    expect(
+      isToolDeclarationOnlyMessage({
+        ...TOOLS_ONLY_MESSAGE,
+        content: [{ type: 'text', text: 'x' }],
+      }),
+    ).toBe(false);
+  });
+
+  it('anthropic skips the message instead of emitting a husk', async () => {
+    const provider = new AnthropicChatProvider({ model: 'k25', apiKey: 'test-key', stream: false });
+    let captured: Record<string, unknown> | undefined;
+    (provider as any)._client.messages.create = vi.fn().mockImplementation((params: unknown) => {
+      captured = params as Record<string, unknown>;
+      return Promise.resolve({
+        id: 'msg_test_123',
+        type: 'message',
+        role: 'assistant',
+        model: 'k25',
+        content: [{ type: 'text', text: 'Hello' }],
+        stop_reason: 'end_turn',
+        usage: { input_tokens: 10, output_tokens: 5 },
+      });
+    });
+    const stream = await provider.generate('sys', [], HISTORY);
+    for await (const part of stream) void part;
+    expect(JSON.stringify(captured!['messages'])).not.toContain('<system>'); // NOTE(review): the tag literal was lost in extraction; '<system>' is assumed to be the provider's wrapper tag — confirm.
+    expect(captured!['messages'] as unknown[]).toHaveLength(1);
+  });
+
+  it('openai chat completions skips the message instead of sending a content-free system entry', async () => {
+    const provider = new OpenAILegacyChatProvider({ model: 'gpt-4.1', apiKey: 'test-key', stream: false });
+    let captured: Record<string, unknown> | undefined;
+    (provider as any)._client.chat.completions.create = vi
+      .fn()
+      .mockImplementation((params: unknown) => {
+        captured = params as Record<string, unknown>;
+        return Promise.resolve({
+          id: 'chatcmpl-test123',
+          object: 'chat.completion',
+          created: 1234567890,
+          model: 'gpt-4.1',
+          choices: [
+            {
+              index: 0,
+              message: { role: 'assistant', content: 'Hello' },
+              finish_reason: 'stop',
+            },
+          ],
+          usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
+        });
+      });
+    const stream = await provider.generate('sys', [], HISTORY);
+    for await (const part of stream) void part;
+    const messages = captured!['messages'] as Array<Record<string, unknown>>;
+    // [system prompt, user] — no content-free leftover entry.
+    expect(messages).toHaveLength(2);
+    for (const message of messages) {
+      expect(message['content']).toBeDefined();
+    }
+  });
+
+  it('openai responses skips the message', async () => {
+    const provider = new OpenAIResponsesChatProvider({ model: 'gpt-4.1', apiKey: 'test-key' });
+    (provider as any)._stream = false;
+    let captured: Record<string, unknown> | undefined;
+    ((provider as any)._client.responses as Record<string, unknown>)['create'] = vi
+      .fn()
+      .mockImplementation((params: unknown) => {
+        captured = params as Record<string, unknown>;
+        return Promise.resolve({
+          id: 'resp_test123',
+          object: 'response',
+          created_at: 1234567890,
+          status: 'completed',
+          model: 'gpt-4.1',
+          output: [
+            {
+              type: 'message',
+              id: 'msg_test',
+              role: 'assistant',
+              content: [{ type: 'output_text', text: 'Hello', annotations: [] }],
+            },
+          ],
+          usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
+        });
+      });
+    const stream = await provider.generate('sys', [], HISTORY);
+    for await (const part of stream) void part;
+    // The tools-only message contributes no input item at all.
+    expect(captured!['input'] as unknown[]).toHaveLength(1);
+    expect(JSON.stringify(captured!['input'])).not.toContain('"tools"');
+  });
+
+  it('google genai skips the message explicitly (not just via the empty-text coincidence)', () => {
+    const contents = messagesToGoogleGenAIContents(HISTORY);
+    expect(contents).toHaveLength(1);
+    expect(JSON.stringify(contents)).not.toContain('<system>'); // NOTE(review): the tag literal was lost in extraction; '<system>' is assumed — confirm against the converter.
+  });
+});
+
+describe('select_tools capability bit', () => {
+  it('defaults to false on UNKNOWN_CAPABILITY', () => {
+    expect(UNKNOWN_CAPABILITY.select_tools).toBe(false);
+  });
+
+  it('a capability that only has select_tools is not "unknown"', () => {
+    expect(
+      isUnknownCapability({
+        image_in: false,
+        video_in: false,
+        audio_in: false,
+        thinking: false,
+        tool_use: false,
+        max_context_tokens: 0,
+        select_tools: true,
+      }),
+    ).toBe(false);
+  });
+
+  it('catalog entries map select_tools and default it to false', () => {
+    const base = { id: 'm', limit: { context: 1000 } };
+    expect(catalogModelToCapability(base)?.capability.select_tools).toBe(false);
+    expect(
+      catalogModelToCapability({ ...base, select_tools: true })?.capability.select_tools,
+    ).toBe(true);
+  });
+});
diff --git a/packages/node-sdk/src/catalog.ts b/packages/node-sdk/src/catalog.ts
index 27eeb6885..f7a215e7c 100644
--- a/packages/node-sdk/src/catalog.ts
+++ b/packages/node-sdk/src/catalog.ts
@@ -47,6 +47,7 @@ function capabilityToStrings(capability: ModelCapability): string[] | undefined
   if (capability.audio_in) caps.push('audio_in');
   if (capability.thinking) caps.push('thinking');
   if (capability.tool_use) caps.push('tool_use');
+  if (capability.select_tools === true) caps.push('select_tools');
   return caps.length > 0 ?
caps : undefined; } diff --git a/packages/node-sdk/test/config.test.ts b/packages/node-sdk/test/config.test.ts index 5c51d7e10..060195054 100644 --- a/packages/node-sdk/test/config.test.ts +++ b/packages/node-sdk/test/config.test.ts @@ -332,10 +332,20 @@ describe('KimiHarness config API', () => { const homeDir = await makeTempDir(); const harness = createKimiHarness({ homeDir, identity: TEST_IDENTITY }); - // No experimental features are currently registered, so the harness exposes - // an empty list. const features = await harness.getExperimentalFeatures(); - expect(features).toEqual([]); + expect(features).toEqual([ + { + id: 'tool-select', + title: 'Tool select (progressive tool disclosure)', + description: + 'Keep MCP tool schemas out of the immutable top-level tools[]; the model loads them on demand via the select_tools tool. Only takes effect on models whose capability catalog declares select_tools.', + surface: 'core', + env: 'KIMI_CODE_EXPERIMENTAL_TOOL_SELECT', + defaultEnabled: false, + enabled: false, + source: 'default', + }, + ]); }); it('can create the default config scaffold without selecting a model', async () => {