From 94389a19a8a3f3d3ce1999c592ca6f0b54eb08dc Mon Sep 17 00:00:00 2001 From: m-aebrer Date: Tue, 19 May 2026 15:01:15 -0400 Subject: [PATCH 1/4] chore: open PR for issue 215 From a22f7d4f3e9e7bcbebbb5ab6152f6b797cb9e7ae Mon Sep 17 00:00:00 2001 From: m-aebrer Date: Tue, 19 May 2026 15:59:06 -0400 Subject: [PATCH 2/4] Fix subagent probe defaults --- packages/agent/test/e2e.test.ts | 4 +- packages/ai/test/context-overflow.test.ts | 4 +- packages/ai/test/empty.test.ts | 2 +- .../openai-responses-copilot-provider.test.ts | 62 ++++++- packages/ai/test/tokens.test.ts | 2 +- .../ai/test/tool-call-without-result.test.ts | 2 +- packages/ai/test/total-tokens.test.ts | 22 ++- packages/ai/test/unicode-surrogate.test.ts | 2 +- packages/coding-agent/README.md | 2 +- packages/coding-agent/src/core/sdk.ts | 5 +- packages/coding-agent/src/core/thinking.ts | 21 +++ .../coding-agent/src/core/tools/subagent.ts | 16 +- .../test/subagent-model-fallback.test.ts | 153 ++++++++++++++---- 13 files changed, 232 insertions(+), 65 deletions(-) create mode 100644 packages/coding-agent/src/core/thinking.ts diff --git a/packages/agent/test/e2e.test.ts b/packages/agent/test/e2e.test.ts index 8406f62c..288b2656 100644 --- a/packages/agent/test/e2e.test.ts +++ b/packages/agent/test/e2e.test.ts @@ -233,8 +233,8 @@ describe("Agent E2E Tests", () => { }); }); - describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider (grok-3)", () => { - const model = getModel("xai", "grok-3"); + describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider (grok-4.3)", () => { + const model = getModel("xai", "grok-4.3"); it("should handle basic text prompt", async () => { await basicPrompt(model); diff --git a/packages/ai/test/context-overflow.test.ts b/packages/ai/test/context-overflow.test.ts index 8bdbc740..a0481b88 100644 --- a/packages/ai/test/context-overflow.test.ts +++ b/packages/ai/test/context-overflow.test.ts @@ -314,8 +314,8 @@ describe("Context overflow error handling", () => { // ============================================================================= describe.skipIf(!process.env.XAI_API_KEY)("xAI", () => { - it("grok-3-fast - should detect overflow via isContextOverflow", async () => { - const model = getModel("xai", "grok-3-fast"); + it("grok-4.3 - should detect overflow via isContextOverflow", async () => { + const model = getModel("xai", "grok-4.3"); const result = await testContextOverflow(model, process.env.XAI_API_KEY!); logResult(result); diff --git a/packages/ai/test/empty.test.ts b/packages/ai/test/empty.test.ts index 99c4df84..f6a62384 100644 --- a/packages/ai/test/empty.test.ts +++ b/packages/ai/test/empty.test.ts @@ -249,7 +249,7 @@ describe("AI Providers Empty Message Tests", () => { }); describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider Empty Messages", () => { - const llm = getModel("xai", "grok-3"); + const llm = getModel("xai", "grok-4.3"); it("should handle empty content array", { retry: 3, timeout: 30000 }, async () => { await testEmptyMessage(llm); diff --git a/packages/ai/test/openai-responses-copilot-provider.test.ts b/packages/ai/test/openai-responses-copilot-provider.test.ts index 04b2856c..ef20f86c 100644 --- a/packages/ai/test/openai-responses-copilot-provider.test.ts +++ b/packages/ai/test/openai-responses-copilot-provider.test.ts @@ -1,6 +1,17 @@ import { afterEach, describe, expect, it, vi } from "vitest"; import { getModel } from "../src/models.js"; import { streamOpenAIResponses } from "../src/providers/openai-responses.js"; +import { streamSimple } from "../src/stream.js"; +import type { Model } from "../src/types.js"; + +function mockDoneStream() { + vi.spyOn(globalThis, "fetch").mockResolvedValue( + new Response("data: [DONE]\n\n", { + status: 200, + headers: { "content-type": "text/event-stream" }, + }), + ); +} describe("openai-responses github-copilot defaults", () => { afterEach(() => { @@ -11,12 +22,7 @@ describe("openai-responses github-copilot defaults", () => { const model = getModel("github-copilot", "gpt-5-mini"); let capturedPayload: unknown; - vi.spyOn(globalThis, "fetch").mockResolvedValue( - new Response("data: [DONE]\n\n", { - status: 200, - headers: { "content-type": "text/event-stream" }, - }), - ); + mockDoneStream(); const stream = streamOpenAIResponses( model, @@ -41,4 +47,48 @@ describe("openai-responses github-copilot defaults", () => { reasoning: expect.anything(), }); }); + + it("streamSimple applies reasoning defaults without synthetic one-token cap", async () => { + const model: Model<"openai-responses"> = { + id: "gpt-5.5", + name: "gpt-5.5", + api: "openai-responses", + provider: "openai", + baseUrl: "https://api.openai.com/v1", + reasoning: true, + input: ["text"], + cost: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 200000, + maxTokens: 100000, + }; + let capturedPayload: { max_output_tokens?: unknown } | undefined; + + mockDoneStream(); + + const stream = streamSimple( + model, + { + systemPrompt: "sys", + messages: [{ role: "user", content: "hi", timestamp: Date.now() }], + }, + { + apiKey: "test-key", + reasoning: "xhigh", + onPayload: (payload) => { + capturedPayload = payload as { max_output_tokens?: unknown }; + }, + }, + ); + + for await (const event of stream) { + if (event.type === "done" || event.type === "error") break; + } + + expect(capturedPayload).toMatchObject({ + reasoning: { effort: "xhigh", summary: "auto" }, + max_output_tokens: 32000, + include: ["reasoning.encrypted_content"], + }); + expect(capturedPayload?.max_output_tokens).not.toBe(1); + }); }); diff --git a/packages/ai/test/tokens.test.ts b/packages/ai/test/tokens.test.ts index 4b0144cd..37d5ae8b 100644 --- a/packages/ai/test/tokens.test.ts +++ b/packages/ai/test/tokens.test.ts @@ -136,7 +136,7 @@ describe("Token Statistics on Abort", () => { }); describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider", () => { - const llm = getModel("xai", "grok-3-fast"); + const llm = getModel("xai", "grok-4.3"); it("should include token stats when aborted mid-stream", { retry: 3, timeout: 30000 }, async () => { await testTokensOnAbort(llm); diff --git a/packages/ai/test/tool-call-without-result.test.ts b/packages/ai/test/tool-call-without-result.test.ts index 7607609e..9cd90616 100644 --- a/packages/ai/test/tool-call-without-result.test.ts +++ b/packages/ai/test/tool-call-without-result.test.ts @@ -145,7 +145,7 @@ describe("Tool Call Without Result Tests", () => { }); describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider", () => { - const model = getModel("xai", "grok-3-fast"); + const model = getModel("xai", "grok-4.3"); it("should filter out tool calls without corresponding tool results", { retry: 3, timeout: 30000 }, async () => { await testToolCallWithoutResult(model); diff --git a/packages/ai/test/total-tokens.test.ts b/packages/ai/test/total-tokens.test.ts index 9a77bd58..971c1487 100644 --- a/packages/ai/test/total-tokens.test.ts +++ b/packages/ai/test/total-tokens.test.ts @@ -219,22 +219,18 @@ describe("totalTokens field", () => { // ========================================================================= describe.skipIf(!process.env.XAI_API_KEY)("xAI", () => { - it( - "grok-3-fast - should return totalTokens equal to sum of components", - { retry: 3, timeout: 60000 }, - async () => { - const llm = getModel("xai", "grok-3-fast"); + it("grok-4.3 - should return totalTokens equal to sum of components", { retry: 3, timeout: 60000 }, async () => { + const llm = getModel("xai", "grok-4.3"); - console.log(`\nxAI / ${llm.id}:`); - const { first, second } = await testTotalTokensWithCache(llm, { apiKey: process.env.XAI_API_KEY }); + console.log(`\nxAI / ${llm.id}:`); + const { first, second } = await testTotalTokensWithCache(llm, { apiKey: process.env.XAI_API_KEY }); - logUsage("First request", first); - logUsage("Second request", second); + logUsage("First request", first); + logUsage("Second request", second); - assertTotalTokensEqualsComponents(first); - assertTotalTokensEqualsComponents(second); - }, - ); + assertTotalTokensEqualsComponents(first); + assertTotalTokensEqualsComponents(second); + }); }); // ========================================================================= diff --git a/packages/ai/test/unicode-surrogate.test.ts b/packages/ai/test/unicode-surrogate.test.ts index 07cce99b..8b470dc9 100644 --- a/packages/ai/test/unicode-surrogate.test.ts +++ b/packages/ai/test/unicode-surrogate.test.ts @@ -540,7 +540,7 @@ describe("AI Providers Unicode Surrogate Pair Tests", () => { }); describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider Unicode Handling", () => { - const llm = getModel("xai", "grok-3"); + const llm = getModel("xai", "grok-4.3"); it("should handle emoji in tool results", { retry: 3, timeout: 30000 }, async () => { await testEmojiInToolResults(llm); diff --git a/packages/coding-agent/README.md b/packages/coding-agent/README.md index 8a6b648f..1343c392 100644 --- a/packages/coding-agent/README.md +++ b/packages/coding-agent/README.md @@ -346,7 +346,7 @@ The `subagent` tool delegates tasks to independent child agent processes. Each s **Agent definitions** live in `~/.dreb/agents/` (global) and `.dreb/agents/` (project). Each is a markdown file with YAML frontmatter specifying `name`, `model` (with provider fallback list), and optional `systemPrompt`. Built-in agents include `Explore` (read-only codebase exploration), `Sandbox` (restricted to `/tmp`), `feature-dev` (strong-tier coding), and several review agents. -**Model availability probes:** When an agent definition specifies a fallback list (comma-separated models), each model is verified with a lightweight 1-token API call before the subagent is spawned. Models that fail the probe (rate limit, quota exhaustion, auth failure, timeout) are skipped with a loud log line, and the next fallback is tried. If all configured models fail, the parent session's model is used as a last resort. Per-invocation model overrides and single-model configs skip probing entirely. +**Model availability probes:** When an agent definition specifies a fallback list (comma-separated models), each model is verified with a lightweight API call via the same `streamSimple` path the agent loop uses before the subagent is spawned. The probe uses normal coding-agent thinking defaults and does not pass a synthetic `maxTokens` override, which keeps the request shape representative for reasoning models as well as non-reasoning models. Models that fail the probe (rate limit, quota exhaustion, auth failure, timeout) are skipped with a loud log line, and the next fallback is tried. If all configured models fail, the parent session's model is used as a last resort. Per-invocation model overrides and single-model configs skip probing entirely. **Session metadata:** Each child process records its agent type in the session JSONL header (`agentType` field), providing an audit trail of which agent definition executed the work. diff --git a/packages/coding-agent/src/core/sdk.ts b/packages/coding-agent/src/core/sdk.ts index 8b3daab3..4cf4ffc8 100644 --- a/packages/coding-agent/src/core/sdk.ts +++ b/packages/coding-agent/src/core/sdk.ts @@ -14,6 +14,7 @@ import type { ResourceLoader } from "./resource-loader.js"; import { DefaultResourceLoader } from "./resource-loader.js"; import { getDefaultSessionDir, SessionManager } from "./session-manager.js"; import { SettingsManager } from "./settings-manager.js"; +import { resolveEffectiveThinkingLevel } from "./thinking.js"; import { time } from "./timings.js"; import { allTools, @@ -252,9 +253,7 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {} } // Clamp to model capabilities - if (!model || !model.reasoning) { - thinkingLevel = "off"; - } + thinkingLevel = resolveEffectiveThinkingLevel(model, thinkingLevel); // Tools that are always active when available (created by factory, not in allTools singleton). // suggest_next is only auto-activated when tools aren't explicitly specified — subagent diff --git a/packages/coding-agent/src/core/thinking.ts b/packages/coding-agent/src/core/thinking.ts new file mode 100644 index 00000000..4cd16db7 --- /dev/null +++ b/packages/coding-agent/src/core/thinking.ts @@ -0,0 +1,21 @@ +import type { ThinkingLevel as AgentThinkingLevel } from "@dreb/agent-core"; +import type { ThinkingLevel as AiThinkingLevel, Model } from "@dreb/ai"; +import { DEFAULT_THINKING_LEVEL } from "./defaults.js"; + +/** + * Resolve the effective thinking level for a model using the same capability + * clamp as normal coding-agent sessions. + */ +export function resolveEffectiveThinkingLevel( + model: Model | undefined, + thinkingLevel: AgentThinkingLevel | undefined, + defaultThinkingLevel: AgentThinkingLevel = DEFAULT_THINKING_LEVEL, +): AgentThinkingLevel { + const effectiveThinkingLevel = thinkingLevel ?? defaultThinkingLevel; + return model?.reasoning ? effectiveThinkingLevel : "off"; +} + +/** Convert an effective thinking level into the reasoning option passed to streamSimple. */ +export function thinkingLevelToReasoning(thinkingLevel: AgentThinkingLevel): AiThinkingLevel | undefined { + return thinkingLevel === "off" ? undefined : (thinkingLevel as AiThinkingLevel); +} diff --git a/packages/coding-agent/src/core/tools/subagent.ts b/packages/coding-agent/src/core/tools/subagent.ts index 5940eb4e..ce5a9f1a 100644 --- a/packages/coding-agent/src/core/tools/subagent.ts +++ b/packages/coding-agent/src/core/tools/subagent.ts @@ -4,7 +4,7 @@ import { existsSync, readdirSync, readFileSync, statSync } from "node:fs"; import { homedir } from "node:os"; import { join, resolve } from "node:path"; import type { AgentTool } from "@dreb/agent-core"; -import { type Api, type AssistantMessage, type Context, complete, type Model } from "@dreb/ai"; +import { type Api, type AssistantMessage, type Context, completeSimple, type Model } from "@dreb/ai"; import { Text } from "@dreb/tui"; import { type Static, Type } from "@sinclair/typebox"; import { CONFIG_DIR_NAME, getPackageDir, getSubagentSessionsDir } from "../../config.js"; @@ -14,6 +14,7 @@ import type { ToolDefinition, ToolRenderResultOptions } from "../extensions/type import { log } from "../logger.js"; import type { ModelRegistry } from "../model-registry.js"; import { resolveCliModel } from "../model-resolver.js"; +import { resolveEffectiveThinkingLevel, thinkingLevelToReasoning } from "../thinking.js"; import { getTextOutput, invalidArgText, str } from "./render-utils.js"; import { wrapToolDefinition } from "./tool-definition-wrapper.js"; import { DEFAULT_MAX_BYTES, formatSize, type TruncationResult } from "./truncate.js"; @@ -613,7 +614,7 @@ export async function probeModelAvailability( const probeSignal = makeProbeSignal(signal, timeoutMs); try { const context: Context = { - systemPrompt: "You are a model availability probe. Reply briefly.", + systemPrompt: "Reply with the single word OK.", messages: [{ role: "user", content: "hi", timestamp: Date.now() }], }; const apiKey = await Promise.race([ @@ -621,11 +622,18 @@ export async function probeModelAvailability( probeSignal.timeoutPromise, ]); if (signal?.aborted) return { ok: false, reason: "Aborted before spawn", aborted: true }; + const thinkingLevel = resolveEffectiveThinkingLevel(model, undefined); + const reasoning = thinkingLevelToReasoning(thinkingLevel); + // Use completeSimple — the same streamSimple path the agent loop uses. + // No maxTokens override is passed by the probe; buildBaseOptions applies + // normal model defaults. Reasoning is resolved through the shared normal + // coding-agent thinking default/clamp path instead of provider-specific + // probe logic, so reasoning models are probed with representative options. const result = await Promise.race([ - complete(model, context, { + completeSimple(model, context, { apiKey, maxRetryDelayMs: 0, - maxTokens: 1, + reasoning, signal: probeSignal.signal, }), probeSignal.timeoutPromise, diff --git a/packages/coding-agent/test/subagent-model-fallback.test.ts b/packages/coding-agent/test/subagent-model-fallback.test.ts index f08ab086..e8ab9efc 100644 --- a/packages/coding-agent/test/subagent-model-fallback.test.ts +++ b/packages/coding-agent/test/subagent-model-fallback.test.ts @@ -1,7 +1,7 @@ import { spawn } from "node:child_process"; import { EventEmitter } from "node:events"; import { PassThrough } from "node:stream"; -import { complete, type Model } from "@dreb/ai"; +import { complete, completeSimple, type Model } from "@dreb/ai"; import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; import { log } from "../src/core/logger.js"; import { @@ -31,11 +31,13 @@ vi.mock("@dreb/ai", async (importOriginal) => { return { ...actual, complete: vi.fn(), + completeSimple: vi.fn(), }; }); beforeEach(() => { vi.mocked(complete).mockReset(); + vi.mocked(completeSimple).mockReset(); vi.mocked(spawn).mockReset(); vi.spyOn(console, "error").mockImplementation(() => {}); vi.spyOn(log, "debug").mockImplementation(() => {}); @@ -584,7 +586,7 @@ function assistantResult(stopReason: "stop" | "error" | "aborted", errorMessage? stopReason, errorMessage, timestamp: Date.now(), - } as Awaited>; + } as Awaited>; } function probeRegistry() { @@ -651,25 +653,30 @@ function mockSpawnSubagentResult( } describe("spawn-time model availability probing", () => { - test("probeModelAvailability succeeds on a clean completion", async () => { - vi.mocked(complete).mockResolvedValueOnce(assistantResult("stop")); + test("probeModelAvailability succeeds on a clean completion via completeSimple (streamSimple path)", async () => { + vi.mocked(completeSimple).mockResolvedValueOnce(assistantResult("stop")); const result = await probeModelAvailability(probeModels[0], { registry: probeRegistry(), timeoutMs: 100 }); expect(result).toEqual({ ok: true }); - expect(complete).toHaveBeenCalledTimes(1); - expect(complete).toHaveBeenCalledWith( + expect(completeSimple).toHaveBeenCalledTimes(1); + expect(completeSimple).toHaveBeenCalledWith( probeModels[0], expect.objectContaining({ - systemPrompt: "You are a model availability probe. Reply briefly.", + systemPrompt: "Reply with the single word OK.", messages: [expect.objectContaining({ role: "user", content: "hi" })], }), - expect.objectContaining({ apiKey: "test-key", maxRetryDelayMs: 0, maxTokens: 1 }), + expect.objectContaining({ apiKey: "test-key", maxRetryDelayMs: 0, reasoning: "xhigh" }), ); + // Must NOT pass maxTokens — normal model defaults are used, which avoids + // tripping reasoning model minimums (e.g. OpenAI o-series with maxTokens:1). + const callOptions = vi.mocked(completeSimple).mock.calls[0][2]; + expect(callOptions).not.toHaveProperty("maxTokens"); + expect(callOptions).toHaveProperty("reasoning", "xhigh"); }); test("probeModelAvailability reports thrown errors", async () => { - vi.mocked(complete).mockRejectedValueOnce(new Error("rate limit exceeded")); + vi.mocked(completeSimple).mockRejectedValueOnce(new Error("rate limit exceeded")); const result = await probeModelAvailability(probeModels[0], { registry: probeRegistry(), timeoutMs: 100 }); @@ -677,7 +684,7 @@ describe("spawn-time model availability probing", () => { }); test("probeModelAvailability treats returned aborted messages as unavailable", async () => { - vi.mocked(complete).mockResolvedValueOnce(assistantResult("aborted", "request cancelled")); + vi.mocked(completeSimple).mockResolvedValueOnce(assistantResult("aborted", "request cancelled")); const result = await probeModelAvailability(probeModels[0], { registry: probeRegistry(), timeoutMs: 100 }); @@ -695,14 +702,14 @@ describe("spawn-time model availability probing", () => { }); expect(result).toEqual({ ok: false, reason: "Aborted before spawn", aborted: true }); - expect(complete).not.toHaveBeenCalled(); + expect(completeSimple).not.toHaveBeenCalled(); }); test("probeModelAvailability propagates parent abort while in flight", async () => { const controller = new AbortController(); - vi.mocked(complete).mockImplementationOnce( + vi.mocked(completeSimple).mockImplementationOnce( (_model, _context, options) => - new Promise>>((resolve) => { + new Promise>>((resolve) => { options?.signal?.addEventListener("abort", () => resolve(assistantResult("aborted", "request cancelled")), ); @@ -721,7 +728,9 @@ describe("spawn-time model availability probing", () => { test("probeModelAvailability enforces timeout even if provider ignores abort", async () => { vi.useFakeTimers(); - vi.mocked(complete).mockImplementationOnce(() => new Promise>>(() => {})); + vi.mocked(completeSimple).mockImplementationOnce( + () => new Promise>>(() => {}), + ); const resultPromise = probeModelAvailability(probeModels[0], { registry: probeRegistry(), timeoutMs: 50 }); await vi.advanceTimersByTimeAsync(50); @@ -740,7 +749,7 @@ describe("spawn-time model availability probing", () => { }); test("fallback loop uses the first model when its probe succeeds", async () => { - vi.mocked(complete).mockResolvedValueOnce(assistantResult("stop")); + vi.mocked(completeSimple).mockResolvedValueOnce(assistantResult("stop")); const result = await resolveModelForSubagentSpawn( ["primary-model", "fallback-model"], @@ -755,11 +764,11 @@ describe("spawn-time model availability probing", () => { expect(result.provider).toBe("anthropic"); expect(result.skippedModels).toEqual([]); } - expect(complete).toHaveBeenCalledTimes(1); + expect(completeSimple).toHaveBeenCalledTimes(1); }); test("fallback loop skips a failed probe and uses the next fallback", async () => { - vi.mocked(complete) + vi.mocked(completeSimple) .mockResolvedValueOnce(assistantResult("error", "429 rate limit")) .mockResolvedValueOnce(assistantResult("stop")); @@ -775,7 +784,7 @@ describe("spawn-time model availability probing", () => { expect(result.modelId).toBe("fallback-model"); expect(result.skippedModels).toEqual([{ model: "primary-model", reason: "429 rate limit" }]); } - expect(complete).toHaveBeenCalledTimes(2); + expect(completeSimple).toHaveBeenCalledTimes(2); expect(log.warn).toHaveBeenCalledWith( '[subagent] Model "primary-model" failed probe (429 rate limit). Trying next fallback...', ); @@ -784,7 +793,7 @@ describe("spawn-time model availability probing", () => { test.each(["429 rate limit", "insufficient quota", "probe timeout", "HTTP 503 upstream unavailable"])( "fallback loop skips probe error: %s", async (message) => { - vi.mocked(complete) + vi.mocked(completeSimple) .mockResolvedValueOnce(assistantResult("error", message)) .mockResolvedValueOnce(assistantResult("stop")); @@ -801,7 +810,7 @@ describe("spawn-time model availability probing", () => { ); test("fallback loop uses parent model when all configured model probes fail", async () => { - vi.mocked(complete) + vi.mocked(completeSimple) .mockResolvedValueOnce(assistantResult("error", "primary quota exhausted")) .mockRejectedValueOnce(new Error("fallback auth revoked")); @@ -821,11 +830,11 @@ describe("spawn-time model availability probing", () => { { model: "fallback-model", reason: "fallback auth revoked" }, ]); } - expect(complete).toHaveBeenCalledTimes(2); + expect(completeSimple).toHaveBeenCalledTimes(2); }); test("fallback loop returns an error when parent model also fails", async () => { - vi.mocked(complete) + vi.mocked(completeSimple) .mockResolvedValueOnce(assistantResult("error", "primary down")) .mockResolvedValueOnce(assistantResult("error", "fallback down")); @@ -860,12 +869,12 @@ describe("spawn-time model availability probing", () => { ); expect(result).toEqual({ ok: false, error: "Aborted before spawn", skippedModels: [] }); - expect(complete).not.toHaveBeenCalled(); + expect(completeSimple).not.toHaveBeenCalled(); }); test("fallback loop exits when signal aborts during probing", async () => { const controller = new AbortController(); - vi.mocked(complete).mockImplementationOnce(async () => { + vi.mocked(completeSimple).mockImplementationOnce(async () => { controller.abort(new Error("user cancelled")); return assistantResult("aborted", "request cancelled"); }); @@ -879,7 +888,7 @@ describe("spawn-time model availability probing", () => { ); expect(result).toEqual({ ok: false, error: "Aborted before spawn", skippedModels: [] }); - expect(complete).toHaveBeenCalledTimes(1); + expect(completeSimple).toHaveBeenCalledTimes(1); }); test("array model config without registry skips probing", async () => { @@ -892,7 +901,7 @@ describe("spawn-time model availability probing", () => { expect(result.ok).toBe(true); if (result.ok) expect(result.modelId).toBe("primary-model"); - expect(complete).not.toHaveBeenCalled(); + expect(completeSimple).not.toHaveBeenCalled(); }); test("single model config skips probing", async () => { @@ -900,7 +909,7 @@ describe("spawn-time model availability probing", () => { expect(result.ok).toBe(true); if (result.ok) expect(result.modelId).toBe("primary-model"); - expect(complete).not.toHaveBeenCalled(); + expect(completeSimple).not.toHaveBeenCalled(); }); test("fallback summary formatting and prepending are visible in output", () => { @@ -917,7 +926,7 @@ describe("spawn-time model availability probing", () => { }); test("executeSingle prepends warning before fallback summary when parent model is used", async () => { - vi.mocked(complete) + vi.mocked(completeSimple) .mockResolvedValueOnce(assistantResult("error", "primary quota exhausted")) .mockRejectedValueOnce(new Error("fallback auth revoked")); mockSpawnSubagentResult({ model: "parent-model", output: "child output" }); @@ -950,7 +959,7 @@ describe("spawn-time model availability probing", () => { }); test("executeSingle includes skipped model details when model resolution fails", async () => { - vi.mocked(complete) + vi.mocked(completeSimple) .mockResolvedValueOnce(assistantResult("error", "primary down")) .mockResolvedValueOnce(assistantResult("error", "fallback down")); @@ -996,12 +1005,96 @@ describe("spawn-time model availability probing", () => { expect(result.exitCode).toBe(0); expect(result.model).toBe("parent-model"); expect(result.output).toBe("override output"); - expect(complete).not.toHaveBeenCalled(); + expect(completeSimple).not.toHaveBeenCalled(); expect(spawn).toHaveBeenCalledTimes(1); expect(vi.mocked(spawn).mock.calls[0][1]).toContain("parent-model"); }); }); +describe("probe uses streamSimple path (issue 215 regression)", () => { + test("probe does NOT use low-level complete() — uses completeSimple (streamSimple path)", async () => { + // The old implementation used complete() which calls provider.stream() directly. + // The new implementation uses completeSimple() which calls provider.streamSimple(), + // the same unified path the agent loop uses. This ensures probes exercise the + // same code path as real subagent execution. + vi.mocked(completeSimple).mockResolvedValueOnce(assistantResult("stop")); + + await probeModelAvailability(probeModels[0], { registry: probeRegistry(), timeoutMs: 100 }); + + // completeSimple was called, not the low-level complete path. + expect(completeSimple).toHaveBeenCalledTimes(1); + expect(complete).not.toHaveBeenCalled(); + }); + + test("probe does NOT pass maxTokens:1 — avoids reasoning model failures", async () => { + // OpenAI reasoning models (o1, o3, etc.) reject or malfunction with maxTokens:1 + // because reasoning tokens count against the completion token budget. The probe + // must not set maxTokens at all, letting buildBaseOptions apply normal defaults. + vi.mocked(completeSimple).mockResolvedValueOnce(assistantResult("stop")); + + await probeModelAvailability(probeModels[0], { registry: probeRegistry(), timeoutMs: 100 }); + + const callOptions = vi.mocked(completeSimple).mock.calls[0][2]; + expect(callOptions).not.toHaveProperty("maxTokens"); + }); + + test("probe leaves reasoning disabled for non-reasoning models", async () => { + vi.mocked(completeSimple).mockResolvedValueOnce(assistantResult("stop")); + + await probeModelAvailability(probeModels[1], { registry: probeRegistry(), timeoutMs: 100 }); + + const callOptions = vi.mocked(completeSimple).mock.calls[0][2]; + expect(callOptions).not.toHaveProperty("maxTokens"); + expect(callOptions?.reasoning).toBeUndefined(); + }); + + test("probe works for openai-responses reasoning model with normal reasoning default", async () => { + // Simulate an OpenAI reasoning model (e.g. gpt-5.5) — the old maxTokens:1 probe + // would fail on these because the provider sends max_output_tokens:1 which + // is too small for reasoning token overhead. The probe must also pass the + // normal coding-agent thinking default so streamSimple does not disable reasoning. + const reasoningModel: Model<"openai-responses"> = { + id: "gpt-5.5", + name: "gpt-5.5", + api: "openai-responses", + provider: "openai", + baseUrl: "https://api.openai.com", + reasoning: true, + input: ["text"], + cost: { input: 10, output: 40, cacheRead: 1, cacheWrite: 10 }, + contextWindow: 200000, + maxTokens: 100000, + }; + const reasoningRegistry = { + getAll: () => [reasoningModel], + find: (provider: string, modelId: string) => + provider === "openai" && modelId === "gpt-5.5" ? reasoningModel : undefined, + getApiKey: async () => "test-key", + authStorage: { hasAuth: () => true }, + } as unknown as Parameters[2]; + + vi.mocked(completeSimple).mockResolvedValueOnce({ + ...assistantResult("stop"), + api: "openai-responses", + provider: "openai", + model: "gpt-5.5", + }); + + const result = await probeModelAvailability(reasoningModel, { + registry: reasoningRegistry, + timeoutMs: 100, + }); + + expect(result).toEqual({ ok: true }); + expect(completeSimple).toHaveBeenCalledTimes(1); + // Verify no maxTokens was passed — critical for reasoning models — and + // the normal coding-agent thinking default was forwarded through streamSimple. + const callOptions = vi.mocked(completeSimple).mock.calls[0][2]; + expect(callOptions).not.toHaveProperty("maxTokens"); + expect(callOptions).toHaveProperty("reasoning", "xhigh"); + }); +}); + describe("subagent promptGuidelines", () => { test("waiting guideline mentions agent_end explicitly", () => { const guidelines = subagentToolDefinition.promptGuidelines ?? []; From 8170da33dc3bd51cd8cf7200315d4d5fde17c800 Mon Sep 17 00:00:00 2001 From: m-aebrer Date: Tue, 19 May 2026 16:33:55 -0400 Subject: [PATCH 3/4] Fix review findings 1-3 --- .../coding-agent/src/core/tools/subagent.ts | 7 ++- .../test/sdk-session-manager.test.ts | 36 +++++++++++- .../test/subagent-model-fallback.test.ts | 16 +++++ packages/coding-agent/test/thinking.test.ts | 58 +++++++++++++++++++ 4 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 packages/coding-agent/test/thinking.test.ts diff --git a/packages/coding-agent/src/core/tools/subagent.ts b/packages/coding-agent/src/core/tools/subagent.ts index ce5a9f1a..d01460e8 100644 --- a/packages/coding-agent/src/core/tools/subagent.ts +++ b/packages/coding-agent/src/core/tools/subagent.ts @@ -33,6 +33,7 @@ export interface AgentTypeConfig { } const DEFAULT_AGENT = "Explore"; +export const DEFAULT_MODEL_AVAILABILITY_PROBE_TIMEOUT_MS = 120_000; export function parseAgentFrontmatter( content: string, @@ -542,11 +543,11 @@ export function resolveModelStringSingle( } export interface ProbeModelAvailabilityOptions { - /** Parent/tool abort signal. A 10s probe timeout is layered on top. */ + /** Parent/tool abort signal. A model availability probe timeout is layered on top. */ signal?: AbortSignal; /** Model registry used to resolve provider API keys for the probe call. */ registry?: ModelRegistry; - /** Override the default 10s probe timeout; primarily useful for tests. */ + /** Override the default model availability probe timeout; primarily useful for tests. */ timeoutMs?: number; } @@ -608,7 +609,7 @@ export async function probeModelAvailability( model: Model, options: ProbeModelAvailabilityOptions = {}, ): Promise { - const { signal, registry, timeoutMs = 10_000 } = options; + const { signal, registry, timeoutMs = DEFAULT_MODEL_AVAILABILITY_PROBE_TIMEOUT_MS } = options; if (signal?.aborted) return { ok: false, reason: "Aborted before spawn", aborted: true }; const probeSignal = makeProbeSignal(signal, timeoutMs); diff --git a/packages/coding-agent/test/sdk-session-manager.test.ts b/packages/coding-agent/test/sdk-session-manager.test.ts index e77ba054..419eac58 100644 --- a/packages/coding-agent/test/sdk-session-manager.test.ts +++ b/packages/coding-agent/test/sdk-session-manager.test.ts @@ -1,10 +1,24 @@ import { existsSync, mkdirSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; -import { findModel } from "@dreb/ai"; +import { findModel, type Model } from "@dreb/ai"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { createAgentSession } from "../src/core/sdk.js"; import { SessionManager } from "../src/core/session-manager.js"; +import { createTestResourceLoader } from "./utilities.js"; + +const nonReasoningModel: Model<"anthropic-messages"> = { + id: "non-reasoning-model", + name: "Non-reasoning Model", + api: "anthropic-messages", + provider: "anthropic", + baseUrl: "https://api.anthropic.com", + reasoning: false, + input: ["text"], + cost: { input: 1, output: 3, cacheRead: 0.1, cacheWrite: 1 }, + contextWindow: 128000, + maxTokens: 8192, +}; describe("createAgentSession session manager defaults", () => { let tempDir: string; @@ -63,4 +77,24 @@ describe("createAgentSession session manager defaults", () => { session.dispose(); }); + + it.each(["high", "xhigh"] as const)( + "clamps explicit %s thinking to off for a non-reasoning model", + async (thinkingLevel) => { + const { session } = await createAgentSession({ + cwd, + agentDir, + model: nonReasoningModel, + thinkingLevel, + sessionManager: SessionManager.inMemory(cwd), + resourceLoader: createTestResourceLoader(), + }); + + try { + expect(session.thinkingLevel).toBe("off"); + } finally { + session.dispose(); + } + }, + ); }); diff --git a/packages/coding-agent/test/subagent-model-fallback.test.ts b/packages/coding-agent/test/subagent-model-fallback.test.ts index e8ab9efc..f0676f97 100644 --- a/packages/coding-agent/test/subagent-model-fallback.test.ts +++ b/packages/coding-agent/test/subagent-model-fallback.test.ts @@ -6,6 +6,7 @@ import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; import { log } from "../src/core/logger.js"; import { type AgentTypeConfig, + DEFAULT_MODEL_AVAILABILITY_PROBE_TIMEOUT_MS, executeSingle, formatModelFallbackSummary, isRuntimeUnavailableError, @@ -741,6 +742,21 @@ describe("spawn-time model availability probing", () => { }); }); + test("probeModelAvailability uses the named default timeout", async () => { + vi.useFakeTimers(); + vi.mocked(completeSimple).mockImplementationOnce( + () => new Promise>>(() => {}), + ); + + const resultPromise = probeModelAvailability(probeModels[0], { registry: probeRegistry() }); + await vi.advanceTimersByTimeAsync(DEFAULT_MODEL_AVAILABILITY_PROBE_TIMEOUT_MS); + + await expect(resultPromise).resolves.toEqual({ + ok: false, + reason: `Model availability probe timed out after ${DEFAULT_MODEL_AVAILABILITY_PROBE_TIMEOUT_MS}ms`, + }); + }); + test("isRuntimeUnavailableError treats provider error messages as unavailable", () => { expect(isRuntimeUnavailableError(assistantResult("error", "quota exhausted"))).toBe(true); expect(isRuntimeUnavailableError(new Error("timeout"))).toBe(true); diff --git a/packages/coding-agent/test/thinking.test.ts b/packages/coding-agent/test/thinking.test.ts new file mode 100644 index 00000000..194f4605 --- /dev/null +++ b/packages/coding-agent/test/thinking.test.ts @@ -0,0 +1,58 @@ +import type { ThinkingLevel as AgentThinkingLevel } from "@dreb/agent-core"; +import type { ThinkingLevel as AiThinkingLevel, Model } from "@dreb/ai"; +import { describe, expect, test } from "vitest"; +import { resolveEffectiveThinkingLevel, thinkingLevelToReasoning } from "../src/core/thinking.js"; + +const reasoningModel: Model<"anthropic-messages"> = { + id: "reasoning-model", + name: "Reasoning Model", + api: "anthropic-messages", + provider: "anthropic", + baseUrl: "https://api.anthropic.com", + reasoning: true, + input: ["text"], + cost: { input: 1, output: 3, cacheRead: 0.1, cacheWrite: 1 }, + contextWindow: 200000, + maxTokens: 8192, +}; + +const nonReasoningModel: Model<"anthropic-messages"> = { + ...reasoningModel, + id: "non-reasoning-model", + name: "Non-reasoning Model", + reasoning: false, +}; + +describe("resolveEffectiveThinkingLevel", () => { + test("undefined model clamps to off even if a thinking level is provided", () => { + expect(resolveEffectiveThinkingLevel(undefined, "high")).toBe("off"); + }); + + test("reasoning model with undefined thinking uses the default parameter", () => { + expect(resolveEffectiveThinkingLevel(reasoningModel, undefined, "low")).toBe("low"); + }); + + test.each(["minimal", "low", "medium", "high"] satisfies AgentThinkingLevel[])( + "reasoning model preserves explicit %s thinking level", + (thinkingLevel) => { + expect(resolveEffectiveThinkingLevel(reasoningModel, thinkingLevel)).toBe(thinkingLevel); + }, + ); + + test("non-reasoning model clamps to off", () => { + expect(resolveEffectiveThinkingLevel(nonReasoningModel, "high")).toBe("off"); + }); +}); + +describe("thinkingLevelToReasoning", () => { + test("returns undefined for off", () => { + expect(thinkingLevelToReasoning("off")).toBeUndefined(); + }); + + test.each(["minimal", "low", "medium", "high", "xhigh"] satisfies AiThinkingLevel[])( + "passes through %s", + (thinkingLevel) => { + expect(thinkingLevelToReasoning(thinkingLevel)).toBe(thinkingLevel); + }, + ); +}); From 8beb349f7abb213c7274010192f61464813eaa7f Mon Sep 17 00:00:00 2001 From: m-aebrer Date: Tue, 19 May 2026 17:02:07 -0400 Subject: [PATCH 4/4] chore: bump version to 2.19.3 --- package-lock.json | 28 +++++++++---------- package.json | 2 +- packages/agent/package.json | 2 +- packages/ai/package.json | 2 +- packages/coding-agent/package.json | 2 +- .../.claude-plugin/plugin.json | 2 +- packages/semantic-search/package.json | 2 +- packages/telegram/package.json | 2 +- packages/tui/package.json | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/package-lock.json b/package-lock.json index 696b274e..ea62c09e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "dreb", - "version": "2.19.2", + "version": "2.19.3", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "dreb", - "version": "2.19.2", + "version": "2.19.3", "workspaces": [ "packages/*", "packages/coding-agent/examples/extensions/with-deps", @@ -3953,9 +3953,9 @@ "license": "MIT" }, "node_modules/brace-expansion": { - "version": "5.0.5", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.5.tgz", - "integrity": "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ==", + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.6.tgz", + "integrity": "sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==", "license": "MIT", "dependencies": { "balanced-match": "^4.0.2" @@ -8633,9 +8633,9 @@ "license": "ISC" }, "node_modules/ws": { - "version": "8.20.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.0.tgz", - "integrity": "sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA==", + "version": "8.20.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", + "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", "license": "MIT", "engines": { "node": ">=10.0.0" @@ -8763,7 +8763,7 @@ }, "packages/agent": { "name": "@dreb/agent-core", - "version": "2.19.2", + "version": "2.19.3", "license": "MIT", "dependencies": { "@dreb/ai": "*" @@ -8792,7 +8792,7 @@ }, "packages/ai": { "name": "@dreb/ai", - "version": "2.19.2", + "version": "2.19.3", "license": "MIT", "dependencies": { "@anthropic-ai/sdk": "^0.73.0", @@ -8848,7 +8848,7 @@ }, "packages/coding-agent": { "name": "@dreb/coding-agent", - "version": "2.19.2", + "version": "2.19.3", "license": "MIT", "dependencies": { "@dreb/agent-core": "*", @@ -8963,7 +8963,7 @@ }, "packages/semantic-search": { "name": "@dreb/semantic-search", - "version": "2.19.2", + "version": "2.19.3", "license": "MIT", "dependencies": { "@huggingface/transformers": "^4.0.1", @@ -9012,7 +9012,7 @@ }, "packages/telegram": { "name": "@dreb/telegram", - "version": "2.19.2", + "version": "2.19.3", "dependencies": { "@dreb/coding-agent": "*", "grammy": "^1.35.0" @@ -9044,7 +9044,7 @@ }, "packages/tui": { "name": "@dreb/tui", - "version": "2.19.2", + "version": "2.19.3", "license": "MIT", "dependencies": { "@types/mime-types": "^2.1.4", diff --git a/package.json b/package.json index 7bbec91b..115661de 100644 --- a/package.json +++ b/package.json @@ -34,7 +34,7 @@ "engines": { "node": ">=20.0.0" }, - "version": "2.19.2", + "version": "2.19.3", "dependencies": { "@mariozechner/jiti": "^2.6.5", "@dreb/coding-agent": "*", diff --git a/packages/agent/package.json b/packages/agent/package.json index 66ac4503..02b67f6a 100644 --- a/packages/agent/package.json +++ b/packages/agent/package.json @@ -1,6 +1,6 @@ { "name": "@dreb/agent-core", - "version": "2.19.2", + "version": "2.19.3", "description": "General-purpose agent with transport abstraction, state management, and attachment support", "type": "module", "main": "./dist/index.js", diff --git a/packages/ai/package.json b/packages/ai/package.json index de003b9a..0f1ab03e 100644 --- a/packages/ai/package.json +++ b/packages/ai/package.json @@ -1,6 +1,6 @@ { "name": "@dreb/ai", - "version": "2.19.2", + "version": "2.19.3", "description": "Unified LLM API with automatic model discovery and provider configuration", "type": "module", "main": "./dist/index.js", diff --git a/packages/coding-agent/package.json b/packages/coding-agent/package.json index 6fbce494..eb1d4a87 100644 --- a/packages/coding-agent/package.json +++ b/packages/coding-agent/package.json @@ -1,6 +1,6 @@ { "name": "@dreb/coding-agent", - "version": "2.19.2", + "version": "2.19.3", "description": "Coding agent CLI with read, bash, edit, write tools and session management", "type": "module", "drebConfig": { diff --git a/packages/semantic-search/.claude-plugin/plugin.json b/packages/semantic-search/.claude-plugin/plugin.json index d4839e5f..1a72da0e 100644 --- a/packages/semantic-search/.claude-plugin/plugin.json +++ b/packages/semantic-search/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "semantic-search", "description": "Semantic codebase search — natural language queries over code and docs using embeddings, tree-sitter parsing, and POEM multi-signal ranking", - "version": "2.19.2", + "version": "2.19.3", "author": { "name": "Drew Brereton" }, diff --git a/packages/semantic-search/package.json b/packages/semantic-search/package.json index efd7aa14..9ae0c046 100644 --- a/packages/semantic-search/package.json +++ b/packages/semantic-search/package.json @@ -1,6 +1,6 @@ { "name": "@dreb/semantic-search", - "version": "2.19.2", + "version": "2.19.3", "description": "Semantic codebase search engine with embedding-based ranking and MCP server", "publishConfig": { "access": "public" diff --git a/packages/telegram/package.json b/packages/telegram/package.json index 9cd198bb..f11a6d29 100644 --- a/packages/telegram/package.json +++ b/packages/telegram/package.json @@ -1,6 +1,6 @@ { "name": "@dreb/telegram", - "version": "2.19.2", + "version": "2.19.3", "description": "Telegram bot frontend for dreb coding agent", "type": "module", "main": "./dist/index.js", diff --git a/packages/tui/package.json b/packages/tui/package.json index 035af85d..efdaadb3 100644 --- a/packages/tui/package.json +++ b/packages/tui/package.json @@ -1,6 +1,6 @@ { "name": "@dreb/tui", - "version": "2.19.2", + "version": "2.19.3", "description": "Terminal User Interface library with differential rendering for efficient text-based applications", "type": "module", "main": "dist/index.js",