From 478de970392fc29df28acb9ec08adf7a286665e8 Mon Sep 17 00:00:00 2001 From: JangHaryeom <101104772+CocoRoF@users.noreply.github.com> Date: Wed, 20 May 2026 16:49:47 +0900 Subject: [PATCH] chore(deps): bump geny-executor pin to >=2.0.6 + slim down llm_patches.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit geny-executor 2.0.6 (released today, see CocoRoF/geny-executor#208) absorbed four of the five compat patches that previously lived in ``service/llm_patches.py``: A. ``--verbose`` auto-injected when ``--print --output-format= stream-json`` is used (required by Claude Code CLI ≥ 2.1.x). B. ``--bare`` auto-stripped when no ``ANTHROPIC_API_KEY`` is in env, so the OAuth subscription path no longer crashes. C. Dropped the auto-``--tools ""`` emit on MCP-configured spawns — CLI built-ins (``Bash`` / ``Read`` / ``Write`` / ``Edit`` / …) now stay available alongside MCP-wrapped host tools, which is what every host (notably Geny's Sub-Worker) actually wants. D. ``StreamJsonAccumulator.finalize`` (and ``parse_json_output_to_response``) now drop the ``tool_use`` blocks the CLI dispatched internally, so Stage 10 naturally no-ops instead of ghost-erroring against the ``mcp__geny__`` ids it has no registration for. Plus the executor lost the dead ``copilot_cli`` provider entirely (Geny had already removed its end of that wiring in #827). This bumps both pins (``pyproject.toml`` + ``requirements.txt``) from ``>=2.0.5`` to ``>=2.0.6``. ``llm_patches.py`` shrinks from ~830 lines to ~479 lines. Two patches remain — both are Geny-specific and won't fold upstream without the executor gaining hooks Geny is the only consumer of: 1. Friendly Korean error messages for ``is_error`` stream-json result envelopes (auth-expired → Settings card hint, generic API errors → structured short message). Will fold upstream once the executor gains a proper i18n hook. 2. CLI-tool observability into Geny's :class:`SessionLogger` via the new :data:`cli_stream_logger_ctx` ContextVar — taps ``StreamJsonAccumulator.feed`` to surface CLI built-in tool calls (Bash / Read / Write / Edit / …) into the session log alongside the MCP tool entries that ``mcp_bridge_controller`` already emits. ``mcp__*`` prefixed names are skipped so the bridge-side log isn't double-rendered. Will fold upstream once the executor emits first-class CLI-tool events on the pipeline event bus. Tests are rewritten end-to-end: - Old: 21 tests for ``_patched_argv`` predicate + idempotent installer (the patched-argv function is gone). - New: 14 focused tests covering: - ``_friendly_error_message_for_result_envelope`` (3 cases: auth, generic API error, fallback). - ``_maybe_extract_error_envelope`` (bytes/str input + 7 parametrised non-match cases). - ``install_llm_patches`` idempotency + re-export coverage. - Assembler patch end-to-end: raises Korean friendly error on auth-failure envelope; passes through clean streams. - Stream observability: emits ``log_tool_use`` for non-MCP tool blocks; skips ``mcp__*`` to avoid bridge double-render; emits ``log_tool_result`` with duration; handles both string and content-block-list result shapes; inert when no ContextVar is set. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/pyproject.toml | 2 +- backend/requirements.txt | 2 +- backend/service/llm_patches.py | 515 +++------------ backend/tests/service/test_llm_patches.py | 762 ++++++++++------------ 4 files changed, 409 insertions(+), 872 deletions(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index e3727251..ab3cc2e7 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ # provider/parallel/max_concurrent for multi-provider sub-agents. # Earlier 1.21.0 surfaces (bounded root _index.json, provider-driven # Stage 2 / Stage 18, etc.) all retained. - "geny-executor>=2.0.5,<3.0.0", + "geny-executor>=2.0.6,<3.0.0", # MCP (Model Context Protocol) "mcp>=1.0.0", # Auth diff --git a/backend/requirements.txt b/backend/requirements.txt index 2f3d60cf..77bd138f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -26,7 +26,7 @@ playwright>=1.49.0 # Must match the version pin in backend/pyproject.toml — out-of-sync # pins are what caused the prod Docker build to install 1.21.0 and # blow up on `from geny_executor import CredentialBundle`. -geny-executor>=2.0.5,<3.0.0 +geny-executor>=2.0.6,<3.0.0 # MCP (Model Context Protocol) mcp>=1.0.0 # TTS (Text-to-Speech) diff --git a/backend/service/llm_patches.py b/backend/service/llm_patches.py index 16686dd7..039da0e7 100644 --- a/backend/service/llm_patches.py +++ b/backend/service/llm_patches.py @@ -1,41 +1,49 @@ """ Runtime patches for ``geny-executor``'s LLM client integrations. -Why this exists ---------------- -``geny-executor`` 2.0.1 (current pinned version) builds the Claude -Code CLI argv via -``geny_executor.llm_client.translators._cli.claude_code_argv``. For -streaming requests it emits: - - --print --input-format stream-json --output-format stream-json - --include-partial-messages … - -The Claude Code CLI's release pipeline tightened the validation -sometime after ``geny-executor`` 2.0.1 was published: passing -``--print`` together with ``--output-format=stream-json`` now also -requires ``--verbose``, otherwise the CLI exits 1 with:: - - Error: When using --print, --output-format=stream-json requires --verbose - -Symptom on the user side: every Developer-role session with the -Claude Code backend selected (`Environment 6단계 → API → Claude Code`) -fails at the first command with the CLI message bubbled up -verbatim. - -The fix -------- -Monkey-patch ``claude_code_argv`` to insert ``--verbose`` right -after ``--print`` whenever the produced argv contains -``--output-format stream-json``. The patch is **idempotent** — -re-running ``install_llm_patches()`` is a no-op once the wrapper is -already in place. We also conditionally skip patching if a future -``geny-executor`` ships the fix upstream (detected by checking -whether the original argv already contains ``--verbose``). - -Tested against the upstream behaviour: the non-streaming branch -(`--output-format json`) does NOT need ``--verbose`` and we leave -it alone. +Why this exists (cycle 20260520, post-2.0.6) +-------------------------------------------- +``geny-executor`` 2.0.6 absorbed every generic Claude-Code CLI compat +patch this module used to carry — ``--verbose`` auto-injection for +stream-json output, ``--bare`` strip on the OAuth path, drop of the +auto-``--tools ""`` emit (CLI built-ins now stay available alongside +MCP-wrapped host tools), and the ``StreamJsonAccumulator.finalize`` +``tool_use`` strip that keeps Stage 10 from ghost-erroring against the +CLI's internal dispatches. None of those need monkey-patching anymore; +the executor's argv builder + accumulator behave correctly out of the +box for hosts pinned to ``>=2.0.6``. + +Two Geny-specific hooks remain here: + +1. **Friendly Korean error message for auth-expired envelopes.** The + executor surfaces a generic ``RuntimeError``; we sugar the wire-shape + ``{"is_error": true, "api_error_status": 401, + "error": "authentication_failed"}`` into a Korean human-readable + prompt that points the user at the LLM Backends Settings card. Same + patch turns other ``is_error`` result envelopes into a structured + message instead of the runtime's empty-stderr fallback. + +2. **CLI-handled tool call observability into Geny's SessionLogger.** + The executor (correctly, per the Phase-I design contract) drops + ``tool_use`` blocks from its assembled ``APIResponse`` so host + pipelines don't try to re-dispatch them. That also means Geny's + Stage 10 never fires the ``tool.call_start`` / ``tool.call_complete`` + events the session_logger taps into for *CLI built-in* (Bash, Read, + Write, Edit, Glob, Grep, WebFetch, WebSearch, …) tool use — those + tools never round-trip through Geny's MCP bridge either, so without + an explicit tap they would disappear from the session log entirely. + We monkey-patch ``StreamJsonAccumulator.feed`` to peek at every + stream-json line, emit ``session_logger.log_tool_use`` / + ``log_tool_result`` for non-``mcp__*`` tool blocks (MCP tools are + already logged from ``mcp_bridge_controller``; duplicating would + render them twice in the UI), and route the logger through a + ``ContextVar`` set by ``AgentSession.invoke()`` / ``astream()`` + for the duration of a turn. + +Both patches will fold upstream eventually — error envelope friendly +messages once the executor gains a proper i18n hook, and the +observability tap once the executor emits first-class CLI-tool +events through the pipeline event bus. Until then they live here. """ from __future__ import annotations @@ -49,8 +57,8 @@ logger = getLogger(__name__) -_PATCH_APPLIED_FLAG = "_geny_verbose_patch_applied" _ASSEMBLER_PATCH_APPLIED_FLAG = "_geny_assembler_error_patch_applied" +_OBSERVABILITY_PATCH_APPLIED_FLAG = "_geny_accumulator_observability_patch_applied" # Context variable carrying the active Geny ``SessionLogger`` so the @@ -68,13 +76,15 @@ "geny_cli_stream_logger_ctx", default=None, ) -# Cached wrapper. Set on the first ``install_llm_patches()`` call so -# subsequent installs re-use the same instance — matters because the -# wrapper has to be present (with identical identity) on three -# separate module attributes. A fresh wrapper per install would leave -# stale references on whichever module didn't get re-patched. -_cached_wrapper: Any = None +# Cached wrappers, re-used across repeated ``install_llm_patches()`` +# calls so the wrapper identity stays consistent on every patched +# module attribute. The Geny app boot can call ``install_llm_patches()`` +# multiple times during config reloads; without cached wrappers we'd +# either double-stack or leave stale references on whichever module +# didn't get re-patched. _cached_assembler_wrapper: Any = None +_cached_accumulator_init: Any = None +_cached_accumulator_feed: Any = None # Human-readable message shown to the end user when the Claude CLI @@ -143,233 +153,16 @@ def _maybe_extract_error_envelope(raw: Any) -> Optional[Dict[str, Any]]: return line -def _patched_argv( - original_argv: List[str], - *, - has_api_key: Optional[bool] = None, -) -> List[str]: - """Return a transformed argv applying three independent fixes: - - A. ``--verbose`` injection — required after ``--print - --output-format stream-json`` for CLI ≥ 2.1.x. See the - module docstring for the history. - - B. ``--bare`` stripping on the OAuth path — the CLI's - ``--bare`` flag explicitly disables OAuth ("Anthropic auth - is strictly ANTHROPIC_API_KEY or apiKeyHelper via - --settings (OAuth and keychain are never read)") but - ``geny-executor`` 2.0.1 builds the - ``ClaudeCodeCLIClient`` with ``bare_mode=True`` as the - default. The combination crashes every subscription / - OAuth user with ``"Not logged in · Please run /login"`` or - an empty-stderr ``exited with code 1:``. - - When *has_api_key* is False (no ``ANTHROPIC_API_KEY`` in - env), remove ``--bare`` from the argv so the CLI is allowed - to read the OAuth credentials. When *has_api_key* is True, - keep ``--bare`` — that's the auth path it was designed for. - When *has_api_key* is None (caller didn't tell us), we - detect via ``os.environ`` as a fall-back. - - C. ``--tools ""`` stripping (Phase-I follow-up) — executor 2.0.5 - auto-emits ``--tools ""`` whenever ``--mcp-config`` is set - and no ``allow_tools`` was supplied, which disables the CLI's - entire built-in palette (Bash / Read / Write / Edit / Glob / - Grep / TodoWrite / WebFetch / WebSearch / etc.). For Geny we - want the *opposite*: CLI built-ins available alongside our - MCP-wrapped Geny tools, so the Sub-Worker can actually edit - files / run shell / browse the web. The CLI's permission - system gates each call, and Geny pre-allows the safe set via - the synthesised ``settings.json`` (see - ``credentials.py:_build_claude_code``). Stripping the - ``["--tools", ""]`` pair restores CLI defaults; the - ``--strict-mcp-config`` flag remains in argv so the *MCP* - surface stays scoped to our session bridge. - - Pure function — handy for tests. - """ - new_argv = list(original_argv) - - # ── Fix B: drop --bare on OAuth path ───────────────────────────── - if has_api_key is None: - import os - has_api_key = bool(os.environ.get("ANTHROPIC_API_KEY", "").strip()) - if not has_api_key: - new_argv = [arg for arg in new_argv if arg != "--bare"] - - # ── Fix C: drop ``--tools ""`` pair so CLI built-ins remain on ── - cleaned: List[str] = [] - skip_next = False - for i, arg in enumerate(new_argv): - if skip_next: - skip_next = False - continue - if arg == "--tools" and (i + 1) < len(new_argv) and new_argv[i + 1] == "": - skip_next = True - continue - cleaned.append(arg) - new_argv = cleaned - - # ── Fix A: inject --verbose for stream-json output ────────────── - if "--verbose" in new_argv: - return new_argv - try: - of_idx = new_argv.index("--output-format") - except ValueError: - return new_argv - if of_idx + 1 >= len(new_argv): - return new_argv - if new_argv[of_idx + 1] != "stream-json": - return new_argv - try: - insert_at = new_argv.index("--print") + 1 - except ValueError: - insert_at = len(new_argv) - new_argv.insert(insert_at, "--verbose") - return new_argv - - def install_llm_patches() -> None: - """Idempotently install the Claude Code argv ``--verbose`` patch. - - The function is re-exported across three modules in - ``geny_executor``: + """Idempotently install Geny's two remaining ``geny-executor`` + monkey-patches: friendly Korean error envelopes + CLI-tool + observability into Geny's SessionLogger. - 1. ``geny_executor.llm_client.translators._cli`` (source). - 2. ``geny_executor.llm_client.translators`` (re-export in the - package ``__init__``). - 3. ``geny_executor.llm_client.claude_code`` (caller did - ``from … import claude_code_argv`` — captured a *local - binding* to the source function). - - Patching only #1 is insufficient because #3 already holds a - direct reference. We have to overwrite the attribute in **every** - namespace that re-exports the function so any future - ``module.claude_code_argv(...)`` lookup hits the wrapper. - - Safe to call multiple times. Logs once at INFO on the first - install, DEBUG on subsequent calls. + Safe to call multiple times. Both installers are themselves + idempotent (assembler uses a cached wrapper and ``_original`` + unwrap; accumulator uses a class-level applied flag). """ - import importlib - - candidate_modules = [ - "geny_executor.llm_client.translators._cli", - "geny_executor.llm_client.translators", - "geny_executor.llm_client.claude_code", - ] - - # Step 1 — find the original implementation. It lives in - # translators._cli; every other module just re-binds it. - try: - cli_translator = importlib.import_module(candidate_modules[0]) - except Exception: # noqa: BLE001 - logger.debug( - "geny_executor.llm_client.translators._cli unavailable — " - "Claude Code CLI patch skipped (this is fine when the " - "Claude Code backend isn't in use)", - exc_info=True, - ) - return - - original = getattr(cli_translator, "claude_code_argv", None) - if original is None: - logger.debug( - "claude_code_argv missing from geny_executor — patch skipped" - ) - return - # Unwrap any already-installed wrapper so we don't double-stack. - while getattr(original, _PATCH_APPLIED_FLAG, False): - inner = getattr(original, "_original", None) - if inner is None: - break - original = inner - - global _cached_wrapper - if _cached_wrapper is None: - def _wrapped(*args: Any, **kwargs: Any) -> List[str]: - argv = original(*args, **kwargs) - patched = _patched_argv(argv) - # Phase-I diagnostic: log the final argv every CLI invocation - # so we can verify ``--tools ""`` / ``--strict-mcp-config`` / - # ``--mcp-config`` are actually reaching the spawned ``claude`` - # process. Sensitive payloads — ``--mcp-config `` carries - # the per-session bearer token and ``--system-prompt`` carries - # the assembled persona — are redacted to their lengths so the - # log stays grep-friendly without leaking the actual content. - try: - redacted: List[str] = [] - skip_next = False - for i, arg in enumerate(patched): - if skip_next: - # Redact the value following a sensitive flag. - redacted.append(f"") - skip_next = False - continue - redacted.append(arg) - if arg in {"--mcp-config", "--system-prompt", "--append-system-prompt"}: - skip_next = True - logger.info("[llm_patches] claude argv (%d args): %s", len(patched), redacted) - except Exception: # noqa: BLE001 - pass - return patched - - setattr(_wrapped, _PATCH_APPLIED_FLAG, True) - setattr(_wrapped, "_original", original) - _cached_wrapper = _wrapped - # else: re-use the cached wrapper. Its closure captures the - # *original* function, so even if a stale wrapper was left on - # one of the modules between installs, re-binding to the cached - # instance restores the canonical chain. - - # Overwrite every known re-export with the SAME wrapper instance - # so any caller — including ``claude_code.py`` which captured a - # local binding at module load time — hits the wrapper. Doing - # this unconditionally (vs per-module idempotency-skip) keeps - # the three module attributes in lock-step across repeated - # installs; the unwrap-while-loop above strips stale wrappers - # off ``original`` so we don't double-stack. - patched_modules: List[str] = [] - for mod_name in candidate_modules: - try: - mod = importlib.import_module(mod_name) - except Exception: # noqa: BLE001 - continue - if getattr(mod, "claude_code_argv", None) is None: - continue - setattr(mod, "claude_code_argv", _cached_wrapper) - patched_modules.append(mod_name) - - if patched_modules: - logger.info( - "[llm_patches] installed Claude Code CLI --verbose patch " - "across %d modules: %s " - "(workaround for geny-executor 2.0.1 + new claude CLI)", - len(patched_modules), - ", ".join(patched_modules), - ) - else: - logger.debug( - "[llm_patches] no claude_code_argv attribute found on any " - "known module — nothing to patch" - ) - - # Always (re-)install the assembler-side patch alongside the argv - # patch. They're independent fixes but always travel together: - # without the assembler patch the user sees a misleading "CLI - # exited with code 1:" message instead of an actionable auth- - # expired hint. _install_assembler_error_patch() - - # Phase-I ghost-error fix — strip tool_use blocks from the - # ``StreamJsonAccumulator`` final response. See - # ``_install_stream_accumulator_patch`` for the rationale. - _install_stream_accumulator_patch() - - # Phase-I follow-up — surface CLI-handled tool calls (Bash / - # Read / Write / Edit / …) to the active Geny ``SessionLogger`` - # via ``cli_stream_logger_ctx``. The strip patch above silences - # Stage 10, so without this companion patch CLI built-in tool - # work would simply not appear in the Geny session log. _install_stream_observability_patch() @@ -382,8 +175,8 @@ def _install_assembler_error_patch() -> None: *friendly* RuntimeError instead of the runtime's empty-stderr ``CLI '/usr/bin/claude' exited with code 1:`` fallback. - Same three-module pattern as ``claude_code_argv``: the caller - in ``geny_executor.llm_client.claude_code.py`` does + Three-module pattern: the caller in + ``geny_executor.llm_client.claude_code.py`` does ``from … import assemble_response_from_stream_json`` and captures a local binding at module load. We re-bind the attribute in all three modules to the same wrapper instance. @@ -417,41 +210,10 @@ def _install_assembler_error_patch() -> None: async def _wrapped( stream: AsyncIterator[Any], *, model: str, ) -> Any: - """Spy on the stream-json output. Two fixes: - - A. Error envelope detection — if the CLI emitted an - ``is_error`` result envelope, raise a Korean human-readable - error instead of letting the runtime's empty-stderr - fallback kick in. - - B. Strip in-CLI ``tool_use`` blocks from the assembled - response. The Claude Code CLI runs the full agentic - loop internally (LLM → MCP tool → LLM → ...). Every - intermediate assistant turn shows up in the stream-json - output as a separate ``assistant`` envelope, and the - upstream :class:`StreamJsonAccumulator` *concatenates* - their content — so the final ``APIResponse`` returned - to Stage 6 contains all the ``tool_use`` blocks the CLI - *already dispatched* via MCP. Geny's Stage 10 (the - canonical Anthropic-API tool-dispatch stage) then sees - those ``tool_use`` blocks, looks up the - ``mcp__geny__`` ids in Geny's own ToolLoader - (which only knows the bare names — the MCP prefix is - applied by the CLI, not Geny), finds nothing, and - surfaces "Tool X: ERROR (0ms) — No output" for every - call the user can plainly see succeeded (the worker - actually received the message, the worker actually - replied, the bridge log records ``tools/call`` traffic). - - Per the Phase-I design doc — "Stage 10 receives that - assistant message, sees no ``tool_use`` blocks (they - were executed inside the CLI), and naturally no-ops" - — the canonical fix is to strip ``tool_use`` blocks - from the response before they reach Stage 10. Geny's - permission/audit telemetry for those calls flows via - the MCP bridge endpoint (cycle 20260519/Phase-2), - not through Stage 10, so this strip is lossless. - """ + """Spy on the stream-json output. If the CLI emitted an + ``is_error`` result envelope, raise a Korean human-readable + error instead of letting the runtime's empty-stderr + fallback kick in.""" err_holder: List[Dict[str, Any]] = [] async def _spy() -> AsyncIterator[Any]: @@ -482,12 +244,6 @@ async def _spy() -> AsyncIterator[Any]: _friendly_error_message_for_result_envelope(err_holder[-1]) ) - # Note: the equivalent tool_use strip lives in - # ``_install_stream_accumulator_patch`` because the actual - # streaming code path (``ClaudeCodeCLIClient._stream``) - # bypasses this assembler entirely and calls - # ``StreamJsonAccumulator.finalize`` directly. - return response setattr(_wrapped, _ASSEMBLER_PATCH_APPLIED_FLAG, True) @@ -513,126 +269,7 @@ async def _spy() -> AsyncIterator[Any]: ) -# ── StreamJsonAccumulator finalize: strip CLI-handled tool_use ────── - - -_ACCUMULATOR_PATCH_APPLIED_FLAG = "_geny_accumulator_strip_patch_applied" -_cached_accumulator_finalize: Any = None - - -def _install_stream_accumulator_patch() -> None: - """Monkey-patch ``StreamJsonAccumulator.finalize`` so the final - :class:`APIResponse` returned to Stage 6 carries **no** - ``tool_use`` blocks for the Claude Code CLI streaming path. - - Why this lives in Geny rather than the executor - ----------------------------------------------- - Claude Code CLI 2.1.x runs the whole agentic loop *internally* - (LLM → tool → LLM → tool → …). Every intermediate assistant turn - arrives as its own ``{"type":"assistant","message":{...}}`` - envelope in stream-json, and ``StreamJsonAccumulator._feed_message`` - appends every block from every envelope into a shared buffer with - no per-turn reset. ``finalize()`` therefore returns an - ``APIResponse`` whose ``content`` carries every ``tool_use`` block - the CLI *already dispatched* via MCP or its own built-ins. - - Geny's downstream pipeline (Stage 9 parse → Stage 10 dispatch) - treats those ``tool_use`` blocks as *pending* and tries to - redispatch them against Geny's own tool registry — but the CLI - advertises MCP tools with the ``mcp__geny__`` prefix that - Geny's registry doesn't know, so every call instantly fails with - ``ERROR (0ms) — No output``. Meanwhile the actual tool work - already succeeded via the MCP bridge: every Sub-Worker message - was delivered, every memory_write persisted, every browser_* - call dispatched. The user sees both a successful final reply - *and* a session log full of bogus failures — confusing, - operationally noisy, and capable of nudging the LLM itself into - apologising mid-conversation ("messaging tool not connected"). - - Per the Phase-I design doc: - - Stage 10 receives that assistant message, sees no ``tool_use`` - blocks (they were executed inside the CLI), and naturally - no-ops. - - The accumulator's append-everything behaviour violates that - contract. Stripping ``tool_use`` from ``finalize()`` restores it. - Phase-2 audit/telemetry for those calls flows through the MCP - bridge endpoint, so the strip is lossless — Geny's - ``mcp_bridge_controller`` is already where ``tools/call`` events - are recorded. - - A future executor 2.0.6 should encode this directly (per-turn - buffer reset, or skip ``tool_use`` on the claude_code_cli path); - when that lands we drop this patch. - """ - import importlib - - try: - cli_translator = importlib.import_module( - "geny_executor.llm_client.translators._cli" - ) - except Exception: # noqa: BLE001 - return - accum_cls = getattr(cli_translator, "StreamJsonAccumulator", None) - if accum_cls is None: - return - - original = getattr(accum_cls, "finalize", None) - if original is None: - return - - # Unwrap stale wrappers so we don't double-stack across reloads. - while getattr(original, _ACCUMULATOR_PATCH_APPLIED_FLAG, False): - inner = getattr(original, "_original", None) - if inner is None: - break - original = inner - - global _cached_accumulator_finalize - if _cached_accumulator_finalize is None: - def _wrapped(self: Any) -> Any: - response = original(self) - try: - content = getattr(response, "content", None) - if isinstance(content, list): - filtered = [ - b for b in content - if getattr(b, "type", None) != "tool_use" - ] - stripped = len(content) - len(filtered) - if stripped: - response.content = filtered - logger.info( - "[llm_patches] stripped %d CLI-handled " - "tool_use block(s) from claude_code stream " - "response (Stage 10 no-ops)", - stripped, - ) - except Exception: # noqa: BLE001 - logger.debug( - "[llm_patches] accumulator finalize strip skipped", - exc_info=True, - ) - return response - - setattr(_wrapped, _ACCUMULATOR_PATCH_APPLIED_FLAG, True) - setattr(_wrapped, "_original", original) - _cached_accumulator_finalize = _wrapped - - setattr(accum_cls, "finalize", _cached_accumulator_finalize) - logger.info( - "[llm_patches] installed StreamJsonAccumulator.finalize " - "tool_use strip patch (claude_code_cli ghost-error fix)" - ) - - -# ── StreamJsonAccumulator observability: surface CLI built-ins ─────── - - -_OBSERVABILITY_PATCH_APPLIED_FLAG = "_geny_accumulator_observability_patch_applied" -_cached_accumulator_init: Any = None -_cached_accumulator_feed: Any = None +# ── StreamJsonAccumulator observability: surface CLI built-ins ────── def _install_stream_observability_patch() -> None: @@ -643,14 +280,14 @@ def _install_stream_observability_patch() -> None: Why --- - With PR #825's tool_use strip in ``finalize()``, Geny's Stage 10 - naturally no-ops for ``claude_code_cli`` sessions — which means - the in-pipeline ``tool.call_start`` / ``tool.call_complete`` - events never fire and CLI-handled tools (``Bash`` / ``Read`` / - ``Write`` / ``Edit`` / ``WebFetch`` / …) disappear from the - session log. ``mcp_bridge_controller`` already emits log entries - for MCP ``tools/call`` traffic (the bridge sees those directly), - so that surface is covered. + With executor 2.0.6's terminal-response strip of ``tool_use`` + blocks, Geny's Stage 10 naturally no-ops for ``claude_code_cli`` + sessions — which means the in-pipeline ``tool.call_start`` / + ``tool.call_complete`` events never fire and CLI-handled tools + (``Bash`` / ``Read`` / ``Write`` / ``Edit`` / ``WebFetch`` / …) + disappear from the session log. ``mcp_bridge_controller`` already + emits log entries for MCP ``tools/call`` traffic (the bridge sees + those directly), so that surface is covered. The remaining gap is the *CLI built-in* layer: tools the CLI dispatches **internally** without going through our bridge. They @@ -659,7 +296,7 @@ def _install_stream_observability_patch() -> None: Mechanics --------- - - On each ``feed(line)`` call, peek at the line *before* delegating + - On each ``feed(line)`` call, peek at the line *after* delegating to the original feed. - ``"assistant"`` envelopes carry ``tool_use`` blocks → emit ``session_logger.log_tool_use`` for each, and stash @@ -675,10 +312,6 @@ def _install_stream_observability_patch() -> None: double-render in the UI. - All logger calls are try/except'd; observability must never break the underlying tool dispatch. - - No upstream executor changes required. When executor 2.0.6 adds - first-class CLI-tool observability events to the pipeline event - bus, we drop this patch. """ import importlib diff --git a/backend/tests/service/test_llm_patches.py b/backend/tests/service/test_llm_patches.py index 8550db01..ec77e2cb 100644 --- a/backend/tests/service/test_llm_patches.py +++ b/backend/tests/service/test_llm_patches.py @@ -1,520 +1,424 @@ -"""Tests for the Claude Code CLI argv runtime patch. - -Validates the pure ``_patched_argv`` predicate function plus the -idempotent monkey-patch installer. +"""Tests for ``service/llm_patches.py``. + +Post-cycle-20260520 surface is much smaller — the four generic +Claude-Code CLI argv fixes that previously lived here folded into +``geny-executor`` 2.0.6 (``--verbose`` injection, ``--bare`` strip on +OAuth, drop of auto-``--tools ""``, ``StreamJsonAccumulator.finalize`` +tool_use strip). What's left is Geny-specific and covered here: + + 1. Korean friendly-error messages for stream-json ``is_error`` + result envelopes (auth-expired hint + generic API-error + fallback). + 2. CLI-tool observability into Geny's :class:`SessionLogger` + (``log_tool_use`` + ``log_tool_result`` for non-``mcp__*`` + blocks observed in the stream, routed through the + :data:`cli_stream_logger_ctx` ContextVar). """ from __future__ import annotations -import pytest +import json +from typing import Any, AsyncIterator, Dict, List +import pytest -# ── Pure transformation ────────────────────────────────────────────── +# ── _friendly_error_message_for_result_envelope ───────────────── -def test_streaming_argv_gets_verbose_inserted() -> None: - from service.llm_patches import _patched_argv - original = [ - "--print", - "--input-format", "stream-json", - "--output-format", "stream-json", - "--include-partial-messages", - "--model", "claude-opus-4-7", - ] - # API-key path — only the verbose-injection fix is exercised - # here. The bare-strip fix has its own test below. - patched = _patched_argv(original, has_api_key=True) - - assert patched != original, "argv should be modified" - assert "--verbose" in patched - # Order matters for readable logs: ``--verbose`` immediately after - # ``--print``. - print_idx = patched.index("--print") - assert patched[print_idx + 1] == "--verbose" - - -def test_oauth_path_strips_bare_flag() -> None: - """Subscription / OAuth users (no ``ANTHROPIC_API_KEY``) must - NOT pass ``--bare`` — the flag explicitly disables OAuth at - the CLI level. ``geny-executor`` 2.0.1 always emits ``--bare`` - via its ``bare_mode=True`` default, so we strip it here on - every OAuth invocation.""" - from service.llm_patches import _patched_argv - - original = [ - "--print", - "--output-format", "stream-json", - "--bare", - "--model", "claude-opus-4-7", - ] - patched = _patched_argv(original, has_api_key=False) - assert "--bare" not in patched - # ``--verbose`` should still be injected for stream-json output. - assert "--verbose" in patched - - -def test_api_key_path_keeps_bare_flag() -> None: - """When ``ANTHROPIC_API_KEY`` is set, ``--bare`` is the right - flag — keep it untouched.""" - from service.llm_patches import _patched_argv - - original = [ - "--print", - "--output-format", "stream-json", - "--bare", - "--model", "claude-opus-4-7", - ] - patched = _patched_argv(original, has_api_key=True) - assert "--bare" in patched - assert "--verbose" in patched +def test_friendly_error_message_recognises_auth_failure() -> None: + from service.llm_patches import _friendly_error_message_for_result_envelope + msg = _friendly_error_message_for_result_envelope({ + "type": "result", + "is_error": True, + "api_error_status": 401, + "error": "authentication_failed", + "result": "Failed to authenticate. API Error: 401", + }) + # Korean prompt that points the user at the LLM Backends Settings card. + assert "Claude Code 인증이 만료" in msg + assert "LLM 백엔드" in msg + # The raw CLI message is appended as ``(원본: ...)`` for forensics. + assert "Failed to authenticate" in msg -def test_auto_detect_has_api_key_from_env( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """``has_api_key=None`` (default) means "infer from env". With - the env var set, ``--bare`` stays; without, it's stripped.""" - from service.llm_patches import _patched_argv - - original = [ - "--print", - "--output-format", "stream-json", - "--bare", - ] - monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-fake") - keep = _patched_argv(original) - assert "--bare" in keep +def test_friendly_error_message_generic_api_error() -> None: + from service.llm_patches import _friendly_error_message_for_result_envelope - monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) - drop = _patched_argv(original) - assert "--bare" not in drop + msg = _friendly_error_message_for_result_envelope({ + "type": "result", + "is_error": True, + "api_error_status": 500, + "error": "internal_server_error", + "result": "Upstream provider returned 500", + }) + assert "Claude Code API 에러" in msg + assert "500" in msg + assert "Upstream provider returned 500" in msg -def test_non_streaming_argv_is_untouched() -> None: - """``--output-format json`` (non-streaming) doesn't need - ``--verbose`` — the CLI accepts it as-is. Patch must NOT add - ``--verbose`` in that case to avoid changing semantics.""" - from service.llm_patches import _patched_argv +def test_friendly_error_message_no_api_status_falls_back_to_cli_error() -> None: + from service.llm_patches import _friendly_error_message_for_result_envelope - original = [ - "--print", - "--output-format", "json", - "--model", "claude-opus-4-7", - ] - patched = _patched_argv(original) - assert "--verbose" not in patched - assert patched == original + msg = _friendly_error_message_for_result_envelope({ + "type": "result", + "is_error": True, + "result": "claude binary segfaulted", + }) + assert "Claude Code CLI 에러" in msg + assert "claude binary segfaulted" in msg -def test_argv_with_existing_verbose_is_untouched() -> None: - """Defensive: if a future ``geny-executor`` ships the fix - upstream, our patch must become a no-op so we don't double-add.""" - from service.llm_patches import _patched_argv +# ── _maybe_extract_error_envelope ─────────────────────────────── - original = [ - "--print", - "--verbose", - "--output-format", "stream-json", - ] - patched = _patched_argv(original) - assert patched == original - assert patched.count("--verbose") == 1 +def test_maybe_extract_error_envelope_accepts_bytes_and_str() -> None: + from service.llm_patches import _maybe_extract_error_envelope -def test_argv_without_output_format_is_untouched() -> None: - from service.llm_patches import _patched_argv + payload = json.dumps({ + "type": "result", "is_error": True, "api_error_status": 401, + }) + assert _maybe_extract_error_envelope(payload.encode("utf-8")) + assert _maybe_extract_error_envelope(payload) + assert _maybe_extract_error_envelope(bytearray(payload, "utf-8")) - original = ["--print", "--model", "x"] - assert _patched_argv(original) == original +@pytest.mark.parametrize("payload", [ + b"", + b" ", + b"not json", + json.dumps({"type": "assistant"}).encode("utf-8"), + json.dumps({"type": "result"}).encode("utf-8"), # no is_error + json.dumps({"type": "result", "is_error": False}).encode("utf-8"), + json.dumps([1, 2, 3]).encode("utf-8"), # not a dict +]) +def test_maybe_extract_error_envelope_returns_none_for_non_error_lines( + payload: bytes, +) -> None: + from service.llm_patches import _maybe_extract_error_envelope -def test_argv_with_stream_json_input_only_is_untouched() -> None: - """``--input-format stream-json`` doesn't trigger the - ``--print --output-format`` requirement; only the *output* format - does. Patch must only react to the output-format pair.""" - from service.llm_patches import _patched_argv + assert _maybe_extract_error_envelope(payload) is None - original = [ - "--print", - "--input-format", "stream-json", - "--output-format", "json", - ] - patched = _patched_argv(original) - assert "--verbose" not in patched - assert patched == original +# ── install_llm_patches: idempotency + re-export coverage ── -def test_argv_with_dangling_output_format_at_end() -> None: - """Defensive: ``--output-format`` is the last token with no value. - Don't crash — just leave it alone.""" - from service.llm_patches import _patched_argv - original = ["--print", "--output-format"] - patched = _patched_argv(original) - assert patched == original # No crash, no insertion. +def test_install_is_idempotent() -> None: + """Calling ``install_llm_patches()`` multiple times must be a + no-op after the first install — the assembler wrapper and the + accumulator observability patch both use cached state, so a + second call leaves the patched attributes pointing at the + *same* wrapper object.""" + from service.llm_patches import install_llm_patches + # First install (might happen at app boot before this test). + install_llm_patches() -# ── Installer ──────────────────────────────────────────────────────── + import importlib + cli_translator = importlib.import_module( + "geny_executor.llm_client.translators._cli" + ) + first_assembler = cli_translator.assemble_response_from_stream_json + first_accum_feed = cli_translator.StreamJsonAccumulator.feed -def test_install_is_idempotent(monkeypatch: pytest.MonkeyPatch) -> None: - """Repeated installs must not double-wrap the function. The - wrapper is flagged with ``_geny_verbose_patch_applied`` so the - second call sees the flag and returns early.""" - pytest.importorskip("geny_executor.llm_client.translators._cli") - from service.llm_patches import install_llm_patches - from geny_executor.llm_client.translators import _cli + # Second install — should hit the cached-wrapper path. + install_llm_patches() - pristine = _cli.claude_code_argv - try: - install_llm_patches() - once = _cli.claude_code_argv - install_llm_patches() - twice = _cli.claude_code_argv - assert once is twice, "second install must be a no-op" - # Confirm the wrapper exposes the original so future re-stacks - # can chain off it. - assert hasattr(once, "_original") - finally: - _cli.claude_code_argv = pristine + second_assembler = cli_translator.assemble_response_from_stream_json + second_accum_feed = cli_translator.StreamJsonAccumulator.feed + # Same instance, no double-stacking. + assert first_assembler is second_assembler + assert first_accum_feed is second_accum_feed -def test_install_patches_every_re_export_namespace( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """The function is re-exported across THREE modules; patching - only the source isn't enough because the caller did - ``from … import claude_code_argv`` and captured a local binding. - - Without patching all three, the live CLI invocation in - ``claude_code.py`` (line 178) still calls the pristine function - and the user sees ``--verbose`` error in prod. This test pins - the bug we hit on 2026-05-18. - """ - pytest.importorskip("geny_executor.llm_client.translators._cli") - import importlib +def test_install_patches_assembler_across_re_exports() -> None: + """The assembler is re-exported from three modules; all the + attributes must point at the wrapper after install so any caller + that captured a local binding hits it.""" from service.llm_patches import install_llm_patches + install_llm_patches() + + import importlib modules = [ - importlib.import_module("geny_executor.llm_client.translators._cli"), - importlib.import_module("geny_executor.llm_client.translators"), - importlib.import_module("geny_executor.llm_client.claude_code"), + "geny_executor.llm_client.translators._cli", + "geny_executor.llm_client.translators", + "geny_executor.llm_client.claude_code", ] - pristines = [getattr(m, "claude_code_argv") for m in modules] + fns = [] + for name in modules: + mod = importlib.import_module(name) + fn = getattr(mod, "assemble_response_from_stream_json", None) + if fn is not None: + fns.append(fn) + assert len(fns) >= 2 # at least source + one re-export + assert all(fn is fns[0] for fn in fns) - try: - install_llm_patches() - wrapped = [getattr(m, "claude_code_argv") for m in modules] - # Every module must point at a wrapper, AND they must all be - # the same wrapper instance so a stale reference can't slip - # through. - for fn in wrapped: - assert hasattr(fn, "_geny_verbose_patch_applied") - assert wrapped[0] is wrapped[1] is wrapped[2], ( - "all three modules must share the same wrapper — otherwise " - "claude_code.py keeps calling the pristine function" - ) - finally: - for m, original in zip(modules, pristines): - setattr(m, "claude_code_argv", original) +# ── assembler-side: stream-json error envelope → friendly raise ── -def test_install_patches_claude_code_caller_path( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """End-to-end pin for the user's reported regression: invoke the - argv builder through the same import path ``claude_code.py`` - uses (``claude_code.claude_code_argv``) and verify ``--verbose`` - lands in the result. This is what the live CLI invocation does - at runtime.""" - pytest.importorskip("geny_executor.llm_client.claude_code") - import importlib +@pytest.mark.asyncio +async def test_assembler_patch_raises_friendly_error_on_auth_failure() -> None: + """When the stream ends with an ``is_error: true`` result + envelope, the patched assembler must raise a Korean human-readable + error instead of silently returning an empty response.""" from service.llm_patches import install_llm_patches - cc_module = importlib.import_module( - "geny_executor.llm_client.claude_code" - ) - cli_module = importlib.import_module( + install_llm_patches() + + import importlib + cli_translator = importlib.import_module( "geny_executor.llm_client.translators._cli" ) - translators_pkg = importlib.import_module( - "geny_executor.llm_client.translators" - ) - - pristine_cc = cc_module.claude_code_argv - pristine_cli = cli_module.claude_code_argv - pristine_pkg = translators_pkg.claude_code_argv - - try: - install_llm_patches() - - class _Req: - stream = True - model = "claude-opus-4-7" - system = "" - thinking = None - response_format = None - session_hint = None - - # The crucial assertion: the function call path - # ``claude_code.claude_code_argv(...)`` must return an argv - # that contains ``--verbose``. Before the deep-patch fix this - # returned the pristine result and the prod session crashed. - argv = cc_module.claude_code_argv(_Req()) - assert "--verbose" in argv, ( - "claude_code.py's local binding must point at the patched " - "wrapper, otherwise the live CLI invocation crashes with " - "'When using --print, --output-format=stream-json requires " - "--verbose'" + assemble = cli_translator.assemble_response_from_stream_json + + async def _gen() -> AsyncIterator[bytes]: + yield b'{"type": "system", "session_id": "s1"}\n' + yield ( + b'{"type": "result", "is_error": true, "api_error_status": 401, ' + b'"error": "authentication_failed", ' + b'"result": "Failed to authenticate. API Error: 401"}\n' ) - finally: - cc_module.claude_code_argv = pristine_cc - cli_module.claude_code_argv = pristine_cli - translators_pkg.claude_code_argv = pristine_pkg + with pytest.raises(RuntimeError, match="Claude Code 인증이 만료"): + await assemble(_gen(), model="m") -def test_install_actually_patches_streaming_call( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """End-to-end: install the patch, then invoke ``claude_code_argv`` - the way geny-executor's caller does and assert ``--verbose`` - lands in the argv.""" - pytest.importorskip("geny_executor.llm_client.translators._cli") + +@pytest.mark.asyncio +async def test_assembler_patch_passes_through_clean_stream() -> None: + """Clean streams (no ``is_error`` envelope) must produce a valid + APIResponse with the text content intact.""" from service.llm_patches import install_llm_patches - from geny_executor.llm_client.translators import _cli - pristine = _cli.claude_code_argv - try: - install_llm_patches() - - # Build a minimal ``ChatCompletionRequest``-shaped object the - # function expects. We use a duck-typed stub to avoid pulling - # the full geny_executor request model. - class _Req: - stream = True - model = "claude-opus-4-7" - system = "" - thinking = None - response_format = None - session_hint = None - - argv = _cli.claude_code_argv(_Req()) - assert "--print" in argv - assert "--output-format" in argv - of_idx = argv.index("--output-format") - assert argv[of_idx + 1] == "stream-json" - assert "--verbose" in argv, ( - "patch must inject --verbose for stream-json output" - ) - print_idx = argv.index("--print") - assert argv[print_idx + 1] == "--verbose", ( - "--verbose must sit right after --print for readable logs" + install_llm_patches() + + import importlib + cli_translator = importlib.import_module( + "geny_executor.llm_client.translators._cli" + ) + assemble = cli_translator.assemble_response_from_stream_json + + async def _gen() -> AsyncIterator[bytes]: + yield b'{"type": "system", "session_id": "s1", "model": "sonnet"}\n' + yield b'{"type": "assistant", "delta": {"type": "text_delta", "text": "ok"}}\n' + yield b'{"type": "message_stop"}\n' + yield ( + b'{"type": "result", "stop_reason": "end_turn", ' + b'"usage": {"input_tokens": 1, "output_tokens": 1}}\n' ) - finally: - _cli.claude_code_argv = pristine + resp = await assemble(_gen(), model="default") + assert resp.text == "ok" + assert resp.stop_reason == "end_turn" -# ── Assembler patch: stream-json error → friendly message ──────────── +# ── Stream observability: CLI built-in tool calls → SessionLogger ── -def test_friendly_error_message_recognises_auth_failure() -> None: - from service.llm_patches import _friendly_error_message_for_result_envelope - envelope = { - "type": "result", - "is_error": True, - "api_error_status": 401, - "error": "authentication_failed", - "result": "Failed to authenticate. API Error: 401 Invalid authentication credentials", - } - msg = _friendly_error_message_for_result_envelope(envelope) - assert "Claude Code 인증이 만료" in msg - assert "다시 로그인" in msg or "Sign in" in msg - # And the original error is preserved so an operator inspecting - # logs can see the underlying API message. - assert "401" in msg +class _FakeLogger: + """Minimal stand-in for Geny's ``SessionLogger``. Records every + ``log_tool_use`` / ``log_tool_result`` call so tests can assert on + the observability patch's behavior without booting the real + logger + DB.""" + def __init__(self) -> None: + self.tool_uses: List[Dict[str, Any]] = [] + self.tool_results: List[Dict[str, Any]] = [] -def test_friendly_error_message_for_generic_api_error() -> None: - from service.llm_patches import _friendly_error_message_for_result_envelope + def log_tool_use(self, **kwargs: Any) -> None: + self.tool_uses.append(kwargs) - envelope = { - "type": "result", - "is_error": True, - "api_error_status": 503, - "error": "overloaded", - "result": "Service unavailable, please retry", - } - msg = _friendly_error_message_for_result_envelope(envelope) - assert "Claude Code API 에러" in msg - assert "503" in msg - # Non-401 must NOT use the auth-expired message. - assert "인증이 만료" not in msg + def log_tool_result(self, **kwargs: Any) -> None: + self.tool_results.append(kwargs) -def test_maybe_extract_error_envelope_accepts_bytes_and_str() -> None: - from service.llm_patches import _maybe_extract_error_envelope +def test_observability_emits_tool_use_for_assistant_envelope() -> None: + """``"assistant"`` envelopes carrying a non-``mcp__*`` ``tool_use`` + block trigger a ``log_tool_use`` call on the active session logger.""" + from service.llm_patches import ( + cli_stream_logger_ctx, + install_llm_patches, + ) + install_llm_patches() - line_bytes = b'{"type":"result","is_error":true,"api_error_status":401}' - line_str = '{"type":"result","is_error":true,"api_error_status":401}' - assert _maybe_extract_error_envelope(line_bytes) is not None - assert _maybe_extract_error_envelope(line_str) is not None + import importlib + cli_translator = importlib.import_module( + "geny_executor.llm_client.translators._cli" + ) + accum = cli_translator.StreamJsonAccumulator(model="m") + fake = _FakeLogger() + token = cli_stream_logger_ctx.set(fake) + try: + accum.feed({ + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "id": "tu_1", "name": "Bash", + "input": {"command": "ls"}}, + ], + }, + }) + finally: + cli_stream_logger_ctx.reset(token) + assert len(fake.tool_uses) == 1 + call = fake.tool_uses[0] + assert call["tool_name"] == "Bash" + assert call["tool_id"] == "tu_1" + assert call["tool_input"] == {"command": "ls"} -@pytest.mark.parametrize("payload", [ - b'', - b'\n', - b'not json', - b'{"type":"system"}', - b'{"type":"result","is_error":false}', - b'{"type":"result"}', - b'[]', -]) -def test_maybe_extract_error_envelope_returns_none_for_non_error_lines( - payload: bytes, -) -> None: - from service.llm_patches import _maybe_extract_error_envelope - assert _maybe_extract_error_envelope(payload) is None +def test_observability_skips_mcp_prefixed_tools() -> None: + """MCP tools are already logged from ``mcp_bridge_controller`` — + the observability patch must NOT double-render them.""" + from service.llm_patches import ( + cli_stream_logger_ctx, + install_llm_patches, + ) + install_llm_patches() -def test_assembler_patch_raises_friendly_error_on_auth_failure() -> None: - """End-to-end through ``assemble_response_from_stream_json``: - install the patch, feed a synthetic stream that ends with the - auth-failed envelope, and assert the wrapped function raises - with the Korean re-login message instead of returning silently.""" - pytest.importorskip("geny_executor.llm_client.translators._cli") - import asyncio import importlib - - from service.llm_patches import install_llm_patches - - cli_module = importlib.import_module( + cli_translator = importlib.import_module( "geny_executor.llm_client.translators._cli" ) - pristine = cli_module.assemble_response_from_stream_json + accum = cli_translator.StreamJsonAccumulator(model="m") + fake = _FakeLogger() + token = cli_stream_logger_ctx.set(fake) try: - install_llm_patches() - wrapped = cli_module.assemble_response_from_stream_json - - async def _fake_stream(): - yield ( - b'{"type":"system","subtype":"init","model":"claude-opus-4-7"}' - ) - yield ( - b'{"type":"result","is_error":true,"api_error_status":401,' - b'"error":"authentication_failed",' - b'"result":"Failed to authenticate. API Error: 401 ..."}' - ) - - async def _runner(): - return await wrapped(_fake_stream(), model="claude-opus-4-7") - - with pytest.raises(RuntimeError, match="인증이 만료"): - asyncio.new_event_loop().run_until_complete(_runner()) + accum.feed({ + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "id": "mc_1", + "name": "mcp__geny__send_direct_message_internal", + "input": {"content": "hi"}}, + ], + }, + }) finally: - cli_module.assemble_response_from_stream_json = pristine + cli_stream_logger_ctx.reset(token) + assert fake.tool_uses == [] -def test_assembler_patch_pass_through_on_clean_stream() -> None: - """A successful stream (no ``is_error`` envelope) must return - the original APIResponse unchanged — the patch is non-intrusive - on the happy path.""" - pytest.importorskip("geny_executor.llm_client.translators._cli") - import asyncio - import importlib - from service.llm_patches import install_llm_patches +def test_observability_emits_tool_result_with_duration() -> None: + """A matching ``"user"`` ``tool_result`` envelope triggers a + ``log_tool_result`` call with measured ``duration_ms`` and the + extracted result text.""" + from service.llm_patches import ( + cli_stream_logger_ctx, + install_llm_patches, + ) + install_llm_patches() - cli_module = importlib.import_module( + import importlib + cli_translator = importlib.import_module( "geny_executor.llm_client.translators._cli" ) - pristine = cli_module.assemble_response_from_stream_json + accum = cli_translator.StreamJsonAccumulator(model="m") + fake = _FakeLogger() + token = cli_stream_logger_ctx.set(fake) try: - install_llm_patches() - wrapped = cli_module.assemble_response_from_stream_json - - async def _fake_stream(): - yield ( - b'{"type":"system","subtype":"init","model":"claude-opus-4-7"}' - ) - yield ( - b'{"type":"assistant","delta":{"type":"text_delta",' - b'"text":"hello"}}' - ) - yield ( - b'{"type":"result","stop_reason":"end_turn",' - b'"usage":{"input_tokens":1,"output_tokens":1}}' - ) - - async def _runner(): - return await wrapped(_fake_stream(), model="claude-opus-4-7") - - resp = asyncio.new_event_loop().run_until_complete(_runner()) - # Don't pin the full APIResponse shape — just confirm it - # didn't raise and we got *something* back. - assert resp is not None + accum.feed({ + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "id": "tu_1", "name": "Read", + "input": {"path": "/etc/hostname"}}, + ], + }, + }) + accum.feed({ + "type": "user", + "message": { + "content": [ + {"type": "tool_result", "tool_use_id": "tu_1", + "content": "myhost\n"}, + ], + }, + }) finally: - cli_module.assemble_response_from_stream_json = pristine - + cli_stream_logger_ctx.reset(token) + + assert len(fake.tool_results) == 1 + res = fake.tool_results[0] + assert res["tool_name"] == "Read" + assert res["tool_id"] == "tu_1" + assert res["result"] == "myhost\n" + assert res["is_error"] is False + assert isinstance(res["duration_ms"], int) and res["duration_ms"] >= 0 + + +def test_observability_handles_content_block_list_result_shape() -> None: + """Tool results can arrive as either a plain string OR a list of + ``{"type":"text","text":...}`` content blocks. The patch + flattens the list shape into a newline-joined string.""" + from service.llm_patches import ( + cli_stream_logger_ctx, + install_llm_patches, + ) + install_llm_patches() -def test_assembler_patch_installs_across_all_three_namespaces() -> None: - """Like the argv patch, the assembler function is re-exported in - three places. Patching only the source leaves - ``claude_code.assemble_response_from_stream_json`` (the actual - call site at line 203) pointed at the pristine function.""" - pytest.importorskip("geny_executor.llm_client.claude_code") import importlib - - from service.llm_patches import install_llm_patches - - modules = [ - importlib.import_module("geny_executor.llm_client.translators._cli"), - importlib.import_module("geny_executor.llm_client.translators"), - importlib.import_module("geny_executor.llm_client.claude_code"), - ] - pristines = [getattr(m, "assemble_response_from_stream_json") for m in modules] + cli_translator = importlib.import_module( + "geny_executor.llm_client.translators._cli" + ) + accum = cli_translator.StreamJsonAccumulator(model="m") + fake = _FakeLogger() + token = cli_stream_logger_ctx.set(fake) try: - install_llm_patches() - wrapped = [getattr(m, "assemble_response_from_stream_json") for m in modules] - for fn in wrapped: - assert hasattr(fn, "_geny_assembler_error_patch_applied") - assert wrapped[0] is wrapped[1] is wrapped[2] + accum.feed({ + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "id": "tu_1", "name": "Bash", + "input": {"command": "echo hi"}}, + ], + }, + }) + accum.feed({ + "type": "user", + "message": { + "content": [ + {"type": "tool_result", "tool_use_id": "tu_1", + "content": [ + {"type": "text", "text": "hi"}, + {"type": "text", "text": "(exit 0)"}, + ]}, + ], + }, + }) finally: - for m, original in zip(modules, pristines): - setattr(m, "assemble_response_from_stream_json", original) + cli_stream_logger_ctx.reset(token) + assert fake.tool_results[0]["result"] == "hi\n(exit 0)" -# ── Original argv patch tests ───────────────────────────────────────── - -def test_install_does_not_modify_non_streaming( - monkeypatch: pytest.MonkeyPatch, -) -> None: - pytest.importorskip("geny_executor.llm_client.translators._cli") +def test_observability_inert_without_active_context() -> None: + """If no ``cli_stream_logger_ctx`` is set (e.g. the accumulator is + driven outside a session-bound code path), the patch must + silently no-op rather than crash.""" from service.llm_patches import install_llm_patches - from geny_executor.llm_client.translators import _cli + install_llm_patches() - pristine = _cli.claude_code_argv - try: - install_llm_patches() - - class _Req: - stream = False # non-streaming - model = "claude-opus-4-7" - system = "" - thinking = None - response_format = None - session_hint = None - - argv = _cli.claude_code_argv(_Req()) - # Non-streaming uses ``--output-format json`` — no verbose - # injection. - assert "--verbose" not in argv - of_idx = argv.index("--output-format") - assert argv[of_idx + 1] == "json" - finally: - _cli.claude_code_argv = pristine + import importlib + cli_translator = importlib.import_module( + "geny_executor.llm_client.translators._cli" + ) + accum = cli_translator.StreamJsonAccumulator(model="m") + # No context set → ContextVar is the default ``None``. + accum.feed({ + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "id": "tu_1", "name": "Bash", + "input": {"command": "ls"}}, + ], + }, + }) + # No exception is the assertion; the patch is a no-op when the + # ContextVar isn't set, so there's nothing observable to compare.