From 478de970392fc29df28acb9ec08adf7a286665e8 Mon Sep 17 00:00:00 2001
From: JangHaryeom <101104772+CocoRoF@users.noreply.github.com>
Date: Wed, 20 May 2026 16:49:47 +0900
Subject: [PATCH] chore(deps): bump geny-executor pin to >=2.0.6 + slim down
 llm_patches.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

geny-executor 2.0.6 (released today, see CocoRoF/geny-executor#208)
absorbed four of the five compat patches that previously lived in
``service/llm_patches.py``:

  A. ``--verbose`` auto-injected when ``--print --output-format=
     stream-json`` is used (required by Claude Code CLI ≥ 2.1.x).
  B. ``--bare`` auto-stripped when no ``ANTHROPIC_API_KEY`` is in
     env, so the OAuth subscription path no longer crashes.
  C. Dropped the auto-``--tools ""`` emit on MCP-configured spawns —
     CLI built-ins (``Bash`` / ``Read`` / ``Write`` / ``Edit`` / …)
     now stay available alongside MCP-wrapped host tools, which is
     what every host (notably Geny's Sub-Worker) actually wants.
  D. ``StreamJsonAccumulator.finalize`` (and ``parse_json_output_to_response``)
     now drop the ``tool_use`` blocks the CLI dispatched internally,
     so Stage 10 naturally no-ops instead of ghost-erroring against
     the ``mcp__geny__<name>`` ids it has no registration for.

Plus the executor lost the dead ``copilot_cli`` provider entirely
(Geny had already removed its end of that wiring in #827).

This bumps both pins (``pyproject.toml`` + ``requirements.txt``)
from ``>=2.0.5`` to ``>=2.0.6``.

``llm_patches.py`` shrinks from ~830 lines to ~479 lines. Two
patches remain — both are Geny-specific and won't fold upstream
without the executor gaining hooks Geny is the only consumer of:

  1. Friendly Korean error messages for ``is_error`` stream-json
     result envelopes (auth-expired → Settings card hint, generic
     API errors → structured short message). Will fold upstream once
     the executor gains a proper i18n hook.

  2. CLI-tool observability into Geny's :class:`SessionLogger` via
     the new :data:`cli_stream_logger_ctx` ContextVar — taps
     ``StreamJsonAccumulator.feed`` to surface CLI built-in tool
     calls (Bash / Read / Write / Edit / …) into the session log
     alongside the MCP tool entries that ``mcp_bridge_controller``
     already emits. ``mcp__*`` prefixed names are skipped so the
     bridge-side log isn't double-rendered. Will fold upstream
     once the executor emits first-class CLI-tool events on the
     pipeline event bus.

Tests are rewritten end-to-end:

  - Old: 21 tests for ``_patched_argv`` predicate + idempotent
    installer (the patched-argv function is gone).
  - New: 14 focused tests covering:
    - ``_friendly_error_message_for_result_envelope`` (3 cases:
      auth, generic API error, fallback).
    - ``_maybe_extract_error_envelope`` (bytes/str input + 7
      parametrised non-match cases).
    - ``install_llm_patches`` idempotency + re-export coverage.
    - Assembler patch end-to-end: raises Korean friendly error
      on auth-failure envelope; passes through clean streams.
    - Stream observability: emits ``log_tool_use`` for non-MCP
      tool blocks; skips ``mcp__*`` to avoid bridge double-render;
      emits ``log_tool_result`` with duration; handles both
      string and content-block-list result shapes; inert when
      no ContextVar is set.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/pyproject.toml                    |   2 +-
 backend/requirements.txt                  |   2 +-
 backend/service/llm_patches.py            | 515 +++------------
 backend/tests/service/test_llm_patches.py | 762 ++++++++++------------
 4 files changed, 409 insertions(+), 872 deletions(-)
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index e3727251..ab3cc2e7 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -42,7 +42,7 @@ dependencies = [
     # provider/parallel/max_concurrent for multi-provider sub-agents.
     # Earlier 1.21.0 surfaces (bounded root _index.json, provider-driven
     # Stage 2 / Stage 18, etc.) all retained.
-    "geny-executor>=2.0.5,<3.0.0",
+    "geny-executor>=2.0.6,<3.0.0",
     # MCP (Model Context Protocol)
     "mcp>=1.0.0",
     # Auth
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 2f3d60cf..77bd138f 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -26,7 +26,7 @@ playwright>=1.49.0
 # Must match the version pin in backend/pyproject.toml — out-of-sync
 # pins are what caused the prod Docker build to install 1.21.0 and
 # blow up on `from geny_executor import CredentialBundle`.
-geny-executor>=2.0.5,<3.0.0
+geny-executor>=2.0.6,<3.0.0
 # MCP (Model Context Protocol)
 mcp>=1.0.0
 # TTS (Text-to-Speech)
diff --git a/backend/service/llm_patches.py b/backend/service/llm_patches.py
index 16686dd7..039da0e7 100644
--- a/backend/service/llm_patches.py
+++ b/backend/service/llm_patches.py
@@ -1,41 +1,49 @@
 """
 Runtime patches for ``geny-executor``'s LLM client integrations.
 
-Why this exists
----------------
-``geny-executor`` 2.0.1 (current pinned version) builds the Claude
-Code CLI argv via
-``geny_executor.llm_client.translators._cli.claude_code_argv``. For
-streaming requests it emits:
-
-    --print --input-format stream-json --output-format stream-json
-    --include-partial-messages …
-
-The Claude Code CLI's release pipeline tightened the validation
-sometime after ``geny-executor`` 2.0.1 was published: passing
-``--print`` together with ``--output-format=stream-json`` now also
-requires ``--verbose``, otherwise the CLI exits 1 with::
-
-    Error: When using --print, --output-format=stream-json requires --verbose
-
-Symptom on the user side: every Developer-role session with the
-Claude Code backend selected (`Environment 6단계 → API → Claude Code`)
-fails at the first command with the CLI message bubbled up
-verbatim.
-
-The fix
--------
-Monkey-patch ``claude_code_argv`` to insert ``--verbose`` right
-after ``--print`` whenever the produced argv contains
-``--output-format stream-json``. The patch is **idempotent** —
-re-running ``install_llm_patches()`` is a no-op once the wrapper is
-already in place. We also conditionally skip patching if a future
-``geny-executor`` ships the fix upstream (detected by checking
-whether the original argv already contains ``--verbose``).
-
-Tested against the upstream behaviour: the non-streaming branch
-(`--output-format json`) does NOT need ``--verbose`` and we leave
-it alone.
+Why this exists (cycle 20260520, post-2.0.6)
+--------------------------------------------
+``geny-executor`` 2.0.6 absorbed every generic Claude-Code CLI compat
+patch this module used to carry — ``--verbose`` auto-injection for
+stream-json output, ``--bare`` strip on the OAuth path, drop of the
+auto-``--tools ""`` emit (CLI built-ins now stay available alongside
+MCP-wrapped host tools), and the ``StreamJsonAccumulator.finalize``
+``tool_use`` strip that keeps Stage 10 from ghost-erroring against the
+CLI's internal dispatches. None of those need monkey-patching anymore;
+the executor's argv builder + accumulator behave correctly out of the
+box for hosts pinned to ``>=2.0.6``.
+
+Two Geny-specific hooks remain here:
+
+1. **Friendly Korean error message for auth-expired envelopes.** The
+   executor surfaces a generic ``RuntimeError``; we sugar the wire-shape
+   ``{"is_error": true, "api_error_status": 401,
+   "error": "authentication_failed"}`` into a Korean human-readable
+   prompt that points the user at the LLM Backends Settings card. Same
+   patch turns other ``is_error`` result envelopes into a structured
+   message instead of the runtime's empty-stderr fallback.
+
+2. **CLI-handled tool call observability into Geny's SessionLogger.**
+   The executor (correctly, per the Phase-I design contract) drops
+   ``tool_use`` blocks from its assembled ``APIResponse`` so host
+   pipelines don't try to re-dispatch them. That also means Geny's
+   Stage 10 never fires the ``tool.call_start`` / ``tool.call_complete``
+   events the session_logger taps into for *CLI built-in* (Bash, Read,
+   Write, Edit, Glob, Grep, WebFetch, WebSearch, …) tool use — those
+   tools never round-trip through Geny's MCP bridge either, so without
+   an explicit tap they would disappear from the session log entirely.
+   We monkey-patch ``StreamJsonAccumulator.feed`` to peek at every
+   stream-json line, emit ``session_logger.log_tool_use`` /
+   ``log_tool_result`` for non-``mcp__*`` tool blocks (MCP tools are
+   already logged from ``mcp_bridge_controller``; duplicating would
+   render them twice in the UI), and route the logger through a
+   ``ContextVar`` set by ``AgentSession.invoke()`` / ``astream()``
+   for the duration of a turn.
+
+Both patches will fold upstream eventually — error envelope friendly
+messages once the executor gains a proper i18n hook, and the
+observability tap once the executor emits first-class CLI-tool
+events through the pipeline event bus. Until then they live here.
 """
 
 from __future__ import annotations
@@ -49,8 +57,8 @@
 logger = getLogger(__name__)
 
 
-_PATCH_APPLIED_FLAG = "_geny_verbose_patch_applied"
 _ASSEMBLER_PATCH_APPLIED_FLAG = "_geny_assembler_error_patch_applied"
+_OBSERVABILITY_PATCH_APPLIED_FLAG = "_geny_accumulator_observability_patch_applied"
 
 
 # Context variable carrying the active Geny ``SessionLogger`` so the
@@ -68,13 +76,15 @@
     "geny_cli_stream_logger_ctx", default=None,
 )
 
-# Cached wrapper. Set on the first ``install_llm_patches()`` call so
-# subsequent installs re-use the same instance — matters because the
-# wrapper has to be present (with identical identity) on three
-# separate module attributes. A fresh wrapper per install would leave
-# stale references on whichever module didn't get re-patched.
-_cached_wrapper: Any = None
+# Cached wrappers, re-used across repeated ``install_llm_patches()``
+# calls so the wrapper identity stays consistent on every patched
+# module attribute. The Geny app boot can call ``install_llm_patches()``
+# multiple times during config reloads; without cached wrappers we'd
+# either double-stack or leave stale references on whichever module
+# didn't get re-patched.
 _cached_assembler_wrapper: Any = None
+_cached_accumulator_init: Any = None
+_cached_accumulator_feed: Any = None
 
 
 # Human-readable message shown to the end user when the Claude CLI
@@ -143,233 +153,16 @@ def _maybe_extract_error_envelope(raw: Any) -> Optional[Dict[str, Any]]:
     return line
 
 
-def _patched_argv(
-    original_argv: List[str],
-    *,
-    has_api_key: Optional[bool] = None,
-) -> List[str]:
-    """Return a transformed argv applying three independent fixes:
-
-      A. ``--verbose`` injection — required after ``--print
-         --output-format stream-json`` for CLI ≥ 2.1.x. See the
-         module docstring for the history.
-
-      B. ``--bare`` stripping on the OAuth path — the CLI's
-         ``--bare`` flag explicitly disables OAuth ("Anthropic auth
-         is strictly ANTHROPIC_API_KEY or apiKeyHelper via
-         --settings (OAuth and keychain are never read)") but
-         ``geny-executor`` 2.0.1 builds the
-         ``ClaudeCodeCLIClient`` with ``bare_mode=True`` as the
-         default. The combination crashes every subscription /
-         OAuth user with ``"Not logged in · Please run /login"`` or
-         an empty-stderr ``exited with code 1:``.
-
-         When *has_api_key* is False (no ``ANTHROPIC_API_KEY`` in
-         env), remove ``--bare`` from the argv so the CLI is allowed
-         to read the OAuth credentials. When *has_api_key* is True,
-         keep ``--bare`` — that's the auth path it was designed for.
-         When *has_api_key* is None (caller didn't tell us), we
-         detect via ``os.environ`` as a fall-back.
-
-      C. ``--tools ""`` stripping (Phase-I follow-up) — executor 2.0.5
-         auto-emits ``--tools ""`` whenever ``--mcp-config`` is set
-         and no ``allow_tools`` was supplied, which disables the CLI's
-         entire built-in palette (Bash / Read / Write / Edit / Glob /
-         Grep / TodoWrite / WebFetch / WebSearch / etc.). For Geny we
-         want the *opposite*: CLI built-ins available alongside our
-         MCP-wrapped Geny tools, so the Sub-Worker can actually edit
-         files / run shell / browse the web. The CLI's permission
-         system gates each call, and Geny pre-allows the safe set via
-         the synthesised ``settings.json`` (see
-         ``credentials.py:_build_claude_code``). Stripping the
-         ``["--tools", ""]`` pair restores CLI defaults; the
-         ``--strict-mcp-config`` flag remains in argv so the *MCP*
-         surface stays scoped to our session bridge.
-
-    Pure function — handy for tests.
-    """
-    new_argv = list(original_argv)
-
-    # ── Fix B: drop --bare on OAuth path ─────────────────────────────
-    if has_api_key is None:
-        import os
-        has_api_key = bool(os.environ.get("ANTHROPIC_API_KEY", "").strip())
-    if not has_api_key:
-        new_argv = [arg for arg in new_argv if arg != "--bare"]
-
-    # ── Fix C: drop ``--tools ""`` pair so CLI built-ins remain on ──
-    cleaned: List[str] = []
-    skip_next = False
-    for i, arg in enumerate(new_argv):
-        if skip_next:
-            skip_next = False
-            continue
-        if arg == "--tools" and (i + 1) < len(new_argv) and new_argv[i + 1] == "":
-            skip_next = True
-            continue
-        cleaned.append(arg)
-    new_argv = cleaned
-
-    # ── Fix A: inject --verbose for stream-json output ──────────────
-    if "--verbose" in new_argv:
-        return new_argv
-    try:
-        of_idx = new_argv.index("--output-format")
-    except ValueError:
-        return new_argv
-    if of_idx + 1 >= len(new_argv):
-        return new_argv
-    if new_argv[of_idx + 1] != "stream-json":
-        return new_argv
-    try:
-        insert_at = new_argv.index("--print") + 1
-    except ValueError:
-        insert_at = len(new_argv)
-    new_argv.insert(insert_at, "--verbose")
-    return new_argv
-
-
 def install_llm_patches() -> None:
-    """Idempotently install the Claude Code argv ``--verbose`` patch.
-
-    The function is re-exported across three modules in
-    ``geny_executor``:
+    """Idempotently install Geny's two remaining ``geny-executor``
+    monkey-patches: friendly Korean error envelopes + CLI-tool
+    observability into Geny's SessionLogger.
 
-      1. ``geny_executor.llm_client.translators._cli`` (source).
-      2. ``geny_executor.llm_client.translators`` (re-export in the
-         package ``__init__``).
-      3. ``geny_executor.llm_client.claude_code`` (caller did
-         ``from … import claude_code_argv`` — captured a *local
-         binding* to the source function).
-
-    Patching only #1 is insufficient because #3 already holds a
-    direct reference. We have to overwrite the attribute in **every**
-    namespace that re-exports the function so any future
-    ``module.claude_code_argv(...)`` lookup hits the wrapper.
-
-    Safe to call multiple times. Logs once at INFO on the first
-    install, DEBUG on subsequent calls.
+    Safe to call multiple times. Both installers are themselves
+    idempotent (assembler uses a cached wrapper and ``_original``
+    unwrap; accumulator uses a class-level applied flag).
     """
-    import importlib
-
-    candidate_modules = [
-        "geny_executor.llm_client.translators._cli",
-        "geny_executor.llm_client.translators",
-        "geny_executor.llm_client.claude_code",
-    ]
-
-    # Step 1 — find the original implementation. It lives in
-    # translators._cli; every other module just re-binds it.
-    try:
-        cli_translator = importlib.import_module(candidate_modules[0])
-    except Exception:  # noqa: BLE001
-        logger.debug(
-            "geny_executor.llm_client.translators._cli unavailable — "
-            "Claude Code CLI patch skipped (this is fine when the "
-            "Claude Code backend isn't in use)",
-            exc_info=True,
-        )
-        return
-
-    original = getattr(cli_translator, "claude_code_argv", None)
-    if original is None:
-        logger.debug(
-            "claude_code_argv missing from geny_executor — patch skipped"
-        )
-        return
-    # Unwrap any already-installed wrapper so we don't double-stack.
-    while getattr(original, _PATCH_APPLIED_FLAG, False):
-        inner = getattr(original, "_original", None)
-        if inner is None:
-            break
-        original = inner
-
-    global _cached_wrapper
-    if _cached_wrapper is None:
-        def _wrapped(*args: Any, **kwargs: Any) -> List[str]:
-            argv = original(*args, **kwargs)
-            patched = _patched_argv(argv)
-            # Phase-I diagnostic: log the final argv every CLI invocation
-            # so we can verify ``--tools ""`` / ``--strict-mcp-config`` /
-            # ``--mcp-config`` are actually reaching the spawned ``claude``
-            # process. Sensitive payloads — ``--mcp-config <json>`` carries
-            # the per-session bearer token and ``--system-prompt`` carries
-            # the assembled persona — are redacted to their lengths so the
-            # log stays grep-friendly without leaking the actual content.
-            try:
-                redacted: List[str] = []
-                skip_next = False
-                for i, arg in enumerate(patched):
-                    if skip_next:
-                        # Redact the value following a sensitive flag.
-                        redacted.append(f"<redacted len={len(arg)}>")
-                        skip_next = False
-                        continue
-                    redacted.append(arg)
-                    if arg in {"--mcp-config", "--system-prompt", "--append-system-prompt"}:
-                        skip_next = True
-                logger.info("[llm_patches] claude argv (%d args): %s", len(patched), redacted)
-            except Exception:  # noqa: BLE001
-                pass
-            return patched
-
-        setattr(_wrapped, _PATCH_APPLIED_FLAG, True)
-        setattr(_wrapped, "_original", original)
-        _cached_wrapper = _wrapped
-    # else: re-use the cached wrapper. Its closure captures the
-    # *original* function, so even if a stale wrapper was left on
-    # one of the modules between installs, re-binding to the cached
-    # instance restores the canonical chain.
-
-    # Overwrite every known re-export with the SAME wrapper instance
-    # so any caller — including ``claude_code.py`` which captured a
-    # local binding at module load time — hits the wrapper. Doing
-    # this unconditionally (vs per-module idempotency-skip) keeps
-    # the three module attributes in lock-step across repeated
-    # installs; the unwrap-while-loop above strips stale wrappers
-    # off ``original`` so we don't double-stack.
-    patched_modules: List[str] = []
-    for mod_name in candidate_modules:
-        try:
-            mod = importlib.import_module(mod_name)
-        except Exception:  # noqa: BLE001
-            continue
-        if getattr(mod, "claude_code_argv", None) is None:
-            continue
-        setattr(mod, "claude_code_argv", _cached_wrapper)
-        patched_modules.append(mod_name)
-
-    if patched_modules:
-        logger.info(
-            "[llm_patches] installed Claude Code CLI --verbose patch "
-            "across %d modules: %s "
-            "(workaround for geny-executor 2.0.1 + new claude CLI)",
-            len(patched_modules),
-            ", ".join(patched_modules),
-        )
-    else:
-        logger.debug(
-            "[llm_patches] no claude_code_argv attribute found on any "
-            "known module — nothing to patch"
-        )
-
-    # Always (re-)install the assembler-side patch alongside the argv
-    # patch. They're independent fixes but always travel together:
-    # without the assembler patch the user sees a misleading "CLI
-    # exited with code 1:" message instead of an actionable auth-
-    # expired hint.
     _install_assembler_error_patch()
-
-    # Phase-I ghost-error fix — strip tool_use blocks from the
-    # ``StreamJsonAccumulator`` final response. See
-    # ``_install_stream_accumulator_patch`` for the rationale.
-    _install_stream_accumulator_patch()
-
-    # Phase-I follow-up — surface CLI-handled tool calls (Bash /
-    # Read / Write / Edit / …) to the active Geny ``SessionLogger``
-    # via ``cli_stream_logger_ctx``. The strip patch above silences
-    # Stage 10, so without this companion patch CLI built-in tool
-    # work would simply not appear in the Geny session log.
     _install_stream_observability_patch()
 
 
@@ -382,8 +175,8 @@ def _install_assembler_error_patch() -> None:
     *friendly* RuntimeError instead of the runtime's empty-stderr
     ``CLI '/usr/bin/claude' exited with code 1:`` fallback.
 
-    Same three-module pattern as ``claude_code_argv``: the caller
-    in ``geny_executor.llm_client.claude_code.py`` does
+    Three-module pattern: the caller in
+    ``geny_executor.llm_client.claude_code.py`` does
     ``from … import assemble_response_from_stream_json`` and captures
     a local binding at module load. We re-bind the attribute in all
     three modules to the same wrapper instance.
@@ -417,41 +210,10 @@ def _install_assembler_error_patch() -> None:
         async def _wrapped(
             stream: AsyncIterator[Any], *, model: str,
         ) -> Any:
-            """Spy on the stream-json output. Two fixes:
-
-            A. Error envelope detection — if the CLI emitted an
-               ``is_error`` result envelope, raise a Korean human-readable
-               error instead of letting the runtime's empty-stderr
-               fallback kick in.
-
-            B. Strip in-CLI ``tool_use`` blocks from the assembled
-               response. The Claude Code CLI runs the full agentic
-               loop internally (LLM → MCP tool → LLM → ...). Every
-               intermediate assistant turn shows up in the stream-json
-               output as a separate ``assistant`` envelope, and the
-               upstream :class:`StreamJsonAccumulator` *concatenates*
-               their content — so the final ``APIResponse`` returned
-               to Stage 6 contains all the ``tool_use`` blocks the CLI
-               *already dispatched* via MCP. Geny's Stage 10 (the
-               canonical Anthropic-API tool-dispatch stage) then sees
-               those ``tool_use`` blocks, looks up the
-               ``mcp__geny__<name>`` ids in Geny's own ToolLoader
-               (which only knows the bare names — the MCP prefix is
-               applied by the CLI, not Geny), finds nothing, and
-               surfaces "Tool X: ERROR (0ms) — No output" for every
-               call the user can plainly see succeeded (the worker
-               actually received the message, the worker actually
-               replied, the bridge log records ``tools/call`` traffic).
-
-               Per the Phase-I design doc — "Stage 10 receives that
-               assistant message, sees no ``tool_use`` blocks (they
-               were executed inside the CLI), and naturally no-ops"
-               — the canonical fix is to strip ``tool_use`` blocks
-               from the response before they reach Stage 10. Geny's
-               permission/audit telemetry for those calls flows via
-               the MCP bridge endpoint (cycle 20260519/Phase-2),
-               not through Stage 10, so this strip is lossless.
-            """
+            """Spy on the stream-json output. If the CLI emitted an
+            ``is_error`` result envelope, raise a Korean human-readable
+            error instead of letting the runtime's empty-stderr
+            fallback kick in."""
             err_holder: List[Dict[str, Any]] = []
 
             async def _spy() -> AsyncIterator[Any]:
@@ -482,12 +244,6 @@ async def _spy() -> AsyncIterator[Any]:
                     _friendly_error_message_for_result_envelope(err_holder[-1])
                 )
 
-            # Note: the equivalent tool_use strip lives in
-            # ``_install_stream_accumulator_patch`` because the actual
-            # streaming code path (``ClaudeCodeCLIClient._stream``)
-            # bypasses this assembler entirely and calls
-            # ``StreamJsonAccumulator.finalize`` directly.
-
             return response
 
         setattr(_wrapped, _ASSEMBLER_PATCH_APPLIED_FLAG, True)
@@ -513,126 +269,7 @@ async def _spy() -> AsyncIterator[Any]:
         )
 
 
-# ── StreamJsonAccumulator finalize: strip CLI-handled tool_use ──────
-
-
-_ACCUMULATOR_PATCH_APPLIED_FLAG = "_geny_accumulator_strip_patch_applied"
-_cached_accumulator_finalize: Any = None
-
-
-def _install_stream_accumulator_patch() -> None:
-    """Monkey-patch ``StreamJsonAccumulator.finalize`` so the final
-    :class:`APIResponse` returned to Stage 6 carries **no**
-    ``tool_use`` blocks for the Claude Code CLI streaming path.
-
-    Why this lives in Geny rather than the executor
-    -----------------------------------------------
-    Claude Code CLI 2.1.x runs the whole agentic loop *internally*
-    (LLM → tool → LLM → tool → …). Every intermediate assistant turn
-    arrives as its own ``{"type":"assistant","message":{...}}``
-    envelope in stream-json, and ``StreamJsonAccumulator._feed_message``
-    appends every block from every envelope into a shared buffer with
-    no per-turn reset. ``finalize()`` therefore returns an
-    ``APIResponse`` whose ``content`` carries every ``tool_use`` block
-    the CLI *already dispatched* via MCP or its own built-ins.
-
-    Geny's downstream pipeline (Stage 9 parse → Stage 10 dispatch)
-    treats those ``tool_use`` blocks as *pending* and tries to
-    redispatch them against Geny's own tool registry — but the CLI
-    advertises MCP tools with the ``mcp__geny__<name>`` prefix that
-    Geny's registry doesn't know, so every call instantly fails with
-    ``ERROR (0ms) — No output``. Meanwhile the actual tool work
-    already succeeded via the MCP bridge: every Sub-Worker message
-    was delivered, every memory_write persisted, every browser_*
-    call dispatched. The user sees both a successful final reply
-    *and* a session log full of bogus failures — confusing,
-    operationally noisy, and capable of nudging the LLM itself into
-    apologising mid-conversation ("messaging tool not connected").
-
-    Per the Phase-I design doc:
-
-        Stage 10 receives that assistant message, sees no ``tool_use``
-        blocks (they were executed inside the CLI), and naturally
-        no-ops.
-
-    The accumulator's append-everything behaviour violates that
-    contract. Stripping ``tool_use`` from ``finalize()`` restores it.
-    Phase-2 audit/telemetry for those calls flows through the MCP
-    bridge endpoint, so the strip is lossless — Geny's
-    ``mcp_bridge_controller`` is already where ``tools/call`` events
-    are recorded.
-
-    A future executor 2.0.6 should encode this directly (per-turn
-    buffer reset, or skip ``tool_use`` on the claude_code_cli path);
-    when that lands we drop this patch.
-    """
-    import importlib
-
-    try:
-        cli_translator = importlib.import_module(
-            "geny_executor.llm_client.translators._cli"
-        )
-    except Exception:  # noqa: BLE001
-        return
-    accum_cls = getattr(cli_translator, "StreamJsonAccumulator", None)
-    if accum_cls is None:
-        return
-
-    original = getattr(accum_cls, "finalize", None)
-    if original is None:
-        return
-
-    # Unwrap stale wrappers so we don't double-stack across reloads.
-    while getattr(original, _ACCUMULATOR_PATCH_APPLIED_FLAG, False):
-        inner = getattr(original, "_original", None)
-        if inner is None:
-            break
-        original = inner
-
-    global _cached_accumulator_finalize
-    if _cached_accumulator_finalize is None:
-        def _wrapped(self: Any) -> Any:
-            response = original(self)
-            try:
-                content = getattr(response, "content", None)
-                if isinstance(content, list):
-                    filtered = [
-                        b for b in content
-                        if getattr(b, "type", None) != "tool_use"
-                    ]
-                    stripped = len(content) - len(filtered)
-                    if stripped:
-                        response.content = filtered
-                        logger.info(
-                            "[llm_patches] stripped %d CLI-handled "
-                            "tool_use block(s) from claude_code stream "
-                            "response (Stage 10 no-ops)",
-                            stripped,
-                        )
-            except Exception:  # noqa: BLE001
-                logger.debug(
-                    "[llm_patches] accumulator finalize strip skipped",
-                    exc_info=True,
-                )
-            return response
-
-        setattr(_wrapped, _ACCUMULATOR_PATCH_APPLIED_FLAG, True)
-        setattr(_wrapped, "_original", original)
-        _cached_accumulator_finalize = _wrapped
-
-    setattr(accum_cls, "finalize", _cached_accumulator_finalize)
-    logger.info(
-        "[llm_patches] installed StreamJsonAccumulator.finalize "
-        "tool_use strip patch (claude_code_cli ghost-error fix)"
-    )
-
-
-# ── StreamJsonAccumulator observability: surface CLI built-ins ───────
-
-
-_OBSERVABILITY_PATCH_APPLIED_FLAG = "_geny_accumulator_observability_patch_applied"
-_cached_accumulator_init: Any = None
-_cached_accumulator_feed: Any = None
+# ── StreamJsonAccumulator observability: surface CLI built-ins ──────
 
 
 def _install_stream_observability_patch() -> None:
@@ -643,14 +280,14 @@ def _install_stream_observability_patch() -> None:
 
     Why
     ---
-    With PR #825's tool_use strip in ``finalize()``, Geny's Stage 10
-    naturally no-ops for ``claude_code_cli`` sessions — which means
-    the in-pipeline ``tool.call_start`` / ``tool.call_complete``
-    events never fire and CLI-handled tools (``Bash`` / ``Read`` /
-    ``Write`` / ``Edit`` / ``WebFetch`` / …) disappear from the
-    session log. ``mcp_bridge_controller`` already emits log entries
-    for MCP ``tools/call`` traffic (the bridge sees those directly),
-    so that surface is covered.
+    With executor 2.0.6's terminal-response strip of ``tool_use``
+    blocks, Geny's Stage 10 naturally no-ops for ``claude_code_cli``
+    sessions — which means the in-pipeline ``tool.call_start`` /
+    ``tool.call_complete`` events never fire and CLI-handled tools
+    (``Bash`` / ``Read`` / ``Write`` / ``Edit`` / ``WebFetch`` / …)
+    disappear from the session log. ``mcp_bridge_controller`` already
+    emits log entries for MCP ``tools/call`` traffic (the bridge sees
+    those directly), so that surface is covered.
 
     The remaining gap is the *CLI built-in* layer: tools the CLI
     dispatches **internally** without going through our bridge. They
@@ -659,7 +296,7 @@ def _install_stream_observability_patch() -> None:
 
     Mechanics
     ---------
-    - On each ``feed(line)`` call, peek at the line *before* delegating
+    - On each ``feed(line)`` call, peek at the line *after* delegating
       to the original feed.
     - ``"assistant"`` envelopes carry ``tool_use`` blocks → emit
       ``session_logger.log_tool_use`` for each, and stash
@@ -675,10 +312,6 @@ def _install_stream_observability_patch() -> None:
       double-render in the UI.
     - All logger calls are try/except'd; observability must never
       break the underlying tool dispatch.
-
-    No upstream executor changes required. When executor 2.0.6 adds
-    first-class CLI-tool observability events to the pipeline event
-    bus, we drop this patch.
     """
     import importlib
 
diff --git a/backend/tests/service/test_llm_patches.py b/backend/tests/service/test_llm_patches.py
index 8550db01..ec77e2cb 100644
--- a/backend/tests/service/test_llm_patches.py
+++ b/backend/tests/service/test_llm_patches.py
@@ -1,520 +1,424 @@
-"""Tests for the Claude Code CLI argv runtime patch.
-
-Validates the pure ``_patched_argv`` predicate function plus the
-idempotent monkey-patch installer.
+"""Tests for ``service/llm_patches.py``.
+
+Post-cycle-20260520 surface is much smaller — the four generic
+Claude-Code CLI argv fixes that previously lived here folded into
+``geny-executor`` 2.0.6 (``--verbose`` injection, ``--bare`` strip on
+OAuth, drop of auto-``--tools ""``, ``StreamJsonAccumulator.finalize``
+tool_use strip). What's left is Geny-specific and covered here:
+
+  1. Korean friendly-error messages for stream-json ``is_error``
+     result envelopes (auth-expired hint + generic API-error
+     fallback).
+  2. CLI-tool observability into Geny's :class:`SessionLogger`
+     (``log_tool_use`` + ``log_tool_result`` for non-``mcp__*``
+     blocks observed in the stream, routed through the
+     :data:`cli_stream_logger_ctx` ContextVar).
 """
 
 from __future__ import annotations
 
-import pytest
+import json
+from typing import Any, AsyncIterator, Dict, List
 
+import pytest
 
-# ── Pure transformation ──────────────────────────────────────────────
 
+# ── _friendly_error_message_for_result_envelope ─────────────────
 
-def test_streaming_argv_gets_verbose_inserted() -> None:
-    from service.llm_patches import _patched_argv
 
-    original = [
-        "--print",
-        "--input-format", "stream-json",
-        "--output-format", "stream-json",
-        "--include-partial-messages",
-        "--model", "claude-opus-4-7",
-    ]
-    # API-key path — only the verbose-injection fix is exercised
-    # here. The bare-strip fix has its own test below.
-    patched = _patched_argv(original, has_api_key=True)
-
-    assert patched != original, "argv should be modified"
-    assert "--verbose" in patched
-    # Order matters for readable logs: ``--verbose`` immediately after
-    # ``--print``.
-    print_idx = patched.index("--print")
-    assert patched[print_idx + 1] == "--verbose"
-
-
-def test_oauth_path_strips_bare_flag() -> None:
-    """Subscription / OAuth users (no ``ANTHROPIC_API_KEY``) must
-    NOT pass ``--bare`` — the flag explicitly disables OAuth at
-    the CLI level. ``geny-executor`` 2.0.1 always emits ``--bare``
-    via its ``bare_mode=True`` default, so we strip it here on
-    every OAuth invocation."""
-    from service.llm_patches import _patched_argv
-
-    original = [
-        "--print",
-        "--output-format", "stream-json",
-        "--bare",
-        "--model", "claude-opus-4-7",
-    ]
-    patched = _patched_argv(original, has_api_key=False)
-    assert "--bare" not in patched
-    # ``--verbose`` should still be injected for stream-json output.
-    assert "--verbose" in patched
-
-
-def test_api_key_path_keeps_bare_flag() -> None:
-    """When ``ANTHROPIC_API_KEY`` is set, ``--bare`` is the right
-    flag — keep it untouched."""
-    from service.llm_patches import _patched_argv
-
-    original = [
-        "--print",
-        "--output-format", "stream-json",
-        "--bare",
-        "--model", "claude-opus-4-7",
-    ]
-    patched = _patched_argv(original, has_api_key=True)
-    assert "--bare" in patched
-    assert "--verbose" in patched
+def test_friendly_error_message_recognises_auth_failure() -> None:
+    from service.llm_patches import _friendly_error_message_for_result_envelope
 
+    msg = _friendly_error_message_for_result_envelope({
+        "type": "result",
+        "is_error": True,
+        "api_error_status": 401,
+        "error": "authentication_failed",
+        "result": "Failed to authenticate. API Error: 401",
+    })
+    # Korean prompt that points the user at the LLM Backends Settings card.
+    assert "Claude Code 인증이 만료" in msg
+    assert "LLM 백엔드" in msg
+    # The raw CLI message is appended as ``(원본: ...)`` for forensics.
+    assert "Failed to authenticate" in msg
 
-def test_auto_detect_has_api_key_from_env(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    """``has_api_key=None`` (default) means "infer from env". With
-    the env var set, ``--bare`` stays; without, it's stripped."""
-    from service.llm_patches import _patched_argv
-
-    original = [
-        "--print",
-        "--output-format", "stream-json",
-        "--bare",
-    ]
 
-    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-fake")
-    keep = _patched_argv(original)
-    assert "--bare" in keep
+def test_friendly_error_message_generic_api_error() -> None:
+    from service.llm_patches import _friendly_error_message_for_result_envelope
 
-    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
-    drop = _patched_argv(original)
-    assert "--bare" not in drop
+    msg = _friendly_error_message_for_result_envelope({
+        "type": "result",
+        "is_error": True,
+        "api_error_status": 500,
+        "error": "internal_server_error",
+        "result": "Upstream provider returned 500",
+    })
+    assert "Claude Code API 에러" in msg
+    assert "500" in msg
+    assert "Upstream provider returned 500" in msg
 
 
-def test_non_streaming_argv_is_untouched() -> None:
-    """``--output-format json`` (non-streaming) doesn't need
-    ``--verbose`` — the CLI accepts it as-is. Patch must NOT add
-    ``--verbose`` in that case to avoid changing semantics."""
-    from service.llm_patches import _patched_argv
+def test_friendly_error_message_no_api_status_falls_back_to_cli_error() -> None:
+    from service.llm_patches import _friendly_error_message_for_result_envelope
 
-    original = [
-        "--print",
-        "--output-format", "json",
-        "--model", "claude-opus-4-7",
-    ]
-    patched = _patched_argv(original)
-    assert "--verbose" not in patched
-    assert patched == original
+    msg = _friendly_error_message_for_result_envelope({
+        "type": "result",
+        "is_error": True,
+        "result": "claude binary segfaulted",
+    })
+    assert "Claude Code CLI 에러" in msg
+    assert "claude binary segfaulted" in msg
 
 
-def test_argv_with_existing_verbose_is_untouched() -> None:
-    """Defensive: if a future ``geny-executor`` ships the fix
-    upstream, our patch must become a no-op so we don't double-add."""
-    from service.llm_patches import _patched_argv
+# ── _maybe_extract_error_envelope ───────────────────────────────
 
-    original = [
-        "--print",
-        "--verbose",
-        "--output-format", "stream-json",
-    ]
-    patched = _patched_argv(original)
-    assert patched == original
-    assert patched.count("--verbose") == 1
 
+def test_maybe_extract_error_envelope_accepts_bytes_and_str() -> None:
+    from service.llm_patches import _maybe_extract_error_envelope
 
-def test_argv_without_output_format_is_untouched() -> None:
-    from service.llm_patches import _patched_argv
+    payload = json.dumps({
+        "type": "result", "is_error": True, "api_error_status": 401,
+    })
+    assert _maybe_extract_error_envelope(payload.encode("utf-8"))
+    assert _maybe_extract_error_envelope(payload)
+    assert _maybe_extract_error_envelope(bytearray(payload, "utf-8"))
 
-    original = ["--print", "--model", "x"]
-    assert _patched_argv(original) == original
 
+@pytest.mark.parametrize("payload", [
+    b"",
+    b"  ",
+    b"not json",
+    json.dumps({"type": "assistant"}).encode("utf-8"),
+    json.dumps({"type": "result"}).encode("utf-8"),  # no is_error
+    json.dumps({"type": "result", "is_error": False}).encode("utf-8"),
+    json.dumps([1, 2, 3]).encode("utf-8"),  # not a dict
+])
+def test_maybe_extract_error_envelope_returns_none_for_non_error_lines(
+    payload: bytes,
+) -> None:
+    from service.llm_patches import _maybe_extract_error_envelope
 
-def test_argv_with_stream_json_input_only_is_untouched() -> None:
-    """``--input-format stream-json`` doesn't trigger the
-    ``--print --output-format`` requirement; only the *output* format
-    does. Patch must only react to the output-format pair."""
-    from service.llm_patches import _patched_argv
+    assert _maybe_extract_error_envelope(payload) is None
 
-    original = [
-        "--print",
-        "--input-format", "stream-json",
-        "--output-format", "json",
-    ]
-    patched = _patched_argv(original)
-    assert "--verbose" not in patched
-    assert patched == original
 
+# ── install_llm_patches: idempotency + re-export coverage ──
 
-def test_argv_with_dangling_output_format_at_end() -> None:
-    """Defensive: ``--output-format`` is the last token with no value.
-    Don't crash — just leave it alone."""
-    from service.llm_patches import _patched_argv
 
-    original = ["--print", "--output-format"]
-    patched = _patched_argv(original)
-    assert patched == original  # No crash, no insertion.
+def test_install_is_idempotent() -> None:
+    """Calling ``install_llm_patches()`` multiple times must be a
+    no-op after the first install — the assembler wrapper and the
+    accumulator observability patch both use cached state, so a
+    second call leaves the patched attributes pointing at the
+    *same* wrapper object."""
+    from service.llm_patches import install_llm_patches
 
+    # First install (might happen at app boot before this test).
+    install_llm_patches()
 
-# ── Installer ────────────────────────────────────────────────────────
+    import importlib
 
+    cli_translator = importlib.import_module(
+        "geny_executor.llm_client.translators._cli"
+    )
+    first_assembler = cli_translator.assemble_response_from_stream_json
+    first_accum_feed = cli_translator.StreamJsonAccumulator.feed
 
-def test_install_is_idempotent(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Repeated installs must not double-wrap the function. The
-    wrapper is flagged with ``_geny_verbose_patch_applied`` so the
-    second call sees the flag and returns early."""
-    pytest.importorskip("geny_executor.llm_client.translators._cli")
-    from service.llm_patches import install_llm_patches
-    from geny_executor.llm_client.translators import _cli
+    # Second install — should hit the cached-wrapper path.
+    install_llm_patches()
 
-    pristine = _cli.claude_code_argv
-    try:
-        install_llm_patches()
-        once = _cli.claude_code_argv
-        install_llm_patches()
-        twice = _cli.claude_code_argv
-        assert once is twice, "second install must be a no-op"
-        # Confirm the wrapper exposes the original so future re-stacks
-        # can chain off it.
-        assert hasattr(once, "_original")
-    finally:
-        _cli.claude_code_argv = pristine
+    second_assembler = cli_translator.assemble_response_from_stream_json
+    second_accum_feed = cli_translator.StreamJsonAccumulator.feed
 
+    # Same instance, no double-stacking.
+    assert first_assembler is second_assembler
+    assert first_accum_feed is second_accum_feed
 
-def test_install_patches_every_re_export_namespace(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    """The function is re-exported across THREE modules; patching
-    only the source isn't enough because the caller did
-    ``from … import claude_code_argv`` and captured a local binding.
-
-    Without patching all three, the live CLI invocation in
-    ``claude_code.py`` (line 178) still calls the pristine function
-    and the user sees ``--verbose`` error in prod. This test pins
-    the bug we hit on 2026-05-18.
-    """
-    pytest.importorskip("geny_executor.llm_client.translators._cli")
-    import importlib
 
+def test_install_patches_assembler_across_re_exports() -> None:
+    """The assembler is re-exported from three modules; all the
+    attributes must point at the wrapper after install so any caller
+    that captured a local binding hits it."""
     from service.llm_patches import install_llm_patches
 
+    install_llm_patches()
+
+    import importlib
     modules = [
-        importlib.import_module("geny_executor.llm_client.translators._cli"),
-        importlib.import_module("geny_executor.llm_client.translators"),
-        importlib.import_module("geny_executor.llm_client.claude_code"),
+        "geny_executor.llm_client.translators._cli",
+        "geny_executor.llm_client.translators",
+        "geny_executor.llm_client.claude_code",
     ]
-    pristines = [getattr(m, "claude_code_argv") for m in modules]
+    fns = []
+    for name in modules:
+        mod = importlib.import_module(name)
+        fn = getattr(mod, "assemble_response_from_stream_json", None)
+        if fn is not None:
+            fns.append(fn)
+    assert len(fns) >= 2  # at least source + one re-export
+    assert all(fn is fns[0] for fn in fns)
 
-    try:
-        install_llm_patches()
-        wrapped = [getattr(m, "claude_code_argv") for m in modules]
-        # Every module must point at a wrapper, AND they must all be
-        # the same wrapper instance so a stale reference can't slip
-        # through.
-        for fn in wrapped:
-            assert hasattr(fn, "_geny_verbose_patch_applied")
-        assert wrapped[0] is wrapped[1] is wrapped[2], (
-            "all three modules must share the same wrapper — otherwise "
-            "claude_code.py keeps calling the pristine function"
-        )
-    finally:
-        for m, original in zip(modules, pristines):
-            setattr(m, "claude_code_argv", original)
 
+# ── assembler-side: stream-json error envelope → friendly raise ──
 
-def test_install_patches_claude_code_caller_path(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    """End-to-end pin for the user's reported regression: invoke the
-    argv builder through the same import path ``claude_code.py``
-    uses (``claude_code.claude_code_argv``) and verify ``--verbose``
-    lands in the result. This is what the live CLI invocation does
-    at runtime."""
-    pytest.importorskip("geny_executor.llm_client.claude_code")
-    import importlib
 
+@pytest.mark.asyncio
+async def test_assembler_patch_raises_friendly_error_on_auth_failure() -> None:
+    """When the stream ends with an ``is_error: true`` result
+    envelope, the patched assembler must raise a Korean human-readable
+    error instead of silently returning an empty response."""
     from service.llm_patches import install_llm_patches
 
-    cc_module = importlib.import_module(
-        "geny_executor.llm_client.claude_code"
-    )
-    cli_module = importlib.import_module(
+    install_llm_patches()
+
+    import importlib
+    cli_translator = importlib.import_module(
         "geny_executor.llm_client.translators._cli"
     )
-    translators_pkg = importlib.import_module(
-        "geny_executor.llm_client.translators"
-    )
-
-    pristine_cc = cc_module.claude_code_argv
-    pristine_cli = cli_module.claude_code_argv
-    pristine_pkg = translators_pkg.claude_code_argv
-
-    try:
-        install_llm_patches()
-
-        class _Req:
-            stream = True
-            model = "claude-opus-4-7"
-            system = ""
-            thinking = None
-            response_format = None
-            session_hint = None
-
-        # The crucial assertion: the function call path
-        # ``claude_code.claude_code_argv(...)`` must return an argv
-        # that contains ``--verbose``. Before the deep-patch fix this
-        # returned the pristine result and the prod session crashed.
-        argv = cc_module.claude_code_argv(_Req())
-        assert "--verbose" in argv, (
-            "claude_code.py's local binding must point at the patched "
-            "wrapper, otherwise the live CLI invocation crashes with "
-            "'When using --print, --output-format=stream-json requires "
-            "--verbose'"
+    assemble = cli_translator.assemble_response_from_stream_json
+
+    async def _gen() -> AsyncIterator[bytes]:
+        yield b'{"type": "system", "session_id": "s1"}\n'
+        yield (
+            b'{"type": "result", "is_error": true, "api_error_status": 401, '
+            b'"error": "authentication_failed", '
+            b'"result": "Failed to authenticate. API Error: 401"}\n'
         )
-    finally:
-        cc_module.claude_code_argv = pristine_cc
-        cli_module.claude_code_argv = pristine_cli
-        translators_pkg.claude_code_argv = pristine_pkg
 
+    with pytest.raises(RuntimeError, match="Claude Code 인증이 만료"):
+        await assemble(_gen(), model="m")
 
-def test_install_actually_patches_streaming_call(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    """End-to-end: install the patch, then invoke ``claude_code_argv``
-    the way geny-executor's caller does and assert ``--verbose``
-    lands in the argv."""
-    pytest.importorskip("geny_executor.llm_client.translators._cli")
+
+@pytest.mark.asyncio
+async def test_assembler_patch_passes_through_clean_stream() -> None:
+    """Clean streams (no ``is_error`` envelope) must produce a valid
+    APIResponse with the text content intact."""
     from service.llm_patches import install_llm_patches
-    from geny_executor.llm_client.translators import _cli
 
-    pristine = _cli.claude_code_argv
-    try:
-        install_llm_patches()
-
-        # Build a minimal ``ChatCompletionRequest``-shaped object the
-        # function expects. We use a duck-typed stub to avoid pulling
-        # the full geny_executor request model.
-        class _Req:
-            stream = True
-            model = "claude-opus-4-7"
-            system = ""
-            thinking = None
-            response_format = None
-            session_hint = None
-
-        argv = _cli.claude_code_argv(_Req())
-        assert "--print" in argv
-        assert "--output-format" in argv
-        of_idx = argv.index("--output-format")
-        assert argv[of_idx + 1] == "stream-json"
-        assert "--verbose" in argv, (
-            "patch must inject --verbose for stream-json output"
-        )
-        print_idx = argv.index("--print")
-        assert argv[print_idx + 1] == "--verbose", (
-            "--verbose must sit right after --print for readable logs"
+    install_llm_patches()
+
+    import importlib
+    cli_translator = importlib.import_module(
+        "geny_executor.llm_client.translators._cli"
+    )
+    assemble = cli_translator.assemble_response_from_stream_json
+
+    async def _gen() -> AsyncIterator[bytes]:
+        yield b'{"type": "system", "session_id": "s1", "model": "sonnet"}\n'
+        yield b'{"type": "assistant", "delta": {"type": "text_delta", "text": "ok"}}\n'
+        yield b'{"type": "message_stop"}\n'
+        yield (
+            b'{"type": "result", "stop_reason": "end_turn", '
+            b'"usage": {"input_tokens": 1, "output_tokens": 1}}\n'
         )
-    finally:
-        _cli.claude_code_argv = pristine
 
+    resp = await assemble(_gen(), model="default")
+    assert resp.text == "ok"
+    assert resp.stop_reason == "end_turn"
 
-# ── Assembler patch: stream-json error → friendly message ────────────
 
+# ── Stream observability: CLI built-in tool calls → SessionLogger ──
 
-def test_friendly_error_message_recognises_auth_failure() -> None:
-    from service.llm_patches import _friendly_error_message_for_result_envelope
 
-    envelope = {
-        "type": "result",
-        "is_error": True,
-        "api_error_status": 401,
-        "error": "authentication_failed",
-        "result": "Failed to authenticate. API Error: 401 Invalid authentication credentials",
-    }
-    msg = _friendly_error_message_for_result_envelope(envelope)
-    assert "Claude Code 인증이 만료" in msg
-    assert "다시 로그인" in msg or "Sign in" in msg
-    # And the original error is preserved so an operator inspecting
-    # logs can see the underlying API message.
-    assert "401" in msg
+class _FakeLogger:
+    """Minimal stand-in for Geny's ``SessionLogger``. Records every
+    ``log_tool_use`` / ``log_tool_result`` call so tests can assert on
+    the observability patch's behavior without booting the real
+    logger + DB."""
 
+    def __init__(self) -> None:
+        self.tool_uses: List[Dict[str, Any]] = []
+        self.tool_results: List[Dict[str, Any]] = []
 
-def test_friendly_error_message_for_generic_api_error() -> None:
-    from service.llm_patches import _friendly_error_message_for_result_envelope
+    def log_tool_use(self, **kwargs: Any) -> None:
+        self.tool_uses.append(kwargs)
 
-    envelope = {
-        "type": "result",
-        "is_error": True,
-        "api_error_status": 503,
-        "error": "overloaded",
-        "result": "Service unavailable, please retry",
-    }
-    msg = _friendly_error_message_for_result_envelope(envelope)
-    assert "Claude Code API 에러" in msg
-    assert "503" in msg
-    # Non-401 must NOT use the auth-expired message.
-    assert "인증이 만료" not in msg
+    def log_tool_result(self, **kwargs: Any) -> None:
+        self.tool_results.append(kwargs)
 
 
-def test_maybe_extract_error_envelope_accepts_bytes_and_str() -> None:
-    from service.llm_patches import _maybe_extract_error_envelope
+def test_observability_emits_tool_use_for_assistant_envelope() -> None:
+    """``"assistant"`` envelopes carrying a non-``mcp__*`` ``tool_use``
+    block trigger a ``log_tool_use`` call on the active session logger."""
+    from service.llm_patches import (
+        cli_stream_logger_ctx,
+        install_llm_patches,
+    )
+    install_llm_patches()
 
-    line_bytes = b'{"type":"result","is_error":true,"api_error_status":401}'
-    line_str = '{"type":"result","is_error":true,"api_error_status":401}'
-    assert _maybe_extract_error_envelope(line_bytes) is not None
-    assert _maybe_extract_error_envelope(line_str) is not None
+    import importlib
+    cli_translator = importlib.import_module(
+        "geny_executor.llm_client.translators._cli"
+    )
+    accum = cli_translator.StreamJsonAccumulator(model="m")
+    fake = _FakeLogger()
+    token = cli_stream_logger_ctx.set(fake)
+    try:
+        accum.feed({
+            "type": "assistant",
+            "message": {
+                "content": [
+                    {"type": "tool_use", "id": "tu_1", "name": "Bash",
+                     "input": {"command": "ls"}},
+                ],
+            },
+        })
+    finally:
+        cli_stream_logger_ctx.reset(token)
 
+    assert len(fake.tool_uses) == 1
+    call = fake.tool_uses[0]
+    assert call["tool_name"] == "Bash"
+    assert call["tool_id"] == "tu_1"
+    assert call["tool_input"] == {"command": "ls"}
 
-@pytest.mark.parametrize("payload", [
-    b'',
-    b'\n',
-    b'not json',
-    b'{"type":"system"}',
-    b'{"type":"result","is_error":false}',
-    b'{"type":"result"}',
-    b'[]',
-])
-def test_maybe_extract_error_envelope_returns_none_for_non_error_lines(
-    payload: bytes,
-) -> None:
-    from service.llm_patches import _maybe_extract_error_envelope
-    assert _maybe_extract_error_envelope(payload) is None
 
+def test_observability_skips_mcp_prefixed_tools() -> None:
+    """MCP tools are already logged from ``mcp_bridge_controller`` —
+    the observability patch must NOT double-render them."""
+    from service.llm_patches import (
+        cli_stream_logger_ctx,
+        install_llm_patches,
+    )
+    install_llm_patches()
 
-def test_assembler_patch_raises_friendly_error_on_auth_failure() -> None:
-    """End-to-end through ``assemble_response_from_stream_json``:
-    install the patch, feed a synthetic stream that ends with the
-    auth-failed envelope, and assert the wrapped function raises
-    with the Korean re-login message instead of returning silently."""
-    pytest.importorskip("geny_executor.llm_client.translators._cli")
-    import asyncio
     import importlib
-
-    from service.llm_patches import install_llm_patches
-
-    cli_module = importlib.import_module(
+    cli_translator = importlib.import_module(
         "geny_executor.llm_client.translators._cli"
     )
-    pristine = cli_module.assemble_response_from_stream_json
+    accum = cli_translator.StreamJsonAccumulator(model="m")
+    fake = _FakeLogger()
+    token = cli_stream_logger_ctx.set(fake)
     try:
-        install_llm_patches()
-        wrapped = cli_module.assemble_response_from_stream_json
-
-        async def _fake_stream():
-            yield (
-                b'{"type":"system","subtype":"init","model":"claude-opus-4-7"}'
-            )
-            yield (
-                b'{"type":"result","is_error":true,"api_error_status":401,'
-                b'"error":"authentication_failed",'
-                b'"result":"Failed to authenticate. API Error: 401 ..."}'
-            )
-
-        async def _runner():
-            return await wrapped(_fake_stream(), model="claude-opus-4-7")
-
-        with pytest.raises(RuntimeError, match="인증이 만료"):
-            asyncio.new_event_loop().run_until_complete(_runner())
+        accum.feed({
+            "type": "assistant",
+            "message": {
+                "content": [
+                    {"type": "tool_use", "id": "mc_1",
+                     "name": "mcp__geny__send_direct_message_internal",
+                     "input": {"content": "hi"}},
+                ],
+            },
+        })
     finally:
-        cli_module.assemble_response_from_stream_json = pristine
+        cli_stream_logger_ctx.reset(token)
 
+    assert fake.tool_uses == []
 
-def test_assembler_patch_pass_through_on_clean_stream() -> None:
-    """A successful stream (no ``is_error`` envelope) must return
-    the original APIResponse unchanged — the patch is non-intrusive
-    on the happy path."""
-    pytest.importorskip("geny_executor.llm_client.translators._cli")
-    import asyncio
-    import importlib
 
-    from service.llm_patches import install_llm_patches
+def test_observability_emits_tool_result_with_duration() -> None:
+    """A matching ``"user"`` ``tool_result`` envelope triggers a
+    ``log_tool_result`` call with measured ``duration_ms`` and the
+    extracted result text."""
+    from service.llm_patches import (
+        cli_stream_logger_ctx,
+        install_llm_patches,
+    )
+    install_llm_patches()
 
-    cli_module = importlib.import_module(
+    import importlib
+    cli_translator = importlib.import_module(
         "geny_executor.llm_client.translators._cli"
     )
-    pristine = cli_module.assemble_response_from_stream_json
+    accum = cli_translator.StreamJsonAccumulator(model="m")
+    fake = _FakeLogger()
+    token = cli_stream_logger_ctx.set(fake)
     try:
-        install_llm_patches()
-        wrapped = cli_module.assemble_response_from_stream_json
-
-        async def _fake_stream():
-            yield (
-                b'{"type":"system","subtype":"init","model":"claude-opus-4-7"}'
-            )
-            yield (
-                b'{"type":"assistant","delta":{"type":"text_delta",'
-                b'"text":"hello"}}'
-            )
-            yield (
-                b'{"type":"result","stop_reason":"end_turn",'
-                b'"usage":{"input_tokens":1,"output_tokens":1}}'
-            )
-
-        async def _runner():
-            return await wrapped(_fake_stream(), model="claude-opus-4-7")
-
-        resp = asyncio.new_event_loop().run_until_complete(_runner())
-        # Don't pin the full APIResponse shape — just confirm it
-        # didn't raise and we got *something* back.
-        assert resp is not None
+        accum.feed({
+            "type": "assistant",
+            "message": {
+                "content": [
+                    {"type": "tool_use", "id": "tu_1", "name": "Read",
+                     "input": {"path": "/etc/hostname"}},
+                ],
+            },
+        })
+        accum.feed({
+            "type": "user",
+            "message": {
+                "content": [
+                    {"type": "tool_result", "tool_use_id": "tu_1",
+                     "content": "myhost\n"},
+                ],
+            },
+        })
     finally:
-        cli_module.assemble_response_from_stream_json = pristine
-
+        cli_stream_logger_ctx.reset(token)
+
+    assert len(fake.tool_results) == 1
+    res = fake.tool_results[0]
+    assert res["tool_name"] == "Read"
+    assert res["tool_id"] == "tu_1"
+    assert res["result"] == "myhost\n"
+    assert res["is_error"] is False
+    assert isinstance(res["duration_ms"], int) and res["duration_ms"] >= 0
+
+
+def test_observability_handles_content_block_list_result_shape() -> None:
+    """Tool results can arrive as either a plain string OR a list of
+    ``{"type":"text","text":...}`` content blocks. The patch
+    flattens the list shape into a newline-joined string."""
+    from service.llm_patches import (
+        cli_stream_logger_ctx,
+        install_llm_patches,
+    )
+    install_llm_patches()
 
-def test_assembler_patch_installs_across_all_three_namespaces() -> None:
-    """Like the argv patch, the assembler function is re-exported in
-    three places. Patching only the source leaves
-    ``claude_code.assemble_response_from_stream_json`` (the actual
-    call site at line 203) pointed at the pristine function."""
-    pytest.importorskip("geny_executor.llm_client.claude_code")
     import importlib
-
-    from service.llm_patches import install_llm_patches
-
-    modules = [
-        importlib.import_module("geny_executor.llm_client.translators._cli"),
-        importlib.import_module("geny_executor.llm_client.translators"),
-        importlib.import_module("geny_executor.llm_client.claude_code"),
-    ]
-    pristines = [getattr(m, "assemble_response_from_stream_json") for m in modules]
+    cli_translator = importlib.import_module(
+        "geny_executor.llm_client.translators._cli"
+    )
+    accum = cli_translator.StreamJsonAccumulator(model="m")
+    fake = _FakeLogger()
+    token = cli_stream_logger_ctx.set(fake)
     try:
-        install_llm_patches()
-        wrapped = [getattr(m, "assemble_response_from_stream_json") for m in modules]
-        for fn in wrapped:
-            assert hasattr(fn, "_geny_assembler_error_patch_applied")
-        assert wrapped[0] is wrapped[1] is wrapped[2]
+        accum.feed({
+            "type": "assistant",
+            "message": {
+                "content": [
+                    {"type": "tool_use", "id": "tu_1", "name": "Bash",
+                     "input": {"command": "echo hi"}},
+                ],
+            },
+        })
+        accum.feed({
+            "type": "user",
+            "message": {
+                "content": [
+                    {"type": "tool_result", "tool_use_id": "tu_1",
+                     "content": [
+                         {"type": "text", "text": "hi"},
+                         {"type": "text", "text": "(exit 0)"},
+                     ]},
+                ],
+            },
+        })
     finally:
-        for m, original in zip(modules, pristines):
-            setattr(m, "assemble_response_from_stream_json", original)
+        cli_stream_logger_ctx.reset(token)
 
+    assert fake.tool_results[0]["result"] == "hi\n(exit 0)"
 
-# ── Original argv patch tests ─────────────────────────────────────────
 
-
-def test_install_does_not_modify_non_streaming(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    pytest.importorskip("geny_executor.llm_client.translators._cli")
+def test_observability_inert_without_active_context() -> None:
+    """If no ``cli_stream_logger_ctx`` is set (e.g. the accumulator is
+    driven outside a session-bound code path), the patch must
+    silently no-op rather than crash."""
     from service.llm_patches import install_llm_patches
-    from geny_executor.llm_client.translators import _cli
+    install_llm_patches()
 
-    pristine = _cli.claude_code_argv
-    try:
-        install_llm_patches()
-
-        class _Req:
-            stream = False  # non-streaming
-            model = "claude-opus-4-7"
-            system = ""
-            thinking = None
-            response_format = None
-            session_hint = None
-
-        argv = _cli.claude_code_argv(_Req())
-        # Non-streaming uses ``--output-format json`` — no verbose
-        # injection.
-        assert "--verbose" not in argv
-        of_idx = argv.index("--output-format")
-        assert argv[of_idx + 1] == "json"
-    finally:
-        _cli.claude_code_argv = pristine
+    import importlib
+    cli_translator = importlib.import_module(
+        "geny_executor.llm_client.translators._cli"
+    )
+    accum = cli_translator.StreamJsonAccumulator(model="m")
+    # No context set → ContextVar is the default ``None``.
+    accum.feed({
+        "type": "assistant",
+        "message": {
+            "content": [
+                {"type": "tool_use", "id": "tu_1", "name": "Bash",
+                 "input": {"command": "ls"}},
+            ],
+        },
+    })
+    # No exception is the assertion; the patch is a no-op when the
+    # ContextVar isn't set, so there's nothing observable to compare.