From 85ce1eb834a4d3679203a27816867c499b656ee6 Mon Sep 17 00:00:00 2001 From: CocoRoF Date: Sun, 17 May 2026 22:18:03 +0900 Subject: [PATCH] test(llm_client): copilot_cli conformance + bump v2.0.0 (CHANGELOG) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase C2 of the LLM backend upgrade — Phase A/B/C wrap-up for the executor side. Plugs CopilotCLIClient into the conformance harness, bumps the version to 2.0.0, and writes the matching CHANGELOG entry. tests/llm_client/conformance/test_copilot_cli.py (NEW) - 7 specialised cases on top of ConformanceTestSuite: capability shape (subprocess, no streaming/tools/structured_output/token_usage), basic_text_completion via fake gh, auth/not_installed error mapping, streaming-fallback (BaseClient default emits one message_complete event), binary_not_found. pyproject.toml / src/geny_executor/__init__.py - version: 1.21.0 → 2.0.0 (semver major: provider-location contract break, ProviderBackedClient removal). CHANGELOG.md - 2.0.0 entry covering Added / Changed / Removed plus migration notes for hosts moving off the legacy ``api_key=`` + ``strategies['provider']`` path. Full suite: 3205 passed, 8 skipped, 0 failed. ClientRegistry.available() now returns 6 providers; Phases A + B + C of the LLM backend upgrade are complete. The next phase (D) introduces multi-provider sub-agents; after that lands, PyPI publication of 2.0.0 unblocks Geny to consume the new contract. Plan reference: docs/llm-backend-upgrade-plan/07_rollout_phases.md (Phase C2). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 117 ++++++++++++++++++ pyproject.toml | 2 +- src/geny_executor/__init__.py | 2 +- .../conformance/test_copilot_cli.py | 111 +++++++++++++++++ 4 files changed, 230 insertions(+), 2 deletions(-) create mode 100644 tests/llm_client/conformance/test_copilot_cli.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cee0c9..aac2ebb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,123 @@ All notable changes to `geny-executor` are recorded here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and this project adheres to [Semantic Versioning](https://semver.org/). +## [2.0.0] — 2026-05-17 + +**Major release.** The LLM client layer is generalised to support every +"model-as-runner" backend behind a single capability-negotiating +contract — the four existing vendor APIs (Anthropic / OpenAI / Google / +vLLM) **plus two new CLI backends** (Claude Code, GitHub Copilot). The +silent-divergence provider-location bug is closed; all credential flow is +unified behind a single `CredentialBundle` channel. + +### Added + +- **`ClaudeCodeCLIClient`** (`llm_client.claude_code`) — subprocess-backed + client driving Anthropic's `claude` CLI. Streams via stream-json, drops + the fields the CLI doesn't accept, and propagates token usage / cost. + Capability flags advertise full feature coverage (thinking, tools, + structured_output, session_continuity, MCP passthrough, budget limit). +- **`CopilotCLIClient`** (`llm_client.copilot`) — subprocess-backed client + driving `gh copilot -p`. Plain stdout text only (no streaming, no + tools); honest capability flags reflect that. +- **`CredentialBundle` + `ProviderCredentials`** (`llm_client.credentials`) + — frozen dataclasses that carry per-provider credentials. `__repr__` + redacts api_key. `Pipeline.from_manifest{,_async}` now accepts + `credentials=` directly; `api_key=` remains a test/legacy convenience + that auto-wraps a single Anthropic key. 
+- **`Pipeline._build_client_for(provider)`** — single-point client + construction that honours `_resolve_llm_client`'s attach > config + resolution order. +- **`Stage.resolve_local_client(state)`** — per-stage `provider_override` + helper. The pipeline-wide client (built from Stage 6) is the default; + stages that set `config["provider_override"]` build their own client + from the same `CredentialBundle`. +- **`PipelineState.credentials`** — frozen bundle reference mirrored from + the pipeline so stages can build local clients. +- **Capability flags** (`ClientCapabilities` — 9 new fields): + `supports_structured_output`, `supports_session_continuity`, + `supports_mcp_passthrough`, `supports_budget_limit`, + `supports_token_usage`, `supports_cost_usage`, `is_subprocess`, + `requires_workspace`, `streaming_granularity`. Plus a `.supports(name)` + string-keyed lookup helper. +- **`APIRequest`** — `response_format` (json_schema/json_object) and + `session_hint` (vendor session id resume). +- **`TokenUsage`** — `cost_usd` and `duration_ms` with None-aware + aggregation in `__add__` / `__iadd__`. +- **`APIResponse.cost_usd`** — proxy property over `usage.cost_usd`. +- **`ErrorCategory`** — 5 new categories: `CLI_NOT_FOUND`, + `CLI_AUTH_FAILED`, `CLI_TIMEOUT`, `CLI_PROTOCOL_ERROR`, + `CLI_PERMISSION_DENIED`. New `is_fatal` property for unretryable + classes. +- **`llm_client._cli_runtime`** — async subprocess primitives shared by + the two CLI clients: `CLIProcessRunner` (shell=False, new session, + timeout + kill-tree), `scrub_env`, `parse_stream_json_line`, + `detect_binary`, `aiter_bytes`. POSIX `start_new_session=True` enables + safe `killpg` on cancellation. 
+- **`llm_client.translators._cli`** — canonical ↔ CLI helpers: + `claude_code_argv`, `thinking_to_effort`, `build_stream_json_stdin`, + `stream_json_line_to_canonical_event`, `parse_json_output_to_response`, + `assemble_response_from_stream_json`, `compose_copilot_prompt`, + `copilot_argv`, `parse_plain_text_to_response`. +- **`Pipeline._creds_to_client_kwargs(provider, creds)`** — per-provider + constructor-kwarg mapping. Includes `workspace_root → workspace_dir` + remap for Claude Code. +- **Manifest validator** — strict mode rejects `strategies['provider']` + and requires `config['provider']` on active Stage 6. +- **Conformance harness** (`tests/llm_client/conformance/`) — provider- + agnostic contract tests with `@capability` skip decorator. Six provider + modules (anthropic / openai / google / vllm / claude_code_cli / + copilot_cli) plug into the same suite. +- **Fake binaries** (`tests/_fixtures/`) — `fake_echo_cli`, `fake_claude`, + `fake_gh`. Drive scenarios via env vars so tests never touch a real + vendor service. + +### Changed + +- **`ClientRegistry.available()` returns 6 providers** (was 4). +- **`Pipeline.from_manifest{,_async}`** prefers `credentials=CredentialBundle`; + the legacy `api_key=` kwarg is retained but auto-wraps into a bundle. +- **`Pipeline._resolve_llm_client`** is single-source: attached client > + Stage 6 `config["provider"]` + bundle > None. The legacy + `ProviderBackedClient` auto-bridge fallback is gone. +- **`APIStage`** strategy-slot `"provider"` is removed. Only `retry` and + `router` remain. The stage reads its provider via + `config["provider"]`. Constructor still accepts a legacy + `APIProvider` instance for direct-construction test fixtures, wrapped + internally by `_LegacyProviderAdapter`. +- **`BaseClient._build_request`** also drops + emits `stop_sequences` + when the client lacks that capability. +- **`fork`-mode skill default runner** uses `AnthropicClient` directly + (was `ProviderBackedClient`). 
A subsequent point release rewires this + through `CredentialBundle` for multi-provider fork-mode (Phase D4 of + the LLM backend upgrade plan). +- Existing 4 providers (`anthropic` / `openai` / `google` / `vllm`) + declare all 16 capability flags explicitly with their honest values. + +### Removed + +- **`llm_client.bridge`** module (`ProviderBackedClient`). The inline + `_LegacyProviderAdapter` inside `APIStage` covers the one remaining + caller (test fixtures). +- The implicit `strategies["provider"]` slot on the manifest. Manifests + using the legacy location are rejected at strict load with `ConfigError`. +- The `_API_KEY_REQUIRING` set in `core.pipeline` (Stage 6 no longer + needs an `api_key` kwarg at instantiation time). + +### Migration notes for hosts + +- Replace `Pipeline.from_manifest_async(manifest, api_key=key, ...)` with + `Pipeline.from_manifest_async(manifest, credentials=CredentialBundle( + by_provider={"anthropic": ProviderCredentials(api_key=key), ...} + ), ...)`. The `api_key=` shape still works for one-provider Anthropic + setups but is now a thin convenience over the canonical channel. +- If your manifest writer set `stages[6].strategies["provider"]`, move + the value to `stages[6].config["provider"]`. Strict load will surface + the mistake. +- Don't import `geny_executor.llm_client.bridge.ProviderBackedClient` — + it's gone. The few legitimate consumers (fork-mode skill default + runner) have been switched. + ## [1.18.0] — 2026-05-05 Minor release. 
New `IndexHandle.list_categories` surface for diff --git a/pyproject.toml b/pyproject.toml index 8d082e0..e2a6ec1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "geny-executor" -version = "1.21.0" +version = "2.0.0" description = "Harness-engineered agent pipeline library with 21-stage dual-abstraction architecture, built on the Anthropic API" readme = "README.md" license = "MIT" diff --git a/src/geny_executor/__init__.py b/src/geny_executor/__init__.py index 32d480a..07a30a1 100644 --- a/src/geny_executor/__init__.py +++ b/src/geny_executor/__init__.py @@ -95,7 +95,7 @@ ProviderDrivenStrategy, ) -__version__ = "1.21.0" +__version__ = "2.0.0" __all__ = [ # Core diff --git a/tests/llm_client/conformance/test_copilot_cli.py b/tests/llm_client/conformance/test_copilot_cli.py new file mode 100644 index 0000000..ce96b4f --- /dev/null +++ b/tests/llm_client/conformance/test_copilot_cli.py @@ -0,0 +1,111 @@ +"""Copilot CLI provider conformance (Phase C2).""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "src")) + +import pytest + +from geny_executor.core.config import ModelConfig +from geny_executor.core.errors import APIError, ErrorCategory +from geny_executor.llm_client.copilot import CopilotCLIClient +from geny_executor.llm_client.base import BaseClient + +from tests.llm_client.conformance.harness import ConformanceTestSuite + + +FAKE_GH = str( + (Path(__file__).resolve().parents[2] / "_fixtures" / "fake_gh.py") +) + + +class TestCopilotCLIConformance(ConformanceTestSuite): + provider_name = "copilot_cli" + + def make_client( + self, + *, + mode="mocked", + scenario: str = "ok", + text: str | None = None, + ) -> BaseClient: + env_extras = {"FAKE_GH_SCENARIO": scenario} + if text is not None: + env_extras["FAKE_GH_TEXT"] = text + return CopilotCLIClient( + gh_binary_path=FAKE_GH, + 
timeout_s=5.0, + env_extras=env_extras, + ) + + # ---------------------------------------------------------------- shape + def test_is_subprocess(self) -> None: + c = self.make_client() + assert c.capabilities.is_subprocess is True + assert c.capabilities.requires_workspace is False + assert c.capabilities.streaming_granularity == "none" + + def test_lacks_streaming_and_tools(self) -> None: + c = self.make_client() + assert c.supports("streaming") is False + assert c.supports("tools") is False + assert c.supports("structured_output") is False + assert c.supports("token_usage") is False + + # ---------------------------------------------------------------- e2e + @pytest.mark.asyncio + async def test_basic_text_completion(self) -> None: + c = self.make_client(text="Hello!") + resp = await c.create_message( + model_config=ModelConfig(model="default"), + messages=[{"role": "user", "content": "say hi"}], + ) + assert resp.text == "Hello!" + assert resp.stop_reason == "end_turn" + + @pytest.mark.asyncio + async def test_translates_auth_error(self) -> None: + c = self.make_client(scenario="auth_fail") + with pytest.raises(APIError) as ei: + await c.create_message( + model_config=ModelConfig(model="default"), + messages=[{"role": "user", "content": "x"}], + ) + assert ei.value.category is ErrorCategory.CLI_AUTH_FAILED + + @pytest.mark.asyncio + async def test_translates_not_installed(self) -> None: + c = self.make_client(scenario="not_installed") + with pytest.raises(APIError) as ei: + await c.create_message( + model_config=ModelConfig(model="default"), + messages=[{"role": "user", "content": "x"}], + ) + assert ei.value.category is ErrorCategory.CLI_NOT_FOUND + + @pytest.mark.asyncio + async def test_streaming_falls_back(self) -> None: + """copilot_cli's supports_streaming=False → BaseClient default emits + one message_complete event.""" + c = self.make_client(text="streamed") + events = [] + async for evt in c.create_message_stream( + 
model_config=ModelConfig(model="default"), + messages=[{"role": "user", "content": "go"}], + ): + events.append(evt) + assert any(e.get("type") == "message_complete" for e in events) + + @pytest.mark.asyncio + async def test_binary_not_found_raises_cli_not_found(self) -> None: + c = CopilotCLIClient(gh_binary_path="/totally/missing/gh", timeout_s=2.0) + with pytest.raises(APIError) as ei: + await c.create_message( + model_config=ModelConfig(model="default"), + messages=[{"role": "user", "content": "x"}], + ) + assert ei.value.category is ErrorCategory.CLI_NOT_FOUND