From eecbef16d156509436717dc42986b001363bc5d0 Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Tue, 7 Apr 2026 18:11:53 +0900
Subject: [PATCH 01/14] =?UTF-8?q?feat:=20Layer=204=20RPC=20=ED=8C=A8?=
 =?UTF-8?q?=ED=84=B4=20=EA=B0=90=EC=A7=80=20+=20=EB=8F=99=EC=A0=81=20prefi?=
 =?UTF-8?q?x=20=EA=B0=90=EC=A7=80=20+=20UTF-8=20=EC=9D=B8=EC=BD=94?=
 =?UTF-8?q?=EB=94=A9=20fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- RPC-style API에서 verb-resource 기반 CRUD workflow 관계 감지 (Layer 4)
- DTO 타입 매칭으로 cross-controller COMPLEMENTARY 관계 감지
- _group_by_resource: 하드코딩 대신 동적 prefix threshold로 버전/라우팅 prefix 자동 스킵
- _detect_rpc_patterns를 _detect_rpc_crud_workflows + _detect_rpc_dto_links로 분리
- serialization.py: save_graph에 encoding="utf-8" 추가
---
 graph_tool_call/analyze/dependency.py | 257 +++++++++++++++++++++++++-
 graph_tool_call/serialization.py      |   2 +-
 2 files changed, 249 insertions(+), 10 deletions(-)

diff --git a/graph_tool_call/analyze/dependency.py b/graph_tool_call/analyze/dependency.py
index b6d42d7..552ebbe 100644
--- a/graph_tool_call/analyze/dependency.py
+++ b/graph_tool_call/analyze/dependency.py
@@ -79,6 +79,7 @@ def detect_dependencies(
     relations.extend(_detect_structural(tools, spec))
     relations.extend(_detect_name_based(tools))
     relations.extend(_detect_cross_resource(tools))
+    relations.extend(_detect_rpc_patterns(tools))
     relations = _deduplicate(relations)
     relations = [r for r in relations if r.confidence >= min_confidence]
     relations.sort(key=lambda r: r.confidence, reverse=True)
@@ -131,17 +132,59 @@ def _is_single_resource_path(path: str) -> bool:
 def _group_by_resource(tools: list[ToolSchema]) -> dict[str, list[ToolSchema]]:
     """Group tools that have ``method`` and ``path`` metadata by their base resource.
 
-    The base resource is the first non-param path segment (e.g. ``/pets``).
+    The base resource is the first *meaningful* non-param path segment.
+    A segment is considered a non-meaningful prefix when it covers more than
+    the ``_PREFIX_THRESHOLD`` fraction of all tools — this handles version prefixes
+    (``/v1``, ``/v2``), routing prefixes (``/api``, ``/rest``), etc. without
+    requiring a hardcoded list.
     """
+    _PREFIX_THRESHOLD = 0.4  # if a segment covers >40% of tools, it's a prefix
+
+    api_tools = [
+        t for t in tools
+        if t.metadata.get("path") and t.metadata.get("method")
+    ]
+    if not api_tools:
+        return {}
+
+    total = len(api_tools)
+
+    # Collect static segments per tool
+    tool_segments: list[tuple[ToolSchema, list[str]]] = []
+    for tool in api_tools:
+        segs = [s for s in tool.metadata["path"].split("/") if s and not s.startswith("{")]
+        tool_segments.append((tool, segs))
+
+    # Determine max depth to scan for prefixes (usually 1-2 levels)
+    max_depth = max((len(segs) for _, segs in tool_segments), default=1)
+
+    # Find how many prefix levels to skip:
+    # walk from depth 0 and keep skipping while the segment at that depth
+    # covers >threshold of all tools
+    skip_depth = 0
+    for depth in range(min(max_depth, 4)):  # cap at 4 to avoid pathological cases
+        counter: dict[str, int] = {}
+        for _, segs in tool_segments:
+            if depth < len(segs):
+                counter.setdefault(segs[depth], 0)
+                counter[segs[depth]] += 1
+        if not counter:
+            break
+        most_common_count = max(counter.values())
+        if most_common_count / total > _PREFIX_THRESHOLD:
+            skip_depth = depth + 1
+        else:
+            break
+
+    # Group by the segment at skip_depth
     groups: dict[str, list[ToolSchema]] = {}
-    for tool in tools:
-        path = tool.metadata.get("path")
-        method = tool.metadata.get("method")
-        if not path or not method:
-            continue
-        # base resource = first static segment of the path
-        segments = [s for s in path.split("/") if s and not s.startswith("{")]
-        base = "/" + segments[0] if segments else "/"
+    for tool, segs in tool_segments:
+        if skip_depth < len(segs):
+            base = "/" + segs[skip_depth]
+        elif segs:
+            base = "/" + segs[-1]
+        else:
+            base = "/"
         groups.setdefault(base, []).append(tool)
     return groups
 
@@ -611,6 +654,202 @@ def _detect_cross_resource(tools: list[ToolSchema]) -> list[DetectedRelation]:
     return relations
 
 
+# ---------------------------------------------------------------------------
+# Layer 4: RPC-style method name & DTO pattern detection
+# ---------------------------------------------------------------------------
+
+# Maps leading verb in an RPC method name to a CRUD intent category.
+_VERB_TO_INTENT: dict[str, str] = {
+    # read
+    "get": "read", "find": "read", "fetch": "read", "list": "read",
+    "search": "read", "select": "read", "load": "read", "read": "read",
+    "download": "read",
+    # write (create)
+    "save": "write", "create": "write", "add": "write", "insert": "write",
+    "register": "write", "regist": "write",
+    # update
+    "modify": "update", "update": "update", "edit": "update",
+    "change": "update", "patch": "update",
+    # delete
+    "delete": "delete", "remove": "delete", "cancel": "delete",
+    "withdraw": "delete",
+    # action (side-effect operations)
+    "process": "action", "execute": "action", "apply": "action",
+    "approve": "action", "reject": "action", "confirm": "action",
+    "accept": "action", "send": "action", "upload": "action",
+    "export": "action",
+}
+
+# Trailing tokens in method names that describe the *view*, not the resource.
+_NAME_SUFFIXES: frozenset[str] = frozenset({
+    "list", "detail", "details", "info", "count", "excel", "popup",
+    "summary", "check", "data", "total", "all", "page", "download",
+})
+
+# Common DTO class-name suffixes that are not part of the resource identity.
+_DTO_SUFFIXES: frozenset[str] = frozenset({
+    "request", "response", "dto", "entity", "info", "base",
+    "api", "vo", "model", "form", "param", "result", "ml",
+})
+
+# CRUD workflow rules: (source_intent, target_intent, relation, same_ctrl_conf, cross_ctrl_conf)
+# ``None`` for cross_ctrl_conf means the rule is skipped across controllers.
+_WORKFLOW_RULES: list[tuple[str, str, RelationType, float, float | None]] = [
+    ("read",   "write",  RelationType.REQUIRES, 0.9,  0.8),
+    ("update", "read",   RelationType.REQUIRES, 0.85, 0.75),
+    ("delete", "read",   RelationType.REQUIRES, 0.85, 0.75),
+    ("action", "read",   RelationType.REQUIRES, 0.75, None),
+]
+
+
+def _same_controller(a: ToolSchema, b: ToolSchema) -> bool:
+    """Return True if both tools belong to the same (non-empty) controller."""
+    ctrl_a = a.metadata.get("controller") or ""
+    ctrl_b = b.metadata.get("controller") or ""
+    return ctrl_a == ctrl_b != ""
+
+
+def _extract_verb_and_resource(name: str) -> tuple[str, str]:
+    """Extract (verb, resource) from an RPC-style method name.
+
+    ``getGoodsList`` → ``("get", "goods")``
+    ``saveOptionCategoryList`` → ``("save", "optioncategory")``
+    """
+    tokens = _normalize_name(name)
+    if not tokens:
+        return "", ""
+
+    verb = ""
+    resource_start = 0
+    for i, tok in enumerate(tokens):
+        if tok in _VERB_TO_INTENT:
+            verb = tok
+            resource_start = i + 1
+            break
+
+    resource = "".join(t for t in tokens[resource_start:] if t not in _NAME_SUFFIXES)
+    return verb, resource
+
+
+def _extract_dto_resource(type_name: str | None) -> str:
+    """Extract the resource root from a DTO class name.
+
+    ``GoodsMgmtApiResponse`` → ``goodsmgmt``
+    ``ClaimTargetRequest``   → ``claimtarget``
+    """
+    if not type_name:
+        return ""
+    tokens = _normalize_name(type_name)
+    return "".join(t for t in tokens if t not in _DTO_SUFFIXES)
+
+
+def _detect_rpc_patterns(tools: list[ToolSchema]) -> list[DetectedRelation]:
+    """Detect relations for RPC-style APIs (Layer 4).
+
+    Handles non-RESTful endpoints (e.g. ``/v1/goods/goodsMgmtApi/getGoodsList``)
+    where structural path analysis is ineffective.
+
+    Two strategies:
+      1. **Verb-resource grouping** — methods sharing the same resource token
+         form CRUD workflows with controller-scoped confidence.
+      2. **DTO type matching** — methods sharing a request/response type across
+         controllers are marked COMPLEMENTARY.
+    """
+    relations: list[DetectedRelation] = []
+    relations.extend(_detect_rpc_crud_workflows(tools))
+    relations.extend(_detect_rpc_dto_links(tools))
+    return relations
+
+
+def _detect_rpc_crud_workflows(tools: list[ToolSchema]) -> list[DetectedRelation]:
+    """Build CRUD workflow relations from verb-resource analysis."""
+    relations: list[DetectedRelation] = []
+
+    # Group tools by extracted resource token.
+    resource_groups: dict[str, list[tuple[str, ToolSchema]]] = {}
+    for tool in tools:
+        verb, resource = _extract_verb_and_resource(tool.name)
+        if verb and resource:
+            resource_groups.setdefault(resource, []).append((verb, tool))
+
+    for resource, members in resource_groups.items():
+        if len(members) < 2:
+            continue
+
+        # Classify members by CRUD intent.
+        by_intent: dict[str, list[ToolSchema]] = {}
+        for verb, tool in members:
+            intent = _VERB_TO_INTENT.get(verb, "other")
+            by_intent.setdefault(intent, []).append(tool)
+
+        # Apply workflow rules.
+        for src_intent, tgt_intent, rel_type, same_conf, cross_conf in _WORKFLOW_RULES:
+            for src in by_intent.get(src_intent, []):
+                for tgt in by_intent.get(tgt_intent, []):
+                    if src.name == tgt.name:
+                        continue
+                    same = _same_controller(src, tgt)
+                    if not same and cross_conf is None:
+                        continue
+                    relations.append(DetectedRelation(
+                        source=src.name,
+                        target=tgt.name,
+                        relation_type=rel_type,
+                        confidence=same_conf if same else cross_conf,  # type: ignore[arg-type]
+                        evidence=(
+                            f"{src.name} ({src_intent}) → {tgt.name} ({tgt_intent})"
+                            f" — resource '{resource}'"
+                        ),
+                        layer=4,
+                    ))
+
+        # Readers within same controller are SIMILAR_TO.
+        readers = by_intent.get("read", [])
+        for i, r1 in enumerate(readers):
+            for r2 in readers[i + 1:]:
+                if r1.name != r2.name and _same_controller(r1, r2):
+                    relations.append(DetectedRelation(
+                        source=r1.name,
+                        target=r2.name,
+                        relation_type=RelationType.SIMILAR_TO,
+                        confidence=0.8,
+                        evidence=f"{r1.name} ↔ {r2.name} — similar reads for '{resource}'",
+                        layer=4,
+                    ))
+
+    return relations
+
+
+def _detect_rpc_dto_links(tools: list[ToolSchema]) -> list[DetectedRelation]:
+    """Link tools that share a DTO type across controllers (COMPLEMENTARY)."""
+    relations: list[DetectedRelation] = []
+
+    # Group tools by normalised DTO resource name.
+    dto_groups: dict[str, list[ToolSchema]] = {}
+    for tool in tools:
+        for type_name in (tool.metadata.get("request_type"), tool.metadata.get("response_type")):
+            dto_res = _extract_dto_resource(type_name)
+            if len(dto_res) >= 4:
+                dto_groups.setdefault(dto_res, []).append(tool)
+
+    for dto_res, members in dto_groups.items():
+        if not 2 <= len(members) <= 20:
+            continue
+        for i, a in enumerate(members):
+            for b in members[i + 1:]:
+                if a.name != b.name and not _same_controller(a, b):
+                    relations.append(DetectedRelation(
+                        source=a.name,
+                        target=b.name,
+                        relation_type=RelationType.COMPLEMENTARY,
+                        confidence=0.75,
+                        evidence=f"{a.name} ↔ {b.name} — shared DTO '{dto_res}'",
+                        layer=4,
+                    ))
+
+    return relations
+
+
 # ---------------------------------------------------------------------------
 # De-duplication
 # ---------------------------------------------------------------------------
diff --git a/graph_tool_call/serialization.py b/graph_tool_call/serialization.py
index cfa56ea..cac1c00 100644
--- a/graph_tool_call/serialization.py
+++ b/graph_tool_call/serialization.py
@@ -52,7 +52,7 @@ def save_graph(
     path = Path(path)
     try:
         path.parent.mkdir(parents=True, exist_ok=True)
-        path.write_text(json.dumps(data, indent=2, ensure_ascii=False, default=str))
+        path.write_text(json.dumps(data, indent=2, ensure_ascii=False, default=str), encoding="utf-8")
     except PermissionError:
         msg = f"Permission denied: {path}. Check directory permissions."
         raise PermissionError(msg) from None

From fd113089ecc8240fb7aea5fcdd877099be9a7ce7 Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Fri, 24 Apr 2026 10:38:01 +0900
Subject: [PATCH 02/14] =?UTF-8?q?feat:=20plan-and-execute=20=EC=95=84?=
 =?UTF-8?q?=ED=82=A4=ED=85=8D=EC=B2=98=20=EA=B8=B0=EB=B0=98=20=EB=A0=88?=
 =?UTF-8?q?=EC=9D=B4=EC=96=B4=20(L0=20+=20Stage=203)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase A — L0 Knowledge Base
- graph_tool_call.ingest.io_contract: swagger schema → produces/consumes
  leaf 필드 결정론적 추출
- graph_tool_call.ontology.llm_provider: enrich_tool_semantics 메서드
  (per-tool semantic 주석), ToolEnrichment / FieldSemantic / PairHint
  dataclass, max_tokens 명시, reference_tools 분리
- graph_tool_call.tool_graph: search_tools gateway 가 prerequisites /
  relations 를 LLM 에 노출 (retrieve_with_scores 사용)

Phase B — Stage 3 Plan Runner
- graph_tool_call.plan: Plan / PlanStep / ExecutionTrace 스키마
- graph_tool_call.plan.binding: BindingResolver (dotted + [N] + native
  type 보존)
- graph_tool_call.plan.runner: PlanRunner (streaming + non-streaming)
- tests: 29/29 pass (binding 21 + runner 8)

docs/architecture-plan-and-execute.md: 5-layer 설계 문서.
---
 docs/architecture-plan-and-execute.md    | 830 +++++++++++++++++++++++
 graph_tool_call/execute/http_executor.py |   7 +-
 graph_tool_call/ingest/io_contract.py    | 345 ++++++++++
 graph_tool_call/ingest/openapi.py        | 108 ++-
 graph_tool_call/langchain/gateway.py     | 101 ++-
 graph_tool_call/net.py                   |  31 +-
 graph_tool_call/ontology/llm_provider.py | 242 ++++++-
 graph_tool_call/plan/__init__.py         |  62 ++
 graph_tool_call/plan/binding.py          | 165 +++++
 graph_tool_call/plan/runner.py           | 342 ++++++++++
 graph_tool_call/plan/schema.py           |  80 +++
 graph_tool_call/tool_graph.py            | 215 +++++-
 12 files changed, 2489 insertions(+), 39 deletions(-)
 create mode 100644 docs/architecture-plan-and-execute.md
 create mode 100644 graph_tool_call/ingest/io_contract.py
 create mode 100644 graph_tool_call/plan/__init__.py
 create mode 100644 graph_tool_call/plan/binding.py
 create mode 100644 graph_tool_call/plan/runner.py
 create mode 100644 graph_tool_call/plan/schema.py

diff --git a/docs/architecture-plan-and-execute.md b/docs/architecture-plan-and-execute.md
new file mode 100644
index 0000000..caca509
--- /dev/null
+++ b/docs/architecture-plan-and-execute.md
@@ -0,0 +1,830 @@
+# Plan-and-Execute Architecture
+
+> 작성: 2026-04-22, 업데이트: 2026-04-23
+> 상태: 확정 (설계) / 부분 구현 (L0 Knowledge Base, Stage 3 Runner)
+> 범위: graph-tool-call 라이브러리 + xgen-workflow 통합
+
+## 변경 이력
+
+- **2026-04-23**: 설계 간소화
+  - Ingest 시 embedding + Qdrant 저장 **삭제** (YAGNI). Field 이름 exact match 로 충분, cross-field synonym 은 LLM enrichment 가 해결
+  - L0 에 **LLM per-tool enrichment (Pass 2)** 도입. graph-tool-call 이 이미 보유한 `OntologyLLM` 추상화 활용
+  - Stage 1 retrieval 은 기존 BM25 + graph (graph-tool-call retrieval) 재사용. embedding prefilter 생략
+  - Knowledge Base 가 **두 층** 으로 명확화: (A) 결정론적 파서 / (B) LLM semantic enrichment
+
+---
+
+## 0. 한 쪽 요약
+
+**문제:** 현재 LLM-as-orchestrator (ReAct) 는 요청당 15 iteration × ~15KB context = **지연 약 30초, 누적 context 약 225KB**. 비용·지연·품질 모두 구조적 한계.
+
+**해결:** **사전 지식 (graph + schemas + ingest 시 LLM 의미 주석)** 을 최대한 활용하고, runtime LLM 은 자연어 ↔ 구조 변환에만 사용하는 **5-layer 아키텍처** (L0 Knowledge Base + Stage 1~4 Runtime).
+
+**기대 효과:**
+- LLM 호출 15 → 2~3회
+- Context 225KB → ~2~3KB (**~75배 감소**)
+- Latency 30초 → 2~5초 (**~10배 개선**)
+- 실행 단계 재현성, 감사 가능성 확보
+- 확장 축 확보 (fan-out, template, interactive)
+
+---
+
+## 1. 설계 원칙
+
+| # | 원칙 | 의미 |
+|---|---|---|
+| 1 | 사전 지식 최대 활용 | graph, schemas, IO contract, LLM 의미 주석은 offline 구축 후 영속. 요청 처리 시 재계산 금지 |
+| 2 | LLM 은 semantic bridge 에만 | 자연어 이해 / 의미 추출 / 자연어 생성 — 그 외 결정론 |
+| 3 | 결정 가능한 것은 결정론적으로 | 매칭·순서·바인딩은 알고리즘. LLM 폴백은 **실패한 결정론의 보완** |
+| 4 | 각 단계는 독립 입출력 계약 | 테스트·캐싱·디버깅·부분 교체 가능 |
+| 5 | 하드코딩은 "학습된 지식" 으로 대체 | synonym → embedding cluster, verb → intent classifier |
+| 6 | Failure mode 관측 가능 | 어느 stage 에서 왜 실패했는지 항상 명확해야 함 |
+
+---
+
+## 2. 시스템 개요
+
+```
+╔═══════════════════════════════════════════════════════════════╗
+║                    OFFLINE / INGEST TIME                      ║
+║  ┌─────────────────────────────────────────────────────────┐ ║
+║  │ L0. KNOWLEDGE BASE                                       │ ║
+║  │                                                          │ ║
+║  │  Swagger → ToolSchema + Tool Embeddings +                │ ║
+║  │            IO Contract + Tool Graph                      │ ║
+║  │                                                          │ ║
+║  │  저장: api_tool_collections.graph (JSONB)                 │ ║
+║  │       api_tool_collections.embeddings (pgvector)         │ ║
+║  │       api_tool_collections.io_contracts (JSONB)          │ ║
+║  └─────────────────────────────────────────────────────────┘ ║
+╚═══════════════════════════════════════════════════════════════╝
+                            │
+                            ▼ (요청 도착)
+╔═══════════════════════════════════════════════════════════════╗
+║                    REQUEST TIME PIPELINE                      ║
+║                                                               ║
+║  requirement (자연어)                                          ║
+║     │                                                         ║
+║     ▼                                                         ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 1. RETRIEVAL + TARGET SELECTION                 │    ║
+║  │  (a) BM25+graph prefilter: 108 → top-20               │    ║
+║  │  (b) LLM pick: 20개 catalog → target + entities       │    ║
+║  │  context: ~1KB  │  LLM: 1회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║                   ▼                                           ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 2. PATH SYNTHESIZER                             │    ║
+║  │  (결정론) target 의 consumes → IO Contract 역추적      │    ║
+║  │          → DAG 구성 + argument bindings                │    ║
+║  │  context: —     │  LLM: 0회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║         ┌─────────┴─────────┐                                 ║
+║         │                   │                                 ║
+║    확정 plan           모호 (2+ 경로)                          ║
+║         │                   │                                 ║
+║         │                   ▼                                 ║
+║         │      ┌────────────────────────────────────────┐    ║
+║         │      │ (조건부) DISAMBIGUATION                 │    ║
+║         │      │  context: ~2KB (후보만) │ LLM: 1회       │    ║
+║         │      └────────────┬───────────────────────────┘    ║
+║         │                   │                                 ║
+║         └───────────────────┘                                 ║
+║                   │                                           ║
+║                   ▼                                           ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 3. RUNNER                                       │    ║
+║  │  (결정론) DAG topological 실행                         │    ║
+║  │          JsonPath 치환 + tool_executor HTTP           │    ║
+║  │          step 단위 streaming event                     │    ║
+║  │  context: —     │  LLM: 0회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║                   ▼                                           ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 4. RESPONSE SYNTHESIS                           │    ║
+║  │  execution trace (요약) → 자연어 응답                   │    ║
+║  │  context: ~1KB  │  LLM: 1회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║                   ▼                                           ║
+║                최종 답변                                        ║
+╚═══════════════════════════════════════════════════════════════╝
+```
+
+**일반 케이스 예산:** LLM 2회, context ~2KB, 2~4초.
+**모호 케이스:** LLM 3회, context ~4KB, 4~6초.
+
+---
+
+## 3. L0 — Knowledge Base
+
+ingest 1회. 영속 저장. 요청 처리에서 재계산 금지.
+
+**두 층 구조:**
+- **Pass 1 — Deterministic parser**: Swagger 의 구조적 사실 (schema, HTTP, dependency) 추출. LLM 금지.
+- **Pass 2 — Semantic enrichment**: Description 등을 LLM 이 읽고 의미 주석 (언제 써, 무엇을 내놓는다, 누구와 쌍을 이룬다). graph-tool-call 의 `OntologyLLM` 추상화 재사용.
+
+### 3.1 ToolSchema (Pass 1, 기존 확장)
+
+기존 `tools` 테이블. 추가 필드는 아래 섹션들이 채움.
+
+| 필드 | 설명 | 출처 |
+|---|---|---|
+| `function_id` | 컬렉션 범위 고유 slug | 파서 |
+| `function_name` | 원본 operationId | 파서 |
+| `description` | summary + description + tags | 파서 |
+| `api_url`, `api_method`, `api_header`, `api_body` | 실행용 | 파서 |
+| `metadata` | method/path/base_url/tags/response_schema/controller/request_type/response_type | 파서 |
+| `ai_metadata` | canonical_action, primary_resource, when_to_use, pairs_well_with 등 | **Pass 2 (LLM)** |
+
+### 3.2 IO Contract (Pass 1, 결정론)
+
+각 tool 의 **필드 수준 produces/consumes** 를 swagger schema 에서 기계적으로 추출.
+
+**저장:** 신규 테이블 `tool_io_contracts`:
+```sql
+CREATE TABLE tool_io_contracts (
+  tool_id          VARCHAR(100) REFERENCES tools(function_id),
+  direction        VARCHAR(10)  CHECK (direction IN ('produces', 'consumes')),
+  json_path        TEXT,         -- $.body.goods[*].goodsNo  (produces)
+                                 -- goodsNo                   (consumes)
+  field_name       VARCHAR(100), -- goodsNo
+  field_type       VARCHAR(40),  -- integer, string, object
+  required         BOOLEAN,      -- consumes 에 한함
+  semantic_tag     VARCHAR(80)   -- Pass 2 LLM 이 채움 (빈 값 허용)
+);
+```
+
+**추출 프로세스 (LLM 없음):**
+```
+for each tool in schemas:
+  request_leaves  = walk_schema_leaves(tool.request_schema)
+  response_leaves = walk_schema_leaves(tool.response_schema)
+  
+  for each leaf in request_leaves:
+    insert consumes (field_name, type, required)
+  
+  for each leaf in response_leaves:
+    insert produces (json_path, field_name, type)
+```
+
+**1차 매칭: exact field name + type** — 동일 swagger 내 field 이름 규약 보통 일관. 이걸로 대부분의 엣지 생성.
+
+```python
+# 결정론적 field match edge
+for A in tools:
+  for p in A.produces:
+    for B in tools:
+      if A == B: continue
+      for c in B.consumes.required:
+        if p.field_name == c.field_name and p.type == c.type:
+          graph.add_edge(A, B, "produces_for",
+                         binding={c.field_name: p.json_path})
+```
+
+### 3.3 Semantic Enrichment (Pass 2, LLM)
+
+**목적:** Description 등의 비정형 정보를 LLM 이 해석해 의미 주석 추가. 하드코딩된 verb 사전 / synonym 테이블 **완전 대체**.
+
+**인프라:** graph-tool-call 에 이미 있는 `OntologyLLM` 활용 ([graph_tool_call/ontology/llm_provider.py](graph_tool_call/ontology/llm_provider.py)).
+
+**이미 제공되는 메서드:**
+- `infer_relations(tools)` — LLM 기반 관계 추론
+- `suggest_categories(tools)` — 카테고리 그룹핑
+- `verify_relations(relations, tools)` — 휴리스틱 엣지 검증 / 거르기
+- `suggest_missing(tools, existing)` — 빠진 엣지 제안
+- `enrich_keywords(tools)` — BM25 향상용 키워드
+- `generate_example_queries(tools)` — 임베딩 매칭용 예시 쿼리
+
+**신규 메서드 (추가 구현):**
+```python
+class OntologyLLM:
+    def enrich_tool_semantics(
+        self, tools: list[ToolSummary], batch_size: int = 10,
+    ) -> dict[str, ToolEnrichment]:
+        """Per-tool 의미 주석 (action, resource, use-when, semantic tags, pairs)."""
+```
+
+**ToolEnrichment 스키마:**
+```typescript
+type ToolEnrichment = {
+  canonical_action: "search" | "read" | "create" | "update" | "delete" | "action";
+  primary_resource: string;                 // 정규화 리소스명 (예: "product")
+  one_line_summary: string;                 // 한 줄 요약 (Stage 1 catalog 용)
+  when_to_use: string;                      // 언제 쓰는지
+  when_not_to_use?: string;                 // 쓰면 안 되는 경우
+  produces_semantics: Array<{               // 의미 태깅된 produces
+    semantic: string;                       // "product_id" 같은 canonical
+    json_path: string;                      // 실제 경로
+  }>;
+  consumes_semantics: Array<{
+    semantic: string;
+    field: string;
+  }>;
+  pairs_well_with: Array<{                  // 함께 / 순서대로 쓰이는 도구들
+    tool: string;
+    reason: string;
+  }>;
+}
+```
+
+**Prompt 예시:**
+```
+You are annotating an API tool for a planning system.
+
+Tool: seltSearchProduct
+Summary: 상품 검색
+Description: 키워드로 상품을 검색하는 API입니다. ...
+HTTP: GET /v1/search/product
+Request fields: [searchWord, langCd, siteNo, sort, ...]
+Response fields: [$.body.goods[*].goodsNo, $.body.goods[*].goodsName, ...]
+
+Produce JSON with:
+- canonical_action (search|read|create|update|delete|action)
+- primary_resource (one word like "product", "order", "user")
+- one_line_summary (Korean, within 40 chars)
+- when_to_use (1~2 sentences)
+- produces_semantics: map internal field names to semantic ids like "product_id"
+- pairs_well_with: 2~3 related tools with brief reason
+
+Output JSON only. 
+```
+
+**저장:**
+- `tools.ai_metadata` JSONB 컬럼 (전체 enrichment 덤프)
+- `tool_io_contracts.semantic_tag` (produces_semantics / consumes_semantics 의 semantic 을 해당 row 에 매핑)
+
+**재실행 조건:** swagger 변경, LLM 모델 업그레이드, 관리자 강제 재생성. 일상 요청 처리와 **분리**.
+
+### 3.4 Tool Graph (재정의)
+
+엣지 타입:
+
+| 엣지 | 근거 | 신뢰도 | 용도 |
+|---|---|---|---|
+| `produces_for` (exact) | Pass 1 — field name + type 일치 | high | Stage 2 주 신호 |
+| `produces_for` (semantic) | Pass 2 — `semantic_tag` 일치 | medium | Pass 1 이 못 잡는 교차 명명 (cross-collection 등) |
+| `pairs_with` | Pass 2 — `pairs_well_with` 에서 | medium | Stage 1 catalog 힌트, Stage 2 보조 |
+| `similar_to` | 구조적 (같은 controller / tag / CRUD 역할) | low | Disambiguation 후보 확장 |
+| `precedes` | 구조적 (POST → GET single 등) | low | 레거시 엣지, 보조 힌트 |
+
+**기존 하드코딩 반응성 패치 (selt, synonym clusters, *No/*Seq heuristic, search-bridge exception) 는 Pass 2 완성 시 모두 제거.** Pass 1 field exact match + Pass 2 LLM enrichment 가 그 역할을 대체.
+
+### 3.5 Ingest 파이프라인
+
+```python
+# xgen-workflow 측
+def ingest_collection(collection_id, spec_source, llm_config):
+    from graph_tool_call.ontology.llm_provider import wrap_llm
+    from graph_tool_call.ingest.openapi import parse_operations
+    
+    # Pass 1: 결정론
+    schemas = parse_operations(spec_source)
+    io_contracts = extract_io_contracts(schemas)          # 3.2
+    graph = build_structural_edges(schemas, io_contracts) # 3.4
+    
+    # Pass 2: LLM (옵션)
+    if llm_config.enabled:
+        llm = wrap_llm(build_llm_spec(llm_config))
+        enrichments = llm.enrich_tool_semantics(schemas)
+        apply_semantic_tags(io_contracts, enrichments)    # semantic_tag 채움
+        graph = augment_with_semantic_edges(graph, enrichments)
+    
+    store_all(schemas, io_contracts, graph, enrichments)
+```
+
+**옵션:** Pass 2 는 `llm_config.enabled=False` 로 **생략 가능**. Pass 1 만으로도 기본 동작은 가능 (품질은 낮음).
+
+### 3.6 xgen-workflow 통합
+
+xgen 은 이미 agent 노드에서 provider/model/api_key 선택 지원. Ingest 시에도 동일 config 재사용:
+
+```python
+# xgen-workflow: api_tool_collection/service.py
+def refresh_with_enrichment(collection_id, llm_settings):
+    llm_spec = f"{llm_settings.provider}/{llm_settings.model}"  
+    # "openai/gpt-4.1-mini"
+    
+    # api_key 는 env 또는 xgen secret store 에서
+    os.environ["OPENAI_API_KEY"] = xgen_secret.get(user_id, "openai")
+    
+    ingest_collection(collection_id, spec_source, LLMConfig(
+        enabled=True,
+        spec=llm_spec,
+    ))
+```
+
+graph-tool-call 은 xgen 에 의존하지 않음. xgen 이 config 주는 쪽, graph-tool-call 이 받는 쪽.
+
+---
+
+## 4. Stage 1 — Retrieval + Target Selection
+
+**입력:** `requirement: str`
+
+**출력:**
+```json
+{
+  "target": "seltProductDetailInfo",
+  "confidence": 0.92,
+  "entities": {
+    "keyword": "quarzen 티셔츠",
+    "locale": "ko"
+  },
+  "output_shape": "single",
+  "reasoning": "..."
+}
+```
+
+### 4.1 알고리즘
+
+**(a) Retrieval prefilter (결정론):** graph-tool-call 의 기존 `retrieve_with_scores()` 그대로 사용.
+```python
+candidates = tg.retrieve_with_scores(requirement, top_k=20)
+# BM25 + graph + (optional) annotation 채널
+```
+embedding prefilter 는 생략. 기존 BM25 + graph 가 top-20 recall 을 충분히 내는 것을 실측으로 확인 (x2bee `"product search"` → `seltSearchProduct` top-10 안에 들어옴).
+
+향후 recall 부족 증거가 나오면 embedding 채널을 **그때** 연결. 지금은 YAGNI.
+
+**(b) LLM structured pick:**
+- 20개의 catalog 에 **ai_metadata 포함**:
+  ```
+  {
+    function_name,
+    description[:80],
+    one_line_summary,       // Pass 2 에서 생성
+    when_to_use,            // Pass 2
+    pairs_well_with         // Pass 2 (이름만)
+  }
+  ```
+- system prompt: "고른 target 1개와 추출한 entities 를 반환"
+- OpenAI structured output (JSON schema 강제)
+
+**context 크기:** 20 × 200자 ≈ 4KB (ai_metadata 포함 확장). ai_metadata 없을 땐 20 × 100자 ≈ 2KB.
+
+### 4.2 오류 처리
+
+- Retrieval 이 top-20 모두 low score 면 → "적합한 도구 없음" 에러. 사용자 재질의 유도.
+- LLM 이 JSON schema 위반 시 → 1회 retry. 실패하면 fallback: retrieval top-1 결과로 진행 (entities 는 빈 dict).
+
+### 4.3 Stage 1 의 성능 지표
+- Target 정확도 (샘플 요구사항 N개에 대해 "맞는 target 선정" 비율)
+- Entity 추출 재현율
+- LLM 응답 latency p50/p95
+
+---
+
+## 5. Stage 2 — Path Synthesizer
+
+**입력:** Stage 1 output (`target`, `entities`)
+**출력:** Plan (Plan 스키마는 §9 참조) OR "ambiguous" 플래그 (Disambiguation 발동)
+
+### 5.1 DAG 구성 알고리즘 (Bottom-up)
+
+```python
+def synthesize(target, entities, collection_defaults):
+    plan = {"steps": [], "output_binding": None}
+    context = entities | collection_defaults   # 이미 아는 값들
+    
+    needed = target.consumes.required_only()   # 필수 입력만 먼저
+    resolved = {}                              # {field: source_step_id}
+    pending = list(needed)
+    visited = set()
+    
+    while pending:
+        field = pending.pop(0)
+        if field.semantic_tag in available_tags(context, resolved):
+            resolved[field.name] = bind_from_available(field, context, resolved)
+            continue
+        
+        # graph 에서 이 semantic 을 produces 하는 tool 찾기
+        producers = graph.producers_of(field.semantic_tag)
+        if not producers:
+            raise UnsatisfiableFieldError(field)
+        
+        # 후보 여러 개면 "ambiguous" 로 분기 (§6 Disambiguation LLM)
+        if len(producers) > 1 and not strictly_better(producers):
+            return AmbiguousPlan(target, candidates=producers)
+        
+        # prerequisite 추가 (재귀)
+        producer = producers[0]
+        if producer.name in visited:
+            raise CyclicDependencyError
+        visited.add(producer.name)
+        
+        step = build_step(producer)
+        plan.steps.insert(0, step)  # 앞쪽에 삽입 (위상 순서)
+        
+        # producer 의 consumes 를 다시 확인
+        pending.extend(producer.consumes.required_only())
+    
+    # target 을 마지막 step 으로 추가
+    plan.steps.append(build_step(target, bindings=resolved))
+    plan.output_binding = f"$.{target.step_id}.body"
+    
+    return plan
+```
+
+### 5.2 "strictly_better" 판단
+
+여러 producer 후보 중:
+- IO Contract confidence 높은 순
+- 경로 짧은 순 (재귀 depth)
+- similar_to weight 높은 순 (requirement 와 가까운)
+- 모두 비슷하면 → Ambiguous 플래그
+
+### 5.3 초기 버전 범위
+
+- **선형 chain** (각 step 1회 호출): 지원
+- **다중 참조** (한 step 이 이전 N개 step 의 출력 조합): 지원
+- **Fan-out** (배열 전체 loop): **초기 범위 밖** — §11.1 확장 포인트
+- **조건 분기** (if/else): **초기 범위 밖**
+
+### 5.4 실패 경로
+
+| 케이스 | 반환 |
+|---|---|
+| 필수 field 해소 불가 | `UnsatisfiableFieldError` — Stage 4 에 그대로 reveal |
+| 순환 의존 | `CyclicDependencyError` — 보고 |
+| 복수 경로 | `AmbiguousPlan` — Disambiguation 발동 |
+
+---
+
+## 6. Disambiguation (조건부)
+
+**발동 조건:** Stage 2 가 `AmbiguousPlan` 반환.
+
+**입력:** 후보 경로 2~N개 각각의 요약
+```
+후보 A: seltSearchProduct → seltProductDetailInfo
+후보 B: getCategoryList → seltSearchProduct → seltProductDetailInfo
+```
+
+**LLM 호출:**
+- system: "요구사항에 가장 맞는 경로 1개를 고르고 이유를 설명"
+- user: requirement + 후보 경로 설명
+- structured output: `{"chosen": "A", "reason": "..."}`
+
+**context:** ~2KB
+
+---
+
+## 7. Stage 3 — Runner
+
+**입력:** 확정 Plan
+
+**동작:**
+```python
+async def run(plan: Plan):
+    context = {}                              # step_id → result
+    trace = ExecutionTrace(plan=plan)
+    
+    for step in topological_order(plan.steps):
+        resolved_args = resolve_bindings(step.args, context)
+        
+        trace.emit("step.start", step_id=step.id, args=resolved_args)
+        
+        try:
+            result = await tool_executor.execute(
+                function_id=step.tool_function_id,
+                args=resolved_args,
+                timeout=(step.timeout_ms or 30_000) / 1000,  # PlanStep.timeout_ms (§9.2) → 초 단위, 기본 30초
+            )
+        except ToolExecutionError as e:
+            trace.emit("step.error", step_id=step.id, error=str(e))
+            return trace.fail(step.id, e)
+        
+        context[step.id] = result
+        trace.emit("step.done", step_id=step.id, output_preview=preview(result))
+    
+    final = jsonpath_extract(context, plan.output_binding)
+    trace.emit("plan.done", output=final)
+    return trace.success(final)
+```
+
+### 7.1 Argument 바인딩 치환
+
+바인딩 syntax: `${step_id.json_path}` — JsonPath 표준 사용 (jsonpath-ng 라이브러리).
+
+```
+args = {"goodsNo": "${s1.body.goods[0].goodsNo}",
+        "langCd": "ko"}
+context = {"s1": {"body": {"goods": [{"goodsNo": 12345, ...}]}}}
+→ resolved = {"goodsNo": 12345, "langCd": "ko"}
+```
+
+### 7.2 에러 / 재시도 정책 (초기 버전)
+
+| 에러 유형 | 동작 |
+|---|---|
+| HTTP 4xx | fail fast, trace 에 응답 body 포함 |
+| HTTP 5xx | 최대 2회 재시도 (exponential backoff) |
+| 타임아웃 | fail fast |
+| JsonPath 미스 | fail fast — "step sX 의 bindings 가 실제 응답 구조와 불일치: [list of missing paths]" |
+| Schema 검증 실패 | fail fast |
+
+**재계획 (re-plan) 은 v1 범위 밖.** 실패 시 Stage 4 가 사용자에게 설명.
+
+### 7.3 스트리밍
+
+각 step 단위로 이벤트 emit. UI 는 step 단위 진행 상황 표시.
+
+---
+
+## 8. Stage 4 — Response Synthesis
+
+**입력:** requirement + ExecutionTrace
+
+**동작:**
+```python
+def synthesize_response(requirement, trace):
+    if trace.success:
+        # 최종 output 의 관련 필드만 추림 (schema-aware projection)
+        relevant = project_relevant_fields(trace.output, requirement)
+        prompt = f"""
+        요구사항: {requirement}
+        실행 결과 요약: {relevant}
+        사용자에게 자연스럽게 답변.
+        """
+    else:
+        prompt = f"""
+        요구사항: {requirement}
+        실행 중 실패: step={trace.failed_step}, 이유={trace.error}
+        부분 결과: {trace.partial_results}
+        사용자에게 무엇이 됐고 무엇이 안 됐는지 설명.
+        """
+    return llm.complete(prompt)
+```
+
+**context:** 요약된 결과 기준 ~1KB. 전체 response 를 그대로 넘기지 않음 — `project_relevant_fields` 가 requirement 에 관련된 필드만 추림.
+
+---
+
+## 9. 핵심 데이터 계약
+
+### 9.1 Intent Schema (Stage 1 출력)
+
+```typescript
+type Intent = {
+  target: string;                    // function_name
+  confidence: number;                // 0.0 ~ 1.0
+  entities: Record<string, any>;     // {keyword: "...", locale: "ko", ...}
+  output_shape: "single" | "list" | "count";
+  reasoning?: string;                // 디버그용
+}
+```
+
+### 9.2 Plan Schema (Stage 2 출력)
+
+```typescript
+type Plan = {
+  id: string;                         // uuid (캐시 키 포함)
+  goal: string;                       // Intent 의 요약
+  steps: PlanStep[];
+  output_binding: string;             // JsonPath "$.s2.body" 등
+  metadata: {
+    created_at: string;
+    target: string;
+    disambiguation_used: boolean;
+  };
+}
+
+type PlanStep = {
+  id: string;                         // "s1", "s2", ...
+  tool: string;                       // function_name
+  tool_function_id: string;           // DB 룩업용 slug
+  args: Record<string, string>;       // {"goodsNo": "${s1.body.goods[0].goodsNo}", ...}
+  timeout_ms?: number;
+  retryable?: boolean;
+  rationale?: string;                 // "검색 결과로 goodsNo 획득"
+}
+```
+
+### 9.3 ExecutionTrace Schema (Stage 3 출력)
+
+```typescript
+type ExecutionTrace = {
+  plan_id: string;
+  success: boolean;
+  steps: StepTrace[];
+  output?: any;                       // 성공 시
+  failed_step?: string;               // 실패 시
+  error?: ErrorDetail;                // 실패 시
+  duration_ms: number;
+  started_at: string;
+  ended_at: string;
+}
+
+type StepTrace = {
+  id: string;
+  tool: string;
+  args: Record<string, any>;          // resolved (바인딩 치환 후)
+  output?: any;
+  error?: ErrorDetail;
+  duration_ms: number;
+  retries: number;
+}
+```
+
+---
+
+## 10. 하드코딩 제거 매핑표
+
+| 현 하드코딩 | 제거 방법 | 대체 메커니즘 |
+|---|---|---|
+| `_SYNONYM_CLUSTERS` (goods↔product) | 제거 | Pass 2 `primary_resource` + `semantic_tag` (LLM per-tool enrichment) |
+| `selt`, `sel` verb 특수 케이스 | 제거 | Pass 2 `canonical_action` (LLM 이 context 읽고 분류) |
+| `*Id/*No/*Seq` 접미사 heuristic | 제거 | Pass 1 field name + type exact match (동일 swagger 안에선 충분) + 필요시 Pass 2 semantic_tag |
+| `search-bridge` 예외 | 제거 | Pass 2 `pairs_well_with` + `canonical_action = search` |
+| `_is_single_resource_path` 필터 | 제거 | IO Contract 의 produces/consumes 가 판단 |
+| `_VERB_TO_INTENT` CRUD 사전 | **유지** (Pass 1 fallback) | Pass 2 가 LLM 으로 action 태깅 담당. Pass 2 생략 시 이 사전이 fallback |
+
+---
+
+## 11. 확장 포인트
+
+### 11.1 Fan-out (foreach)
+
+**시나리오:** "카트의 모든 상품 상세 보여줘"
+
+**Plan schema 확장:**
+```typescript
+type PlanStep = {
+  // ... 기존 필드
+  foreach?: {
+    source: string;                 // "${s1.body.items[*]}"
+    item_alias: string;             // "item"
+  };
+  // args 안에서 `${item.goodsNo}` 참조 가능
+}
+```
+
+**Runner 확장:** foreach step 은 N회 호출 후 결과를 배열로 묶어 context 에 저장.
+
+### 11.2 조건 분기 (if/else)
+
+**Plan schema 확장:** step 에 `condition` 필드 (JsonPath 기반 부울 식). Runner 가 evaluate 후 skip/execute.
+
+### 11.3 Workflow Template Library
+
+- 성공한 Plan 을 `workflow_templates` 테이블에 승격
+- 새 requirement → embedding 기반 template match → 재사용
+- Stage 1~2 skip 가능 → 더 빠름
+- Intent 유사 판정 임계값 튜닝 필요
+
+### 11.4 Interactive Refinement
+
+- Runner 가 특정 step 에서 `user_input_required` 이벤트 발행
+- UI 가 사용자에게 선택지 제시
+- 응답 받아 Runner 재개 (suspend/resume)
+- 민감 액션 (결제, 삭제) 에 필수
+
+### 11.5 Self-healing Re-plan
+
+- Runner 실패 시 ExecutionTrace + 에러를 Stage 1~2 에 다시 넘겨 1회 re-plan
+- 예: "빈 배열 반환 → 검색 키워드 재조정" 같은 케이스
+
+---
+
+## 12. 마이그레이션
+
+### 12.1 기존 자산 활용
+
+- `graph_tool_call.analyze.dependency.detect_dependencies`: **유지**. IO Contract 가 못 잡는 구조적 엣지는 여전히 여기서. 단 반응성 패치 (`selt`, `_SYNONYM_CLUSTERS`, `*No/*Seq`, `search-bridge`) 는 Pass 2 enrichment 정착 시 **단계적 제거**.
+- `graph_tool_call.retrieval`: **유지**. Stage 1 의 prefilter 로 그대로 활용 (BM25 + graph).
+- `graph_tool_call.ontology.llm_provider`: **유지**. Pass 2 enrichment 의 `enrich_tool_semantics` 메서드 추가.
+- `tool_executor.execute_collection_tool`: **유지**. Stage 3 Runner 가 호출.
+- `APICollectionLoader` Canvas 노드: **유지** (그래프 + ai_metadata 로드 역할).
+- `Agent Xgen` 노드: **유지** (범용 ReAct / 일반 채팅 용도). API collection 시나리오에 쓰일 땐 `Agent Planflow` 로 대체 권장.
+
+### 12.2 Canvas 노드 구성 변경
+
+```
+기존:  Input → APICollectionLoader → Agent Xgen → Output
+신규:  Input → APICollectionLoader → Agent Planflow → Output
+              (graph/ai_metadata/io_contracts 로드)  (Stage 1~4 통합)
+```
+
+`Agent Planflow` 내부 구조:
+```
+┌── Stage 1: retrieval + target pick  (LLM 1회)
+├── Stage 2: path synthesizer           (결정론, DAG)
+├── (conditional) disambiguation        (LLM 조건부)
+├── Stage 3: runner (streaming)          (결정론, HTTP)
+└── Stage 4: response synthesis          (LLM 1회, streaming)
+```
+
+설정 UI 는 `Agent Xgen` 과 공용 컴포넌트 재사용 (provider/model/api_key/temperature/max_tokens). 전용 파라미터 (`enable_disambiguation`, `max_plan_steps`) 만 추가.
+
+### 12.3 점진 마이그레이션 전략
+
+1. **Phase A:** L0 Knowledge Base 구축 — IO Contract 추출 (결정론) + `OntologyLLM.enrich_tool_semantics` 메서드 추가. 기존 graph 와 공존.
+2. **Phase B:** Stage 3 Runner 독립 구현 (plan fixture 로 단위 테스트).
+3. **Phase C:** Stage 2 Path Synthesizer — DAG + exact field match + semantic_tag 보강.
+4. **Phase D:** Stage 1 + 4 LLM 호출 구현 (structured output). 기존 `retrieve_with_scores` 를 Stage 1 prefilter 로 연결.
+5. **Phase E:** Canvas 노드 `Agent Planflow` 개발. 설정 UI 는 `Agent Xgen` 컴포넌트 재사용.
+6. **Phase F:** 평가 세트로 A/B 측정. 안정화 후 기존 반응성 패치 (`selt`, synonym 등) 제거.
+
+---
+
+## 13. 운영 리스크 및 완화
+
+| 리스크 | 영향 | 완화 |
+|---|---|---|
+| IO Contract semantic_tag 오태깅 | Stage 2 가 틀린 path 생성 | ingest 시 LLM 태깅 → 관리자 UI 검수/오버라이드 |
+| Stage 1 target 오선정 | 전혀 다른 도구 실행 | confidence threshold → 낮으면 disambiguation 강제 |
+| Stage 2 Ambiguous 빈발 | 매 요청 LLM 추가 호출 | IO Contract 개선으로 장기적으로 완화. 초기엔 허용 |
+| Runner JsonPath miss | 실행 실패 | plan validate 단계에서 response schema 와 bindings 교차 검증 (Stage 2 출력 직후) |
+| HTTP 외부 장애 | 사용자 체감 실패 | retry + 명확한 trace + Stage 4 에서 "일부 성공/실패" 구분 |
+| Embedding API 비용 | ingest 비용↑ | v1 은 embedding 미사용 (§16 결정 3) 이므로 해당 없음. 추후 도입 시: ingest 시 1회만, 요청당 embed 는 requirement 1회만 |
+| LLM structured output 깨짐 | Stage 1 파싱 실패 | 1회 retry → 실패 시 retrieval top-1 결과 fallback (v1: BM25 + graph 점수 기준) |
+
+---
+
+## 14. 측정 지표 (성공 기준)
+
+### 14.1 성능
+
+- Latency p50 / p95 (목표: p50 ≤ 3s, p95 ≤ 6s)
+- LLM 호출 수 / 요청 (목표: ≤ 2.5 평균)
+- Context 총량 / 요청 (목표: ≤ 3KB 평균)
+
+### 14.2 품질
+
+평가 세트: 요구사항 20~50개 (각 collection 당).
+
+- **Stage 1 target 정확도:** 고른 target 이 사람 판단과 일치하는 비율
+- **Stage 2 path 정확도:** 생성된 plan 이 유효한 실행 시퀀스인 비율
+- **End-to-end 성공률:** 사용자 요구사항 → 의미 있는 답변까지 성공한 비율
+- **Ambiguity rate:** Disambiguation 발동 빈도 (낮을수록 graph 품질 좋음)
+
+### 14.3 비용
+
+- OpenAI 토큰 소비 / 요청 (입력/출력 분리)
+- Embedding 호출 수 (ingest + 요청별 1회) — v1 은 embedding 미사용 (§16 결정 3) 이므로 도입 이후부터 측정
+
+### 14.4 감사성
+
+- 모든 Plan artifact 조회 가능
+- 실패 시 failed_step + error + partial_results 복원 가능
+
+---
+
+## 15. 비전과의 정합성
+
+사용자가 그린 그림:
+
+> Swagger → tool list 정의 → 사전 graph 관계 구축 →
+> 워크플로우에서 컬렉션 노드 연결 + 요구사항 입력 →
+> 필요한 API 들 찾아 req/res 세팅 후 순서대로 호출 → 결과 반환
+
+이 아키텍처의 대응:
+
+| 사용자 의도 | 이 설계에서 |
+|---|---|
+| "사전 graph 관계 구축" | L0 Knowledge Base (Pass 1 구조적 + Pass 2 LLM 의미 주석) |
+| "요구사항 입력" | Stage 1 입력 |
+| "필요한 API 찾기" | Stage 1 (retrieval + target pick) + Stage 2 (DAG 구성) |
+| "req/res 세팅" | Stage 2 의 argument bindings (exact field match + semantic_tag) |
+| "순서대로 호출" | Stage 3 Runner (DAG topological) |
+| "결과 반환" | Stage 4 Response Synthesis |
+
+**정합성 완전.** LLM 은 의미 해석이 필요한 지점에만 최소한으로 사용:
+- **Ingest 시 Pass 2** — description 을 읽고 의미 주석 (1회, 영속 저장)
+- **Runtime Stage 1** — 사용자 자연어 → target tool + entities (Stage 2 가 `AmbiguousPlan` 을 반환한 경우에만 §6 Disambiguation LLM 1회 추가)
+- **Runtime Stage 4** — 실행 결과 → 자연어 응답
+
+Request/response schema 는 LLM 이 일절 건드리지 않음 (swagger 가 source of truth).
+
+---
+
+## 16. 결정 사항
+
+### 해결된 항목 (2026-04-23)
+
+| # | 주제 | 결정 | 근거 |
+|---|---|---|---|
+| 1 | Field semantic 매칭 방식 | **Pass 1 exact match (기본) + Pass 2 LLM semantic_tag (보강)**. embedding clustering 불필요 | 동일 swagger 안에선 field 이름 일관. cross-convention 은 LLM 이 해결 |
+| 2 | LLM 모델 선택 | **xgen agent 노드 config 재사용**. Stage 1/4 는 사용자 노드 설정 상속. Pass 2 는 컬렉션별 별도 설정 (기본 gpt-4.1-mini) | UX 일관성, 기존 provider/key 관리 재사용 |
+| 3 | Ingest embedding 모델 | **사용 안 함 (v1)**. 필요시 `text-embedding-3-small` 추후 연결 | BM25 + graph 가 Stage 1 top-20 recall 확보 (실측) |
+| 4 | Plan / ExecutionTrace 영속성 | **로그 기반 (DB 테이블 없음)**. 구조화 JSON 이벤트로 plan 생명주기 기록 | YAGNI. 필요 기능 (history UI, template auto-promotion) 생길 때 해당 테이블 추가 |
+| 5 | Canvas 노드 구성 | **신규 노드 `Agent Planflow`**. `Agent Xgen` 은 유지 (범용 ReAct), `Agent Planflow` 는 API collection 전용 Plan-and-Execute. 설정 UI 공용화 (provider/model/key) | 기존 자산 유지 + 특화 경로 분리. 코드 간결성 |
+| 6 | Plan 실행 범위 (v1) | **선형 chain 만**. Fan-out / 조건 분기 / parallel / re-plan 은 v2+. Plan schema 는 optional 필드로 **확장 가능하게 설계** | v1 목표 (30s→5s + 정확도) 는 선형으로 달성. 복잡 케이스는 사용자에게 명시적 에러 |
+
+### 미결 항목
+
+모두 해결됨 (2026-04-23).
+
+---
+
+## 17. 참고 문서
+
+- [pathfinder-plan.md](./pathfinder-plan.md) — 기존 로드맵 (이 문서 확정 후 섹션 3.7 업데이트 필요)
+- [pathfinder-bug-analysis.md](./pathfinder-bug-analysis.md) — ingest 파이프라인 과거 이슈
+- [xgen-ai-chat-architecture.md](./xgen-ai-chat-architecture.md) — AI chat / 사이드패널 / canvas 통합
+
+---
diff --git a/graph_tool_call/execute/http_executor.py b/graph_tool_call/execute/http_executor.py
index 32859fa..55e5126 100644
--- a/graph_tool_call/execute/http_executor.py
+++ b/graph_tool_call/execute/http_executor.py
@@ -77,7 +77,12 @@ def build_request(
         for k, v in path_params.items():
             path = path.replace(f"{{{k}}}", urllib.parse.quote(str(v), safe=""))
 
-        url = f"{self._base_url}{path}"
+        # tool 자체 base_url(spec.servers 유래)이 있으면 그쪽 우선 — 한 컬렉션에
+        # 다른 호스트(common/product/member 등)의 source가 섞여 있을 때 source별
+        # 호스트로 라우팅한다. 없으면 executor 기본 base_url 사용.
+        tool_base = (metadata.get("base_url") or "").rstrip("/")
+        base = tool_base or self._base_url
+        url = f"{base}{path}"
         if query_params:
             url += "?" + urllib.parse.urlencode(query_params, doseq=True)
 
diff --git a/graph_tool_call/ingest/io_contract.py b/graph_tool_call/ingest/io_contract.py
new file mode 100644
index 0000000..1768a47
--- /dev/null
+++ b/graph_tool_call/ingest/io_contract.py
@@ -0,0 +1,345 @@
+"""Field-level IO contract extraction from OpenAPI / Swagger schemas.
+
+Used by L0 Knowledge Base — **Pass 1, deterministic**. Walks request and
+response schemas and emits leaf field descriptors with JsonPath. The output
+feeds:
+
+  - Tool Graph: produces × consumes field-name match → ``produces_for`` edge
+  - Pass 2 enrichment: provides field list to LLM for ``semantic_tag`` assign
+  - Stage 3 Runner: bindings reference these json_paths
+
+This module assumes the input schema is **already $ref-resolved** (caller
+runs ``_resolve_refs`` from ``graph_tool_call.ingest.openapi``).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class FieldLeaf:
+    """A leaf field extracted from a JSON Schema.
+
+    ``json_path`` is the dotted JSONPath from the schema root, with ``[*]``
+    used as the array wildcard (for produces). For consumes, callers usually
+    flatten to ``field_name`` because bindings are keyed by name, not by path.
+    """
+
+    # Dotted path from the schema root (``$``). Consume leaves built in this
+    # module store the bare field name here instead of a path.
+    json_path: str
+    # Trailing property / parameter name of the leaf.
+    field_name: str
+    # JSON Schema type string. Builders in this module fall back to "string"
+    # for untyped primitives and "object" for opaque containers.
+    field_type: str
+    # Whether the field is listed as required by its immediate parent.
+    required: bool = False
+    # Schema description; builders in this module truncate it to 200 chars.
+    description: str = ""
+    # Allowed values copied from the schema's ``enum``; empty when absent.
+    enum: list[Any] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Schema walker
+# ---------------------------------------------------------------------------
+
+
+_DEFAULT_MAX_DEPTH = 8
+
+
+def extract_leaves(
+    schema: Any,
+    *,
+    base_path: str = "$",
+    parent_required: bool = False,
+    max_depth: int = _DEFAULT_MAX_DEPTH,
+    _depth: int = 0,
+) -> list[FieldLeaf]:
+    """Collect the leaf fields reachable from a JSON Schema subtree.
+
+    Parameters
+    ----------
+    schema:
+        JSON Schema dict, expected to be $ref-resolved already. Anything that
+        is not a dict yields no leaves.
+    base_path:
+        JSONPath of the subtree being walked (``$`` for the root).
+    parent_required:
+        Required flag of the field that holds this subtree; it is copied onto
+        a primitive leaf and forwarded unchanged through array items.
+    max_depth:
+        Recursion cut-off. Once ``_depth`` exceeds it the walk returns nothing,
+        which also bounds cyclic or pathologically nested schemas.
+
+    Returns
+    -------
+    list[FieldLeaf]
+        One descriptor per primitive (or array-of-primitive) leaf.
+    """
+    if _depth > max_depth or not isinstance(schema, dict):
+        return []
+
+    flattened = _resolve_combinators(schema)
+    kind = _normalize_type(flattened.get("type"))
+
+    # Objects (declared, or implied by a ``properties`` key) fan out per property.
+    if kind == "object" or "properties" in flattened:
+        return _walk_object(flattened, base_path, max_depth, _depth)
+
+    # Arrays descend into their item schema under a ``[*]`` wildcard.
+    if kind == "array":
+        element_schema = flattened.get("items") or {}
+        return extract_leaves(
+            element_schema,
+            base_path=base_path + "[*]",
+            parent_required=parent_required,
+            max_depth=max_depth,
+            _depth=_depth + 1,
+        )
+
+    # Anything else is a primitive named after the last path segment.
+    name = _last_path_segment(base_path)
+    if not name:
+        # A bare primitive at the root has no name to bind to.
+        return []
+
+    leaf = FieldLeaf(
+        json_path=base_path,
+        field_name=name,
+        field_type=kind or "string",
+        required=parent_required,
+        description=str(flattened.get("description") or "")[:200],
+        enum=list(flattened.get("enum") or []),
+    )
+    return [leaf]
+
+
+def _walk_object(
+    schema: dict[str, Any],
+    base_path: str,
+    max_depth: int,
+    depth: int,
+) -> list[FieldLeaf]:
+    """Walk every property of an object schema and gather its leaves.
+
+    A property whose own walk yields nothing (for example an object without
+    declared properties) is still reported as a single generic leaf, so that
+    downstream consumers know the field exists.
+    """
+    props = schema.get("properties") or {}
+    if not isinstance(props, dict):
+        return []
+    required_names = set(schema.get("required") or [])
+
+    collected: list[FieldLeaf] = []
+    for name, sub_schema in props.items():
+        path = f"{base_path}.{name}"
+        required = name in required_names
+        nested = extract_leaves(
+            sub_schema,
+            base_path=path,
+            parent_required=required,
+            max_depth=max_depth,
+            _depth=depth + 1,
+        )
+        if nested:
+            collected.extend(nested)
+            continue
+
+        # Nothing resolvable underneath: emit an opaque placeholder leaf.
+        if isinstance(sub_schema, dict):
+            summary = str(sub_schema.get("description") or "")[:200]
+        else:
+            summary = ""
+        placeholder = FieldLeaf(
+            json_path=path,
+            field_name=name,
+            field_type=_schema_type(sub_schema) or "object",
+            required=required,
+            description=summary,
+        )
+        collected.append(placeholder)
+    return collected
+
+
+def _resolve_combinators(schema: dict[str, Any]) -> dict[str, Any]:
+    """Flatten ``allOf`` / pick first ``oneOf`` / ``anyOf``.
+
+    v1 strategy: best-effort. Doesn't handle JSON Schema combinator semantics
+    fully — sufficient to surface field shapes for our planning use.
+
+    ``allOf``: properties and required lists of all members are merged into
+    the parent. Other member keys (``type``, ``enum``, ``items``,
+    ``description`` ...) are inherited when the parent does not define them,
+    so the common ``allOf: [<primitive or enum schema>]`` wrapper keeps its
+    real type instead of degrading to an empty object. The result is forced
+    to ``type: "object"`` only when properties were merged or no type is
+    known at all.
+
+    ``oneOf`` / ``anyOf``: the first dict candidate is used as a base and the
+    parent's own keys override it.
+
+    Returns the input unchanged when no combinator applies; otherwise a new
+    shallow-copied dict (the input is never mutated).
+    """
+    members = schema.get("allOf")
+    if isinstance(members, list):
+        merged_props: dict[str, Any] = dict(schema.get("properties") or {})
+        merged_required: list[str] = list(schema.get("required") or [])
+        inherited: dict[str, Any] = {}
+        for sub in members:
+            if not isinstance(sub, dict):
+                continue
+            merged_props.update(sub.get("properties") or {})
+            for r in sub.get("required") or []:
+                if r not in merged_required:
+                    merged_required.append(r)
+            # First member to declare a key wins; parent keys win over all.
+            for key, value in sub.items():
+                if key not in ("properties", "required"):
+                    inherited.setdefault(key, value)
+
+        out = {**inherited, **schema}
+        declared = out.get("type")
+        is_primitive = not merged_props and declared not in (None, "", "object")
+        if is_primitive:
+            # Keep the member's real type. Drop an empty ``properties`` key so
+            # callers that test for its presence don't treat this as an object.
+            out.pop("properties", None)
+            return out
+
+        out["type"] = "object"
+        out["properties"] = merged_props
+        out["required"] = merged_required
+        return out
+
+    for key in ("oneOf", "anyOf"):
+        candidates = schema.get(key)
+        if isinstance(candidates, list) and candidates:
+            first = next((c for c in candidates if isinstance(c, dict)), None)
+            if first is not None:
+                # Merge the candidate as a base, parent fields override
+                base = dict(first)
+                base.update({k: v for k, v in schema.items() if k != key})
+                return base
+    return schema
+
+
+def _normalize_type(t: Any) -> str:
+    """Reduce a JSON Schema ``type`` value to a single type name.
+
+    A list such as ``["null", "integer"]`` yields its first truthy entry that
+    is not ``"null"``; a scalar is returned as is. Missing or empty input
+    yields ``""``.
+    """
+    if not isinstance(t, list):
+        return t if t else ""
+    for candidate in t:
+        if candidate and candidate != "null":
+            return candidate
+    return ""
+
+
+def _schema_type(schema: Any) -> str:
+    """Return the normalized ``type`` of *schema*, or ``""`` if it is not a dict."""
+    return _normalize_type(schema.get("type")) if isinstance(schema, dict) else ""
+
+
+def _last_path_segment(path: str) -> str:
+    """Extract trailing field name from a JsonPath like ``$.body.goods[*].goodsNo``.
+
+    Every trailing ``[*]`` wildcard is removed, so nested arrays such as
+    ``$.matrix[*][*]`` resolve to ``matrix``. Paths that carry no field name
+    (empty, ``$``, or a root-level array like ``$[*]``) yield ``""``.
+    """
+    if not path:
+        return ""
+    last = path.rsplit(".", 1)[-1]
+    # Arrays of arrays append one wildcard per nesting level.
+    while last.endswith("[*]"):
+        last = last[:-3]
+    # What remains of a root-only path is the ``$`` marker, not a field name.
+    return "" if last == "$" else last
+
+
+# ---------------------------------------------------------------------------
+# Operation-level extraction (combines body + parameters)
+# ---------------------------------------------------------------------------
+
+
+def extract_produces_for_operation(
+    operation: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> list[FieldLeaf]:
+    """Return the leaf fields (with JsonPath) of the operation's success response.
+
+    Yields an empty list when no response schema can be found.
+    """
+    schema = _pick_response_schema(operation, is_swagger2=is_swagger2)
+    return extract_leaves(schema, base_path="$") if schema else []
+
+
+def extract_consumes_for_operation(
+    operation: dict[str, Any],
+    path_item: dict[str, Any] | None = None,
+    *,
+    is_swagger2: bool = False,
+    required_only: bool = True,
+) -> list[FieldLeaf]:
+    """Combine query/path/header parameters and request body into a flat
+    consume list.
+
+    Body fields are flattened to field-name level (the LLM-visible name) —
+    binding keys by name in Stage 2/3, not by nested path. The original
+    nested structure for HTTP injection is handled separately via the
+    existing ``leaf_path_map`` mechanism on the tool row.
+
+    Parameters
+    ----------
+    operation:
+        The operation object (already $ref-resolved).
+    path_item:
+        Enclosing path item; its ``parameters`` are used unless the operation
+        redefines the same ``(name, in)`` pair.
+    is_swagger2:
+        Selects Swagger 2.0 conventions (inline ``type``, ``in: body``).
+    required_only:
+        When true, optional parameters and optional body leaves are dropped.
+
+    Returns
+    -------
+    list[FieldLeaf]
+        One entry per distinct field name; parameters take precedence over
+        body fields of the same name.
+    """
+    leaves: list[FieldLeaf] = []
+    seen_names: set[str] = set()
+    # (name, location) pairs already decided. Operation-level parameters are
+    # listed first, so recording a pair *before* the required filter makes an
+    # optional operation-level override shadow the path-level definition
+    # instead of letting the stale path-level one through.
+    declared: set[tuple[Any, Any]] = set()
+
+    # query / path / header parameters
+    all_params = (operation.get("parameters") or []) + (
+        (path_item or {}).get("parameters") or []
+    )
+    for p in all_params:
+        if not isinstance(p, dict) or "name" not in p:
+            continue
+        loc = p.get("in")
+        if loc not in ("query", "path", "header"):
+            continue
+        key = (p["name"], loc)
+        if key in declared:
+            continue
+        declared.add(key)
+        # Path parameters default to required when the flag is omitted.
+        is_required = bool(p.get("required", loc == "path"))
+        if required_only and not is_required:
+            continue
+        if is_swagger2:
+            ftype = p.get("type") or "string"
+        else:
+            ftype = _schema_type(p.get("schema") or {}) or "string"
+        if p["name"] in seen_names:
+            continue
+        seen_names.add(p["name"])
+        leaves.append(
+            FieldLeaf(
+                json_path=p["name"],  # flat for consumes
+                field_name=p["name"],
+                field_type=ftype,
+                required=is_required,
+                description=str(p.get("description") or "")[:200],
+            )
+        )
+
+    # request body (flattened)
+    body_schema = _pick_request_body_schema(operation, is_swagger2=is_swagger2)
+    if body_schema:
+        for leaf in extract_leaves(body_schema, base_path="$"):
+            if required_only and not leaf.required:
+                continue
+            if leaf.field_name in seen_names:
+                continue
+            seen_names.add(leaf.field_name)
+            leaves.append(
+                FieldLeaf(
+                    json_path=leaf.field_name,  # flat for consumes
+                    field_name=leaf.field_name,
+                    field_type=leaf.field_type,
+                    required=leaf.required,
+                    description=leaf.description,
+                    enum=leaf.enum,
+                )
+            )
+
+    return leaves
+
+
+def _pick_response_schema(
+    operation: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> dict[str, Any] | None:
+    """Return the schema of the operation's success response, if any.
+
+    Responses are tried in the order ``200``, ``201``, any other ``2xx``
+    (including the ``2XX`` range key), then ``default``. Status keys are
+    compared as strings so integer keys produced by YAML loaders match too.
+
+    A response is read structurally: a Swagger 2.0 style ``schema`` key wins,
+    otherwise the OpenAPI 3.x ``content`` map is searched, preferring
+    ``application/json``, then any JSON-flavoured media type, then the
+    remaining media types in declaration order. A response that declares no
+    usable schema does not stop the search; the next status code is tried.
+
+    ``is_swagger2`` is accepted for signature symmetry with the other
+    pickers; detection here is structural, so it is not consulted.
+    """
+    responses = operation.get("responses") or {}
+    if not isinstance(responses, dict):
+        return None
+
+    by_code = {str(code): resp for code, resp in responses.items()}
+    preferred = ("200", "201")
+    other_success = sorted(
+        code
+        for code in by_code
+        if code not in preferred and len(code) == 3 and code.startswith("2")
+    )
+
+    for code in (*preferred, *other_success, "default"):
+        resp = by_code.get(code)
+        if not isinstance(resp, dict):
+            continue
+        # Swagger 2.0
+        schema = resp.get("schema")
+        if isinstance(schema, dict):
+            return schema
+        # OpenAPI 3.x
+        content = resp.get("content")
+        if not isinstance(content, dict):
+            continue
+        media_types = [m for m in content if isinstance(content[m], dict)]
+        # Stable sort: exact JSON first, JSON-like next, declared order otherwise.
+        media_types.sort(
+            key=lambda m: (m != "application/json", "json" not in str(m).lower())
+        )
+        for media_type in media_types:
+            schema = content[media_type].get("schema")
+            if isinstance(schema, dict):
+                return schema
+    return None
+
+
+def _pick_request_body_schema(
+    operation: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> dict[str, Any] | None:
+    """Locate the request-body schema of an operation.
+
+    Swagger 2.0: the ``schema`` of the first ``in: body`` parameter.
+    OpenAPI 3.x: the ``application/json`` schema when present, otherwise the
+    schema of the first declared media type. ``None`` when nothing applies.
+    """
+    if is_swagger2:
+        body_param = next(
+            (
+                p
+                for p in operation.get("parameters") or []
+                if isinstance(p, dict) and p.get("in") == "body"
+            ),
+            None,
+        )
+        return None if body_param is None else body_param.get("schema")
+
+    content = (operation.get("requestBody") or {}).get("content") or {}
+    if "application/json" in content:
+        return content["application/json"].get("schema")
+
+    # No JSON entry: fall back to whichever media type is declared first.
+    first = next(iter(content.values()), None)
+    return first.get("schema") if isinstance(first, dict) else None
+
+
+__all__ = [
+    "FieldLeaf",
+    "extract_leaves",
+    "extract_produces_for_operation",
+    "extract_consumes_for_operation",
+]
diff --git a/graph_tool_call/ingest/openapi.py b/graph_tool_call/ingest/openapi.py
index 90399dd..41ffe7e 100644
--- a/graph_tool_call/ingest/openapi.py
+++ b/graph_tool_call/ingest/openapi.py
@@ -181,6 +181,60 @@ def _extract_params_swagger2(
     return params
 
 
+def _summarize_object_schema(schema: dict[str, Any], *, max_depth: int = 2) -> str:
+    """Summarise the nested properties of an object/array schema as readable text.
+
+    When a parameter's type is 'object'/'array' but its inner field names are
+    not surfaced on the ToolParameter, the LLM ends up guessing them. This
+    function folds properties + required + description into description text
+    so they are exposed in the LLM context as well.
+
+    Each property becomes one line of the form ``- name* (type): description``
+    (``*`` marks a required property), indented two spaces per nesting level.
+    Returns ``""`` for non-dict input or when there is nothing to list.
+    """
+    if not isinstance(schema, dict):
+        return ""
+
+    def _walk(s: dict[str, Any], depth: int, indent: int) -> list[str]:
+        # Stop once the depth budget is exhausted or the node is not a schema dict.
+        if depth > max_depth or not isinstance(s, dict):
+            return []
+        out: list[str] = []
+        prefix = "  " * indent
+
+        # Unwrap array → items
+        if s.get("type") == "array":
+            items = s.get("items") or {}
+            out.append(f"{prefix}[array of:]")
+            out.extend(_walk(items, depth + 1, indent + 1))
+            return out
+
+        props = s.get("properties") or {}
+        if not props:
+            return out
+        required = set(s.get("required") or [])
+        for name, prop in props.items():
+            if not isinstance(prop, dict):
+                continue
+            # NOTE(review): `_schema_type` is the helper defined elsewhere in
+            # this module; presumably it returns the property's type name.
+            ptype = _schema_type(prop)
+            req = "*" if name in required else ""
+            desc = (prop.get("description") or "").strip()
+            example = prop.get("example")
+            line = f"{prefix}- {name}{req} ({ptype})"
+            if desc:
+                line += f": {desc}"
+            # The example is only shown when there is no description to show.
+            if example is not None and not desc:
+                line += f"  e.g. {example}"
+            out.append(line)
+            # Expand nested object/array one more level
+            if depth < max_depth:
+                if ptype == "object":
+                    out.extend(_walk(prop, depth + 1, indent + 1))
+                elif ptype == "array":
+                    items = prop.get("items") or {}
+                    # Arrays of primitives add no field names, so skip them.
+                    if items.get("properties") or items.get("type") in ("object", "array"):
+                        out.extend(_walk(items, depth + 1, indent + 1))
+        return out
+
+    lines = _walk(schema, 0, 0)
+    return "\n".join(lines)
+
+
 def _extract_params_openapi3(
     operation: dict[str, Any],
     resolved_spec: dict[str, Any],
@@ -198,11 +252,18 @@ def _extract_params_openapi3(
         is_required = p.get("required", False)
         if required_only and not is_required:
             continue
+        desc = p.get("description", "") or ""
+        # object/array 타입이면 nested fields를 description에 펼쳐서
+        # LLM이 정확한 필드명(예: searchWord)을 알 수 있게 한다.
+        if _schema_type(schema) in ("object", "array"):
+            nested = _summarize_object_schema(schema)
+            if nested:
+                desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}"
         params.append(
             ToolParameter(
                 name=p["name"],
                 type=_schema_type(schema),
-                description=p.get("description", ""),
+                description=desc,
                 required=is_required,
                 enum=schema.get("enum"),
             )
@@ -218,11 +279,17 @@ def _extract_params_openapi3(
         is_required = prop_name in body_required
         if required_only and not is_required:
             continue
+        desc = (prop_schema.get("description") or "")
+        # nested object/array는 한 단계 더 펼치기
+        if _schema_type(prop_schema) in ("object", "array"):
+            nested = _summarize_object_schema(prop_schema)
+            if nested:
+                desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}"
         params.append(
             ToolParameter(
                 name=prop_name,
                 type=_schema_type(prop_schema),
-                description=prop_schema.get("description", ""),
+                description=desc,
                 required=is_required,
             )
         )
@@ -304,6 +371,34 @@ def _enrich_description(description: str, method: str, path: str) -> str:
     return description
 
 
+def _resolve_server_url(
+    operation: dict[str, Any],
+    path_item: dict[str, Any] | None,
+    spec: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> str | None:
+    """Resolve the absolute base URL that applies to one operation.
+
+    OpenAPI 3.x precedence: operation.servers > path.servers > spec.servers.
+    The first server entry of the most specific level is used, with
+    ``{variable}`` placeholders replaced by their declared defaults.
+
+    Swagger 2.0 builds the base URL from ``host`` + ``basePath`` + ``schemes``.
+
+    Returns ``None`` when no server is declared, or when the selected URL is
+    not an absolute http(s) URL (for example the relative ``/api/v3`` form, or
+    a template with unresolved variables). The result is concatenated with the
+    request path as-is by the executor, so a host-less or templated value
+    would produce an unusable request URL; returning ``None`` lets the
+    executor fall back to its own default base URL.
+    """
+    # Local import keeps this helper self-contained within the module.
+    from urllib.parse import urlsplit
+
+    if is_swagger2:
+        host = spec.get("host")
+        if not host:
+            return None
+        scheme = (spec.get("schemes") or ["https"])[0]
+        base_path = spec.get("basePath") or ""
+        return f"{scheme}://{host}{base_path}".rstrip("/")
+
+    for source in (operation, path_item or {}, spec):
+        servers = source.get("servers") if isinstance(source, dict) else None
+        if not (isinstance(servers, list) and servers):
+            continue
+        server = servers[0] if isinstance(servers[0], dict) else {}
+        url = server.get("url")
+        if not url:
+            continue
+        url = str(url)
+
+        # Fill server variables from their defaults where one is declared.
+        variables = server.get("variables")
+        if isinstance(variables, dict):
+            for var_name, var in variables.items():
+                default = var.get("default") if isinstance(var, dict) else None
+                if default is not None:
+                    url = url.replace(f"{{{var_name}}}", str(default))
+
+        if "{" in url:
+            return None  # unresolved template variable
+        parts = urlsplit(url)
+        if parts.scheme not in ("http", "https") or not parts.netloc:
+            return None  # relative server URL: no host to route to
+        return url.rstrip("/")
+    return None
+
+
 def _operation_to_tool(
     operation_id: str,
     operation: dict[str, Any],
@@ -313,6 +408,7 @@ def _operation_to_tool(
     *,
     is_swagger2: bool = False,
     required_only: bool = False,
+    path_item: dict[str, Any] | None = None,
 ) -> ToolSchema:
     """Convert a single OpenAPI operation into a ToolSchema."""
     description = operation.get("summary") or operation.get("description", "")
@@ -357,6 +453,13 @@ def _operation_to_tool(
     if response_schema:
         metadata["response_schema"] = response_schema
 
+    # spec/path/operation 단위의 servers field → tool 자체 base_url 부여.
+    # 한 컬렉션에 다른 host를 가진 source들이 섞여 있을 때 executor가 tool마다
+    # 알맞은 base_url로 호출할 수 있게 한다.
+    server_url = _resolve_server_url(operation, path_item, resolved_spec, is_swagger2=is_swagger2)
+    if server_url:
+        metadata["base_url"] = server_url
+
     return ToolSchema(
         name=operation_id,
         description=description,
@@ -459,6 +562,7 @@ def ingest_openapi(
                 resolved_raw,
                 is_swagger2=is_swagger2,
                 required_only=required_only,
+                path_item=path_item,
             )
             tools.append(tool)
 
diff --git a/graph_tool_call/langchain/gateway.py b/graph_tool_call/langchain/gateway.py
index cfde75e..1ad9e97 100644
--- a/graph_tool_call/langchain/gateway.py
+++ b/graph_tool_call/langchain/gateway.py
@@ -66,6 +66,93 @@ def _extract_parameters_info(tool: Any) -> list[dict[str, Any]] | None:
     return None
 
 
+def _summarize_response_schema(schema: dict[str, Any]) -> str | None:
+    """Condense an OpenAPI response schema into a single-line shape hint.
+
+    Only the first 12 top-level properties are listed, as ``name:type``
+    pairs; an array schema is unwrapped and prefixed with ``array of``.
+    """
+    if not isinstance(schema, dict):
+        return None
+
+    # An array whose items are a schema object is described by that item schema.
+    items = schema.get("items")
+    is_array = schema.get("type") == "array" and isinstance(items, dict)
+    node = items if is_array else schema
+
+    properties = node.get("properties")
+    if not (isinstance(properties, dict) and properties):
+        # Nothing to list: report the bare type, which may be None.
+        bare = node.get("type")
+        if is_array and bare:
+            return f"array of {bare}"
+        return bare
+
+    def _describe(key: str, spec: Any) -> str:
+        if not isinstance(spec, dict):
+            return key
+        kind = spec.get("type") or spec.get("$ref", "object").rsplit("/", 1)[-1]
+        return f"{key}:{kind}"
+
+    shown = list(properties.items())[:12]
+    body = "{" + ", ".join(_describe(k, v) for k, v in shown) + "}"
+    return f"array of {body}" if is_array else body
+
+
+def _enrich_from_graph(
+    name: str, graph: Any | None
+) -> dict[str, Any]:
+    """Collect optional graph-derived hints for the tool called *name*.
+
+    Possible keys: ``source``, ``http``, ``returns`` and ``next_candidates``.
+    Any of them may be absent; with no graph the result is an empty dict.
+    """
+    info: dict[str, Any] = {}
+    if graph is None:
+        return info
+
+    try:
+        schema = graph.tools.get(name)
+    except Exception:
+        return info
+
+    meta = getattr(schema, "metadata", None) if schema is not None else None
+    if meta:
+        label = meta.get("source_label")
+        if label:
+            info["source"] = label
+        method, path = meta.get("method"), meta.get("path")
+        if method and path:
+            info["http"] = f"{method.upper()} {path}"
+        response_schema = meta.get("response_schema")
+        if isinstance(response_schema, dict):
+            shape = _summarize_response_schema(response_schema)
+            if shape:
+                info["returns"] = shape
+
+    # Up to five outgoing edges become chaining hints; failures are ignored.
+    try:
+        hints: list[str] = []
+        for _src, target, attrs in graph.graph.get_edges_from(name, direction="out"):
+            relation = attrs.get("relation")
+            if hasattr(relation, "value"):
+                rel_name = relation.value
+            else:
+                rel_name = str(relation)
+            # BELONGS_TO is purely structural, so it is never offered as a hint.
+            if rel_name in ("belongs_to", "BELONGS_TO"):
+                continue
+            hints.append(f"{rel_name}→{target}")
+            if len(hints) >= 5:
+                break
+        if hints:
+            info["next_candidates"] = hints
+    except Exception:
+        pass
+
+    return info
+
+
 def create_gateway_tools(
     tools: list[Any],
     *,
@@ -111,12 +198,15 @@ def create_gateway_tools(
     total = len(tool_map)
     call_history: list[str] = []
 
+    underlying_graph = getattr(toolkit, "graph", None)
+
     @langchain_tool
     def search_tools(query: str, top_k: int | None = None) -> str:
         """Search available tools by natural language query.
 
         Use this FIRST to find which tools are available for the task.
-        Returns tool names, descriptions, and required parameters.
+        Returns tool names, descriptions, parameters, response shape, and
+        ``next_candidates`` (related tools you may want to call afterwards).
 
         Args:
             query: Natural language search query (e.g. "cancel order", "send email")
@@ -135,11 +225,12 @@ def search_tools(query: str, top_k: int | None = None) -> str:
                 desc = t.get("description", "")
             entry: dict[str, Any] = {
                 "name": name,
-                "description": desc[:200],
+                "description": desc[:300],
             }
             params = _extract_parameters_info(t)
             if params:
                 entry["parameters"] = params
+            entry.update(_enrich_from_graph(name, underlying_graph))
             matched.append(entry)
 
         output = {
@@ -148,8 +239,10 @@ def search_tools(query: str, top_k: int | None = None) -> str:
             "total_tools": total,
             "tools": matched,
             "hint": (
-                "Use call_tool to execute a tool. "
-                "Pass tool_name and arguments as a dict matching the parameters above."
+                "Use call_tool to execute a tool. Pass tool_name and arguments "
+                "as a dict matching the parameters above. The 'returns' field "
+                "shows the response shape — extract values from there to build "
+                "arguments for the next call (see 'next_candidates')."
             ),
         }
 
diff --git a/graph_tool_call/net.py b/graph_tool_call/net.py
index dfe1c35..ba46e26 100644
--- a/graph_tool_call/net.py
+++ b/graph_tool_call/net.py
@@ -44,8 +44,22 @@ def redirect_request(
         return super().redirect_request(req, fp, code, msg, headers, newurl)
 
 
-def _open_url(request: urllib.request.Request | str, *, timeout: int, max_redirects: int) -> Any:
-    opener = urllib.request.build_opener(_LimitedRedirectHandler(max_redirects))
+def _open_url(
+    request: urllib.request.Request | str,
+    *,
+    timeout: int,
+    max_redirects: int,
+    verify_ssl: bool = True,
+) -> Any:
+    """urllib opener — verify_ssl=False accepts self-signed/internal-CA certificates."""
+    handlers: list[Any] = [_LimitedRedirectHandler(max_redirects)]
+    if not verify_ssl:
+        import ssl
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False  # must be cleared before verify_mode can be CERT_NONE
+        ctx.verify_mode = ssl.CERT_NONE
+        handlers.append(urllib.request.HTTPSHandler(context=ctx))
+    opener = urllib.request.build_opener(*handlers)
     return opener.open(request, timeout=timeout)
 
 
@@ -128,13 +142,22 @@ def fetch_url_text(
     allowed_content_types: tuple[str, ...] = _DEFAULT_ALLOWED_CONTENT_TYPES,
     allow_private_hosts: bool = False,
     max_redirects: int = _DEFAULT_MAX_REDIRECTS,
+    verify_ssl: bool | None = None,
 ) -> str:
-    """Fetch UTF-8 text from a remote URL with basic SSRF protections."""
+    """Fetch UTF-8 text from a remote URL with basic SSRF protections.
+
+    ``verify_ssl`` — when None, it is derived from ``allow_private_hosts``
+    (private hosts commonly use a self-signed CA, so verification defaults to off).
+    """
     validate_remote_url(url, allow_private_hosts=allow_private_hosts)
 
+    if verify_ssl is None:
+        # allow_private_hosts=True usually means an internal network; tolerate its private CA.
+        verify_ssl = not allow_private_hosts
+
     req = urllib.request.Request(url, headers=headers or {})
     try:
-        with _open_url(req, timeout=timeout, max_redirects=max_redirects) as resp:
+        with _open_url(req, timeout=timeout, max_redirects=max_redirects, verify_ssl=verify_ssl) as resp:
             final_url = url
             if hasattr(resp, "geturl"):
                 candidate = resp.geturl()
diff --git a/graph_tool_call/ontology/llm_provider.py b/graph_tool_call/ontology/llm_provider.py
index 8748ac3..a34786d 100644
--- a/graph_tool_call/ontology/llm_provider.py
+++ b/graph_tool_call/ontology/llm_provider.py
@@ -5,7 +5,7 @@
 import json
 import urllib.request
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any
 
 from graph_tool_call.ontology.schema import RelationType
@@ -13,11 +13,20 @@
 
 @dataclass
 class ToolSummary:
-    """Lightweight tool representation for LLM prompts."""
+    """Lightweight tool representation for LLM prompts.
+
+    ``method``, ``path`` and ``response_fields`` are optional extras consumed
+    by the enrichment prompt formatter; their empty defaults keep existing
+    three-field construction working.
+    """
 
     name: str
     description: str
     parameters: list[str]  # just parameter names
+    # Optional extras for semantic enrichment; an empty value is omitted from the prompt
+    method: str = ""
+    path: str = ""
+    response_fields: list[str] = field(default_factory=list)
 
 
 @dataclass
@@ -31,6 +40,49 @@ class InferredRelation:
     reason: str
 
 
+@dataclass
+class FieldSemantic:
+    """A field annotated with its semantic identifier.
+
+    Used on both produces (what a tool outputs) and consumes (what it
+    requires). ``json_path`` is set on produces; ``field`` is set on consumes.
+    """
+
+    semantic: str  # canonical id shared across tools, e.g. "product_id"
+    json_path: str = ""  # location in the tool output; filled for produces entries
+    field: str = ""  # request parameter name; filled for consumes entries
+
+
+@dataclass
+class PairHint:
+    """LLM-suggested tool that pairs with the current tool."""
+
+    tool: str  # name of the suggested companion tool
+    reason: str  # brief justification returned by the LLM
+
+
+@dataclass
+class ToolEnrichment:
+    """Per-tool semantic annotation produced by ``enrich_tool_semantics``.
+
+    This is the Pass 2 output of the Plan-and-Execute L0 knowledge base.
+    Used downstream by:
+      - Stage 1 target selection (``when_to_use`` in catalog)
+      - Stage 2 path synthesis (``produces_semantics`` / ``consumes_semantics``
+        replace hardcoded synonym tables)
+      - Graph edges (``pairs_well_with`` becomes semantic edges)
+    """
+
+    canonical_action: str                         # search | read | create | update | delete | action
+    primary_resource: str                         # e.g. "product"
+    one_line_summary: str                         # the prompt requests <=60 chars
+    when_to_use: str                              # trigger condition for picking this tool
+    when_not_to_use: str = ""                     # optional; defaults to empty
+    produces_semantics: list[FieldSemantic] = field(default_factory=list)
+    consumes_semantics: list[FieldSemantic] = field(default_factory=list)
+    pairs_well_with: list[PairHint] = field(default_factory=list)
+
+
 # ---------------------------------------------------------------------------
 # Prompt templates
 # ---------------------------------------------------------------------------
@@ -122,6 +174,52 @@ class InferredRelation:
 [{{"source":"toolA","target":"toolB","relation":"PRECEDES","confidence":0.9,"reason":"..."}}]"""
 
 
+_ENRICH_SEMANTICS_PROMPT = """\
+You are annotating API tools for a plan-and-execute planning system.
+Produce structured metadata that downstream components use to (1) pick the
+right tool for a user's goal, (2) synthesize execution plans, and (3) wire
+one tool's output to another tool's input.
+
+AVAILABLE TOOLS IN THE COLLECTION (names + 1-line descriptions, for
+pairs_well_with reference):
+{all_tools_brief}
+
+TOOLS TO ANNOTATE (this batch):
+{batch_detailed}
+
+For each tool in the batch, output a JSON object with these fields:
+  - canonical_action: one of "search" | "read" | "create" | "update" | "delete" | "action"
+  - primary_resource: one lowercase noun (e.g. "product", "order", "user", "shop", "category")
+  - one_line_summary: short natural-language summary (<=60 chars)
+  - when_to_use: 1-2 sentences describing the trigger condition
+  - when_not_to_use: optional 1 sentence (can be empty) — alternative tool cases
+  - produces_semantics: array of {{"semantic": "canonical_id", "json_path": "$.body..."}}
+      * Include only MEANINGFUL fields (IDs, names, key metrics).
+      * Skip pagination, headers, status codes.
+      * Use CONSISTENT semantic ids across tools. If two tools both return a
+        product identifier (one calls it "goodsNo", another "productId"),
+        use the same semantic like "product_id".
+  - consumes_semantics: array of {{"semantic": "canonical_id", "field": "paramName"}}
+      * REQUIRED inputs only. Skip optional filters, pagination.
+      * Same semantic id conventions as produces.
+  - pairs_well_with: array of {{"tool": "tool_name_from_available_list",
+                                "reason": "brief reason"}}
+      * 2-4 tools that typically precede or follow this tool.
+      * Names MUST match the available list exactly. Do not invent.
+
+OUTPUT FORMAT (strict):
+{{
+  "tool_name_1": {{...fields...}},
+  "tool_name_2": {{...fields...}}
+}}
+
+STRICT RULES:
+  - You MUST produce one entry for EVERY tool in the batch.
+  - Do NOT skip tools with unclear descriptions — make your best guess.
+  - Keep fields concise (short sentences) so all tools fit in the output.
+  - Return JSON only. No markdown fences, no prose, no comments."""
+
+
 def _format_tools_list(tools: list[ToolSummary]) -> str:
     lines = []
     for i, t in enumerate(tools, 1):
@@ -130,6 +228,81 @@ def _format_tools_list(tools: list[ToolSummary]) -> str:
     return "\n".join(lines)
 
 
+def _format_tools_brief(tools: list[ToolSummary]) -> str:
+    """Render the tool catalog as a bullet list of names, one per line.
+
+    Descriptions are deliberately left out: this list is repeated in every
+    batch prompt, so anything beyond the bare name multiplies prompt size.
+    """
+    bullets = [f"- {tool.name}" for tool in tools]
+    return "\n".join(bullets)
+
+
+def _format_tools_for_enrichment(tools: list[ToolSummary]) -> str:
+    """Render one ``== name ==`` section per tool for the enrichment prompt."""
+    sections: list[str] = []
+    for tool in tools:
+        lines = [f"== {tool.name} =="]
+        if tool.method and tool.path:
+            lines.append(f"HTTP: {tool.method.upper()} {tool.path}")
+        if tool.description:
+            # Descriptions are stripped, then capped at 400 characters.
+            lines.append(f"Description: {tool.description.strip()[:400]}")
+        if tool.parameters:
+            request = ", ".join(tool.parameters[:25])
+            lines.append(f"Request fields: {request}")
+        if tool.response_fields:
+            response = ", ".join(tool.response_fields[:25])
+            lines.append(f"Response fields: {response}")
+        sections.append("\n".join(lines))
+    return "\n\n".join(sections)
+
+
+def _parse_enrichment(data: Any) -> ToolEnrichment | None:
+    """Build a ToolEnrichment from LLM JSON output. Tolerant of missing keys.
+
+    Missing keys and JSON ``null`` both yield empty strings/lists; entries
+    that are not dicts or lack their key field are dropped. Returns None
+    when *data* is not a dict or a handled error occurs while parsing.
+    """
+    if not isinstance(data, dict):
+        return None
+
+    def _text(source: dict[str, Any], key: str) -> str:
+        # str(None) would leak the literal "None" into the annotation.
+        value = source.get(key)
+        return "" if value is None else str(value).strip()
+
+    try:
+        produces = [
+            FieldSemantic(semantic=_text(p, "semantic"), json_path=_text(p, "json_path"))
+            for p in (data.get("produces_semantics") or [])
+            if isinstance(p, dict) and _text(p, "semantic")
+        ]
+        consumes = [
+            FieldSemantic(semantic=_text(c, "semantic"), field=_text(c, "field"))
+            for c in (data.get("consumes_semantics") or [])
+            if isinstance(c, dict) and _text(c, "semantic")
+        ]
+        pairs = [
+            PairHint(tool=_text(p, "tool"), reason=_text(p, "reason"))
+            for p in (data.get("pairs_well_with") or [])
+            if isinstance(p, dict) and _text(p, "tool")
+        ]
+        return ToolEnrichment(
+            canonical_action=_text(data, "canonical_action").lower(),
+            primary_resource=_text(data, "primary_resource").lower(),
+            one_line_summary=_text(data, "one_line_summary"),
+            when_to_use=_text(data, "when_to_use"),
+            when_not_to_use=_text(data, "when_not_to_use"),
+            produces_semantics=produces,
+            consumes_semantics=consumes,
+            pairs_well_with=pairs,
+        )
+    except (KeyError, TypeError, ValueError, AttributeError):
+        return None
+
+
 def _parse_relation_type(s: str) -> RelationType | None:
     mapping = {
         "REQUIRES": RelationType.REQUIRES,
@@ -423,6 +596,62 @@ def generate_example_queries(
 
         return all_queries
 
+    def enrich_tool_semantics(
+        self,
+        tools: list[ToolSummary],
+        batch_size: int = 10,
+        *,
+        reference_tools: list[ToolSummary] | None = None,
+    ) -> dict[str, ToolEnrichment]:
+        """Annotate tools with semantic metadata, one LLM call per batch.
+
+        The tools are sliced into chunks of ``batch_size``; each chunk is
+        rendered into the enrichment prompt and sent through
+        ``self.generate``, and the JSON reply is parsed into
+        ``ToolEnrichment`` objects.
+
+        Args:
+            tools: Tools that receive a detailed annotation.
+            batch_size: Number of tools described in a single prompt.
+            reference_tools: Catalog whose names are listed in the prompt as
+                valid ``pairs_well_with`` targets; ``tools`` is used when it
+                is None or empty. Streaming callers can pass one batch as
+                ``tools`` and set ``batch_size=len(tools)`` for a single call.
+
+        Returns:
+            Enrichments keyed by the tool name found in the LLM reply. A
+            reply that is not a JSON object is skipped, a handled parse error
+            ends that batch early, and entries without a ``canonical_action``
+            are dropped.
+        """
+        annotated: dict[str, ToolEnrichment] = {}
+        if not tools:
+            return annotated
+
+        # Every batch prompt carries the same catalog of valid tool names.
+        catalog = _format_tools_brief(reference_tools or tools)
+        starts = range(0, len(tools), batch_size)
+
+        for chunk in (tools[s : s + batch_size] for s in starts):
+            raw = self.generate(
+                _ENRICH_SEMANTICS_PROMPT.format(
+                    all_tools_brief=catalog,
+                    batch_detailed=_format_tools_for_enrichment(chunk),
+                )
+            )
+            try:
+                payload = _extract_json(raw)
+                if isinstance(payload, dict):
+                    for tool_name, entry in payload.items():
+                        item = _parse_enrichment(entry)
+                        # Entries without a canonical action are discarded.
+                        if item is not None and item.canonical_action:
+                            annotated[str(tool_name)] = item
+            except (json.JSONDecodeError, KeyError, TypeError):
+                continue
+
+        return annotated
+
 
 # ---------------------------------------------------------------------------
 # Ollama Provider
@@ -475,18 +704,25 @@ def __init__(
         model: str = "gpt-4o-mini",
         base_url: str = "https://api.openai.com/v1",
         api_key: str = "",
+        max_tokens: int = 8192,
+        timeout: int = 300,
     ) -> None:
         self.model = model
         self.base_url = base_url.rstrip("/")
         self.api_key = api_key
+        self.max_tokens = max_tokens
+        self.timeout = timeout
 
     def generate(self, prompt: str) -> str:
         url = f"{self.base_url}/chat/completions"
+        # Without an explicit max_tokens the provider default (4096 for some models)
+        # truncates the batch enrichment JSON mid-output, dropping some tools.
         payload = json.dumps(
             {
                 "model": self.model,
                 "messages": [{"role": "user", "content": prompt}],
                 "temperature": 0.1,
+                "max_tokens": self.max_tokens,
             }
         ).encode()
 
@@ -495,7 +731,7 @@ def generate(self, prompt: str) -> str:
             headers["Authorization"] = f"Bearer {self.api_key}"
 
         req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
-        with urllib.request.urlopen(req, timeout=120) as resp:  # noqa: S310
+        with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # noqa: S310
             result = json.loads(resp.read().decode())
             choices = result.get("choices", [])
             if choices:
diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py
new file mode 100644
index 0000000..c35aef3
--- /dev/null
+++ b/graph_tool_call/plan/__init__.py
@@ -0,0 +1,62 @@
+"""Plan-and-Execute primitives: schemas, binding resolver, runner.
+
+The ``plan`` package is deliberately transport-agnostic. It knows nothing
+about HTTP, authentication, or xgen internals — it only defines how a
+Plan looks, how string bindings are resolved against step outputs, and how
+to drive execution via an injected callable.
+
+Typical use (from an integration layer like xgen-workflow):
+
+    from graph_tool_call.plan import Plan, PlanStep, PlanRunner
+
+    plan = Plan(id="...", goal="...", steps=[PlanStep(...), ...])
+
+    def call_tool(tool_name, args):
+        return my_http_executor.execute(tool_name, args)
+
+    runner = PlanRunner(call_tool)
+    for event in runner.run(plan):
+        # event: StepStarted | StepCompleted | StepFailed | PlanCompleted
+        ...
+"""
+
+from graph_tool_call.plan.binding import (
+    BindingError,
+    resolve_bindings,
+)
+from graph_tool_call.plan.runner import (
+    PlanRunner,
+    PlanEvent,
+    PlanStarted,
+    StepStarted,
+    StepCompleted,
+    StepFailed,
+    PlanCompleted,
+    PlanAborted,
+)
+from graph_tool_call.plan.schema import (
+    Plan,
+    PlanStep,
+    ExecutionTrace,
+    StepTrace,
+)
+
+__all__ = [
+    # schema
+    "Plan",
+    "PlanStep",
+    "ExecutionTrace",
+    "StepTrace",
+    # binding
+    "BindingError",
+    "resolve_bindings",
+    # runner + events
+    "PlanRunner",
+    "PlanEvent",
+    "PlanStarted",
+    "StepStarted",
+    "StepCompleted",
+    "StepFailed",
+    "PlanCompleted",
+    "PlanAborted",
+]
diff --git a/graph_tool_call/plan/binding.py b/graph_tool_call/plan/binding.py
new file mode 100644
index 0000000..58d9eef
--- /dev/null
+++ b/graph_tool_call/plan/binding.py
@@ -0,0 +1,165 @@
+"""Binding resolver for Plan args.
+
+Substitutes ``${source.dotted.path}`` placeholders in step arguments with
+actual values drawn from the runtime context. The context is a dict mapping
+source names (``"s1"``, ``"s2"``, ``"input"``, ...) to arbitrary JSON-like
+objects.
+
+v1 path syntax (kept deliberately small):
+
+  - dotted keys          : ``s1.body.goods`` → ``ctx["s1"]["body"]["goods"]``
+  - array index          : ``s1.body.goods[0].goodsNo``
+  - whole-source         : ``s1`` → entire result dict of step s1
+  - input alias          : ``input.keyword`` — caller injects a special
+                           ``"input"`` entry at runtime for user-provided
+                           entities extracted by Stage 1.
+
+Explicitly NOT supported in v1:
+
+  - wildcard ``[*]`` (fan-out) — see §11.1 of the design doc
+  - filter expressions (JSONPath ``[?(...)]``)
+  - functions / casts (``int(...)``, ``default(...)``)
+
+Behavior rules:
+
+  1. If a string argument is **entirely** one binding (``"${s1.id}"``) the
+     resolved value keeps its native type (int, dict, list, ...). This is
+     important so integer IDs aren't accidentally stringified.
+  2. If a string contains bindings mixed with literal text
+     (``"prefix-${s1.id}"``) each binding is ``str()``-cast (``None``
+     becomes an empty string). The result is always a string.
+  3. Unresolved bindings raise ``BindingError`` — callers should treat
+     this as a plan validation failure, not a tool execution error.
+  4. ``dict`` and ``list`` values are walked recursively.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+
+class BindingError(ValueError):
+    """Raised when a ``${...}`` expression cannot be resolved."""
+
+
+# Matches one ``${...}`` placeholder. Accepts empty body so ``${}`` triggers
+# a clear BindingError downstream instead of passing through as a literal.
+# ``{`` and ``}`` inside a binding are not supported in v1.
+_BINDING_RE = re.compile(r"\$\{([^${}]*)\}")
+
+
+def resolve_bindings(value: Any, context: dict[str, Any]) -> Any:
+    """Return a copy of *value* with every ``${...}`` binding substituted.
+
+    Containers (dict, list) are rebuilt recursively, strings are resolved
+    against *context*, and any other value is returned as-is.
+    """
+    if isinstance(value, str):
+        return _resolve_string(value, context)
+    if isinstance(value, list):
+        return [resolve_bindings(item, context) for item in value]
+    if isinstance(value, dict):
+        return {key: resolve_bindings(item, context) for key, item in value.items()}
+    return value
+
+
+def _resolve_string(s: str, context: dict[str, Any]) -> Any:
+    """Substitute the ``${...}`` bindings found in *s*.
+
+    A string that is a single binding (ignoring surrounding whitespace)
+    yields the looked-up value with its native type; anything else yields
+    a string in which each binding is replaced by ``str(value)``, or by an
+    empty string when the value is None.
+    """
+    whole = _BINDING_RE.fullmatch(s.strip())
+    if whole is not None:
+        return _lookup(whole.group(1).strip(), context)
+
+    def _render(found: re.Match[str]) -> str:
+        resolved = _lookup(found.group(1).strip(), context)
+        return str(resolved) if resolved is not None else ""
+
+    return _BINDING_RE.sub(_render, s)
+
+
+def _lookup(expr: str, context: dict[str, Any]) -> Any:
+    """Walk a dotted path with optional ``[N]`` indices against *context*.
+
+    Raises BindingError when any step of the path cannot be followed.
+    """
+    tokens = _tokenize(expr)
+    if not tokens:
+        raise BindingError(f"empty binding expression: {expr!r}")
+    head = tokens[0]
+    if head not in context:
+        raise BindingError(
+            f"unknown source {head!r} in binding {expr!r}: "
+            f"context has {sorted(context, key=str)!r}"
+        )
+    node: Any = context[head]
+
+    for tok in tokens[1:]:
+        if tok.startswith("[") and tok.endswith("]"):
+            # array index — negative values count from the end of the list
+            try:
+                idx = int(tok[1:-1])
+            except ValueError as exc:
+                raise BindingError(f"non-numeric array index {tok!r} in binding {expr!r}") from exc
+            if not isinstance(node, (list, tuple)):
+                raise BindingError(
+                    f"indexing {tok} on non-list type {type(node).__name__} (expr={expr!r})"
+                )
+            try:
+                node = node[idx]
+            except IndexError as exc:
+                raise BindingError(f"index {idx} out of range in binding {expr!r}") from exc
+        else:
+            if not isinstance(node, dict):
+                raise BindingError(
+                    f"cannot descend into .{tok} on non-dict type {type(node).__name__} "
+                    f"(expr={expr!r})"
+                )
+            if tok not in node:
+                # key=str: sorting mixed-type keys directly would raise TypeError
+                raise BindingError(
+                    f"key {tok!r} not found in binding {expr!r} "
+                    f"(available: {sorted(node, key=str)[:8]!r}...)"
+                )
+            node = node[tok]
+
+    return node
+
+
+def _tokenize(expr: str) -> list[str]:
+    """Split a binding path into key tokens and bracketed ``[N]`` tokens.
+
+    ``s1.body.goods[0].goodsNo`` → ``["s1", "body", "goods", "[0]", "goodsNo"]``
+    """
+    parts: list[str] = []
+    pending = ""
+    def _flush() -> None:
+        nonlocal pending
+        if pending:
+            parts.append(pending)
+            pending = ""
+    pos = 0
+    while pos < len(expr):
+        char = expr[pos]
+        if char == "[":
+            _flush()
+            close = expr.find("]", pos)
+            if close == -1:
+                raise BindingError(f"unclosed '[' in binding {expr!r}")
+            parts.append(expr[pos : close + 1])
+            pos = close
+        elif char == ".":
+            _flush()
+        else:
+            pending += char
+        pos += 1
+    _flush()
+    return parts
+
+
+__all__ = ["BindingError", "resolve_bindings"]
diff --git a/graph_tool_call/plan/runner.py b/graph_tool_call/plan/runner.py
new file mode 100644
index 0000000..8b9fa27
--- /dev/null
+++ b/graph_tool_call/plan/runner.py
@@ -0,0 +1,342 @@
+"""PlanRunner — deterministic executor for Plan artifacts.
+
+The runner is transport-agnostic: it takes a ``call_tool`` callable that
+actually performs each step. This decouples ``graph_tool_call`` (pure
+plan/graph logic) from integration concerns (HTTP, auth, retries —
+handled by the caller's adapter).
+
+The runner emits structured events as it progresses — callers can relay
+these over SSE, logs, or progress UIs.
+
+v1 scope reminder: **linear execution, no fan-out, no conditionals, no
+automatic re-planning**. Failures abort the run and return a trace.
+"""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Callable, Iterator
+
+from graph_tool_call.plan.binding import BindingError, resolve_bindings
+from graph_tool_call.plan.schema import (
+    ExecutionTrace,
+    Plan,
+    PlanStep,
+    StepTrace,
+)
+
+
+# ---------------------------------------------------------------------------
+# Event types — structured so callers can pattern-match by ``type`` field
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class PlanStarted:  # first event of a run, yielded before any step is attempted
+    type: str = "plan.started"                 # discriminator callers match on
+    plan_id: str = ""                          # copied from Plan.id
+    goal: str = ""                             # copied from Plan.goal
+    step_count: int = 0                        # len(plan.steps)
+
+
+@dataclass
+class StepStarted:  # yielded after a step's bindings resolve, before its tool call
+    type: str = "step.started"                 # discriminator callers match on
+    step_id: str = ""                          # PlanStep.id
+    tool: str = ""                             # PlanStep.tool
+    args_resolved: dict[str, Any] = field(default_factory=dict)  # args after binding resolution
+    index: int = 0                             # 1-based position of the step
+    total: int = 0                             # len(plan.steps)
+
+
+@dataclass
+class StepCompleted:  # yielded after a step's tool call returns without raising
+    type: str = "step.completed"               # discriminator callers match on
+    step_id: str = ""                          # PlanStep.id
+    tool: str = ""                             # PlanStep.tool
+    duration_ms: int = 0                       # covers binding resolution plus the tool call
+    output_preview: Any = None                 # truncated output for UI
+    output_size: int = 0                       # from _output_size(); 0 if not JSON-serializable
+
+
+@dataclass
+class StepFailed:  # yielded when binding resolution or the tool call raises
+    type: str = "step.failed"                  # discriminator callers match on
+    step_id: str = ""                          # PlanStep.id
+    tool: str = ""                             # PlanStep.tool
+    error: dict[str, Any] = field(default_factory=dict)  # carries "kind" and "message" keys
+    duration_ms: int = 0                       # time from step start until the failure
+
+
+@dataclass
+class PlanCompleted:  # terminal event when every step and the output binding succeed
+    type: str = "plan.completed"               # discriminator callers match on
+    plan_id: str = ""                          # copied from Plan.id
+    output: Any = None                         # resolved output_binding, else last step's output
+    total_duration_ms: int = 0                 # measured from the start of run_stream
+
+
+@dataclass
+class PlanAborted:  # terminal event on failure; run_stream returns right after yielding it
+    type: str = "plan.aborted"                 # discriminator callers match on
+    plan_id: str = ""                          # copied from Plan.id
+    failed_step: str = ""                      # failing step id, or "<output_binding>"
+    error: dict[str, Any] = field(default_factory=dict)  # carries "kind" and "message" keys
+    total_duration_ms: int = 0                 # measured from the start of run_stream
+
+
+PlanEvent = (
+    PlanStarted
+    | StepStarted
+    | StepCompleted
+    | StepFailed
+    | PlanCompleted
+    | PlanAborted
+)
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+
+# ToolCaller signature: (tool_name, resolved_args) -> output_dict
+ToolCaller = Callable[[str, dict[str, Any]], Any]
+
+
+class PlanRunner:
+    """Execute a Plan step-by-step using a caller-provided tool invoker.
+
+    Usage::
+
+        def call_tool(name: str, args: dict) -> dict:
+            return my_http_executor.execute(name, args)
+
+        runner = PlanRunner(call_tool)
+        trace = runner.run(plan)                  # run to completion, return trace
+        # or — streaming:
+        for event in runner.run_stream(plan):
+            send_over_sse(event)
+    """
+
+    def __init__(
+        self,
+        call_tool: ToolCaller,
+        *,
+        output_preview_limit: int = 512,
+        on_error: str = "abort",
+    ) -> None:
+        """Store the tool invoker and preview limit; v1 accepts only ``on_error='abort'``."""
+        if on_error != "abort":
+            raise ValueError("v1 PlanRunner only supports on_error='abort'")
+        self._call_tool, self._preview_limit = call_tool, output_preview_limit
+
+    # ----------------------------------------------------------------------
+    # Streaming interface — yields PlanEvent instances
+    # ----------------------------------------------------------------------
+
+    def run_stream(
+        self,
+        plan: Plan,
+        *,
+        input_context: dict[str, Any] | None = None,
+    ) -> Iterator[PlanEvent]:
+        """Run *plan* step by step, yielding a PlanEvent at each transition.
+
+        ``input_context`` is copied under the ``input`` key for ``${input.xxx}``
+        bindings. Stops at the first failure; ends with PlanCompleted or PlanAborted.
+        """
+        started = _now_iso()  # NOTE(review): assigned but never read in this method
+        plan_start = time.monotonic()
+
+        yield PlanStarted(
+            plan_id=plan.id,
+            goal=plan.goal,
+            step_count=len(plan.steps),
+        )
+
+        # step_id -> output (runtime context for binding resolution)
+        context: dict[str, Any] = {}
+        if input_context:
+            context["input"] = dict(input_context)
+
+        trace_steps: list[StepTrace] = []  # filled below but never yielded or returned
+
+        for idx, step in enumerate(plan.steps, start=1):
+            step_trace = StepTrace(id=step.id, tool=step.tool)
+            step_start = time.monotonic()
+
+            # 1. Resolve bindings against the context built from earlier steps
+            try:
+                resolved = resolve_bindings(step.args, context)
+            except BindingError as exc:
+                err = {
+                    "kind": "binding",
+                    "message": str(exc),
+                }
+                step_trace.error = err
+                step_trace.duration_ms = _ms_since(step_start)
+                trace_steps.append(step_trace)
+                yield StepFailed(
+                    step_id=step.id, tool=step.tool,
+                    error=err, duration_ms=step_trace.duration_ms,
+                )
+                yield PlanAborted(
+                    plan_id=plan.id, failed_step=step.id,
+                    error=err,
+                    total_duration_ms=_ms_since(plan_start),
+                )
+                return
+
+            step_trace.args_resolved = resolved
+            yield StepStarted(
+                step_id=step.id, tool=step.tool,
+                args_resolved=resolved,
+                index=idx, total=len(plan.steps),
+            )
+
+            # 2. Execute via caller's tool invoker; any exception aborts the plan
+            try:
+                output = self._call_tool(step.tool, resolved)
+            except Exception as exc:              # noqa: BLE001 — caller-defined
+                err = {
+                    "kind": "tool",
+                    "message": str(exc),
+                    "exception_type": type(exc).__name__,
+                }
+                step_trace.error = err
+                step_trace.duration_ms = _ms_since(step_start)
+                trace_steps.append(step_trace)
+                yield StepFailed(
+                    step_id=step.id, tool=step.tool,
+                    error=err, duration_ms=step_trace.duration_ms,
+                )
+                yield PlanAborted(
+                    plan_id=plan.id, failed_step=step.id,
+                    error=err,
+                    total_duration_ms=_ms_since(plan_start),
+                )
+                return
+
+            step_trace.output = output
+            step_trace.duration_ms = _ms_since(step_start)
+            trace_steps.append(step_trace)
+
+            # 3. Store output in context (keyed by step id) for later bindings
+            context[step.id] = output
+
+            yield StepCompleted(
+                step_id=step.id, tool=step.tool,
+                duration_ms=step_trace.duration_ms,
+                output_preview=_preview(output, self._preview_limit),
+                output_size=_output_size(output),
+            )
+
+        # 4. Resolve output_binding for final answer (default: last step's output)
+        try:
+            final = (
+                resolve_bindings(plan.output_binding, context)
+                if plan.output_binding
+                else (context[plan.steps[-1].id] if plan.steps else None)
+            )
+        except BindingError as exc:
+            err = {"kind": "output_binding", "message": str(exc)}
+            yield PlanAborted(
+                plan_id=plan.id, failed_step="<output_binding>",
+                error=err,
+                total_duration_ms=_ms_since(plan_start),
+            )
+            return
+
+        yield PlanCompleted(
+            plan_id=plan.id,
+            output=final,
+            total_duration_ms=_ms_since(plan_start),
+        )
+
+    # ----------------------------------------------------------------------
+    # Non-streaming interface — returns final ExecutionTrace
+    # ----------------------------------------------------------------------
+
+    def run(
+        self,
+        plan: Plan,
+        *,
+        input_context: dict[str, Any] | None = None,
+    ) -> ExecutionTrace:
+        """Execute *plan* to completion and return a plan-level ExecutionTrace.
+
+        Args:
+            plan: The plan to execute.
+            input_context: Values for ``${input.xxx}`` bindings, forwarded
+                unchanged to :meth:`run_stream`.
+
+        Returns:
+            A trace whose ``success``/``output`` come from ``plan.completed``
+            and whose ``failed_step`` comes from ``plan.aborted``. ``steps``
+            is always empty in v1 because the event stream does not carry
+            StepTrace records; use :meth:`run_stream` for per-step detail.
+        """
+        started_at = _now_iso()
+        started = time.monotonic()
+        success = False
+        failed_step: str | None = None
+        output: Any = None
+
+        # Only the terminal events matter here; step-level events are ignored.
+        for event in self.run_stream(plan, input_context=input_context):
+            etype = event.type
+            if etype == "plan.completed":
+                success = True
+                output = event.output  # type: ignore[union-attr]
+            elif etype == "plan.aborted":
+                failed_step = event.failed_step  # type: ignore[union-attr]
+        return ExecutionTrace(
+            plan_id=plan.id,
+            success=success,
+            steps=[],
+            output=output,
+            failed_step=failed_step,
+            total_duration_ms=_ms_since(started),
+            started_at=started_at,
+            ended_at=_now_iso(),
+        )
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _ms_since(start_monotonic: float) -> int:  # whole milliseconds elapsed since the mark
+    return int(1000 * (time.monotonic() - start_monotonic))
+
+
+def _now_iso() -> str:  # current UTC time as an ISO 8601 string with offset
+    return datetime.now(tz=timezone.utc).isoformat()
+
+
+def _preview(value: Any, limit: int) -> Any:
+    """Return *value* unchanged when small, otherwise a truncated stand-in for UIs."""
+    if isinstance(value, str):
+        return value if len(value) <= limit else value[:limit] + "…"
+    if not isinstance(value, (dict, list)):
+        return value
+    import json as _json
+    try:
+        text = _json.dumps(value, ensure_ascii=False)
+    except (TypeError, ValueError):
+        return {"_preview": f"<unserializable {type(value).__name__}>"}
+    if len(text) > limit:
+        return {"_preview": text[:limit] + "…", "_truncated": True}
+    return value
+
+
+def _output_size(value: Any) -> int:
+    """UTF-8 byte length of *value* rendered as JSON; 0 if it cannot be serialized."""
+    import json as _json
+    try:
+        return len(_json.dumps(value, ensure_ascii=False).encode("utf-8"))
+    except (TypeError, ValueError):  # ValueError also covers UnicodeEncodeError
+        return 0
diff --git a/graph_tool_call/plan/schema.py b/graph_tool_call/plan/schema.py
new file mode 100644
index 0000000..8a18577
--- /dev/null
+++ b/graph_tool_call/plan/schema.py
@@ -0,0 +1,80 @@
+"""Plan and ExecutionTrace dataclasses.
+
+``Plan`` is the artifact produced by Stage 2 (Path Synthesizer) of the
+Plan-and-Execute architecture. It's consumed by ``PlanRunner`` (Stage 3).
+Both are intentionally plain dataclasses — serializable, introspectable,
+easy to hand-craft for testing.
+
+The schema explicitly does NOT include fan-out / conditional branching in
+v1 (per design doc §16 decision 6). Future versions can add optional
+fields (``foreach``, ``condition``) on ``PlanStep``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class PlanStep:
+    """A single step in a Plan.
+
+    ``args`` may contain binding placeholders of the form
+    ``${step_id.json.path}`` or ``${input.keyword}``. These are resolved
+    at runtime by ``resolve_bindings`` using the accumulated step context.
+    """
+
+    id: str                                    # "s1", "s2", ...
+    tool: str                                  # function_name (graph node name)
+    args: dict[str, Any] = field(default_factory=dict)  # may hold ${...} binding placeholders
+    rationale: str = ""                        # why this step exists (for audit)
+    timeout_ms: int | None = None              # NOTE(review): not read by the v1 runner — confirm
+    retryable: bool = False                    # reserved for v1.1 retry policy
+
+
+@dataclass
+class Plan:
+    """Executable plan — ordered steps with binding references.
+
+    v1 scope: **linear execution only**. Steps run in listed order. No
+    fan-out, no conditional branching, no parallelism. Each step may
+    reference earlier step outputs via ``${sN.path}`` bindings.
+
+    ``output_binding`` designates which step's (or sub-path's) result is
+    the final answer. If unset, runner returns the last step's result.
+    """
+
+    id: str                                    # uuid
+    goal: str                                  # user requirement summary
+    steps: list[PlanStep] = field(default_factory=list)  # executed in list order
+    output_binding: str | None = None          # e.g. "${s2.body}"
+    created_at: str = ""                       # ISO8601
+    metadata: dict[str, Any] = field(default_factory=dict)  # free-form; usage not shown here
+
+
+@dataclass
+class StepTrace:
+    """Record of a single step execution."""
+
+    id: str                                    # PlanStep.id
+    tool: str                                  # PlanStep.tool
+    args_resolved: dict[str, Any] = field(default_factory=dict)  # args after binding resolution
+    output: Any = None                         # set on success
+    error: dict[str, Any] | None = None        # set on failure
+    duration_ms: int = 0                       # elapsed milliseconds for the step
+    retries: int = 0                           # NOTE(review): never incremented in v1 — confirm
+
+
+@dataclass
+class ExecutionTrace:
+    """Result of a full Plan execution."""
+
+    plan_id: str                               # Plan.id
+    success: bool                              # True only when the plan ran to completion
+    steps: list[StepTrace] = field(default_factory=list)  # per-step records, if collected
+    output: Any = None                         # plan.output_binding resolved
+    failed_step: str | None = None             # id of the failing step, if any
+    total_duration_ms: int = 0                 # elapsed milliseconds for the whole run
+    started_at: str = ""                       # ISO 8601 timestamp
+    ended_at: str = ""                         # ISO 8601 timestamp
diff --git a/graph_tool_call/tool_graph.py b/graph_tool_call/tool_graph.py
index 9147f85..28839ed 100644
--- a/graph_tool_call/tool_graph.py
+++ b/graph_tool_call/tool_graph.py
@@ -289,6 +289,9 @@ def ingest_openapi(
         min_confidence: float = 0.7,
         allow_private_hosts: bool = False,
         max_response_bytes: int = 5_000_000,
+        source_label: str | None = None,
+        on_conflict: str = "overwrite",
+        relink_existing: bool = True,
     ) -> list[ToolSchema]:
         """Ingest an OpenAPI/Swagger spec, register tools, and auto-detect relations.
 
@@ -304,11 +307,29 @@ def ingest_openapi(
             If True (default), run automatic dependency detection.
         min_confidence:
             Minimum confidence threshold for detected relations.
+        source_label:
+            Optional origin tag stored on each tool's ``metadata["source_label"]``.
+            Enables :meth:`list_sources` / :meth:`remove_source` and is used
+            to derive the namespace prefix when ``on_conflict="prefix"``.
+        on_conflict:
+            How to handle a name collision with an already-registered tool.
+
+            - ``"overwrite"`` (default): replace the existing tool.
+            - ``"prefix"``: rename incoming as ``{source_label}.{name}`` (or
+              ``incoming.{name}`` if no label provided). Subsequent collisions
+              after prefixing fall back to ``overwrite``.
+            - ``"skip"``: keep the existing tool, drop the incoming one.
+            - ``"error"``: raise ``ValueError`` on the first collision.
+        relink_existing:
+            When True (default), after adding the new batch, dependency
+            detection is re-run across **new ↔ existing** tools so that
+            cross-source edges are discovered. Has no effect when this is
+            the first ingest or ``detect_dependencies=False``.
 
         Returns
         -------
         list[ToolSchema]
-            The ingested tool schemas.
+            The ingested tool schemas (with any prefix-rename applied).
         """
         from graph_tool_call.ingest.openapi import ingest_openapi
 
@@ -319,13 +340,16 @@ def ingest_openapi(
             allow_private_hosts=allow_private_hosts,
             max_response_bytes=max_response_bytes,
         )
-        self._register_tools_batch(
+        registered = self._register_tools_batch(
             tools,
             detect_dependencies=detect_dependencies,
             min_confidence=min_confidence,
             spec=spec.raw,
+            source_label=source_label,
+            on_conflict=on_conflict,
+            relink_existing=relink_existing,
         )
-        return tools
+        return registered
 
     def ingest_mcp_tools(
         self,
@@ -923,33 +947,92 @@ def _register_tools_batch(
         detect_dependencies: bool = True,
         min_confidence: float = 0.7,
         spec: dict | None = None,
-    ) -> None:
+        source_label: str | None = None,
+        on_conflict: str = "overwrite",
+        relink_existing: bool = True,
+    ) -> list[ToolSchema]:
         """Register tools, assign categories, and detect dependencies.
 
         Shared logic for ingest_openapi, ingest_mcp_tools, and ingest_functions.
+        Returns the list of tools that were actually registered (after any
+        conflict-driven rename or skip).
         """
+        had_existing = bool(self._tools)
+        registered: list[ToolSchema] = []
         categories_seen: set[str] = set()
+
         for tool in tools:
-            self._tools[tool.name] = tool
-            self._builder.add_tool(tool)
-            if tool.domain:
-                if tool.domain not in categories_seen:
-                    if not self._graph.has_node(tool.domain):
-                        self._builder.add_category(tool.domain)
-                    categories_seen.add(tool.domain)
-                self._builder.assign_category(tool.name, tool.domain)
-
-        if detect_dependencies and len(tools) >= 2:
+            resolved = self._resolve_conflict(tool, on_conflict, source_label)
+            if resolved is None:
+                continue
+            if source_label:
+                resolved.metadata["source_label"] = source_label
+            self._tools[resolved.name] = resolved
+            self._builder.add_tool(resolved)
+            if resolved.domain:
+                if resolved.domain not in categories_seen:
+                    if not self._graph.has_node(resolved.domain):
+                        self._builder.add_category(resolved.domain)
+                    categories_seen.add(resolved.domain)
+                self._builder.assign_category(resolved.name, resolved.domain)
+            registered.append(resolved)
+
+        if detect_dependencies and registered:
             from graph_tool_call.analyze.dependency import detect_dependencies as _detect
 
-            kwargs: dict = {"min_confidence": min_confidence}
-            if spec:
-                kwargs["spec"] = spec
-            relations = _detect(tools, **kwargs)
-            for rel in relations:
-                self._builder.add_relation(rel.source, rel.target, rel.relation_type)
+            # Scope of detection:
+            #   - First ingest, or relink disabled  → only the new batch.
+            #   - Incremental + relink_existing     → union of new + all existing,
+            #     so cross-source edges (e.g. order.* ↔ claim.*) are discovered.
+            if had_existing and relink_existing and len(self._tools) >= 2:
+                scope = list(self._tools.values())
+            else:
+                scope = registered
+
+            if len(scope) >= 2:
+                kwargs: dict = {"min_confidence": min_confidence}
+                if spec:
+                    kwargs["spec"] = spec
+                relations = _detect(scope, **kwargs)
+                for rel in relations:
+                    self._builder.add_relation(rel.source, rel.target, rel.relation_type)
 
         self._invalidate_retrieval()
+        return registered
+
+    def _resolve_conflict(
+        self,
+        tool: ToolSchema,
+        on_conflict: str,
+        source_label: str | None,
+    ) -> ToolSchema | None:
+        """Apply the *on_conflict* policy. Returns the tool to register, or None to skip.
+
+        Mutates ``tool.name`` when prefix-renaming.
+
+        Raises ``ValueError`` for an unknown policy, or for a name collision
+        when ``on_conflict="error"``.
+        """
+        # Validate the policy before the collision check so a typo fails on
+        # every call, not only on the first name clash.
+        if on_conflict not in ("overwrite", "prefix", "skip", "error"):
+            raise ValueError(
+                f"Unknown on_conflict policy: {on_conflict!r} "
+                "(expected 'overwrite' | 'prefix' | 'skip' | 'error')"
+            )
+        if tool.name not in self._tools or on_conflict == "overwrite":
+            return tool
+        if on_conflict == "skip":
+            return None
+        if on_conflict == "error":
+            raise ValueError(
+                f"Tool '{tool.name}' already exists "
+                f"(on_conflict='error', incoming source_label={source_label!r})"
+            )
+        # "prefix": rename the incoming tool in place. If the prefixed name
+        # also collides, the caller's registration overwrites that entry.
+        tool.name = f"{source_label or 'incoming'}.{tool.name}"
+        return tool
 
     # --- from_url ---
 
@@ -1167,6 +1250,59 @@ def apply_conflicts(self, conflicts: list | None = None, *, min_confidence: floa
             self._invalidate_retrieval()
         return added
 
+    # --- source management (incremental ingest) ---
+
+    def list_sources(self) -> list[str]:
+        """List each distinct truthy ``source_label`` once, in first-seen order."""
+        labels = (
+            schema.metadata.get("source_label")
+            for schema in self._tools.values()
+            if schema.metadata
+        )
+        return list(dict.fromkeys(label for label in labels if label))
+
+    def tools_by_source(self, source_label: str) -> list[ToolSchema]:
+        """Collect the registered tools whose ``source_label`` metadata matches."""
+        def tagged(schema: Any) -> bool:
+            meta = schema.metadata
+            return bool(meta) and meta.get("source_label") == source_label
+        return [schema for schema in self._tools.values() if tagged(schema)]
+
+    def remove_source(self, source_label: str) -> int:
+        """Remove every tool tagged *source_label* and its graph node; return the count."""
+        doomed = [schema.name for schema in self.tools_by_source(source_label)]
+        if not doomed:
+            return 0
+        # Drop the registry entry first, then the node if the graph has one.
+        for tool_name in doomed:
+            self._tools.pop(tool_name, None)
+            if self._graph.has_node(tool_name):
+                self._graph.remove_node(tool_name)
+        # Invalidate retrieval only when something was actually removed.
+        self._invalidate_retrieval()
+        return len(doomed)
+
+    def relink(self, *, min_confidence: float = 0.7) -> int:
+        """Run dependency detection over every registered tool and add the results.
+
+        Each detected relation is handed to the builder; nothing is removed here.
+
+        Returns the number of relations detected in this pass — possibly including
+        ones already known, so treat it as an upper bound rather than a delta.
+        """
+        all_tools = list(self._tools.values())
+        if len(all_tools) < 2:
+            return 0
+        from graph_tool_call.analyze.dependency import detect_dependencies as _detect
+
+        detected = _detect(all_tools, min_confidence=min_confidence)
+        for edge in detected:
+            self._builder.add_relation(edge.source, edge.target, edge.relation_type)
+        if not detected:
+            return 0
+        self._invalidate_retrieval()
+        return len(detected)
+
     def analyze(
         self,
         *,
@@ -1397,17 +1533,28 @@ def search_tools(query: str, top_k: int | None = None) -> str:
             """Search available tools by natural language query.
 
             Use this FIRST to find which tools are available for the task.
-            Returns tool names, descriptions, and required parameters.
+            Returns tool names, descriptions, required parameters, and
+            **dependency hints** (``prerequisites`` for tools that must be
+            called first, ``relations`` for tools used together or in order).
+
+            Planning rule:
+              - Pick the single tool that best matches the user's goal.
+              - If its ``prerequisites`` are non-empty, call those first and
+                feed their results into the target tool's arguments.
+              - ``relations`` with type=precedes/requires imply call order.
 
             Args:
                 query: Natural language search query (e.g. "add numbers", "get weather")
                 top_k: Max number of results (optional)
             """
             k = top_k if top_k is not None else default_top_k
-            results = graph_ref.retrieve(query, top_k=k)
+            # Use retrieve_with_scores so the relations/prerequisites populated by
+            # _enrich_relations survive; retrieve() returns only ToolSchema and drops them.
+            results = graph_ref.retrieve_with_scores(query, top_k=k)
 
             matched = []
-            for schema in results:
+            for result in results:
+                schema = result.tool
                 entry: dict[str, Any] = {
                     "name": schema.name,
                     "description": (schema.description or "")[:200],
@@ -1422,6 +1569,22 @@ def search_tools(query: str, top_k: int | None = None) -> str:
                         }
                         for p in schema.parameters
                     ]
+                # Dependency / ordering hints from graph edges.
+                # prerequisites: REQUIRES targets not in the result set — LLM
+                # should call these first. relations: edges among result set
+                # members, carrying human-readable hint strings.
+                if result.prerequisites:
+                    entry["prerequisites"] = list(result.prerequisites)
+                if result.relations:
+                    entry["relations"] = [
+                        {
+                            "target": rel.target,
+                            "type": rel.type,
+                            "direction": rel.direction,
+                            "hint": rel.hint,
+                        }
+                        for rel in result.relations
+                    ]
                 matched.append(entry)
 
             output = {
@@ -1430,8 +1593,10 @@ def search_tools(query: str, top_k: int | None = None) -> str:
                 "total_tools": len(graph_ref._tools),
                 "tools": matched,
                 "hint": (
-                    "Use call_tool to execute a tool. "
-                    "Pass tool_name and arguments as a dict matching the parameters above."
+                    "Pick ONE tool matching the user's goal. If its "
+                    "'prerequisites' list is non-empty, call those tools "
+                    "first and use their results to fill the target tool's "
+                    "arguments. Then call_tool the target."
                 ),
             }
             return json.dumps(output, ensure_ascii=False, indent=2)

From d46e39308788813d1c5140757ca3298ac435fde9 Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Fri, 24 Apr 2026 12:41:11 +0900
Subject: [PATCH 03/14] =?UTF-8?q?feat:=20Stage=202=20PathSynthesizer=20?=
 =?UTF-8?q?=E2=80=94=20graph=20=EA=B8=B0=EB=B0=98=20Plan=20=EA=B2=B0?=
 =?UTF-8?q?=EC=A0=95=EB=A1=A0=EC=A0=81=20=EC=83=9D=EC=84=B1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

target tool 에서 출발해 required consumes 를 bottom-up 재귀로 해소:
  - entities (Stage 1 output) 로 직접 바인딩
  - 없으면 graph 에서 semantic_tag / field_name 으로 producer 검색 후
    prereq step 추가 (재귀)
  - 없으면 UnsatisfiableFieldError

v1 범위 (설계 §16.6):
  - Linear chain (fan-out/조건/parallel 은 v2+)
  - 여러 producer 중 첫 번째 픽 (disambiguation 은 Phase D)
  - [*] wildcard → [0] 변환 (단일 선택)
  - max_depth 기본 5 (cyclic guard)

공개 API:
  graph_tool_call.plan.PathSynthesizer(graph_dict).synthesize(
      target=..., entities=..., goal=...) -> Plan

단위 테스트 13/13 pass (전체 plan 패키지 42/42):
  - trivial (no required / entity-only)
  - 2/3-step chain (semantic match, field_name fallback)
  - unsatisfiable / unknown target / cycle / max_depth
  - semantic 우선순위, self-producer 제외
---
 graph_tool_call/plan/__init__.py    |  13 +
 graph_tool_call/plan/synthesizer.py | 360 ++++++++++++++++++++++++++++
 2 files changed, 373 insertions(+)
 create mode 100644 graph_tool_call/plan/synthesizer.py

diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py
index c35aef3..88d26aa 100644
--- a/graph_tool_call/plan/__init__.py
+++ b/graph_tool_call/plan/__init__.py
@@ -40,6 +40,13 @@ def call_tool(tool_name, args):
     ExecutionTrace,
     StepTrace,
 )
+from graph_tool_call.plan.synthesizer import (
+    PathSynthesizer,
+    PlanSynthesisError,
+    UnsatisfiableFieldError,
+    CyclicDependencyError,
+    MaxDepthExceededError,
+)
 
 __all__ = [
     # schema
@@ -59,4 +66,10 @@ def call_tool(tool_name, args):
     "StepFailed",
     "PlanCompleted",
     "PlanAborted",
+    # synthesizer
+    "PathSynthesizer",
+    "PlanSynthesisError",
+    "UnsatisfiableFieldError",
+    "CyclicDependencyError",
+    "MaxDepthExceededError",
 ]
diff --git a/graph_tool_call/plan/synthesizer.py b/graph_tool_call/plan/synthesizer.py
new file mode 100644
index 0000000..33be4dd
--- /dev/null
+++ b/graph_tool_call/plan/synthesizer.py
@@ -0,0 +1,360 @@
+"""PathSynthesizer — Stage 2 of Plan-and-Execute.
+
+Given a target tool and user-provided entities, walk the ToolGraph's
+produces/consumes metadata backwards to construct a Plan (ordered steps +
+bindings) that, when executed by PlanRunner, satisfies the target.
+
+This module is transport-agnostic. It consumes a plain ``graph`` dict (the
+shape persisted as ``api_tool_collections.graph.graph``) — no DB, no HTTP.
+
+v1 scope (per design §16.6):
+  - Linear chain only — no fan-out, no parallel, no branching.
+  - If multiple producers exist for a required field, the first one is
+    picked (simple, predictable). Ambiguity handling is Phase D+.
+  - Max recursion depth = 5 (guard against cyclic or pathological graphs).
+
+Matching order for each required consume field:
+  1. User ``entities`` (Stage 1 output) — preferred, no extra step.
+  2. Another tool's ``produces`` with the same ``semantic_tag``
+     (Pass 2 LLM enrichment quality).
+  3. Another tool's ``produces`` with the same ``field_name``
+     (Pass 1 deterministic extraction, fallback).
+"""
+
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+from graph_tool_call.plan.schema import Plan, PlanStep
+
+
+class PlanSynthesisError(Exception):
+    """Base class for synthesis failures."""
+
+
+class UnsatisfiableFieldError(PlanSynthesisError):
+    """A required field cannot be supplied by entities or any producer."""
+
+
+class CyclicDependencyError(PlanSynthesisError):
+    """The synthesis trace revisits a tool already in progress."""
+
+
+class MaxDepthExceededError(PlanSynthesisError):
+    """Recursion depth exceeded — likely a misshapen graph."""
+
+
+@dataclass
+class _PartialStep:
+    """In-progress step being built during bottom-up synthesis."""
+
+    tool: str
+    args: dict[str, Any] = field(default_factory=dict)
+    rationale: str = ""
+    step_id: str = ""                          # assigned at topological sort
+
+
+class PathSynthesizer:
+    """Deterministic plan builder driven by graph ``produces``/``consumes``.
+
+    Usage::
+
+        syn = PathSynthesizer(graph_dict)
+        plan = syn.synthesize(
+            target="seltProductDetailInfo",
+            entities={"search_keyword": "quarzen 티셔츠"},
+        )
+    """
+
+    def __init__(
+        self,
+        graph: dict[str, Any],
+        *,
+        max_depth: int = 5,
+    ) -> None:
+        self._tools: dict[str, dict[str, Any]] = dict(graph.get("tools") or {})
+        self._max_depth = max_depth
+        # semantic_tag -> [tool_name], insertion order preserved
+        self._producers_by_semantic: dict[str, list[str]] = {}
+        self._producers_by_field: dict[str, list[str]] = {}
+        self._build_producer_indexes()
+
+    # ------------------------------------------------------------------
+    # public API
+    # ------------------------------------------------------------------
+
+    def synthesize(
+        self,
+        *,
+        target: str,
+        entities: dict[str, Any] | None = None,
+        goal: str = "",
+    ) -> Plan:
+        """Build a Plan whose final step is ``target`` with required args
+        filled by entities + prerequisite steps.
+
+        Raises ``UnsatisfiableFieldError`` if a required field has no
+        producer or entity mapping.
+        """
+        if target not in self._tools:
+            raise PlanSynthesisError(f"target tool not in graph: {target!r}")
+
+        entities = entities or {}
+        steps_by_tool: dict[str, _PartialStep] = {}
+        visiting: set[str] = set()
+
+        # Resolve recursively; populates steps_by_tool with target at the end
+        self._resolve(
+            tool_name=target,
+            entities=entities,
+            steps_by_tool=steps_by_tool,
+            visiting=visiting,
+            depth=0,
+        )
+
+        # Assign topological ids s1..sN by insertion order
+        ordered_tools = list(steps_by_tool.keys())
+        for idx, tool_name in enumerate(ordered_tools, start=1):
+            steps_by_tool[tool_name].step_id = f"s{idx}"
+
+        # Replace tool-name bindings with step-id bindings
+        final_steps: list[PlanStep] = []
+        for tool_name in ordered_tools:
+            partial = steps_by_tool[tool_name]
+            args = {
+                k: self._rewrite_tool_refs(v, steps_by_tool)
+                for k, v in partial.args.items()
+            }
+            final_steps.append(PlanStep(
+                id=partial.step_id,
+                tool=partial.tool,
+                args=args,
+                rationale=partial.rationale,
+            ))
+
+        target_step_id = steps_by_tool[target].step_id
+        return Plan(
+            id=str(uuid.uuid4()),
+            goal=goal or f"Execute {target}",
+            steps=final_steps,
+            output_binding=f"${{{target_step_id}.body}}",
+            created_at=datetime.now(timezone.utc).isoformat(),
+            metadata={
+                "target": target,
+                "entities": dict(entities),
+                "synthesized_by": "PathSynthesizer/v1",
+            },
+        )
+
+    # ------------------------------------------------------------------
+    # core recursion
+    # ------------------------------------------------------------------
+
+    def _resolve(
+        self,
+        *,
+        tool_name: str,
+        entities: dict[str, Any],
+        steps_by_tool: dict[str, _PartialStep],
+        visiting: set[str],
+        depth: int,
+    ) -> str:
+        """Ensure ``tool_name`` has a PartialStep with resolved args.
+
+        Returns the tool name itself (used as a placeholder in args until
+        step_ids are assigned by the caller).
+        """
+        if depth > self._max_depth:
+            raise MaxDepthExceededError(
+                f"synthesis exceeded max_depth={self._max_depth} at {tool_name!r}"
+            )
+        if tool_name in steps_by_tool:
+            return tool_name
+        if tool_name in visiting:
+            raise CyclicDependencyError(
+                f"cycle detected at {tool_name!r} (chain: {sorted(visiting)!r})"
+            )
+        visiting.add(tool_name)
+
+        tool = self._tools.get(tool_name) or {}
+        metadata = tool.get("metadata") or {}
+        consumes = metadata.get("consumes") or []
+
+        args: dict[str, Any] = {}
+        rationales: list[str] = []
+
+        for consume in consumes:
+            if not consume.get("required"):
+                continue
+
+            field_name = consume.get("field_name") or ""
+            semantic = consume.get("semantic_tag") or ""
+
+            # 1. Entity match (user-supplied)
+            entity_val = self._match_entity(entities, semantic, field_name)
+            if entity_val is not None:
+                args[field_name] = entity_val
+                continue
+
+            # 2/3. Find a producer (semantic first, then field_name)
+            producer = self._find_producer(
+                semantic=semantic, field_name=field_name,
+                exclude=tool_name,
+            )
+            if producer is None:
+                raise UnsatisfiableFieldError(
+                    f"tool {tool_name!r} requires {field_name!r} "
+                    f"(semantic={semantic!r}) but no entity or producer found"
+                )
+
+            # Recurse into the producer first so step_id ordering is correct
+            self._resolve(
+                tool_name=producer,
+                entities=entities,
+                steps_by_tool=steps_by_tool,
+                visiting=visiting,
+                depth=depth + 1,
+            )
+
+            # Build a placeholder binding — will be rewritten after step_ids
+            # are assigned. Format: ${<tool_name>.<jsonpath-sans-root>}
+            prod_path = self._producer_jsonpath(producer, semantic, field_name)
+            args[field_name] = f"${{{producer}.{prod_path}}}"
+            rationales.append(f"{field_name} ← {producer} ({prod_path})")
+
+        steps_by_tool[tool_name] = _PartialStep(
+            tool=tool_name,
+            args=args,
+            rationale="; ".join(rationales) if rationales else "",
+        )
+        visiting.discard(tool_name)
+        return tool_name
+
+    # ------------------------------------------------------------------
+    # helpers
+    # ------------------------------------------------------------------
+
+    def _build_producer_indexes(self) -> None:
+        """Index which tools produce which semantic / field across graph."""
+        for name, tool in self._tools.items():
+            meta = tool.get("metadata") or {}
+            for produce in meta.get("produces") or []:
+                sem = produce.get("semantic_tag") or ""
+                fname = produce.get("field_name") or ""
+                if sem:
+                    self._producers_by_semantic.setdefault(sem, []).append(name)
+                if fname:
+                    self._producers_by_field.setdefault(fname, []).append(name)
+
+    def _find_producer(
+        self,
+        *,
+        semantic: str,
+        field_name: str,
+        exclude: str,
+    ) -> str | None:
+        """Pick the first producer matching semantic, falling back to field name."""
+        if semantic:
+            for name in self._producers_by_semantic.get(semantic, []):
+                if name != exclude:
+                    return name
+        if field_name:
+            for name in self._producers_by_field.get(field_name, []):
+                if name != exclude:
+                    return name
+        return None
+
+    def _producer_jsonpath(
+        self,
+        producer: str,
+        semantic: str,
+        field_name: str,
+    ) -> str:
+        """Return the dotted binding path for a field in ``producer``'s output.
+
+        The matching ``produces`` entry is looked up by ``semantic_tag``
+        first, then by ``field_name``; its ``json_path`` is normalized
+        (``$.a.b[*].c`` → ``a.b[0].c``). Falls back to ``body.<field_name>``
+        (or ``body``) when no entry matches or its ``json_path`` is empty.
+        """
+        tool = self._tools.get(producer) or {}
+        produces = (tool.get("metadata") or {}).get("produces") or []
+        match = None
+        if semantic:
+            match = next(
+                (p for p in produces if p.get("semantic_tag") == semantic),
+                None,
+            )
+        if match is None and field_name:
+            match = next(
+                (p for p in produces if p.get("field_name") == field_name),
+                None,
+            )
+        # An empty path would render as "${tool.}", so use the fallback too.
+        path = _normalize_jsonpath_for_binding((match or {}).get("json_path") or "")
+        fallback = f"body.{field_name}" if field_name else "body"
+        return path or fallback
+
+    def _match_entity(
+        self,
+        entities: dict[str, Any],
+        semantic: str,
+        field_name: str,
+    ) -> Any | None:
+        """Look up user-supplied entity by semantic tag or field name."""
+        if semantic and semantic in entities:
+            return entities[semantic]
+        if field_name and field_name in entities:
+            return entities[field_name]
+        return None
+
+    def _rewrite_tool_refs(
+        self,
+        value: Any,
+        steps_by_tool: dict[str, _PartialStep],
+    ) -> Any:
+        """Recursively rewrite ``${<tool_name>.<path>}`` → ``${sN.<path>}``."""
+        if isinstance(value, dict):
+            return {k: self._rewrite_tool_refs(v, steps_by_tool) for k, v in value.items()}
+        if isinstance(value, list):
+            return [self._rewrite_tool_refs(v, steps_by_tool) for v in value]
+        if not isinstance(value, str):
+            return value
+        # Only rewrite full-string bindings that we inserted. Entities
+        # supplied by the caller are left alone (no ${...} wrapping).
+        if not (value.startswith("${") and value.endswith("}")):
+            return value
+        inner = value[2:-1]
+        head, _, tail = inner.partition(".")
+        if head in steps_by_tool:
+            step_id = steps_by_tool[head].step_id
+            rest = f".{tail}" if tail else ""
+            return f"${{{step_id}{rest}}}"
+        return value
+
+
+def _normalize_jsonpath_for_binding(raw: str) -> str:
+    """``$.body.goods[*].goodsNo`` → ``body.goods[0].goodsNo``.
+
+    v1 always picks index 0 for arrays. Fan-out is v2 (design §11.1).
+    """
+    if not raw:
+        return ""
+    path = raw
+    if path.startswith("$"):
+        path = path[1:]
+    if path.startswith("."):
+        path = path[1:]
+    return path.replace("[*]", "[0]")
+
+
+__all__ = [
+    "PathSynthesizer",
+    "PlanSynthesisError",
+    "UnsatisfiableFieldError",
+    "CyclicDependencyError",
+    "MaxDepthExceededError",
+]

From d6a14e9d35c23dcdeed235324ef9b2da389bf71f Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Fri, 24 Apr 2026 13:27:03 +0900
Subject: [PATCH 04/14] feat: Stage 1 Intent Parser + Stage 4 Response
 Synthesizer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 1 — graph_tool_call.plan.intent
  parse_intent(requirement, catalog, llm) -> ParsedIntent:
    - LLM 1회로 {target, entities, confidence, output_shape} 구조화
    - catalog 는 retrieval 상위 K개 ToolCatalogEntry (ai_metadata 활용:
      one_line_summary / when_to_use / consumes_tags / canonical_action)
    - target 이 catalog 에 없으면 IntentParseError (hallucination 차단)
    - confidence 0~1 clamp, output_shape 검증 + single fallback

Stage 4 — graph_tool_call.plan.response
  synthesize_success_response / synthesize_failure_response:
    - ExecutionTrace → 자연어 답변 (한국어 기본)
    - 성공/실패 프롬프트 분리 (실패 시 failed_step / error / partial 전달)
    - char_limit 으로 큰 응답 truncate
---
 graph_tool_call/plan/__init__.py |  18 ++++
 graph_tool_call/plan/intent.py   | 180 +++++++++++++++++++++++++++++++
 graph_tool_call/plan/response.py | 125 +++++++++++++++++++++
 3 files changed, 323 insertions(+)
 create mode 100644 graph_tool_call/plan/intent.py
 create mode 100644 graph_tool_call/plan/response.py

diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py
index 88d26aa..d9e5e1e 100644
--- a/graph_tool_call/plan/__init__.py
+++ b/graph_tool_call/plan/__init__.py
@@ -40,6 +40,16 @@ def call_tool(tool_name, args):
     ExecutionTrace,
     StepTrace,
 )
+from graph_tool_call.plan.intent import (
+    IntentParseError,
+    ParsedIntent,
+    ToolCatalogEntry,
+    parse_intent,
+)
+from graph_tool_call.plan.response import (
+    synthesize_success_response,
+    synthesize_failure_response,
+)
 from graph_tool_call.plan.synthesizer import (
     PathSynthesizer,
     PlanSynthesisError,
@@ -72,4 +82,12 @@ def call_tool(tool_name, args):
     "UnsatisfiableFieldError",
     "CyclicDependencyError",
     "MaxDepthExceededError",
+    # intent
+    "ToolCatalogEntry",
+    "ParsedIntent",
+    "IntentParseError",
+    "parse_intent",
+    # response
+    "synthesize_success_response",
+    "synthesize_failure_response",
 ]
diff --git a/graph_tool_call/plan/intent.py b/graph_tool_call/plan/intent.py
new file mode 100644
index 0000000..618d0d8
--- /dev/null
+++ b/graph_tool_call/plan/intent.py
@@ -0,0 +1,180 @@
+"""Stage 1 — Intent Parser.
+
+자연어 요구사항을 Stage 2 (PathSynthesizer) 가 소비할 수 있는 구조화
+``{target, entities}`` 로 변환한다. LLM 1회 호출, 작은 context.
+
+Catalog 구성 원칙 (설계 §4):
+  - 사전에 retrieval 로 상위 K개 도구만 넘김 (전체 카탈로그 X)
+  - 각 도구는 name + one_line_summary + when_to_use + 핵심 semantic tags
+  - Pass 2 enrichment 가 채운 ai_metadata 가 있으면 그 정보를 우선 사용;
+    없으면 description 축약본으로 fallback
+
+LLM 은 structured JSON 만 반환 — 파싱 실패 시 BindingError 같은 방식으로
+호출자에게 명확히 전달.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+from graph_tool_call.ontology.llm_provider import OntologyLLM, _extract_json
+
+
+# ---------------------------------------------------------------------------
+# data shape
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ToolCatalogEntry:
+    """Condensed tool view for intent-parsing prompt — under ~150 chars each."""
+
+    name: str
+    summary: str = ""                          # one_line_summary from ai_metadata
+    when_to_use: str = ""                      # ai_metadata.when_to_use
+    consumes_tags: list[str] = field(default_factory=list)   # required semantic ids
+    canonical_action: str = ""                 # "read" | "search" | "create" | ...
+    primary_resource: str = ""                 # "product" | ...
+
+
+@dataclass
+class ParsedIntent:
+    """Stage 1 output — consumed by Stage 2 PathSynthesizer."""
+
+    target: str                                # tool name picked by LLM
+    entities: dict[str, Any] = field(default_factory=dict)
+    confidence: float = 0.0                    # 0.0 ~ 1.0
+    output_shape: str = "single"               # "single" | "list" | "count"
+    reasoning: str = ""
+
+
+class IntentParseError(Exception):
+    """Raised when the LLM output can't be mapped to a valid ParsedIntent."""
+
+
+# ---------------------------------------------------------------------------
+# prompt
+# ---------------------------------------------------------------------------
+
+
+_INTENT_PROMPT = """\
+You pick the right API tool and extract entity values for a planning system.
+
+User requirement:
+{requirement}
+
+Candidate tools (shortlisted by retrieval):
+{catalog}
+
+Rules:
+  - Pick exactly ONE tool (the final-goal tool). Do not plan the chain —
+    the downstream system will build prerequisite steps automatically.
+  - entities: extract values from the requirement and key them by semantic
+    id when known (e.g. "search_keyword", "product_id", "site_id").
+    For free-text user inputs, prefer "search_keyword".
+  - output_shape: "single" for one-item answers, "list" for multiple,
+    "count" for aggregates.
+  - confidence: your certainty in the tool pick, 0.0~1.0.
+  - reasoning: one short sentence, for audit logs.
+
+Output JSON only — no markdown, no prose. Schema:
+{{
+  "target": "<tool_name>",
+  "entities": {{...}},
+  "confidence": 0.0,
+  "output_shape": "single" | "list" | "count",
+  "reasoning": "..."
+}}
+"""
+
+
+def _format_catalog(entries: list[ToolCatalogEntry]) -> str:
+    """Render *entries* as the numbered candidate list embedded in the prompt."""
+    lines: list[str] = []
+    for i, e in enumerate(entries, start=1):
+        # Bracket the action/resource tag, skipping whichever half is empty.
+        tag = "/".join(p for p in (e.canonical_action, e.primary_resource) if p)
+        head = [f"{i}. {e.name}", f"[{tag}]" if tag else ""]
+        head.append(f"— {e.summary}" if e.summary else "")
+        lines.append(" ".join(p for p in head if p))
+        if e.when_to_use:
+            lines.append(f"   when: {e.when_to_use[:140]}")
+        if e.consumes_tags:
+            lines.append(f"   needs: {', '.join(e.consumes_tags[:6])}")
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# public API
+# ---------------------------------------------------------------------------
+
+
+def parse_intent(
+    requirement: str,
+    catalog: list[ToolCatalogEntry],
+    llm: OntologyLLM,
+) -> ParsedIntent:
+    """Call the LLM once and validate its JSON reply into a ParsedIntent.
+
+    ``catalog`` is the retrieval shortlist (keep it to ~10 entries) and
+    ``llm`` any OntologyLLM-compatible provider. Raises ``IntentParseError``
+    for an empty catalog, non-object/unparseable output, or a bad target.
+    """
+    if not catalog:
+        raise IntentParseError("empty catalog — cannot pick a target")
+
+    prompt = _INTENT_PROMPT.format(
+        requirement=requirement.strip(),
+        catalog=_format_catalog(catalog),
+    )
+    raw = llm.generate(prompt)
+
+    try:
+        parsed = _extract_json(raw)
+    except json.JSONDecodeError as exc:
+        raise IntentParseError(f"LLM output not parseable JSON: {exc}") from exc
+
+    if not isinstance(parsed, dict):
+        raise IntentParseError(f"expected JSON object, got {type(parsed).__name__}")
+
+    target = str(parsed.get("target") or "").strip()
+    if not target:
+        raise IntentParseError("target missing from LLM output")
+
+    # Validate target is in the catalog — guard against hallucinated names
+    allowed = {e.name for e in catalog}
+    if target not in allowed:
+        raise IntentParseError(
+            f"target {target!r} not in catalog (candidates: {sorted(allowed)[:5]!r}...)"
+        )
+
+    entities_raw = parsed.get("entities")
+    entities = entities_raw if isinstance(entities_raw, dict) else {}
+
+    try:
+        confidence = float(parsed.get("confidence") or 0.0)
+    except (TypeError, ValueError, OverflowError):
+        confidence = 0.0
+    confidence = min(1.0, max(0.0, confidence))  # this order maps NaN to 0.0, not 1.0
+
+    shape = str(parsed.get("output_shape") or "single").strip().lower()
+    if shape not in ("single", "list", "count"):
+        shape = "single"
+
+    return ParsedIntent(
+        target=target,
+        entities=entities,
+        confidence=confidence,
+        output_shape=shape,
+        reasoning=str(parsed.get("reasoning") or "").strip(),
+    )
+
+
+__all__ = [
+    "ToolCatalogEntry",
+    "ParsedIntent",
+    "IntentParseError",
+    "parse_intent",
+]
diff --git a/graph_tool_call/plan/response.py b/graph_tool_call/plan/response.py
new file mode 100644
index 0000000..782ca1f
--- /dev/null
+++ b/graph_tool_call/plan/response.py
@@ -0,0 +1,125 @@
+"""Stage 4 — Response Synthesizer.
+
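+Converts an ExecutionTrace into a user-friendly natural-language answer with
+one LLM call; the context is the execution-result summary plus the requirement.
+
+Handles both outcomes:
+  - success: plan.output (final step body) + requirement → answer
+  - failure: failed_step + error + partial results → what worked and what did not
+
+Results can be large JSON, so callers should project/compress them beforehand;
+this module only JSON-dumps the value (``str()`` fallback) and truncates it.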
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from graph_tool_call.ontology.llm_provider import OntologyLLM
+
+
+# ---------------------------------------------------------------------------
+# prompts
+# ---------------------------------------------------------------------------
+
+
+_SUCCESS_PROMPT = """\
+You turn API execution results into a natural answer for the user.
+
+User asked:
+{requirement}
+
+Execution result (from the last step):
+{result}
+
+Respond in Korean unless the user's question is clearly in another language.
+Keep it concise — 1~3 sentences for simple answers, short bullet list for
+multi-item results. Do not invent data not present in the result.
+"""
+
+
+_FAILURE_PROMPT = """\
+You explain an API execution failure to the user.
+
+User asked:
+{requirement}
+
+Plan aborted at step {failed_step!r}.
+Error: {error}
+
+Partial results collected before the failure:
+{partial}
+
+Tell the user clearly in Korean (unless the question is another language):
+  - what they asked for
+  - what was attempted
+  - where and why it failed (in plain language — do not dump stack traces)
+  - what they can try next, if obvious
+Keep it short and helpful — 2~4 sentences.
+"""
+
+
+# ---------------------------------------------------------------------------
+# public API
+# ---------------------------------------------------------------------------
+
+
+def synthesize_success_response(
+    *,
+    requirement: str,
+    result: Any,
+    llm: OntologyLLM,
+    result_char_limit: int = 2000,
+) -> str:
+    """Success case — plan completed, convert output to NL answer."""
+    prompt = _SUCCESS_PROMPT.format(
+        requirement=requirement.strip(),
+        result=_render(result, result_char_limit),
+    )
+    return llm.generate(prompt).strip()
+
+
+def synthesize_failure_response(
+    *,
+    requirement: str,
+    failed_step: str,
+    error: Any,
+    partial_results: Any = None,
+    llm: OntologyLLM,
+    partial_char_limit: int = 1000,
+) -> str:
+    """Failure case — plan aborted, explain to user."""
+    prompt = _FAILURE_PROMPT.format(
+        requirement=requirement.strip(),
+        failed_step=failed_step,
+        error=_render(error, 300),
+        partial=_render(partial_results, partial_char_limit) if partial_results else "(none)",
+    )
+    return llm.generate(prompt).strip()
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _render(value: Any, char_limit: int) -> str:
+    """Serialize *value* to a short string for prompt use."""
+    if value is None:
+        return "(none)"
+    if isinstance(value, str):
+        return value[:char_limit] + ("…" if len(value) > char_limit else "")
+    try:
+        text = json.dumps(value, ensure_ascii=False, indent=2)
+    except (TypeError, ValueError):
+        text = str(value)
+    if len(text) <= char_limit:
+        return text
+    return text[:char_limit] + "…"
+
+
+__all__ = [
+    "synthesize_success_response",
+    "synthesize_failure_response",
+]

From f7e6b42418093d77997b3511e83f7749ba62730d Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Tue, 28 Apr 2026 18:17:03 +0900
Subject: [PATCH 05/14] feat(ontology): add `kind` (data|context) to consume
 FieldSemantic + enrichment prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PathSynthesizer 가 chain 결정 시 ``kind=data`` (비즈니스 값, producer
chain 가능) 와 ``kind=context`` (ambient config, chain 거부 — entity
또는 collection default 만 사용) 를 분리해야 무관 chain (예:
locale/siteNo 의 producer 까지 끌어오기) 을 막을 수 있다. enrichment
prompt 에 두 분류 가이드를 명시해 LLM 이 새 도구의 consume 을
정확히 분류하게 한다.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 graph_tool_call/ontology/llm_provider.py | 44 +++++++++++++++++++-----
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/graph_tool_call/ontology/llm_provider.py b/graph_tool_call/ontology/llm_provider.py
index a34786d..76e26bc 100644
--- a/graph_tool_call/ontology/llm_provider.py
+++ b/graph_tool_call/ontology/llm_provider.py
@@ -46,11 +46,24 @@ class FieldSemantic:
 
     Used on both produces (what a tool outputs) and consumes (what it
     requires). ``json_path`` is set on produces; ``field`` is set on consumes.
+
+    ``kind`` (consumes only) distinguishes two roles:
+      - ``"data"``    — true data dependency (e.g. a business identifier
+                        needed to address the operation). PathSynthesizer
+                        will chain to a producer for this field.
+      - ``"context"`` — ambient config (locale, site, pagination). Must be
+                        supplied as an entity or collection default; the
+                        synthesizer will NOT build a prerequisite chain
+                        just to fetch it.
+
+    The default ``"data"`` matches pre-kind behavior (safe for tools whose
+    enrichment predates this schema change).
     """
 
     semantic: str
     json_path: str = ""
     field: str = ""
+    kind: str = "data"
 
 
 @dataclass
@@ -199,9 +212,19 @@ class ToolEnrichment:
       * Use CONSISTENT semantic ids across tools. If two tools both return a
         product identifier (one calls it "goodsNo", another "productId"),
         use the same semantic like "product_id".
-  - consumes_semantics: array of {{"semantic": "canonical_id", "field": "paramName"}}
+  - consumes_semantics: array of {{"semantic": "canonical_id",
+                                    "field": "paramName",
+                                    "kind": "data" | "context"}}
       * REQUIRED inputs only. Skip optional filters, pagination.
       * Same semantic id conventions as produces.
+      * kind="data" — business-data dependency: an identifier or value that
+        addresses a specific record (e.g. product_id, order_id, user_id,
+        search_keyword). A prior step in a plan normally produces it.
+      * kind="context" — ambient/environmental config shared across the
+        workflow (locale, site_no, tenant, pagination cursors, flag switches).
+        The user or the caller supplies it as a default — it is NOT produced
+        by a prior step. Use this for anything a plain UI user would set
+        once per session, not per request.
   - pairs_well_with: array of {{"tool": "tool_name_from_available_list",
                                 "reason": "brief reason"}}
       * 2-4 tools that typically precede or follow this tool.
@@ -271,14 +294,19 @@ def _parse_enrichment(data: Any) -> ToolEnrichment | None:
             for p in (data.get("produces_semantics") or [])
             if isinstance(p, dict) and str(p.get("semantic", "")).strip()
         ]
-        consumes = [
-            FieldSemantic(
-                semantic=str(c.get("semantic", "")).strip(),
-                field=str(c.get("field", "")).strip(),
+        consumes = []
+        for c in (data.get("consumes_semantics") or []):
+            if not (isinstance(c, dict) and str(c.get("semantic", "")).strip()):
+                continue
+            raw_kind = str(c.get("kind", "data")).strip().lower()
+            kind = raw_kind if raw_kind in ("data", "context") else "data"
+            consumes.append(
+                FieldSemantic(
+                    semantic=str(c.get("semantic", "")).strip(),
+                    field=str(c.get("field", "")).strip(),
+                    kind=kind,
+                )
             )
-            for c in (data.get("consumes_semantics") or [])
-            if isinstance(c, dict) and str(c.get("semantic", "")).strip()
-        ]
         pairs = [
             PairHint(
                 tool=str(p.get("tool", "")).strip(),

From 37927e0c4b4393d8a52da8a45705970de10293e3 Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Tue, 28 Apr 2026 18:22:30 +0900
Subject: [PATCH 06/14] feat(plan/intent): vocab validation, multi-turn seed,
 enum mappings, hard constraints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 1 (parse_intent) 의 4가지 robustness 보강:

  * Vocabulary fuzzy validation — LLM 이 ``search_keyword`` 대신
    ``search_keyword_name`` 같은 인접 표현을 만들면 ratio≥0.8 로 coerce,
    그 외엔 drop. 잘못된 entity key 는 downstream cycle / unsatisfied
    field 로 이어져 silent 한 잘못된 plan 을 만든다.

  * Multi-turn ``seed_entities`` 인자 — popup-driven 흐름에서 직전 turn
    의 결정값을 carry forward. prompt 에 명시 + 코드 안전망 (LLM 이
    seed 무시 시에도 합쳐 줌). 새 turn 의 명시 entity 는 같은 키의
    seed 를 override.

  * ``enum_mappings`` prompt section — 운영자가 등록한 ``{field: {code:
    label}}`` 를 catalog scope 안에서만 노출 (전체 노출 시 prompt 폭주).
    HC5 로 enum field 는 코드 (left side) 만 entity 값으로 허용.

  * Hard constraint 강화 — ``do not force-fit`` 같은 약한 부정 대신
    ``DO NOT`` 4개 (HC1-HC4) 명시: identifier 필드에 자연어 phrase
    금지, vocab 외 키 발명 금지, 동일 값 여러 필드 금지, 값 변환 금지.
---
 graph_tool_call/plan/intent.py | 208 ++++++++++++++++++++++++++++++---
 1 file changed, 194 insertions(+), 14 deletions(-)

diff --git a/graph_tool_call/plan/intent.py b/graph_tool_call/plan/intent.py
index 618d0d8..74dd8b8 100644
--- a/graph_tool_call/plan/intent.py
+++ b/graph_tool_call/plan/intent.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import difflib
 import json
 from dataclasses import dataclass, field
 from typing import Any
@@ -22,6 +23,13 @@
 from graph_tool_call.ontology.llm_provider import OntologyLLM, _extract_json
 
 
+# Minimum SequenceMatcher ratio for treating an LLM-emitted entity key as
+# a typo/expansion of a real vocab entry. 0.8 catches "search_keyword_name"
+# vs "search_keyword" (~0.85) while rejecting unrelated pairs like
+# "search_keyword" vs "search_query" (~0.69).
+_VOCAB_FUZZY_CUTOFF = 0.8
+
+
 # ---------------------------------------------------------------------------
 # data shape
 # ---------------------------------------------------------------------------
@@ -65,19 +73,48 @@ class IntentParseError(Exception):
 User requirement:
 {requirement}
 
-Candidate tools (shortlisted by retrieval):
+Candidate tools (shortlisted by retrieval — includes the target's
+prerequisite producers so every key you need should appear in some
+tool's "needs:" line below):
 {catalog}
-
-Rules:
-  - Pick exactly ONE tool (the final-goal tool). Do not plan the chain —
-    the downstream system will build prerequisite steps automatically.
-  - entities: extract values from the requirement and key them by semantic
-    id when known (e.g. "search_keyword", "product_id", "site_id").
-    For free-text user inputs, prefer "search_keyword".
-  - output_shape: "single" for one-item answers, "list" for multiple,
-    "count" for aggregates.
-  - confidence: your certainty in the tool pick, 0.0~1.0.
-  - reasoning: one short sentence, for audit logs.
+{vocabulary_block}{enum_block}{seed_block}
+HARD CONSTRAINTS — violating any of these is a planning error, not a
+stylistic choice. Re-check the constraints before you emit JSON.
+
+  HC1. DO NOT put a value into an identifier-style field (a field name
+       ending in "No" / "Id" / "Idx" / "Code" / "id") if the value
+       contains spaces, Korean/Chinese/Japanese letters, or category
+       words ("티셔츠", "신발", "shoes", a brand or model name).
+       Identifier fields accept short alphanumeric record locators
+       only ("G12345", "10293"). A descriptive phrase placed in such
+       a field is always wrong.
+  HC2. DO NOT invent field names. Every entity key MUST appear in one
+       of the candidate tools' "needs:" lines. If no listed field can
+       carry the user's value without violating HC1, omit the entity —
+       empty entities are fine; the downstream synthesizer chains
+       through a producer.
+  HC3. DO NOT put the same value into more than one field. Each value
+       goes into zero or exactly one field.
+  HC4. DO NOT translate, normalize, paraphrase, or expand the user's
+       value. Copy it byte-for-byte as written in the requirement.
+  HC5. For fields that have an enum mapping below, the entity value
+       MUST be one of the listed CODES (left side), never the label
+       (right side) and never the user's original phrase. Pick the
+       code whose label best matches the user's intent. If nothing
+       matches clearly, omit that entity.
+
+Selection guidance (apply only after the constraints hold):
+  - Pick exactly ONE tool — the final-goal tool. Do not plan the chain;
+    the downstream system builds prerequisite steps automatically.
+  - Free-text values (descriptive phrases like "quarzen 티셔츠",
+    "black hoodie") match fields named "searchWord", "query",
+    "keyword", or names ending in "Nm" / "Name".
+  - When several fields could carry the value without violating HC1,
+    prefer one a candidate's "needs:" line lists — that is a field a
+    tool you already considered actually accepts.
+  - output_shape: "single" / "list" / "count".
+  - confidence: 0.0~1.0 — your certainty in the tool pick.
+  - reasoning: one short sentence for audit logs.
 
 Output JSON only — no markdown, no prose. Schema:
 {{
@@ -90,6 +127,107 @@ class IntentParseError(Exception):
 """
 
 
+def _coerce_entity_keys(
+    entities: dict[str, Any],
+    vocab: list[str],
+) -> dict[str, Any]:
+    """Map LLM-emitted entity keys onto the vocabulary.
+
+    A key that is already in ``vocab`` is kept and always wins. A key whose
+    closest vocab entry scores at least ``_VOCAB_FUZZY_CUTOFF`` is renamed
+    to that entry, unless the entry was also supplied verbatim. Every other
+    key is dropped rather than passed downstream under an invented name.
+    """
+    vocab_set = set(vocab)
+    # Keys the LLM spelled exactly right; a fuzzy match never overrides them.
+    exact = {str(k) for k in entities} & vocab_set
+    out: dict[str, Any] = {}
+    for key, value in entities.items():
+        key_str = str(key)
+        if key_str in exact:
+            out[key_str] = value
+            continue
+        match = difflib.get_close_matches(
+            key_str, vocab, n=1, cutoff=_VOCAB_FUZZY_CUTOFF,
+        )
+        # Several fuzzy keys landing on one vocab entry: the later one wins.
+        if match and match[0] not in exact:
+            out[match[0]] = value
+    return out
+
+
+def _format_seed_block(seed_entities: dict[str, Any] | None) -> str:
+    """Build the prompt section listing entities fixed in earlier turns.
+
+    In a multi-turn flow the caller passes back, as ``seed_entities``, the
+    pairs that were settled before (for example a value the user picked
+    from a popup). The section tells the LLM to keep those values unless
+    the new requirement overrides one, and to extract only additional
+    entities. Each value is rendered as JSON with non-ASCII characters
+    left unescaped. Returns ``""`` when there is nothing to carry over.
+    """
+    if not seed_entities:
+        return ""
+    header = (
+        "\n\nExisting entities (carried over from prior turns — keep these "
+        "values exactly unless the user's new requirement explicitly "
+        "overrides one. You only need to extract additional entities that "
+        "the new requirement introduces):\n"
+    )
+    rendered: list[str] = []
+    for name, val in seed_entities.items():
+        encoded = json.dumps(val, ensure_ascii=False)
+        rendered.append(f'  - {name}: {encoded}')
+    return header + "\n".join(rendered)
+
+
+def _format_enum_block(enum_mappings: dict[str, dict[str, str]] | None) -> str:
+    """Build the prompt section listing operator-registered enum codes.
+
+    ``enum_mappings`` maps a field name to its ``{code: label}`` table.
+    Each field becomes a header line followed by one ``"code" → label``
+    line per entry, so the LLM can pick the code whose label fits the
+    user's wording. Fields whose table is empty or not a dict are
+    skipped. Returns ``""`` when nothing is left to render.
+    """
+    if not enum_mappings:
+        return ""
+    rendered: list[str] = []
+    for field_name, code_map in enum_mappings.items():
+        # Skip entries that are not a non-empty ``{code: label}`` dict.
+        if isinstance(code_map, dict) and code_map:
+            rendered.append(f"  - {field_name}:")
+            rendered.extend(
+                f'      "{code}" → {label}' for code, label in code_map.items()
+            )
+    if not rendered:
+        return ""
+    intro = (
+        "\n\nEnum code mappings (operator-registered — when one of these "
+        "fields needs a value, pick the CODE whose label matches the "
+        "user's intent):\n"
+    )
+    return intro + "\n".join(rendered)
+
+
+def _format_vocabulary_block(tags: list[str]) -> str:
+    """Build the prompt section listing fallback entity field names.
+
+    Each entry of ``tags`` is rendered as one bullet under a heading that
+    marks the list as a backup vocabulary, for use only when no candidate
+    tool's "needs:" line fits the user's value. Returns ``""`` when
+    ``tags`` is empty so the section is left out of the prompt.
+    """
+    if not tags:
+        return ""
+    heading = (
+        "\n\nAvailable entity field names — backup vocabulary used only when "
+        "no candidate tool's \"needs:\" line carries the user's value:\n"
+    )
+    bullets = [f"  - {tag}" for tag in tags]
+    return heading + "\n".join(bullets)
+
+
 def _format_catalog(entries: list[ToolCatalogEntry]) -> str:
     lines: list[str] = []
     for i, e in enumerate(entries, start=1):
@@ -115,19 +253,46 @@ def parse_intent(
     requirement: str,
     catalog: list[ToolCatalogEntry],
     llm: OntologyLLM,
+    *,
+    vocabulary: list[str] | None = None,
+    enum_mappings: dict[str, dict[str, str]] | None = None,
+    seed_entities: dict[str, Any] | None = None,
 ) -> ParsedIntent:
     """Call the LLM once to produce a ParsedIntent.
 
     ``catalog`` should be the retrieval-shortlisted candidate tools (keep
-    small — ~10 entries — to control prompt size). ``llm`` is any
-    OntologyLLM-compatible provider.
+    small — ~10 entries — to control prompt size). ``vocabulary`` is the
+    full set of ``kind=data`` semantic ids in the graph (so the LLM can
+    map free-text inputs to a search-style key even when the matching
+    producer wasn't retrieved). ``enum_mappings`` is operator-registered
+    ``{field_name: {code: label}}`` lookups for backend enum fields whose
+    values aren't in the swagger schema — exposed only when relevant
+    (caller should pre-filter to the catalog's consumes fields).
+    ``seed_entities`` carries entities decided in earlier turns of a
+    multi-turn flow (e.g. user clicked an option in a popup); the LLM
+    keeps them and only extracts additional ones from the new
+    ``requirement``. ``llm`` is any OntologyLLM-compatible provider.
     """
     if not catalog:
         raise IntentParseError("empty catalog — cannot pick a target")
 
+    vocab = vocabulary or []
+    if not vocab:
+        # Fallback: derive from catalog. Same-domain narrowing only —
+        # callers that supply the full graph vocab get better accuracy.
+        seen: set[str] = set()
+        for e in catalog:
+            for tag in e.consumes_tags:
+                if tag and tag not in seen:
+                    seen.add(tag)
+                    vocab.append(tag)
+
     prompt = _INTENT_PROMPT.format(
         requirement=requirement.strip(),
         catalog=_format_catalog(catalog),
+        vocabulary_block=_format_vocabulary_block(vocab),
+        enum_block=_format_enum_block(enum_mappings),
+        seed_block=_format_seed_block(seed_entities),
     )
     raw = llm.generate(prompt)
 
@@ -153,6 +318,21 @@ def parse_intent(
     entities_raw = parsed.get("entities")
     entities = entities_raw if isinstance(entities_raw, dict) else {}
 
+    # Validate entity keys against the vocabulary. The LLM regularly emits
+    # a slightly-elaborated key ("search_keyword_name" instead of
+    # "search_keyword") that nothing downstream can match — coerce the
+    # close ones, drop the rest. A wrong key triggers worse downstream
+    # behavior than no key.
+    if vocab and entities:
+        entities = _coerce_entity_keys(entities, vocab)
+
+    # Multi-turn safety net: even if the LLM ignored the carry-forward
+    # instructions, prior-turn entities must persist. New entities from
+    # this turn override on conflict (later turn wins for explicit
+    # contradictions in the requirement).
+    if seed_entities:
+        entities = {**seed_entities, **entities}
+
     try:
         confidence = float(parsed.get("confidence") or 0.0)
     except (TypeError, ValueError):

From 9b14a7a1aa7f50ec4e6d56cf66ee81a71f22e94a Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Tue, 28 Apr 2026 18:22:55 +0900
Subject: [PATCH 07/14] feat(plan/runner): wrapper-agnostic envelope unwrap +
 response_root_keys hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PlanRunner 가 step 응답을 context 에 저장하기 전에 한 단계 envelope 을
peel 하는 휴리스틱 추가. 흔한 backend 패턴 ``{code, message, payload:
{...}, timestamp}`` 에서 swagger 가 envelope 을 안 적었을 때, downstream
binding ``${s1.searchDataList[*].goodsNo}`` 가 ``payload`` 안의 데이터로
자연스럽게 풀리게 한다.

조건 (5가지 모두 충족 시에만 unwrap):
  1. response 가 dict, root key 2개 이상
  2. 정확히 1개의 dict-typed root value (wrapper 후보)
  3. 나머지 root value 모두 scalar / null
  4. expected_root_keys (= produces[].json_path 의 first segment) 가
     response root 에 하나도 없음
  5. wrapper 안에 expected_root_keys 중 하나라도 존재

조건이 strict 해서 false unwrap 은 거의 없음. wrapper 이름은 안 봄
(``payload``/``data``/``result`` 모두 동일하게 동작 — backend fit 아님).

PlanStep 에 ``response_root_keys: list[str]`` 필드 추가 — 합성 시점에
synthesizer 가 채워두면 runtime 에 unwrap detect 가 schema 비교
가능. 채워지지 않으면 unwrap skip (안전 default).
---
 graph_tool_call/plan/runner.py | 57 ++++++++++++++++++++++++++++++++++
 graph_tool_call/plan/schema.py |  6 ++++
 2 files changed, 63 insertions(+)

diff --git a/graph_tool_call/plan/runner.py b/graph_tool_call/plan/runner.py
index 8b9fa27..141d500 100644
--- a/graph_tool_call/plan/runner.py
+++ b/graph_tool_call/plan/runner.py
@@ -220,6 +220,13 @@ def run_stream(
                 )
                 return
 
+            # 2a. Unwrap a single-level envelope when the response shape
+            # diverges from the schema in the canonical "{code, message,
+            # <wrapper>: {...}, timestamp}" pattern. One detect per step,
+            # not per binding — every binding for this step then resolves
+            # against the unwrapped dict naturally.
+            output = _maybe_unwrap_envelope(output, step.response_root_keys)
+
             step_trace.output = output
             step_trace.duration_ms = _ms_since(step_start)
             trace_steps.append(step_trace)
@@ -333,6 +340,56 @@ def _preview(value: Any, limit: int) -> Any:
     return value
 
 
+def _maybe_unwrap_envelope(
+    output: Any,
+    expected_root_keys: list[str],
+) -> Any:
+    """Return the inner payload when ``output`` looks like a thin envelope.
+
+    At most one layer is peeled, and only if every check below passes:
+
+      1. ``expected_root_keys`` is non-empty and ``output`` is a dict with
+         at least two root keys.
+      2. Exactly one root value is a dict; it is the wrapper candidate.
+      3. No other root value is a list, i.e. the siblings look like
+         status / code / message / timestamp scalars rather than
+         business collections.
+      4. No entry of ``expected_root_keys`` is a root key of ``output``;
+         otherwise the response already has the schema's shape.
+      5. At least one entry of ``expected_root_keys`` is a key of the
+         wrapper candidate; otherwise the nested dict is unrelated data
+         and unwrapping would drop the sibling fields.
+
+    In every other case ``output`` is returned unchanged. The name of
+    the wrapper key is never inspected, so ``payload`` / ``data`` /
+    ``result`` or any other convention is treated the same way. The
+    returned wrapper is the same dict object, not a copy.
+    """
+    if not expected_root_keys:
+        return output
+    if not isinstance(output, dict) or len(output) < 2:
+        return output
+
+    nested = [key for key, val in output.items() if isinstance(val, dict)]
+    if len(nested) != 1:
+        return output
+    (candidate_key,) = nested
+
+    # Exactly one root value is a dict, so only a list-typed sibling can
+    # still disqualify the envelope shape here.
+    if any(isinstance(val, list) for val in output.values()):
+        return output
+
+    wanted = set(expected_root_keys)
+    if not wanted.isdisjoint(output):
+        return output
+
+    inner = output[candidate_key]
+    if wanted.isdisjoint(inner):
+        return output
+    return inner
+
+
 def _output_size(value: Any) -> int:
     """Approximate serialized byte size (for observability)."""
     import json as _json
diff --git a/graph_tool_call/plan/schema.py b/graph_tool_call/plan/schema.py
index 8a18577..b07530f 100644
--- a/graph_tool_call/plan/schema.py
+++ b/graph_tool_call/plan/schema.py
@@ -31,6 +31,12 @@ class PlanStep:
     rationale: str = ""                        # why this step exists (for audit)
     timeout_ms: int | None = None
     retryable: bool = False                    # reserved for v1.1 retry policy
+    # Top-level keys the synthesizer expects in this tool's response,
+    # derived from ``produces[].json_path``. Used by PlanRunner to detect
+    # envelope wrappers (e.g. ``{code, message, payload: {...}}``) when the
+    # ingest captured the wrapped fields without the wrapper itself. Empty
+    # list means "no hint" — the runner then leaves the response untouched.
+    response_root_keys: list[str] = field(default_factory=list)
 
 
 @dataclass

From 2e679137ebb7e62d93bdc889058f0179a93d1bc9 Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Tue, 28 Apr 2026 18:23:29 +0900
Subject: [PATCH 08/14] feat(plan/synthesizer): context_defaults, chain
 eligibility, dynamic-option, enum popup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PathSynthesizer 가 ``produces`` 매칭만으로 chain 을 결정하던 흐름에
의미 신호를 단계적으로 도입. 핵심 변화 4가지:

  * ``context_defaults`` 인자 — collection-level ambient values
    (locale/site/tenant) 운영자가 한 번 등록 → ``kind=context`` 필드
    자동 채움. swagger 가 optional 로 표시했지만 backend 가 사실상
    필수로 요구하는 환경값을 cover.

  * Chain eligibility filter — producer 가 ``canonical_action ∈
    {search, read}`` 이고 ``primary_resource`` 가 target 의 도메인
    (target.primary_resource + consumes semantic prefix) 안일 때만
    chain. ``produces`` 매칭으로 무관 도구 (예: claim_cost calculator)
    가 끌려오는 false positive 방지. ai_metadata 부분 enriched / 미
    enriched 케이스는 fallback 통과해 graph quality 가 sparse 한
    collection 도 회귀 없이 동작.

  * Dynamic option detection — required-data field 의 producer 가
    single-hop 으로 호출 가능 (모든 input 이 entity / context_defaults
    로 채워짐) + canonical_action='read' + json_path 가 array 면
    ``DynamicOptionRequired`` (UnsatisfiableFieldError 의 subclass) 를 던져
    호출자가 producer 를 부분 실행해 옵션 list 를 사용자한테 popup.
    chain 으로 임의로 [0] 인덱스 박는 패턴을 막고 사용자 의도 우선.

  * ``enum_field_names`` — 운영자가 enum 매핑 등록한 field 는 chain
    안 만들고 즉시 ``UnsatisfiableFieldError`` raise (popup 으로 풀어야).
    enum 코드 같은 환경 값을 chain 으로 풀 때 끌려오는 무관 도구
    (응답에 같은 코드명 우연히 포함된 도구) 차단.

부수: ``response_root_keys`` 자동 채움 (PlanRunner 의 envelope unwrap
힌트) + producer ranking 정책 docstring 정리.
---
 graph_tool_call/plan/__init__.py    |   2 +
 graph_tool_call/plan/synthesizer.py | 451 +++++++++++++++++++++++++++-
 2 files changed, 438 insertions(+), 15 deletions(-)

diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py
index d9e5e1e..8f2d9eb 100644
--- a/graph_tool_call/plan/__init__.py
+++ b/graph_tool_call/plan/__init__.py
@@ -56,6 +56,7 @@ def call_tool(tool_name, args):
     UnsatisfiableFieldError,
     CyclicDependencyError,
     MaxDepthExceededError,
+    DynamicOptionRequired,
 )
 
 __all__ = [
@@ -82,6 +83,7 @@ def call_tool(tool_name, args):
     "UnsatisfiableFieldError",
     "CyclicDependencyError",
     "MaxDepthExceededError",
+    "DynamicOptionRequired",
     # intent
     "ToolCatalogEntry",
     "ParsedIntent",
diff --git a/graph_tool_call/plan/synthesizer.py b/graph_tool_call/plan/synthesizer.py
index 33be4dd..44696cf 100644
--- a/graph_tool_call/plan/synthesizer.py
+++ b/graph_tool_call/plan/synthesizer.py
@@ -9,8 +9,6 @@
 
 v1 scope (per design §16.6):
   - Linear chain only — no fan-out, no parallel, no branching.
-  - If multiple producers exist for a required field, the first one is
-    picked (simple, predictable). Ambiguity handling is Phase D+.
   - Max recursion depth = 5 (guard against cyclic or pathological graphs).
 
 Matching order for each required consume field:
@@ -19,6 +17,19 @@
      (Pass 2 LLM enrichment quality).
   3. Another tool's ``produces`` with the same ``field_name``
      (Pass 1 deterministic extraction, fallback).
+
+Producer selection is ranked by Pass 2 metadata signals — no hardcoded
+domain or field rules:
+  - Entity affinity: producer consumes an entity the user supplied,
+    so chaining through it actually uses that entity.
+  - Pair hint: target's ``pairs_well_with`` includes this producer.
+  - Action preference: ``canonical_action`` = search/read fits a
+    prerequisite role better than create/update/delete.
+
+``consumes[].kind`` ("data" | "context", set by Pass 2):
+  - "data" — chain to a producer if entity doesn't match.
+  - "context" — ambient config (locale, site, tenant). Never chained;
+    taken from entity, then ``context_defaults``, else skipped (API default).
 """
 
 from __future__ import annotations
@@ -47,6 +58,38 @@ class MaxDepthExceededError(PlanSynthesisError):
     """Recursion depth exceeded — likely a misshapen graph."""
 
 
+class DynamicOptionRequired(UnsatisfiableFieldError):
+    """Signals that a required data field should be chosen by the user.
+
+    Raised when the field's producer is a single-hop call that can run
+    right away from the user's entities plus context_defaults. Instead of
+    chaining that producer, the caller is expected to call it, show the
+    returned options in a popup, and let the user pick one.
+
+    Attributes carried for the caller: ``field_name`` and ``semantic_tag``
+    of the unresolved field, ``producer_name`` to call, ``options_path``
+    locating the option array in its response, and ``label_field_hints``
+    naming candidate label fields next to each code.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        field_name: str,
+        semantic_tag: str,
+        producer_name: str,
+        options_path: str,
+        label_field_hints: list[str],
+    ) -> None:
+        super().__init__(message)
+        # Identity of the unresolved consume field.
+        self.field_name, self.semantic_tag = field_name, semantic_tag
+        # Where the caller can fetch the options.
+        self.producer_name, self.options_path = producer_name, options_path
+        self.label_field_hints = list(label_field_hints)
+
+
 @dataclass
 class _PartialStep:
     """In-progress step being built during bottom-up synthesis."""
@@ -74,9 +117,26 @@ def __init__(
         graph: dict[str, Any],
         *,
         max_depth: int = 5,
+        context_defaults: dict[str, Any] | None = None,
+        enum_field_names: set[str] | None = None,
     ) -> None:
         self._tools: dict[str, dict[str, Any]] = dict(graph.get("tools") or {})
         self._max_depth = max_depth
+        # Collection-level ambient values (locale, tenant id, site id, ...) the
+        # operator registers once per collection. Filled into ``kind=context``
+        # consume fields when the user's entities don't supply them — avoids
+        # repeating env-style args in every requirement and avoids leaking
+        # backend-specific defaults into library code. Lookup precedence:
+        # entities > context_defaults > skip.
+        self._context_defaults: dict[str, Any] = dict(context_defaults or {})
+        # Field names the operator registered an enum mapping for. When a
+        # required-data field of this kind can't be filled by an entity,
+        # the synthesizer raises UnsatisfiableFieldError instead of
+        # producer-chaining — the caller (service layer) is expected to
+        # surface a popup to the user rather than weaving an awkward
+        # producer chain that pulls in unrelated tools just to source a
+        # code value. User intent (popup choice) wins over chain depth.
+        self._enum_field_names: set[str] = set(enum_field_names or ())
         # semantic_tag -> [tool_name], insertion order preserved
         self._producers_by_semantic: dict[str, list[str]] = {}
         self._producers_by_field: dict[str, list[str]] = {}
@@ -133,6 +193,7 @@ def synthesize(
                 tool=partial.tool,
                 args=args,
                 rationale=partial.rationale,
+                response_root_keys=self._response_root_keys(tool_name),
             ))
 
         target_step_id = steps_by_tool[target].step_id
@@ -140,7 +201,10 @@ def synthesize(
             id=str(uuid.uuid4()),
             goal=goal or f"Execute {target}",
             steps=final_steps,
-            output_binding=f"${{{target_step_id}.body}}",
+            # The PlanRunner adapter exposes the response body as the root of
+            # the step context, so ``${sN}`` alone yields the whole response dict
+            # (``${sN.body}`` dates from when the adapter passed ``{status, body}``).
+            output_binding=f"${{{target_step_id}}}",
             created_at=datetime.now(timezone.utc).isoformat(),
             metadata={
                 "target": target,
@@ -187,22 +251,55 @@ def _resolve(
         rationales: list[str] = []
 
         for consume in consumes:
-            if not consume.get("required"):
-                continue
-
             field_name = consume.get("field_name") or ""
             semantic = consume.get("semantic_tag") or ""
+            kind = str(consume.get("kind") or "data").strip().lower()
+            is_required = bool(consume.get("required"))
 
-            # 1. Entity match (user-supplied)
+            # 1. Entity match (user-supplied) — applies to both data and
+            #    context, both required and optional. The user's input
+            #    always wins.
             entity_val = self._match_entity(entities, semantic, field_name)
             if entity_val is not None:
                 args[field_name] = entity_val
                 continue
 
-            # 2/3. Find a producer (semantic first, then field_name)
+            # 2. Context-kind: try collection-level defaults regardless of
+            #    required flag. Context is never chained — ambient config
+            #    must come from entity or operator-registered default
+            #    (chaining through e.g. getSiteInfo would inflate the plan
+            #    with steps that don't produce business value).
+            if kind == "context":
+                default = self._lookup_context_default(semantic, field_name)
+                if default is not None:
+                    args[field_name] = default
+                continue
+
+            # 3. Optional data field: leave out. The caller's backend will
+            #    apply its own defaults — synthesizer has no business
+            #    inventing values for optional business inputs.
+            if not is_required:
+                continue
+
+            # 4. Enum-field popup priority. If the operator registered an
+            #    enum mapping for this field, it's the kind of value the
+            #    user should pick from a popup — NOT something to chain
+            #    through a producer (which often drags in semantically
+            #    unrelated tools just because their response happens to
+            #    contain a code by the same name). Surface
+            #    UnsatisfiableFieldError so the caller can yield a
+            #    question.required event instead.
+            if field_name in self._enum_field_names:
+                raise UnsatisfiableFieldError(
+                    f"tool {tool_name!r} requires {field_name!r} "
+                    f"(semantic={semantic!r}) — enum field, expects user "
+                    f"selection (no producer chain attempted)"
+                )
+
+            # 5. Required data field → rank candidate producers and pick the best.
             producer = self._find_producer(
                 semantic=semantic, field_name=field_name,
-                exclude=tool_name,
+                target_tool=tool_name, entities=entities,
             )
             if producer is None:
                 raise UnsatisfiableFieldError(
@@ -210,6 +307,41 @@ def _resolve(
                     f"(semantic={semantic!r}) but no entity or producer found"
                 )
 
+            # 5a. Dynamic-option popup priority. Detect "read-detail then
+            #     pick one" patterns where the producer is a single-hop
+            #     read of a product/record whose response carries a
+            #     list of options the user must choose from (e.g.
+            #     ``getProductInfo`` exposes ``$.itmInfo[*].itmNo`` —
+            #     the available SKUs). In that case, defer to the caller
+            #     to fetch options and pop up a question, instead of
+            #     chaining the producer in and binding ``[0]`` blindly.
+            #
+            #     Constrained to ``canonical_action='read'`` because
+            #     ``search`` producers (e.g. seltSearchProduct → goodsNo)
+            #     are exactly the chain idiom we DO want — pick the first
+            #     hit and continue. Without this constraint legitimate
+            #     search→detail chains turn into popups.
+            producer_action = self._producer_action(producer)
+            if (
+                producer_action == "read"
+                and self._is_producer_simple_callable(producer, entities)
+            ):
+                opt_path = self._produces_path_for(
+                    producer, semantic=semantic, field_name=field_name,
+                )
+                if opt_path and "[*]" in opt_path:
+                    raise DynamicOptionRequired(
+                        f"tool {tool_name!r} requires {field_name!r} "
+                        f"(semantic={semantic!r}) — dynamic option from "
+                        f"{producer!r}; caller should fetch options and "
+                        f"prompt the user",
+                        field_name=field_name,
+                        semantic_tag=semantic,
+                        producer_name=producer,
+                        options_path=opt_path,
+                        label_field_hints=self._label_hints_for(producer, opt_path),
+                    )
+
             # Recurse into the producer first so step_id ordering is correct
             self._resolve(
                 tool_name=producer,
@@ -254,19 +386,269 @@ def _find_producer(
         *,
         semantic: str,
         field_name: str,
-        exclude: str,
+        target_tool: str,
+        entities: dict[str, Any],
     ) -> str | None:
-        """Pick the first producer matching semantic, falling back to field name."""
+        """Pick the best-ranked producer for ``semantic`` (or ``field_name``).
+
+        Candidates are gathered from both indexes (semantic first), then
+        ranked using Pass 2 metadata (``_rank_producers``) and finally
+        filtered by ``_is_chain_eligible`` — discards producers whose
+        ``canonical_action`` / ``primary_resource`` signal they're
+        unrelated to the target's domain (e.g. claim-cost calculator
+        showing up as a producer for a basket field just because a
+        ``produces`` entry happens to match).
+        """
+        candidates: list[str] = []
+        seen: set[str] = set()
         if semantic:
             for name in self._producers_by_semantic.get(semantic, []):
-                if name != exclude:
-                    return name
+                if name != target_tool and name not in seen:
+                    candidates.append(name)
+                    seen.add(name)
         if field_name:
             for name in self._producers_by_field.get(field_name, []):
-                if name != exclude:
-                    return name
+                if name != target_tool and name not in seen:
+                    candidates.append(name)
+                    seen.add(name)
+        if not candidates:
+            return None
+
+        ranked = self._rank_producers(
+            candidates, target_tool=target_tool, entities=entities,
+        )
+        for cand in ranked:
+            if self._is_chain_eligible(cand, target_tool=target_tool):
+                return cand
         return None
 
+    def _producer_action(self, producer_name: str) -> str:
+        """Return the producer's ``ai_metadata.canonical_action`` (lowercased,
+        empty string if missing). Used to gate dynamic-option popups to
+        ``read`` producers — search producers are the chain idiom (pick
+        first hit), not popup candidates.
+        """
+        tool = self._tools.get(producer_name) or {}
+        ai = (tool.get("metadata") or {}).get("ai_metadata") or {}
+        return str(ai.get("canonical_action") or "").strip().lower()
+
+    def _is_producer_simple_callable(
+        self,
+        producer_name: str,
+        entities: dict[str, Any],
+    ) -> bool:
+        """True iff the producer can be called with only the user's entities
+        and the collection's context_defaults — i.e. no further producer
+        chain needed to source its inputs.
+
+        Used to detect "single-hop dynamic option" cases: instead of
+        chaining the producer into the plan, the caller fetches it once
+        and pops up the resulting list to the user (e.g. itmNo from
+        getProductInfo when the user already supplied goodsNo).
+        """
+        producer = self._tools.get(producer_name) or {}
+        for c in (producer.get("metadata") or {}).get("consumes") or []:
+            if not isinstance(c, dict) or not c.get("required"):
+                continue
+            field = c.get("field_name") or ""
+            sem = c.get("semantic_tag") or ""
+            kind = str(c.get("kind") or "data").strip().lower()
+            if self._match_entity(entities, sem, field) is not None:
+                continue
+            if kind == "context" and self._lookup_context_default(sem, field) is not None:
+                continue
+            return False
+        return True
+
+    def _produces_path_for(
+        self,
+        producer_name: str,
+        *,
+        semantic: str,
+        field_name: str,
+    ) -> str:
+        """Find the producer's json_path that emits the given field — the
+        location of the option array in the response (e.g.
+        ``$.itmInfo[*].itmNo``). Empty string if no match.
+        """
+        producer = self._tools.get(producer_name) or {}
+        for p in (producer.get("metadata") or {}).get("produces") or []:
+            if not isinstance(p, dict):
+                continue
+            if semantic and p.get("semantic_tag") == semantic:
+                return str(p.get("json_path") or "")
+        # Fallback: match by field_name when semantic missing/mismatched
+        for p in (producer.get("metadata") or {}).get("produces") or []:
+            if not isinstance(p, dict):
+                continue
+            if field_name and p.get("field_name") == field_name:
+                return str(p.get("json_path") or "")
+        return ""
+
+    def _label_hints_for(
+        self,
+        producer_name: str,
+        options_path: str,
+    ) -> list[str]:
+        """Return field names that look like human labels living next to
+        the option-code field in the producer's response. Heuristic: same
+        array prefix, name ending in ``Nm`` / ``Name`` / ``Label``.
+
+        ``options_path`` looks like ``$.itmInfo[*].itmNo``; we walk the
+        producer's other produces entries that share the prefix
+        ``$.itmInfo[*].`` and pick the ones whose field_name suggests a
+        label.
+        """
+        producer = self._tools.get(producer_name) or {}
+        # Compute the array prefix: everything up to the last "."
+        if "." not in options_path:
+            return []
+        prefix = options_path.rsplit(".", 1)[0] + "."
+        hints: list[str] = []
+        seen: set[str] = set()
+        for p in (producer.get("metadata") or {}).get("produces") or []:
+            if not isinstance(p, dict):
+                continue
+            jp = str(p.get("json_path") or "")
+            if not jp.startswith(prefix):
+                continue
+            field = str(p.get("field_name") or "")
+            if not field or field in seen:
+                continue
+            lower = field.lower()
+            if lower.endswith("nm") or lower.endswith("name") or lower.endswith("label"):
+                hints.append(field)
+                seen.add(field)
+        return hints
+
+    def _is_chain_eligible(self, producer_name: str, *, target_tool: str) -> bool:
+        """Return True if ``producer_name`` may be added to the prerequisite
+        chain for ``target_tool``.
+
+        Two signals from Pass 2 ``ai_metadata`` decide:
+
+          1. ``canonical_action`` ∈ {search, read}
+             create/update/delete/action are not prerequisite material —
+             they perform side effects, never just data lookup.
+          2. ``primary_resource`` is in the target's domain set
+             (target's own resource + the prefix of every consume's
+             semantic_tag, e.g. ``product_id`` ⇒ ``product``).
+
+        Either signal absent (sparse ``ai_metadata``) ⇒ pass through.
+        Operators that haven't enriched the graph yet keep the previous
+        behaviour; once enriched, the policy starts filtering. Also reverts
+        to pass-through if the target has neither a ``primary_resource`` nor
+        any consume ``semantic_tag``, because the "domain set" is then empty.
+        """
+        producer = self._tools.get(producer_name) or {}
+        p_meta = (producer.get("metadata") or {}).get("ai_metadata") or {}
+        p_action = str(p_meta.get("canonical_action") or "").strip().lower()
+        if not p_action:
+            return True
+        if p_action not in ("search", "read"):
+            return False
+
+        p_resource = str(p_meta.get("primary_resource") or "").strip().lower()
+        if not p_resource:
+            return True
+
+        target = self._tools.get(target_tool) or {}
+        t_meta_full = target.get("metadata") or {}
+        t_meta = t_meta_full.get("ai_metadata") or {}
+        t_resource = str(t_meta.get("primary_resource") or "").strip().lower()
+
+        related: set[str] = set()
+        if t_resource:
+            related.add(t_resource)
+            if "_" in t_resource:
+                related.add(t_resource.split("_", 1)[0])
+
+        for c in (t_meta_full.get("consumes") or []):
+            if not isinstance(c, dict):
+                continue
+            sem = str(c.get("semantic_tag") or "").strip().lower()
+            if not sem:
+                continue
+            related.add(sem.split("_", 1)[0] if "_" in sem else sem)
+
+        if not related:
+            return True
+
+        p_prefix = p_resource.split("_", 1)[0] if "_" in p_resource else p_resource
+        return p_resource in related or p_prefix in related
+
+    def _rank_producers(
+        self,
+        candidates: list[str],
+        *,
+        target_tool: str,
+        entities: dict[str, Any],
+    ) -> list[str]:
+        """Rank candidates by Pass 2 metadata signals.
+
+        Order:
+          1. Entity affinity — producer consumes a field the user already
+             supplied (so the chain actually uses user input).
+          2. Pair hint — target's ``pairs_well_with`` names this producer.
+          3. Action preference — ``search`` > ``read`` > ``action`` >
+             anything else, as a prerequisite role.
+        Ties fall back to insertion order (stable sort).
+
+        No hardcoded names / regexes. Every signal is a per-tool Pass 2
+        field the LLM filled at ingest time.
+        """
+        target_meta = (self._tools.get(target_tool) or {}).get("metadata") or {}
+        target_ai = target_meta.get("ai_metadata") or {}
+        pair_names = {
+            str(p.get("tool") or "").strip()
+            for p in (target_ai.get("pairs_well_with") or [])
+            if isinstance(p, dict)
+        }
+        pair_names.discard("")
+        entity_keys = {str(k) for k in (entities or {}).keys()}
+
+        action_score = {"search": 3, "read": 2, "action": 1}
+
+        def _score(name: str) -> tuple[int, int, int]:
+            tool = self._tools.get(name) or {}
+            meta = tool.get("metadata") or {}
+            ai = meta.get("ai_metadata") or {}
+
+            affinity = 0
+            for c in (meta.get("consumes") or []):
+                tag = c.get("semantic_tag") or ""
+                fname = c.get("field_name") or ""
+                if (tag and tag in entity_keys) or (fname and fname in entity_keys):
+                    affinity += 1
+
+            pair_bonus = 1 if name in pair_names else 0
+            action = str(ai.get("canonical_action") or "").strip().lower()
+            return (affinity, pair_bonus, action_score.get(action, 0))
+
+        # Python's sort is stable; higher score wins, ties keep insertion order.
+        return sorted(candidates, key=_score, reverse=True)
+
+    def _response_root_keys(self, tool_name: str) -> list[str]:
+        """Top-level keys of the tool's response, taken from ``produces``.
+
+        Each ``produces[].json_path`` (e.g. ``$.searchDataList[*].goodsNo``)
+        contributes its first dotted segment (``searchDataList``). Used by
+        PlanRunner as a schema hint for envelope detection — when the
+        actual response is missing every hint at root but a single nested
+        dict contains them, the wrapper is peeled away.
+        """
+        tool = self._tools.get(tool_name) or {}
+        produces = (tool.get("metadata") or {}).get("produces") or []
+        out: list[str] = []
+        seen: set[str] = set()
+        for p in produces:
+            raw = p.get("json_path") or ""
+            head = _jsonpath_head(raw)
+            if head and head not in seen:
+                out.append(head)
+                seen.add(head)
+        return out
+
     def _producer_jsonpath(
         self,
         producer: str,
@@ -298,6 +680,25 @@ def _producer_jsonpath(
         raw = match.get("json_path") or ""
         return _normalize_jsonpath_for_binding(raw)
 
+    def _lookup_context_default(
+        self,
+        semantic: str,
+        field_name: str,
+    ) -> Any | None:
+        """Pick a registered context default for a consume field.
+
+        Mirrors ``_match_entity`` lookup order — semantic tag first (Pass 2
+        canonical id), field name second (Pass 1 raw). Returns ``None`` if
+        the operator hasn't registered a value for either key.
+        """
+        if not self._context_defaults:
+            return None
+        if semantic and semantic in self._context_defaults:
+            return self._context_defaults[semantic]
+        if field_name and field_name in self._context_defaults:
+            return self._context_defaults[field_name]
+        return None
+
     def _match_entity(
         self,
         entities: dict[str, Any],
@@ -336,6 +737,25 @@ def _rewrite_tool_refs(
         return value
 
 
+def _jsonpath_head(raw: str) -> str:
+    """First dotted segment of a JSONPath, stripping ``$``, ``.`` and ``[…]``.
+
+    ``$.payload.searchDataList[*].goodsNo`` → ``"payload"``.
+    ``$.totalCount`` → ``"totalCount"``.
+    Returns ``""`` for empty / unparseable input.
+    """
+    if not raw:
+        return ""
+    path = raw[1:] if raw.startswith("$") else raw
+    if path.startswith("."):
+        path = path[1:]
+    # Cut at the first separator (``.`` or ``[``).
+    for i, ch in enumerate(path):
+        if ch in ".[":
+            return path[:i]
+    return path
+
+
 def _normalize_jsonpath_for_binding(raw: str) -> str:
     """``$.body.goods[*].goodsNo`` → ``body.goods[0].goodsNo``.
 
@@ -357,4 +777,5 @@ def _normalize_jsonpath_for_binding(raw: str) -> str:
     "UnsatisfiableFieldError",
     "CyclicDependencyError",
     "MaxDepthExceededError",
+    "DynamicOptionRequired",
 ]

From 285baa0958b47c2651555510a2f8d37e870f4234 Mon Sep 17 00:00:00 2001
From: daehee <eet43@plateer.com>
Date: Wed, 29 Apr 2026 21:37:24 +0900
Subject: [PATCH 09/14] feat(graphify): build-time-baked confidence labels +
 zero-vector retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a graphify-mode pipeline alongside the existing retrieval engine — the
graph itself carries the relationship signal, so search and plan synthesis
need no embeddings and no per-query LLM calls. All decisions are made at
ingest time and persisted as edge attributes.

Library
- Confidence enum (EXTRACTED / INFERRED / AMBIGUOUS); add_relation now
  accepts confidence/conf_score/layer/evidence as optional kwargs (legacy
  callers unchanged).
- New graph_tool_call.graphify subpackage:
  * ingest_openapi_graphify(schemas, raw_spec=None) buckets each
    DetectedRelation by layer + score and persists confidence per edge.
  * preserve_refs_for_detection rescues layer-1 shared-schema signal that
    ingest_openapi inlined (Spring/SpringDoc specs depend on this).
  * _apply_pair_hints derives graphify edges from each tool's
    ai_metadata.pairs_well_with (single source-of-truth: ai_metadata; edges
    re-derived on every rebuild).
  * retrieve_graphify: BM25-seeded confidence-weighted BFS, intent-aware
    relation weighting, render_subgraph_text packages results into a
    NODE/EDGE text block bounded by token_budget.
- ingest/openapi.py:
  * Content-type fallback (application/*+json, */*, first available) for
    response/request body schemas — needed for SpringDoc-emitted */* APIs.
  * Wrapper-object/array query parameters (Spring @ModelAttribute) are
    expanded into their inner properties or dropped when those properties
    are already exposed as siblings.
- plan/synthesizer.py:
  * _find_producer redesigned around combined graph + schema signals
    (semantic_exact 100, graph_EXTRACTED 50, field_exact 40, ...) — graph
    edges are first-class, not a fallback chain.
  * Echo-back filter excludes producers that merely relay an input field.
  * Loose field-name matching (case + separator folded) for
    cross-naming-convention coverage.
  * Cycle policy A: visiting set passed to _find_producer so cycle-prone
    candidates are skipped and the chain reroutes around them.
  * F2 + Cycle policy B: unmet required fields surface as
    ${user_input.<field>} placeholders instead of raising; recursion
    failures (MaxDepth/Cyclic) on a producer fall through to the same
    placeholder so plan synthesis never aborts midway.
  * Plan.metadata.user_input_slots collects every placeholder for the
    runner / UI to prompt with.

Tests passing across touched modules: ingest_openapi, dependency,
retrieval, plan, graph_engine.
---
 graph_tool_call/graphify/__init__.py     |  38 ++
 graph_tool_call/graphify/ingest.py       | 434 ++++++++++++++++++++
 graph_tool_call/graphify/retrieval.py    | 478 +++++++++++++++++++++++
 graph_tool_call/ingest/openapi.py        | 146 ++++++-
 graph_tool_call/ontology/builder.py      |  31 +-
 graph_tool_call/ontology/llm_provider.py | 227 +++++++++--
 graph_tool_call/ontology/schema.py       |  20 +
 graph_tool_call/plan/synthesizer.py      | 350 +++++++++++++++--
 graph_tool_call/tool_graph.py            |  24 +-
 scripts/__init__.py                      |   5 +
 10 files changed, 1664 insertions(+), 89 deletions(-)
 create mode 100644 graph_tool_call/graphify/__init__.py
 create mode 100644 graph_tool_call/graphify/ingest.py
 create mode 100644 graph_tool_call/graphify/retrieval.py
 create mode 100644 scripts/__init__.py

diff --git a/graph_tool_call/graphify/__init__.py b/graph_tool_call/graphify/__init__.py
new file mode 100644
index 0000000..6785ee3
--- /dev/null
+++ b/graph_tool_call/graphify/__init__.py
@@ -0,0 +1,38 @@
+"""graphify-mode: deterministic edge extraction + zero-vector retrieval.
+
+Inspired by the graphify project (https://github.com/safishamsi/graphify).
+The core idea: every edge carries a Confidence label, retrieval is a
+keyword-seeded BFS over confidence-weighted edges, and the result is a
+token-budgeted text rendering of the matched subgraph — no embeddings,
+no wRRF fusion, no MMR reranking.
+
+Public API:
+  - ingest_openapi_graphify(schemas) -> (ToolGraph, edge_stats)
+  - retrieve_graphify(tg, query, ...) -> {results, subgraph_text, intent, stats}
+  - render_subgraph_text(tg, nodes, edges, budget) -> str
+"""
+
+from graph_tool_call.graphify.ingest import (
+    DEFAULT_CONF_AMBIGUOUS,
+    DEFAULT_CONF_EXTRACTED,
+    DEFAULT_CONF_INFERRED,
+    _apply_pair_hints,
+    bucket_confidence,
+    ingest_openapi_graphify,
+    preserve_refs_for_detection,
+)
+from graph_tool_call.graphify.retrieval import (
+    render_subgraph_text,
+    retrieve_graphify,
+)
+
+__all__ = [
+    "DEFAULT_CONF_AMBIGUOUS",
+    "DEFAULT_CONF_EXTRACTED",
+    "DEFAULT_CONF_INFERRED",
+    "bucket_confidence",
+    "ingest_openapi_graphify",
+    "preserve_refs_for_detection",
+    "render_subgraph_text",
+    "retrieve_graphify",
+]
diff --git a/graph_tool_call/graphify/ingest.py b/graph_tool_call/graphify/ingest.py
new file mode 100644
index 0000000..48bc8d5
--- /dev/null
+++ b/graph_tool_call/graphify/ingest.py
@@ -0,0 +1,434 @@
+"""Deterministic ingest: ToolSchema list -> ToolGraph with confidence labels.
+
+Pipeline (no LLM, no embeddings):
+  1. ``detect_dependencies`` runs all four layers (path-hierarchy, CRUD,
+     shared $ref, name/RPC/cross-resource) at threshold 0.0.
+  2. Each ``DetectedRelation`` is bucketed by (layer, conf_score) into one of
+     EXTRACTED / INFERRED / AMBIGUOUS / dropped.
+  3. Edges are added to a fresh ``ToolGraph`` with the bucket as ``confidence``
+     attr, plus ``conf_score`` / ``layer`` / ``evidence`` for transparency.
+  4. ``edge_stats`` summarises bucket counts, per-relation counts, and the
+     count of cross-source edges (different ``source_label`` on each end —
+     the key signal that adding a new source linked into the existing graph).
+
+For specs that use a lot of $ref pointers (typical of Swagger/OpenAPI 3.x
+generators like SpringDoc), pass the raw spec dict to
+``preserve_refs_for_detection`` BEFORE calling ``ingest_openapi_graphify`` so
+``detect_dependencies._detect_shared_schemas`` can fire — without this step
+the library's ``ingest_openapi`` resolves refs inline and the shared-schema
+signal is lost. ``ingest_openapi_graphify`` accepts the raw spec directly via
+``raw_spec=`` and runs preservation automatically.
+
+This is the ONLY ingest path used by xgen-workflow. The legacy 14-stage
+``RetrievalEngine`` plumbing in graph_tool_call.retrieval is left intact
+for benchmark/example users but is not invoked from this module.
+"""
+
+from __future__ import annotations
+
+from collections import Counter
+from typing import Any
+
+from graph_tool_call.analyze.dependency import (
+    DetectedRelation,
+    detect_dependencies,
+)
+from graph_tool_call.core.tool import ToolSchema
+from graph_tool_call.ontology.schema import Confidence, RelationType
+from graph_tool_call.tool_graph import ToolGraph
+
+# Thresholds — same numbers graphify uses for INFERRED vs AMBIGUOUS.
+# EXTRACTED additionally requires layer == 1 (deterministic structural).
+DEFAULT_CONF_EXTRACTED = 0.85
+DEFAULT_CONF_INFERRED = 0.85
+DEFAULT_CONF_AMBIGUOUS = 0.70
+
+
+def bucket_confidence(
+    layer: int,
+    conf_score: float,
+    *,
+    extracted_min: float = DEFAULT_CONF_EXTRACTED,
+    inferred_min: float = DEFAULT_CONF_INFERRED,
+    ambiguous_min: float = DEFAULT_CONF_AMBIGUOUS,
+) -> Confidence | None:
+    """Bucket a (layer, conf_score) pair into a Confidence label.
+
+    layer == 1 (path/CRUD/$ref) AND conf >= extracted_min  -> EXTRACTED
+    conf >= inferred_min                                   -> INFERRED
+    ambiguous_min <= conf < inferred_min                   -> AMBIGUOUS
+    else                                                   -> None  (dropped)
+    """
+    if conf_score >= extracted_min and layer == 1:
+        return Confidence.EXTRACTED
+    if conf_score >= inferred_min:
+        return Confidence.INFERRED
+    if conf_score >= ambiguous_min:
+        return Confidence.AMBIGUOUS
+    return None
+
+
+# ---------------------------------------------------------------------------
+# $ref preservation
+#
+# Library ``ingest_openapi`` calls ``_resolve_refs`` which inlines every
+# ``$ref`` pointer into its target schema. That makes life easier for runtime
+# users (they get full schemas, no traversal needed) but it ERASES the signal
+# ``_detect_shared_schemas`` relies on — that detector walks metadata looking
+# for literal ``$ref`` strings to spot tools sharing a DTO.
+#
+# This helper rescans the raw spec, captures refs per operation BEFORE they're
+# resolved, applies a frequency filter (drop common wrappers + singletons),
+# and re-injects them as ``__refs__`` markers into each tool's metadata so
+# ``_collect_refs`` finds them. Identical algorithm to xgen-workflow's
+# ``swagger_tool_generator._collect_operation_refs``.
+# ---------------------------------------------------------------------------
+
+_HTTP_METHODS = ("get", "post", "put", "patch", "delete", "head", "options")
+
+
+def _scan_refs(obj: Any) -> set[str]:
+    """Recursively collect ``$ref`` pointer strings from a schema fragment."""
+    refs: set[str] = set()
+    if isinstance(obj, dict):
+        for k, v in obj.items():
+            if k == "$ref" and isinstance(v, str):
+                refs.add(v)
+            else:
+                refs.update(_scan_refs(v))
+    elif isinstance(obj, list):
+        for item in obj:
+            refs.update(_scan_refs(item))
+    return refs
+
+
+def preserve_refs_for_detection(
+    tools: list[ToolSchema],
+    raw_spec: dict[str, Any],
+    *,
+    min_freq: int = 2,
+    max_freq_ratio: float = 0.3,
+) -> int:
+    """Inject ``__refs__`` markers into tool metadata so shared-schema detection fires.
+
+    Walk ``raw_spec`` BEFORE $ref resolution, find $refs per operation, filter to the
+    "domain DTO" sweet spot (>=min_freq ops, <=max_freq_ratio of ref-carrying ops),
+    and re-inject them into each tool's ``metadata.response_schema.__refs__`` and
+    ``metadata.request_body_refs``.
+
+    Why filter:
+      - Common wrappers like ``ApiResponse`` show up in nearly every operation;
+        leaving them in produces a fully-connected COMPLEMENTARY graph (noise).
+      - Singletons show up once and can't form edges anyway.
+
+    Returns the number of tools whose metadata was updated. Mutates ``tools``
+    in place.
+    """
+    paths = (raw_spec.get("paths") or {})
+    if not isinstance(paths, dict):
+        return 0
+
+    raw_per_op: dict[tuple[str, str], tuple[set[str], set[str]]] = {}
+    freq: Counter[str] = Counter()
+
+    for path, item in paths.items():
+        if not isinstance(item, dict):
+            continue
+        for method in _HTTP_METHODS:
+            op = item.get(method)
+            if not isinstance(op, dict):
+                continue
+            req = _scan_refs(op.get("requestBody")) | _scan_refs(op.get("parameters"))
+            resp = _scan_refs(op.get("responses"))
+            if not (req or resp):
+                continue
+            raw_per_op[(method, path)] = (req, resp)
+            for r in req | resp:
+                freq[r] += 1
+
+    if not raw_per_op:
+        return 0
+
+    total_ops = len(raw_per_op)
+    ceiling = max(min_freq, int(total_ops * max_freq_ratio))
+
+    def _useful(r: str) -> bool:
+        return min_freq <= freq[r] <= ceiling
+
+    op_refs: dict[tuple[str, str], tuple[list[str], list[str]]] = {}
+    for k, (req, resp) in raw_per_op.items():
+        rq = sorted(r for r in req if _useful(r))
+        rp = sorted(r for r in resp if _useful(r))
+        if rq or rp:
+            op_refs[k] = (rq, rp)
+
+    updated = 0
+    for tool in tools:
+        md = tool.metadata or {}
+        method = str(md.get("method") or "").lower()
+        path = str(md.get("path") or "")
+        refs = op_refs.get((method, path))
+        if not refs:
+            continue
+        rq, rp = refs
+        if rp:
+            rs = md.get("response_schema") or {}
+            if isinstance(rs, dict):
+                rs = dict(rs)
+                rs["__refs__"] = [{"$ref": r} for r in rp]
+                md["response_schema"] = rs
+        if rq:
+            md["request_body_refs"] = [{"$ref": r} for r in rq]
+        tool.metadata = md
+        updated += 1
+
+    return updated
+
+
+# ---------------------------------------------------------------------------
+# ai_metadata.pairs_well_with → graphify edge derivation
+#
+# ``ai_metadata`` is the source-of-truth (LLM Pass 2 fills it; the operator
+# can hand-edit it via ToolGraphView). On every rebuild we derive the
+# corresponding workflow edges into the graphify graph so ``_find_producer``
+# can score them as a first-class signal — no separate lookup, no two-system
+# sync drift. The frontend keeps reading ``ai_metadata.pairs_well_with``
+# directly (single read path, no UI churn).
+#
+# Confidence mapping reflects the trust we place in each source:
+#   PairHint.source == "manual" → EXTRACTED  (operator deliberately curated)
+#   PairHint.source == "auto"   → INFERRED   (LLM Pass 2 high-confidence)
+#   anything else / missing     → INFERRED   (legacy entries default safe)
+#
+# Layer is set to 2 because pair hints are not structural (path/$ref/CRUD)
+# even when curated — they encode workflow semantics, which sits one level
+# above structural inference in the graphify confidence model.
+# ---------------------------------------------------------------------------
+
+
+def _apply_pair_hints(
+    tg: ToolGraph,
+    schemas: list[ToolSchema],
+) -> dict[str, int]:
+    """Convert ``metadata.ai_metadata.pairs_well_with`` into graphify edges.
+
+    Skips pairs whose target tool isn't in the current graph (cross-source
+    enrichment can list pairs that haven't been ingested yet) and self-pairs.
+    Skips when the same (src, tgt) pair already carries a structural relation
+    from ``detect_dependencies`` UNLESS the new pair is operator-curated
+    (``source="manual"``) — operator intent overrides automatic detection.
+    """
+    stats = {"manual": 0, "auto": 0, "skipped_target_missing": 0,
+             "skipped_self": 0, "skipped_existing_structural": 0}
+    tool_names = set(tg.tools.keys())
+
+    for s in schemas:
+        ai = (s.metadata or {}).get("ai_metadata") or {}
+        pairs = ai.get("pairs_well_with") or []
+        if not isinstance(pairs, list):
+            continue
+        for p in pairs:
+            if not isinstance(p, dict):
+                continue
+            target = str(p.get("tool") or "").strip()
+            if not target:
+                continue
+            if target == s.name:
+                stats["skipped_self"] += 1
+                continue
+            if target not in tool_names:
+                stats["skipped_target_missing"] += 1
+                continue
+
+            source = str(p.get("source") or "auto").strip().lower()
+            is_manual = source == "manual"
+            confidence = Confidence.EXTRACTED if is_manual else Confidence.INFERRED
+            reason = str(p.get("reason") or "")[:200]
+
+            # Existing-edge policy: if detect_dependencies already produced
+            # an edge here we keep it unless the operator is overriding.
+            if tg.graph.has_edge(s.name, target):
+                if not is_manual:
+                    stats["skipped_existing_structural"] += 1
+                    continue
+
+            try:
+                tg.add_relation(
+                    s.name,
+                    target,
+                    RelationType.COMPLEMENTARY,
+                    confidence=confidence,
+                    layer=2,
+                    evidence=f"pair[{source}]: {reason}" if reason else f"pair[{source}]",
+                )
+                stats["manual" if is_manual else "auto"] += 1
+            except (KeyError, ValueError):
+                stats["skipped_target_missing"] += 1
+
+    return stats
+
+
+def _source_label(schema: ToolSchema) -> str:
+    """Return the source label that distinguishes which OpenAPI spec a tool came from.
+
+    xgen-workflow tags each tool with ``metadata.source_label`` (e.g. "order",
+    "claim"). When that's absent, fall back to the first path segment so
+    cross-source detection still works for libraries used outside xgen.
+    """
+    md = schema.metadata or {}
+    label = md.get("source_label")
+    if label:
+        return str(label)
+    path = str(md.get("path") or "")
+    segs = [s for s in path.split("/") if s and not s.startswith("{")]
+    return segs[0] if segs else ""
+
+
+def ingest_openapi_graphify(
+    schemas: list[ToolSchema],
+    *,
+    extracted_min: float = DEFAULT_CONF_EXTRACTED,
+    inferred_min: float = DEFAULT_CONF_INFERRED,
+    ambiguous_min: float = DEFAULT_CONF_AMBIGUOUS,
+    spec: dict[str, Any] | None = None,
+    raw_spec: dict[str, Any] | None = None,
+) -> tuple[ToolGraph, dict[str, Any]]:
+    """Build a graphify-style ToolGraph from a list of ToolSchemas.
+
+    Parameters
+    ----------
+    schemas:
+        Tools to ingest. Pre-existing ``metadata.source_label`` enables
+        cross-source edge tracking.
+    extracted_min / inferred_min / ambiguous_min:
+        Confidence bucket thresholds (see ``bucket_confidence``).
+    spec:
+        Optional normalized spec dict, forwarded to ``detect_dependencies``.
+        Currently unused by the detector but kept for forward compat.
+    raw_spec:
+        Optional ORIGINAL OpenAPI/Swagger spec dict (BEFORE $ref resolution).
+        When supplied, runs ``preserve_refs_for_detection`` so the layer-1
+        shared-schema detector can fire on heavily $ref-using specs (typical
+        of SpringDoc-generated OpenAPI). xgen-workflow callers who already
+        bake refs into tool metadata via swagger_tool_generator can leave
+        this None.
+
+    Returns
+    -------
+    (ToolGraph, edge_stats):
+        ``edge_stats`` keys:
+          EXTRACTED, INFERRED, AMBIGUOUS, dropped:  int counts
+          by_relation:                              {relation_value: int}
+          cross_source:                             int  (edges across labels)
+          tool_count, edge_count:                   int
+          refs_preserved:                           int  (preserve_refs_for_detection result)
+          pair_edges:                               _apply_pair_hints stats (2+ schemas only)
+    """
+    tg = ToolGraph()
+    for s in schemas:
+        tg.add_tool(s)
+
+    label_by_name = {s.name: _source_label(s) for s in schemas}
+
+    stats: dict[str, Any] = {
+        "EXTRACTED": 0,
+        "INFERRED": 0,
+        "AMBIGUOUS": 0,
+        "dropped": 0,
+        "by_relation": {},
+        "cross_source": 0,
+        "tool_count": len(schemas),
+        "edge_count": 0,
+        "refs_preserved": 0,
+    }
+
+    if len(schemas) < 2:
+        return tg, stats
+
+    # Optional: rescue layer-1 shared-schema signal that ingest_openapi inlined.
+    if raw_spec is not None:
+        stats["refs_preserved"] = preserve_refs_for_detection(schemas, raw_spec)
+
+    # min_confidence=0.0 so we see every candidate; we re-bucket here.
+    relations: list[DetectedRelation] = detect_dependencies(
+        schemas, spec, min_confidence=0.0
+    )
+
+    seen: set[tuple[str, str, str]] = set()  # (src, tgt, relation_value)
+    for rel in relations:
+        bucket = bucket_confidence(
+            rel.layer,
+            rel.confidence,
+            extracted_min=extracted_min,
+            inferred_min=inferred_min,
+            ambiguous_min=ambiguous_min,
+        )
+        if bucket is None:
+            stats["dropped"] += 1
+            continue
+
+        rel_value = (
+            rel.relation_type.value
+            if hasattr(rel.relation_type, "value")
+            else str(rel.relation_type)
+        )
+        key = (rel.source, rel.target, rel_value)
+        if key in seen:
+            # detect_dependencies already de-duplicates, but be defensive.
+            continue
+        seen.add(key)
+
+        try:
+            tg.add_relation(
+                rel.source,
+                rel.target,
+                rel.relation_type,
+                confidence=bucket,
+                conf_score=rel.confidence,
+                layer=rel.layer,
+                evidence=rel.evidence,
+            )
+        except (KeyError, ValueError):
+            # Endpoint not in graph (shouldn't happen — tools were just added) — skip.
+            stats["dropped"] += 1
+            continue
+
+        stats[bucket.value] += 1
+        stats["by_relation"][rel_value] = stats["by_relation"].get(rel_value, 0) + 1
+
+        src_label = label_by_name.get(rel.source, "")
+        tgt_label = label_by_name.get(rel.target, "")
+        if src_label and tgt_label and src_label != tgt_label:
+            stats["cross_source"] += 1
+
+    # Derive workflow edges from ai_metadata.pairs_well_with — single
+    # source-of-truth lives on each tool's metadata, edges are regenerated
+    # on every rebuild so operator/LLM curation flows in automatically.
+    pair_stats = _apply_pair_hints(tg, schemas)
+    stats["pair_edges"] = pair_stats
+    # Fold pair edges into the global counters so ``edge_stats`` matches the graph.
+    stats["EXTRACTED"] += pair_stats.get("manual", 0)
+    stats["INFERRED"] += pair_stats.get("auto", 0)
+    if pair_stats.get("manual") or pair_stats.get("auto"):
+        stats["by_relation"]["complementary"] = (
+            stats["by_relation"].get("complementary", 0)
+            + pair_stats.get("manual", 0)
+            + pair_stats.get("auto", 0)
+        )
+        # cross_source recount: this walks the raw hints, not the edges added.
+        # NOTE(review): assumes every valid hint produced an edge — confirm.
+        for s in schemas:
+            ai = (s.metadata or {}).get("ai_metadata") or {}
+            for p in (ai.get("pairs_well_with") or []):
+                if not isinstance(p, dict):
+                    continue
+                tgt = str(p.get("tool") or "").strip()
+                if not tgt or tgt == s.name or tgt not in tg.tools:
+                    continue
+                src_lab = label_by_name.get(s.name, "")
+                tgt_lab = label_by_name.get(tgt, "")
+                if src_lab and tgt_lab and src_lab != tgt_lab:
+                    stats["cross_source"] += 1
+
+    stats["edge_count"] = tg.graph.edge_count()
+    return tg, stats
diff --git a/graph_tool_call/graphify/retrieval.py b/graph_tool_call/graphify/retrieval.py
new file mode 100644
index 0000000..55e659b
--- /dev/null
+++ b/graph_tool_call/graphify/retrieval.py
@@ -0,0 +1,478 @@
+"""Zero-vector retrieval over a graphify-style ToolGraph.
+
+Algorithm (mirrors graphify/serve.py):
+  1. seed = top-5 of BM25(query)  (substring fallback if BM25 returns empty)
+  2. weights = INTENT_RELATION_WEIGHTS[dominant_intent] or DEFAULT
+  3. score = rel_weight[rel] * CONF_FACTOR[confidence] * decay(depth)
+     CONF_FACTOR = {EXTRACTED: 1.0, INFERRED: 0.7, AMBIGUOUS: 0.4, None: 0.5}
+     decay(d)   = 1 / (0.5*d + 1)
+  4. BFS from seeds, depth=2, accumulate max score per neighbour
+  5. history-aware demote (used tools * 0.6)
+  6. render_subgraph_text(top_k nodes + edges, token_budget)
+
+Why this works without embeddings:
+  - The graph carries the semantic signal (CRUD chains, $ref data flow,
+    cross-resource matches) — once a relationship is in the graph, traversal
+    finds it.
+  - Confidence labels let the score down-weight guesses without dropping them;
+    AMBIGUOUS edges still appear, just behind EXTRACTED ones.
+  - Token-budgeted rendering means an LLM gets a compact, structured context
+    (not a list of tool JSON blobs) and can decide chains via the EDGE lines.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from typing import Any
+
+from graph_tool_call.core.protocol import GraphEngine
+from graph_tool_call.core.tool import ToolSchema
+from graph_tool_call.ontology.schema import (
+    DEFAULT_RELATION_WEIGHTS,
+    INTENT_RELATION_WEIGHTS,
+    NodeType,
+    RelationType,
+)
+from graph_tool_call.retrieval.intent import classify_intent
+from graph_tool_call.tool_graph import ToolGraph
+
+# Score multiplier per confidence bucket. EXTRACTED edges are deterministic
+# (path/CRUD/$ref) and trusted at 1.0; INFERRED is heuristic but still
+# high-confidence; AMBIGUOUS gets a strong penalty so it's surfaced for
+# review without dominating EXTRACTED chains.
+#
+# Edges whose ``confidence`` attr is missing or not one of the keys below
+# (e.g. legacy code paths) fall back to the ``None`` entry (0.5) — neither
+# rewarded nor heavily penalised.
+CONF_FACTOR: dict[str | None, float] = {
+    "EXTRACTED": 1.0,
+    "INFERRED": 0.7,
+    "AMBIGUOUS": 0.4,
+    None: 0.5,
+}
+
+_DEFAULT_DEPTH = 2  # default BFS depth for retrieve_graphify
+_DEFAULT_TOP_K = 10  # default cap on the number of returned tools
+_DEFAULT_BUDGET = 2000  # default token budget for the rendered subgraph text
+_HISTORY_DEMOTE = 0.6  # score multiplier for tools already listed in ``history``
+
+
+# ---------------------------------------------------------------------------
+# Seed selection
+# ---------------------------------------------------------------------------
+
+
+def _strip_diacritics(text: str) -> str:
+    """Return *text* NFKD-decomposed with every combining mark removed."""
+    return "".join(c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c))
+
+
+def _substring_seeds(
+    tools: dict[str, ToolSchema],
+    query: str,
+    *,
+    limit: int = 5,
+) -> list[tuple[str, float]]:
+    """Rank tools by plain substring hits; the fallback when BM25 finds nothing.
+
+    A query term found in the tool name adds 1.0, one found in the description
+    adds 0.5; terms shorter than two characters are ignored.
+    """
+    folded = _strip_diacritics(query).lower()
+    terms = [w for w in re.split(r"[\s_\-/.,;:!?()]+", folded) if len(w) > 1]
+    hits: list[tuple[str, float]] = []
+    for tool_name, schema in tools.items():
+        name_text = _strip_diacritics(tool_name).lower()
+        desc_text = _strip_diacritics(schema.description or "").lower()
+        total = sum(w in name_text for w in terms) + 0.5 * sum(w in desc_text for w in terms)
+        if total > 0:
+            hits.append((tool_name, total))
+    return sorted(hits, key=lambda hit: hit[1], reverse=True)[:limit]
+
+
+def _bm25_seeds(tg: ToolGraph, query: str, *, limit: int = 5) -> list[tuple[str, float]]:
+    """Return the ``limit`` best BM25 matches for ``query`` as (name, score) seeds.
+
+    Any failure while obtaining the BM25 index yields an empty list.
+    """
+    try:
+        index = tg._get_retrieval_engine()._get_bm25()  # noqa: SLF001
+    except Exception:
+        return []
+    hits = index.score(query) or {}
+    best = sorted(hits.items(), key=lambda kv: kv[1], reverse=True)[:limit]
+    return [(tool_name, value) for tool_name, value in best]
+
+
+def _select_seeds(
+    tg: ToolGraph,
+    query: str,
+    *,
+    limit: int = 5,
+) -> list[tuple[str, float]]:
+    """Pick seeds from BM25, falling back to substring matching when BM25 is empty."""
+    bm25_hits = _bm25_seeds(tg, query, limit=limit)
+    fallback_needed = not bm25_hits
+    return _substring_seeds(tg.tools, query, limit=limit) if fallback_needed else bm25_hits
+
+
+# ---------------------------------------------------------------------------
+# BFS traversal
+# ---------------------------------------------------------------------------
+
+
+def _intent_weights(query: str) -> tuple[dict[str, float], str]:
+    """Choose the relation-weight table matching the query's strongest intent.
+
+    Returns ``(weights, label)``, label in 'read'/'write'/'delete'/'neutral'.
+    """
+    intent = classify_intent(query)
+    if intent.is_neutral:
+        return DEFAULT_RELATION_WEIGHTS, "neutral"
+    label, strength = max(
+        (
+            ("read", intent.read_intent),
+            ("write", intent.write_intent),
+            ("delete", intent.delete_intent),
+        ),
+        key=lambda pair: pair[1],
+    )
+    if strength < 0.5:
+        return DEFAULT_RELATION_WEIGHTS, "neutral"
+    return INTENT_RELATION_WEIGHTS.get(label, DEFAULT_RELATION_WEIGHTS), label
+
+
+def _normalize_relation_key(rel: Any) -> Any:
+    """Map string relation attrs onto ``RelationType``; pass anything else through."""
+    # Enum members and non-string values need no conversion.
+    if isinstance(rel, RelationType) or not isinstance(rel, str):
+        return rel
+    # Strings that name no RelationType member are returned unchanged.
+    try:
+        return RelationType(rel)
+    except ValueError:
+        return rel
+
+
+def _bfs_from_seeds(
+    graph: GraphEngine,
+    seed_scores: list[tuple[str, float]],
+    *,
+    depth: int,
+    rel_weights: dict[Any, float],
+) -> tuple[dict[str, float], list[tuple[str, str]]]:
+    """Confidence-weighted BFS. Returns ``(scores, edges_visited)``.
+
+    Score policy:
+      seeds:      BM25 score normalized so the top seed is 1.0; a seed's score
+                  is never overwritten by the traversal.
+      neighbour:  reached at depth d via an edge of weight w and confidence c,
+        score(neighbour) = max(prev, parent_score * w * CONF_FACTOR[c] * 1/(0.5*d + 1))
+
+    The max is taken over every edge that reaches a tool, not only the first
+    one: the first-discovered path is not necessarily the strongest, so a tool
+    that is already known is re-scored (but never re-expanded) when a later
+    edge offers a higher value.
+
+    Why normalize seeds: scaling by ``score / max_seed_score`` preserves BM25's
+    relative ranking and lets a strongly-matching seed lift its 1-hop
+    neighbours above weakly-matching sibling seeds.
+
+    Tool nodes are scored; CATEGORY/DOMAIN nodes are passthrough hubs that are
+    visited but never scored, so a tool reached only through a hub inherits a
+    parent score of 0.0. Relations missing from ``rel_weights`` weigh 0.3.
+
+    ``edges_visited`` lists the ``(src, tgt)`` edge through which each
+    non-seed tool was first discovered.
+    """
+    if not seed_scores:
+        return {}, []
+
+    max_seed = max((s for _, s in seed_scores), default=1.0) or 1.0
+    scores: dict[str, float] = {
+        n: s / max_seed for n, s in seed_scores if graph.has_node(n)
+    }
+    seeds = frozenset(scores)
+    visited: set[str] = set(scores)
+    frontier: list[str] = list(scores)
+    edges_visited: list[tuple[str, str]] = []
+
+    for d in range(1, depth + 1):
+        decay = 1.0 / (0.5 * d + 1)
+        next_frontier: list[str] = []
+        for node in frontier:
+            parent_score = scores.get(node, 0.0)
+            try:
+                edges = graph.get_edges_from(node, direction="both")
+            except (KeyError, ValueError):
+                continue
+            for src, tgt, attrs in edges:
+                neighbour = tgt if src == node else src
+                # Seeds keep their BM25 score; self-loops carry no signal.
+                if neighbour == node or neighbour in seeds:
+                    continue
+                neighbour_type = graph.get_node_attrs(neighbour).get("node_type")
+                if neighbour_type == NodeType.TOOL:
+                    rel_key = _normalize_relation_key(attrs.get("relation"))
+                    conf_factor = CONF_FACTOR.get(attrs.get("confidence"), CONF_FACTOR[None])
+                    score = parent_score * rel_weights.get(rel_key, 0.3) * conf_factor * decay
+                    # Keep the best path: a later, stronger edge may still raise
+                    # the score of a tool that a weaker edge reached first.
+                    scores[neighbour] = max(scores.get(neighbour, 0.0), score)
+                    if neighbour not in visited:
+                        edges_visited.append((src, tgt))
+                elif neighbour_type not in (NodeType.CATEGORY, NodeType.DOMAIN):
+                    continue
+                # TOOL and CATEGORY/DOMAIN passthrough nodes are expanded once.
+                if neighbour not in visited:
+                    visited.add(neighbour)
+                    next_frontier.append(neighbour)
+        frontier = next_frontier
+        if not frontier:
+            break
+
+    return scores, edges_visited
+
+
+# ---------------------------------------------------------------------------
+# Subgraph rendering
+# ---------------------------------------------------------------------------
+
+
+def _node_line(name: str, tool: ToolSchema | None, attrs: dict) -> str:
+    """Render one ``NODE`` line: name, ``[METHOD path]``, source and community tags."""
+    md = (tool.metadata if tool else {}) or {}
+    method = str(md.get("method") or "").upper()
+    path = str(md.get("path") or "")
+    src_label = str(md.get("source_label") or "")
+    community = attrs.get("community")
+    parts = [name]
+    if method or path:
+        parts.append("[" + f"{method} {path}".strip() + "]")  # no stray space if one is empty
+    if src_label:
+        parts.append(f"[source={src_label}]")
+    if community is not None:
+        parts.append(f"[community={community}]")
+    return "NODE " + " ".join(p for p in parts if p)
+
+
+def _edge_line(
+    u: str,
+    v: str,
+    attrs: dict,
+) -> str:
+    """Render one ``EDGE`` line with the confidence in [] and the evidence in (...)."""
+    relation = attrs.get("relation")
+    if hasattr(relation, "value"):
+        label = relation.value
+    else:
+        label = str(relation)
+    confidence = attrs.get("confidence", "")
+    tag = f" [{confidence}]" if confidence else ""
+    evidence = attrs.get("evidence")
+    return f"EDGE {u} --{label}{tag}--> {v}" + (f"   ({evidence})" if evidence else "")
+
+
+def render_subgraph_text(
+    tg: ToolGraph,
+    nodes: set[str] | list[str],
+    edges: list[tuple[str, str]] | None = None,
+    *,
+    token_budget: int = _DEFAULT_BUDGET,
+    sort_by_score: dict[str, float] | None = None,
+) -> str:
+    """Render the matched subgraph as ``NODE ...`` / ``EDGE ...`` lines.
+
+    Approx 3 chars per token is the budget conversion. When the rendering
+    overflows the budget, the tail is cut at the last line break that fits
+    and a ``... (truncated ...)`` line is appended.
+
+    sort_by_score: if provided, NODE lines are emitted in descending score
+    order (ties broken by name); otherwise nodes are sorted by name.
+
+    edges: optional list of edges visited during BFS. It is currently not
+    read by this function: ALL graph edges between any pair of chosen nodes
+    are emitted regardless, so the LLM sees the full local structure
+    (matching graphify's behaviour).
+    """
+    char_budget = token_budget * 3
+    node_set: set[str] = set(nodes)
+
+    # Order nodes: by retrieval score (desc) if known, else by name.
+    if sort_by_score:
+        node_order = sorted(
+            node_set, key=lambda n: (-sort_by_score.get(n, 0.0), n)
+        )
+    else:
+        node_order = sorted(node_set)
+
+    lines: list[str] = []
+    for n in node_order:
+        if not tg.graph.has_node(n):
+            continue
+        attrs = tg.graph.get_node_attrs(n)
+        tool = tg.tools.get(n)
+        lines.append(_node_line(n, tool, attrs))
+
+    # Walk all graph edges between chosen nodes (not just BFS visited ones)
+    # so the LLM gets the complete local structure. Edge lines follow the
+    # node ordering of their source node; each (src, tgt) pair is emitted once.
+    seen_edges: set[tuple[str, str]] = set()
+    edge_lines: list[str] = []
+    for u in node_order:
+        if not tg.graph.has_node(u):
+            continue
+        try:
+            outgoing = tg.graph.get_edges_from(u, direction="out")
+        except (KeyError, ValueError):
+            continue
+        for src, tgt, attrs in outgoing:
+            if tgt not in node_set:
+                continue
+            key = (src, tgt)
+            if key in seen_edges:
+                continue
+            seen_edges.add(key)
+            edge_lines.append(_edge_line(src, tgt, attrs))
+
+    lines.extend(edge_lines)
+
+    output = "\n".join(lines)
+    if len(output) > char_budget:
+        # Cut at the last newline that fits, then append a marker. Keep the
+        # marker even if it pushes us slightly over the char budget — the
+        # token budget is a soft cap.
+        cut = output[:char_budget].rsplit("\n", 1)[0]
+        output = cut + f"\n... (truncated to ~{token_budget} token budget)"
+    return output
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def retrieve_graphify(
+    tg: ToolGraph,
+    query: str,
+    *,
+    top_k: int = _DEFAULT_TOP_K,
+    depth: int = _DEFAULT_DEPTH,
+    token_budget: int = _DEFAULT_BUDGET,
+    history: list[str] | None = None,
+) -> dict[str, Any]:
+    """Retrieve tools for a natural-language query using graph traversal only.
+
+    Parameters
+    ----------
+    tg:
+        A graphify-style ``ToolGraph``. Edges should carry ``confidence``
+        attrs (EXTRACTED/INFERRED/AMBIGUOUS); edges without one get the
+        neutral 0.5 multiplier.
+    query:
+        Natural-language search.
+    top_k:
+        Maximum tools in the result set (and the rendered subgraph).
+        Negative values are treated as 0.
+    depth:
+        BFS depth from seeds. 2 is graphify's default and works for most
+        workflow chains (createX -> getX -> doSomethingWithX).
+    token_budget:
+        Approximate token budget for the rendered text (~3 chars/token).
+    history:
+        Tool names already called in this session — they are demoted (×0.6)
+        to encourage progress through a workflow rather than re-suggesting.
+        A name listed several times is demoted once per occurrence.
+
+    Returns
+    -------
+    dict with keys:
+      - results:        list of {name, score, tool: {...}} sorted desc.
+      - subgraph_text:  the LLM-ready NODE/EDGE rendering.
+      - intent:         {dominant: 'read'|'write'|'delete'|'neutral', read, write, delete}
+      - stats:          {seeds: [...], visited_nodes: int, visited_edges: int}
+
+    An empty query, an empty graph, or a query that produces no seeds returns
+    the same shape with no results, empty text, neutral intent and zero stats.
+
+    Note: prerequisite chain construction (e.g. listOrders → getOrder → cancelOrder)
+    is NOT this function's job — it lives in Stage 2 ``synthesize_plan`` which
+    consumes the graph this module produces. retrieve_graphify only finds the
+    primary candidates; chain assembly is downstream.
+    """
+    # 1) Seeds — skipped entirely for an empty query or an empty graph.
+    seeds_with_scores = _select_seeds(tg, query, limit=5) if query and tg.tools else []
+    seed_names = [s for s, _ in seeds_with_scores]
+
+    # No seeds means nothing to traverse: return the empty result shape.
+    if not seed_names:
+        return {
+            "results": [],
+            "subgraph_text": "",
+            "intent": {"dominant": "neutral", "read": 0.0, "write": 0.0, "delete": 0.0},
+            "stats": {"seeds": [], "visited_nodes": 0, "visited_edges": 0},
+        }
+
+    # 2) Intent → relation weight map. The module-level classify_intent import
+    # is reused here; its raw scores are reported back to the caller below.
+    rel_weights, dominant = _intent_weights(query)
+    intent_obj = classify_intent(query)
+
+    # 3) BFS — pass full (name, score) pairs so seed scores reflect BM25 ranking
+    scores, edges_visited = _bfs_from_seeds(
+        tg.graph,
+        seeds_with_scores,
+        depth=depth,
+        rel_weights=rel_weights,
+    )
+
+    # 4) History demote — applied once per entry, so a tool listed twice in
+    # ``history`` is multiplied by _HISTORY_DEMOTE twice.
+    if history:
+        for h in history:
+            if h in scores:
+                scores[h] *= _HISTORY_DEMOTE
+
+    # 5) Filter to TOOL nodes only and rank. max() guards against a negative
+    # top_k, which would otherwise slice entries off the end of the ranking.
+    tool_scores: dict[str, float] = {
+        n: s for n, s in scores.items() if n in tg.tools
+    }
+    ranked = sorted(tool_scores.items(), key=lambda x: x[1], reverse=True)[: max(top_k, 0)]
+    chosen_names: set[str] = {n for n, _ in ranked}
+
+    # 6) Render
+    subgraph_text = render_subgraph_text(
+        tg,
+        chosen_names,
+        edges_visited,
+        token_budget=token_budget,
+        sort_by_score=tool_scores,
+    )
+
+    # Every ranked name came from tg.tools, so the lookup cannot miss.
+    results = [
+        {
+            "name": name,
+            "score": round(score, 4),
+            "tool": tg.tools[name].to_dict(),
+        }
+        for name, score in ranked
+    ]
+
+    return {
+        "results": results,
+        "subgraph_text": subgraph_text,
+        "intent": {
+            "dominant": dominant,
+            "read": round(intent_obj.read_intent, 3),
+            "write": round(intent_obj.write_intent, 3),
+            "delete": round(intent_obj.delete_intent, 3),
+        },
+        "stats": {
+            "seeds": seed_names,
+            "visited_nodes": len(scores),
+            "visited_edges": len(edges_visited),
+        },
+    }
diff --git a/graph_tool_call/ingest/openapi.py b/graph_tool_call/ingest/openapi.py
index 41ffe7e..f914fd4 100644
--- a/graph_tool_call/ingest/openapi.py
+++ b/graph_tool_call/ingest/openapi.py
@@ -134,6 +134,41 @@ def _schema_type(schema: dict[str, Any]) -> str:
     return _TYPE_MAP.get(schema.get("type", "string"), "string")
 
 
+def _pick_content_schema(content: dict[str, Any]) -> dict[str, Any]:
+    """Pick a usable schema from an OpenAPI ``content`` object.
+
+    Media types are tried in this order:
+
+      1. ``application/json``
+      2. the first key ending in ``+json`` (e.g. ``application/hal+json``)
+      3. ``*/*`` — Spring/SpringDoc default when no content type is pinned
+      4. the first media type that carries a schema — last resort
+
+    Always returns a dict (possibly empty); null or non-dict media-type
+    entries and non-dict schemas count as empty instead of raising. Looking
+    only at ``application/json`` used to leave ``response_schema`` empty for
+    every Spring endpoint that relies on the ``*/*`` default.
+    """
+    if not isinstance(content, dict) or not content:
+        return {}
+    json_variants = [k for k in content if isinstance(k, str) and k.endswith("+json")]
+    if "application/json" in content:
+        candidates = [content["application/json"]]
+    elif json_variants:
+        candidates = [content[json_variants[0]]]
+    elif "*/*" in content:
+        candidates = [content["*/*"]]
+    else:
+        # Last resort: the first content type with a schema.
+        candidates = list(content.values())
+    for media in candidates:
+        # Null / non-dict entries and non-dict schemas are skipped, not raised on.
+        schema = media.get("schema") if isinstance(media, dict) else None
+        if isinstance(schema, dict) and schema:
+            return schema
+    return {}
+
+
 # ---------------------------------------------------------------------------
 # Operation -> ToolSchema
 # ---------------------------------------------------------------------------
@@ -241,39 +276,109 @@ def _extract_params_openapi3(
     *,
     required_only: bool = False,
 ) -> list[ToolParameter]:
-    """Extract parameters from an OpenAPI 3.x operation."""
+    """Extract parameters from an OpenAPI 3.x operation.
+
+    Spring/SpringDoc gotcha: when a controller takes a `@ModelAttribute`
+    DTO via query string, the spec sometimes lists BOTH the wrapper
+    object AND its inner fields as separate query parameters
+    (``regularOrderDetailRequest`` ``in=query`` ``type=object`` AND
+    ``rglrDeliNo`` ``in=query`` ``type=string``). Treating the wrapper
+    as a real input field poisons downstream producer matching: nothing
+    in the API ever returns a value named after the wrapper class, so
+    PathSynthesizer raises ``UnsatisfiableField`` on a phantom field.
+
+    Strategy: drop wrapper parameters when their inner properties are
+    already exposed as siblings; otherwise expand the wrapper into its
+    leaf properties so callers see the real input names.
+    """
     params: list[ToolParameter] = []
 
+    raw_parameters = list(operation.get("parameters", []))
+    # Pre-collect names from non-object parameters — used to detect when
+    # a wrapper's inner property is already exposed alongside it.
+    sibling_names: set[str] = {
+        str(p.get("name") or "")
+        for p in raw_parameters
+        if isinstance(p, dict) and _schema_type(p.get("schema", {}) or {}) not in ("object",)
+    }
+
     # Path / query / header / cookie parameters
-    for p in operation.get("parameters", []):
+    for p in raw_parameters:
         if "name" not in p:
             continue  # skip malformed parameters (missing required 'name' field)
         schema = p.get("schema", {})
         is_required = p.get("required", False)
+        ptype = _schema_type(schema)
+
+        # Wrapper-object/array query parameter handling.
+        # type=object → wrapper itself (Spring @ModelAttribute style).
+        # type=array of objects → wrapper used to send a list of structured
+        # records (less common but seen in some Spring specs); we expand the
+        # element schema's properties. Primitive arrays (array of integers /
+        # strings) are real list inputs and are NOT expanded here — those
+        # belong to the caller as a single multi-value field.
+        if ptype in ("object", "array") and p.get("in") == "query":
+            wrapper_props: dict[str, Any] = {}
+            wrapper_required: set[str] = set()
+            if ptype == "object":
+                wrapper_props = (schema.get("properties") or {}) if isinstance(schema, dict) else {}
+                wrapper_required = set(schema.get("required") or [])
+            else:  # array
+                items = (schema.get("items") or {}) if isinstance(schema, dict) else {}
+                if isinstance(items, dict) and items.get("type") == "object":
+                    wrapper_props = items.get("properties") or {}
+                    wrapper_required = set(items.get("required") or [])
+                # else: primitive-element array — don't expand, treat as real input
+            if wrapper_props:
+                # If every inner property is already a sibling parameter,
+                # drop the wrapper entirely (deduplication).
+                if all(prop in sibling_names for prop in wrapper_props):
+                    continue
+                # Otherwise expand the wrapper into individual leaves so
+                # producer matching has real field names to chase.
+                for prop_name, prop_schema in wrapper_props.items():
+                    if prop_name in sibling_names:
+                        continue  # don't double-list ones already exposed
+                    inner_required = prop_name in wrapper_required
+                    if required_only and not inner_required:
+                        continue
+                    inner_type = _schema_type(prop_schema or {})
+                    inner_desc = (prop_schema or {}).get("description", "") or ""
+                    params.append(
+                        ToolParameter(
+                            name=prop_name,
+                            type=inner_type,
+                            description=inner_desc,
+                            required=inner_required,
+                            enum=(prop_schema or {}).get("enum"),
+                        )
+                    )
+                continue  # wrapper itself is not added
+
         if required_only and not is_required:
             continue
         desc = p.get("description", "") or ""
         # object/array 타입이면 nested fields를 description에 펼쳐서
         # LLM이 정확한 필드명(예: searchWord)을 알 수 있게 한다.
-        if _schema_type(schema) in ("object", "array"):
+        if ptype in ("object", "array"):
             nested = _summarize_object_schema(schema)
             if nested:
                 desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}"
         params.append(
             ToolParameter(
                 name=p["name"],
-                type=_schema_type(schema),
+                type=ptype,
                 description=desc,
                 required=is_required,
                 enum=schema.get("enum"),
             )
         )
 
-    # requestBody
+    # requestBody — pick the most specific schema across declared media types
+    # (Spring/SpringDoc commonly emits */* — see _pick_content_schema notes).
     request_body = operation.get("requestBody", {})
     content = request_body.get("content", {})
-    json_content = content.get("application/json", {})
-    body_schema = json_content.get("schema", {})
+    body_schema = _pick_content_schema(content)
     body_required = set(body_schema.get("required", []))
     for prop_name, prop_schema in body_schema.get("properties", {}).items():
         is_required = prop_name in body_required
@@ -429,21 +534,24 @@ def _operation_to_tool(
     else:
         parameters = _extract_params_openapi3(operation, resolved_spec, required_only=required_only)
 
-    # Build response schema metadata
+    # Build response schema metadata. Walk responses in success-code order
+    # and use _pick_content_schema so we don't drop schemas declared under
+    # */*, application/*+json, or other non-JSON media types.
     responses = operation.get("responses", {})
     response_schema: dict[str, Any] = {}
     for code in ("200", "201", "default"):
-        if code in responses:
-            resp = responses[code]
-            # Swagger 2.0
-            if "schema" in resp:
-                response_schema = resp["schema"]
-                break
-            # OpenAPI 3.x
-            resp_content = resp.get("content", {})
-            if "application/json" in resp_content:
-                response_schema = resp_content["application/json"].get("schema", {})
-                break
+        if code not in responses:
+            continue
+        resp = responses[code] or {}
+        # Swagger 2.0 puts the schema directly on the response object.
+        if "schema" in resp and isinstance(resp.get("schema"), dict):
+            response_schema = resp["schema"]
+            break
+        # OpenAPI 3.x: inspect the content map.
+        picked = _pick_content_schema(resp.get("content") or {})
+        if picked:
+            response_schema = picked
+            break
 
     metadata: dict[str, Any] = {
         "source": "openapi",
diff --git a/graph_tool_call/ontology/builder.py b/graph_tool_call/ontology/builder.py
index f6fb1a7..517d730 100644
--- a/graph_tool_call/ontology/builder.py
+++ b/graph_tool_call/ontology/builder.py
@@ -5,7 +5,7 @@
 from graph_tool_call.core.dict_graph import DictGraph
 from graph_tool_call.core.protocol import GraphEngine
 from graph_tool_call.core.tool import ToolSchema
-from graph_tool_call.ontology.schema import NodeType, RelationType
+from graph_tool_call.ontology.schema import Confidence, NodeType, RelationType
 
 
 class OntologyBuilder:
@@ -64,11 +64,36 @@ def add_relation(
         target: str,
         relation: str | RelationType,
         weight: float = 1.0,
+        *,
+        confidence: str | Confidence | None = None,
+        conf_score: float | None = None,
+        layer: int | None = None,
+        evidence: str | None = None,
     ) -> None:
-        """Add a directed relation between two nodes."""
+        """Add a directed relation between two nodes.
+
+        Optional graphify-style attrs (all default None — existing callers
+        unaffected):
+
+        confidence:  Confidence label (EXTRACTED / INFERRED / AMBIGUOUS).
+        conf_score:  Raw 0.0–1.0 score from the upstream detector.
+        layer:       1=structural (path/CRUD/$ref), 2=heuristic (name/RPC).
+        evidence:    Human-readable reason; capped at 200 chars to avoid bloat.
+        """
         if isinstance(relation, str):
             relation = RelationType(relation)
-        self._graph.add_edge(source, target, relation=relation, weight=weight)
+        if isinstance(confidence, Confidence):
+            confidence = confidence.value
+        attrs: dict = {"relation": relation, "weight": weight}
+        if confidence is not None:
+            attrs["confidence"] = confidence
+        if conf_score is not None:
+            attrs["conf_score"] = float(conf_score)
+        if layer is not None:
+            attrs["layer"] = int(layer)
+        if evidence:
+            attrs["evidence"] = evidence[:200]
+        self._graph.add_edge(source, target, **attrs)
 
     # --- queries ---
 
diff --git a/graph_tool_call/ontology/llm_provider.py b/graph_tool_call/ontology/llm_provider.py
index 76e26bc..6ee8b4e 100644
--- a/graph_tool_call/ontology/llm_provider.py
+++ b/graph_tool_call/ontology/llm_provider.py
@@ -68,10 +68,23 @@ class FieldSemantic:
 
 @dataclass
 class PairHint:
-    """LLM-suggested tool that pairs with the current tool."""
+    """A tool that pairs with the current tool in a workflow.
+
+    ``source`` distinguishes ownership so re-running auto enrichment doesn't
+    overwrite operator curation:
+      - ``"auto"``   — produced by Pass 2a (per-tool batch) or Pass 2b
+                       (cross-batch). Replaced on every Pass 2b re-run.
+      - ``"manual"`` — added by an operator through the UI. Never overwritten
+                       by automatic enrichment.
+
+    Default ``"manual"`` is intentional: legacy data without a ``source``
+    field gets the safer label, so a Pass 2b re-run does not silently delete
+    pre-existing entries that may have been hand-curated.
+    """
 
     tool: str
-    reason: str
+    reason: str = ""
+    source: str = "manual"
 
 
 @dataclass
@@ -192,11 +205,7 @@ class ToolEnrichment:
 Produce structured metadata that downstream components use to (1) pick the
 right tool for a user's goal, (2) synthesize execution plans, and (3) wire
 one tool's output to another tool's input.
-
-AVAILABLE TOOLS IN THE COLLECTION (names + 1-line descriptions, for
-pairs_well_with reference):
-{all_tools_brief}
-
+{reference_block}{vocab_block}
 TOOLS TO ANNOTATE (this batch):
 {batch_detailed}
 
@@ -243,6 +252,51 @@ class ToolEnrichment:
   - Return JSON only. No markdown fences, no prose, no comments."""
 
 
+# Pass 2b — cross-batch workflow pairing.
+#
+# Per-tool enrichment (Pass 2a) only sees one batch at a time, so it cannot
+# spot pairs whose other half lives in a different batch. This prompt shows
+# the entire collection's 1-line summaries so the LLM can suggest workflow
+# successors that span resources.
+#
+# The output is batched (subset of tools per call) to stay within the
+# response token budget — input stays full, output stays small.
+_PAIRS_PROMPT = """\
+You are reviewing an API tool collection to suggest workflow pairs.
+
+For EACH tool in the OUTPUT BATCH, suggest 2-4 OTHER tools from the FULL
+TOOL LIST that are commonly invoked just before or just after this tool in
+a real-world workflow. Pairs SHOULD cross resource boundaries when there is
+a natural business sequence (e.g. product detail → add to cart → checkout).
+
+Pair quality matters more than quantity — only suggest tools you are
+confident about. If a tool has no good pair candidates, return an empty
+array for it.
+
+FULL TOOL LIST (all available tools — pick pairs only from this list):
+{full_list}
+
+OUTPUT BATCH (suggest pairs ONLY for these tools):
+{batch_list}
+
+OUTPUT FORMAT (strict JSON):
+{{
+  "tool_name_1": [
+    {{"tool": "other_tool_name", "reason": "short reason"}},
+    ...
+  ],
+  "tool_name_2": [...],
+  ...
+}}
+
+STRICT RULES:
+  - You MUST include one entry for EVERY tool in the OUTPUT BATCH (use
+    empty array if no good pairs).
+  - Pair tool names MUST exactly match a name in the FULL TOOL LIST.
+  - Do NOT pair a tool with itself.
+  - Return JSON only. No markdown fences, no prose, no comments."""
+
+
 def _format_tools_list(tools: list[ToolSummary]) -> str:
     lines = []
     for i, t in enumerate(tools, 1):
@@ -261,6 +315,22 @@ def _format_tools_brief(tools: list[ToolSummary]) -> str:
     return "\n".join(f"- {t.name}" for t in tools)
 
 
+def _format_tools_for_pairs(tools: list[ToolSummary]) -> str:
+    """Render one ``- name: summary`` line per tool for Pass 2b prompts.
+
+    ``description`` is flattened to one line (each whitespace run, CR/LF
+    and tabs included, becomes a single space) and capped at 100 chars;
+    tools without a description render as ``- name`` only.
+    """
+    lines = []
+    for t in tools:
+        summary = " ".join((t.description or "").split())
+        if len(summary) > 100:
+            summary = summary[:97] + "..."
+        lines.append(f"- {t.name}: {summary}" if summary else f"- {t.name}")
+    return "\n".join(lines)
+
+
 def _format_tools_for_enrichment(tools: list[ToolSummary]) -> str:
     """Detailed per-tool block for enrichment prompt input."""
     blocks = []
@@ -307,10 +377,15 @@ def _parse_enrichment(data: Any) -> ToolEnrichment | None:
                     kind=kind,
                 )
             )
+        # Pairs from per-tool enrichment are batch-scoped (LLM only sees the
+        # current batch), so quality is lower than cross-batch Pass 2b.
+        # Marked source="auto" so a Pass 2b run can replace them while
+        # preserving operator-curated source="manual" entries.
         pairs = [
             PairHint(
                 tool=str(p.get("tool", "")).strip(),
                 reason=str(p.get("reason", "")).strip(),
+                source="auto",
             )
             for p in (data.get("pairs_well_with") or [])
             if isinstance(p, dict) and str(p.get("tool", "")).strip()
@@ -624,45 +699,129 @@ def generate_example_queries(
 
         return all_queries
 
+    def enrich_pairs(
+        self,
+        tools: list[ToolSummary],
+        batch_size: int = 30,
+    ) -> dict[str, list[PairHint]]:
+        """Pass 2b — cross-batch workflow pair suggestion.
+
+        Unlike Pass 2a (``enrich_tool_semantics``), which sees only the
+        current batch, every call here shows the LLM the full collection's
+        1-line summaries so it can suggest pairs that cross resource
+        boundaries (e.g. ``getProductDetail → addToCart``).
+
+        Only the OUTPUT is batched: the full list is sent on every call
+        and each call asks for pairs for at most ``batch_size`` tools.
+        That keeps each response small so long pair lists are not cut off
+        by the output-token limit; it does NOT shorten the prompt.
+
+        Tools should arrive with ``description`` set to the Pass 2a
+        ``one_line_summary`` when available, so pairing can rely on
+        workflow meaning rather than tool names alone.
+
+        Self-pairs and targets that are not a name in ``tools`` (LLM
+        hallucinations) are dropped; unparseable responses are skipped.
+
+        Returns: {tool_name: [PairHint(source="auto"), ...]}
+        """
+        results: dict[str, list[PairHint]] = {}
+        if not tools:
+            return results
+
+        full_list = _format_tools_for_pairs(tools)
+        known_names = {t.name for t in tools}
+
+        for i in range(0, len(tools), batch_size):
+            batch = tools[i : i + batch_size]
+            batch_list = _format_tools_for_pairs(batch)
+            prompt = _PAIRS_PROMPT.format(full_list=full_list, batch_list=batch_list)
+            response = self.generate(prompt)
+
+            try:
+                parsed = _extract_json(response)
+            except (json.JSONDecodeError, KeyError, TypeError):
+                continue
+            if not isinstance(parsed, dict):
+                continue
+            for name, raw_pairs in parsed.items():
+                if not isinstance(raw_pairs, list):
+                    continue
+                pair_list: list[PairHint] = []
+                for p in raw_pairs:
+                    if not isinstance(p, dict):
+                        continue
+                    target = str(p.get("tool", "")).strip()
+                    if not target or target == name or target not in known_names:
+                        continue
+                    reason = str(p.get("reason", "")).strip()
+                    pair_list.append(PairHint(tool=target, reason=reason, source="auto"))
+                results[str(name)] = pair_list
+
+        return results
+
     def enrich_tool_semantics(
         self,
         tools: list[ToolSummary],
         batch_size: int = 10,
         *,
         reference_tools: list[ToolSummary] | None = None,
+        existing_vocab: list[str] | None = None,
+        valid_tool_names: set[str] | None = None,
     ) -> dict[str, ToolEnrichment]:
         """Per-tool semantic annotation for Plan-and-Execute architecture.
 
-        ``tools`` = the batch (or batches) of tools to produce detailed
-        enrichment for. ``reference_tools`` = the full catalog used only to
-        build ``all_tools_brief`` in the prompt (so LLM picks
-        ``pairs_well_with`` from valid names). If ``reference_tools`` is
-        None, falls back to ``tools``.
-
-        Streaming callers typically pass one batch in ``tools`` + the full
-        collection in ``reference_tools`` + ``batch_size=len(tools)`` so the
-        internal loop runs once per caller invocation.
-
-        Output is used by:
-          - Stage 1 (target selection) — ``one_line_summary`` + ``when_to_use``
-            in tool catalog make LLM picks more accurate with smaller context.
-          - Stage 2 (path synthesis) — ``produces_semantics`` /
-            ``consumes_semantics`` carry canonical semantic ids so bindings
-            work across convention mismatches (e.g. ``goodsNo`` ≡ ``productId``)
-            without a hardcoded synonym table.
-          - Graph edges — ``pairs_well_with`` becomes optional semantic edges
-            that complement structural field-match edges.
+        ``tools`` = the batch(es) to produce detailed enrichment for.
+
+        ``reference_tools`` (optional, default ``None``) — when supplied,
+        rendered as a brief tool list in the prompt so the LLM can pick
+        ``pairs_well_with`` from valid names. **Streaming callers should
+        usually pass ``None``** — Pass 2b handles pairs in a separate
+        cross-batch call, and skipping the reference block saves ~50%
+        prompt tokens. The pair list emitted in this pass is post-validated
+        against ``valid_tool_names`` instead.
+
+        ``existing_vocab`` (optional) — accumulated semantic ids decided in
+        previous batches of the same enrichment run. The LLM is asked to
+        reuse these labels when applicable, which keeps cross-batch vocab
+        consistent (avoids ``product_id`` vs ``productId`` divergence).
+        Streaming callers should pass the unique semantics seen so far.
+
+        ``valid_tool_names`` (optional) — full set of tool names in the
+        collection. When supplied, ``pairs_well_with`` entries pointing to
+        tools outside this set are dropped silently (LLM hallucination
+        guard). When ``reference_tools`` is None the LLM only knows the
+        names in the current batch; without this guard it would invent
+        names for cross-batch pairs.
         """
         results: dict[str, ToolEnrichment] = {}
         if not tools:
             return results
 
-        all_brief = _format_tools_brief(reference_tools or tools)
+        ref_block = ""
+        if reference_tools:
+            ref_block = (
+                "\nAVAILABLE TOOLS IN THE COLLECTION (names + 1-line "
+                "descriptions, for pairs_well_with reference):\n"
+                + _format_tools_brief(reference_tools)
+                + "\n"
+            )
+
+        vocab_block = ""
+        if existing_vocab:
+            vocab_block = (
+                "\nEXISTING SEMANTIC VOCABULARY (reuse these canonical ids "
+                "when the field has the same meaning — keeps cross-batch "
+                "labels consistent):\n"
+                + "\n".join(f"- {s}" for s in sorted(set(existing_vocab)))
+                + "\n"
+            )
 
         for i in range(0, len(tools), batch_size):
             batch = tools[i : i + batch_size]
             prompt = _ENRICH_SEMANTICS_PROMPT.format(
-                all_tools_brief=all_brief,
+                reference_block=ref_block,
+                vocab_block=vocab_block,
                 batch_detailed=_format_tools_for_enrichment(batch),
             )
             response = self.generate(prompt)
@@ -673,8 +832,16 @@ def enrich_tool_semantics(
                     continue
                 for name, data in parsed.items():
                     enrichment = _parse_enrichment(data)
-                    if enrichment is not None and enrichment.canonical_action:
-                        results[str(name)] = enrichment
+                    if enrichment is None or not enrichment.canonical_action:
+                        continue
+                    # Hallucination guard for pairs_well_with — drop entries
+                    # whose target name is not in the catalog.
+                    if valid_tool_names is not None:
+                        enrichment.pairs_well_with = [
+                            p for p in enrichment.pairs_well_with
+                            if p.tool in valid_tool_names and p.tool != str(name)
+                        ]
+                    results[str(name)] = enrichment
             except (json.JSONDecodeError, KeyError, TypeError):
                 continue
 
diff --git a/graph_tool_call/ontology/schema.py b/graph_tool_call/ontology/schema.py
index 04086fb..2a67290 100644
--- a/graph_tool_call/ontology/schema.py
+++ b/graph_tool_call/ontology/schema.py
@@ -24,6 +24,26 @@ class NodeType(str, Enum):
     DOMAIN = "domain"
 
 
+class Confidence(str, Enum):
+    """Edge confidence label, graphify-style.
+
+    Every edge in a graphify-style ToolGraph carries one of three labels so
+    downstream consumers (LLM agents, retrieval scoring, UI) can distinguish
+    deterministic facts from heuristic guesses.
+
+    EXTRACTED  — derived deterministically from the spec (path hierarchy,
+                 shared $ref, CRUD pattern). conf_score >= 0.85 AND layer == 1.
+    INFERRED   — heuristic match (name-based, RPC pattern, cross-resource).
+                 conf_score >= 0.85 but not strictly structural.
+    AMBIGUOUS  — low-confidence heuristic (0.70 <= conf_score < 0.85).
+                 Surface in UI for review; retrieval applies a score penalty.
+    """
+
+    EXTRACTED = "EXTRACTED"
+    INFERRED = "INFERRED"
+    AMBIGUOUS = "AMBIGUOUS"
+
+
 # Weights for relation types during retrieval scoring
 DEFAULT_RELATION_WEIGHTS: dict[str, float] = {
     RelationType.SIMILAR_TO: 0.8,
diff --git a/graph_tool_call/plan/synthesizer.py b/graph_tool_call/plan/synthesizer.py
index 44696cf..35858c4 100644
--- a/graph_tool_call/plan/synthesizer.py
+++ b/graph_tool_call/plan/synthesizer.py
@@ -90,6 +90,25 @@ def __init__(
         self.label_field_hints = list(label_field_hints)
 
 
+def _normalize_field_name(name: str) -> str:
+    """Lowercase + strip non-alphanumerics for loose field-name matching.
+
+    Conservative on purpose:
+      ``ordNo`` → ``ordno``    ``ord_no`` → ``ordno``    ``ORD-NO`` → ``ordno``
+
+    Token roots stay distinct:
+      ``ordNo`` ≠ ``orderNo``  (``ordno`` ≠ ``orderno``)
+
+    Token-level synonym mapping (``ord`` ↔ ``order``) is domain-specific
+    and intentionally NOT done here — divergent roots are left to the
+    graph-edge signals consulted by ``_find_producer`` rather than to
+    name guessing. Empty input returns ``""``.
+    """
+    if not name:
+        return ""
+    return "".join(ch.lower() for ch in name if ch.isalnum())
+
+
 @dataclass
 class _PartialStep:
     """In-progress step being built during bottom-up synthesis."""
@@ -140,6 +180,18 @@ def __init__(
         # semantic_tag -> [tool_name], insertion order preserved
         self._producers_by_semantic: dict[str, list[str]] = {}
         self._producers_by_field: dict[str, list[str]] = {}
+        # Loose-field index: normalised field name → [tool_name].
+        # Lets ``ordNo`` match producers of ``ordno`` / ``ord_no`` / ``ORDNO``.
+        # Conservative — only normalises case + separators, never strips
+        # tokens (so ``ordNo`` ≠ ``orderNo`` — those need the graph fallback).
+        self._producers_by_loose_field: dict[str, list[str]] = {}
+        # graphify-mode adjacency: ``tool_name -> [edge_dict]`` for outgoing
+        # workflow edges (REQUIRES / PRECEDES / COMPLEMENTARY). Used as a
+        # fallback in ``_find_producer`` when neither semantic_tag nor
+        # field_name match — we walk the graph the user/extractor built
+        # rather than failing on field-name divergence.
+        self._workflow_edges_out: dict[str, list[dict[str, Any]]] = {}
+        self._index_workflow_edges(graph)
         self._build_producer_indexes()
 
     # ------------------------------------------------------------------
@@ -197,6 +249,23 @@ def synthesize(
             ))
 
         target_step_id = steps_by_tool[target].step_id
+
+        # Collect user_input slots so the runner can prompt the caller in
+        # advance and the UI can render a single popup with all missing
+        # fields, instead of one popup per step. Each entry records which
+        # step (``step_id``) and tool need which argument (``field_name``).
+        # NOTE: the field's semantic_tag is not included in the entry yet,
+        # so consumers cannot map a slot to a registered enum/popup from it.
+        user_input_slots: list[dict[str, Any]] = []
+        for step in final_steps:
+            for arg_name, arg_val in (step.args or {}).items():
+                if isinstance(arg_val, str) and arg_val.startswith("${user_input."):
+                    user_input_slots.append({
+                        "step_id": step.id,
+                        "tool": step.tool,
+                        "field_name": arg_name,
+                    })
+
         return Plan(
             id=str(uuid.uuid4()),
             goal=goal or f"Execute {target}",
@@ -210,6 +279,7 @@ def synthesize(
                 "target": target,
                 "entities": dict(entities),
                 "synthesized_by": "PathSynthesizer/v1",
+                "user_input_slots": user_input_slots,
             },
         )
 
@@ -297,15 +367,26 @@ def _resolve(
                 )
 
             # 5. Required data field → rank candidate producers and pick the best.
+            #    Pass ``visiting`` as ``excluded`` so cycle-prone candidates are
+            #    skipped here (Cycle policy A). The chain reroutes around the
+            #    cycle when an alternative producer exists; only when none
+            #    remains does the caller fall through to user-input slot (F2).
             producer = self._find_producer(
                 semantic=semantic, field_name=field_name,
                 target_tool=tool_name, entities=entities,
+                excluded=visiting,
             )
             if producer is None:
-                raise UnsatisfiableFieldError(
-                    f"tool {tool_name!r} requires {field_name!r} "
-                    f"(semantic={semantic!r}) but no entity or producer found"
-                )
+                # F2 + Cycle policy B: gracefully surface the field as a
+                # ``${user_input.<field>}`` placeholder rather than aborting
+                # the entire plan. The runner detects the placeholder at
+                # step-start and asks the user (or its surrounding agent)
+                # to supply the value. The plan's metadata records every
+                # such slot so the caller can pre-collect inputs.
+                placeholder = f"${{user_input.{field_name}}}"
+                args[field_name] = placeholder
+                rationales.append(f"{field_name} ← user_input")
+                continue
 
             # 5a. Dynamic-option popup priority. Detect "read-detail then
             #     pick one" patterns where the producer is a single-hop
@@ -342,14 +423,27 @@ def _resolve(
                         label_field_hints=self._label_hints_for(producer, opt_path),
                     )
 
-            # Recurse into the producer first so step_id ordering is correct
-            self._resolve(
-                tool_name=producer,
-                entities=entities,
-                steps_by_tool=steps_by_tool,
-                visiting=visiting,
-                depth=depth + 1,
-            )
+            # Recurse into the producer first so step_id ordering is correct.
+            # Cycle policy B + F2: if the producer's own chain is too deep
+            # or cycles back, we don't abort the whole plan — we drop this
+            # producer and fall back to a user_input slot for the field.
+            # This keeps the surface tool callable when the prerequisite
+            # chain extends beyond what the synthesiser can flatten.
+            try:
+                self._resolve(
+                    tool_name=producer,
+                    entities=entities,
+                    steps_by_tool=steps_by_tool,
+                    visiting=visiting,
+                    depth=depth + 1,
+                )
+            except (MaxDepthExceededError, CyclicDependencyError) as exc:
+                placeholder = f"${{user_input.{field_name}}}"
+                args[field_name] = placeholder
+                rationales.append(
+                    f"{field_name} ← user_input (chain unflattenable: {exc.__class__.__name__})"
+                )
+                continue
 
             # Build a placeholder binding — will be rewritten after step_ids
             # are assigned. Format: ${<tool_name>.<jsonpath-sans-root>}
@@ -370,16 +464,107 @@ def _resolve(
     # ------------------------------------------------------------------
 
     def _build_producer_indexes(self) -> None:
-        """Index which tools produce which semantic / field across graph."""
+        """Index which tools produce which semantic / field across the graph.
+
+        Echo-back filter: a tool that takes ``ordNo`` as input and echoes it
+        back in its response is NOT a producer of ``ordNo`` in any useful
+        sense — it's just relaying the value the caller already supplied. We
+        skip those entries so the index reflects tools that actually CREATE
+        or DISCOVER the value (``listOrders``, ``createOrder``,
+        ``searchOrders`` etc.) rather than every endpoint that happens to
+        round-trip the field.
+
+        Same rule applied to ``semantic_tag`` for parity with the LLM Pass 2
+        enrichment path. Empty consumes (no input fields) → never echo, so
+        all produces are real producers.
+        """
         for name, tool in self._tools.items():
             meta = tool.get("metadata") or {}
+            consumed_fields: set[str] = set()
+            consumed_semantics: set[str] = set()
+            for c in meta.get("consumes") or []:
+                if not isinstance(c, dict):
+                    continue
+                cf = c.get("field_name") or ""
+                cs = c.get("semantic_tag") or ""
+                if cf:
+                    consumed_fields.add(cf)
+                if cs:
+                    consumed_semantics.add(cs)
+
             for produce in meta.get("produces") or []:
                 sem = produce.get("semantic_tag") or ""
                 fname = produce.get("field_name") or ""
+                # Skip pure echo-back: the field came in, gets relayed out.
+                if fname and fname in consumed_fields:
+                    continue
+                if sem and sem in consumed_semantics:
+                    continue
                 if sem:
                     self._producers_by_semantic.setdefault(sem, []).append(name)
                 if fname:
                     self._producers_by_field.setdefault(fname, []).append(name)
+                    loose = _normalize_field_name(fname)
+                    if loose and loose != fname:
+                        self._producers_by_loose_field.setdefault(loose, []).append(name)
+
+    # ---- graphify edge indexing & traversal ---------------------------------
+
+    _WORKFLOW_RELATIONS: frozenset[str] = frozenset(
+        {"requires", "precedes", "complementary"}
+    )
+    _CONFIDENCE_RANK: dict[str, int] = {
+        "EXTRACTED": 0,
+        "INFERRED": 1,
+        "AMBIGUOUS": 2,
+    }
+
+    def _index_workflow_edges(self, graph: dict[str, Any]) -> None:
+        """Group outgoing workflow edges of ``graph`` by their source tool.
+
+        Reads the edge list from ``graph["graph"]["edges"]``, falling back
+        to ``graph["graph"]["links"]``. Only dict entries with a source, a
+        target and a relation in ``_WORKFLOW_RELATIONS`` are kept; a missing
+        ``confidence`` is stored as ``None``, a missing ``conf_score`` as
+        ``0.0`` and a missing ``evidence`` as ``""``.
+        """
+        inner = graph.get("graph") or {}
+        for edge in inner.get("edges") or inner.get("links") or []:
+            if not isinstance(edge, dict):
+                continue
+            source = edge.get("source") or edge.get("from")
+            target = edge.get("target") or edge.get("to")
+            raw_rel = edge.get("relation")
+            if hasattr(raw_rel, "value"):
+                relation = raw_rel.value.lower()
+            elif raw_rel is not None:
+                relation = str(raw_rel).lower()
+            else:
+                relation = ""
+            if not (source and target) or relation not in self._WORKFLOW_RELATIONS:
+                continue
+            self._workflow_edges_out.setdefault(source, []).append({
+                "target": target, "relation": relation,
+                "confidence": edge.get("confidence"),
+                "conf_score": float(edge.get("conf_score") or 0.0),
+                "evidence": edge.get("evidence") or "",
+            })
+
+    # Producer-signal score weights. Higher = stronger signal that this
+    # candidate genuinely produces the value the target needs. Weights chosen
+    # so combined signals (e.g. graph EXTRACTED + field exact = 90) beat any
+    # single signal, and graph EXTRACTED alone (50) beats field exact alone
+    # (40) — Path/$ref/CRUD-derived edges are more reliable than coincidental
+    # field-name overlap. ``semantic_exact`` requires LLM Pass 2 enrichment;
+    # when present it's the strongest signal we have.
+    _SIGNAL_WEIGHTS: dict[str, int] = {
+        "semantic_exact": 100,
+        "graph_EXTRACTED": 50,
+        "field_exact": 40,
+        "graph_INFERRED": 20,
+        "field_loose": 10,
+        "graph_AMBIGUOUS": 5,
+    }
 
     def _find_producer(
         self,
@@ -388,36 +573,133 @@ def _find_producer(
         field_name: str,
         target_tool: str,
         entities: dict[str, Any],
+        excluded: set[str] | None = None,
     ) -> str | None:
-        """Pick the best-ranked producer for ``semantic`` (or ``field_name``).
-
-        Candidates are gathered from both indexes (semantic first), then
-        ranked using Pass 2 metadata (``_rank_producers``) and finally
-        filtered by ``_is_chain_eligible`` — discards producers whose
-        ``canonical_action`` / ``primary_resource`` signal they're
-        unrelated to the target's domain (e.g. claim-cost calculator
-        showing up as a producer for a basket field just because a
-        ``produces`` entry happens to match).
+        """Pick the best producer using combined graph + schema signals.
+
+        Producer matching combines two first-class signals additively (NOT
+        as a fallback chain); a candidate needs at least one of them:
+          (a) Schema match — semantic_tag / field_name on ``produces``.
+          (b) Graph traversal — outgoing REQUIRES / PRECEDES / COMPLEMENTARY
+              edges from ``target_tool``, ranked by ``confidence``.
+
+        A candidate accumulates one entry per matching signal. The signal
+        weights live in ``_SIGNAL_WEIGHTS`` and combine additively, so a
+        candidate matched by both graph EXTRACTED and field_exact (90) wins
+        over one matched only by field_exact (40). Tie-break uses the
+        existing Pass-2 ``_rank_producers`` (entity affinity, pair hint,
+        canonical action), and ``_is_chain_eligible`` still gates the final
+        pick — sparse Pass-2 metadata pass-throughs apply unchanged.
+
+        ``excluded`` is the set of tools currently being resolved (the
+        caller's ``visiting`` set). Producer candidates in this set would
+        re-enter recursion and trigger ``CyclicDependencyError`` — we skip
+        them here so the second-best candidate gets a chance instead. This
+        is the "skip-this-branch" cycle policy: the chain reroutes around
+        the cycle when alternative producers exist; only when all candidates
+        cycle does the caller fall back to user-input slot handling.
+
+        Returns the highest-scoring eligible candidate, or None if no
+        candidate has any signal (or all signals point to ``excluded`` tools).
         """
-        candidates: list[str] = []
-        seen: set[str] = set()
+        excluded = excluded or set()
+        candidate_signals: dict[str, set[str]] = {}
+
+        def _record(name: str, signal: str) -> None:
+            if name and name != target_tool:
+                candidate_signals.setdefault(name, set()).add(signal)
+
+        # (a) schema-side: exact semantic / field_name (echo-back already
+        # filtered when the index was built).
         if semantic:
-            for name in self._producers_by_semantic.get(semantic, []):
-                if name != target_tool and name not in seen:
-                    candidates.append(name)
-                    seen.add(name)
+            for n in self._producers_by_semantic.get(semantic, []):
+                _record(n, "semantic_exact")
         if field_name:
-            for name in self._producers_by_field.get(field_name, []):
-                if name != target_tool and name not in seen:
-                    candidates.append(name)
-                    seen.add(name)
-        if not candidates:
+            for n in self._producers_by_field.get(field_name, []):
+                _record(n, "field_exact")
+
+        # (a') schema-side: loose field match — separator/case folded.
+        # ``ordNo`` won't match ``orderNo`` (different roots) but will match
+        # ``ord_no`` / ``ORDNO``. Cross-naming-convention safety net.
+        if field_name:
+            loose = _normalize_field_name(field_name)
+            if loose:
+                for n in self._producers_by_loose_field.get(loose, []):
+                    if n in candidate_signals:
+                        continue  # already had a stronger signal
+                    _record(n, "field_loose")
+
+        # (b) graph-side: walk outgoing workflow edges, verify each
+        # candidate actually has a matching produces entry.
+        edges = self._workflow_edges_out.get(target_tool) or []
+        loose_target = _normalize_field_name(field_name) if field_name else ""
+        for e in edges:
+            cand = e.get("target")
+            if not cand or cand == target_tool:
+                continue
+            tool = self._tools.get(cand)
+            if not tool:
+                continue
+            cand_consumes_fields = {
+                (c or {}).get("field_name", "")
+                for c in (tool.get("metadata") or {}).get("consumes") or []
+                if isinstance(c, dict)
+            }
+            cand_consumes_semantics = {
+                (c or {}).get("semantic_tag", "")
+                for c in (tool.get("metadata") or {}).get("consumes") or []
+                if isinstance(c, dict)
+            }
+            for p in (tool.get("metadata") or {}).get("produces") or []:
+                if not isinstance(p, dict):
+                    continue
+                p_sem = p.get("semantic_tag") or ""
+                p_fname = p.get("field_name") or ""
+                # Echo-back guard for the candidate itself — same rule as
+                # _build_producer_indexes, applied here so graph-edge
+                # discoveries don't sneak in a relayed value.
+                if p_fname and p_fname in cand_consumes_fields:
+                    continue
+                if p_sem and p_sem in cand_consumes_semantics:
+                    continue
+
+                matched = False
+                if semantic and p_sem == semantic:
+                    matched = True
+                elif field_name and p_fname == field_name:
+                    matched = True
+                elif loose_target and _normalize_field_name(p_fname) == loose_target:
+                    matched = True
+                if not matched:
+                    continue
+
+                conf = e.get("confidence") or "AMBIGUOUS"
+                _record(cand, f"graph_{conf}")
+                break  # one signal per candidate per edge target is enough
+
+        if not candidate_signals:
             return None
 
+        # Score and pre-rank by signal strength (equal scores ordered by name).
+        def _score(signals: set[str]) -> int:
+            return sum(self._SIGNAL_WEIGHTS.get(s, 0) for s in signals)
+
+        scored = sorted(
+            candidate_signals.items(),
+            key=lambda item: (-_score(item[1]), item[0]),
+        )
+        sorted_names = [n for n, _ in scored]
+
+        # Pass 2 / chain-eligibility gate — pass-through when ai_metadata
+        # is sparse, identical behaviour to the previous implementation.
+        # Cycle filter: skip candidates currently in the resolution stack so
+        # the synthesiser reroutes around the cycle instead of raising.
         ranked = self._rank_producers(
-            candidates, target_tool=target_tool, entities=entities,
+            sorted_names, target_tool=target_tool, entities=entities,
         )
         for cand in ranked:
+            if cand in excluded:
+                continue
             if self._is_chain_eligible(cand, target_tool=target_tool):
                 return cand
         return None
diff --git a/graph_tool_call/tool_graph.py b/graph_tool_call/tool_graph.py
index 28839ed..00c2353 100644
--- a/graph_tool_call/tool_graph.py
+++ b/graph_tool_call/tool_graph.py
@@ -16,7 +16,7 @@
 from graph_tool_call.core.protocol import GraphEngine
 from graph_tool_call.core.tool import ToolSchema, normalize_tool, parse_tool
 from graph_tool_call.ontology.builder import OntologyBuilder
-from graph_tool_call.ontology.schema import RelationType
+from graph_tool_call.ontology.schema import Confidence, RelationType
 
 
 def _encode_spec_url(base: str, raw_url: str) -> str:
@@ -488,9 +488,27 @@ def add_relation(
         target: str,
         relation: str | RelationType,
         weight: float = 1.0,
+        *,
+        confidence: str | Confidence | None = None,
+        conf_score: float | None = None,
+        layer: int | None = None,
+        evidence: str | None = None,
     ) -> None:
-        """Add a relation between two tools."""
-        self._builder.add_relation(source, target, relation, weight)
+        """Add a relation between two tools.
+
+        Optional graphify-style attrs are forwarded to ``OntologyBuilder``;
+        see ``OntologyBuilder.add_relation`` for semantics.
+        """
+        self._builder.add_relation(
+            source,
+            target,
+            relation,
+            weight,
+            confidence=confidence,
+            conf_score=conf_score,
+            layer=layer,
+            evidence=evidence,
+        )
         self._invalidate_retrieval()
 
     def add_domain(self, domain: str, description: str = "") -> None:
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..ff68b3a
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1,5 @@
+"""Internal scripts package — referenced by tests/test_release_script.py.
+
+Empty marker so Python treats ``scripts/`` as an importable package.
+Not included in the published wheel (see ``pyproject.toml`` ``packages``).
+"""

From eb101e594e530195c9017dbede000ba6589adba9 Mon Sep 17 00:00:00 2001
From: daehee <1998opening@gmail.com>
Date: Sun, 3 May 2026 07:58:27 +0900
Subject: [PATCH 10/14] =?UTF-8?q?fix:=20ruff=20lint=20=ED=86=B5=EA=B3=BC?=
 =?UTF-8?q?=20+=20=EC=83=88=20docs/examples=20+=20io=5Fcontract=20?=
 =?UTF-8?q?=EB=AA=A8=EB=93=88=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lint:
  - ruff check . 23건 수정 (E501/F401/F841/F402/N806/N818/I001/UP035)
  - ruff format . 전체 적용
  - examples/xgen_workflow_gateway.py 구문 오류 수정 (lambda **kwargs 위치)
  - tests/test_dependency.py 중복 정의 제거
  - tests/test_gateway_*.py 옵셔널 import 에 # noqa: E402

신규:
  - graph_tool_call/ingest/io_contract.py — extract_leaves (xgen-workflow 의존)
  - docs/* — api-reference / benchmarks / cli / integrations / roadmap
  - benchmarks/results/models/{README,bonsai-8b-q1_0}.md
  - .pre-commit-config.yaml
  - examples/test_bonsai_tool_calling.py

기타:
  - .gitignore — benchmarks/results/benchmark_*.json (자동 출력 무시)
---
 .gitignore                                  |   3 +
 .pre-commit-config.yaml                     |  21 +
 benchmarks/results/models/README.md         |  37 ++
 benchmarks/results/models/bonsai-8b-q1_0.md | 147 +++++++
 benchmarks/run_competitive.py               |  34 +-
 docs/api-reference.md                       | 145 +++++++
 docs/benchmarks.md                          | 174 +++++++++
 docs/cli.md                                 | 137 +++++++
 docs/integrations/direct-api.md             | 125 ++++++
 docs/integrations/langchain.md              | 111 ++++++
 docs/integrations/mcp-proxy.md              | 117 ++++++
 docs/integrations/mcp-server.md             | 100 +++++
 docs/integrations/middleware.md             |  65 ++++
 docs/roadmap.md                             | 298 ++++++++++++++
 examples/test_bonsai_tool_calling.py        | 405 ++++++++++++++++++++
 examples/xgen_workflow_agent.py             |  98 +++--
 examples/xgen_workflow_gateway.py           |  33 +-
 graph_tool_call/analyze/dependency.py       | 191 +++++----
 graph_tool_call/graphify/__init__.py        |   1 +
 graph_tool_call/graphify/ingest.py          |  17 +-
 graph_tool_call/graphify/retrieval.py       |  19 +-
 graph_tool_call/ingest/io_contract.py       |   4 +-
 graph_tool_call/ingest/openapi.py           |   2 +-
 graph_tool_call/langchain/agent.py          |  17 +-
 graph_tool_call/langchain/gateway.py        |   8 +-
 graph_tool_call/mcp_proxy.py                |   3 +-
 graph_tool_call/net.py                      |   8 +-
 graph_tool_call/ontology/llm_provider.py    |  35 +-
 graph_tool_call/plan/__init__.py            |  36 +-
 graph_tool_call/plan/binding.py             |  10 +-
 graph_tool_call/plan/intent.py              |  29 +-
 graph_tool_call/plan/response.py            |   1 -
 graph_tool_call/plan/runner.py              |  53 +--
 graph_tool_call/plan/schema.py              |  22 +-
 graph_tool_call/plan/synthesizer.py         |  83 ++--
 graph_tool_call/retrieval/engine.py         |  20 +-
 graph_tool_call/retrieval/graph_search.py   |  64 +++-
 graph_tool_call/serialization.py            |   5 +-
 graph_tool_call/tool_graph.py               |  33 +-
 graph_tool_call/workflow.py                 |  85 ++--
 tests/test_dependency.py                    |  10 -
 tests/test_gateway_e2e.py                   | 120 +++++-
 tests/test_gateway_token_saving.py          | 130 ++++++-
 tests/test_gateway_xgen_workflow.py         | 272 ++++++++++---
 tests/test_langchain_agent.py               |   5 +-
 tests/test_langchain_compatibility.py       |  99 ++---
 tests/test_langchain_gateway.py             |  50 ++-
 tests/test_langchain_toolkit.py             |  13 +-
 48 files changed, 2939 insertions(+), 556 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 benchmarks/results/models/README.md
 create mode 100644 benchmarks/results/models/bonsai-8b-q1_0.md
 create mode 100644 docs/api-reference.md
 create mode 100644 docs/benchmarks.md
 create mode 100644 docs/cli.md
 create mode 100644 docs/integrations/direct-api.md
 create mode 100644 docs/integrations/langchain.md
 create mode 100644 docs/integrations/mcp-proxy.md
 create mode 100644 docs/integrations/mcp-server.md
 create mode 100644 docs/integrations/middleware.md
 create mode 100644 docs/roadmap.md
 create mode 100644 examples/test_bonsai_tool_calling.py

diff --git a/.gitignore b/.gitignore
index 4188784..670b4ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,6 @@ benchmarks/results/
 
 # Personal memo
 memo/
+
+# Benchmark output (timestamped, auto-generated)
+benchmarks/results/benchmark_*.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..b086ba0
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,21 @@
+# Pre-commit hooks for graph-tool-call
+#
+# Install once per clone:
+#     pip install pre-commit
+#     pre-commit install
+#
+# Run manually on all files:
+#     pre-commit run --all-files
+#
+# These hooks mirror the CI lint job (.github/workflows/ci.yml).
+# If they fail locally, CI will also fail — fix before committing.
+
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.15.4
+    hooks:
+      - id: ruff
+        name: ruff check
+        args: [--fix]
+      - id: ruff-format
+        name: ruff format
diff --git a/benchmarks/results/models/README.md b/benchmarks/results/models/README.md
new file mode 100644
index 0000000..21e00e6
--- /dev/null
+++ b/benchmarks/results/models/README.md
@@ -0,0 +1,37 @@
+# Model Benchmark Results
+
+graph-tool-call의 도구 검색 + LLM tool calling end-to-end 벤치마크 결과.
+
+## How to Run
+
+```bash
+# Retrieval-only (LLM 불필요)
+python -m benchmarks.run_benchmark --mode retrieval -v
+
+# E2E with Ollama
+python -m benchmarks.run_benchmark --mode e2e -m qwen3:4b -v --save
+
+# E2E with OpenAI-compatible server (llama.cpp, vLLM 등)
+python -m benchmarks.run_benchmark --mode e2e -m "Bonsai-8B.gguf" \
+  --ollama-url "http://localhost:8080/v1" -v --save
+```
+
+## Model Comparison
+
+| Model | Size | Quant | Petstore (19t) | Mixed MCP (38t) | Retrieve Boost |
+|-------|-----:|-------|-------:|--------:|------:|
+| [Bonsai-8B](bonsai-8b-q1_0.md) | 1.1 GB | Q1_0 (1-bit) | BL 65% / RT 57% | BL 0% / RT 73% | **0% → 73%** |
+
+> **BL** = Baseline (all tools), **RT** = Retrieve (top-5 filtered)
+
+## Key Findings
+
+1. **소형 모델일수록 graph-tool-call 효과가 크다** — 도구 수가 context 한계를 넘으면 baseline이 0%로 무너지지만, retrieval 필터링으로 복구 가능
+2. **도구 20개 이하**에서는 baseline과 retrieve 차이가 작거나 역전될 수 있음
+3. **도구 30개 이상**에서는 retrieve 파이프라인이 필수적
+
+## Adding a New Model
+
+1. 벤치마크 실행 후 JSON 결과 확인 (`benchmarks/results/`)
+2. `models/` 디렉토리에 `{model-name}.md` 작성 (기존 문서 포맷 참고)
+3. 이 README의 Model Comparison 테이블에 행 추가
diff --git a/benchmarks/results/models/bonsai-8b-q1_0.md b/benchmarks/results/models/bonsai-8b-q1_0.md
new file mode 100644
index 0000000..2a83a30
--- /dev/null
+++ b/benchmarks/results/models/bonsai-8b-q1_0.md
@@ -0,0 +1,147 @@
+# Bonsai-8B (1-bit Q1_0)
+
+> Tested: 2026-04-07 | Runtime: llama.cpp | API: OpenAI-compatible (`localhost:8080`)
+
+## Model Spec
+
+| Item | Value |
+|------|-------|
+| Parameters | 8B |
+| Quantization | Q1_0 (1-bit) |
+| File Size | 1.1 GB |
+| Memory Usage | ~1.7 GB |
+| Prompt Speed | 58 tok/s |
+| Generation Speed | 31 tok/s |
+| Context Window | 8192 (default) |
+
+## Results Summary
+
+| Dataset | Tools | Queries | Baseline Acc | Retrieve Acc | Delta | Token Reduction | Recall@5 | p-value |
+|---------|------:|--------:|-------------:|-------------:|------:|----------------:|---------:|--------:|
+| Petstore 3.0 | 19 | 23 | 65.2% | 56.5% | -8.7% | 68.4% | 98.6% | 0.3283 (ns) |
+| Mixed MCP | 38 | 30 | 0.0% | 73.3% | **+73.3%** | N/A | 93.3% | <0.0001 (***) |
+
+## Petstore 3.0 (19 tools, 23 queries)
+
+### Metrics
+
+| Metric | Baseline | Retrieve (top-5) |
+|--------|---------|-----------------|
+| Tool Accuracy | 65.2% (15/23) | 56.5% (13/23) |
+| Avg Input Tokens | 1,875 | 601 |
+| Token Reduction | — | 68.4% |
+| Token Efficiency | 0.35 | 0.95 |
+| Avg Latency | 1,897 ms | 3,877 ms |
+
+### Per-Query Results
+
+| # | Query | Difficulty | Baseline | Retrieve | Notes |
+|---|-------|-----------|----------|----------|-------|
+| 1 | Find all available pets | easy | findPetsByStatus | findPetsByStatus | |
+| 2 | Add a new dog to the pet store | easy | addPet | **None** | retrieve: tool call 미생성 |
+| 3 | Get pet with ID 42 | easy | getPetById | getPetById | |
+| 4 | Update the name of my pet | medium | **None** | **None** | 양쪽 실패 |
+| 5 | Delete pet number 7 | easy | deletePet | deletePet | |
+| 6 | Search pets by their tags | medium | findPetsByTags | findPetsByTags | |
+| 7 | Upload a photo of my pet | medium | **None** | **None** | 양쪽 실패 |
+| 8 | Check the store inventory | easy | getInventory | getInventory | |
+| 9 | Place an order to buy a pet | easy | **None** | **None** | 양쪽 실패 |
+| 10 | Look up order number 5 | easy | getOrderById | getOrderById | |
+| 11 | Cancel my order | easy | **None** | **None** | 양쪽 실패 |
+| 12 | Create a new user account | easy | createUser | **None** | retrieve: tool call 미생성 |
+| 13 | Sign in with username and password | easy | loginUser | loginUser | |
+| 14 | Log out of my account | easy | logoutUser | logoutUser | |
+| 15 | View user profile for john123 | easy | getUserByName | getUserByName | |
+| 16 | Change user email address | medium | **None** | **None** | 양쪽 실패 |
+| 17 | Remove user john123 | easy | deleteUser | deleteUser | |
+| 18 | Create multiple user accounts at once | medium | **None** | createUsersWithListInput | retrieve만 성공 |
+| 19 | Show me sold pets | easy | findPetsByStatus | findPetsByStatus | |
+| 20 | Adopt a pet (workflow) | hard | **None** | **None** | 양쪽 실패 |
+| 21 | Update pet using form data | hard | updatePetWithForm | updatePetWithForm | |
+| 22 | What pets are in the store? | medium | **None** | **None** | 양쪽 실패 |
+| 23 | Remove a pet listing and delete order | hard | deletePet | **None** | retrieve: tool call 미생성 |
+
+## Mixed MCP Servers (38 tools, 30 queries)
+
+### Metrics
+
+| Metric | Baseline | Retrieve (top-5) |
+|--------|---------|-----------------|
+| Tool Accuracy | 0.0% (0/30) | 73.3% (22/30) |
+| Avg Input Tokens | N/A (all failed) | 682 |
+| Avg Latency | N/A | 4,509 ms |
+| Token Efficiency | 0.00 | 1.06 |
+
+### Per-Query Results
+
+| # | Query | Difficulty | Retrieve | Notes |
+|---|-------|-----------|----------|-------|
+| 1 | Read the contents of config.yaml | easy | read_file | |
+| 2 | Write a new configuration file | easy | write_file | |
+| 3 | List all files in the src directory | easy | list_directory | |
+| 4 | Create the output directory | easy | create_directory | |
+| 5 | Find all Python files in the project | easy | search_files | |
+| 6 | Move the old log file to archive | easy | move_file | |
+| 7 | Check the file size and permissions | easy | get_file_info | |
+| 8 | Show the directory tree structure | easy | directory_tree | |
+| 9 | Edit the import statement in main.py | medium | edit_file | |
+| 10 | Read multiple config files at once | medium | read_multiple_files | |
+| 11 | Create a new issue for the bug | easy | **None** | retrieval OK, tool call 미생성 |
+| 12 | Open a pull request for my changes | medium | **None** | retrieval miss (recall=0) |
+| 13 | Search for repos about ML | easy | search_repositories | |
+| 14 | Fork the upstream repository | medium | **None** | retrieval OK, tool call 미생성 |
+| 15 | List all open issues with bug label | easy | list_issues | |
+| 16 | Get the README from the GitHub repo | medium | get_file_contents | |
+| 17 | Merge the feature branch PR | medium | **None** | retrieval OK, tool call 미생성 |
+| 18 | Comment on the PR with review feedback | medium | **None** | retrieval miss (recall=0) |
+| 19 | Create a new branch for the feature | easy | create_branch | |
+| 20 | Push the updated files to GitHub | medium | **None** | retrieval OK, tool call 미생성 |
+| 21 | Search code for the function definition | medium | search_code | |
+| 22 | Which directories can the file server access? | hard | list_allowed_directories | |
+| 23 | Check details of PR number 55 | easy | get_pull_request | |
+| 24 | Approve the pull request after review | medium | **None** | retrieval OK, tool call 미생성 |
+| 25 | View the commit history | easy | list_commits | |
+| 26 | Create a new GitHub repo and initialize it | easy | create_repository | |
+| 27 | Update the issue title and close it | medium | update_issue | |
+| 28 | See what files were changed in PR 10 | easy | get_pull_request_files | |
+| 29 | Find all TypeScript files matching *.test.ts | easy | search_files | |
+| 30 | Create a file on GitHub with deploy config | medium | create_repository | wrong tool (expected: create_or_update_file) |
+
+## Failure Analysis
+
+### 1. Tool Call 미생성 (None) — 가장 빈번한 실패 패턴
+
+Bonsai-8B는 도구를 **잘못 고르는** 것이 아니라, tool call JSON을 **아예 생성하지 못하는** 경우가 대부분이다. 텍스트로 응답하거나 빈 응답을 반환한다.
+
+- Petstore baseline: 8/23 (34.8%) None
+- Petstore retrieve: 10/23 (43.5%) None
+- Mixed MCP baseline: 30/30 (100%) None
+- Mixed MCP retrieve: 7/30 (23.3%) None
+
+### 2. Baseline 완전 실패 (Mixed MCP)
+
+38개 도구를 전부 context에 넣으면 input tokens가 과다해져 tool call 자체를 포기한다. 1-bit 양자화 모델의 long context 처리 한계.
+
+### 3. Write 작업 취약
+
+tool call 미생성 실패가 write/create 계열에 집중:
+- `placeOrder`, `addPet`, `uploadFile`, `fork_repository`, `push_files` 등
+- read 계열은 상대적으로 안정적 (getPetById, getInventory 등)
+
+### 4. Retrieve가 Baseline보다 낮은 Petstore
+
+19개 도구는 Bonsai-8B가 감당 가능한 수준이라 baseline이 소폭 우위 (65.2% vs 56.5%).
+하지만 retrieve 모드에서 `createUsersWithListInput` 같은 세밀한 선택에 성공한 케이스도 있다.
+
+## Key Insight
+
+> **도구 수가 많아질수록 graph-tool-call의 retrieval 필터링은 필수적이다.**
+> 38개 도구만으로도 Bonsai-8B baseline은 0%로 완전히 무너지지만,
+> top-5 필터링 시 73.3%까지 복구된다. (p < 0.0001)
+>
+> 1-bit 양자화 소형 모델에서 graph-tool-call의 가치가 가장 극명하게 드러난다.
+
+## Raw Data
+
+- Petstore: `benchmarks/results/benchmark_e2e_20260407_014809.json`
+- Mixed MCP: `benchmarks/results/benchmark_e2e_20260407_015032.json`
diff --git a/benchmarks/run_competitive.py b/benchmarks/run_competitive.py
index 110ad37..fa11372 100644
--- a/benchmarks/run_competitive.py
+++ b/benchmarks/run_competitive.py
@@ -15,7 +15,7 @@
 import argparse
 import json
 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime, timezone
 
 from benchmarks.config import DATASET_REGISTRY
@@ -214,7 +214,10 @@ def print_comparison(
 def main() -> None:
     parser = argparse.ArgumentParser(description="Competitive retrieval benchmark")
     parser.add_argument(
-        "--datasets", "-d", nargs="+", default=None,
+        "--datasets",
+        "-d",
+        nargs="+",
+        default=None,
         help="Datasets to benchmark (default: all non-legacy)",
     )
     parser.add_argument("--top-k", type=int, default=5)
@@ -223,20 +226,15 @@ def main() -> None:
     parser.add_argument("--save", action="store_true", help="Save results as JSON")
     args = parser.parse_args()
 
-    dataset_names = args.datasets or [
-        k for k, v in DATASET_REGISTRY.items() if not v.get("legacy")
-    ]
+    dataset_names = args.datasets or [k for k, v in DATASET_REGISTRY.items() if not v.get("legacy")]
 
-    active_strategies = [
-        s for s in STRATEGIES
-        if not (args.no_embedding and s.embedding)
-    ]
+    active_strategies = [s for s in STRATEGIES if not (args.no_embedding and s.embedding)]
 
-    print(f"\n  Competitive Retrieval Benchmark")
+    print("\n  Competitive Retrieval Benchmark")
     print(f"  Strategies: {len(active_strategies)}")
     print(f"  Datasets: {len(dataset_names)}")
     if any(s.embedding for s in active_strategies):
-        print(f"  Embedding: ollama/qwen3-embedding:0.6b")
+        print("  Embedding: ollama/qwen3-embedding:0.6b")
     print()
 
     all_results: dict[str, dict[str, StrategyResult]] = {}
@@ -257,15 +255,21 @@ def main() -> None:
         for strategy in active_strategies:
             print(f"    → {strategy.label}...", end="", flush=True)
             result = run_strategy(
-                tg_base, strategy, gt["queries"],
-                top_k=args.top_k, verbose=args.verbose,
+                tg_base,
+                strategy,
+                gt["queries"],
+                top_k=args.top_k,
+                verbose=args.verbose,
             )
             ds_results[strategy.name] = result
             print(f" Recall={result.recall_5:.1%} MRR={result.mrr:.3f}")
 
         print_comparison(
-            gt["name"], gt.get("tool_count", len(tg_base.tools)),
-            len(gt["queries"]), ds_results, active_strategies,
+            gt["name"],
+            gt.get("tool_count", len(tg_base.tools)),
+            len(gt["queries"]),
+            ds_results,
+            active_strategies,
         )
         all_results[ds_name] = ds_results
 
diff --git a/docs/api-reference.md b/docs/api-reference.md
new file mode 100644
index 0000000..b64babe
--- /dev/null
+++ b/docs/api-reference.md
@@ -0,0 +1,145 @@
+# Python API Reference
+
+The primary entry point is `ToolGraph`. Most workflows are: ingest a spec → call `retrieve()`.
+
+```python
+from graph_tool_call import ToolGraph
+
+tg = ToolGraph()
+tg.ingest_openapi("api.json")
+tools = tg.retrieve("create a pet", top_k=5)
+```
+
+---
+
+## `ToolGraph` methods
+
+### Construction
+
+| Method | Description |
+|---|---|
+| `ToolGraph()` | Empty graph |
+| `ToolGraph.from_url(url, cache=...)` | Build from Swagger UI or spec URL (auto-discovers spec groups) |
+| `ToolGraph.load(path)` | Deserialize from JSON |
+
+### Ingestion
+
+| Method | Description |
+|---|---|
+| `add_tool(tool)` | Add a single tool (auto-detects format) |
+| `add_tools(tools)` | Add multiple tools |
+| `ingest_openapi(source)` | Ingest from OpenAPI / Swagger spec (file path, URL, or dict) |
+| `ingest_mcp_tools(tools)` | Ingest from MCP tool list |
+| `ingest_mcp_server(url)` | Fetch and ingest from an MCP HTTP server |
+| `ingest_functions(fns)` | Ingest from Python callables (uses type hints + docstrings) |
+| `ingest_arazzo(source)` | Ingest Arazzo 1.0.0 workflow spec |
+| `add_relation(src, tgt, type)` | Add a manual relation between two tools |
+
+### Retrieval
+
+| Method | Description |
+|---|---|
+| `retrieve(query, top_k=10)` | Search and return tool list |
+| `retrieve_with_scores(query, top_k=10)` | Search and return tools with confidence scores and relation hints |
+| `plan_workflow(query)` | Build an ordered execution plan |
+| `suggest_next(tool, history=...)` | Suggest next tools based on graph relations |
+| `validate_tool_call(call)` | Validate and auto-correct a tool call |
+| `assess_tool_call(call)` | Return `allow` / `confirm` / `deny` decision based on annotations |
+
+### Configuration
+
+| Method | Description |
+|---|---|
+| `enable_embedding(provider)` | Enable hybrid embedding search (Ollama, OpenAI, vLLM, sentence-transformers, callable) |
+| `enable_reranker(model)` | Enable cross-encoder reranking |
+| `enable_diversity(lambda_)` | Enable MMR diversity |
+| `set_weights(keyword=, graph=, embedding=, annotation=)` | Tune wRRF fusion weights |
+| `auto_organize(llm=...)` | Auto-categorize tools (rule-based or LLM-enhanced) |
+| `build_ontology(llm=...)` | Build complete ontology |
+
+### Analysis
+
+| Method | Description |
+|---|---|
+| `find_duplicates(threshold)` | Find duplicate tools across sources |
+| `merge_duplicates(pairs)` | Merge detected duplicates |
+| `apply_conflicts()` | Detect and add `CONFLICTS_WITH` edges |
+| `analyze()` | Build operational analysis summary |
+
+### Persistence
+
+| Method | Description |
+|---|---|
+| `save(path)` | Serialize to JSON (preserves embeddings + weights when set) |
+| `ToolGraph.load(path)` | Deserialize and restore retrieval state |
+
+### Export & visualization
+
+| Method | Description |
+|---|---|
+| `export_html(path, progressive=True)` | Interactive HTML (vis.js) |
+| `export_graphml(path)` | GraphML for Gephi / yEd |
+| `export_cypher(path)` | Neo4j Cypher statements |
+| `dashboard_app()` | Build Dash Cytoscape app object |
+| `dashboard(port=8050)` | Launch interactive dashboard |
+
+### Execution
+
+| Method | Description |
+|---|---|
+| `execute(name, params, base_url=...)` | Execute an OpenAPI tool directly |
+
+---
+
+## Top-level helpers
+
+| Function | Description |
+|---|---|
+| `filter_tools(tools, query, top_k=5)` | One-shot filter on any tool list (LangChain, OpenAI, MCP, Anthropic, callables) |
+| `GraphToolkit(tools, top_k=5)` | Reusable toolkit — build graph once, filter per query |
+
+## Middleware
+
+| Function | Description |
+|---|---|
+| `patch_openai(client, graph, top_k=5)` | Auto-filter tools on OpenAI client |
+| `patch_anthropic(client, graph, top_k=5)` | Auto-filter tools on Anthropic client |
+
+## LangChain
+
+| Function | Description |
+|---|---|
+| `create_gateway_tools(tools, top_k=10)` | Convert N tools → 2 gateway meta-tools |
+| `create_agent(llm, tools, top_k=5)` | Auto-filtering LangGraph agent |
+| `GraphToolRetriever(tool_graph, top_k=5)` | LangChain `BaseRetriever` returning `Document` objects |
+| `tool_schema_to_openai_function(tool)` | Convert `ToolSchema` → OpenAI function dict |
+
+---
+
+## Embedding provider strings
+
+`enable_embedding()` accepts:
+
+| Form | Example |
+|---|---|
+| `"ollama/<model>"` | `"ollama/qwen3-embedding:0.6b"` |
+| `"openai/<model>"` | `"openai/text-embedding-3-large"` |
+| `"vllm/<model>"` | `"vllm/Qwen/Qwen3-Embedding-0.6B"` |
+| `"vllm/<model>@<url>"` | `"vllm/model@http://gpu-box:8000/v1"` |
+| `"llamacpp/<model>@<url>"` | `"llamacpp/model@http://192.168.1.10:8080/v1"` |
+| `"<url>@<model>"` | `"http://localhost:8000/v1@my-model"` |
+| `"sentence-transformers/<model>"` | `"sentence-transformers/all-MiniLM-L6-v2"` |
+| `callable` | `lambda texts: my_embed_fn(texts)` |
+
+## Ontology LLM inputs
+
+`auto_organize(llm=...)` accepts:
+
+| Input | Wrapped as |
+|---|---|
+| `OntologyLLM` instance | Pass-through |
+| `callable(str) -> str` | `CallableOntologyLLM` |
+| OpenAI client (has `chat.completions`) | `OpenAIClientOntologyLLM` |
+| `"ollama/model"` | `OllamaOntologyLLM` |
+| `"openai/model"` | `OpenAICompatibleOntologyLLM` |
+| `"litellm/model"` | `litellm.completion` wrapper |
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
new file mode 100644
index 0000000..2e1d39d
--- /dev/null
+++ b/docs/benchmarks.md
@@ -0,0 +1,174 @@
+# Benchmark Results
+
+Detailed benchmark data for graph-tool-call. The README contains a 3-row summary; this document contains the full pipeline, retrieval-only, competitive, large-scale, and LangChain agent results.
+
+- **Model used (LLM benchmarks)**: `qwen3:4b` (4-bit, Ollama), unless noted
+- **Pipelines compared**: `baseline` (all tools), `retrieve-k3 / k5 / k10`, plus `+ embedding`, `+ ontology`
+- **Reproduce**: see [Reproduce](#reproduce) at the bottom
+
+---
+
+## What we measure
+
+graph-tool-call verifies two things.
+
+1. Can performance be **maintained or improved** by giving the LLM only a subset of retrieved tools?
+2. Does the **retriever itself** rank the correct tools within the top K?
+
+These are different questions. A retriever that achieves high `Gold Tool Recall@K` does not automatically translate to high end-to-end accuracy — the LLM still has to pick the right tool from the candidate set.
+
+### Metrics
+
+- **End-to-end Accuracy** — did the LLM ultimately succeed in selecting the correct tool / performing the correct workflow?
+- **Gold Tool Recall@K** — was the canonical gold tool included in the top K at the retrieval stage?
+- **Avg tokens** — average tokens passed to the LLM
+- **Token reduction** — token savings vs. baseline
+
+> The two accuracy metrics often diverge. Evaluations that accept **alternative tools** or **equivalent workflows** as correct may show End-to-end Accuracy that doesn't exactly match Gold Tool Recall@K. `baseline` has no retrieval stage, so Gold Tool Recall@K does not apply.
+
+---
+
+## 1. Full pipeline comparison
+
+| Dataset | Tools | Pipeline | End-to-end Accuracy | Gold Tool Recall@K | Avg tokens | Token reduction |
+|---|---:|---|---:|---:|---:|---:|
+| Petstore | 19 | baseline | 100.0% | — | 1,239 | — |
+| Petstore | 19 | retrieve-k3 | 90.0% | 93.3% | 305 | 75.4% |
+| Petstore | 19 | retrieve-k5 | 95.0% | 98.3% | 440 | 64.4% |
+| Petstore | 19 | retrieve-k10 | 100.0% | 98.3% | 720 | 41.9% |
+| GitHub | 50 | baseline | 100.0% | — | 3,302 | — |
+| GitHub | 50 | retrieve-k3 | 85.0% | 87.5% | 289 | 91.3% |
+| GitHub | 50 | retrieve-k5 | 87.5% | 87.5% | 398 | 87.9% |
+| GitHub | 50 | retrieve-k10 | 90.0% | 92.5% | 662 | 79.9% |
+| Mixed MCP | 38 | baseline | 96.7% | — | 2,741 | — |
+| Mixed MCP | 38 | retrieve-k3 | 86.7% | 93.3% | 328 | 88.0% |
+| Mixed MCP | 38 | retrieve-k5 | 90.0% | 96.7% | 461 | 83.2% |
+| Mixed MCP | 38 | retrieve-k10 | 96.7% | 100.0% | 826 | 69.9% |
+| Kubernetes core/v1 | 248 | baseline | 12.0% | — | 8,192 | — |
+| Kubernetes core/v1 | 248 | retrieve-k5 | 78.0% | 91.0% | 1,613 | 80.3% |
+| Kubernetes core/v1 | 248 | retrieve-k5 + embedding | 80.0% | 94.0% | 1,728 | 78.9% |
+| Kubernetes core/v1 | 248 | retrieve-k5 + ontology | **82.0%** | 96.0% | 1,699 | 79.3% |
+| Kubernetes core/v1 | 248 | retrieve-k5 + embedding + ontology | **82.0%** | **98.0%** | 1,924 | 76.5% |
+
+### Key insights
+
+- **Small/medium APIs (19~50 tools)** — baseline is already strong. graph-tool-call's main value here is **64~91% token savings** with little accuracy loss.
+- **Large APIs (248 tools)** — baseline collapses to **12%** due to context overload. graph-tool-call recovers performance to **78~82%** by narrowing candidates through retrieval. At this scale it's not an optimization — it's closer to a required retrieval layer.
+- **`retrieve-k5` is the best default**. Good token/accuracy tradeoff. On large datasets, adding embedding/ontology yields further gains.
+
+---
+
+## 2. Retrieval quality (BM25 + graph only)
+
+The table below measures retrieval quality **before the LLM stage**. Only BM25 + graph traversal — no embedding or ontology.
+
+| Dataset | Tools | Gold Tool Recall@3 | Gold Tool Recall@5 | Gold Tool Recall@10 |
+|---|---:|---:|---:|---:|
+| Petstore | 19 | 93.3% | **98.3%** | 98.3% |
+| GitHub | 50 | 87.5% | **87.5%** | 92.5% |
+| Mixed MCP | 38 | 93.3% | **96.7%** | 100.0% |
+| Kubernetes core/v1 | 248 | 82.0% | **91.0%** | 92.0% |
+
+### How to read
+
+- **Gold Tool Recall@K** measures the retriever's ability to include the correct tool in the candidate set, **not** final LLM accuracy.
+- On small datasets, `k=5` already achieves high recall.
+- On large datasets, increasing `k` raises recall but also increases tokens passed to the LLM — consider both.
+
+### Insights
+
+- **Petstore / Mixed MCP** — `k=5` alone includes nearly all correct tools.
+- **GitHub** — there's a recall gap between `k=5` and `k=10`; choose `k=10` if recall matters more than tokens.
+- **Kubernetes core/v1** — even with 248 tools, `k=5` already achieves **91.0%** gold recall. The retrieval stage alone compresses the candidate set dramatically while retaining most correct tools.
+
+---
+
+## 3. When do embedding and ontology help?
+
+Comparison on the largest dataset (Kubernetes core/v1, 248 tools), all on top of `retrieve-k5`.
+
+| Pipeline | End-to-end Accuracy | Gold Tool Recall@5 | Interpretation |
+|---|---:|---:|---|
+| retrieve-k5 | 78.0% | 91.0% | BM25 + graph alone is a strong baseline |
+| + embedding | 80.0% | 94.0% | Recovers semantically-similar but differently-worded queries |
+| + ontology | **82.0%** | 96.0% | LLM-generated keywords/example queries significantly improve retrieval |
+| + embedding + ontology | **82.0%** | **98.0%** | Accuracy maintained, gold recall at its highest |
+
+- **Embedding** compensates for **semantic similarity** that BM25 misses.
+- **Ontology** **expands the searchable representation itself** when descriptions are short or non-standard.
+- Using both together yields limited extra end-to-end gains, but **gold recall reaches its highest**.
+
+---
+
+## 4. Competitive benchmark (retrieval strategies)
+
+Six retrieval strategies were compared across 9 datasets (19–1068 tools); the table below lists four of them:
+
+| Strategy | Recall@5 | MRR | Latency |
+|---|:---:|:---:|:---:|
+| Vector Only (≈bigtool) | 96.8% | 0.897 | 176ms |
+| BM25 Only | 91.6% | 0.819 | 1.5ms |
+| BM25 + Graph (default) | 91.6% | 0.819 | 14ms |
+| Full Pipeline (with embedding) | 96.8% | 0.897 | 172ms |
+
+**Key finding** — without embedding, BM25+Graph achieves 91.6% Recall@5, competitive with vector search at roughly **12× lower latency** (14ms vs. 176ms in the table above). With embedding enabled, performance matches pure vector search.
+
+---
+
+## 5. Scale test: 1068 tools (GitHub full API)
+
+| Strategy | Recall@5 | MRR | Miss% |
+|---|:---:|:---:|:---:|
+| Vector Only | 88.0% | 0.761 | 12.0% |
+| BM25 + Graph | 78.0% | 0.643 | 22.0% |
+| Full Pipeline | 88.0% | 0.761 | 12.0% |
+
+At 1068 tools, baseline (passing all definitions) is impractical due to context size — graph-tool-call provides a working retrieval layer. At this scale, Vector Only and Full Pipeline tie at 88.0% Recall@5, while BM25 + Graph reaches 78.0%.
+
+---
+
+## 6. LangChain agent benchmark (200 tools)
+
+End-to-end accuracy when **200 simple tools** are registered and invoked through a LangChain agent.
+
+- **Direct (D)** — all 200 tool definitions passed to the LLM at once
+- **Graph (G)** — tools managed via graph-tool-call gateway (search → call, 2 turns)
+
+| Model | D-Acc | G-Acc | D-Turns | G-Turns | D-Tokens | G-Tokens | Savings | D-Time | G-Time |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| gpt-4.1 | 60.0% | 80.0% | 1.0 | 2.0 | 52,587 | 6,639 | 87.4% | 15.5s | 17.6s |
+| gpt-5.2 | 60.0% | **100.0%** | 1.0 | 2.0 | 53,645 | 10,508 | 80.4% | 20.5s | 17.1s |
+| gpt-5.4 | 60.0% | **100.0%** | 1.0 | 2.0 | 60,035 | 14,049 | 76.6% | 18.2s | 17.0s |
+| claude-sonnet-4-20250514 | 100.0% | 100.0% | 1.0 | 2.0 | 196,183 | 17,349 | 91.2% | 58.2s | 49.4s |
+| claude-sonnet-4-6 | 100.0% | 100.0% | 1.0 | 2.0 | 198,665 | 20,074 | 89.9% | 67.0s | 69.4s |
+| claude-haiku-4-5 | 100.0% | 100.0% | 1.0 | 2.0 | 197,845 | 19,714 | 90.0% | 23.7s | 22.8s |
+
+> Acc = accuracy, Turns = average agent turns, Tokens = total tokens, Savings = token reduction (D→G), Time = wall-clock.
+
+### Key findings
+
+- GPT-series models drop to **60% accuracy** when all 200 tools are passed directly; graph-tool-call recovers to **80–100%**.
+- Claude-series models maintain 100% accuracy either way, but graph-tool-call delivers **89–91% token savings**.
+- Graph mode adds 1 extra turn (search → call) but total latency stays comparable or decreases thanks to smaller context.
+- Across all models, token reduction ranges from **76.6% to 91.2%**.
+
+---
+
+## Reproduce
+
+```bash
+# Retrieval quality only (fast, no LLM needed)
+python -m benchmarks.run_benchmark
+python -m benchmarks.run_benchmark -d k8s -v
+
+# Pipeline benchmark (LLM comparison)
+python -m benchmarks.run_benchmark --mode pipeline -m qwen3:4b
+python -m benchmarks.run_benchmark --mode pipeline \
+  --pipelines baseline retrieve-k3 retrieve-k5 retrieve-k10
+
+# Save baseline and compare across runs
+python -m benchmarks.run_benchmark --mode pipeline --save-baseline
+python -m benchmarks.run_benchmark --mode pipeline --diff
+```
+
+See [`benchmarks/`](../benchmarks/) for dataset definitions, ground truth, and the runner source.
diff --git a/docs/cli.md b/docs/cli.md
new file mode 100644
index 0000000..bbda320
--- /dev/null
+++ b/docs/cli.md
@@ -0,0 +1,137 @@
+# CLI Reference
+
+```bash
+pip install graph-tool-call           # core CLI
+pip install graph-tool-call[mcp]      # + MCP server / proxy commands
+```
+
+## Commands at a glance
+
+| Command | Purpose |
+|---|---|
+| `search`     | One-liner: ingest + retrieve in one step |
+| `serve`      | Run as MCP server |
+| `proxy`      | Run as MCP proxy (aggregates multiple MCP backends) |
+| `ingest`     | Build a graph from a spec and save |
+| `retrieve`   | Search a pre-built graph |
+| `analyze`    | Print operational analysis (duplicates, conflicts, orphans) |
+| `visualize`  | Export graph to HTML / GraphML / Cypher |
+| `info`       | Print graph statistics |
+| `dashboard`  | Launch interactive Dash Cytoscape UI |
+
+---
+
+## `search` — one-liner search
+
+```bash
+# Ingest + retrieve in one step
+graph-tool-call search "cancel order" \
+  --source https://api.example.com/openapi.json
+
+graph-tool-call search "delete user" \
+  --source ./openapi.json --scores --json
+```
+
+Useful for quick exploration without saving the graph.
+
+---
+
+## `serve` — MCP server
+
+```bash
+# Single source
+graph-tool-call serve --source https://api.example.com/openapi.json
+
+# Pre-built graph
+graph-tool-call serve --graph prebuilt.json
+
+# Multiple sources
+graph-tool-call serve \
+  -s https://api1.com/spec.json \
+  -s https://api2.com/spec.json
+
+# Remote (SSE / streamable HTTP)
+graph-tool-call serve --source api.json --transport sse --host 0.0.0.0 --port 8000
+graph-tool-call serve --source api.json --transport streamable-http --port 8000
+```
+
+See [MCP Server integration guide](integrations/mcp-server.md) for client configuration.
+
+---
+
+## `proxy` — MCP proxy
+
+```bash
+graph-tool-call proxy --config ~/backends.json
+graph-tool-call proxy --config backends.json --transport sse --port 8000
+graph-tool-call proxy --config backends.json --passthrough-threshold 50
+```
+
+See [MCP Proxy integration guide](integrations/mcp-proxy.md) for `backends.json` format.
+
+---
+
+## `ingest` — build and save a graph
+
+```bash
+graph-tool-call ingest https://api.example.com/openapi.json -o graph.json
+graph-tool-call ingest ./spec.yaml --embedding --organize
+```
+
+Flags:
+- `-o, --output PATH` — output graph file (JSON)
+- `--embedding` — enable embedding-based hybrid search
+- `--organize` — auto-categorize tools into ontology
+
+---
+
+## `retrieve` — search a pre-built graph
+
+```bash
+graph-tool-call retrieve "query" -g graph.json -k 10
+```
+
+Flags:
+- `-g, --graph PATH` — pre-built graph file
+- `-k, --top-k N` — number of results
+- `--scores` — print scores
+- `--json` — JSON output
+
+---
+
+## `analyze` — operational analysis
+
+```bash
+graph-tool-call analyze graph.json --duplicates --conflicts
+```
+
+Prints duplicate tools, conflict pairs, orphan tools, category coverage.
+
+---
+
+## `visualize` — export to HTML / GraphML / Cypher
+
+```bash
+graph-tool-call visualize graph.json -f html       # interactive HTML
+graph-tool-call visualize graph.json -f graphml    # Gephi/yEd
+graph-tool-call visualize graph.json -f cypher     # Neo4j
+```
+
+---
+
+## `info` — graph statistics
+
+```bash
+graph-tool-call info graph.json
+# → ToolGraph(tools=248, nodes=251, edges=1024)
+```
+
+---
+
+## `dashboard` — interactive UI
+
+```bash
+graph-tool-call dashboard graph.json --port 8050
+```
+
+Launches the Dash Cytoscape interactive dashboard for graph inspection and retrieval testing. Requires `pip install graph-tool-call[dashboard]`.
diff --git a/docs/integrations/direct-api.md b/docs/integrations/direct-api.md
new file mode 100644
index 0000000..8b408df
--- /dev/null
+++ b/docs/integrations/direct-api.md
@@ -0,0 +1,125 @@
+# Direct API Integration
+
+Use `retrieve()` to search, then convert results to your provider's tool format. Works with **any OpenAI-compatible API** (OpenAI, Azure, Ollama, vLLM, llama.cpp) and Anthropic.
+
+## OpenAI / OpenAI-compatible
+
+```python
+from openai import OpenAI
+from graph_tool_call import ToolGraph
+from graph_tool_call.langchain.tools import tool_schema_to_openai_function
+
+# Build graph from any source
+tg = ToolGraph.from_url(
+    "https://petstore3.swagger.io/api/v3/openapi.json",
+    cache="petstore.json",
+)
+
+# Retrieve only the relevant tools for a query
+tools = tg.retrieve("create a new pet", top_k=5)
+
+# Convert to OpenAI function-calling format
+openai_tools = [
+    {"type": "function", "function": tool_schema_to_openai_function(t)}
+    for t in tools
+]
+
+# Use with any provider — OpenAI, Azure, Ollama, vLLM, llama.cpp, etc.
+client = OpenAI()
+# Or for Ollama: OpenAI(base_url="http://localhost:11434/v1")
+
+response = client.chat.completions.create(
+    model="gpt-4o",
+    tools=openai_tools,  # only the 5 most relevant tools instead of the full tool list
+    messages=[{"role": "user", "content": "create a new pet"}],
+)
+```
+
+## Anthropic Claude
+
+```python
+from anthropic import Anthropic
+from graph_tool_call import ToolGraph
+
+tg = ToolGraph.from_url("https://api.example.com/openapi.json")
+tools = tg.retrieve("cancel an order", top_k=5)
+
+# Convert to Anthropic tool format
+anthropic_tools = [
+    {
+        "name": t.name,
+        "description": t.description,
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                p.name: {"type": p.type, "description": p.description}
+                for p in t.parameters
+            },
+            "required": [p.name for p in t.parameters if p.required],
+        },
+    }
+    for t in tools
+]
+
+client = Anthropic()
+response = client.messages.create(
+    model="claude-sonnet-4-20250514",
+    tools=anthropic_tools,
+    messages=[{"role": "user", "content": "cancel my order"}],
+    max_tokens=1024,
+)
+```
+
+## Wrap any tool list (no graph needed)
+
+If you already have a list of tools in any format (LangChain `BaseTool`, OpenAI dicts, MCP dicts, Anthropic dicts, plain Python functions), use `filter_tools` directly — **no extra dependencies**:
+
+```python
+from graph_tool_call import filter_tools
+
+filtered = filter_tools(all_tools, "send an email to John", top_k=5)
+# → only the 5 most relevant tools, original objects preserved
+```
+
+### Reusable toolkit
+
+Build the graph once, filter per query:
+
+```python
+from graph_tool_call import GraphToolkit
+
+toolkit = GraphToolkit(tools=all_tools, top_k=5)
+
+tools_a = toolkit.get_tools("cancel my order")
+tools_b = toolkit.get_tools("check the weather")
+
+# Access the underlying ToolGraph for advanced config
+toolkit.graph.enable_embedding("ollama/qwen3-embedding:0.6b")
+```
+
+## Workflow planning
+
+Beyond per-query filtering, `plan_workflow()` returns ordered execution chains with prerequisites — reducing agent round-trips from 3-4 to 1.
+
+```python
+from graph_tool_call import ToolGraph
+
+tg = ToolGraph.from_url("https://api.example.com/openapi.json")
+
+plan = tg.plan_workflow("process a refund")
+for step in plan.steps:
+    print(f"{step.order}. {step.tool.name} — {step.reason}")
+# 1. getOrder — prerequisite for requestRefund
+# 2. requestRefund — primary action
+
+# Edit the workflow
+plan.remove_step("listOrders")
+plan.insert_step(0, "getOrder", tools=tg.tools, reason="need order ID")
+plan.set_param_mapping("requestRefund", "order_id", "getOrder.response.id")
+
+# Visual editor (opens in browser)
+plan.open_editor(tools=tg.tools)
+
+# Save / Load
+plan.save("refund_workflow.json")
+```
diff --git a/docs/integrations/langchain.md b/docs/integrations/langchain.md
new file mode 100644
index 0000000..bcffbea
--- /dev/null
+++ b/docs/integrations/langchain.md
@@ -0,0 +1,111 @@
+# LangChain / LangGraph Integration
+
+```bash
+pip install graph-tool-call[langchain] langgraph
+```
+
+Three integration patterns — pick the one that fits your architecture.
+
+| Pattern | Best for | How it works |
+|---|---|---|
+| **Gateway** | 50+ tools, existing agents | LLM explicitly searches → calls |
+| **Auto-filter** | New agents, simple setup | Transparent per-turn tool swap |
+| **Manual** | Full control | You call `filter_tools()` yourself |
+
+---
+
+## 1. Gateway Tools (recommended for large tool sets)
+
+Convert 50~500+ tools into **2 meta-tools** (`search_tools` + `call_tool`). The LLM searches first, then calls — so tool definitions never bloat the context.
+
+```python
+from graph_tool_call.langchain import create_gateway_tools
+
+# 62 tools from Slack, GitHub, Jira, MS365, custom APIs...
+all_tools = slack_tools + github_tools + jira_tools + ms365_tools + api_tools
+
+# Convert to 2 gateway meta-tools
+gateway = create_gateway_tools(all_tools, top_k=10)
+# → [search_tools, call_tool]
+
+# Use with any LangChain agent — only 2 tools in context
+agent = create_react_agent(model=llm, tools=gateway)
+result = agent.invoke({
+    "messages": [("user", "PROJ-123 이슈를 Done으로 변경해줘")]
+})
+```
+
+### How it works
+
+```text
+User: "Cancel order #500"
+  ↓
+LLM calls search_tools(query="cancel order")
+  → returns: cancel_order, get_order, process_refund (with parameter info)
+  ↓
+LLM calls call_tool(tool_name="cancel_order", arguments={"order_id": 500})
+  → returns: {"order_id": 500, "status": "cancelled"}
+  ↓
+LLM: "Order #500 has been cancelled."
+```
+
+### Token impact
+
+| | All tools bound | Gateway (2 tools) |
+|---|:---:|:---:|
+| **62 tools** | ~6,090 tokens/turn | ~475 tokens/turn |
+| **Token reduction** | — | **92%** |
+| **Accuracy** (qwen3.5:4b) | — | 70% (100% with GPT-4o) |
+
+> Works with **any existing LangChain agent setup**. Just replace `tools=all_tools` with `tools=create_gateway_tools(all_tools)`.
+
+See the [200-tool LangChain agent benchmark](../benchmarks.md#6-langchain-agent-benchmark-200-tools) for results across GPT and Claude models.
+
+---
+
+## 2. Auto-filtering Agent (transparent per-turn filtering)
+
+The agent automatically filters tools each turn — the LLM never sees the full list.
+
+```python
+from graph_tool_call.langchain import create_agent
+
+# 200 tools go in — LLM sees only ~5 relevant ones each turn
+agent = create_agent(llm, tools=all_200_tools, top_k=5)
+
+result = agent.invoke({"messages": [("user", "cancel my order")]})
+# Turn 1: LLM sees [cancel_order, get_order, process_refund, ...]
+# Turn 2: LLM sees [next relevant tools based on conversation]
+```
+
+---
+
+## 3. Manual filtering
+
+```python
+from graph_tool_call import filter_tools
+from langgraph.prebuilt import create_react_agent
+
+filtered = filter_tools(langchain_tools, "cancel order", top_k=5)
+agent = create_react_agent(llm, filtered)
+```
+
+---
+
+## LangChain Retriever (returns Documents)
+
+If you want to use graph-tool-call as a regular retriever returning `Document` objects (e.g., for a chain that doesn't use tool-calling):
+
+```python
+from graph_tool_call import ToolGraph
+from graph_tool_call.langchain import GraphToolRetriever
+
+tg = ToolGraph.from_url("https://api.example.com/openapi.json")
+
+retriever = GraphToolRetriever(tool_graph=tg, top_k=5)
+docs = retriever.invoke("cancel an order")
+
+for doc in docs:
+    print(doc.page_content)       # "cancelOrder: Cancel an existing order"
+    print(doc.metadata["tags"])   # ["order"]
+```
diff --git a/docs/integrations/mcp-proxy.md b/docs/integrations/mcp-proxy.md
new file mode 100644
index 0000000..572c6b6
--- /dev/null
+++ b/docs/integrations/mcp-proxy.md
@@ -0,0 +1,117 @@
+# MCP Proxy
+
+When you have many MCP servers, their tool names pile up in every LLM turn. **MCP Proxy** bundles them behind a single server: **172 tools → 3 meta-tools**, saving ~1,200 tokens per turn.
+
+## How it works
+
+```text
+            ┌─────────────────────────────┐
+Claude  ──▶ │  graph-tool-call MCP Proxy  │
+            │  ┌───────────────────────┐  │     ┌──────────────┐
+            │  │ search_tools          │  │ ──▶ │ playwright   │
+            │  │ get_tool_schema       │  │ ──▶ │ filesystem   │
+            │  │ call_backend_tool     │  │ ──▶ │ my-api       │
+            │  └───────────────────────┘  │ ──▶ │ ...          │
+            └─────────────────────────────┘     └──────────────┘
+                  3 meta-tools                    N backends
+```
+
+The proxy starts each backend, indexes all tools into a `ToolGraph`, and exposes only 3 meta-tools to the LLM. After `search_tools`, matched tools are **dynamically injected** so the LLM can call them directly in 1 hop.
+
+## Setup
+
+### Step 1 — Create `backends.json`
+
+```jsonc
+// ~/backends.json
+{
+  "backends": {
+    "playwright": {
+      "command": "npx",
+      "args": ["@playwright/mcp", "--headless"]
+    },
+    "filesystem": {
+      "command": "npx",
+      "args": ["-y", "@anthropic/mcp-filesystem", "/home"]
+    },
+    "my-api": {
+      "command": "uvx",
+      "args": ["some-mcp-server"],
+      "env": { "API_KEY": "sk-..." }
+    }
+  },
+  "top_k": 10,
+  "cache_path": "~/.cache/mcp-proxy-cache.json"
+}
+```
+
+> **Embedding is optional.** Add `"embedding": "ollama/qwen3-embedding:0.6b"` for cross-language search (requires Ollama running). Without it, BM25 keyword search still works.
+
+### Step 2 — Register the proxy with Claude Code
+
+```bash
+claude mcp add -s user tool-proxy -- \
+  uvx "graph-tool-call[mcp]" proxy --config ~/backends.json
+```
+
+### Step 3 — Remove the original individual servers
+
+```bash
+claude mcp remove playwright -s user
+claude mcp remove filesystem -s user
+claude mcp remove my-api -s user
+```
+
+### Step 4 — Restart Claude Code and verify
+
+```bash
+claude mcp list
+# tool-proxy: ... - ✓ Connected
+# (individual servers should be gone)
+```
+
+## Remote transport
+
+```bash
+graph-tool-call proxy --config backends.json --transport sse --port 8000
+```
+
+## Passthrough mode (few tools)
+
+When the total number of tools across all backends is **≤ 30** (the default threshold), the proxy **skips the graph layer entirely** and exposes every backend tool directly. Zero overhead, no meta-tools, original tool names and schemas preserved.
+
+This is useful when you want a **single MCP entry point** for several small servers without paying the search/meta-tool tax.
+
+```bash
+# Explicitly set the threshold (default: 30)
+graph-tool-call proxy --config backends.json --passthrough-threshold 50
+```
+
+Or in `backends.json`:
+
+```jsonc
+{
+  "backends": { ... },
+  "passthrough_threshold": 50   // ≤ 50 → passthrough, > 50 → gateway
+}
+```
+
+| Mode | When | Exposed tools |
+|---|---|---|
+| **gateway** (default) | total tools > threshold | `search_tools` + `get_tool_schema` + `call_backend_tool` |
+| **passthrough** | total tools ≤ threshold | All backend tools directly (original names/schemas) |
+
+## Alternative: `.mcp.json` config
+
+```jsonc
+// .mcp.json (project-level or global)
+{
+  "mcpServers": {
+    "tool-proxy": {
+      "command": "uvx",
+      "args": ["graph-tool-call[mcp]", "proxy",
+               "--config", "/path/to/backends.json"]
+    }
+  }
+}
+```
diff --git a/docs/integrations/mcp-server.md b/docs/integrations/mcp-server.md
new file mode 100644
index 0000000..78a99af
--- /dev/null
+++ b/docs/integrations/mcp-server.md
@@ -0,0 +1,100 @@
+# MCP Server
+
+Run graph-tool-call as an MCP server. Any MCP-compatible agent (Claude Code, Cursor, Windsurf, etc.) can use tool search with just a config entry.
+
+## Quick start
+
+```jsonc
+// .mcp.json
+{
+  "mcpServers": {
+    "tool-search": {
+      "command": "uvx",
+      "args": ["graph-tool-call[mcp]", "serve",
+               "--source", "https://api.example.com/openapi.json"]
+    }
+  }
+}
+```
+
+## Remote deployment (SSE / Streamable HTTP)
+
+The MCP server supports remote transports for shared deployments:
+
+```bash
+# SSE transport
+graph-tool-call serve --source api.json --transport sse --host 0.0.0.0 --port 8000
+
+# Streamable HTTP
+graph-tool-call serve --source api.json --transport streamable-http --port 8000
+```
+
+Client config for a remote MCP server:
+
+```json
+{
+  "mcpServers": {
+    "tool-search": {
+      "url": "http://tool-search.internal:8000/sse"
+    }
+  }
+}
+```
+
+## Exposed tools
+
+The MCP server exposes 6 tools:
+
+| Tool | Purpose |
+|---|---|
+| `search_tools` | Hybrid search across the tool graph |
+| `get_tool_schema` | Fetch the full schema for a specific tool |
+| `execute_tool` | Execute an OpenAPI tool directly |
+| `list_categories` | List ontology categories |
+| `graph_info` | Graph statistics (nodes, edges, relations) |
+| `load_source` | Hot-load a new source into the running server |
+
+## Search results include workflow guidance
+
+Search results contain **relations** between tools and a **suggested execution order**:
+
+```json
+{
+  "tools": [
+    {
+      "name": "createOrder",
+      "relations": [
+        {"target": "getOrder", "type": "precedes",
+         "hint": "Call this tool before getOrder"}
+      ]
+    },
+    {"name": "getOrder", "prerequisites": ["createOrder"]}
+  ],
+  "workflow": {"suggested_order": ["createOrder", "getOrder", "updateOrderStatus"]}
+}
+```
+
+This lets the agent plan multi-step calls in one turn instead of round-tripping per tool.
+
+## Multiple sources
+
+Pass `-s` multiple times to merge several specs into one graph:
+
+```bash
+graph-tool-call serve \
+  -s https://api1.example.com/openapi.json \
+  -s https://api2.example.com/openapi.json
+```
+
+Cross-source duplicate detection automatically dedupes tools that appear in multiple specs.
+
+## Pre-built graph
+
+Build the graph once, serve it many times:
+
+```bash
+graph-tool-call ingest https://api.example.com/openapi.json -o graph.json
+graph-tool-call serve --graph graph.json
+```
+
+See the [CLI reference](../cli.md) for the full `serve` flag list.
diff --git a/docs/integrations/middleware.md b/docs/integrations/middleware.md
new file mode 100644
index 0000000..b1a1716
--- /dev/null
+++ b/docs/integrations/middleware.md
@@ -0,0 +1,65 @@
+# SDK Middleware
+
+Already have tool-calling code? Add **one line** to automatically filter tools through graph-tool-call. Existing code stays unchanged.
+
+## OpenAI
+
+```python
+from openai import OpenAI
+from graph_tool_call import ToolGraph
+from graph_tool_call.middleware import patch_openai
+
+client = OpenAI()
+tg = ToolGraph.from_url("https://api.example.com/openapi.json")
+
+patch_openai(client, graph=tg, top_k=5)  # ← add this line
+
+# Existing code unchanged — 248 tools go in, only 5 relevant ones are sent
+response = client.chat.completions.create(
+    model="gpt-4o",
+    tools=all_248_tools,
+    messages=messages,
+)
+```
+
+## Anthropic
+
+```python
+from anthropic import Anthropic
+from graph_tool_call import ToolGraph
+from graph_tool_call.middleware import patch_anthropic
+
+client = Anthropic()
+tg = ToolGraph.from_url("https://api.example.com/openapi.json")
+
+patch_anthropic(client, graph=tg, top_k=5)  # ← add this line
+
+# Existing code unchanged
+response = client.messages.create(
+    model="claude-sonnet-4-20250514",
+    tools=all_248_tools,
+    messages=messages,
+    max_tokens=1024,
+)
+```
+
+## How it works
+
+The middleware monkey-patches `chat.completions.create` (OpenAI) or `messages.create` (Anthropic) so that whenever `tools=...` is passed, it:
+
+1. Reads the latest user message
+2. Calls `graph.retrieve(query, top_k=top_k)`
+3. Replaces `tools=` with the filtered subset
+4. Forwards the request
+
+The original tool list never reaches the model. There's no change to the SDK return type, streaming, or async behavior.
+
+## When to use
+
+| Use middleware when... | Use direct API when... |
+|---|---|
+| You have working tool-calling code already | You're starting from scratch |
+| You don't want to refactor for retrieval | You want explicit control over which tools are sent |
+| Tool list comes from a runtime registry | Tool list is static and known |
+
+For explicit retrieval control, see [Direct API integration](direct-api.md).
diff --git a/docs/roadmap.md b/docs/roadmap.md
new file mode 100644
index 0000000..ce613fc
--- /dev/null
+++ b/docs/roadmap.md
@@ -0,0 +1,298 @@
+# graph-tool-call Roadmap
+
+> 작성일: 2026-04-09
+> 상태: Phase 0~4.5 완료 (255+ tests). 다음 6~9개월 고도화 방향.
+>
+> 관련 문서:
+> - [memo/differentiation-analysis.md](../memo/differentiation-analysis.md) — 학술 차별화 9개 분석
+> - [docs/wbs/README.md](wbs/README.md) — Phase 0~4.5 WBS
+> - [docs/benchmarks.md](benchmarks.md) — 현재 벤치마크 결과
+
+---
+
+## 요약
+
+현재 graph-tool-call은 **"도구 검색 라이브러리"**로 완성도 높음. 다음 단계는 **"도구 검색 + 실행 + 거버넌스 레이어"**로 카테고리 확장.
+
+고도화 후보 14개를 2축 — **거버넌스**(다른 작업의 전제 + 안정성/보안/관측성)와 **효과**(사용자 체감 + 학술 임팩트) — 로 평가해 우선순위를 매겼다. 1번과 2번 우선순위는 의존성이 명확하므로 순서 고정, 3번 이후는 시간/리소스 제약에 따라 3가지 시나리오로 분기.
+
+---
+
+## 1. 배경 — 현재 gap
+
+### 1.1 README가 약속한 것 vs 실제 구현
+
+README Quick Start는 `plan_workflow()`에 대해 이렇게 쓰여 있다:
+
+> `plan_workflow()` returns ordered execution chains with prerequisites — **reducing agent round-trips from 3-4 to 1**.
+
+하지만 현재 `graph_tool_call/workflow.py`에는 **계획 작성/편집 메서드만 있고 실행 메서드가 없다** (`execute` / `run` / `invoke` 키워드 0개). 단일 tool 실행은 `ToolGraph.execute()`(tool_graph.py:629)에 있지만, 이를 chain으로 묶는 orchestration 레이어가 부재. 즉 "round-trip 1회" 약속은 **절반만 지켜진 상태**.
+
+### 1.2 학술 차별화 9개 중 1개만 구현
+
+[memo/differentiation-analysis.md](../memo/differentiation-analysis.md)는 9개의 학술 차별화 후보를 정리해두었다. 이 중 완전히 구현된 것은 3.1 하나뿐이고, 3.4와 3.9는 부분 구현 상태다:
+
+| # | 후보 | Tier | 구현 상태 |
+|---|---|:---:|:---:|
+| 3.1 | MCP Annotation-Aware Retrieval | 1 | ✅ (Phase 2.5) |
+| 3.2 | Execution Trace → Causal Tool Graph | 1 | ❌ |
+| 3.3 | Token-Budget Constrained Graph Selection | 1 | ❌ |
+| 3.4 | Dynamic Tool Graph | 2 | 부분 |
+| 3.5 | Cross-Server Tool Dependency | 2 | ❌ |
+| 3.6 | Tool Name Disambiguation | 2 | ❌ (prefix 회피만) |
+| 3.7 | Cross-Primitive Retrieval | 3 | ❌ |
+| 3.8 | Failure-Aware Closed-Loop Retrieval | 3 | ❌ |
+| 3.9 | Stateful Session-Aware Retrieval | 3 | 부분 (history 파라미터만) |
+
+### 1.3 코드 베이스 분석으로 발견한 추가 gap
+
+학술 차별화 분석에 **없는** 실용 gap 7개:
+
+| 코드 | 위치 | gap |
+|---|---|---|
+| `graph_tool_call/workflow.py` | `WorkflowPlan` | `execute_plan()` 부재 |
+| `graph_tool_call/mcp_proxy.py:164-211` | backend tool 수집 | schema 검증/provenance/ACL 부재 |
+| `graph_tool_call/retrieval/engine.py:111-188` | 5개 scorer | Score Provider 플러그인 인터페이스 부재 |
+| `graph_tool_call/__init__.py` | public API | trace export / debug API 부재 |
+| `graph_tool_call/retrieval/graph_search.py:38-100` | `_get_category_index()` | 매 query마다 재구축 |
+| `graph_tool_call/ingest/` | 단일 spec | spec 경계를 넘는 federation 부재 |
+| `graph_tool_call/ingest/` | 6종 format | 새 format 추가 시 adapter interface 부재 |
+
+---
+
+## 2. 후보 14개 간략 설명
+
+후보는 **그룹 1 (사용자 즉시 체감)**, **그룹 2 (시스템 견고성)**, **그룹 3 (검색 품질 / 학술)** 으로 분류.
+
+### 그룹 1 — 사용자 즉시 체감
+
+#### A. Workflow Execution Engine
+**현상**: `plan_workflow()`는 계획만 생성, 실행은 사용자/LLM 책임.
+**작업**: `execute_plan(goal, initial_args)` 추가. `params_from` path expression parser로 step 간 데이터 자동 전달. 실패 시 skip / rollback / abort 정책. dry-run 모드.
+**효과**: README 약속("round-trip 3-4회 → 1회") 완성. LLM agent가 `execute_plan` 1번 호출로 multi-step workflow 처리. 다른 후속 작업 5개(P4, P5, B enforcement, C trace, end-to-end 벤치마크)의 전제.
+**Effort**: 1.5~2주 (HTTP 실행 인프라는 이미 존재, chain orchestration만 추가)
+
+#### B. Tool Poisoning Defense
+**현상**: MCP backend tool schema를 그대로 신뢰. 악성 서버가 description에 prompt injection 삽입 가능 (Invariant Labs 보고).
+**작업**: Schema 해시 기반 mutation detection / tool provenance 추적 / annotation 기반 ACL (`readOnlyHint=true`만 unprivileged 노출) / prompt injection 패턴 탐지.
+**효과**: 회사 도입 시 보안팀 차단 사유 제거. USENIX Security / IEEE S&P 같은 보안 학회 논문 타겟 가능 (Tool Poisoning을 retrieval layer에서 막는 first work).
+**Effort**: 2~3주
+
+#### C. Observability + Trace Export
+**현상**: 기본 logging만. "왜 이 도구가 검색되지 않았지?" 디버그 불가.
+**작업**: `RetrievalEngine`에 `TraceContext` 추가 — 단계별 점수 + 최종 순위 캡처. OpenTelemetry span export. CLI `--trace-out trace.json`.
+**효과**: 사용자가 자기 데이터로 튜닝 가능. P4/P5의 데이터 소스 겸용. 도입 전 검증 가능성 ↑.
+**Effort**: 1주
+
+#### F. Cross-Spec Federation
+**현상**: 여러 OpenAPI spec ingest 시 단순 union. spec A의 `getUser` 출력과 spec B의 `userId` 파라미터가 연결되지 않음.
+**작업**: spec 경계를 넘는 parameter schema 매칭. provenance metadata.
+**효과**: 회사 internal API 여러 개 통합 케이스 — 실제 도입 시나리오에서 가장 흔한 요구.
+**Effort**: 2주 (P1과 인프라 공유)
+
+### 그룹 2 — 시스템 견고성 (인프라)
+
+#### D. Pluggable Score Provider SPI
+**현상**: `retrieval/engine.py`에 5개 scorer가 hardcoded. 새 score 추가하려면 engine 직접 수정.
+**작업**: `ScoreProvider` Protocol + `register_score_provider()` API. 기존 5개를 SPI에 맞춰 refactor.
+**효과**: 학술 후보 P3/P4/P5의 score 통합이 모두 plug-in. 외부 기여자가 새 score (popularity, latency, user feedback 등)를 추가 가능.
+**Effort**: 1주
+
+#### E. Incremental Re-indexing
+**현상**: tool 추가 시 BM25 index와 embedding을 처음부터 재구축. `_get_category_index()`는 매 query마다 재구축 (1068 tools × tokenization = O(n)).
+**작업**: BM25/embedding add-remove API. category index lazy invalidation.
+**효과**: 1068+ tools 환경에서 latency 급감. MCP server를 동적으로 추가/제거하는 환경(Cursor, Claude Code)에서 직접 이득.
+**Effort**: 1주
+
+#### G. Anti-Corruption Adapter Layer
+**현상**: ingest format 6종(OpenAPI/MCP tools/MCP server/Python fn/manual/Arazzo). 새 format 추가 시 ingest 모듈에 산발적 코드.
+**작업**: `IngestAdapter` 추상 interface. 기존 6종 refactor. gRPC + GraphQL adapter PoC.
+**효과**: 외부 기여 진입 장벽 ↓. 신규 format 200~300 LOC로 추가 가능.
+**Effort**: 2주
+
+### 그룹 3 — 검색 품질 / 학술 차별화
+
+#### P1. Cross-Server Tool Dependency
+**현상**: 같은 spec 내 의존성만 감지. cross-server 흐름(Slack → Jira → GitHub)은 별개로 취급.
+**작업**: `mcp_proxy.py` cross-backend ingest 시 parameter schema 매칭. 새 edge type 불필요 (`REQUIRES`로 충분), backend metadata만 추가.
+**효과**: MCP-Bench retrieval 오류의 50%를 차지하는 cross-server 문제 직접 해결. 논문 1편 핵심 contribution.
+**Effort**: 2~3주
+
+#### P2. Tool Name Disambiguation
+**현상**: MCP 생태계 59% 이름 충돌("search" 32개 서버). 현재는 `serverName__toolName` prefix로 회피.
+**작업**: 동일 이름 도구를 (signature + ontology context)로 자동 구분. disambiguation key 생성.
+**효과**: LLM이 prefix 신경 쓰지 않아도 됨. 논문 1편의 sub-contribution으로 포함 가능.
+**Effort**: 2주
+
+#### P3. Stateful Session-Aware Retrieval
+**현상**: history 파라미터로 "이미 호출한 도구"를 demote만 함. 다음 도구 예측 없음.
+**작업**: Markov chain on tool graph로 다음 도구 확률 계산. `RetrievalEngine`에 새 score source.
+**효과**: multi-turn 대화 retrieval 정확도 ↑. 짧은 후속 질문("이제 그거 취소해") 처리력 강화.
+**Effort**: 3~4주 (평가 인프라 포함)
+
+#### P4. Failure-Aware Closed-Loop Retrieval
+**현상**: 도구 실패해도 retrieval 순위에 영향 없음.
+**작업**: 실행 trace → edge weight online learning. 최근 실패율 높은 도구 자동 강등. `SIMILAR_TO` fallback 제안.
+**효과**: self-healing. 운영 중 안 쓰이는 도구 자동 정리.
+**전제**: **A 필요** (실행 trace가 있어야 학습 가능)
+**Effort**: 3주
+
+#### P5. Execution Trace → Causal Tool Graph
+**현상**: 의존성 그래프가 정적 metadata(OpenAPI/CRUD)에만 의존.
+**작업**: 실행 trace에서 interventional causal discovery로 인과적 의존성 자동 발견.
+**효과**: Tier 1 단독 contribution (causal discovery + tool learning 교차점, 미개척). 사용할수록 똑똑해지는 시스템.
+**전제**: **A 필요**
+**Effort**: 6~8주 (이론 + 평가)
+
+#### P6. Token-Budget Constrained Graph Selection
+**현상**: top-k=5 고정. 도구 토큰 비용 예측 불가.
+**작업**: dependency-constrained knapsack ILP formulation. DAG 활용 근사 알고리즘. approximation ratio 증명.
+**효과**: Cursor 40 한도, OpenAI 권장 20개 제약 해결. 비용 예측 가능. ICML/NeurIPS 타겟.
+**Effort**: 8주 (이론 작업 중심)
+
+#### P7. Cross-Primitive Retrieval
+**현상**: MCP의 3대 primitive 중 Tools만 검색.
+**작업**: Resources/Prompts 노드 추가. 새 edge type(`PROVIDES_CONTEXT`, `TEMPLATES`). heterogeneous graph 검색.
+**효과**: MCP 잠재력 100% 활용.
+**블로커**: Resources/Prompts 사용하는 실제 MCP 서버가 적어 평가 데이터셋 부족
+**Effort**: 6주 (+ 평가 데이터 수집)
+
+---
+
+## 3. 우선순위 매트릭스
+
+거버넌스 = 다른 작업의 전제 + 안정성/보안/관측성
+효과 = 사용자 체감 + 잠금 해제 + 학술/시장 임팩트
+
+| 순위 | 후보 | 거버넌스 | 효과 | 합계 | 전제 |
+|:---:|---|:---:|:---:|:---:|---|
+| 1 | **A. Workflow Execution** | ★★★★★ | ★★★★★ | 10 | 없음 |
+| 2 | **B. Tool Poisoning Defense** | ★★★★★ | ★★★★ | 9 | 없음 |
+| 3 | **D. Score Provider SPI** | ★★★★★ | ★★★ | 8 | 없음 |
+| 4 | **C. Observability + Trace** | ★★★★ | ★★★ | 7 | 없음 |
+| 5 | **P6. Token-Budget Knapsack** | ★★★ | ★★★★ | 7 | 없음 |
+| 6 | **P1. Cross-Server Dependency** | ★★ | ★★★★ | 6 | 없음 |
+| 7 | **F. Cross-Spec Federation** | ★★ | ★★★★ | 6 | P1 인프라 공유 |
+| 8 | **G. Adapter Layer** | ★★★★ | ★★ | 6 | 없음 |
+| 9 | **E. Incremental Re-indexing** | ★★★ | ★★★ | 6 | 없음 |
+| 10 | P5. Causal Tool Graph | ★★★ | ★★★★★ | 8* | **A 필요** |
+| 11 | P4. Failure-Aware | ★★★ | ★★★ | 6* | **A 필요** |
+| 12 | P2. Name Disambiguation | ★★ | ★★★ | 5 | 없음 |
+| 13 | P7. Cross-Primitive | ★★ | ★★★ | 5 | 없음 |
+| 14 | P3. Stateful Session | ★ | ★★★ | 4 | 없음 |
+
+`*` 전제 작업(A) 미완료 시 진입 불가
+
+---
+
+## 4. 마일스톤
+
+### 권장 진행 (6~8개월, 시나리오 C)
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ Week 1         D. Score Provider SPI            [1w]   │
+│ Week 2-4       A. Workflow Execution            [3w]   │
+│ Week 5-6       C. Observability + Trace         [2w]   │
+│ Week 7-9       B. Tool Poisoning Defense        [3w]   │
+├─────────────────────────────────────────────────────────┤
+│ 마일스톤 1 (~9주) — v0.5 Release                         │
+│   ✓ Workflow 완성 (README 약속 성립)                    │
+│   ✓ Security defense                                    │
+│   ✓ Observability + debug                               │
+│   ✓ Score provider plugin SPI                           │
+└─────────────────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────────────────┐
+│ Week 10-12     P1. Cross-Server Dependency      [3w]   │
+│ Week 13-14     F. Cross-Spec Federation         [2w]   │
+│ Week 15-16     P2. Name Disambiguation          [2w]   │
+├─────────────────────────────────────────────────────────┤
+│ 마일스톤 2 (~16주) — 논문 1 초안                          │
+│   "MCP-Native Graph Tool Retrieval:                     │
+│    Cross-Server Dependency + Name Disambiguation +      │
+│    Annotation-Aware Defense"                            │
+│   타겟: EMNLP Workshop / ACL Findings                   │
+└─────────────────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────────────────┐
+│ Week 17-24     P5. Causal Tool Graph            [8w]   │
+│ Week 25-32     P6. Token-Budget Knapsack        [8w]   │
+├─────────────────────────────────────────────────────────┤
+│ 마일스톤 3 (~32주) — 논문 2                              │
+│   "Token-Constrained Tool Selection as                  │
+│    Graph Optimization"                                  │
+│   타겟: ICML / NeurIPS                                  │
+└─────────────────────────────────────────────────────────┘
+```
+
+### 대안 시나리오
+
+#### 시나리오 A — 학술 우선 (4개월, 논문 1편)
+
+시간이 제한적이고 논문 1편을 빨리 받고 싶은 경우.
+
+```
+Week 1          D. Score Provider SPI            [1w]
+Week 2-4        P1. Cross-Server Dependency      [3w]
+Week 5-6        P2. Name Disambiguation          [2w]
+Week 7-9        B. Tool Poisoning Defense        [3w]
+Week 10-16      논문 1 작성 (LiveMCPBench + ToolBench 평가)
+```
+
+**리스크**: A가 뒤로 밀려서 P4/P5가 영구적으로 막힘. 논문 2 불가능.
+
+#### 시나리오 B — 프로덕션 우선 (2개월, 사용자 확보)
+
+학술 의도 없이 실사용자 확보가 목표인 경우.
+
+```
+Week 1-3        A. Workflow Execution            [3w]
+Week 4          C. Observability + Trace         [1w]
+Week 5          E. Incremental Re-indexing       [1w]
+Week 6-7        F. Cross-Spec Federation         [2w]
+Week 8          블로그 + LangChain community 등록 (Phase 4 잔여)
+```
+
+**산출물**: PyPI 다운로드 ↑, real-world adopter case 1~2개, blog 기반 유입.
+
+---
+
+## 5. Phase 4 잔여 작업
+
+시나리오와 무관하게 마무리할 가치 있음. 마일스톤 사이사이에 끼워 진행.
+
+- [ ] 4-1d. 관계 검증 UI (confirm/reject)
+- [ ] 4-2. LangChain community package 등록
+- [ ] 4-3. 블로그: "Why Graph > Vector for Tool Retrieval"
+- [ ] 4-4. (선택) LAPIS 포맷 출력
+- [ ] 4-5. (선택) Rust (PyO3+petgraph) 최적화
+
+---
+
+## 6. 의사결정 포인트
+
+다음 작업에 들어가기 전 확정해야 할 것:
+
+1. **시나리오 선택** (A/B/C)
+   - 논문 마감 있음 → A
+   - 학술 의도 없음 → B
+   - 시간 여유 있음 → C (권장)
+
+2. **첫 작업 확정**
+   - 시나리오 C → D (1주) 후 A (3주)
+   - 시나리오 A → D (1주) 후 P1 (3주)
+   - 시나리오 B → A (3주)
+
+3. **Phase 4 잔여 끼워넣기 여부**
+   - 4-2, 4-3은 시나리오 B 마지막에 통합
+   - 4-1d는 Workflow Editor와 함께 A 작업 시 처리 가능
+
+---
+
+## 참고
+
+- [memo/differentiation-analysis.md](../memo/differentiation-analysis.md) — 학술 차별화 9개 후보 상세
+- [docs/wbs/README.md](wbs/README.md) — Phase 0~4.5 완료 내역
+- [docs/benchmarks.md](benchmarks.md) — 현재 벤치마크 (Petstore 19 / GitHub 50 / MCP 38 / k8s 248 / GitHub full 1068)
+- [docs/design/benchmark.md](design/benchmark.md) — 벤치마크 설계 근거
diff --git a/examples/test_bonsai_tool_calling.py b/examples/test_bonsai_tool_calling.py
new file mode 100644
index 0000000..c33831e
--- /dev/null
+++ b/examples/test_bonsai_tool_calling.py
@@ -0,0 +1,405 @@
+"""Bonsai-8B (1-bit Q1_0) tool calling 능력 테스트.
+
+3가지 테스트:
+1. SearchLLM 통합 (query expansion / intent decomposition)
+2. OpenAI function calling format (tools parameter)
+3. graph-tool-call retrieve + LLM 조합
+"""
+# ruff: noqa: E501
+
+from __future__ import annotations
+
+import json
+import time
+import urllib.request
+
+BASE_URL = "http://localhost:8080/v1"
+MODEL = "Bonsai-8B.gguf"
+
+# ── 테스트용 도구 정의 ──────────────────────────────────────────────
+
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get current weather for a city",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {"type": "string", "description": "City name"},
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "Temperature unit",
+                    },
+                },
+                "required": ["city"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "send_email",
+            "description": "Send an email to a recipient",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "to": {"type": "string", "description": "Recipient email address"},
+                    "subject": {"type": "string", "description": "Email subject line"},
+                    "body": {"type": "string", "description": "Email body content"},
+                },
+                "required": ["to", "subject", "body"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "search_products",
+            "description": "Search for products in the catalog",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string", "description": "Search query"},
+                    "category": {"type": "string", "description": "Product category filter"},
+                    "max_price": {"type": "number", "description": "Maximum price filter"},
+                },
+                "required": ["query"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "create_order",
+            "description": "Create a new order for a product",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "product_id": {"type": "string", "description": "Product ID to order"},
+                    "quantity": {"type": "integer", "description": "Number of items"},
+                    "shipping_address": {"type": "string", "description": "Delivery address"},
+                },
+                "required": ["product_id", "quantity"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "cancel_order",
+            "description": "Cancel an existing order",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "order_id": {"type": "string", "description": "Order ID to cancel"},
+                    "reason": {"type": "string", "description": "Cancellation reason"},
+                },
+                "required": ["order_id"],
+            },
+        },
+    },
+]
+
+
+def chat(messages: list[dict], tools: list[dict] | None = None, **kwargs) -> dict:
+    """OpenAI 호환 API 호출."""
+    payload: dict = {
+        "model": MODEL,
+        "messages": messages,
+        "temperature": 0.1,
+        "max_tokens": 512,
+        **kwargs,
+    }
+    if tools:
+        payload["tools"] = tools
+        payload["tool_choice"] = "auto"
+
+    data = json.dumps(payload).encode()
+    req = urllib.request.Request(
+        f"{BASE_URL}/chat/completions",
+        data=data,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=120) as resp:  # noqa: S310
+        return json.loads(resp.read().decode())
+
+
+def print_header(title: str) -> None:
+    print(f"\n{'=' * 60}")
+    print(f"  {title}")
+    print(f"{'=' * 60}")
+
+
+def print_result(label: str, passed: bool, detail: str = "") -> None:
+    status = "PASS" if passed else "FAIL"
+    mark = "[v]" if passed else "[x]"
+    print(f"  {mark} {label}: {status}")
+    if detail:
+        print(f"      -> {detail}")
+
+
+# ── TEST 1: SearchLLM 통합 ──────────────────────────────────────────
+
+
+def test_search_llm():
+    """graph-tool-call의 OpenAICompatibleSearchLLM으로 query expansion 테스트."""
+    print_header("TEST 1: SearchLLM Query Expansion & Intent Decomposition")
+
+    from graph_tool_call.retrieval.search_llm import OpenAICompatibleSearchLLM
+
+    llm = OpenAICompatibleSearchLLM(
+        model=MODEL,
+        base_url=BASE_URL,
+        api_key="none",
+    )
+
+    # 1a. Query Expansion
+    print("\n  [Query Expansion]")
+    queries = [
+        "파일을 읽고 내용을 수정해서 저장하고 싶어",
+        "search for cheap laptops and order one",
+        "주문 취소하고 환불 처리해줘",
+    ]
+    for q in queries:
+        t0 = time.time()
+        result = llm.expand_query(q)
+        elapsed = time.time() - t0
+        has_keywords = len(result.keywords) > 0
+        print_result(
+            f"expand '{q[:30]}...'",
+            has_keywords,
+            f"keywords={result.keywords}, synonyms={result.synonyms}, "
+            f"english={result.english_terms} ({elapsed:.1f}s)",
+        )
+
+    # 1b. Intent Decomposition
+    print("\n  [Intent Decomposition]")
+    multi_queries = [
+        "Find a laptop under $1000, order it, and send confirmation email",
+        "날씨 확인하고 이메일로 알려줘",
+    ]
+    for q in multi_queries:
+        t0 = time.time()
+        intents = llm.decompose_intents(q)
+        elapsed = time.time() - t0
+        has_intents = len(intents) > 0
+        intent_strs = [f"{i.action}({i.target})" for i in intents]
+        print_result(
+            f"decompose '{q[:35]}...'",
+            has_intents,
+            f"intents={intent_strs} ({elapsed:.1f}s)",
+        )
+
+
+# ── TEST 2: OpenAI Function Calling ─────────────────────────────────
+
+
+def test_function_calling():
+    """직접 tool calling format으로 호출하여 도구 선택 능력 테스트."""
+    print_header("TEST 2: OpenAI Function Calling Format")
+
+    test_cases = [
+        {
+            "name": "단일 도구 - 날씨",
+            "message": "What's the weather like in Seoul?",
+            "expected_tool": "get_weather",
+            "expected_args": ["city"],
+        },
+        {
+            "name": "단일 도구 - 이메일",
+            "message": "Send an email to john@example.com saying hello",
+            "expected_tool": "send_email",
+            "expected_args": ["to"],
+        },
+        {
+            "name": "단일 도구 - 상품 검색",
+            "message": "Find me laptops under $500",
+            "expected_tool": "search_products",
+            "expected_args": ["query"],
+        },
+        {
+            "name": "단일 도구 - 주문 취소",
+            "message": "Cancel order ORD-12345 because I changed my mind",
+            "expected_tool": "cancel_order",
+            "expected_args": ["order_id"],
+        },
+        {
+            "name": "도구 불필요 - 일반 대화",
+            "message": "Hello, how are you?",
+            "expected_tool": None,
+            "expected_args": [],
+        },
+    ]
+
+    for tc in test_cases:
+        t0 = time.time()
+        try:
+            result = chat(
+                messages=[{"role": "user", "content": tc["message"]}],
+                tools=TOOLS,
+            )
+            elapsed = time.time() - t0
+            choice = result["choices"][0]
+            msg = choice["message"]
+
+            tool_calls = msg.get("tool_calls", [])
+            finish_reason = choice.get("finish_reason", "")
+
+            if tc["expected_tool"] is None:
+                # 도구 호출하지 않아야 하는 케이스
+                passed = len(tool_calls) == 0
+                detail = f"finish={finish_reason}, tool_calls={len(tool_calls)}"
+            else:
+                if tool_calls:
+                    called = tool_calls[0]
+                    func_name = called.get("function", {}).get("name", "")
+                    func_args_raw = called.get("function", {}).get("arguments", "{}")
+                    try:
+                        func_args = (
+                            json.loads(func_args_raw)
+                            if isinstance(func_args_raw, str)
+                            else func_args_raw
+                        )
+                    except json.JSONDecodeError:
+                        func_args = {}
+
+                    name_match = func_name == tc["expected_tool"]
+                    args_present = all(k in func_args for k in tc["expected_args"])
+                    passed = name_match and args_present
+
+                    detail = (
+                        f"called={func_name}, args={json.dumps(func_args, ensure_ascii=False)}"
+                        f" ({elapsed:.1f}s)"
+                    )
+                else:
+                    passed = False
+                    content_preview = msg.get("content", "")[:80]
+                    detail = f"NO tool call, got text: '{content_preview}...' ({elapsed:.1f}s)"
+
+        except Exception as e:
+            passed = False
+            detail = f"ERROR: {e}"
+            elapsed = time.time() - t0
+
+        print_result(tc["name"], passed, detail)
+
+
+# ── TEST 3: graph-tool-call retrieve + LLM 조합 ─────────────────────
+
+
+def test_retrieve_with_llm():
+    """ToolGraph retrieve 후 LLM에게 도구 선택시키는 end-to-end 테스트."""
+    print_header("TEST 3: ToolGraph Retrieve + LLM Tool Selection (E2E)")
+
+    from graph_tool_call import ToolGraph
+
+    tg = ToolGraph()
+    tg.add_tools(TOOLS)
+
+    # 관계 추가
+    tg.add_relation("search_products", "create_order", "requires")
+    tg.add_relation("create_order", "cancel_order", "complementary")
+    tg.add_relation("get_weather", "send_email", "complementary")
+
+    test_queries = [
+        {
+            "query": "I want to buy a laptop",
+            "expected_retrieval": ["search_products", "create_order"],
+            "expected_tool": "search_products",
+        },
+        {
+            "query": "Cancel my order ORD-999",
+            "expected_retrieval": ["cancel_order"],
+            "expected_tool": "cancel_order",
+        },
+        {
+            "query": "Check Seoul weather and email it to me",
+            "expected_retrieval": ["get_weather", "send_email"],
+            "expected_tool": "get_weather",  # 첫 번째로 호출할 도구
+        },
+    ]
+
+    for tc in test_queries:
+        # Step 1: graph-tool-call로 관련 도구 검색
+        retrieved = tg.retrieve(tc["query"], top_k=3)
+        retrieved_names = [t.name for t in retrieved]
+
+        retrieval_hit = any(e in retrieved_names for e in tc["expected_retrieval"])
+        print_result(
+            f"retrieve '{tc['query'][:30]}...'",
+            retrieval_hit,
+            f"got={retrieved_names}",
+        )
+
+        # Step 2: 검색된 도구만 LLM에 전달
+        filtered_tools = [t for t in TOOLS if t["function"]["name"] in retrieved_names]
+
+        t0 = time.time()
+        try:
+            result = chat(
+                messages=[{"role": "user", "content": tc["query"]}],
+                tools=filtered_tools,
+            )
+            elapsed = time.time() - t0
+            choice = result["choices"][0]
+            tool_calls = choice["message"].get("tool_calls", [])
+
+            if tool_calls:
+                func_name = tool_calls[0].get("function", {}).get("name", "")
+                func_args_raw = tool_calls[0].get("function", {}).get("arguments", "{}")
+                try:
+                    func_args = (
+                        json.loads(func_args_raw)
+                        if isinstance(func_args_raw, str)
+                        else func_args_raw
+                    )
+                except json.JSONDecodeError:
+                    func_args = {}
+                passed = func_name == tc["expected_tool"]
+                detail = f"LLM chose={func_name}, args={json.dumps(func_args, ensure_ascii=False)} ({elapsed:.1f}s)"
+            else:
+                passed = False
+                content = choice["message"].get("content", "")[:80]
+                detail = f"NO tool call: '{content}' ({elapsed:.1f}s)"
+        except Exception as e:
+            passed = False
+            detail = f"ERROR: {e}"
+
+        print_result(f"  LLM select '{tc['expected_tool']}'", passed, detail)
+
+
+# ── MAIN ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    print("\n" + "#" * 60)
+    print("  Bonsai-8B (1-bit Q1_0) Tool Calling Benchmark")
+    print("  Model: localhost:8080 | OpenAI-compatible API")
+    print("#" * 60)
+
+    results = {}
+
+    # Test 1
+    try:
+        test_search_llm()
+    except Exception as e:
+        print(f"  [!] Test 1 failed: {e}")
+
+    # Test 2
+    try:
+        test_function_calling()
+    except Exception as e:
+        print(f"  [!] Test 2 failed: {e}")
+
+    # Test 3
+    try:
+        test_retrieve_with_llm()
+    except Exception as e:
+        print(f"  [!] Test 3 failed: {e}")
+
+    print(f"\n{'=' * 60}")
+    print("  Done!")
+    print(f"{'=' * 60}\n")
diff --git a/examples/xgen_workflow_agent.py b/examples/xgen_workflow_agent.py
index cd8da37..2031410 100644
--- a/examples/xgen_workflow_agent.py
+++ b/examples/xgen_workflow_agent.py
@@ -101,12 +101,19 @@ def upload_file(file_path: str, bucket: str) -> str:
     @tool
     def query_database(sql: str) -> str:
         """Execute a SQL query on the analytics database."""
-        return f"Query result: 42 rows"
+        return "Query result: 42 rows"
 
     all_tools = [
-        search_products, get_order_detail, cancel_order, create_refund,
-        send_notification, get_user_profile, update_inventory,
-        generate_report, upload_file, query_database,
+        search_products,
+        get_order_detail,
+        cancel_order,
+        create_refund,
+        send_notification,
+        get_user_profile,
+        update_inventory,
+        generate_report,
+        upload_file,
+        query_database,
     ]
 
     # -- 핵심: filter_tools 적용 (2줄) --
@@ -199,11 +206,9 @@ def send_notification(user_id: str, message: str) -> str:
         """Send push notification to a user."""
         return f"Sent to {user_id}"
 
-    all_tools = [search_products, get_order_detail, cancel_order,
-                 create_refund, send_notification]
+    _all_tools = [search_products, get_order_detail, cancel_order, create_refund, send_notification]
 
     # -- 핵심: create_agent 교체 --
-    from graph_tool_call.langchain import create_agent as create_filtered_agent
 
     # query_mode="message": 기본값, 추가 LLM 호출 없음 (빠름)
     # query_mode="llm": 대화 컨텍스트에서 검색 쿼리 생성 (멀티턴 강함)
@@ -284,27 +289,70 @@ def pattern_c_gateway():
         tools (50~500개) → create_gateway_tools() → 2개 meta-tool → agent_core
     """
     import json
-    from langchain_core.tools import tool
 
     # -- 시뮬레이션: DB에서 가져온 사용자 등록 tool 50개 --
     tools = []
     tool_categories = {
-        "order": ["create_order", "get_order", "cancel_order", "update_order", "list_orders",
-                  "get_order_status", "track_shipment", "confirm_delivery", "return_order",
-                  "exchange_order"],
-        "product": ["search_products", "get_product", "create_product", "update_product",
-                    "delete_product", "get_product_reviews", "add_product_review",
-                    "get_product_inventory", "update_price", "get_categories"],
-        "user": ["get_user", "create_user", "update_user", "delete_user", "list_users",
-                "get_user_orders", "get_user_wishlist", "add_to_wishlist",
-                "get_user_notifications", "update_preferences"],
-        "payment": ["process_payment", "create_refund", "get_payment_status",
-                    "list_transactions", "get_invoice", "send_receipt",
-                    "validate_coupon", "apply_discount", "get_billing_info",
-                    "update_payment_method"],
-        "admin": ["generate_report", "get_analytics", "export_data", "import_data",
-                  "get_system_status", "clear_cache", "send_notification",
-                  "create_announcement", "get_audit_log", "manage_permissions"],
+        "order": [
+            "create_order",
+            "get_order",
+            "cancel_order",
+            "update_order",
+            "list_orders",
+            "get_order_status",
+            "track_shipment",
+            "confirm_delivery",
+            "return_order",
+            "exchange_order",
+        ],
+        "product": [
+            "search_products",
+            "get_product",
+            "create_product",
+            "update_product",
+            "delete_product",
+            "get_product_reviews",
+            "add_product_review",
+            "get_product_inventory",
+            "update_price",
+            "get_categories",
+        ],
+        "user": [
+            "get_user",
+            "create_user",
+            "update_user",
+            "delete_user",
+            "list_users",
+            "get_user_orders",
+            "get_user_wishlist",
+            "add_to_wishlist",
+            "get_user_notifications",
+            "update_preferences",
+        ],
+        "payment": [
+            "process_payment",
+            "create_refund",
+            "get_payment_status",
+            "list_transactions",
+            "get_invoice",
+            "send_receipt",
+            "validate_coupon",
+            "apply_discount",
+            "get_billing_info",
+            "update_payment_method",
+        ],
+        "admin": [
+            "generate_report",
+            "get_analytics",
+            "export_data",
+            "import_data",
+            "get_system_status",
+            "clear_cache",
+            "send_notification",
+            "create_announcement",
+            "get_audit_log",
+            "manage_permissions",
+        ],
     }
 
     for category, tool_names in tool_categories.items():
@@ -410,7 +458,7 @@ def bonus_multiturn_scenario():
         top_name = results[0].name if results else "없음"
         match = "✓" if top_name == s["expected"] else "✗"
 
-        print(f"  턴 {s['turn']}: \"{s['message']}\"")
+        print(f'  턴 {s["turn"]}: "{s["message"]}"')
         print(f"    → message 모드 Top-1: {top_name} {match}")
         if "note" in s:
             print(f"    ※ {s['note']}")
diff --git a/examples/xgen_workflow_gateway.py b/examples/xgen_workflow_gateway.py
index 1f0ac71..37a356a 100644
--- a/examples/xgen_workflow_gateway.py
+++ b/examples/xgen_workflow_gateway.py
@@ -24,13 +24,13 @@
 import asyncio
 import json
 
-
 # =====================================================================
 # 방법 1: MCP 서버에서 tool 수집 → gateway
 # =====================================================================
 # 실제 MCP 서버(Slack, GitHub, Jira 등)에서 tool을 가져와서
 # gateway 2개로 축약하는 패턴. xgen-workflow 실 적용 코드와 동일.
 
+
 async def example_mcp_gateway():
     """MCP 서버에서 tool 수집 후 gateway agent 구성."""
     from langchain_openai import ChatOpenAI
@@ -66,8 +66,9 @@ async def example_mcp_gateway():
                 # MCP tool → LangChain StructuredTool 변환
                 for t in response.tools:
                     from langchain_core.tools import StructuredTool
+
                     tool = StructuredTool.from_function(
-                        func=lambda **kwargs, _s=session, _n=t.name: asyncio.run(
+                        func=lambda _s=session, _n=t.name, **kwargs: asyncio.run(
                             _s.call_tool(_n, kwargs)
                         ),
                         name=t.name,
@@ -97,6 +98,7 @@ async def example_mcp_gateway():
 # Swagger/OpenAPI에서 tool을 자동 생성하고 gateway에 넣는 패턴.
 # 사내 API가 OpenAPI spec으로 문서화되어 있을 때 유용.
 
+
 def example_openapi_gateway():
     """OpenAPI spec에서 tool 생성 후 gateway agent 구성."""
     from langchain_openai import ChatOpenAI
@@ -104,7 +106,6 @@ def example_openapi_gateway():
 
     from graph_tool_call import ToolGraph
     from graph_tool_call.langchain import create_gateway_tools
-    from graph_tool_call.langchain.tools import tool_schema_to_openai_function
 
     # ── 1. OpenAPI spec에서 ToolGraph 구축 ───────────────────────
     # URL 또는 파일 경로 모두 가능
@@ -127,12 +128,15 @@ def example_openapi_gateway():
         def make_fn(tool_schema=schema):
             def fn(**kwargs):
                 req = build_request(tool_schema, kwargs)
-                return json.dumps({
-                    "method": req.method,
-                    "url": req.url,
-                    "body": req.body,
-                    "note": "실제 환경에서는 requests.request()로 호출",
-                })
+                return json.dumps(
+                    {
+                        "method": req.method,
+                        "url": req.url,
+                        "body": req.body,
+                        "note": "실제 환경에서는 requests.request()로 호출",
+                    }
+                )
+
             return fn
 
         tool = StructuredTool.from_function(
@@ -163,6 +167,7 @@ def fn(**kwargs):
 # 이미 LangChain tool이 있는 프로젝트에서 gateway 또는 filter를
 # 한 줄로 적용하는 패턴. 기존 코드 변경 최소화.
 
+
 def example_langchain_integration():
     """기존 LangChain tool에 gateway/filter 적용."""
     from langchain_community.tools import DuckDuckGoSearchRun
@@ -214,8 +219,14 @@ def send_slack_message(channel: str, message: str) -> str:
         return json.dumps({"ok": True, "channel": channel})
 
     all_tools = [
-        search, get_weather, send_email, create_calendar_event,
-        list_files, read_file, query_database, create_jira_issue,
+        search,
+        get_weather,
+        send_email,
+        create_calendar_event,
+        list_files,
+        read_file,
+        query_database,
+        create_jira_issue,
         send_slack_message,
     ]
     print(f"  기존 tool {len(all_tools)}개")
diff --git a/graph_tool_call/analyze/dependency.py b/graph_tool_call/analyze/dependency.py
index 552ebbe..e21481d 100644
--- a/graph_tool_call/analyze/dependency.py
+++ b/graph_tool_call/analyze/dependency.py
@@ -134,16 +134,13 @@ def _group_by_resource(tools: list[ToolSchema]) -> dict[str, list[ToolSchema]]:
 
     The base resource is the first *meaningful* non-param path segment.
     A segment is considered a non-meaningful prefix when it groups more than
-    ``_PREFIX_THRESHOLD`` percent of all tools — this handles version prefixes
+    ``prefix_threshold`` percent of all tools — this handles version prefixes
     (``/v1``, ``/v2``), routing prefixes (``/api``, ``/rest``), etc. without
     requiring a hardcoded list.
     """
-    _PREFIX_THRESHOLD = 0.4  # if a segment covers >40% of tools, it's a prefix
+    prefix_threshold = 0.4  # if a segment covers >40% of tools, it's a prefix
 
-    api_tools = [
-        t for t in tools
-        if t.metadata.get("path") and t.metadata.get("method")
-    ]
+    api_tools = [t for t in tools if t.metadata.get("path") and t.metadata.get("method")]
     if not api_tools:
         return {}
 
@@ -171,7 +168,7 @@ def _group_by_resource(tools: list[ToolSchema]) -> dict[str, list[ToolSchema]]:
         if not counter:
             break
         most_common_count = max(counter.values())
-        if most_common_count / total > _PREFIX_THRESHOLD:
+        if most_common_count / total > prefix_threshold:
             skip_depth = depth + 1
         else:
             break
@@ -296,12 +293,8 @@ def _detect_crud_patterns(group: list[ToolSchema]) -> list[DetectedRelation]:
     posts = by_role.get("post_collection", [])
     gets_single = by_role.get("get_single", [])
     gets_collection = by_role.get("get_collection", [])
-    puts = by_role.get("put_single", [])
-    patches = by_role.get("patch_single", [])
     deletes = by_role.get("delete_single", [])
 
-    updates = puts + patches
-
     # --- Focused CRUD relations ---
     # Only create relations that represent real data dependencies,
     # not every possible CRUD combination.
@@ -323,7 +316,10 @@ def _detect_crud_patterns(group: list[ToolSchema]) -> list[DetectedRelation]:
                     target=post.name,
                     relation_type=RelationType.REQUIRES,
                     confidence=0.9,
-                    evidence=f"{get_s.name} (GET single) requires {post.name} (POST) — same resource '{post_resource}'",
+                    evidence=(
+                        f"{get_s.name} (GET single) requires {post.name} (POST) — "
+                        f"same resource '{post_resource}'"
+                    ),
                     layer=1,
                 )
             )
@@ -524,7 +520,8 @@ def _detect_name_based(tools: list[ToolSchema]) -> list[DetectedRelation]:
             # Require strong evidence: 2+ shared tokens, or the token
             # appears in a parameter ending with "id" (e.g., "orderId")
             has_id_param = any(
-                tok in p.name.lower() for p in tool_b.parameters
+                tok in p.name.lower()
+                for p in tool_b.parameters
                 for tok in shared
                 if "id" in p.name.lower()
             )
@@ -575,7 +572,8 @@ def _detect_cross_resource(tools: list[ToolSchema]) -> list[DetectedRelation]:
         name_tokens = _normalize_name(tool.name)
         # Remove verb prefix
         resource_tokens = [
-            t for t in name_tokens
+            t
+            for t in name_tokens
             if t not in ("get", "list", "create", "add", "post", "read", "find")
         ]
         for tok in resource_tokens:
@@ -617,16 +615,11 @@ def _detect_cross_resource(tools: list[ToolSchema]) -> list[DetectedRelation]:
                 # Prefer GET single over GET list/POST
                 provider_method = (provider.metadata.get("method") or "").upper()
                 provider_path = provider.metadata.get("path", "")
-                is_get_single = (
-                    provider_method == "GET"
-                    and _is_single_resource_path(provider_path)
-                )
+                is_get_single = provider_method == "GET" and _is_single_resource_path(provider_path)
 
                 # Only create cross-resource link if provider is from
                 # a DIFFERENT resource category than the consumer
-                consumer_resource = _extract_resource(
-                    tool.metadata.get("path", "")
-                )
+                consumer_resource = _extract_resource(tool.metadata.get("path", ""))
                 provider_resource = _extract_resource(provider_path)
                 if consumer_resource == provider_resource:
                     continue  # same resource — handled by structural detection
@@ -661,44 +654,92 @@ def _detect_cross_resource(tools: list[ToolSchema]) -> list[DetectedRelation]:
 # Maps leading verb in an RPC method name to a CRUD intent category.
 _VERB_TO_INTENT: dict[str, str] = {
     # read
-    "get": "read", "find": "read", "fetch": "read", "list": "read",
-    "search": "read", "select": "read", "load": "read", "read": "read",
+    "get": "read",
+    "find": "read",
+    "fetch": "read",
+    "list": "read",
+    "search": "read",
+    "select": "read",
+    "load": "read",
+    "read": "read",
     "download": "read",
     # write (create)
-    "save": "write", "create": "write", "add": "write", "insert": "write",
-    "register": "write", "regist": "write",
+    "save": "write",
+    "create": "write",
+    "add": "write",
+    "insert": "write",
+    "register": "write",
+    "regist": "write",
     # update
-    "modify": "update", "update": "update", "edit": "update",
-    "change": "update", "patch": "update",
+    "modify": "update",
+    "update": "update",
+    "edit": "update",
+    "change": "update",
+    "patch": "update",
     # delete
-    "delete": "delete", "remove": "delete", "cancel": "delete",
+    "delete": "delete",
+    "remove": "delete",
+    "cancel": "delete",
     "withdraw": "delete",
     # action (side-effect operations)
-    "process": "action", "execute": "action", "apply": "action",
-    "approve": "action", "reject": "action", "confirm": "action",
-    "accept": "action", "send": "action", "upload": "action",
+    "process": "action",
+    "execute": "action",
+    "apply": "action",
+    "approve": "action",
+    "reject": "action",
+    "confirm": "action",
+    "accept": "action",
+    "send": "action",
+    "upload": "action",
     "export": "action",
 }
 
 # Trailing tokens in method names that describe the *view*, not the resource.
-_NAME_SUFFIXES: frozenset[str] = frozenset({
-    "list", "detail", "details", "info", "count", "excel", "popup",
-    "summary", "check", "data", "total", "all", "page", "download",
-})
+_NAME_SUFFIXES: frozenset[str] = frozenset(
+    {
+        "list",
+        "detail",
+        "details",
+        "info",
+        "count",
+        "excel",
+        "popup",
+        "summary",
+        "check",
+        "data",
+        "total",
+        "all",
+        "page",
+        "download",
+    }
+)
 
 # Common DTO class-name suffixes that are not part of the resource identity.
-_DTO_SUFFIXES: frozenset[str] = frozenset({
-    "request", "response", "dto", "entity", "info", "base",
-    "api", "vo", "model", "form", "param", "result", "ml",
-})
+_DTO_SUFFIXES: frozenset[str] = frozenset(
+    {
+        "request",
+        "response",
+        "dto",
+        "entity",
+        "info",
+        "base",
+        "api",
+        "vo",
+        "model",
+        "form",
+        "param",
+        "result",
+        "ml",
+    }
+)
 
 # CRUD workflow rules: (source_intent, target_intent, relation, same_ctrl_conf, cross_ctrl_conf)
 # ``None`` for cross_ctrl_conf means the rule is skipped across controllers.
 _WORKFLOW_RULES: list[tuple[str, str, RelationType, float, float | None]] = [
-    ("read",   "write",  RelationType.REQUIRES, 0.9,  0.8),
-    ("update", "read",   RelationType.REQUIRES, 0.85, 0.75),
-    ("delete", "read",   RelationType.REQUIRES, 0.85, 0.75),
-    ("action", "read",   RelationType.REQUIRES, 0.75, None),
+    ("read", "write", RelationType.REQUIRES, 0.9, 0.8),
+    ("update", "read", RelationType.REQUIRES, 0.85, 0.75),
+    ("delete", "read", RelationType.REQUIRES, 0.85, 0.75),
+    ("action", "read", RelationType.REQUIRES, 0.75, None),
 ]
 
 
@@ -791,31 +832,35 @@ def _detect_rpc_crud_workflows(tools: list[ToolSchema]) -> list[DetectedRelation
                     same = _same_controller(src, tgt)
                     if not same and cross_conf is None:
                         continue
-                    relations.append(DetectedRelation(
-                        source=src.name,
-                        target=tgt.name,
-                        relation_type=rel_type,
-                        confidence=same_conf if same else cross_conf,  # type: ignore[arg-type]
-                        evidence=(
-                            f"{src.name} ({src_intent}) → {tgt.name} ({tgt_intent})"
-                            f" — resource '{resource}'"
-                        ),
-                        layer=4,
-                    ))
+                    relations.append(
+                        DetectedRelation(
+                            source=src.name,
+                            target=tgt.name,
+                            relation_type=rel_type,
+                            confidence=same_conf if same else cross_conf,  # type: ignore[arg-type]
+                            evidence=(
+                                f"{src.name} ({src_intent}) → {tgt.name} ({tgt_intent})"
+                                f" — resource '{resource}'"
+                            ),
+                            layer=4,
+                        )
+                    )
 
         # Readers within same controller are SIMILAR_TO.
         readers = by_intent.get("read", [])
         for i, r1 in enumerate(readers):
-            for r2 in readers[i + 1:]:
+            for r2 in readers[i + 1 :]:
                 if r1.name != r2.name and _same_controller(r1, r2):
-                    relations.append(DetectedRelation(
-                        source=r1.name,
-                        target=r2.name,
-                        relation_type=RelationType.SIMILAR_TO,
-                        confidence=0.8,
-                        evidence=f"{r1.name} ↔ {r2.name} — similar reads for '{resource}'",
-                        layer=4,
-                    ))
+                    relations.append(
+                        DetectedRelation(
+                            source=r1.name,
+                            target=r2.name,
+                            relation_type=RelationType.SIMILAR_TO,
+                            confidence=0.8,
+                            evidence=f"{r1.name} ↔ {r2.name} — similar reads for '{resource}'",
+                            layer=4,
+                        )
+                    )
 
     return relations
 
@@ -836,16 +881,18 @@ def _detect_rpc_dto_links(tools: list[ToolSchema]) -> list[DetectedRelation]:
         if not 2 <= len(members) <= 20:
             continue
         for i, a in enumerate(members):
-            for b in members[i + 1:]:
+            for b in members[i + 1 :]:
                 if a.name != b.name and not _same_controller(a, b):
-                    relations.append(DetectedRelation(
-                        source=a.name,
-                        target=b.name,
-                        relation_type=RelationType.COMPLEMENTARY,
-                        confidence=0.75,
-                        evidence=f"{a.name} ↔ {b.name} — shared DTO '{dto_res}'",
-                        layer=4,
-                    ))
+                    relations.append(
+                        DetectedRelation(
+                            source=a.name,
+                            target=b.name,
+                            relation_type=RelationType.COMPLEMENTARY,
+                            confidence=0.75,
+                            evidence=f"{a.name} ↔ {b.name} — shared DTO '{dto_res}'",
+                            layer=4,
+                        )
+                    )
 
     return relations
 
diff --git a/graph_tool_call/graphify/__init__.py b/graph_tool_call/graphify/__init__.py
index 6785ee3..98bbbce 100644
--- a/graph_tool_call/graphify/__init__.py
+++ b/graph_tool_call/graphify/__init__.py
@@ -30,6 +30,7 @@
     "DEFAULT_CONF_AMBIGUOUS",
     "DEFAULT_CONF_EXTRACTED",
     "DEFAULT_CONF_INFERRED",
+    "_apply_pair_hints",
     "bucket_confidence",
     "ingest_openapi_graphify",
     "preserve_refs_for_detection",
diff --git a/graph_tool_call/graphify/ingest.py b/graph_tool_call/graphify/ingest.py
index 48bc8d5..afa23f3 100644
--- a/graph_tool_call/graphify/ingest.py
+++ b/graph_tool_call/graphify/ingest.py
@@ -124,7 +124,7 @@ def preserve_refs_for_detection(
     Returns the number of tools whose metadata was updated. Mutates ``tools``
     in place.
     """
-    paths = (raw_spec.get("paths") or {})
+    paths = raw_spec.get("paths") or {}
     if not isinstance(paths, dict):
         return 0
 
@@ -218,8 +218,13 @@ def _apply_pair_hints(
     from ``detect_dependencies`` UNLESS the new pair is operator-curated
     (``source="manual"``) — operator intent overrides automatic detection.
     """
-    stats = {"manual": 0, "auto": 0, "skipped_target_missing": 0,
-             "skipped_self": 0, "skipped_existing_structural": 0}
+    stats = {
+        "manual": 0,
+        "auto": 0,
+        "skipped_target_missing": 0,
+        "skipped_self": 0,
+        "skipped_existing_structural": 0,
+    }
     tool_names = set(tg.tools.keys())
 
     for s in schemas:
@@ -350,9 +355,7 @@ def ingest_openapi_graphify(
         stats["refs_preserved"] = preserve_refs_for_detection(schemas, raw_spec)
 
     # min_confidence=0.0 so we see every candidate; we re-bucket here.
-    relations: list[DetectedRelation] = detect_dependencies(
-        schemas, spec, min_confidence=0.0
-    )
+    relations: list[DetectedRelation] = detect_dependencies(schemas, spec, min_confidence=0.0)
 
     seen: set[tuple[str, str, str]] = set()  # (src, tgt, relation_value)
     for rel in relations:
@@ -419,7 +422,7 @@ def ingest_openapi_graphify(
         # cross_source also re-counted on these new edges for completeness.
         for s in schemas:
             ai = (s.metadata or {}).get("ai_metadata") or {}
-            for p in (ai.get("pairs_well_with") or []):
+            for p in ai.get("pairs_well_with") or []:
                 if not isinstance(p, dict):
                     continue
                 tgt = str(p.get("tool") or "").strip()
diff --git a/graph_tool_call/graphify/retrieval.py b/graph_tool_call/graphify/retrieval.py
index 55e659b..f15e4bc 100644
--- a/graph_tool_call/graphify/retrieval.py
+++ b/graph_tool_call/graphify/retrieval.py
@@ -81,10 +81,7 @@ def _substring_seeds(
     for name, tool in tools.items():
         nname = _strip_diacritics(name).lower()
         ndesc = _strip_diacritics(tool.description or "").lower()
-        score = (
-            sum(1.0 for t in terms if t in nname)
-            + 0.5 * sum(1.0 for t in terms if t in ndesc)
-        )
+        score = sum(1.0 for t in terms if t in nname) + 0.5 * sum(1.0 for t in terms if t in ndesc)
         if score > 0:
             scored.append((name, score))
     scored.sort(key=lambda x: x[1], reverse=True)
@@ -182,11 +179,7 @@ def _bfs_from_seeds(
         return {}, []
 
     max_seed = max((s for _, s in seed_scores), default=1.0) or 1.0
-    scores: dict[str, float] = {
-        n: s / max_seed
-        for n, s in seed_scores
-        if graph.has_node(n)
-    }
+    scores: dict[str, float] = {n: s / max_seed for n, s in seed_scores if graph.has_node(n)}
     visited: set[str] = set(scores)
     frontier: list[str] = list(scores)
     edges_visited: list[tuple[str, str]] = []
@@ -301,9 +294,7 @@ def render_subgraph_text(
 
     # Order nodes: by retrieval score (desc) if known, else by name.
     if sort_by_score:
-        node_order = sorted(
-            node_set, key=lambda n: (-sort_by_score.get(n, 0.0), n)
-        )
+        node_order = sorted(node_set, key=lambda n: (-sort_by_score.get(n, 0.0), n))
     else:
         node_order = sorted(node_set)
 
@@ -437,9 +428,7 @@ def retrieve_graphify(
                 scores[h] *= _HISTORY_DEMOTE
 
     # 5) Filter to TOOL nodes only and rank
-    tool_scores: dict[str, float] = {
-        n: s for n, s in scores.items() if n in tg.tools
-    }
+    tool_scores: dict[str, float] = {n: s for n, s in scores.items() if n in tg.tools}
     ranked = sorted(tool_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
     chosen_names: set[str] = {n for n, _ in ranked}
 
diff --git a/graph_tool_call/ingest/io_contract.py b/graph_tool_call/ingest/io_contract.py
index 1768a47..7748bb5 100644
--- a/graph_tool_call/ingest/io_contract.py
+++ b/graph_tool_call/ingest/io_contract.py
@@ -245,9 +245,7 @@ def extract_consumes_for_operation(
     seen_names: set[str] = set()
 
     # query / path / header parameters
-    all_params = (operation.get("parameters") or []) + (
-        (path_item or {}).get("parameters") or []
-    )
+    all_params = (operation.get("parameters") or []) + ((path_item or {}).get("parameters") or [])
     for p in all_params:
         if not isinstance(p, dict) or "name" not in p:
             continue
diff --git a/graph_tool_call/ingest/openapi.py b/graph_tool_call/ingest/openapi.py
index f914fd4..8f93dea 100644
--- a/graph_tool_call/ingest/openapi.py
+++ b/graph_tool_call/ingest/openapi.py
@@ -384,7 +384,7 @@ def _extract_params_openapi3(
         is_required = prop_name in body_required
         if required_only and not is_required:
             continue
-        desc = (prop_schema.get("description") or "")
+        desc = prop_schema.get("description") or ""
         # nested object/array는 한 단계 더 펼치기
         if _schema_type(prop_schema) in ("object", "array"):
             nested = _summarize_object_schema(prop_schema)
diff --git a/graph_tool_call/langchain/agent.py b/graph_tool_call/langchain/agent.py
index 9215e6c..533183e 100644
--- a/graph_tool_call/langchain/agent.py
+++ b/graph_tool_call/langchain/agent.py
@@ -118,9 +118,7 @@ def _generate_query_with_llm(
     # Include a sample of tool names to help the LLM understand the domain
     sample_tools = ", ".join(tool_names[:20])
     user_prompt = (
-        f"Available tools include: {sample_tools}\n\n"
-        f"Conversation:\n{conversation}\n\n"
-        f"Search query:"
+        f"Available tools include: {sample_tools}\n\nConversation:\n{conversation}\n\nSearch query:"
     )
 
     try:
@@ -129,10 +127,12 @@ def _generate_query_with_llm(
         if hasattr(model, "bound_tools"):
             # If model has tools bound, get the underlying model
             base_model = model
-        response = base_model.invoke([
-            SystemMessage(content=_QUERY_GEN_SYSTEM),
-            HumanMessage(content=user_prompt),
-        ])
+        response = base_model.invoke(
+            [
+                SystemMessage(content=_QUERY_GEN_SYSTEM),
+                HumanMessage(content=user_prompt),
+            ]
+        )
         query = response.content.strip().strip('"').strip("'")
         if query:
             logger.debug("LLM-generated query: %s", query)
@@ -187,8 +187,7 @@ def create_agent(
         from langgraph.prebuilt import create_react_agent
     except ImportError:
         raise ImportError(
-            "langgraph is required for create_agent(). "
-            "Install with: pip install langgraph"
+            "langgraph is required for create_agent(). Install with: pip install langgraph"
         )
 
     from graph_tool_call import ToolGraph
diff --git a/graph_tool_call/langchain/gateway.py b/graph_tool_call/langchain/gateway.py
index 1ad9e97..a570589 100644
--- a/graph_tool_call/langchain/gateway.py
+++ b/graph_tool_call/langchain/gateway.py
@@ -99,9 +99,7 @@ def _summarize_response_schema(schema: dict[str, Any]) -> str | None:
     return f"array of {summary}" if is_array else summary
 
 
-def _enrich_from_graph(
-    name: str, graph: Any | None
-) -> dict[str, Any]:
+def _enrich_from_graph(name: str, graph: Any | None) -> dict[str, Any]:
     """Pull source_label, method/path, response summary, and outgoing edges
     from the underlying ToolGraph for *name*. Returns an empty dict if the
     graph or tool is not available — callers should treat all keys as optional.
@@ -136,9 +134,7 @@ def _enrich_from_graph(
         chains: list[str] = []
         for _src, target, attrs in edges:
             relation = attrs.get("relation")
-            relation_name = (
-                relation.value if hasattr(relation, "value") else str(relation)
-            )
+            relation_name = relation.value if hasattr(relation, "value") else str(relation)
             # Skip purely structural BELONGS_TO edges
             if relation_name in ("belongs_to", "BELONGS_TO"):
                 continue
diff --git a/graph_tool_call/mcp_proxy.py b/graph_tool_call/mcp_proxy.py
index 7cb9c71..7f2669c 100644
--- a/graph_tool_call/mcp_proxy.py
+++ b/graph_tool_call/mcp_proxy.py
@@ -786,9 +786,10 @@ async def app_lifespan(app: Any) -> Any:  # type: ignore[override]
                 yield
                 task.cancel_scope.cancel()
 
-    import anyio
     from contextlib import asynccontextmanager
 
+    import anyio
+
     @asynccontextmanager
     async def lifespan(app: Any) -> Any:  # type: ignore[override]
         async with transport.connect() as (read_stream, write_stream):
diff --git a/graph_tool_call/net.py b/graph_tool_call/net.py
index ba46e26..466ae30 100644
--- a/graph_tool_call/net.py
+++ b/graph_tool_call/net.py
@@ -55,6 +55,7 @@ def _open_url(
     handlers: list[Any] = [_LimitedRedirectHandler(max_redirects)]
     if not verify_ssl:
         import ssl
+
         ctx = ssl.create_default_context()
         ctx.check_hostname = False
         ctx.verify_mode = ssl.CERT_NONE
@@ -157,7 +158,12 @@ def fetch_url_text(
 
     req = urllib.request.Request(url, headers=headers or {})
     try:
-        with _open_url(req, timeout=timeout, max_redirects=max_redirects, verify_ssl=verify_ssl) as resp:
+        with _open_url(
+            req,
+            timeout=timeout,
+            max_redirects=max_redirects,
+            verify_ssl=verify_ssl,
+        ) as resp:
             final_url = url
             if hasattr(resp, "geturl"):
                 candidate = resp.geturl()
diff --git a/graph_tool_call/ontology/llm_provider.py b/graph_tool_call/ontology/llm_provider.py
index 6ee8b4e..eb29850 100644
--- a/graph_tool_call/ontology/llm_provider.py
+++ b/graph_tool_call/ontology/llm_provider.py
@@ -99,8 +99,9 @@ class ToolEnrichment:
       - Graph edges (``pairs_well_with`` becomes semantic edges)
     """
 
-    canonical_action: str                         # search | read | create | update | delete | action
-    primary_resource: str                         # e.g. "product"
+    # canonical_action: search | read | create | update | delete | action
+    canonical_action: str
+    primary_resource: str  # e.g. "product"
     one_line_summary: str
     when_to_use: str
     when_not_to_use: str = ""
@@ -119,8 +120,10 @@ class ToolEnrichment:
 Example:
 Tools: createUser, getUserProfile, deleteUser
 Answer: [
-  {{"source":"getUserProfile","target":"createUser","relation":"REQUIRES","confidence":0.9,"reason":"need user to exist"}},
-  {{"source":"createUser","target":"deleteUser","relation":"PRECEDES","confidence":0.8,"reason":"create before delete"}}
+  {{"source":"getUserProfile","target":"createUser","relation":"REQUIRES","confidence":0.9,
+    "reason":"need user to exist"}},
+  {{"source":"createUser","target":"deleteUser","relation":"PRECEDES","confidence":0.8,
+    "reason":"create before delete"}}
 ]
 
 Relation types:
@@ -365,7 +368,7 @@ def _parse_enrichment(data: Any) -> ToolEnrichment | None:
             if isinstance(p, dict) and str(p.get("semantic", "")).strip()
         ]
         consumes = []
-        for c in (data.get("consumes_semantics") or []):
+        for c in data.get("consumes_semantics") or []:
             if not (isinstance(c, dict) and str(c.get("semantic", "")).strip()):
                 continue
             raw_kind = str(c.get("kind", "data")).strip().lower()
@@ -430,6 +433,7 @@ def _extract_json(text: str) -> Any:
 
     # Remove <think>...</think> blocks (qwen3 thinking mode)
     import re as _re
+
     text = _re.sub(r"<think>[\s\S]*?</think>", "", text).strip()
 
     # Remove markdown code blocks
@@ -564,8 +568,7 @@ def verify_relations(
         for i in range(0, len(relations), batch_size):
             batch = relations[i : i + batch_size]
             rels_text = "\n".join(
-                f"- {r.source} {r.relation_type.name} {r.target} ({r.reason[:60]})"
-                for r in batch
+                f"- {r.source} {r.relation_type.name} {r.target} ({r.reason[:60]})" for r in batch
             )
             prompt = _VERIFY_RELATIONS_PROMPT.format(
                 relations_list=rels_text,
@@ -610,8 +613,7 @@ def suggest_missing(
         """
         tools_text = _format_tools_list(tools[:30])
         existing_text = "\n".join(
-            f"- {r.source} {r.relation_type.name} {r.target}"
-            for r in existing_relations[:30]
+            f"- {r.source} {r.relation_type.name} {r.target}" for r in existing_relations[:30]
         )
         prompt = _SUGGEST_MISSING_PROMPT.format(
             tools_list=tools_text,
@@ -749,11 +751,13 @@ def enrich_pairs(
                         target = str(p.get("tool", "")).strip()
                         if not target or target == name:
                             continue
-                        pair_list.append(PairHint(
-                            tool=target,
-                            reason=str(p.get("reason", "")).strip(),
-                            source="auto",
-                        ))
+                        pair_list.append(
+                            PairHint(
+                                tool=target,
+                                reason=str(p.get("reason", "")).strip(),
+                                source="auto",
+                            )
+                        )
                     results[str(name)] = pair_list
             except (json.JSONDecodeError, KeyError, TypeError):
                 continue
@@ -838,7 +842,8 @@ def enrich_tool_semantics(
                     # whose target name is not in the catalog.
                     if valid_tool_names is not None:
                         enrichment.pairs_well_with = [
-                            p for p in enrichment.pairs_well_with
+                            p
+                            for p in enrichment.pairs_well_with
                             if p.tool in valid_tool_names and p.tool != str(name)
                         ]
                     results[str(name)] = enrichment
diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py
index 8f2d9eb..dbab1f3 100644
--- a/graph_tool_call/plan/__init__.py
+++ b/graph_tool_call/plan/__init__.py
@@ -24,39 +24,39 @@ def call_tool(tool_name, args):
     BindingError,
     resolve_bindings,
 )
+from graph_tool_call.plan.intent import (
+    IntentParseError,
+    ParsedIntent,
+    ToolCatalogEntry,
+    parse_intent,
+)
+from graph_tool_call.plan.response import (
+    synthesize_failure_response,
+    synthesize_success_response,
+)
 from graph_tool_call.plan.runner import (
-    PlanRunner,
+    PlanAborted,
+    PlanCompleted,
     PlanEvent,
+    PlanRunner,
     PlanStarted,
-    StepStarted,
     StepCompleted,
     StepFailed,
-    PlanCompleted,
-    PlanAborted,
+    StepStarted,
 )
 from graph_tool_call.plan.schema import (
+    ExecutionTrace,
     Plan,
     PlanStep,
-    ExecutionTrace,
     StepTrace,
 )
-from graph_tool_call.plan.intent import (
-    IntentParseError,
-    ParsedIntent,
-    ToolCatalogEntry,
-    parse_intent,
-)
-from graph_tool_call.plan.response import (
-    synthesize_success_response,
-    synthesize_failure_response,
-)
 from graph_tool_call.plan.synthesizer import (
+    CyclicDependencyError,
+    DynamicOptionRequired,
+    MaxDepthExceededError,
     PathSynthesizer,
     PlanSynthesisError,
     UnsatisfiableFieldError,
-    CyclicDependencyError,
-    MaxDepthExceededError,
-    DynamicOptionRequired,
 )
 
 __all__ = [
diff --git a/graph_tool_call/plan/binding.py b/graph_tool_call/plan/binding.py
index 58d9eef..2ae6a50 100644
--- a/graph_tool_call/plan/binding.py
+++ b/graph_tool_call/plan/binding.py
@@ -102,9 +102,7 @@ def _lookup(expr: str, context: dict[str, Any]) -> Any:
             try:
                 idx = int(tok[1:-1])
             except ValueError as exc:
-                raise BindingError(
-                    f"non-numeric array index {tok!r} in binding {expr!r}"
-                ) from exc
+                raise BindingError(f"non-numeric array index {tok!r} in binding {expr!r}") from exc
             if not isinstance(node, (list, tuple)):
                 raise BindingError(
                     f"indexing {tok} on non-list type {type(node).__name__} (expr={expr!r})"
@@ -112,9 +110,7 @@ def _lookup(expr: str, context: dict[str, Any]) -> Any:
             try:
                 node = node[idx]
             except IndexError as exc:
-                raise BindingError(
-                    f"index {idx} out of range in binding {expr!r}"
-                ) from exc
+                raise BindingError(f"index {idx} out of range in binding {expr!r}") from exc
         else:
             if not isinstance(node, dict):
                 raise BindingError(
@@ -152,7 +148,7 @@ def _tokenize(expr: str) -> list[str]:
             end = expr.find("]", i)
             if end == -1:
                 raise BindingError(f"unclosed '[' in binding {expr!r}")
-            tokens.append(expr[i:end + 1])
+            tokens.append(expr[i : end + 1])
             i = end
         else:
             buf.append(ch)
diff --git a/graph_tool_call/plan/intent.py b/graph_tool_call/plan/intent.py
index 74dd8b8..c62d396 100644
--- a/graph_tool_call/plan/intent.py
+++ b/graph_tool_call/plan/intent.py
@@ -22,7 +22,6 @@
 
 from graph_tool_call.ontology.llm_provider import OntologyLLM, _extract_json
 
-
 # Minimum SequenceMatcher ratio for treating an LLM-emitted entity key as
 # a typo/expansion of a real vocab entry. 0.8 catches "search_keyword_name"
 # vs "search_keyword" (~0.85) while rejecting unrelated pairs like
@@ -40,21 +39,21 @@ class ToolCatalogEntry:
     """Condensed tool view for intent-parsing prompt — under ~150 chars each."""
 
     name: str
-    summary: str = ""                          # one_line_summary from ai_metadata
-    when_to_use: str = ""                      # ai_metadata.when_to_use
-    consumes_tags: list[str] = field(default_factory=list)   # required semantic ids
-    canonical_action: str = ""                 # "read" | "search" | "create" | ...
-    primary_resource: str = ""                 # "product" | ...
+    summary: str = ""  # one_line_summary from ai_metadata
+    when_to_use: str = ""  # ai_metadata.when_to_use
+    consumes_tags: list[str] = field(default_factory=list)  # required semantic ids
+    canonical_action: str = ""  # "read" | "search" | "create" | ...
+    primary_resource: str = ""  # "product" | ...
 
 
 @dataclass
 class ParsedIntent:
     """Stage 1 output — consumed by Stage 2 PathSynthesizer."""
 
-    target: str                                # tool name picked by LLM
+    target: str  # tool name picked by LLM
     entities: dict[str, Any] = field(default_factory=dict)
-    confidence: float = 0.0                    # 0.0 ~ 1.0
-    output_shape: str = "single"               # "single" | "list" | "count"
+    confidence: float = 0.0  # 0.0 ~ 1.0
+    output_shape: str = "single"  # "single" | "list" | "count"
     reasoning: str = ""
 
 
@@ -146,7 +145,10 @@ def _coerce_entity_keys(
             out[key_str] = value
             continue
         match = difflib.get_close_matches(
-            key_str, vocab, n=1, cutoff=_VOCAB_FUZZY_CUTOFF,
+            key_str,
+            vocab,
+            n=1,
+            cutoff=_VOCAB_FUZZY_CUTOFF,
         )
         if match:
             # If multiple LLM keys collapse onto the same vocab entry, the
@@ -169,8 +171,7 @@ def _format_seed_block(seed_entities: dict[str, Any] | None) -> str:
     if not seed_entities:
         return ""
     lines = "\n".join(
-        f'  - {k}: {json.dumps(v, ensure_ascii=False)}'
-        for k, v in seed_entities.items()
+        f"  - {k}: {json.dumps(v, ensure_ascii=False)}" for k, v in seed_entities.items()
     )
     return (
         "\n\nExisting entities (carried over from prior turns — keep these "
@@ -193,10 +194,10 @@ def _format_enum_block(enum_mappings: dict[str, dict[str, str]] | None) -> str:
     if not enum_mappings:
         return ""
     lines: list[str] = []
-    for field, codes in enum_mappings.items():
+    for field_name, codes in enum_mappings.items():
         if not isinstance(codes, dict) or not codes:
             continue
-        lines.append(f"  - {field}:")
+        lines.append(f"  - {field_name}:")
         for code, label in codes.items():
             lines.append(f'      "{code}" → {label}')
     if not lines:
diff --git a/graph_tool_call/plan/response.py b/graph_tool_call/plan/response.py
index 782ca1f..714b5d4 100644
--- a/graph_tool_call/plan/response.py
+++ b/graph_tool_call/plan/response.py
@@ -18,7 +18,6 @@
 
 from graph_tool_call.ontology.llm_provider import OntologyLLM
 
-
 # ---------------------------------------------------------------------------
 # prompts
 # ---------------------------------------------------------------------------
diff --git a/graph_tool_call/plan/runner.py b/graph_tool_call/plan/runner.py
index 141d500..3b70f77 100644
--- a/graph_tool_call/plan/runner.py
+++ b/graph_tool_call/plan/runner.py
@@ -15,19 +15,18 @@
 from __future__ import annotations
 
 import time
+from collections.abc import Callable, Iterator
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Any, Callable, Iterator
+from typing import Any
 
 from graph_tool_call.plan.binding import BindingError, resolve_bindings
 from graph_tool_call.plan.schema import (
     ExecutionTrace,
     Plan,
-    PlanStep,
     StepTrace,
 )
 
-
 # ---------------------------------------------------------------------------
 # Event types — structured so callers can pattern-match by ``type`` field
 # ---------------------------------------------------------------------------
@@ -57,7 +56,7 @@ class StepCompleted:
     step_id: str = ""
     tool: str = ""
     duration_ms: int = 0
-    output_preview: Any = None                 # truncated output for UI
+    output_preview: Any = None  # truncated output for UI
     output_size: int = 0
 
 
@@ -87,14 +86,7 @@ class PlanAborted:
     total_duration_ms: int = 0
 
 
-PlanEvent = (
-    PlanStarted
-    | StepStarted
-    | StepCompleted
-    | StepFailed
-    | PlanCompleted
-    | PlanAborted
-)
+PlanEvent = PlanStarted | StepStarted | StepCompleted | StepFailed | PlanCompleted | PlanAborted
 
 
 # ---------------------------------------------------------------------------
@@ -126,7 +118,7 @@ def __init__(
         call_tool: ToolCaller,
         *,
         output_preview_limit: int = 512,
-        on_error: str = "abort",                  # 'abort' only in v1
+        on_error: str = "abort",  # 'abort' only in v1
     ) -> None:
         self._call_tool = call_tool
         self._preview_limit = output_preview_limit
@@ -148,7 +140,6 @@ def run_stream(
         ``input_context`` supplies values for ``${input.xxx}`` bindings —
         typically the entities extracted by Stage 1 (intent parser).
         """
-        started = _now_iso()
         plan_start = time.monotonic()
 
         yield PlanStarted(
@@ -180,11 +171,14 @@ def run_stream(
                 step_trace.duration_ms = _ms_since(step_start)
                 trace_steps.append(step_trace)
                 yield StepFailed(
-                    step_id=step.id, tool=step.tool,
-                    error=err, duration_ms=step_trace.duration_ms,
+                    step_id=step.id,
+                    tool=step.tool,
+                    error=err,
+                    duration_ms=step_trace.duration_ms,
                 )
                 yield PlanAborted(
-                    plan_id=plan.id, failed_step=step.id,
+                    plan_id=plan.id,
+                    failed_step=step.id,
                     error=err,
                     total_duration_ms=_ms_since(plan_start),
                 )
@@ -192,15 +186,17 @@ def run_stream(
 
             step_trace.args_resolved = resolved
             yield StepStarted(
-                step_id=step.id, tool=step.tool,
+                step_id=step.id,
+                tool=step.tool,
                 args_resolved=resolved,
-                index=idx, total=len(plan.steps),
+                index=idx,
+                total=len(plan.steps),
             )
 
             # 2. Execute via caller's tool invoker
             try:
                 output = self._call_tool(step.tool, resolved)
-            except Exception as exc:              # noqa: BLE001 — caller-defined
+            except Exception as exc:  # noqa: BLE001 — caller-defined
                 err = {
                     "kind": "tool",
                     "message": str(exc),
@@ -210,11 +206,14 @@ def run_stream(
                 step_trace.duration_ms = _ms_since(step_start)
                 trace_steps.append(step_trace)
                 yield StepFailed(
-                    step_id=step.id, tool=step.tool,
-                    error=err, duration_ms=step_trace.duration_ms,
+                    step_id=step.id,
+                    tool=step.tool,
+                    error=err,
+                    duration_ms=step_trace.duration_ms,
                 )
                 yield PlanAborted(
-                    plan_id=plan.id, failed_step=step.id,
+                    plan_id=plan.id,
+                    failed_step=step.id,
                     error=err,
                     total_duration_ms=_ms_since(plan_start),
                 )
@@ -235,7 +234,8 @@ def run_stream(
             context[step.id] = output
 
             yield StepCompleted(
-                step_id=step.id, tool=step.tool,
+                step_id=step.id,
+                tool=step.tool,
                 duration_ms=step_trace.duration_ms,
                 output_preview=_preview(output, self._preview_limit),
                 output_size=_output_size(output),
@@ -251,7 +251,8 @@ def run_stream(
         except BindingError as exc:
             err = {"kind": "output_binding", "message": str(exc)}
             yield PlanAborted(
-                plan_id=plan.id, failed_step="<output_binding>",
+                plan_id=plan.id,
+                failed_step="<output_binding>",
                 error=err,
                 total_duration_ms=_ms_since(plan_start),
             )
@@ -328,6 +329,7 @@ def _preview(value: Any, limit: int) -> Any:
     """Trim large outputs for UI previews. Keep small values intact."""
     if isinstance(value, (dict, list)):
         import json as _json
+
         try:
             rendered = _json.dumps(value, ensure_ascii=False)
         except (TypeError, ValueError):
@@ -393,6 +395,7 @@ def _maybe_unwrap_envelope(
 def _output_size(value: Any) -> int:
     """Approximate serialized byte size (for observability)."""
     import json as _json
+
     try:
         return len(_json.dumps(value, ensure_ascii=False))
     except (TypeError, ValueError):
diff --git a/graph_tool_call/plan/schema.py b/graph_tool_call/plan/schema.py
index b07530f..9fff497 100644
--- a/graph_tool_call/plan/schema.py
+++ b/graph_tool_call/plan/schema.py
@@ -25,12 +25,12 @@ class PlanStep:
     at runtime by ``resolve_bindings`` using the accumulated step context.
     """
 
-    id: str                                    # "s1", "s2", ...
-    tool: str                                  # function_name (graph node name)
+    id: str  # "s1", "s2", ...
+    tool: str  # function_name (graph node name)
     args: dict[str, Any] = field(default_factory=dict)
-    rationale: str = ""                        # why this step exists (for audit)
+    rationale: str = ""  # why this step exists (for audit)
     timeout_ms: int | None = None
-    retryable: bool = False                    # reserved for v1.1 retry policy
+    retryable: bool = False  # reserved for v1.1 retry policy
     # Top-level keys the synthesizer expects in this tool's response,
     # derived from ``produces[].json_path``. Used by PlanRunner to detect
     # envelope wrappers (e.g. ``{code, message, payload: {...}}``) when the
@@ -51,11 +51,11 @@ class Plan:
     the final answer. If unset, runner returns the last step's result.
     """
 
-    id: str                                    # uuid
-    goal: str                                  # user requirement summary
+    id: str  # uuid
+    goal: str  # user requirement summary
     steps: list[PlanStep] = field(default_factory=list)
-    output_binding: str | None = None          # e.g. "${s2.body}"
-    created_at: str = ""                       # ISO8601
+    output_binding: str | None = None  # e.g. "${s2.body}"
+    created_at: str = ""  # ISO8601
     metadata: dict[str, Any] = field(default_factory=dict)
 
 
@@ -66,8 +66,8 @@ class StepTrace:
     id: str
     tool: str
     args_resolved: dict[str, Any] = field(default_factory=dict)
-    output: Any = None                         # set on success
-    error: dict[str, Any] | None = None        # set on failure
+    output: Any = None  # set on success
+    error: dict[str, Any] | None = None  # set on failure
     duration_ms: int = 0
     retries: int = 0
 
@@ -79,7 +79,7 @@ class ExecutionTrace:
     plan_id: str
     success: bool
     steps: list[StepTrace] = field(default_factory=list)
-    output: Any = None                         # plan.output_binding resolved
+    output: Any = None  # plan.output_binding resolved
     failed_step: str | None = None
     total_duration_ms: int = 0
     started_at: str = ""
diff --git a/graph_tool_call/plan/synthesizer.py b/graph_tool_call/plan/synthesizer.py
index 35858c4..4942b7e 100644
--- a/graph_tool_call/plan/synthesizer.py
+++ b/graph_tool_call/plan/synthesizer.py
@@ -58,7 +58,7 @@ class MaxDepthExceededError(PlanSynthesisError):
     """Recursion depth exceeded — likely a misshapen graph."""
 
 
-class DynamicOptionRequired(UnsatisfiableFieldError):
+class DynamicOptionRequired(UnsatisfiableFieldError):  # noqa: N818
     """A required data field has a single-hop producer that can be called
     immediately with the user's entities + context_defaults. Surface this
     so the caller can fetch the option list (instead of weaving a chain)
@@ -137,7 +137,7 @@ class _PartialStep:
     tool: str
     args: dict[str, Any] = field(default_factory=dict)
     rationale: str = ""
-    step_id: str = ""                          # assigned at topological sort
+    step_id: str = ""  # assigned at topological sort
 
 
 class PathSynthesizer:
@@ -236,17 +236,16 @@ def synthesize(
         final_steps: list[PlanStep] = []
         for tool_name in ordered_tools:
             partial = steps_by_tool[tool_name]
-            args = {
-                k: self._rewrite_tool_refs(v, steps_by_tool)
-                for k, v in partial.args.items()
-            }
-            final_steps.append(PlanStep(
-                id=partial.step_id,
-                tool=partial.tool,
-                args=args,
-                rationale=partial.rationale,
-                response_root_keys=self._response_root_keys(tool_name),
-            ))
+            args = {k: self._rewrite_tool_refs(v, steps_by_tool) for k, v in partial.args.items()}
+            final_steps.append(
+                PlanStep(
+                    id=partial.step_id,
+                    tool=partial.tool,
+                    args=args,
+                    rationale=partial.rationale,
+                    response_root_keys=self._response_root_keys(tool_name),
+                )
+            )
 
         target_step_id = steps_by_tool[target].step_id
 
@@ -260,11 +259,13 @@ def synthesize(
         for step in final_steps:
             for arg_name, arg_val in (step.args or {}).items():
                 if isinstance(arg_val, str) and arg_val.startswith("${user_input."):
-                    user_input_slots.append({
-                        "step_id": step.id,
-                        "tool": step.tool,
-                        "field_name": arg_name,
-                    })
+                    user_input_slots.append(
+                        {
+                            "step_id": step.id,
+                            "tool": step.tool,
+                            "field_name": arg_name,
+                        }
+                    )
 
         return Plan(
             id=str(uuid.uuid4()),
@@ -372,8 +373,10 @@ def _resolve(
             #    cycle when an alternative producer exists; only when none
             #    remains does the caller fall through to user-input slot (F2).
             producer = self._find_producer(
-                semantic=semantic, field_name=field_name,
-                target_tool=tool_name, entities=entities,
+                semantic=semantic,
+                field_name=field_name,
+                target_tool=tool_name,
+                entities=entities,
                 excluded=visiting,
             )
             if producer is None:
@@ -403,12 +406,11 @@ def _resolve(
             #     hit and continue. Without this constraint legitimate
             #     search→detail chains turn into popups.
             producer_action = self._producer_action(producer)
-            if (
-                producer_action == "read"
-                and self._is_producer_simple_callable(producer, entities)
-            ):
+            if producer_action == "read" and self._is_producer_simple_callable(producer, entities):
                 opt_path = self._produces_path_for(
-                    producer, semantic=semantic, field_name=field_name,
+                    producer,
+                    semantic=semantic,
+                    field_name=field_name,
                 )
                 if opt_path and "[*]" in opt_path:
                     raise DynamicOptionRequired(
@@ -510,9 +512,7 @@ def _build_producer_indexes(self) -> None:
 
     # ---- graphify edge indexing & traversal ---------------------------------
 
-    _WORKFLOW_RELATIONS: frozenset[str] = frozenset(
-        {"requires", "precedes", "complementary"}
-    )
+    _WORKFLOW_RELATIONS: frozenset[str] = frozenset({"requires", "precedes", "complementary"})
     _CONFIDENCE_RANK: dict[str, int] = {
         "EXTRACTED": 0,
         "INFERRED": 1,
@@ -537,18 +537,19 @@ def _index_workflow_edges(self, graph: dict[str, Any]) -> None:
             tgt = e.get("target") or e.get("to")
             rel = e.get("relation")
             rel_str = (
-                rel.value if hasattr(rel, "value")
-                else str(rel) if rel is not None else ""
+                rel.value if hasattr(rel, "value") else str(rel) if rel is not None else ""
             ).lower()
             if not src or not tgt or rel_str not in self._WORKFLOW_RELATIONS:
                 continue
-            self._workflow_edges_out.setdefault(src, []).append({
-                "target": tgt,
-                "relation": rel_str,
-                "confidence": e.get("confidence"),
-                "conf_score": float(e.get("conf_score") or 0.0),
-                "evidence": e.get("evidence") or "",
-            })
+            self._workflow_edges_out.setdefault(src, []).append(
+                {
+                    "target": tgt,
+                    "relation": rel_str,
+                    "confidence": e.get("confidence"),
+                    "conf_score": float(e.get("conf_score") or 0.0),
+                    "evidence": e.get("evidence") or "",
+                }
+            )
 
     # Producer-signal score weights. Higher = stronger signal that this
     # candidate genuinely produces the value the target needs. Weights chosen
@@ -695,7 +696,9 @@ def _score(signals: set[str]) -> int:
         # Cycle filter: skip candidates currently in the resolution stack so
         # the synthesiser reroutes around the cycle instead of raising.
         ranked = self._rank_producers(
-            sorted_names, target_tool=target_tool, entities=entities,
+            sorted_names,
+            target_tool=target_tool,
+            entities=entities,
         )
         for cand in ranked:
             if cand in excluded:
@@ -845,7 +848,7 @@ def _is_chain_eligible(self, producer_name: str, *, target_tool: str) -> bool:
             if "_" in t_resource:
                 related.add(t_resource.split("_", 1)[0])
 
-        for c in (t_meta_full.get("consumes") or []):
+        for c in t_meta_full.get("consumes") or []:
             if not isinstance(c, dict):
                 continue
             sem = str(c.get("semantic_tag") or "").strip().lower()
@@ -897,7 +900,7 @@ def _score(name: str) -> tuple[int, int, int]:
             ai = meta.get("ai_metadata") or {}
 
             affinity = 0
-            for c in (meta.get("consumes") or []):
+            for c in meta.get("consumes") or []:
                 tag = c.get("semantic_tag") or ""
                 fname = c.get("field_name") or ""
                 if (tag and tag in entity_keys) or (fname and fname in entity_keys):
diff --git a/graph_tool_call/retrieval/engine.py b/graph_tool_call/retrieval/engine.py
index 73e1b1d..c785912 100644
--- a/graph_tool_call/retrieval/engine.py
+++ b/graph_tool_call/retrieval/engine.py
@@ -455,7 +455,8 @@ def _inject_graph_candidates(
 
         # Only consider graph candidates not already found by primary channels
         new_candidates = {
-            name: score for name, score in graph_scores.items()
+            name: score
+            for name, score in graph_scores.items()
             if name not in final_scores and name in self._tools
         }
         if not new_candidates:
@@ -497,7 +498,6 @@ def _boost_method_intent(self, query_intent: Any, scores: dict[str, float]) -> N
             elif query_intent.delete_intent > 0.5 and method == "DELETE":
                 scores[name] *= 1.15
 
-
     def _boost_embedding_rerank(self, query: str, scores: dict[str, float]) -> None:
         """Rerank top candidates using embedding description similarity."""
         if self._embedding_index is None or self._embedding_index._provider is None:
@@ -809,8 +809,12 @@ async def aretrieve(
         return await loop.run_in_executor(
             None,
             lambda: self.retrieve(
-                query, top_k=top_k, max_graph_depth=max_graph_depth,
-                mode=mode, llm=llm, history=history,
+                query,
+                top_k=top_k,
+                max_graph_depth=max_graph_depth,
+                mode=mode,
+                llm=llm,
+                history=history,
             ),
         )
 
@@ -830,8 +834,12 @@ async def aretrieve_with_scores(
         return await loop.run_in_executor(
             None,
             lambda: self.retrieve_with_scores(
-                query, top_k=top_k, max_graph_depth=max_graph_depth,
-                mode=mode, llm=llm, history=history,
+                query,
+                top_k=top_k,
+                max_graph_depth=max_graph_depth,
+                mode=mode,
+                llm=llm,
+                history=history,
             ),
         )
 
diff --git a/graph_tool_call/retrieval/graph_search.py b/graph_tool_call/retrieval/graph_search.py
index cefe1de..1ecc3e9 100644
--- a/graph_tool_call/retrieval/graph_search.py
+++ b/graph_tool_call/retrieval/graph_search.py
@@ -121,8 +121,25 @@ def resource_first_search(
         cat_index = self._get_category_index()
         query_lower = query.lower()
         query_tokens = set(re.split(r"[\s_\-/.,;:!?()]+", query_lower))
-        query_tokens -= {"a", "an", "the", "of", "for", "to", "in", "by", "is", "and", "or", "my",
-                         "all", "this", "that", "with", "from"}
+        query_tokens -= {
+            "a",
+            "an",
+            "the",
+            "of",
+            "for",
+            "to",
+            "in",
+            "by",
+            "is",
+            "and",
+            "or",
+            "my",
+            "all",
+            "this",
+            "that",
+            "with",
+            "from",
+        }
         query_tokens.discard("")
 
         if not query_tokens:
@@ -188,9 +205,7 @@ def resource_first_search(
         return dict(ranked[:max_results])
 
     @staticmethod
-    def _compute_intent_boost(
-        intent: Any | None, tool_node: str, tools: dict | None
-    ) -> float:
+    def _compute_intent_boost(intent: Any | None, tool_node: str, tools: dict | None) -> float:
         """Score boost based on query intent vs tool's HTTP method/name."""
         if not intent or intent.is_neutral or not tools:
             return 1.0
@@ -206,18 +221,35 @@ def _compute_intent_boost(
         if intent.write_intent > 0.5:
             if method in ("POST", "PUT", "PATCH"):
                 boost = 1.8
-            for verb in ("create", "add", "set", "update", "enable",
-                         "register", "upload", "submit", "request",
-                         "fork", "star", "follow", "lock", "merge",
-                         "close", "open", "transfer", "approve",
-                         "checkout", "cancel", "clear"):
+            for verb in (
+                "create",
+                "add",
+                "set",
+                "update",
+                "enable",
+                "register",
+                "upload",
+                "submit",
+                "request",
+                "fork",
+                "star",
+                "follow",
+                "lock",
+                "merge",
+                "close",
+                "open",
+                "transfer",
+                "approve",
+                "checkout",
+                "cancel",
+                "clear",
+            ):
                 if verb in name_lower:
                     boost = max(boost, 1.5)
         elif intent.read_intent > 0.5:
             if method == "GET":
                 boost = 1.5
-            for verb in ("get", "list", "check", "download", "search",
-                         "validate", "calculate"):
+            for verb in ("get", "list", "check", "download", "search", "validate", "calculate"):
                 if verb in name_lower:
                     boost = max(boost, 1.3)
         elif intent.delete_intent > 0.5:
@@ -230,9 +262,7 @@ def _compute_intent_boost(
         return boost
 
     @staticmethod
-    def _compute_desc_boost(
-        query_tokens: set[str], tool_node: str, tools: dict | None
-    ) -> float:
+    def _compute_desc_boost(query_tokens: set[str], tool_node: str, tools: dict | None) -> float:
         """Boost tools whose description contains query keywords."""
         if not tools:
             return 1.0
@@ -299,9 +329,7 @@ def _expand_chains(
                     # Decayed score: prerequisites get 60% at depth 1, 36% at depth 2
                     decay = 0.6 ** (depth + 1)
                     chain_score = base_score * decay
-                    chain_scores[neighbor] = max(
-                        chain_scores.get(neighbor, 0), chain_score
-                    )
+                    chain_scores[neighbor] = max(chain_scores.get(neighbor, 0), chain_score)
                     queue.append((neighbor, depth + 1))
 
         return chain_scores
diff --git a/graph_tool_call/serialization.py b/graph_tool_call/serialization.py
index cac1c00..81e56b6 100644
--- a/graph_tool_call/serialization.py
+++ b/graph_tool_call/serialization.py
@@ -52,7 +52,10 @@ def save_graph(
     path = Path(path)
     try:
         path.parent.mkdir(parents=True, exist_ok=True)
-        path.write_text(json.dumps(data, indent=2, ensure_ascii=False, default=str), encoding="utf-8")
+        path.write_text(
+            json.dumps(data, indent=2, ensure_ascii=False, default=str),
+            encoding="utf-8",
+        )
     except PermissionError:
         msg = f"Permission denied: {path}. Check directory permissions."
         raise PermissionError(msg) from None
diff --git a/graph_tool_call/tool_graph.py b/graph_tool_call/tool_graph.py
index 00c2353..e415368 100644
--- a/graph_tool_call/tool_graph.py
+++ b/graph_tool_call/tool_graph.py
@@ -1282,7 +1282,8 @@ def list_sources(self) -> list[str]:
     def tools_by_source(self, source_label: str) -> list[ToolSchema]:
         """Return all tools tagged with the given ``source_label``."""
         return [
-            t for t in self._tools.values()
+            t
+            for t in self._tools.values()
             if t.metadata and t.metadata.get("source_label") == source_label
         ]
 
@@ -1631,17 +1632,21 @@ def call_tool(tool_name: str, arguments: dict[str, Any] | None = None) -> str:
             """
             schema = graph_ref._tools.get(tool_name)
             if schema is None:
-                return json.dumps({
-                    "error": f"Tool '{tool_name}' not found.",
-                    "hint": "Use search_tools to find the correct tool name.",
-                })
+                return json.dumps(
+                    {
+                        "error": f"Tool '{tool_name}' not found.",
+                        "hint": "Use search_tools to find the correct tool name.",
+                    }
+                )
 
             callable_ = schema.get_callable()
             if callable_ is None:
-                return json.dumps({
-                    "error": f"Tool '{tool_name}' is not callable.",
-                    "hint": "This tool was registered without a callable implementation.",
-                })
+                return json.dumps(
+                    {
+                        "error": f"Tool '{tool_name}' is not callable.",
+                        "hint": "This tool was registered without a callable implementation.",
+                    }
+                )
 
             args: dict[str, Any] = {}
             if arguments is not None:
@@ -1665,10 +1670,12 @@ def call_tool(tool_name: str, arguments: dict[str, Any] | None = None) -> str:
                     return result
                 return json.dumps(result, ensure_ascii=False, default=str)
             except Exception as e:
-                return json.dumps({
-                    "error": str(e),
-                    "tool_name": tool_name,
-                })
+                return json.dumps(
+                    {
+                        "error": str(e),
+                        "tool_name": tool_name,
+                    }
+                )
 
         return [search_tools, call_tool]
 
diff --git a/graph_tool_call/workflow.py b/graph_tool_call/workflow.py
index 1e64ca2..c440a65 100644
--- a/graph_tool_call/workflow.py
+++ b/graph_tool_call/workflow.py
@@ -25,7 +25,7 @@
 import json
 import re
 from collections import defaultdict, deque
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -118,9 +118,7 @@ def reorder(self, tool_names: list[str]) -> WorkflowPlan:
         self.confidence = "manual"
         return self
 
-    def set_param_mapping(
-        self, tool_name: str, param: str, source: str
-    ) -> WorkflowPlan:
+    def set_param_mapping(self, tool_name: str, param: str, source: str) -> WorkflowPlan:
         """Set a parameter mapping for a step.
 
         Example::
@@ -162,7 +160,6 @@ def open_editor(self, tools: dict[str, ToolSchema] | None = None) -> None:
             plan.open_editor(tools=tg.tools)
         """
         import tempfile
-        import urllib.parse
         import webbrowser
 
         # Build editor data: workflow + tool catalog
@@ -208,9 +205,7 @@ def open_editor(self, tools: dict[str, ToolSchema] | None = None) -> None:
             webbrowser.open(f"file://{f.name}")
 
     @classmethod
-    def load(
-        cls, path: str | Path, *, tools: dict[str, ToolSchema]
-    ) -> WorkflowPlan:
+    def load(cls, path: str | Path, *, tools: dict[str, ToolSchema]) -> WorkflowPlan:
         """Load workflow from JSON file."""
         data = json.loads(Path(path).read_text(encoding="utf-8"))
         steps = []
@@ -218,12 +213,14 @@ def load(
             tool = tools.get(s["tool"])
             if not tool:
                 continue
-            steps.append(WorkflowStep(
-                order=s.get("order", 0),
-                tool=tool,
-                reason=s.get("reason", ""),
-                params_from=s.get("params_from", {}),
-            ))
+            steps.append(
+                WorkflowStep(
+                    order=s.get("order", 0),
+                    tool=tool,
+                    reason=s.get("reason", ""),
+                    params_from=s.get("params_from", {}),
+                )
+            )
         plan = cls(
             goal=data.get("goal", ""),
             steps=steps,
@@ -300,8 +297,25 @@ def plan(
     def _pick_primary(self, goal: str, scores: dict[str, float]) -> str:
         """Pick the best primary tool by combining graph score + name relevance."""
         tokens = set(re.split(r"[\s_\-/.,;:!?()]+", goal.lower()))
-        tokens -= {"a", "an", "the", "of", "for", "to", "in", "by", "is",
-                    "and", "or", "my", "all", "this", "that", "with", "from"}
+        tokens -= {
+            "a",
+            "an",
+            "the",
+            "of",
+            "for",
+            "to",
+            "in",
+            "by",
+            "is",
+            "and",
+            "or",
+            "my",
+            "all",
+            "this",
+            "that",
+            "with",
+            "from",
+        }
         tokens.discard("")
 
         def _relevance(name: str) -> float:
@@ -338,9 +352,7 @@ def _name_match(self, goal: str) -> dict[str, float]:
                 scores[name] = overlap
         return scores
 
-    def _build_chain(
-        self, target: str, max_steps: int
-    ) -> dict[str, set[str]]:
+    def _build_chain(self, target: str, max_steps: int) -> dict[str, set[str]]:
         """Build a prerequisite chain for the target tool.
 
         Follows REQUIRES edges to find data providers:
@@ -356,8 +368,6 @@ def _build_chain(
         if not self._graph.has_node(target):
             return dict(predecessors)
 
-        target_category = self._get_category(target)
-
         # BFS with max depth 2 — follow REQUIRES to find prerequisites
         visited: set[str] = {target}
         queue: deque[tuple[str, int]] = deque([(target, 0)])
@@ -393,9 +403,7 @@ def _build_chain(
 
                 if "REQUIRES" in relation and src == node:
                     # Accept GET as data provider (same or cross-resource)
-                    if n_method == "GET" or any(
-                        v in neighbor.lower() for v in ("get", "list")
-                    ):
+                    if n_method == "GET" or any(v in neighbor.lower() for v in ("get", "list")):
                         accepted = True
 
                 elif "PRECEDES" in relation and tgt == node:
@@ -403,8 +411,7 @@ def _build_chain(
                     neighbor_category = self._get_category(neighbor)
                     node_category = self._get_category(node)
                     same_cat = (
-                        node_category and neighbor_category
-                        and node_category == neighbor_category
+                        node_category and neighbor_category and node_category == neighbor_category
                     )
                     is_creator = n_method == "POST" or any(
                         v in neighbor.lower() for v in ("create", "add")
@@ -422,7 +429,7 @@ def _build_chain(
         # Trim to max_steps
         if len(predecessors) > max_steps:
             # Keep target + closest prerequisites
-            direct_preds = list(predecessors[target])[:max_steps - 1]
+            direct_preds = list(predecessors[target])[: max_steps - 1]
             trimmed: dict[str, set[str]] = {target: set(direct_preds)}
             for p in direct_preds:
                 trimmed[p] = predecessors.get(p, set()) & set(direct_preds)
@@ -460,9 +467,7 @@ def _topo_sort(self, predecessors: dict[str, set[str]]) -> list[str]:
                 result.append(n)
         return result
 
-    def _infer_reason(
-        self, tool_name: str, primary: str, chain: dict[str, set[str]]
-    ) -> str:
+    def _infer_reason(self, tool_name: str, primary: str, chain: dict[str, set[str]]) -> str:
         if tool_name == primary:
             return "primary action"
         dependents = [n for n, p in chain.items() if tool_name in p]
@@ -470,9 +475,7 @@ def _infer_reason(
             return f"prerequisite for {', '.join(dependents)}"
         return "related"
 
-    def _enhance_with_llm(
-        self, plan: WorkflowPlan, llm: Any, max_steps: int
-    ) -> WorkflowPlan:
+    def _enhance_with_llm(self, plan: WorkflowPlan, llm: Any, max_steps: int) -> WorkflowPlan:
         """Use LLM to fill cross-resource gaps and add parameter mappings."""
         current_chain = [s.tool.name for s in plan.steps]
         available = []
@@ -498,7 +501,8 @@ def _enhance_with_llm(
 {chr(10).join(available[:60])}
 
 Return JSON:
-{{"steps": [{{"tool": "name", "reason": "why", "params_from": {{"param": "step.response.field"}}}}]}}
+{{"steps": [{{"tool": "name", "reason": "why",
+  "params_from": {{"param": "step.response.field"}}}}]}}
 
 Rules:
 - Keep existing chain steps unless clearly wrong
@@ -521,11 +525,14 @@ def _enhance_with_llm(
                 tool = self._tools.get(s.get("tool", ""))
                 if not tool:
                     continue
-                new_steps.append(WorkflowStep(
-                    order=i + 1, tool=tool,
-                    reason=s.get("reason", ""),
-                    params_from=s.get("params_from", {}),
-                ))
+                new_steps.append(
+                    WorkflowStep(
+                        order=i + 1,
+                        tool=tool,
+                        reason=s.get("reason", ""),
+                        params_from=s.get("params_from", {}),
+                    )
+                )
             if new_steps:
                 plan.steps = new_steps
                 plan.confidence = "graph+llm"
diff --git a/tests/test_dependency.py b/tests/test_dependency.py
index f9a77de..3c452e9 100644
--- a/tests/test_dependency.py
+++ b/tests/test_dependency.py
@@ -60,16 +60,6 @@ def _find_relation(
 # ---------------------------------------------------------------------------
 
 
-def test_crud_requires():
-    """POST → GET/{id} should produce REQUIRES."""
-    tools = _pet_tools()
-    relations = detect_dependencies(tools)
-    rel = _find_relation(relations, "getPet", "createPet", RelationType.REQUIRES)
-    assert rel is not None, "GET single should REQUIRE POST"
-    assert rel.confidence >= 0.9
-    assert rel.layer == 1
-
-
 def test_crud_complementary():
     """POST and PUT are no longer marked COMPLEMENTARY (removed to reduce noise)."""
     tools = _pet_tools()
diff --git a/tests/test_gateway_e2e.py b/tests/test_gateway_e2e.py
index 15525af..0142d7c 100644
--- a/tests/test_gateway_e2e.py
+++ b/tests/test_gateway_e2e.py
@@ -9,247 +9,294 @@
 import time
 
 from langchain_core.tools import tool
+
 pytest = __import__("pytest")
 ChatOllama = pytest.importorskip("langchain_ollama").ChatOllama
-from langgraph.prebuilt import create_react_agent
-
-from graph_tool_call.langchain.gateway import create_gateway_tools
+from langgraph.prebuilt import create_react_agent  # noqa: E402
 
+from graph_tool_call.langchain.gateway import create_gateway_tools  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # 50 tools (same as test_create_agent_e2e.py)
 # ---------------------------------------------------------------------------
 
+
 @tool
 def create_user(username: str, email: str) -> str:
     """Create a new user account with username and email."""
     return json.dumps({"id": 1, "username": username, "email": email})
 
+
 @tool
 def get_user(user_id: int) -> str:
     """Get user profile by user ID."""
     return json.dumps({"id": user_id, "username": "john", "email": "john@example.com"})
 
+
 @tool
 def update_user(user_id: int, email: str) -> str:
     """Update user profile information."""
     return json.dumps({"id": user_id, "email": email})
 
+
 @tool
 def delete_user(user_id: int) -> str:
     """Delete a user account permanently."""
     return json.dumps({"deleted": True, "id": user_id})
 
+
 @tool
 def list_users(page: int = 1) -> str:
     """List all users with pagination."""
     return json.dumps({"users": [{"id": 1, "username": "john"}], "page": page})
 
+
 @tool
 def search_users(query: str) -> str:
     """Search users by name or email."""
     return json.dumps({"results": [{"id": 1, "username": query}]})
 
+
 @tool
 def reset_password(user_id: int) -> str:
     """Send password reset email to user."""
     return json.dumps({"sent": True, "user_id": user_id})
 
+
 @tool
 def ban_user(user_id: int, reason: str) -> str:
     """Ban a user account with a reason."""
     return json.dumps({"banned": True, "user_id": user_id, "reason": reason})
 
+
 @tool
 def create_order(product_id: int, quantity: int) -> str:
     """Create a new order for a product."""
     return json.dumps({"order_id": 100, "product_id": product_id, "quantity": quantity})
 
+
 @tool
 def get_order(order_id: int) -> str:
     """Get order details by order ID."""
     return json.dumps({"order_id": order_id, "status": "pending", "total": 99.99})
 
+
 @tool
 def cancel_order(order_id: int) -> str:
     """Cancel an existing order."""
     return json.dumps({"order_id": order_id, "status": "cancelled"})
 
+
 @tool
 def list_orders(user_id: int) -> str:
     """List all orders for a user."""
     return json.dumps({"orders": [{"order_id": 100, "status": "pending"}]})
 
+
 @tool
 def update_order_status(order_id: int, status: str) -> str:
     """Update order status (pending, shipped, delivered)."""
     return json.dumps({"order_id": order_id, "status": status})
 
+
 @tool
 def process_refund(order_id: int) -> str:
     """Process a refund for a cancelled order."""
     return json.dumps({"order_id": order_id, "refunded": True, "amount": 99.99})
 
+
 @tool
 def track_shipment(order_id: int) -> str:
     """Track shipment status for an order."""
     return json.dumps({"order_id": order_id, "tracking": "1Z999AA10123456784"})
 
+
 @tool
 def create_product(name: str, price: float) -> str:
     """Create a new product listing."""
     return json.dumps({"product_id": 1, "name": name, "price": price})
 
+
 @tool
 def get_product(product_id: int) -> str:
     """Get product details by product ID."""
     return json.dumps({"product_id": product_id, "name": "Widget", "price": 29.99})
 
+
 @tool
 def update_product(product_id: int, price: float) -> str:
     """Update product price."""
     return json.dumps({"product_id": product_id, "price": price})
 
+
 @tool
 def delete_product(product_id: int) -> str:
     """Delete a product listing."""
     return json.dumps({"deleted": True, "product_id": product_id})
 
+
 @tool
 def list_products(category: str = "all") -> str:
     """List products by category."""
     return json.dumps({"products": [{"id": 1, "name": "Widget", "category": category}]})
 
+
 @tool
 def search_products(query: str) -> str:
     """Search products by name or description."""
     return json.dumps({"results": [{"id": 1, "name": query}]})
 
+
 @tool
 def charge_card(amount: float, card_token: str) -> str:
     """Charge a credit card."""
     return json.dumps({"charge_id": "ch_123", "amount": amount, "status": "succeeded"})
 
+
 @tool
 def get_payment(payment_id: str) -> str:
     """Get payment details."""
     return json.dumps({"payment_id": payment_id, "amount": 99.99, "status": "succeeded"})
 
+
 @tool
 def list_payments(user_id: int) -> str:
     """List payment history for a user."""
     return json.dumps({"payments": [{"id": "ch_123", "amount": 99.99}]})
 
+
 @tool
 def create_invoice(order_id: int) -> str:
     """Generate an invoice for an order."""
     return json.dumps({"invoice_id": "inv_123", "order_id": order_id})
 
+
 @tool
 def send_email(to: str, subject: str, body: str) -> str:
     """Send an email to a recipient."""
     return json.dumps({"sent": True, "to": to, "subject": subject})
 
+
 @tool
 def send_sms(phone: str, message: str) -> str:
     """Send an SMS text message."""
     return json.dumps({"sent": True, "phone": phone})
 
+
 @tool
 def send_push_notification(user_id: int, title: str, message: str) -> str:
     """Send a push notification to a user's device."""
     return json.dumps({"sent": True, "user_id": user_id, "title": title})
 
+
 @tool
 def list_notifications(user_id: int) -> str:
     """List all notifications for a user."""
     return json.dumps({"notifications": [{"id": 1, "title": "Order shipped"}]})
 
+
 @tool
 def upload_file(filename: str, content_type: str) -> str:
     """Upload a file to storage."""
     return json.dumps({"file_id": "f_123", "filename": filename})
 
+
 @tool
 def download_file(file_id: str) -> str:
     """Download a file from storage."""
     return json.dumps({"file_id": file_id, "url": "https://storage.example.com/f_123"})
 
+
 @tool
 def delete_file(file_id: str) -> str:
     """Delete a file from storage."""
     return json.dumps({"deleted": True, "file_id": file_id})
 
+
 @tool
 def list_files(folder: str = "/") -> str:
     """List files in a folder."""
     return json.dumps({"files": [{"id": "f_123", "name": "report.pdf"}]})
 
+
 @tool
 def get_dashboard_stats() -> str:
     """Get overview dashboard statistics."""
     return json.dumps({"total_users": 1000, "total_orders": 5000, "revenue": 150000})
 
+
 @tool
 def get_sales_report(start_date: str, end_date: str) -> str:
     """Generate sales report for a date range."""
     return json.dumps({"start": start_date, "end": end_date, "total": 50000})
 
+
 @tool
 def get_user_activity(user_id: int) -> str:
     """Get activity log for a user."""
     return json.dumps({"user_id": user_id, "actions": ["login", "view_product", "checkout"]})
 
+
 @tool
 def get_conversion_rate(period: str = "monthly") -> str:
     """Get conversion rate analytics."""
     return json.dumps({"period": period, "rate": 0.032})
 
+
 @tool
 def get_weather(city: str) -> str:
     """Get current weather for a city."""
     return json.dumps({"city": city, "temp": 22, "condition": "sunny"})
 
+
 @tool
 def get_forecast(city: str, days: int = 7) -> str:
     """Get weather forecast for next N days."""
     return json.dumps({"city": city, "days": days, "forecast": [{"day": 1, "temp": 22}]})
 
+
 @tool
 def create_event(title: str, date: str) -> str:
     """Create a calendar event."""
     return json.dumps({"event_id": "e_123", "title": title, "date": date})
 
+
 @tool
 def list_events(date: str) -> str:
     """List calendar events for a date."""
     return json.dumps({"events": [{"id": "e_123", "title": "Meeting"}]})
 
+
 @tool
 def delete_event(event_id: str) -> str:
     """Delete a calendar event."""
     return json.dumps({"deleted": True, "event_id": event_id})
 
+
 @tool
 def get_settings() -> str:
     """Get current application settings."""
     return json.dumps({"theme": "dark", "language": "en", "notifications": True})
 
+
 @tool
 def update_settings(key: str, value: str) -> str:
     """Update an application setting."""
     return json.dumps({"key": key, "value": value, "updated": True})
 
+
 @tool
 def translate_text(text: str, target_lang: str) -> str:
     """Translate text to a target language."""
     return json.dumps({"original": text, "translated": f"[{target_lang}] {text}"})
 
+
 @tool
 def generate_report(report_type: str) -> str:
     """Generate a system report (daily, weekly, monthly)."""
     return json.dumps({"type": report_type, "generated": True})
 
+
 @tool
 def health_check() -> str:
     """Check system health status."""
@@ -257,20 +304,53 @@ def health_check() -> str:
 
 
 ALL_TOOLS = [
-    create_user, get_user, update_user, delete_user, list_users,
-    search_users, reset_password, ban_user,
-    create_order, get_order, cancel_order, list_orders,
-    update_order_status, process_refund, track_shipment,
-    create_product, get_product, update_product, delete_product,
-    list_products, search_products,
-    charge_card, get_payment, list_payments, create_invoice,
-    send_email, send_sms, send_push_notification, list_notifications,
-    upload_file, download_file, delete_file, list_files,
-    get_dashboard_stats, get_sales_report, get_user_activity, get_conversion_rate,
-    get_weather, get_forecast,
-    create_event, list_events, delete_event,
-    get_settings, update_settings,
-    translate_text, generate_report, health_check,
+    create_user,
+    get_user,
+    update_user,
+    delete_user,
+    list_users,
+    search_users,
+    reset_password,
+    ban_user,
+    create_order,
+    get_order,
+    cancel_order,
+    list_orders,
+    update_order_status,
+    process_refund,
+    track_shipment,
+    create_product,
+    get_product,
+    update_product,
+    delete_product,
+    list_products,
+    search_products,
+    charge_card,
+    get_payment,
+    list_payments,
+    create_invoice,
+    send_email,
+    send_sms,
+    send_push_notification,
+    list_notifications,
+    upload_file,
+    download_file,
+    delete_file,
+    list_files,
+    get_dashboard_stats,
+    get_sales_report,
+    get_user_activity,
+    get_conversion_rate,
+    get_weather,
+    get_forecast,
+    create_event,
+    list_events,
+    delete_event,
+    get_settings,
+    update_settings,
+    translate_text,
+    generate_report,
+    health_check,
 ]
 
 
@@ -309,7 +389,7 @@ def health_check() -> str:
 
 def main():
     print(f"Total tools: {len(ALL_TOOLS)}")
-    print(f"Gateway tool 2개로 변환 → LLM이 search_tools + call_tool 사용")
+    print("Gateway tool 2개로 변환 → LLM이 search_tools + call_tool 사용")
     print("=" * 70)
 
     llm = ChatOllama(model="qwen3.5:4b", temperature=0)
@@ -374,8 +454,8 @@ def main():
             print(f"  [ERROR] {e}")
 
     print(f"\n{'=' * 70}")
-    print(f"RESULT: {passed}/{total} ({passed/total*100:.0f}%)")
-    print(f"  - LLM에 노출된 tool 수: 2 (search_tools, call_tool)")
+    print(f"RESULT: {passed}/{total} ({passed / total * 100:.0f}%)")
+    print("  - LLM에 노출된 tool 수: 2 (search_tools, call_tool)")
     print(f"  - 실제 backend tool 수: {len(ALL_TOOLS)}")
     print("=" * 70)
 
diff --git a/tests/test_gateway_token_saving.py b/tests/test_gateway_token_saving.py
index 5703545..ef9564f 100644
--- a/tests/test_gateway_token_saving.py
+++ b/tests/test_gateway_token_saving.py
@@ -9,244 +9,291 @@
 import json
 
 from langchain_core.tools import tool
+
 pytest = __import__("pytest")
 ChatOllama = pytest.importorskip("langchain_ollama").ChatOllama
 
-from graph_tool_call.langchain.gateway import create_gateway_tools
-
+from graph_tool_call.langchain.gateway import create_gateway_tools  # noqa: E402
 
 # --- Same 47 tools as e2e test ---
 
+
 @tool
 def create_user(username: str, email: str) -> str:
     """Create a new user account with username and email."""
     return json.dumps({"id": 1, "username": username, "email": email})
 
+
 @tool
 def get_user(user_id: int) -> str:
     """Get user profile by user ID."""
     return json.dumps({"id": user_id, "username": "john"})
 
+
 @tool
 def update_user(user_id: int, email: str) -> str:
     """Update user profile information."""
     return json.dumps({"id": user_id, "email": email})
 
+
 @tool
 def delete_user(user_id: int) -> str:
     """Delete a user account permanently."""
     return json.dumps({"deleted": True, "id": user_id})
 
+
 @tool
 def list_users(page: int = 1) -> str:
     """List all users with pagination."""
     return json.dumps({"users": [{"id": 1}], "page": page})
 
+
 @tool
 def search_users(query: str) -> str:
     """Search users by name or email."""
     return json.dumps({"results": [{"id": 1, "username": query}]})
 
+
 @tool
 def reset_password(user_id: int) -> str:
     """Send password reset email to user."""
     return json.dumps({"sent": True, "user_id": user_id})
 
+
 @tool
 def ban_user(user_id: int, reason: str) -> str:
     """Ban a user account with a reason."""
     return json.dumps({"banned": True, "user_id": user_id})
 
+
 @tool
 def create_order(product_id: int, quantity: int) -> str:
     """Create a new order for a product."""
     return json.dumps({"order_id": 100, "product_id": product_id})
 
+
 @tool
 def get_order(order_id: int) -> str:
     """Get order details by order ID."""
     return json.dumps({"order_id": order_id, "status": "pending"})
 
+
 @tool
 def cancel_order(order_id: int) -> str:
     """Cancel an existing order."""
     return json.dumps({"order_id": order_id, "status": "cancelled"})
 
+
 @tool
 def list_orders(user_id: int) -> str:
     """List all orders for a user."""
     return json.dumps({"orders": [{"order_id": 100}]})
 
+
 @tool
 def update_order_status(order_id: int, status: str) -> str:
     """Update order status (pending, shipped, delivered)."""
     return json.dumps({"order_id": order_id, "status": status})
 
+
 @tool
 def process_refund(order_id: int) -> str:
     """Process a refund for a cancelled order."""
     return json.dumps({"order_id": order_id, "refunded": True})
 
+
 @tool
 def track_shipment(order_id: int) -> str:
     """Track shipment status for an order."""
     return json.dumps({"order_id": order_id, "tracking": "1Z999"})
 
+
 @tool
 def create_product(name: str, price: float) -> str:
     """Create a new product listing."""
     return json.dumps({"product_id": 1, "name": name, "price": price})
 
+
 @tool
 def get_product(product_id: int) -> str:
     """Get product details by product ID."""
     return json.dumps({"product_id": product_id, "name": "Widget"})
 
+
 @tool
 def update_product(product_id: int, price: float) -> str:
     """Update product price."""
     return json.dumps({"product_id": product_id, "price": price})
 
+
 @tool
 def delete_product(product_id: int) -> str:
     """Delete a product listing."""
     return json.dumps({"deleted": True, "product_id": product_id})
 
+
 @tool
 def list_products(category: str = "all") -> str:
     """List products by category."""
     return json.dumps({"products": [{"id": 1, "category": category}]})
 
+
 @tool
 def search_products(query: str) -> str:
     """Search products by name or description."""
     return json.dumps({"results": [{"id": 1, "name": query}]})
 
+
 @tool
 def charge_card(amount: float, card_token: str) -> str:
     """Charge a credit card."""
     return json.dumps({"charge_id": "ch_123", "amount": amount})
 
+
 @tool
 def get_payment(payment_id: str) -> str:
     """Get payment details."""
     return json.dumps({"payment_id": payment_id, "amount": 99.99})
 
+
 @tool
 def list_payments(user_id: int) -> str:
     """List payment history for a user."""
     return json.dumps({"payments": [{"id": "ch_123"}]})
 
+
 @tool
 def create_invoice(order_id: int) -> str:
     """Generate an invoice for an order."""
     return json.dumps({"invoice_id": "inv_123", "order_id": order_id})
 
+
 @tool
 def send_email(to: str, subject: str, body: str) -> str:
     """Send an email to a recipient."""
     return json.dumps({"sent": True, "to": to, "subject": subject})
 
+
 @tool
 def send_sms(phone: str, message: str) -> str:
     """Send an SMS text message."""
     return json.dumps({"sent": True, "phone": phone})
 
+
 @tool
 def send_push_notification(user_id: int, title: str, message: str) -> str:
     """Send a push notification to a user's device."""
     return json.dumps({"sent": True, "user_id": user_id})
 
+
 @tool
 def list_notifications(user_id: int) -> str:
     """List all notifications for a user."""
     return json.dumps({"notifications": [{"id": 1}]})
 
+
 @tool
 def upload_file(filename: str, content_type: str) -> str:
     """Upload a file to storage."""
     return json.dumps({"file_id": "f_123", "filename": filename})
 
+
 @tool
 def download_file(file_id: str) -> str:
     """Download a file from storage."""
     return json.dumps({"file_id": file_id, "url": "https://example.com/f"})
 
+
 @tool
 def delete_file(file_id: str) -> str:
     """Delete a file from storage."""
     return json.dumps({"deleted": True, "file_id": file_id})
 
+
 @tool
 def list_files(folder: str = "/") -> str:
     """List files in a folder."""
     return json.dumps({"files": [{"id": "f_123", "name": "report.pdf"}]})
 
+
 @tool
 def get_dashboard_stats() -> str:
     """Get overview dashboard statistics."""
     return json.dumps({"total_users": 1000, "revenue": 150000})
 
+
 @tool
 def get_sales_report(start_date: str, end_date: str) -> str:
     """Generate sales report for a date range."""
     return json.dumps({"start": start_date, "end": end_date, "total": 50000})
 
+
 @tool
 def get_user_activity(user_id: int) -> str:
     """Get activity log for a user."""
     return json.dumps({"user_id": user_id, "actions": ["login"]})
 
+
 @tool
 def get_conversion_rate(period: str = "monthly") -> str:
     """Get conversion rate analytics."""
     return json.dumps({"period": period, "rate": 0.032})
 
+
 @tool
 def get_weather(city: str) -> str:
     """Get current weather for a city."""
     return json.dumps({"city": city, "temp": 22, "condition": "sunny"})
 
+
 @tool
 def get_forecast(city: str, days: int = 7) -> str:
     """Get weather forecast for next N days."""
     return json.dumps({"city": city, "days": days})
 
+
 @tool
 def create_event(title: str, date: str) -> str:
     """Create a calendar event."""
     return json.dumps({"event_id": "e_123", "title": title, "date": date})
 
+
 @tool
 def list_events(date: str) -> str:
     """List calendar events for a date."""
     return json.dumps({"events": [{"id": "e_123", "title": "Meeting"}]})
 
+
 @tool
 def delete_event(event_id: str) -> str:
     """Delete a calendar event."""
     return json.dumps({"deleted": True, "event_id": event_id})
 
+
 @tool
 def get_settings() -> str:
     """Get current application settings."""
     return json.dumps({"theme": "dark", "language": "en"})
 
+
 @tool
 def update_settings(key: str, value: str) -> str:
     """Update an application setting."""
     return json.dumps({"key": key, "value": value, "updated": True})
 
+
 @tool
 def translate_text(text: str, target_lang: str) -> str:
     """Translate text to a target language."""
     return json.dumps({"original": text, "translated": f"[{target_lang}] {text}"})
 
+
 @tool
 def generate_report(report_type: str) -> str:
     """Generate a system report (daily, weekly, monthly)."""
     return json.dumps({"type": report_type, "generated": True})
 
+
 @tool
 def health_check() -> str:
     """Check system health status."""
@@ -254,20 +301,53 @@ def health_check() -> str:
 
 
 ALL_TOOLS = [
-    create_user, get_user, update_user, delete_user, list_users,
-    search_users, reset_password, ban_user,
-    create_order, get_order, cancel_order, list_orders,
-    update_order_status, process_refund, track_shipment,
-    create_product, get_product, update_product, delete_product,
-    list_products, search_products,
-    charge_card, get_payment, list_payments, create_invoice,
-    send_email, send_sms, send_push_notification, list_notifications,
-    upload_file, download_file, delete_file, list_files,
-    get_dashboard_stats, get_sales_report, get_user_activity, get_conversion_rate,
-    get_weather, get_forecast,
-    create_event, list_events, delete_event,
-    get_settings, update_settings,
-    translate_text, generate_report, health_check,
+    create_user,
+    get_user,
+    update_user,
+    delete_user,
+    list_users,
+    search_users,
+    reset_password,
+    ban_user,
+    create_order,
+    get_order,
+    cancel_order,
+    list_orders,
+    update_order_status,
+    process_refund,
+    track_shipment,
+    create_product,
+    get_product,
+    update_product,
+    delete_product,
+    list_products,
+    search_products,
+    charge_card,
+    get_payment,
+    list_payments,
+    create_invoice,
+    send_email,
+    send_sms,
+    send_push_notification,
+    list_notifications,
+    upload_file,
+    download_file,
+    delete_file,
+    list_files,
+    get_dashboard_stats,
+    get_sales_report,
+    get_user_activity,
+    get_conversion_rate,
+    get_weather,
+    get_forecast,
+    create_event,
+    list_events,
+    delete_event,
+    get_settings,
+    update_settings,
+    translate_text,
+    generate_report,
+    health_check,
 ]
 
 
@@ -280,7 +360,7 @@ def _count_tool_schema_chars(tools: list) -> int:
             "function": {
                 "name": t.name,
                 "description": t.description,
-            }
+            },
         }
         if hasattr(t, "args_schema") and t.args_schema:
             try:
@@ -316,7 +396,7 @@ def main():
     print(f"    Tool calls: {[tc['name'] for tc in (result_all.tool_calls or [])]}")
 
     # --- Method 2: Gateway 2 tools ---
-    print(f"\n[2] Gateway 2 tools bound to LLM")
+    print("\n[2] Gateway 2 tools bound to LLM")
     gateway = create_gateway_tools(ALL_TOOLS, top_k=10)
     llm_gw = llm.bind_tools(gateway)
     gw_chars = _count_tool_schema_chars(gateway)
@@ -333,12 +413,20 @@ def main():
     # --- Comparison ---
     print(f"\n{'=' * 70}")
     print("COMPARISON")
-    print(f"  Tool schema: {all_chars:,} → {gw_chars:,} chars ({(1 - gw_chars/all_chars)*100:.0f}% reduction)")
-    print(f"  Estimated tokens: ~{all_tokens_est:,} → ~{gw_tokens_est:,} ({(1 - gw_tokens_est/all_tokens_est)*100:.0f}% reduction)")
+    char_reduction = (1 - gw_chars / all_chars) * 100
+    print(f"  Tool schema: {all_chars:,} → {gw_chars:,} chars ({char_reduction:.0f}% reduction)")
+    token_reduction = (1 - gw_tokens_est / all_tokens_est) * 100
+    print(
+        f"  Estimated tokens: ~{all_tokens_est:,} → ~{gw_tokens_est:,} "
+        f"({token_reduction:.0f}% reduction)"
+    )
 
     if isinstance(prompt_all, int) and isinstance(prompt_gw, int):
         actual_reduction = (1 - prompt_gw / prompt_all) * 100
-        print(f"  Actual prompt_tokens: {prompt_all:,} → {prompt_gw:,} ({actual_reduction:.0f}% reduction)")
+        print(
+            f"  Actual prompt_tokens: {prompt_all:,} → {prompt_gw:,} "
+            f"({actual_reduction:.0f}% reduction)"
+        )
         saved = prompt_all - prompt_gw
         print(f"  Tokens saved per turn: {saved:,}")
     else:
diff --git a/tests/test_gateway_xgen_workflow.py b/tests/test_gateway_xgen_workflow.py
index eaa0e12..59fa02e 100644
--- a/tests/test_gateway_xgen_workflow.py
+++ b/tests/test_gateway_xgen_workflow.py
@@ -16,42 +16,50 @@
 import time
 
 from langchain_core.tools import tool
+
 pytest = __import__("pytest")
 ChatOllama = pytest.importorskip("langchain_ollama").ChatOllama
-from langgraph.prebuilt import create_react_agent
-
-from graph_tool_call.langchain.gateway import create_gateway_tools
+from langgraph.prebuilt import create_react_agent  # noqa: E402
 
+from graph_tool_call.langchain.gateway import create_gateway_tools  # noqa: E402
 
 # ===================================================================
 # Slack MCP Tools (6)
 # ===================================================================
 
+
 @tool
 def slack_get_channel_id(channel_name: str) -> str:
     """Get the ID of a Slack channel by name."""
     return json.dumps({"channel_id": "C01234", "name": channel_name})
 
+
 @tool
 def slack_send_message(channel_id: str, message: str) -> str:
     """Send a message to a Slack channel."""
     return json.dumps({"ok": True, "channel": channel_id, "ts": "1234567890.123456"})
 
+
 @tool
 def slack_list_channels() -> str:
     """List all Slack channels in the workspace."""
-    return json.dumps({"channels": [{"id": "C01", "name": "general"}, {"id": "C02", "name": "dev"}]})
+    return json.dumps(
+        {"channels": [{"id": "C01", "name": "general"}, {"id": "C02", "name": "dev"}]}
+    )
+
 
 @tool
 def slack_list_users() -> str:
     """List all users in the Slack workspace."""
     return json.dumps({"users": [{"id": "U01", "name": "alice"}, {"id": "U02", "name": "bob"}]})
 
+
 @tool
 def slack_search_conversations(query: str) -> str:
     """Search Slack conversations by keyword."""
     return json.dumps({"messages": [{"text": f"Found: {query}", "channel": "C01"}]})
 
+
 @tool
 def slack_get_message_link(channel_id: str, message_ts: str) -> str:
     """Get a permalink to a specific Slack message."""
@@ -62,141 +70,185 @@ def slack_get_message_link(channel_id: str, message_ts: str) -> str:
 # GitHub MCP Tools (8)
 # ===================================================================
 
+
 @tool
 def github_get_file(path: str, repo: str = "main-repo") -> str:
     """Get the contents of a file from a GitHub repository."""
     return json.dumps({"path": path, "content": "file content here", "sha": "abc123"})
 
+
 @tool
 def github_get_issues(repo: str, state: str = "open") -> str:
     """Get all issues from a GitHub repository."""
     return json.dumps({"issues": [{"number": 1, "title": "Bug fix", "state": state}]})
 
+
 @tool
 def github_search_issues(query: str) -> str:
     """Search issues across GitHub repositories."""
     return json.dumps({"items": [{"number": 42, "title": query, "state": "open"}]})
 
+
 @tool
 def github_create_issue(title: str, body: str, repo: str = "main-repo") -> str:
     """Create a new issue in a GitHub repository."""
-    return json.dumps({"number": 100, "title": title, "html_url": "https://github.com/repo/issues/100"})
+    return json.dumps(
+        {"number": 100, "title": title, "html_url": "https://github.com/repo/issues/100"}
+    )
+
 
 @tool
 def github_create_pull_request(title: str, body: str, head: str, base: str = "main") -> str:
     """Create a new pull request in a GitHub repository."""
     return json.dumps({"number": 50, "title": title, "html_url": "https://github.com/repo/pull/50"})
 
+
 @tool
 def github_comment_on_issue(issue_number: int, comment: str) -> str:
     """Add a comment to a GitHub issue."""
     return json.dumps({"id": 999, "issue_number": issue_number, "body": comment})
 
+
 @tool
 def github_list_pull_requests(repo: str, state: str = "open") -> str:
     """List pull requests in a GitHub repository."""
     return json.dumps({"pull_requests": [{"number": 50, "title": "Feature PR", "state": state}]})
 
+
 @tool
 def github_get_pull_request(pull_number: int) -> str:
     """Get details of a specific pull request."""
-    return json.dumps({"number": pull_number, "title": "Feature", "mergeable": True, "additions": 50})
+    return json.dumps(
+        {"number": pull_number, "title": "Feature", "mergeable": True, "additions": 50}
+    )
 
 
 # ===================================================================
 # Atlassian — Jira Tools (19)
 # ===================================================================
 
+
 @tool
 def jira_search_issues(jql: str, max_results: int = 50) -> str:
     """Search Jira issues using JQL query language."""
-    return json.dumps({"issues": [{"key": "PROJ-123", "summary": "Sample issue", "status": "Open"}], "total": 1})
+    return json.dumps(
+        {"issues": [{"key": "PROJ-123", "summary": "Sample issue", "status": "Open"}], "total": 1}
+    )
+
 
 @tool
 def jira_get_issue(issue_key: str) -> str:
     """Get details of a single Jira issue by key."""
-    return json.dumps({"key": issue_key, "summary": "Bug in login", "status": "In Progress", "assignee": "alice"})
+    return json.dumps(
+        {"key": issue_key, "summary": "Bug in login", "status": "In Progress", "assignee": "alice"}
+    )
+
 
 @tool
 def jira_create_issue(project_key: str, summary: str, issue_type: str = "Task") -> str:
     """Create a new Jira issue or sub-task."""
     return json.dumps({"key": f"{project_key}-999", "summary": summary, "type": issue_type})
 
+
 @tool
 def jira_update_issue(issue_key: str, fields: str) -> str:
     """Update fields of a Jira issue."""
     return json.dumps({"key": issue_key, "updated": True})
 
+
 @tool
 def jira_get_transitions(issue_key: str) -> str:
     """Get available status transitions for a Jira issue."""
-    return json.dumps({"transitions": [{"id": "31", "name": "Done"}, {"id": "21", "name": "In Progress"}]})
+    return json.dumps(
+        {"transitions": [{"id": "31", "name": "Done"}, {"id": "21", "name": "In Progress"}]}
+    )
+
 
 @tool
 def jira_transition_issue(issue_key: str, transition_id: str) -> str:
     """Change the status of a Jira issue via transition."""
     return json.dumps({"key": issue_key, "transitioned": True, "transition_id": transition_id})
 
+
 @tool
 def jira_add_comment(issue_key: str, comment_body: str) -> str:
     """Add a comment to a Jira issue."""
     return json.dumps({"id": "10001", "issue_key": issue_key, "body": comment_body})
 
+
 @tool
 def jira_get_comments(issue_key: str) -> str:
     """Get all comments from a Jira issue."""
     return json.dumps({"comments": [{"id": "10001", "body": "Working on it", "author": "alice"}]})
 
+
 @tool
 def jira_list_projects() -> str:
     """List all Jira projects accessible to the user."""
-    return json.dumps({"projects": [{"key": "PROJ", "name": "Main Project"}, {"key": "DEV", "name": "Development"}]})
+    return json.dumps(
+        {
+            "projects": [
+                {"key": "PROJ", "name": "Main Project"},
+                {"key": "DEV", "name": "Development"},
+            ]
+        }
+    )
+
 
 @tool
 def jira_get_project(project_key: str) -> str:
     """Get details of a specific Jira project."""
     return json.dumps({"key": project_key, "name": "Main Project", "lead": "alice"})
 
+
 @tool
 def jira_assign_issue(issue_key: str, assignee: str) -> str:
     """Assign a Jira issue to a user."""
     return json.dumps({"key": issue_key, "assignee": assignee})
 
+
 @tool
 def jira_add_worklog(issue_key: str, time_spent: str, comment: str = "") -> str:
     """Log time spent on a Jira issue."""
     return json.dumps({"issue_key": issue_key, "time_spent": time_spent, "logged": True})
 
+
 @tool
 def jira_search_users(query: str) -> str:
     """Search for Jira users by name or email."""
     return json.dumps({"users": [{"name": "alice", "email": "alice@example.com"}]})
 
+
 @tool
 def jira_delete_issue(issue_key: str) -> str:
     """Delete a Jira issue permanently."""
     return json.dumps({"key": issue_key, "deleted": True})
 
+
 @tool
 def jira_get_boards() -> str:
     """Get all Scrum/Kanban boards in Jira."""
     return json.dumps({"boards": [{"id": 1, "name": "Sprint Board", "type": "scrum"}]})
 
+
 @tool
 def jira_get_sprints(board_id: int) -> str:
     """Get sprints from a Jira board."""
     return json.dumps({"sprints": [{"id": 10, "name": "Sprint 5", "state": "active"}]})
 
+
 @tool
 def jira_link_issues(inward_key: str, outward_key: str, link_type: str = "Relates") -> str:
     """Link two Jira issues together."""
     return json.dumps({"linked": True, "inward": inward_key, "outward": outward_key})
 
+
 @tool
 def jira_get_attachments(issue_key: str) -> str:
     """Get attachments from a Jira issue."""
     return json.dumps({"attachments": [{"filename": "screenshot.png", "size": 102400}]})
 
+
 @tool
 def jira_add_attachment(issue_key: str, filename: str) -> str:
     """Add an attachment to a Jira issue."""
@@ -207,46 +259,66 @@ def jira_add_attachment(issue_key: str, filename: str) -> str:
 # Atlassian — Confluence Tools (9)
 # ===================================================================
 
+
 @tool
 def confluence_search(cql: str, limit: int = 25) -> str:
     """Search Confluence content using CQL query language."""
-    return json.dumps({"results": [{"id": "123", "title": "API Guide", "type": "page"}], "total": 1})
+    return json.dumps(
+        {"results": [{"id": "123", "title": "API Guide", "type": "page"}], "total": 1}
+    )
+
 
 @tool
 def confluence_get_page(page_id: str) -> str:
     """Get a Confluence page by ID."""
     return json.dumps({"id": page_id, "title": "API Guide", "body": "Page content here..."})
 
+
 @tool
 def confluence_create_page(space_key: str, title: str, body: str) -> str:
     """Create a new Confluence page in a space."""
     return json.dumps({"id": "456", "title": title, "space": space_key})
 
+
 @tool
 def confluence_update_page(page_id: str, title: str, body: str) -> str:
     """Update an existing Confluence page."""
     return json.dumps({"id": page_id, "title": title, "updated": True})
 
+
 @tool
 def confluence_delete_page(page_id: str) -> str:
     """Delete a Confluence page."""
     return json.dumps({"id": page_id, "deleted": True})
 
+
 @tool
 def confluence_get_spaces(limit: int = 25) -> str:
     """List all Confluence spaces."""
-    return json.dumps({"spaces": [{"key": "DEV", "name": "Development"}, {"key": "HR", "name": "Human Resources"}]})
+    return json.dumps(
+        {
+            "spaces": [
+                {"key": "DEV", "name": "Development"},
+                {"key": "HR", "name": "Human Resources"},
+            ]
+        }
+    )
+
 
 @tool
 def confluence_get_pages_in_space(space_key: str) -> str:
     """Get all pages in a Confluence space."""
-    return json.dumps({"pages": [{"id": "123", "title": "API Guide"}, {"id": "124", "title": "Setup Guide"}]})
+    return json.dumps(
+        {"pages": [{"id": "123", "title": "API Guide"}, {"id": "124", "title": "Setup Guide"}]}
+    )
+
 
 @tool
 def confluence_add_comment(page_id: str, body: str) -> str:
     """Add a comment to a Confluence page."""
     return json.dumps({"id": "789", "page_id": page_id, "body": body})
 
+
 @tool
 def confluence_get_page_comments(page_id: str) -> str:
     """Get all comments from a Confluence page."""
@@ -257,110 +329,161 @@ def confluence_get_page_comments(page_id: str) -> str:
 # MS365 MCP Tools (15)
 # ===================================================================
 
+
 @tool
 def ms365_list_mails(folder: str = "inbox", top: int = 10) -> str:
     """List emails from Outlook mailbox."""
-    return json.dumps({"emails": [{"id": "m1", "subject": "Meeting tomorrow", "from": "boss@company.com"}]})
+    return json.dumps(
+        {"emails": [{"id": "m1", "subject": "Meeting tomorrow", "from": "boss@company.com"}]}
+    )
+
 
 @tool
 def ms365_read_mail(message_id: str) -> str:
     """Read a specific email from Outlook."""
-    return json.dumps({"id": message_id, "subject": "Meeting", "body": "Please join at 3pm", "from": "boss@company.com"})
+    return json.dumps(
+        {
+            "id": message_id,
+            "subject": "Meeting",
+            "body": "Please join at 3pm",
+            "from": "boss@company.com",
+        }
+    )
+
 
 @tool
 def ms365_send_email(to: str, subject: str, body: str) -> str:
     """Send an email via Outlook."""
     return json.dumps({"sent": True, "to": to, "subject": subject})
 
+
 @tool
 def ms365_reply_to_email(message_id: str, body: str) -> str:
     """Reply to an email in Outlook."""
     return json.dumps({"replied": True, "message_id": message_id})
 
+
 @tool
 def ms365_list_calendar_events(start_date: str, end_date: str) -> str:
     """List calendar events within a date range."""
-    return json.dumps({"events": [{"subject": "Team standup", "start": start_date, "location": "Room A"}]})
+    return json.dumps(
+        {"events": [{"subject": "Team standup", "start": start_date, "location": "Room A"}]}
+    )
+
 
 @tool
 def ms365_create_event(subject: str, start: str, end: str, attendees: str = "") -> str:
     """Create a new calendar event in Outlook."""
     return json.dumps({"id": "e1", "subject": subject, "start": start, "end": end})
 
+
 @tool
 def ms365_delete_event(event_id: str) -> str:
     """Delete a calendar event."""
     return json.dumps({"deleted": True, "event_id": event_id})
 
+
 @tool
 def ms365_list_teams() -> str:
     """List all Microsoft Teams the user belongs to."""
-    return json.dumps({"teams": [{"id": "t1", "name": "Engineering"}, {"id": "t2", "name": "Design"}]})
+    return json.dumps(
+        {"teams": [{"id": "t1", "name": "Engineering"}, {"id": "t2", "name": "Design"}]}
+    )
+
 
 @tool
 def ms365_list_team_channels(team_id: str) -> str:
     """List channels in a Microsoft Teams team."""
-    return json.dumps({"channels": [{"id": "ch1", "name": "General"}, {"id": "ch2", "name": "Dev"}]})
+    return json.dumps(
+        {"channels": [{"id": "ch1", "name": "General"}, {"id": "ch2", "name": "Dev"}]}
+    )
+
 
 @tool
 def ms365_send_team_message(team_id: str, channel_id: str, message: str) -> str:
     """Send a message to a Microsoft Teams channel."""
     return json.dumps({"sent": True, "team_id": team_id, "channel_id": channel_id})
 
+
 @tool
 def ms365_list_files(folder_path: str = "/") -> str:
     """List files in OneDrive."""
-    return json.dumps({"files": [{"name": "report.xlsx", "size": 51200}, {"name": "notes.docx", "size": 10240}]})
+    return json.dumps(
+        {"files": [{"name": "report.xlsx", "size": 51200}, {"name": "notes.docx", "size": 10240}]}
+    )
+
 
 @tool
 def ms365_create_task(title: str, due_date: str = "") -> str:
     """Create a task in Microsoft To Do / Planner."""
     return json.dumps({"id": "task1", "title": title, "due_date": due_date, "status": "notStarted"})
 
+
 @tool
 def ms365_list_tasks(plan_id: str = "default") -> str:
     """List tasks from Microsoft Planner."""
     return json.dumps({"tasks": [{"id": "task1", "title": "Review PR", "status": "inProgress"}]})
 
+
 @tool
 def ms365_list_contacts(top: int = 10) -> str:
     """List contacts from Outlook."""
     return json.dumps({"contacts": [{"name": "Alice Kim", "email": "alice@company.com"}]})
 
+
 @tool
 def ms365_get_contact(contact_id: str) -> str:
     """Get a specific contact from Outlook."""
-    return json.dumps({"id": contact_id, "name": "Alice Kim", "email": "alice@company.com", "phone": "+82-10-1234-5678"})
+    return json.dumps(
+        {
+            "id": contact_id,
+            "name": "Alice Kim",
+            "email": "alice@company.com",
+            "phone": "+82-10-1234-5678",
+        }
+    )
 
 
 # ===================================================================
 # API Tool Loader Tools (5) — custom REST API tools
 # ===================================================================
 
+
 @tool
 def api_get_product_inventory(product_code: str) -> str:
     """조회: 상품 코드로 재고 수량을 조회합니다. Query product inventory by product code."""
     return json.dumps({"product_code": product_code, "quantity": 150, "warehouse": "Seoul-01"})
 
+
 @tool
 def api_create_purchase_order(supplier_id: str, items: str) -> str:
     """생성: 공급업체에 발주서를 생성합니다. Create a purchase order to a supplier."""
     return json.dumps({"po_number": "PO-2024-001", "supplier_id": supplier_id, "status": "created"})
 
+
 @tool
 def api_get_customer_info(customer_id: str) -> str:
     """조회: 고객 ID로 고객 상세 정보를 조회합니다. Get customer details by customer ID."""
-    return json.dumps({"customer_id": customer_id, "name": "Kim Corp", "grade": "VIP", "credit_limit": 50000000})
+    return json.dumps(
+        {"customer_id": customer_id, "name": "Kim Corp", "grade": "VIP", "credit_limit": 50000000}
+    )
+
 
 @tool
 def api_submit_approval(document_id: str, action: str) -> str:
-    """결재: 문서 결재를 승인 또는 반려합니다. Approve or reject a document in the approval workflow."""
+    """결재: 문서 결재를 승인 또는 반려합니다.
+
+    Approve or reject a document in the approval workflow.
+    """
     return json.dumps({"document_id": document_id, "action": action, "result": "processed"})
 
+
 @tool
 def api_get_sales_dashboard(period: str = "monthly") -> str:
     """대시보드: 매출 현황 대시보드 데이터를 조회합니다. Get sales dashboard data."""
-    return json.dumps({"period": period, "total_sales": 1250000000, "orders": 3400, "growth": "+12.5%"})
+    return json.dumps(
+        {"period": period, "total_sales": 1250000000, "orders": 3400, "growth": "+12.5%"}
+    )
 
 
 # ===================================================================
@@ -369,32 +492,73 @@ def api_get_sales_dashboard(period: str = "monthly") -> str:
 
 ALL_TOOLS = [
     # Slack (6)
-    slack_get_channel_id, slack_send_message, slack_list_channels,
-    slack_list_users, slack_search_conversations, slack_get_message_link,
+    slack_get_channel_id,
+    slack_send_message,
+    slack_list_channels,
+    slack_list_users,
+    slack_search_conversations,
+    slack_get_message_link,
     # GitHub (8)
-    github_get_file, github_get_issues, github_search_issues,
-    github_create_issue, github_create_pull_request, github_comment_on_issue,
-    github_list_pull_requests, github_get_pull_request,
+    github_get_file,
+    github_get_issues,
+    github_search_issues,
+    github_create_issue,
+    github_create_pull_request,
+    github_comment_on_issue,
+    github_list_pull_requests,
+    github_get_pull_request,
     # Jira (19)
-    jira_search_issues, jira_get_issue, jira_create_issue, jira_update_issue,
-    jira_get_transitions, jira_transition_issue, jira_add_comment,
-    jira_get_comments, jira_list_projects, jira_get_project,
-    jira_assign_issue, jira_add_worklog, jira_search_users,
-    jira_delete_issue, jira_get_boards, jira_get_sprints,
-    jira_link_issues, jira_get_attachments, jira_add_attachment,
+    jira_search_issues,
+    jira_get_issue,
+    jira_create_issue,
+    jira_update_issue,
+    jira_get_transitions,
+    jira_transition_issue,
+    jira_add_comment,
+    jira_get_comments,
+    jira_list_projects,
+    jira_get_project,
+    jira_assign_issue,
+    jira_add_worklog,
+    jira_search_users,
+    jira_delete_issue,
+    jira_get_boards,
+    jira_get_sprints,
+    jira_link_issues,
+    jira_get_attachments,
+    jira_add_attachment,
     # Confluence (9)
-    confluence_search, confluence_get_page, confluence_create_page,
-    confluence_update_page, confluence_delete_page, confluence_get_spaces,
-    confluence_get_pages_in_space, confluence_add_comment, confluence_get_page_comments,
+    confluence_search,
+    confluence_get_page,
+    confluence_create_page,
+    confluence_update_page,
+    confluence_delete_page,
+    confluence_get_spaces,
+    confluence_get_pages_in_space,
+    confluence_add_comment,
+    confluence_get_page_comments,
     # MS365 (15)
-    ms365_list_mails, ms365_read_mail, ms365_send_email, ms365_reply_to_email,
-    ms365_list_calendar_events, ms365_create_event, ms365_delete_event,
-    ms365_list_teams, ms365_list_team_channels, ms365_send_team_message,
-    ms365_list_files, ms365_create_task, ms365_list_tasks,
-    ms365_list_contacts, ms365_get_contact,
+    ms365_list_mails,
+    ms365_read_mail,
+    ms365_send_email,
+    ms365_reply_to_email,
+    ms365_list_calendar_events,
+    ms365_create_event,
+    ms365_delete_event,
+    ms365_list_teams,
+    ms365_list_team_channels,
+    ms365_send_team_message,
+    ms365_list_files,
+    ms365_create_task,
+    ms365_list_tasks,
+    ms365_list_contacts,
+    ms365_get_contact,
     # API Tools (5)
-    api_get_product_inventory, api_create_purchase_order,
-    api_get_customer_info, api_submit_approval, api_get_sales_dashboard,
+    api_get_product_inventory,
+    api_create_purchase_order,
+    api_get_customer_info,
+    api_submit_approval,
+    api_get_sales_dashboard,
 ]
 
 
@@ -481,15 +645,15 @@ def _count_tool_schema_chars(tools: list) -> int:
 
 def main():
     print(f"{'=' * 70}")
-    print(f"xgen-workflow Gateway E2E Test")
+    print("xgen-workflow Gateway E2E Test")
     print(f"{'=' * 70}")
-    print(f"Tool breakdown:")
-    print(f"  Slack MCP:      6 tools")
-    print(f"  GitHub MCP:     8 tools")
-    print(f"  Jira MCP:      19 tools")
-    print(f"  Confluence MCP:  9 tools")
-    print(f"  MS365 MCP:     15 tools")
-    print(f"  API Loader:     5 tools")
+    print("Tool breakdown:")
+    print("  Slack MCP:      6 tools")
+    print("  GitHub MCP:     8 tools")
+    print("  Jira MCP:      19 tools")
+    print("  Confluence MCP:  9 tools")
+    print("  MS365 MCP:     15 tools")
+    print("  API Loader:     5 tools")
     print(f"  Total:         {len(ALL_TOOLS)} tools → gateway 2 tools")
     print(f"{'=' * 70}")
 
@@ -499,7 +663,7 @@ def main():
     gw_chars = _count_tool_schema_chars(gateway)
     reduction = (1 - gw_chars / all_chars) * 100
     print(f"\nToken savings: {all_chars:,} → {gw_chars:,} chars ({reduction:.0f}% reduction)")
-    print(f"  ~{all_chars//4:,} → ~{gw_chars//4:,} tokens per turn")
+    print(f"  ~{all_chars // 4:,} → ~{gw_chars // 4:,} tokens per turn")
 
     # LLM test
     llm = ChatOllama(model="qwen3.5:4b", temperature=0)
@@ -549,7 +713,7 @@ def main():
             print(f"  [ERROR] {e} ({elapsed:.1f}s)")
 
     print(f"\n{'=' * 70}")
-    print(f"RESULT: {passed}/{total} ({passed/total*100:.0f}%)")
+    print(f"RESULT: {passed}/{total} ({passed / total * 100:.0f}%)")
     print(f"  Tools: {len(ALL_TOOLS)} → 2 (gateway)")
     print(f"  Token reduction: {reduction:.0f}%")
     print(f"{'=' * 70}")
diff --git a/tests/test_langchain_agent.py b/tests/test_langchain_agent.py
index e3167cc..14f26ce 100644
--- a/tests/test_langchain_agent.py
+++ b/tests/test_langchain_agent.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-from typing import Any
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -87,7 +86,7 @@ def cancel_order(order_id: str) -> str:
 
         from graph_tool_call.langchain.agent import create_agent
 
-        agent = create_agent(
+        create_agent(
             mock_model,
             tools=[get_weather, send_email, cancel_order],
             top_k=2,
@@ -156,7 +155,7 @@ def search_users(query: str) -> str:
         state = {"messages": [HumanMessage(content="what's the weather in Seoul")]}
         runtime = MagicMock()
 
-        result = model_factory(state, runtime)
+        model_factory(state, runtime)
 
         # bind_tools should have been called with a filtered subset
         mock_model.bind_tools.assert_called_once()
diff --git a/tests/test_langchain_compatibility.py b/tests/test_langchain_compatibility.py
index b567c84..313c5c4 100644
--- a/tests/test_langchain_compatibility.py
+++ b/tests/test_langchain_compatibility.py
@@ -10,9 +10,6 @@
 from dataclasses import dataclass, field
 from typing import Any
 
-import pytest
-
-
 # ---------------------------------------------------------------------------
 # Fake LangChain tool stubs (same pattern as test_langchain_gateway.py)
 # ---------------------------------------------------------------------------
@@ -62,9 +59,7 @@ def _make_math_tools() -> list[FakeTool]:
     )
     return [
         FakeTool(name="add", description="Add two numbers together", args_schema=add_schema),
-        FakeTool(
-            name="multiply", description="Multiply two numbers", args_schema=multiply_schema
-        ),
+        FakeTool(name="multiply", description="Multiply two numbers", args_schema=multiply_schema),
     ]
 
 
@@ -232,10 +227,12 @@ def _get_call_tool(self, tools):
     def test_call_existing_tool(self):
         call = self._get_call_tool(_make_diverse_tools())
 
-        result = call.invoke({
-            "tool_name": "cancel_order",
-            "arguments": {"order_id": "123"},
-        })
+        result = call.invoke(
+            {
+                "tool_name": "cancel_order",
+                "arguments": {"order_id": "123"},
+            }
+        )
 
         assert "cancel_order" in result
         assert "123" in result
@@ -243,10 +240,12 @@ def test_call_existing_tool(self):
     def test_call_nonexistent_tool(self):
         call = self._get_call_tool(_make_diverse_tools())
 
-        result = call.invoke({
-            "tool_name": "nonexistent_tool",
-            "arguments": {},
-        })
+        result = call.invoke(
+            {
+                "tool_name": "nonexistent_tool",
+                "arguments": {},
+            }
+        )
         data = json.loads(result)
 
         assert "error" in data
@@ -255,19 +254,23 @@ def test_call_nonexistent_tool(self):
     def test_call_with_none_arguments(self):
         call = self._get_call_tool(_make_diverse_tools())
 
-        result = call.invoke({
-            "tool_name": "get_weather",
-            "arguments": None,
-        })
+        result = call.invoke(
+            {
+                "tool_name": "get_weather",
+                "arguments": None,
+            }
+        )
 
         assert "get_weather" in result
 
     def test_call_with_missing_arguments(self):
         call = self._get_call_tool(_make_diverse_tools())
 
-        result = call.invoke({
-            "tool_name": "get_weather",
-        })
+        result = call.invoke(
+            {
+                "tool_name": "get_weather",
+            }
+        )
 
         assert "get_weather" in result
 
@@ -387,10 +390,12 @@ def test_search_then_call(self):
         assert any(t["name"] == "send_email" for t in search_result["tools"])
 
         # Step 2: Call
-        call_result = call.invoke({
-            "tool_name": "send_email",
-            "arguments": {"to": "user@example.com", "body": "hello"},
-        })
+        call_result = call.invoke(
+            {
+                "tool_name": "send_email",
+                "arguments": {"to": "user@example.com", "body": "hello"},
+            }
+        )
         assert "send_email" in call_result
         assert "executed" in call_result
 
@@ -410,10 +415,12 @@ def test_search_then_call_via_iter(self):
         search_result = json.loads(search.invoke({"query": "weather"}))
         assert any(t["name"] == "get_weather" for t in search_result["tools"])
 
-        call_result = call.invoke({
-            "tool_name": "get_weather",
-            "arguments": {"city": "Seoul"},
-        })
+        call_result = call.invoke(
+            {
+                "tool_name": "get_weather",
+                "arguments": {"city": "Seoul"},
+            }
+        )
         assert "get_weather" in call_result
 
     def test_user_example_scenario(self):
@@ -425,9 +432,7 @@ def test_user_example_scenario(self):
         search_documents = FakeTool(
             name="search_documents", description="Search documents by query string"
         )
-        get_weather = FakeTool(
-            name="get_weather", description="Get current weather for a city"
-        )
+        get_weather = FakeTool(name="get_weather", description="Get current weather for a city")
 
         # ToolGraph creation and tool registration (user's pattern)
         tg_tool = ToolGraph()
@@ -451,10 +456,12 @@ def test_user_example_scenario(self):
 
         # Verify call works
         call = next(t for t in tools if t.name == "call_tool")
-        call_result = call.invoke({
-            "tool_name": "add",
-            "arguments": {"a": 1, "b": 2},
-        })
+        call_result = call.invoke(
+            {
+                "tool_name": "add",
+                "arguments": {"a": 1, "b": 2},
+            }
+        )
         assert "add" in call_result
         assert "executed" in call_result
 
@@ -481,10 +488,12 @@ def test_add_tool_after_gateway_creation(self):
         assert result["total_tools"] == 2
 
         # And the new tool is callable
-        call_result = call.invoke({
-            "tool_name": "multiply",
-            "arguments": {"a": 3, "b": 4},
-        })
+        call_result = call.invoke(
+            {
+                "tool_name": "multiply",
+                "arguments": {"a": 3, "b": 4},
+            }
+        )
         assert "multiply" in call_result
 
     def test_add_tools_batch(self):
@@ -546,10 +555,12 @@ def test_call_with_dict_arguments(self):
         gateway = tg.as_tools()
         call = next(t for t in gateway if t.name == "call_tool")
 
-        result = call.invoke({
-            "tool_name": "add",
-            "arguments": {"a": 1, "b": 2},
-        })
+        result = call.invoke(
+            {
+                "tool_name": "add",
+                "arguments": {"a": 1, "b": 2},
+            }
+        )
         assert "add" in result
         assert "executed" in result
 
diff --git a/tests/test_langchain_gateway.py b/tests/test_langchain_gateway.py
index 220328b..1d7b9cf 100644
--- a/tests/test_langchain_gateway.py
+++ b/tests/test_langchain_gateway.py
@@ -6,8 +6,6 @@
 from dataclasses import dataclass
 from typing import Any
 
-import pytest
-
 
 @dataclass
 class FakeTool:
@@ -134,10 +132,12 @@ def test_call_existing_tool(self):
         tools = _make_tools()
         call = self._get_call_tool(tools)
 
-        result = call.invoke({
-            "tool_name": "cancel_order",
-            "arguments": {"order_id": "123"},
-        })
+        result = call.invoke(
+            {
+                "tool_name": "cancel_order",
+                "arguments": {"order_id": "123"},
+            }
+        )
 
         assert "cancel_order" in result
         assert "123" in result
@@ -146,10 +146,12 @@ def test_call_nonexistent_tool(self):
         tools = _make_tools()
         call = self._get_call_tool(tools)
 
-        result = call.invoke({
-            "tool_name": "nonexistent_tool",
-            "arguments": {},
-        })
+        result = call.invoke(
+            {
+                "tool_name": "nonexistent_tool",
+                "arguments": {},
+            }
+        )
         data = json.loads(result)
 
         assert "error" in data
@@ -159,9 +161,11 @@ def test_call_with_empty_arguments(self):
         tools = _make_tools()
         call = self._get_call_tool(tools)
 
-        result = call.invoke({
-            "tool_name": "get_weather",
-        })
+        result = call.invoke(
+            {
+                "tool_name": "get_weather",
+            }
+        )
 
         assert "get_weather" in result
 
@@ -169,10 +173,12 @@ def test_call_with_none_arguments(self):
         tools = _make_tools()
         call = self._get_call_tool(tools)
 
-        result = call.invoke({
-            "tool_name": "get_weather",
-            "arguments": None,
-        })
+        result = call.invoke(
+            {
+                "tool_name": "get_weather",
+                "arguments": None,
+            }
+        )
 
         assert "get_weather" in result
 
@@ -194,10 +200,12 @@ def test_search_then_call(self):
         assert any(t["name"] == "send_email" for t in search_result["tools"])
 
         # Step 2: Call
-        call_result = call.invoke({
-            "tool_name": "send_email",
-            "arguments": {"to": "user@example.com", "body": "hello"},
-        })
+        call_result = call.invoke(
+            {
+                "tool_name": "send_email",
+                "arguments": {"to": "user@example.com", "body": "hello"},
+            }
+        )
         assert "send_email" in call_result
         assert "executed" in call_result
 
diff --git a/tests/test_langchain_toolkit.py b/tests/test_langchain_toolkit.py
index 9e091ac..1e9d19a 100644
--- a/tests/test_langchain_toolkit.py
+++ b/tests/test_langchain_toolkit.py
@@ -2,11 +2,9 @@
 
 from __future__ import annotations
 
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Any
 
-import pytest
-
 
 @dataclass
 class FakeTool:
@@ -61,8 +59,7 @@ def test_filter_tools_preserves_original_objects():
 
 
 def test_filter_tools_with_prebuilt_graph():
-    from graph_tool_call import ToolGraph
-    from graph_tool_call import filter_tools
+    from graph_tool_call import ToolGraph, filter_tools
 
     tools = _make_tools()
     tg = ToolGraph()
@@ -114,8 +111,7 @@ def test_toolkit_all_tools():
 
 
 def test_toolkit_graph_accessible():
-    from graph_tool_call import ToolGraph
-    from graph_tool_call import GraphToolkit
+    from graph_tool_call import GraphToolkit, ToolGraph
 
     tools = _make_tools()
     toolkit = GraphToolkit(tools=tools)
@@ -125,8 +121,7 @@ def test_toolkit_graph_accessible():
 
 
 def test_toolkit_with_prebuilt_graph():
-    from graph_tool_call import ToolGraph
-    from graph_tool_call import GraphToolkit
+    from graph_tool_call import GraphToolkit, ToolGraph
 
     tg = ToolGraph()
     tools = _make_tools(5)

From 59b4b7f6c478ff970b7e99dfcf76fb72fd8dc942 Mon Sep 17 00:00:00 2001
From: daehee <1998opening@gmail.com>
Date: Sun, 3 May 2026 18:00:11 +0900
Subject: [PATCH 11/14] =?UTF-8?q?fix:=20=EC=BD=94=EB=93=9C=20=EB=A6=AC?=
 =?UTF-8?q?=EB=B7=B0=20=EA=B2=B0=ED=95=A8=20=EB=B0=98=EC=98=81=20(CRITICAL?=
 =?UTF-8?q?=20#1/#2=20+=20=EB=8B=A8=EC=9C=84=20=ED=85=8C=EC=8A=A4=ED=8A=B8?=
 =?UTF-8?q?=20+=20=EC=86=8C=ED=95=AD=EB=AA=A9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CRITICAL #1 — ${user_input.x} ↔ context["input"] 이름 불일치 해소
  - PlanRunner 가 input_context 를 'input' / 'user_input' 두 키로 alias 등록.
  - synthesizer 의 F2/Cycle-policy fallback (${user_input.<field>}) 이 정상 resolve.
  - 영향: F2 fallback 으로 합성된 plan 이 첫 step 부터 abort 되던 회귀 fix.

CRITICAL #2 — ExecutionTrace.steps 가 항상 빈 리스트
  - PlanCompleted / PlanAborted 이벤트에 trace_steps 필드 추가.
  - run_stream 이 종결 이벤트에 누적된 StepTrace 를 실어 보냄.
  - run() 은 종결 이벤트의 trace_steps 를 추출해 ExecutionTrace.steps 채움.
  - 영향: run_stream 안 쓰는 caller 도 step 단위 trace 받을 수 있음.

단위 테스트 신규 추가 (42 PASS) — plan/graphify 모듈 cover
  - tests/test_plan_runner.py    — CRITICAL #1, #2 회귀 테스트 + 핵심 동작
  - tests/test_plan_synthesizer.py — 합성/체이닝/F2 fallback/normalize 등
  - tests/test_plan_binding.py   — placeholder resolution + 에러 동작
  - tests/test_io_contract.py    — extract_leaves + query/path enum 추출 회귀
  - tests/test_dependency_verbs.py — _VERB_TO_INTENT 'reg' 매핑

기타
  - synthesizer.py: _normalize_field_name 중복 정의 제거 (첫 정의가 dead code 였음).
  - dependency.py: _VERB_TO_INTENT 에 'reg' 추가 (regGoodsApprove → write).
  - io_contract.py: query/path/header parameter 의 enum 추출 (이전엔 body 만).
---
 graph_tool_call/analyze/dependency.py |   1 +
 graph_tool_call/ingest/io_contract.py |   8 +-
 graph_tool_call/plan/runner.py        |  44 ++++--
 graph_tool_call/plan/synthesizer.py   |  21 ---
 tests/test_dependency_verbs.py        |  25 +++
 tests/test_io_contract.py             | 170 ++++++++++++++++++++
 tests/test_plan_binding.py            |  70 ++++++++
 tests/test_plan_runner.py             | 220 ++++++++++++++++++++++++++
 tests/test_plan_synthesizer.py        | 172 ++++++++++++++++++++
 9 files changed, 693 insertions(+), 38 deletions(-)
 create mode 100644 tests/test_dependency_verbs.py
 create mode 100644 tests/test_io_contract.py
 create mode 100644 tests/test_plan_binding.py
 create mode 100644 tests/test_plan_runner.py
 create mode 100644 tests/test_plan_synthesizer.py

diff --git a/graph_tool_call/analyze/dependency.py b/graph_tool_call/analyze/dependency.py
index 533f344..28864fa 100644
--- a/graph_tool_call/analyze/dependency.py
+++ b/graph_tool_call/analyze/dependency.py
@@ -670,6 +670,7 @@ def _detect_cross_resource(tools: list[ToolSchema]) -> list[DetectedRelation]:
     "insert": "write",
     "register": "write",
     "regist": "write",
+    "reg": "write",  # camelCase 약어 (regGoodsApprove 등)
     # update
     "modify": "update",
     "update": "update",
diff --git a/graph_tool_call/ingest/io_contract.py b/graph_tool_call/ingest/io_contract.py
index 7748bb5..90bf308 100644
--- a/graph_tool_call/ingest/io_contract.py
+++ b/graph_tool_call/ingest/io_contract.py
@@ -257,8 +257,13 @@ def extract_consumes_for_operation(
             continue
         if is_swagger2:
             ftype = p.get("type") or "string"
+            # Swagger 2.0 — enum lives directly on the parameter object.
+            enum_vals = p.get("enum") or []
         else:
-            ftype = _schema_type(p.get("schema") or {}) or "string"
+            param_schema = p.get("schema") or {}
+            ftype = _schema_type(param_schema) or "string"
+            # OpenAPI 3.x — enum lives under ``schema``.
+            enum_vals = param_schema.get("enum") or [] if isinstance(param_schema, dict) else []
         if p["name"] in seen_names:
             continue
         seen_names.add(p["name"])
@@ -269,6 +274,7 @@ def extract_consumes_for_operation(
                 field_type=ftype,
                 required=is_required,
                 description=str(p.get("description") or "")[:200],
+                enum=list(enum_vals),
             )
         )
 
diff --git a/graph_tool_call/plan/runner.py b/graph_tool_call/plan/runner.py
index 3b70f77..73038de 100644
--- a/graph_tool_call/plan/runner.py
+++ b/graph_tool_call/plan/runner.py
@@ -75,6 +75,8 @@ class PlanCompleted:
     plan_id: str = ""
     output: Any = None
     total_duration_ms: int = 0
+    # 누적 step traces — 비-스트리밍 ``run()`` 이 ExecutionTrace.steps 채울 때 사용.
+    trace_steps: list[StepTrace] = field(default_factory=list)
 
 
 @dataclass
@@ -84,6 +86,7 @@ class PlanAborted:
     failed_step: str = ""
     error: dict[str, Any] = field(default_factory=dict)
     total_duration_ms: int = 0
+    trace_steps: list[StepTrace] = field(default_factory=list)
 
 
 PlanEvent = PlanStarted | StepStarted | StepCompleted | StepFailed | PlanCompleted | PlanAborted
@@ -137,8 +140,12 @@ def run_stream(
     ) -> Iterator[PlanEvent]:
         """Execute *plan* and yield events as each step progresses.
 
-        ``input_context`` supplies values for ``${input.xxx}`` bindings —
-        typically the entities extracted by Stage 1 (intent parser).
+        ``input_context`` supplies values for ``${input.xxx}`` and
+        ``${user_input.xxx}`` bindings (both keys resolve to the same dict,
+        kept as aliases because the synthesizer emits ``user_input`` for
+        F2/Cycle-policy fallbacks and historical entity-injection paths use
+        ``input``). Typically the entities extracted by Stage 1 (intent
+        parser) plus any operator-supplied seed values.
         """
         plan_start = time.monotonic()
 
@@ -148,10 +155,14 @@ def run_stream(
             step_count=len(plan.steps),
         )
 
-        # step_id -> output (runtime context for binding resolution)
+        # step_id -> output (runtime context for binding resolution).
+        # ``input`` and ``user_input`` are aliases — same dict, both names —
+        # so binding ``${input.x}`` and ``${user_input.x}`` both resolve.
         context: dict[str, Any] = {}
         if input_context:
-            context["input"] = dict(input_context)
+            input_dict = dict(input_context)
+            context["input"] = input_dict
+            context["user_input"] = input_dict
 
         trace_steps: list[StepTrace] = []
 
@@ -181,6 +192,7 @@ def run_stream(
                     failed_step=step.id,
                     error=err,
                     total_duration_ms=_ms_since(plan_start),
+                    trace_steps=list(trace_steps),
                 )
                 return
 
@@ -216,6 +228,7 @@ def run_stream(
                     failed_step=step.id,
                     error=err,
                     total_duration_ms=_ms_since(plan_start),
+                    trace_steps=list(trace_steps),
                 )
                 return
 
@@ -255,6 +268,7 @@ def run_stream(
                 failed_step="<output_binding>",
                 error=err,
                 total_duration_ms=_ms_since(plan_start),
+                trace_steps=list(trace_steps),
             )
             return
 
@@ -262,6 +276,7 @@ def run_stream(
             plan_id=plan.id,
             output=final,
             total_duration_ms=_ms_since(plan_start),
+            trace_steps=list(trace_steps),
         )
 
     # ----------------------------------------------------------------------
@@ -274,7 +289,12 @@ def run(
         *,
         input_context: dict[str, Any] | None = None,
     ) -> ExecutionTrace:
-        """Execute *plan* and return an ExecutionTrace aggregating events."""
+        """Execute *plan* and return an ExecutionTrace aggregating events.
+
+        ``trace_steps`` 는 종결 이벤트 (``PlanCompleted`` / ``PlanAborted``) 가
+        실어 보내는 것을 그대로 사용 — run_stream 안에서 step 단위로 누적된
+        StepTrace 가 그대로 ExecutionTrace.steps 에 들어간다.
+        """
         started_at = _now_iso()
         started = time.monotonic()
         trace_steps: list[StepTrace] = []
@@ -282,24 +302,16 @@ def run(
         failed_step: str | None = None
         output: Any = None
 
-        last_step_output: dict[str, Any] = {}
-
         for event in self.run_stream(plan, input_context=input_context):
             etype = event.type
-            if etype == "step.completed":
-                # step trace built progressively — simpler: derive from events
-                pass
-            elif etype == "plan.completed":
+            if etype == "plan.completed":
                 success = True
                 output = event.output  # type: ignore[union-attr]
+                trace_steps = list(event.trace_steps)  # type: ignore[union-attr]
             elif etype == "plan.aborted":
                 failed_step = event.failed_step  # type: ignore[union-attr]
+                trace_steps = list(event.trace_steps)  # type: ignore[union-attr]
 
-        # Recompute trace_steps by re-running the stream? No — we already
-        # lost events. Instead the run_stream implementation should also
-        # surface StepTrace. For v1 keep trace minimal (plan-level only) —
-        # callers that need per-step detail should use run_stream.
-        _ = last_step_output  # (placeholder to satisfy future extension)
         return ExecutionTrace(
             plan_id=plan.id,
             success=success,
diff --git a/graph_tool_call/plan/synthesizer.py b/graph_tool_call/plan/synthesizer.py
index 4942b7e..ac8f2de 100644
--- a/graph_tool_call/plan/synthesizer.py
+++ b/graph_tool_call/plan/synthesizer.py
@@ -90,27 +90,6 @@ def __init__(
         self.label_field_hints = list(label_field_hints)
 
 
-def _normalize_field_name(name: str) -> str:
-    """Lowercase + strip separators for loose field-name matching.
-
-    Conservative on purpose:
-      ``ordNo`` → ``ordno``
-      ``ord_no`` → ``ordno``
-      ``ORD-NO`` → ``ordno``
-    BUT keeps token roots distinct:
-      ``ordNo`` ≠ ``orderNo`` (``ordno`` ≠ ``orderno``)
-    Token-level synonym mapping (``ord`` ↔ ``order``) is domain-specific
-    and not done here — the graph-edge fallback handles those cases.
-    """
-    if not name:
-        return ""
-    out: list[str] = []
-    for ch in name:
-        if ch.isalnum():
-            out.append(ch.lower())
-    return "".join(out)
-
-
 def _normalize_field_name(name: str) -> str:
     """Lowercase + strip non-alphanumerics for loose field-name matching.
 
diff --git a/tests/test_dependency_verbs.py b/tests/test_dependency_verbs.py
new file mode 100644
index 0000000..e583d05
--- /dev/null
+++ b/tests/test_dependency_verbs.py
@@ -0,0 +1,25 @@
+"""Unit tests for ``graph_tool_call.analyze.dependency`` verb mapping.
+
+특히 'reg' 약어가 'write' intent 로 분류되는지 확인 (리뷰 🟢 항목).
+"""
+from __future__ import annotations
+
+from graph_tool_call.analyze.dependency import _VERB_TO_INTENT
+
+
+def test_reg_abbrev_maps_to_write():
+    """``regGoodsApprove`` 같은 camelCase 약어를 위해 'reg' 도 write 로 잡아야."""
+    assert _VERB_TO_INTENT.get("reg") == "write"
+
+
+def test_register_full_form_still_maps_to_write():
+    assert _VERB_TO_INTENT.get("register") == "write"
+    assert _VERB_TO_INTENT.get("regist") == "write"
+
+
+def test_basic_verbs_unchanged():
+    """기존 verb mapping 회귀 방지."""
+    assert _VERB_TO_INTENT.get("get") == "read"
+    assert _VERB_TO_INTENT.get("create") == "write"
+    assert _VERB_TO_INTENT.get("update") == "update"
+    assert _VERB_TO_INTENT.get("delete") == "delete"
diff --git a/tests/test_io_contract.py b/tests/test_io_contract.py
new file mode 100644
index 0000000..b9b9b84
--- /dev/null
+++ b/tests/test_io_contract.py
@@ -0,0 +1,170 @@
+"""Unit tests for ``graph_tool_call.ingest.io_contract``.
+
+특히 query/path parameter 의 enum 추출 (리뷰에서 빠뜨려진 부분) 확인.
+"""
+from __future__ import annotations
+
+from graph_tool_call.ingest.io_contract import (
+    extract_consumes_for_operation,
+    extract_leaves,
+    extract_produces_for_operation,
+)
+
+# ─── extract_leaves ──
+
+
+def test_extract_leaves_object_with_primitives():
+    schema = {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+        },
+        "required": ["name"],
+    }
+    leaves = extract_leaves(schema, base_path="$")
+    by_name = {leaf.field_name: leaf for leaf in leaves}
+    assert by_name["name"].required is True
+    assert by_name["name"].field_type == "string"
+    assert by_name["age"].required is False
+
+
+def test_extract_leaves_array_of_objects():
+    schema = {
+        "type": "array",
+        "items": {
+            "type": "object",
+            "properties": {"id": {"type": "string"}},
+        },
+    }
+    leaves = extract_leaves(schema, base_path="$.body")
+    paths = {leaf.json_path for leaf in leaves}
+    assert any("[*]" in p for p in paths), "array → [*] wildcard 경로"
+
+
+def test_extract_leaves_captures_enum():
+    schema = {
+        "type": "object",
+        "properties": {
+            "status": {"type": "string", "enum": ["pending", "shipped"]},
+        },
+    }
+    leaves = extract_leaves(schema, base_path="$")
+    status = next(leaf for leaf in leaves if leaf.field_name == "status")
+    assert status.enum == ["pending", "shipped"]
+
+
+# ─── consumes — enum 추출 회귀 (리뷰 🟢 항목) ──
+
+
+def test_query_param_enum_extracted_openapi3():
+    """OpenAPI 3.x query param 의 schema.enum 이 FieldLeaf.enum 에 들어가야."""
+    operation = {
+        "parameters": [
+            {
+                "name": "sort",
+                "in": "query",
+                "required": True,
+                "schema": {"type": "string", "enum": ["asc", "desc"]},
+            },
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation)
+    by_name = {leaf.field_name: leaf for leaf in leaves}
+    assert "sort" in by_name
+    assert by_name["sort"].enum == ["asc", "desc"]
+
+
+def test_query_param_enum_extracted_swagger2():
+    """Swagger 2.0 query param 의 enum (parameter level) 도 잡아야."""
+    operation = {
+        "parameters": [
+            {
+                "name": "type",
+                "in": "query",
+                "required": True,
+                "type": "string",
+                "enum": ["A", "B", "C"],
+            },
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation, is_swagger2=True)
+    type_leaf = next(leaf for leaf in leaves if leaf.field_name == "type")
+    assert type_leaf.enum == ["A", "B", "C"]
+
+
+def test_path_param_enum_extracted():
+    """Path param 의 enum 도 동일."""
+    operation = {
+        "parameters": [
+            {
+                "name": "kind",
+                "in": "path",
+                "required": True,
+                "schema": {"type": "string", "enum": ["x", "y"]},
+            },
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation)
+    kind = next(leaf for leaf in leaves if leaf.field_name == "kind")
+    assert kind.enum == ["x", "y"]
+
+
+def test_param_without_enum_has_empty_list():
+    """enum 없는 일반 param 은 enum=[] 으로 들어가야 (None 아님)."""
+    operation = {
+        "parameters": [
+            {"name": "page", "in": "query", "schema": {"type": "integer"}},
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation, required_only=False)
+    page = next(leaf for leaf in leaves if leaf.field_name == "page")
+    assert page.enum == []
+
+
+# ─── produces ──
+
+
+def test_extract_produces_walks_response_body():
+    operation = {
+        "responses": {
+            "200": {
+                "content": {
+                    "application/json": {
+                        "schema": {
+                            "type": "object",
+                            "properties": {
+                                "data": {
+                                    "type": "object",
+                                    "properties": {
+                                        "id": {"type": "string"},
+                                    },
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+        },
+    }
+    leaves = extract_produces_for_operation(operation)
+    paths = {leaf.json_path for leaf in leaves}
+    assert "$.data.id" in paths
+
+
+def test_consumes_skips_optional_when_required_only():
+    operation = {
+        "parameters": [
+            {"name": "must", "in": "query", "required": True, "schema": {"type": "string"}},
+            {"name": "maybe", "in": "query", "required": False, "schema": {"type": "string"}},
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation)
+    names = {leaf.field_name for leaf in leaves}
+    assert "must" in names
+    assert "maybe" not in names
diff --git a/tests/test_plan_binding.py b/tests/test_plan_binding.py
new file mode 100644
index 0000000..139860e
--- /dev/null
+++ b/tests/test_plan_binding.py
@@ -0,0 +1,70 @@
+"""Unit tests for ``graph_tool_call.plan.binding``.
+
+binding placeholder resolution + error 동작.
+"""
+from __future__ import annotations
+
+import pytest
+
+from graph_tool_call.plan.binding import BindingError, resolve_bindings
+
+
+def test_literal_passes_through():
+    assert resolve_bindings("hello", {}) == "hello"
+    assert resolve_bindings(42, {}) == 42
+    assert resolve_bindings(None, {}) is None
+
+
+def test_simple_lookup():
+    ctx = {"s1": {"foo": "BAR"}}
+    assert resolve_bindings("${s1.foo}", ctx) == "BAR"
+
+
+def test_full_step_object():
+    ctx = {"s1": {"a": 1, "b": 2}}
+    assert resolve_bindings("${s1}", ctx) == {"a": 1, "b": 2}
+
+
+def test_array_index():
+    ctx = {"s1": {"items": [{"id": "A"}, {"id": "B"}]}}
+    assert resolve_bindings("${s1.items[0].id}", ctx) == "A"
+    assert resolve_bindings("${s1.items[1].id}", ctx) == "B"
+
+
+def test_array_negative_index():
+    ctx = {"s1": [10, 20, 30]}
+    assert resolve_bindings("${s1[-1]}", ctx) == 30
+
+
+def test_unknown_source_raises():
+    with pytest.raises(BindingError, match="unknown source"):
+        resolve_bindings("${ghost.x}", {"s1": {}})
+
+
+def test_dict_walks_recursively():
+    ctx = {"s1": {"v": 9}}
+    out = resolve_bindings(
+        {"a": "${s1.v}", "b": "literal", "nested": {"c": "${s1.v}"}},
+        ctx,
+    )
+    assert out == {"a": 9, "b": "literal", "nested": {"c": 9}}
+
+
+def test_list_walks_recursively():
+    ctx = {"s1": {"v": "X"}}
+    out = resolve_bindings(["${s1.v}", "lit", {"k": "${s1.v}"}], ctx)
+    assert out == ["X", "lit", {"k": "X"}]
+
+
+def test_oob_index_raises():
+    ctx = {"s1": [1, 2]}
+    with pytest.raises(BindingError, match="out of range"):
+        resolve_bindings("${s1[5]}", ctx)
+
+
+def test_input_alias_lookup():
+    """input / user_input 둘 다 같은 값 가리키도록 caller 가 등록한 케이스."""
+    shared = {"keyword": "shoes"}
+    ctx = {"input": shared, "user_input": shared}
+    assert resolve_bindings("${input.keyword}", ctx) == "shoes"
+    assert resolve_bindings("${user_input.keyword}", ctx) == "shoes"
diff --git a/tests/test_plan_runner.py b/tests/test_plan_runner.py
new file mode 100644
index 0000000..923522d
--- /dev/null
+++ b/tests/test_plan_runner.py
@@ -0,0 +1,220 @@
+"""Unit tests for ``graph_tool_call.plan.runner``.
+
+리뷰 CRITICAL #1, #2 회귀 방지 + 핵심 동작 cover.
+"""
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from graph_tool_call.plan import (
+    Plan,
+    PlanRunner,
+    PlanStep,
+)
+from graph_tool_call.plan.runner import (
+    PlanAborted,
+    PlanCompleted,
+)
+
+
+def _echo(name: str, args: dict[str, Any]) -> dict[str, Any]:
+    return {"echoed": args, "tool": name}
+
+
+# ─── CRITICAL #1: input_context 가 ${user_input.x} / ${input.x} 둘 다 resolve ──
+
+
+def test_user_input_alias_resolves():
+    """``${user_input.foo}`` 가 input_context["foo"] 로 resolve 되어야 한다.
+
+    이전엔 synthesizer 가 ${user_input.x} 만들고 runner 가 context["input"] 에만
+    심어서 첫 step 부터 BindingError 로 abort 됐던 케이스.
+    """
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="echo", args={"foo": "${user_input.foo}"}),
+        ],
+        output_binding="${s1}",
+    )
+    trace = PlanRunner(_echo).run(plan, input_context={"foo": "BAR"})
+    assert trace.success, f"plan should succeed, got: {trace.failed_step}"
+    assert trace.steps[0].args_resolved == {"foo": "BAR"}
+
+
+def test_input_alias_resolves_too():
+    """``${input.foo}`` 도 동일 dict 가리켜야 한다 (backward compat)."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="echo", args={"foo": "${input.foo}"}),
+        ],
+        output_binding="${s1}",
+    )
+    trace = PlanRunner(_echo).run(plan, input_context={"foo": "BAR"})
+    assert trace.success
+    assert trace.steps[0].args_resolved == {"foo": "BAR"}
+
+
+def test_mixed_input_user_input_in_same_step():
+    """한 step 에 ${input.x} 와 ${user_input.y} 가 섞여 있어도 둘 다 resolve."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(
+                id="s1",
+                tool="echo",
+                args={"a": "${input.x}", "b": "${user_input.y}"},
+            ),
+        ],
+    )
+    trace = PlanRunner(_echo).run(plan, input_context={"x": "X", "y": "Y"})
+    assert trace.success
+    assert trace.steps[0].args_resolved == {"a": "X", "b": "Y"}
+
+
+# ─── CRITICAL #2: ExecutionTrace.steps 가 누적 ──
+
+
+def test_execution_trace_accumulates_steps():
+    """run() 의 ExecutionTrace.steps 가 빈 리스트가 아니어야 한다.
+
+    이전엔 runner.py:289 의 pass 때문에 항상 [] 였던 케이스.
+    """
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="echo", args={"x": "hello"}),
+            PlanStep(id="s2", tool="echo", args={"y": "${s1.echoed.x}"}),
+        ],
+        output_binding="${s2}",
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success
+    assert len(trace.steps) == 2, "두 step 모두 trace 에 누적돼야 함"
+    assert trace.steps[0].id == "s1"
+    assert trace.steps[1].id == "s2"
+    assert trace.steps[0].output == {"echoed": {"x": "hello"}, "tool": "echo"}
+    assert trace.steps[1].args_resolved == {"y": "hello"}, "이전 step 출력 binding"
+
+
+def test_execution_trace_includes_failed_step():
+    """실패해도 실패한 step + 그 이전 step 이 trace 에 포함."""
+    def flaky(name: str, args: dict[str, Any]) -> dict[str, Any]:
+        if name == "boom":
+            raise RuntimeError("simulated")
+        return {"ok": True}
+
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="ok"),
+            PlanStep(id="s2", tool="boom"),
+            PlanStep(id="s3", tool="never_called"),
+        ],
+    )
+    trace = PlanRunner(flaky).run(plan)
+    assert trace.success is False
+    assert trace.failed_step == "s2"
+    assert len(trace.steps) == 2, "실패까지의 step 만 누적 (s3 는 도달 안 함)"
+    assert trace.steps[0].id == "s1"
+    assert trace.steps[0].error is None
+    assert trace.steps[1].id == "s2"
+    assert trace.steps[1].error is not None
+    assert "simulated" in trace.steps[1].error["message"]
+
+
+# ─── 일반 동작 ──
+
+
+def test_run_stream_yields_expected_events_in_order():
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "hi"})],
+    )
+    events = list(PlanRunner(_echo).run_stream(plan))
+    types = [e.type for e in events]
+    assert types[0] == "plan.started"
+    assert types[-1] == "plan.completed"
+    assert "step.started" in types
+    assert "step.completed" in types
+
+
+def test_plan_completed_carries_trace_steps():
+    """run_stream 의 PlanCompleted 가 trace_steps 를 실어 보내야 run() 이 읽을 수 있음."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "hi"})],
+    )
+    completed = next(
+        e for e in PlanRunner(_echo).run_stream(plan)
+        if isinstance(e, PlanCompleted)
+    )
+    assert len(completed.trace_steps) == 1
+    assert completed.trace_steps[0].id == "s1"
+
+
+def test_plan_aborted_carries_trace_steps():
+    """abort 시에도 PlanAborted 가 그때까지의 trace_steps 를 실어 보내야 함."""
+    def fail(name: str, args: dict[str, Any]) -> dict[str, Any]:
+        raise RuntimeError("boom")
+
+    plan = Plan(id="t", goal="g", steps=[PlanStep(id="s1", tool="x")])
+    aborted = next(
+        e for e in PlanRunner(fail).run_stream(plan)
+        if isinstance(e, PlanAborted)
+    )
+    assert len(aborted.trace_steps) == 1
+    assert aborted.trace_steps[0].error is not None
+
+
+def test_binding_to_unknown_source_aborts():
+    """존재하지 않는 step id 참조 → BindingError → abort."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "${ghost.foo}"})],
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success is False
+    assert trace.failed_step == "s1"
+    assert trace.steps[0].error["kind"] == "binding"
+
+
+def test_output_binding_resolves_nested_path():
+    """output_binding 이 step 응답 안의 nested path 를 가리킬 수 있어야."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"v": 42})],
+        output_binding="${s1.echoed.v}",
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success
+    assert trace.output == 42
+
+
+def test_no_input_context_works_when_plan_has_no_input_binding():
+    """input_context 안 줘도 ${input.x} 안 쓰면 동작."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "literal"})],
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success
+
+
+def test_v1_only_supports_abort_on_error():
+    """v1 PlanRunner 는 on_error='abort' 만 허용 — 다른 값은 ValueError."""
+    with pytest.raises(ValueError):
+        PlanRunner(_echo, on_error="continue")
diff --git a/tests/test_plan_synthesizer.py b/tests/test_plan_synthesizer.py
new file mode 100644
index 0000000..7ad4717
--- /dev/null
+++ b/tests/test_plan_synthesizer.py
@@ -0,0 +1,172 @@
+"""Unit tests for ``graph_tool_call.plan.synthesizer``.
+
+핵심 합성 시나리오 + Cycle/F2 fallback 의 user_input placeholder 출력.
+"""
+from __future__ import annotations
+
+import pytest
+
+from graph_tool_call.plan.synthesizer import (
+    PathSynthesizer,
+    PlanSynthesisError,
+    _normalize_field_name,
+)
+
+
+def _basic_graph() -> dict:
+    """포함:
+      - 'searchProduct': 입력=keyword, 출력=goodsNo (semantic=goods.id)
+      - 'getProductDetail': 입력=goodsNo (semantic=goods.id) → 의존
+    """
+    return {
+        "tools": {
+            "searchProduct": {
+                "metadata": {
+                    "method": "GET",
+                    "path": "/api/v1/products",
+                    "consumes": [
+                        {"field_name": "keyword", "kind": "data", "required": True}
+                    ],
+                    "produces": [
+                        {
+                            "field_name": "goodsNo",
+                            "json_path": "$.body.items[*].goodsNo",
+                            "semantic_tag": "goods.id",
+                        }
+                    ],
+                    "ai_metadata": {
+                        "canonical_action": "search",
+                        "primary_resource": "product",
+                    },
+                },
+            },
+            "getProductDetail": {
+                "metadata": {
+                    "method": "GET",
+                    "path": "/api/v1/products/{goodsNo}",
+                    "consumes": [
+                        {
+                            "field_name": "goodsNo",
+                            "semantic_tag": "goods.id",
+                            "kind": "data",
+                            "required": True,
+                        }
+                    ],
+                    "produces": [
+                        {"field_name": "name", "json_path": "$.body.name"}
+                    ],
+                    "ai_metadata": {
+                        "canonical_action": "read",
+                        "primary_resource": "product",
+                    },
+                },
+            },
+        },
+    }
+
+
+# ─── normalize_field_name ──
+
+
+def test_normalize_field_name_collapses_separators():
+    assert _normalize_field_name("ord_no") == "ordno"
+    assert _normalize_field_name("ORD-NO") == "ordno"
+    assert _normalize_field_name("ordNo") == "ordno"
+
+
+def test_normalize_field_name_keeps_token_roots_distinct():
+    """ord ≠ order — token-level synonym mapping은 안 함."""
+    assert _normalize_field_name("ordNo") != _normalize_field_name("orderNo")
+
+
+def test_normalize_field_name_empty():
+    assert _normalize_field_name("") == ""
+    assert _normalize_field_name(None) == ""  # type: ignore[arg-type]
+
+
+# ─── synthesizer 핵심 동작 ──
+
+
+def test_synthesize_uses_entity_when_available():
+    """user 가 keyword 를 entity 로 줬으면 검색 step 1개로 끝나야."""
+    syn = PathSynthesizer(_basic_graph())
+    plan = syn.synthesize(target="searchProduct", entities={"keyword": "shoes"})
+    assert len(plan.steps) == 1
+    assert plan.steps[0].tool == "searchProduct"
+    assert plan.steps[0].args == {"keyword": "shoes"}
+
+
+def test_synthesize_chains_producer_when_entity_missing():
+    """getProductDetail 호출하려면 goodsNo 가 필요 — searchProduct 가 producer.
+
+    keyword 만 entity 로 주면 chain: searchProduct → getProductDetail.
+    합성 후 step 이름은 ``s1``/``s2`` 로 정렬되고, binding 도 그에 맞게 rewrite 됨.
+    """
+    syn = PathSynthesizer(_basic_graph())
+    plan = syn.synthesize(
+        target="getProductDetail", entities={"keyword": "shoes"},
+    )
+    assert len(plan.steps) == 2, "검색 + 상세조회 2-step chain"
+    assert plan.steps[0].tool == "searchProduct"
+    assert plan.steps[1].tool == "getProductDetail"
+    binding = plan.steps[1].args.get("goodsNo", "")
+    # step_id 순서 정렬 후 binding 은 ${s1...} 로 rewrite — 첫 step 의 출력 가리킴
+    assert binding.startswith("${"), "binding placeholder 형식이어야"
+    assert "s1" in binding, f"첫 step (s1) 출력 binding 이어야, got {binding}"
+    assert "goodsNo" in binding, "produces 필드 경로 포함"
+
+
+def test_synthesize_falls_back_to_user_input_placeholder():
+    """필수 field 인데 entity 도 없고 producer 도 없으면 ``${user_input.X}`` 로 fallback.
+
+    F2 + Cycle policy B 의 핵심 동작 — abort 대신 caller 에게 슬롯을 surface.
+    runner 가 input_context 에 ``user_input`` 별칭으로 등록하므로
+    plan 자체는 합성되고, 실행 시 caller 가 값을 공급하면 작동한다.
+    """
+    g = {
+        "tools": {
+            "needsX": {
+                "metadata": {
+                    "consumes": [
+                        {"field_name": "mysteryField", "kind": "data", "required": True}
+                    ],
+                    "produces": [],
+                    "ai_metadata": {"canonical_action": "read"},
+                },
+            },
+        },
+    }
+    syn = PathSynthesizer(g)
+    plan = syn.synthesize(target="needsX", entities={})
+    assert len(plan.steps) == 1
+    assert plan.steps[0].args == {"mysteryField": "${user_input.mysteryField}"}
+
+
+def test_synthesize_unknown_target_raises():
+    syn = PathSynthesizer(_basic_graph())
+    with pytest.raises(PlanSynthesisError):
+        syn.synthesize(target="ghostTool", entities={})
+
+
+def test_synthesize_context_field_uses_collection_default():
+    """kind=context 인 필드는 entity 없으면 context_defaults 에서 채움."""
+    g = {
+        "tools": {
+            "needsLocale": {
+                "metadata": {
+                    "consumes": [
+                        {
+                            "field_name": "locale",
+                            "kind": "context",
+                            "required": True,
+                        }
+                    ],
+                    "produces": [],
+                    "ai_metadata": {"canonical_action": "read"},
+                },
+            },
+        },
+    }
+    syn = PathSynthesizer(g, context_defaults={"locale": "ko_KR"})
+    plan = syn.synthesize(target="needsLocale", entities={})
+    assert plan.steps[0].args == {"locale": "ko_KR"}

From 20245604209250c88092a79904dc40517bfc7339 Mon Sep 17 00:00:00 2001
From: daehee <1998opening@gmail.com>
Date: Sun, 3 May 2026 19:16:40 +0900
Subject: [PATCH 12/14] =?UTF-8?q?fix(core/tool):=20=5FANNOTATION=5FBY=5FVE?=
 =?UTF-8?q?RB=20=EC=97=90=20register/regist/reg/insert=20=EC=B6=94?=
 =?UTF-8?q?=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

코드 리뷰의 _VERB_TO_INTENT['reg'] 누락과 동일 패턴 — sibling vocabulary
인 _ANNOTATION_BY_VERB (MCP annotation 추론용) 도 register 계열이 통째로
빠져 있었다. 동작 자체는 망가지지 않지만 registerUser / insertOrder
같은 도구가 read_only_hint / destructive_hint annotation 없이 MCP
클라이언트에 노출됨.

회귀 테스트 추가: 두 dict 간 register 계열 커버리지 일관성 검증.
---
 graph_tool_call/core/tool.py   | 20 ++++++++++++++++++++
 tests/test_dependency_verbs.py | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/graph_tool_call/core/tool.py b/graph_tool_call/core/tool.py
index 25df150..b3e9d71 100644
--- a/graph_tool_call/core/tool.py
+++ b/graph_tool_call/core/tool.py
@@ -408,6 +408,26 @@ def parse_tool(tool: Any) -> ToolSchema:
         destructive_hint=False,
         idempotent_hint=False,
     ),
+    "insert": MCPAnnotations(
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
+    "register": MCPAnnotations(
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
+    "regist": MCPAnnotations(  # 일부 코드베이스 약어 (registUser, registOrder)
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
+    "reg": MCPAnnotations(  # camelCase 짧은 약어 (regGoodsApprove)
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
     # update verbs
     "update": MCPAnnotations(
         read_only_hint=False,
diff --git a/tests/test_dependency_verbs.py b/tests/test_dependency_verbs.py
index e583d05..ccca65f 100644
--- a/tests/test_dependency_verbs.py
+++ b/tests/test_dependency_verbs.py
@@ -23,3 +23,21 @@ def test_basic_verbs_unchanged():
     assert _VERB_TO_INTENT.get("create") == "write"
     assert _VERB_TO_INTENT.get("update") == "update"
     assert _VERB_TO_INTENT.get("delete") == "delete"
+
+
+# ─── _ANNOTATION_BY_VERB sibling 일관성 (잠복 결함) ──
+
+
+def test_annotation_by_verb_covers_register_family():
+    """``_ANNOTATION_BY_VERB`` 도 register 계열 커버해야 — _VERB_TO_INTENT 와 sibling.
+
+    ``registerUser`` / ``insertOrder`` / ``regGoodsApprove`` 같은 도구가 MCP
+    annotation 을 받을 수 있어야 한다 (read_only_hint=False, ...).
+    """
+    from graph_tool_call.core.tool import _ANNOTATION_BY_VERB
+    for verb in ("register", "regist", "reg", "insert"):
+        assert verb in _ANNOTATION_BY_VERB, (
+            f"verb {verb!r} 누락 — _VERB_TO_INTENT 와 sibling vocabulary 불일치"
+        )
+        assert _ANNOTATION_BY_VERB[verb].read_only_hint is False
+        assert _ANNOTATION_BY_VERB[verb].destructive_hint is False

From 3d37a5ce70e16a2987c77e5fac9282504391d91e Mon Sep 17 00:00:00 2001
From: daehee <1998opening@gmail.com>
Date: Sun, 3 May 2026 19:50:15 +0900
Subject: [PATCH 13/14] =?UTF-8?q?style:=20ruff=20format=20=EC=A0=81?=
 =?UTF-8?q?=EC=9A=A9=20=E2=80=94=20=EC=8B=A0=EA=B7=9C=20=ED=85=8C=EC=8A=A4?=
 =?UTF-8?q?=ED=8A=B8=205=EA=B0=9C=20=ED=8C=8C=EC=9D=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI 의 'ruff format --check .' 실패 해소. 동작 변경 없음.
---
 tests/test_dependency_verbs.py |  2 ++
 tests/test_io_contract.py      |  1 +
 tests/test_plan_binding.py     |  1 +
 tests/test_plan_runner.py      | 13 +++++--------
 tests/test_plan_synthesizer.py | 20 ++++++++------------
 5 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/tests/test_dependency_verbs.py b/tests/test_dependency_verbs.py
index ccca65f..756e8a4 100644
--- a/tests/test_dependency_verbs.py
+++ b/tests/test_dependency_verbs.py
@@ -2,6 +2,7 @@
 
 특히 'reg' 약어가 'write' intent 로 분류되는지 확인 (리뷰 🟢 항목).
 """
+
 from __future__ import annotations
 
 from graph_tool_call.analyze.dependency import _VERB_TO_INTENT
@@ -35,6 +36,7 @@ def test_annotation_by_verb_covers_register_family():
     annotation 을 받을 수 있어야 한다 (read_only_hint=False, ...).
     """
     from graph_tool_call.core.tool import _ANNOTATION_BY_VERB
+
     for verb in ("register", "regist", "reg", "insert"):
         assert verb in _ANNOTATION_BY_VERB, (
             f"verb {verb!r} 누락 — _VERB_TO_INTENT 와 sibling vocabulary 불일치"
diff --git a/tests/test_io_contract.py b/tests/test_io_contract.py
index b9b9b84..865b646 100644
--- a/tests/test_io_contract.py
+++ b/tests/test_io_contract.py
@@ -2,6 +2,7 @@
 
 특히 query/path parameter 의 enum 추출 (리뷰에서 빠뜨려진 부분) 확인.
 """
+
 from __future__ import annotations
 
 from graph_tool_call.ingest.io_contract import (
diff --git a/tests/test_plan_binding.py b/tests/test_plan_binding.py
index 139860e..eee0ae9 100644
--- a/tests/test_plan_binding.py
+++ b/tests/test_plan_binding.py
@@ -2,6 +2,7 @@
 
 binding placeholder resolution + error 동작.
 """
+
 from __future__ import annotations
 
 import pytest
diff --git a/tests/test_plan_runner.py b/tests/test_plan_runner.py
index 923522d..a4cf216 100644
--- a/tests/test_plan_runner.py
+++ b/tests/test_plan_runner.py
@@ -2,6 +2,7 @@
 
 리뷰 CRITICAL #1, #2 회귀 방지 + 핵심 동작 cover.
 """
+
 from __future__ import annotations
 
 from typing import Any
@@ -106,6 +107,7 @@ def test_execution_trace_accumulates_steps():
 
 def test_execution_trace_includes_failed_step():
     """실패해도 실패한 step + 그 이전 step 이 trace 에 포함."""
+
     def flaky(name: str, args: dict[str, Any]) -> dict[str, Any]:
         if name == "boom":
             raise RuntimeError("simulated")
@@ -155,24 +157,19 @@ def test_plan_completed_carries_trace_steps():
         goal="g",
         steps=[PlanStep(id="s1", tool="echo", args={"x": "hi"})],
     )
-    completed = next(
-        e for e in PlanRunner(_echo).run_stream(plan)
-        if isinstance(e, PlanCompleted)
-    )
+    completed = next(e for e in PlanRunner(_echo).run_stream(plan) if isinstance(e, PlanCompleted))
     assert len(completed.trace_steps) == 1
     assert completed.trace_steps[0].id == "s1"
 
 
 def test_plan_aborted_carries_trace_steps():
     """abort 시에도 PlanAborted 가 그때까지의 trace_steps 를 실어 보내야 함."""
+
     def fail(name: str, args: dict[str, Any]) -> dict[str, Any]:
         raise RuntimeError("boom")
 
     plan = Plan(id="t", goal="g", steps=[PlanStep(id="s1", tool="x")])
-    aborted = next(
-        e for e in PlanRunner(fail).run_stream(plan)
-        if isinstance(e, PlanAborted)
-    )
+    aborted = next(e for e in PlanRunner(fail).run_stream(plan) if isinstance(e, PlanAborted))
     assert len(aborted.trace_steps) == 1
     assert aborted.trace_steps[0].error is not None
 
diff --git a/tests/test_plan_synthesizer.py b/tests/test_plan_synthesizer.py
index 7ad4717..d1793b9 100644
--- a/tests/test_plan_synthesizer.py
+++ b/tests/test_plan_synthesizer.py
@@ -2,6 +2,7 @@
 
 핵심 합성 시나리오 + Cycle/F2 fallback 의 user_input placeholder 출력.
 """
+
 from __future__ import annotations
 
 import pytest
@@ -15,8 +16,8 @@
 
 def _basic_graph() -> dict:
     """포함:
-      - 'searchProduct': 입력=keyword, 출력=goodsNo (semantic=goods.id)
-      - 'getProductDetail': 입력=goodsNo (semantic=goods.id) → 의존
+    - 'searchProduct': 입력=keyword, 출력=goodsNo (semantic=goods.id)
+    - 'getProductDetail': 입력=goodsNo (semantic=goods.id) → 의존
     """
     return {
         "tools": {
@@ -24,9 +25,7 @@ def _basic_graph() -> dict:
                 "metadata": {
                     "method": "GET",
                     "path": "/api/v1/products",
-                    "consumes": [
-                        {"field_name": "keyword", "kind": "data", "required": True}
-                    ],
+                    "consumes": [{"field_name": "keyword", "kind": "data", "required": True}],
                     "produces": [
                         {
                             "field_name": "goodsNo",
@@ -52,9 +51,7 @@ def _basic_graph() -> dict:
                             "required": True,
                         }
                     ],
-                    "produces": [
-                        {"field_name": "name", "json_path": "$.body.name"}
-                    ],
+                    "produces": [{"field_name": "name", "json_path": "$.body.name"}],
                     "ai_metadata": {
                         "canonical_action": "read",
                         "primary_resource": "product",
@@ -104,7 +101,8 @@ def test_synthesize_chains_producer_when_entity_missing():
     """
     syn = PathSynthesizer(_basic_graph())
     plan = syn.synthesize(
-        target="getProductDetail", entities={"keyword": "shoes"},
+        target="getProductDetail",
+        entities={"keyword": "shoes"},
     )
     assert len(plan.steps) == 2, "검색 + 상세조회 2-step chain"
     assert plan.steps[0].tool == "searchProduct"
@@ -127,9 +125,7 @@ def test_synthesize_falls_back_to_user_input_placeholder():
         "tools": {
             "needsX": {
                 "metadata": {
-                    "consumes": [
-                        {"field_name": "mysteryField", "kind": "data", "required": True}
-                    ],
+                    "consumes": [{"field_name": "mysteryField", "kind": "data", "required": True}],
                     "produces": [],
                     "ai_metadata": {"canonical_action": "read"},
                 },

From 5c2370c015e1f4cec6bbd88b0e6167a1197ba669 Mon Sep 17 00:00:00 2001
From: daehee <1998opening@gmail.com>
Date: Wed, 6 May 2026 01:13:52 +0900
Subject: [PATCH 14/14] =?UTF-8?q?feat(plan):=20Stage=204=20prompt=20count?=
 =?UTF-8?q?=20=EC=A0=95=ED=99=95=EC=84=B1=20+=20path=20param=20required=20?=
 =?UTF-8?q?=EA=B0=95=EC=A0=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- response.py: _SUCCESS_PROMPT에 count/total 처리 지침 추가 — totalCount 등
  명시적 total 필드가 있으면 사용하고, 없으면 "N개 등록" 같은 단정 금지.
  result_char_limit 2000→4000으로 늘려 list 응답 truncate 완화.
- ingest/openapi.py: Swagger 2 / OpenAPI 3 둘 다 path 파라미터를 무조건
  required=True로 마킹. 많은 spec이 명시 안 해도 URL placeholder라 호출 시
  반드시 값 필요. synthesizer가 빈 entity로 plan 생성하던 회귀 차단.
---
 graph_tool_call/ingest/openapi.py | 10 ++++++++++
 graph_tool_call/plan/response.py  | 14 +++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/graph_tool_call/ingest/openapi.py b/graph_tool_call/ingest/openapi.py
index 8f93dea..8f53173 100644
--- a/graph_tool_call/ingest/openapi.py
+++ b/graph_tool_call/ingest/openapi.py
@@ -202,6 +202,11 @@ def _extract_params_swagger2(
                 )
         else:
             is_required = p.get("required", False)
+            # OpenAPI 3.x / Swagger 2.0: path 파라미터는 본질적으로 required.
+            # 많은 spec이 명시 안 해도 URL placeholder라 호출 시 반드시 값이 있어야 함.
+            # synthesizer가 required로 인식하지 못해 빈 entity로 plan 생성 → HTTP 호출 실패 케이스 차단.
+            if location == "path":
+                is_required = True
             if required_only and not is_required:
                 continue
             params.append(
@@ -308,6 +313,11 @@ def _extract_params_openapi3(
             continue  # skip malformed parameters (missing required 'name' field)
         schema = p.get("schema", {})
         is_required = p.get("required", False)
+        # OpenAPI 3.x: path 파라미터는 본질적으로 required (URL placeholder 채우려면 필수).
+        # 많은 spec이 명시 안 해도 강제로 required 처리해야 synthesizer가 빈 entity를
+        # UnsatisfiableFieldError로 raise → question.required popup으로 사용자에게 묻는다.
+        if p.get("in") == "path":
+            is_required = True
         ptype = _schema_type(schema)
 
         # Wrapper-object/array query parameter handling.
diff --git a/graph_tool_call/plan/response.py b/graph_tool_call/plan/response.py
index 714b5d4..4eefdfc 100644
--- a/graph_tool_call/plan/response.py
+++ b/graph_tool_call/plan/response.py
@@ -35,6 +35,18 @@
 Respond in Korean unless the user's question is clearly in another language.
 Keep it concise — 1~3 sentences for simple answers, short bullet list for
 multi-item results. Do not invent data not present in the result.
+
+CRITICAL — count/total claims:
+- The result above may be **truncated** for length. The list you see is NOT
+  necessarily the complete list.
+- If the result contains an explicit total field (e.g. ``totalCount``,
+  ``totalElements``, ``total``, ``count``, ``size`` at top-level or inside
+  ``payload`` / ``data``), USE THAT NUMBER as the actual count and say
+  "총 N개 중 일부" or similar.
+- If no total field exists, do NOT claim a specific count. Avoid phrases like
+  "현재 1개 등록되어 있습니다" — instead say "조회된 리뷰" or
+  "응답에 포함된 항목". Counting visible list items as the absolute total
+  is forbidden.
 """
 
 
@@ -69,7 +81,7 @@ def synthesize_success_response(
     requirement: str,
     result: Any,
     llm: OntologyLLM,
-    result_char_limit: int = 2000,
+    result_char_limit: int = 4000,
 ) -> str:
     """Success case — plan completed, convert output to NL answer."""
     prompt = _SUCCESS_PROMPT.format(