diff --git a/.gitignore b/.gitignore
index b8d4330..2dceec9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,6 @@ htmlcov/
 
 # Personal memo
 memo/
+
+# Benchmark output (timestamped, auto-generated)
+benchmarks/results/benchmark_*.json
diff --git a/benchmarks/results/benchmark_e2e_20260407_014809.json b/benchmarks/results/benchmark_e2e_20260407_014809.json
deleted file mode 100644
index 3f72158..0000000
--- a/benchmarks/results/benchmark_e2e_20260407_014809.json
+++ /dev/null
@@ -1,740 +0,0 @@
-{
-  "timestamp": "2026-04-07T01:45:56.543757+00:00",
-  "model": "Bonsai-8B.gguf",
-  "mode": "e2e",
-  "top_k": 5,
-  "datasets": [
-    {
-      "name": "Petstore 3.0",
-      "tool_count": 19,
-      "query_count": 23,
-      "queries": [
-        {
-          "query": "Find all available pets",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "findPetsByStatus"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "findPetsByStatus",
-            "findPetsByTags",
-            "getPetById",
-            "getOrderById",
-            "getInventory"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "findPetsByStatus",
-          "baseline_correct": true,
-          "baseline_latency_ms": 10839.814374921843,
-          "baseline_input_tokens": 1873,
-          "retrieve_tool": "findPetsByStatus",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 2766.176208970137,
-          "retrieve_input_tokens": 462,
-          "error": null
-        },
-        {
-          "query": "Add a new dog to the pet store",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "addPet"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "addPet",
-            "placeOrder",
-            "updatePetWithForm",
-            "getInventory",
-            "uploadFile"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "addPet",
-          "baseline_correct": true,
-          "baseline_latency_ms": 2812.666874960996,
-          "baseline_input_tokens": 1877,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 5412.9798751091585,
-          "retrieve_input_tokens": 658,
-          "error": null
-        },
-        {
-          "query": "Get pet with ID 42",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "getPetById"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "getPetById",
-            "getOrderById",
-            "getInventory",
-            "findPetsByStatus",
-            "findPetsByTags"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "getPetById",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1094.6545000188053,
-          "baseline_input_tokens": 1876,
-          "retrieve_tool": "getPetById",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 2944.9569159187376,
-          "retrieve_input_tokens": 465,
-          "error": null
-        },
-        {
-          "query": "Update the name of my pet",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "updatePet",
-            "updatePetWithForm"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "updatePetWithForm",
-            "addPet",
-            "createUser",
-            "updatePet",
-            "uploadFile"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 1457.0880000246689,
-          "baseline_input_tokens": 1875,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 4513.7791249435395,
-          "retrieve_input_tokens": 774,
-          "error": null
-        },
-        {
-          "query": "Delete pet number 7",
-          "category": "delete",
-          "difficulty": "easy",
-          "expected_tools": [
-            "deletePet"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "deletePet",
-            "deleteUser",
-            "deleteOrder",
-            "getPetById",
-            "updatePetWithForm"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "deletePet",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1022.0272920560092,
-          "baseline_input_tokens": 1874,
-          "retrieve_tool": "deletePet",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3215.026624966413,
-          "retrieve_input_tokens": 533,
-          "error": null
-        },
-        {
-          "query": "Search pets by their tags",
-          "category": "read",
-          "difficulty": "medium",
-          "expected_tools": [
-            "findPetsByTags"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "findPetsByTags",
-            "findPetsByStatus",
-            "getPetById",
-            "updatePet",
-            "addPet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "findPetsByTags",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1254.0720420656726,
-          "baseline_input_tokens": 1874,
-          "retrieve_tool": "findPetsByTags",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3970.8062920253724,
-          "retrieve_input_tokens": 630,
-          "error": null
-        },
-        {
-          "query": "Upload a photo of my pet",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "uploadFile"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "uploadFile",
-            "addPet",
-            "updatePetWithForm",
-            "placeOrder",
-            "updatePet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 1132.3371669277549,
-          "baseline_input_tokens": 1875,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 4183.0666669411585,
-          "retrieve_input_tokens": 750,
-          "error": null
-        },
-        {
-          "query": "Check the store inventory",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "getInventory"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "getInventory",
-            "getOrderById",
-            "deleteOrder",
-            "placeOrder",
-            "addPet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "getInventory",
-          "baseline_correct": true,
-          "baseline_latency_ms": 891.8864169390872,
-          "baseline_input_tokens": 1873,
-          "retrieve_tool": "getInventory",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3237.7606669906527,
-          "retrieve_input_tokens": 589,
-          "error": null
-        },
-        {
-          "query": "Place an order to buy a pet",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "placeOrder"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "placeOrder",
-            "getOrderById",
-            "deleteOrder",
-            "getPetById",
-            "deletePet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 3977.326374966651,
-          "baseline_input_tokens": 1876,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 3718.421708093956,
-          "retrieve_input_tokens": 551,
-          "error": null
-        },
-        {
-          "query": "Look up order number 5",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "getOrderById"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "getOrderById",
-            "deleteOrder",
-            "placeOrder",
-            "getPetById",
-            "addPet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "getOrderById",
-          "baseline_correct": true,
-          "baseline_latency_ms": 954.2415420291945,
-          "baseline_input_tokens": 1875,
-          "retrieve_tool": "getOrderById",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3530.3365000290796,
-          "retrieve_input_tokens": 613,
-          "error": null
-        },
-        {
-          "query": "Cancel my order",
-          "category": "delete",
-          "difficulty": "easy",
-          "expected_tools": [
-            "deleteOrder"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "deleteOrder",
-            "getOrderById",
-            "placeOrder",
-            "deletePet",
-            "deleteUser"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 927.9855410568416,
-          "baseline_input_tokens": 1872,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 3115.135750034824,
-          "retrieve_input_tokens": 545,
-          "error": null
-        },
-        {
-          "query": "Create a new user account",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "createUser"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "createUser",
-            "createUsersWithListInput",
-            "addPet",
-            "getUserByName",
-            "deleteUser"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "createUser",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1530.5797919863835,
-          "baseline_input_tokens": 1874,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 7155.023292056285,
-          "retrieve_input_tokens": 619,
-          "error": null
-        },
-        {
-          "query": "Sign in with username and password",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "loginUser"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "updateUser",
-            "loginUser",
-            "createUsersWithListInput",
-            "createUser",
-            "updatePetWithForm"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "loginUser",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1160.5052499799058,
-          "baseline_input_tokens": 1875,
-          "retrieve_tool": "loginUser",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4270.270042004995,
-          "retrieve_input_tokens": 703,
-          "error": null
-        },
-        {
-          "query": "Log out of my account",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "logoutUser"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "logoutUser",
-            "loginUser",
-            "getUserByName",
-            "createUser"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "logoutUser",
-          "baseline_correct": true,
-          "baseline_latency_ms": 770.2793339267373,
-          "baseline_input_tokens": 1874,
-          "retrieve_tool": "logoutUser",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 2691.7580839945003,
-          "retrieve_input_tokens": 491,
-          "error": null
-        },
-        {
-          "query": "View user profile for john123",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "getUserByName"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "getUserByName",
-            "loginUser",
-            "logoutUser",
-            "deleteUser",
-            "createUser"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "getUserByName",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1026.410625083372,
-          "baseline_input_tokens": 1877,
-          "retrieve_tool": "getUserByName",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3224.2564579937607,
-          "retrieve_input_tokens": 560,
-          "error": null
-        },
-        {
-          "query": "Change user email address",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "updateUser"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "createUser",
-            "createUsersWithListInput",
-            "updateUser",
-            "getUserByName",
-            "deleteUser"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 963.7468330329284,
-          "baseline_input_tokens": 1873,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 3086.039707995951,
-          "retrieve_input_tokens": 635,
-          "error": null
-        },
-        {
-          "query": "Remove user john123",
-          "category": "delete",
-          "difficulty": "easy",
-          "expected_tools": [
-            "deleteUser"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "deleteUser",
-            "getUserByName",
-            "createUser",
-            "loginUser",
-            "updateUser"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "deleteUser",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1020.6558749778196,
-          "baseline_input_tokens": 1875,
-          "retrieve_tool": "deleteUser",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3887.1555000077933,
-          "retrieve_input_tokens": 671,
-          "error": null
-        },
-        {
-          "query": "Create multiple user accounts at once",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "createUsersWithListInput"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "createUser",
-            "createUsersWithListInput",
-            "deleteUser",
-            "deletePet",
-            "deleteOrder"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 1033.013999927789,
-          "baseline_input_tokens": 1875,
-          "retrieve_tool": "createUsersWithListInput",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 2155.3955409908667,
-          "retrieve_input_tokens": 553,
-          "error": null
-        },
-        {
-          "query": "Show me sold pets",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "findPetsByStatus"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "findPetsByStatus",
-            "findPetsByTags",
-            "getPetById",
-            "getInventory",
-            "deletePet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "findPetsByStatus",
-          "baseline_correct": true,
-          "baseline_latency_ms": 978.3695839578286,
-          "baseline_input_tokens": 1873,
-          "retrieve_tool": "findPetsByStatus",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 1738.3150000823662,
-          "retrieve_input_tokens": 472,
-          "error": null
-        },
-        {
-          "query": "I want to adopt a pet — find one, check details, then buy it",
-          "category": "workflow",
-          "difficulty": "hard",
-          "expected_tools": [
-            "findPetsByStatus",
-            "getPetById",
-            "placeOrder"
-          ],
-          "recall_at_k": 0.6666666666666666,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "findPetsByStatus",
-            "findPetsByTags",
-            "getPetById",
-            "getOrderById",
-            "getInventory"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 3996.3776669465005,
-          "baseline_input_tokens": 1885,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 4942.036333028227,
-          "retrieve_input_tokens": 474,
-          "error": null
-        },
-        {
-          "query": "Update pet using form data, not JSON body",
-          "category": "write",
-          "difficulty": "hard",
-          "expected_tools": [
-            "updatePetWithForm"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "updatePetWithForm",
-            "addPet",
-            "uploadFile",
-            "placeOrder",
-            "updatePet"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "updatePetWithForm",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1568.5455829370767,
-          "baseline_input_tokens": 1878,
-          "retrieve_tool": "updatePetWithForm",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3606.6056670388207,
-          "retrieve_input_tokens": 753,
-          "error": null
-        },
-        {
-          "query": "What pets are in the store?",
-          "category": "read",
-          "difficulty": "medium",
-          "expected_tools": [
-            "findPetsByStatus"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "findPetsByStatus",
-            "findPetsByTags",
-            "getInventory",
-            "placeOrder",
-            "updatePetWithForm"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 2117.048083106056,
-          "baseline_input_tokens": 1876,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 6635.67674998194,
-          "retrieve_input_tokens": 581,
-          "error": null
-        },
-        {
-          "query": "Remove a pet listing and also delete its order",
-          "category": "delete",
-          "difficulty": "hard",
-          "expected_tools": [
-            "deletePet",
-            "deleteOrder"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "deleteOrder",
-            "deletePet",
-            "deleteUser",
-            "placeOrder",
-            "getOrderById"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": "deletePet",
-          "baseline_correct": true,
-          "baseline_latency_ms": 1092.8001669235528,
-          "baseline_input_tokens": 1878,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 5173.107750015333,
-          "retrieve_input_tokens": 551,
-          "error": null
-        }
-      ],
-      "avg_recall_at_k": 0.9855072463768116,
-      "avg_mrr": 0.0,
-      "avg_map": 0.0,
-      "avg_ndcg_at_k": 0.0,
-      "avg_latency_ms": 0.0,
-      "stdev_recall": 0.0695048046856916,
-      "stdev_mrr": 0.0,
-      "ci_recall": [
-        0.9565217391304348,
-        1.0
-      ],
-      "ci_mrr": [
-        0.0,
-        0.0
-      ],
-      "miss_rate": 0.0,
-      "hit_rate": 1.0,
-      "recall_at_3": 0.0,
-      "recall_at_10": 0.0,
-      "avg_keyword_contribution": 0.0,
-      "avg_graph_contribution": 0.0,
-      "avg_embedding_contribution": 0.0,
-      "avg_annotation_contribution": 0.0,
-      "baseline_accuracy": 0.6521739130434783,
-      "retrieve_accuracy": 0.5652173913043478,
-      "avg_token_reduction": 0.6839270983265007,
-      "avg_baseline_latency_ms": 1896.627518206673,
-      "avg_retrieve_latency_ms": 3877.1341938788637,
-      "token_efficiency_baseline": 0.3477615746644101,
-      "token_efficiency_retrieve": 0.9535685469082372,
-      "t_statistic": -1.0,
-      "p_value": 0.3282693794948748
-    }
-  ]
-}
\ No newline at end of file
diff --git a/benchmarks/results/benchmark_e2e_20260407_015032.json b/benchmarks/results/benchmark_e2e_20260407_015032.json
deleted file mode 100644
index db37683..0000000
--- a/benchmarks/results/benchmark_e2e_20260407_015032.json
+++ /dev/null
@@ -1,947 +0,0 @@
-{
-  "timestamp": "2026-04-07T01:48:15.598411+00:00",
-  "model": "Bonsai-8B.gguf",
-  "mode": "e2e",
-  "top_k": 5,
-  "datasets": [
-    {
-      "name": "Mixed MCP Servers",
-      "tool_count": 38,
-      "query_count": 30,
-      "queries": [
-        {
-          "query": "Read the contents of config.yaml",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "read_file"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "read_file",
-            "read_multiple_files",
-            "get_file_contents",
-            "write_file",
-            "create_or_update_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "read_file",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3737.8421670291573,
-          "retrieve_input_tokens": 656,
-          "error": null
-        },
-        {
-          "query": "Write a new configuration file",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "write_file"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "write_file",
-            "edit_file",
-            "read_file",
-            "get_file_info",
-            "create_or_update_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "write_file",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4167.178874951787,
-          "retrieve_input_tokens": 667,
-          "error": null
-        },
-        {
-          "query": "List all files in the src directory",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "list_directory"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "list_directory",
-            "get_pull_request_files",
-            "directory_tree",
-            "push_files",
-            "read_multiple_files"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "list_directory",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3458.496374893002,
-          "retrieve_input_tokens": 604,
-          "error": null
-        },
-        {
-          "query": "Create the output directory",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "create_directory"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_directory",
-            "create_repository",
-            "create_branch",
-            "list_directory",
-            "directory_tree"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "create_directory",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3271.133041009307,
-          "retrieve_input_tokens": 589,
-          "error": null
-        },
-        {
-          "query": "Find all Python files in the project",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "search_files"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "push_files",
-            "read_multiple_files",
-            "search_files",
-            "get_pull_request_files",
-            "list_directory"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "search_files",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3595.6752080237493,
-          "retrieve_input_tokens": 622,
-          "error": null
-        },
-        {
-          "query": "Move the old log file to archive",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "move_file"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "move_file",
-            "edit_file",
-            "read_file",
-            "write_file",
-            "get_file_info"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "move_file",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3558.0179590033367,
-          "retrieve_input_tokens": 582,
-          "error": null
-        },
-        {
-          "query": "Check the file size and permissions",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "get_file_info"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "get_file_info",
-            "read_file",
-            "write_file",
-            "create_or_update_file",
-            "get_file_contents"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "get_file_info",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3864.9533330462873,
-          "retrieve_input_tokens": 663,
-          "error": null
-        },
-        {
-          "query": "Show the directory tree structure",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "directory_tree"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "directory_tree",
-            "create_directory",
-            "list_directory",
-            "get_file_info",
-            "get_file_contents"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "directory_tree",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3439.5341670606285,
-          "retrieve_input_tokens": 565,
-          "error": null
-        },
-        {
-          "query": "Edit the import statement in main.py",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "edit_file"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "edit_file",
-            "update_issue",
-            "list_pull_requests",
-            "list_issues",
-            "write_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "edit_file",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 6146.625166991726,
-          "retrieve_input_tokens": 972,
-          "error": null
-        },
-        {
-          "query": "Read multiple config files at once",
-          "category": "read",
-          "difficulty": "medium",
-          "expected_tools": [
-            "read_multiple_files"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "read_multiple_files",
-            "push_files",
-            "search_files",
-            "get_pull_request_files",
-            "read_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "read_multiple_files",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3947.7652920177206,
-          "retrieve_input_tokens": 619,
-          "error": null
-        },
-        {
-          "query": "Create a new issue for the bug I found",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "create_issue"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_issue",
-            "create_repository",
-            "create_branch",
-            "create_directory",
-            "create_pull_request"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 8379.22374997288,
-          "retrieve_input_tokens": 760,
-          "error": null
-        },
-        {
-          "query": "Open a pull request for my changes",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "create_pull_request"
-          ],
-          "recall_at_k": 0.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "update_pull_request_branch",
-            "get_pull_request_files",
-            "get_pull_request",
-            "merge_pull_request",
-            "get_pull_request_reviews"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 4580.256165936589,
-          "retrieve_input_tokens": 709,
-          "error": null
-        },
-        {
-          "query": "Search for repositories about machine learning",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "search_repositories"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "search_repositories",
-            "search_code",
-            "search_users",
-            "search_files",
-            "search_issues"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "search_repositories",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4150.481750024483,
-          "retrieve_input_tokens": 651,
-          "error": null
-        },
-        {
-          "query": "Fork the upstream repository",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "fork_repository"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "fork_repository",
-            "create_repository",
-            "search_repositories",
-            "create_branch",
-            "list_branches"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 3917.6195840118453,
-          "retrieve_input_tokens": 638,
-          "error": null
-        },
-        {
-          "query": "List all open issues with bug label",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "list_issues"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "list_issues",
-            "search_issues",
-            "list_directory",
-            "list_branches",
-            "list_commits"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "list_issues",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 5854.736082954332,
-          "retrieve_input_tokens": 770,
-          "error": null
-        },
-        {
-          "query": "Get the README from the GitHub repo",
-          "category": "read",
-          "difficulty": "medium",
-          "expected_tools": [
-            "get_file_contents"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "get_file_contents",
-            "get_issue",
-            "get_pull_request",
-            "get_pull_request_reviews",
-            "get_pull_request_comments"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "get_file_contents",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4693.792750011198,
-          "retrieve_input_tokens": 647,
-          "error": null
-        },
-        {
-          "query": "Merge the feature branch PR",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "merge_pull_request"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "merge_pull_request",
-            "create_branch",
-            "update_pull_request_branch",
-            "list_branches",
-            "list_commits"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 4134.696291992441,
-          "retrieve_input_tokens": 750,
-          "error": null
-        },
-        {
-          "query": "Comment on the pull request with review feedback",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "add_issue_comment"
-          ],
-          "recall_at_k": 0.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_pull_request_review",
-            "get_pull_request_comments",
-            "get_pull_request_reviews",
-            "update_pull_request_branch",
-            "get_pull_request"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 4508.81649996154,
-          "retrieve_input_tokens": 717,
-          "error": null
-        },
-        {
-          "query": "Create a new branch for the feature",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "create_branch"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_branch",
-            "create_repository",
-            "create_directory",
-            "create_issue",
-            "create_or_update_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "create_branch",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 5065.368999959901,
-          "retrieve_input_tokens": 750,
-          "error": null
-        },
-        {
-          "query": "Push the updated files to GitHub",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "push_files"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "push_files",
-            "read_multiple_files",
-            "search_files",
-            "get_pull_request_files",
-            "move_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 3782.872375100851,
-          "retrieve_input_tokens": 629,
-          "error": null
-        },
-        {
-          "query": "Search code for the function definition",
-          "category": "read",
-          "difficulty": "medium",
-          "expected_tools": [
-            "search_code"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "search_code",
-            "search_repositories",
-            "search_users",
-            "search_files",
-            "search_issues"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "search_code",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4113.567291060463,
-          "retrieve_input_tokens": 651,
-          "error": null
-        },
-        {
-          "query": "Which directories can the file server access?",
-          "category": "read",
-          "difficulty": "hard",
-          "expected_tools": [
-            "list_allowed_directories"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "list_allowed_directories",
-            "move_file",
-            "read_file",
-            "write_file",
-            "search_files"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "list_allowed_directories",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3236.2780830590054,
-          "retrieve_input_tokens": 549,
-          "error": null
-        },
-        {
-          "query": "Check details of PR number 55",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "get_pull_request"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "get_pull_request",
-            "get_issue",
-            "get_pull_request_status",
-            "merge_pull_request",
-            "get_pull_request_reviews"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "get_pull_request",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4608.008833019994,
-          "retrieve_input_tokens": 690,
-          "error": null
-        },
-        {
-          "query": "Approve the pull request after review",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "create_pull_request_review"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_pull_request_review",
-            "get_pull_request_reviews",
-            "get_pull_request_comments",
-            "get_pull_request",
-            "merge_pull_request"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": null,
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 3880.9942500665784,
-          "retrieve_input_tokens": 749,
-          "error": null
-        },
-        {
-          "query": "View the commit history",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "list_commits"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "list_commits",
-            "merge_pull_request",
-            "directory_tree",
-            "push_files",
-            "read_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "list_commits",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 5402.960458071902,
-          "retrieve_input_tokens": 707,
-          "error": null
-        },
-        {
-          "query": "Create a new GitHub repo and initialize it",
-          "category": "write",
-          "difficulty": "easy",
-          "expected_tools": [
-            "create_repository"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_branch",
-            "create_issue",
-            "create_repository",
-            "create_pull_request",
-            "create_or_update_file"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "create_repository",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 6041.763041983359,
-          "retrieve_input_tokens": 852,
-          "error": null
-        },
-        {
-          "query": "Update the issue title and close it",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "update_issue"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "update_issue",
-            "create_issue",
-            "create_or_update_file",
-            "update_pull_request_branch",
-            "get_issue"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "update_issue",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 6064.246875001118,
-          "retrieve_input_tokens": 872,
-          "error": null
-        },
-        {
-          "query": "See what files were changed in pull request 10",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "get_pull_request_files"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "get_pull_request_files",
-            "update_pull_request_branch",
-            "get_pull_request",
-            "merge_pull_request",
-            "get_pull_request_reviews"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "get_pull_request_files",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 4493.489707936533,
-          "retrieve_input_tokens": 713,
-          "error": null
-        },
-        {
-          "query": "Find all TypeScript files matching *.test.ts",
-          "category": "read",
-          "difficulty": "easy",
-          "expected_tools": [
-            "search_files"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "search_files",
-            "push_files",
-            "read_multiple_files",
-            "get_pull_request_files",
-            "list_directory"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "search_files",
-          "retrieve_correct": true,
-          "retrieve_latency_ms": 3920.63737497665,
-          "retrieve_input_tokens": 623,
-          "error": null
-        },
-        {
-          "query": "Create a file on GitHub with the deployment config",
-          "category": "write",
-          "difficulty": "medium",
-          "expected_tools": [
-            "create_or_update_file"
-          ],
-          "recall_at_k": 1.0,
-          "mrr": 0.0,
-          "average_precision": 0.0,
-          "ndcg_at_k": 0.0,
-          "latency_ms": 0.0,
-          "retrieved_tools": [
-            "create_or_update_file",
-            "write_file",
-            "create_pull_request_review",
-            "create_repository",
-            "create_branch"
-          ],
-          "score_breakdown": {},
-          "baseline_tool": null,
-          "baseline_correct": false,
-          "baseline_latency_ms": 0.0,
-          "baseline_input_tokens": 0,
-          "retrieve_tool": "create_repository",
-          "retrieve_correct": false,
-          "retrieve_latency_ms": 5247.74162506219,
-          "retrieve_input_tokens": 782,
-          "error": null
-        }
-      ],
-      "avg_recall_at_k": 0.9333333333333333,
-      "avg_mrr": 0.0,
-      "avg_map": 0.0,
-      "avg_ndcg_at_k": 0.0,
-      "avg_latency_ms": 0.0,
-      "stdev_recall": 0.2537081317024624,
-      "stdev_mrr": 0.0,
-      "ci_recall": [
-        0.8333333333333334,
-        1.0
-      ],
-      "ci_mrr": [
-        0.0,
-        0.0
-      ],
-      "miss_rate": 0.06666666666666667,
-      "hit_rate": 0.9333333333333333,
-      "recall_at_3": 0.0,
-      "recall_at_10": 0.0,
-      "avg_keyword_contribution": 0.0,
-      "avg_graph_contribution": 0.0,
-      "avg_embedding_contribution": 0.0,
-      "avg_annotation_contribution": 0.0,
-      "baseline_accuracy": 0.0,
-      "retrieve_accuracy": 0.7333333333333333,
-      "avg_token_reduction": 0.0,
-      "avg_baseline_latency_ms": 0.0,
-      "avg_retrieve_latency_ms": 4508.825779139685,
-      "token_efficiency_baseline": 0.0,
-      "token_efficiency_retrieve": 1.0603431656063234,
-      "t_statistic": 8.930285549745875,
-      "p_value": 8.378144844556346e-10
-    }
-  ]
-}
\ No newline at end of file
diff --git a/docs/architecture-plan-and-execute.md b/docs/architecture-plan-and-execute.md
new file mode 100644
index 0000000..caca509
--- /dev/null
+++ b/docs/architecture-plan-and-execute.md
@@ -0,0 +1,830 @@
+# Plan-and-Execute Architecture
+
+> 작성: 2026-04-22, 업데이트: 2026-04-23
+> 상태: 확정 (설계) / 미구현
+> 범위: graph-tool-call 라이브러리 + xgen-workflow 통합
+
+## 변경 이력
+
+- **2026-04-23**: 설계 간소화
+  - Ingest 시 embedding + Qdrant 저장 **삭제** (YAGNI). Field 이름 exact match 로 충분, cross-field synonym 은 LLM enrichment 가 해결
+  - L0 에 **LLM per-tool enrichment (Pass 2)** 도입. graph-tool-call 이 이미 보유한 `OntologyLLM` 추상화 활용
+  - Stage 1 retrieval 은 기존 BM25 + graph (graph-tool-call retrieval) 재사용. embedding prefilter 생략
+  - Knowledge Base 가 **두 층** 으로 명확화: (A) 결정론적 파서 / (B) LLM semantic enrichment
+
+---
+
+## 0. 한 쪽 요약
+
+**문제:** 현재 LLM-as-orchestrator (ReAct) 는 요청당 15 iteration × ~15KB context ≈ **225KB context, 약 30초** 소요. 비용·지연·품질 모두 구조적 한계.
+
+**해결:** **사전 지식 (graph + schemas + ingest 시 LLM 의미 주석)** 을 최대한 활용하고, runtime LLM 은 자연어 ↔ 구조 변환에만 사용하는 **5-layer 아키텍처** (L0 Knowledge Base + Stage 1~4 Runtime).
+
+**기대 효과:**
+- LLM 호출 15 → 2~3회
+- Context 225KB → ~2~3KB (**~75배 감소**)
+- Latency 30초 → 2~5초 (**~10배 개선**)
+- 실행 단계 재현성, 감사 가능성 확보
+- 확장 축 확보 (fan-out, template, interactive)
+
+---
+
+## 1. 설계 원칙
+
+| # | 원칙 | 의미 |
+|---|---|---|
+| 1 | 사전 지식 최대 활용 | graph, schemas, LLM 의미 주석 (enrichment) 은 offline 구축 후 영속. 요청 처리 시 재계산 금지 |
+| 2 | LLM 은 semantic bridge 에만 | 자연어 이해 / 의미 추출 / 자연어 생성 — 그 외 결정론 |
+| 3 | 결정 가능한 것은 결정론적으로 | 매칭·순서·바인딩은 알고리즘. LLM 폴백은 **실패한 결정론의 보완** |
+| 4 | 각 단계는 독립 입출력 계약 | 테스트·캐싱·디버깅·부분 교체 가능 |
+| 5 | 하드코딩은 "학습된 지식" 으로 대체 | synonym 테이블 → LLM semantic tag, verb 사전 → LLM canonical_action (Pass 2 enrichment) |
+| 6 | Failure mode 관측 가능 | 어느 stage 에서 왜 실패했는지 항상 명확해야 함 |
+
+---
+
+## 2. 시스템 개요
+
+```
+╔═══════════════════════════════════════════════════════════════╗
+║                    OFFLINE / INGEST TIME                      ║
+║  ┌─────────────────────────────────────────────────────────┐ ║
+║  │ L0. KNOWLEDGE BASE                                       │ ║
+║  │                                                          │ ║
+║  │  Swagger → ToolSchema + LLM Enrichment +                 │ ║
+║  │            IO Contract + Tool Graph                      │ ║
+║  │                                                          │ ║
+║  │  저장: api_tool_collections.graph (JSONB)                 │ ║
+║  │       tools.ai_metadata (JSONB)                          │ ║
+║  │       tool_io_contracts (table)                          │ ║
+║  └─────────────────────────────────────────────────────────┘ ║
+╚═══════════════════════════════════════════════════════════════╝
+                            │
+                            ▼ (요청 도착)
+╔═══════════════════════════════════════════════════════════════╗
+║                    REQUEST TIME PIPELINE                      ║
+║                                                               ║
+║  requirement (자연어)                                          ║
+║     │                                                         ║
+║     ▼                                                         ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 1. RETRIEVAL + TARGET SELECTION                 │    ║
+║  │  (a) BM25 + graph retrieval: 108 → top-20             │    ║
+║  │  (b) LLM pick: 20개 catalog → target + entities       │    ║
+║  │  context: ~1KB  │  LLM: 1회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║                   ▼                                           ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 2. PATH SYNTHESIZER                             │    ║
+║  │  (결정론) target 의 consumes → IO Contract 역추적      │    ║
+║  │          → DAG 구성 + argument bindings                │    ║
+║  │  context: —     │  LLM: 0회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║         ┌─────────┴─────────┐                                 ║
+║         │                   │                                 ║
+║    확정 plan           모호 (2+ 경로)                          ║
+║         │                   │                                 ║
+║         │                   ▼                                 ║
+║         │      ┌────────────────────────────────────────┐    ║
+║         │      │ (조건부) DISAMBIGUATION                 │    ║
+║         │      │  context: ~2KB (후보만) │ LLM: 1회       │    ║
+║         │      └────────────┬───────────────────────────┘    ║
+║         │                   │                                 ║
+║         └───────────────────┘                                 ║
+║                   │                                           ║
+║                   ▼                                           ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 3. RUNNER                                       │    ║
+║  │  (결정론) DAG topological 실행                         │    ║
+║  │          JsonPath 치환 + tool_executor HTTP           │    ║
+║  │          step 단위 streaming event                     │    ║
+║  │  context: —     │  LLM: 0회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║                   ▼                                           ║
+║  ┌──────────────────────────────────────────────────────┐    ║
+║  │ STAGE 4. RESPONSE SYNTHESIS                           │    ║
+║  │  execution trace (요약) → 자연어 응답                   │    ║
+║  │  context: ~1KB  │  LLM: 1회                            │    ║
+║  └────────────────┬─────────────────────────────────────┘    ║
+║                   │                                           ║
+║                   ▼                                           ║
+║                최종 답변                                        ║
+╚═══════════════════════════════════════════════════════════════╝
+```
+
+**일반 케이스 예산:** LLM 2회, context ~2KB, 2~4초.
+**모호 케이스:** LLM 3회, context ~4KB, 4~6초.
+
+---
+
+## 3. L0 — Knowledge Base
+
+ingest 1회. 영속 저장. 요청 처리에서 재계산 금지.
+
+**두 층 구조:**
+- **Pass 1 — Deterministic parser**: Swagger 의 구조적 사실 (schema, HTTP, dependency) 추출. LLM 금지.
+- **Pass 2 — Semantic enrichment**: Description 등을 LLM 이 읽고 의미 주석 (언제 쓰는지, 무엇을 내놓는지, 누구와 쌍을 이루는지) 을 추가. graph-tool-call 의 `OntologyLLM` 추상화 재사용.
+
+### 3.1 ToolSchema (Pass 1, 기존 확장)
+
+기존 `tools` 테이블. 추가 필드는 아래 섹션들이 채움.
+
+| 필드 | 설명 | 출처 |
+|---|---|---|
+| `function_id` | 컬렉션 범위 고유 slug | 파서 |
+| `function_name` | 원본 operationId | 파서 |
+| `description` | summary + description + tags | 파서 |
+| `api_url`, `api_method`, `api_header`, `api_body` | 실행용 | 파서 |
+| `metadata` | method/path/base_url/tags/response_schema/controller/request_type/response_type | 파서 |
+| `ai_metadata` | canonical_action, primary_resource, when_to_use, pairs_well_with 등 | **Pass 2 (LLM)** |
+
+### 3.2 IO Contract (Pass 1, 결정론)
+
+각 tool 의 **필드 수준 produces/consumes** 를 swagger schema 에서 기계적으로 추출.
+
+**저장:** 신규 테이블 `tool_io_contracts`:
+```sql
+CREATE TABLE tool_io_contracts (
+  tool_id          VARCHAR(100) REFERENCES tools(function_id),
+  direction        VARCHAR(10)  CHECK (direction IN ('produces', 'consumes')),
+  json_path        TEXT,         -- $.body.goods[*].goodsNo  (produces)
+                                 -- goodsNo                   (consumes)
+  field_name       VARCHAR(100), -- goodsNo
+  field_type       VARCHAR(40),  -- integer, string, object
+  required         BOOLEAN,      -- consumes 에 한함
+  semantic_tag     VARCHAR(80)   -- Pass 2 LLM 이 채움 (빈 값 허용)
+);
+```
+
+**추출 프로세스 (LLM 없음):**
+```
+for each tool in schemas:
+  request_leaves  = walk_schema_leaves(tool.request_schema)
+  response_leaves = walk_schema_leaves(tool.response_schema)
+  
+  for each leaf in request_leaves:
+    insert consumes (field_name, type, required)
+  
+  for each leaf in response_leaves:
+    insert produces (json_path, field_name, type)
+```
+
+**1차 매칭: exact field name + type** — 동일 swagger 내에서는 field 이름 규약이 보통 일관적이므로, 이 매칭만으로 대부분의 엣지를 생성할 수 있다.
+
+```python
+# 결정론적 field match edge
+for A in tools:
+  for p in A.produces:
+    for B in tools:
+      if A == B: continue
+      for c in B.consumes.required:
+        if p.field_name == c.field_name and p.type == c.type:
+          graph.add_edge(A, B, "produces_for",
+                         binding={c.field_name: p.json_path})
+```
+
+### 3.3 Semantic Enrichment (Pass 2, LLM)
+
+**목적:** Description 등의 비정형 정보를 LLM 이 해석해 의미 주석 추가. 하드코딩된 verb 사전 / synonym 테이블 **완전 대체**.
+
+**인프라:** graph-tool-call 에 이미 있는 `OntologyLLM` 활용 ([graph_tool_call/ontology/llm_provider.py](../graph_tool_call/ontology/llm_provider.py)).
+
+**이미 제공되는 메서드:**
+- `infer_relations(tools)` — LLM 기반 관계 추론
+- `suggest_categories(tools)` — 카테고리 그룹핑
+- `verify_relations(relations, tools)` — 휴리스틱 엣지 검증 / 거르기
+- `suggest_missing(tools, existing)` — 빠진 엣지 제안
+- `enrich_keywords(tools)` — BM25 향상용 키워드
+- `generate_example_queries(tools)` — 임베딩 매칭용 예시 쿼리
+
+**신규 메서드 (추가 구현):**
+```python
+class OntologyLLM:
+    def enrich_tool_semantics(
+        self, tools: list[ToolSummary], batch_size: int = 10,
+    ) -> dict[str, ToolEnrichment]:
+        """Per-tool 의미 주석 (action, resource, use-when, semantic tags, pairs)."""
+```
+
+**ToolEnrichment 스키마:**
+```typescript
+type ToolEnrichment = {
+  canonical_action: "search" | "read" | "create" | "update" | "delete" | "action";
+  primary_resource: string;                 // 정규화 리소스명 (예: "product")
+  one_line_summary: string;                 // 한 줄 요약 (Stage 1 catalog 용)
+  when_to_use: string;                      // 언제 쓰는지
+  when_not_to_use?: string;                 // 쓰면 안 되는 경우
+  produces_semantics: Array<{               // 의미 태깅된 produces
+    semantic: string;                       // "product_id" 같은 canonical
+    json_path: string;                      // 실제 경로
+  }>;
+  consumes_semantics: Array<{
+    semantic: string;
+    field: string;
+  }>;
+  pairs_well_with: Array<{                  // 함께 / 순서대로 쓰이는 도구들
+    tool: string;
+    reason: string;
+  }>;
+}
+```
+
+**Prompt 예시:**
+```
+You are annotating an API tool for a planning system.
+
+Tool: seltSearchProduct
+Summary: 상품 검색
+Description: 키워드로 상품을 검색하는 API입니다. ...
+HTTP: GET /v1/search/product
+Request fields: [searchWord, langCd, siteNo, sort, ...]
+Response fields: [$.body.goods[*].goodsNo, $.body.goods[*].goodsName, ...]
+
+Produce JSON with:
+- canonical_action (search|read|create|update|delete|action)
+- primary_resource (one word like "product", "order", "user")
+- one_line_summary (Korean, within 40 chars)
+- when_to_use (1~2 sentences)
+- produces_semantics / consumes_semantics: map response json paths / request fields to semantic ids like "product_id"
+- pairs_well_with: 2~3 related tools with brief reason
+
+Output JSON only. 
+```
+
+**저장:**
+- `tools.ai_metadata` JSONB 컬럼 (전체 enrichment 덤프)
+- `tool_io_contracts.semantic_tag` (produces_semantics / consumes_semantics 의 semantic 을 해당 row 에 매핑)
+
+**재실행 조건:** swagger 변경, LLM 모델 업그레이드, 관리자 강제 재생성. 일상 요청 처리와 **분리**.
+
+### 3.4 Tool Graph (재정의)
+
+엣지 타입:
+
+| 엣지 | 근거 | 신뢰도 | 용도 |
+|---|---|---|---|
+| `produces_for` (exact) | Pass 1 — field name + type 일치 | high | Stage 2 주 신호 |
+| `produces_for` (semantic) | Pass 2 — `semantic_tag` 일치 | medium | Pass 1 이 못 잡는 교차 명명 (cross-collection 등) |
+| `pairs_with` | Pass 2 — `pairs_well_with` 에서 | medium | Stage 1 catalog 힌트, Stage 2 보조 |
+| `similar_to` | 구조적 (같은 controller / tag / CRUD 역할) | low | Disambiguation 후보 확장 |
+| `precedes` | 구조적 (POST → GET single 등) | low | 레거시 엣지, 보조 힌트 |
+
+**기존 하드코딩 반응성 패치 (selt, synonym clusters, *No/*Seq heuristic, search-bridge exception) 는 Pass 2 완성 시 모두 제거.** Pass 1 field exact match + Pass 2 LLM enrichment 가 그 역할을 대체.
+
+### 3.5 Ingest 파이프라인
+
+```python
+# xgen-workflow 측
+def ingest_collection(collection_id, spec_source, llm_config):
+    from graph_tool_call.ontology.llm_provider import wrap_llm
+    from graph_tool_call.ingest.openapi import parse_operations
+    
+    # Pass 1: 결정론
+    schemas = parse_operations(spec_source)
+    io_contracts = extract_io_contracts(schemas)          # 3.2
+    graph = build_structural_edges(schemas, io_contracts) # 3.4
+    
+    enrichments = {}  # Pass 2: LLM (옵션) — 생략 시 빈 dict 로 store_all 에 전달
+    if llm_config.enabled:
+        llm = wrap_llm(build_llm_spec(llm_config))
+        enrichments = llm.enrich_tool_semantics(schemas)
+        apply_semantic_tags(io_contracts, enrichments)    # semantic_tag 채움
+        graph = augment_with_semantic_edges(graph, enrichments)
+    
+    store_all(schemas, io_contracts, graph, enrichments)
+```
+
+**옵션:** Pass 2 는 `llm_config.enabled=False` 로 **생략 가능**. Pass 1 만으로도 기본 동작은 가능 (품질은 낮음).
+
+### 3.6 xgen-workflow 통합
+
+xgen 은 이미 agent 노드에서 provider/model/api_key 선택 지원. Ingest 시에도 동일 config 재사용:
+
+```python
+# xgen-workflow: api_tool_collection/service.py
+def refresh_with_enrichment(collection_id, llm_settings):
+    llm_spec = f"{llm_settings.provider}/{llm_settings.model}"  
+    # "openai/gpt-4.1-mini"
+    
+    # api_key 는 env 또는 xgen secret store 에서
+    os.environ["OPENAI_API_KEY"] = xgen_secret.get(user_id, "openai")
+    
+    ingest_collection(collection_id, spec_source, LLMConfig(
+        enabled=True,
+        spec=llm_spec,
+    ))
+```
+
+graph-tool-call 은 xgen 에 의존하지 않음. xgen 이 config 주는 쪽, graph-tool-call 이 받는 쪽.
+
+---
+
+## 4. Stage 1 — Retrieval + Target Selection
+
+**입력:** `requirement: str`
+
+**출력:**
+```json
+{
+  "target": "seltProductDetailInfo",
+  "confidence": 0.92,
+  "entities": {
+    "keyword": "quarzen 티셔츠",
+    "locale": "ko"
+  },
+  "output_shape": "single",
+  "reasoning": "..."
+}
+```
+
+### 4.1 알고리즘
+
+**(a) Retrieval prefilter (결정론):** graph-tool-call 의 기존 `retrieve_with_scores()` 그대로 사용.
+```python
+candidates = tg.retrieve_with_scores(requirement, top_k=20)
+# BM25 + graph + (optional) annotation 채널
+```
+embedding prefilter 는 생략. 기존 BM25 + graph 가 top-20 recall 을 충분히 내는 것을 실측으로 확인 (x2bee `"product search"` → `seltSearchProduct` top-10 안에 들어옴).
+
+향후 recall 부족 증거가 나오면 embedding 채널을 **그때** 연결. 지금은 YAGNI.
+
+**(b) LLM structured pick:**
+- 20개의 catalog 에 **ai_metadata 포함**:
+  ```
+  {
+    function_name,
+    description[:80],
+    one_line_summary,       // Pass 2 에서 생성
+    when_to_use,            // Pass 2
+    pairs_well_with         // Pass 2 (이름만)
+  }
+  ```
+- system prompt: "고른 target 1개와 추출한 entities 를 반환"
+- OpenAI structured output (JSON schema 강제)
+
+**context 크기:** 20 × 200자 ≈ 4KB (ai_metadata 포함 확장). ai_metadata 없을 땐 20 × 100자 ≈ 2KB.
+
+### 4.2 오류 처리
+
+- Retrieval 이 top-20 모두 low score 면 → "적합한 도구 없음" 에러. 사용자 재질의 유도.
+- LLM 이 JSON schema 위반 시 → 1회 retry. 실패하면 fallback: retrieval (BM25 + graph) top-1 결과로 진행 (entities 는 빈 dict).
+
+### 4.3 Stage 1 의 성능 지표
+- Target 정확도 (샘플 요구사항 N개에 대해 "맞는 target 선정" 비율)
+- Entity 추출 재현율
+- LLM 응답 latency p50/p95
+
+---
+
+## 5. Stage 2 — Path Synthesizer
+
+**입력:** Stage 1 output (`target`, `entities`)
+**출력:** Plan (Plan 스키마는 §9 참조) OR "ambiguous" 플래그 (Disambiguation 발동)
+
+### 5.1 DAG 구성 알고리즘 (Bottom-up)
+
+```python
+def synthesize(target, entities, collection_defaults):
+    plan = {"steps": [], "output_binding": None}
+    context = entities | collection_defaults   # 이미 아는 값들
+    
+    needed = target.consumes.required_only()   # 필수 입력만 먼저
+    resolved = {}                              # {field: source_step_id}
+    pending = list(needed)
+    visited = set()
+    
+    while pending:
+        field = pending.pop(0)
+        if field.semantic_tag in available_tags(context, resolved):
+            resolved[field.name] = bind_from_available(field, context, resolved)
+            continue
+        
+        # graph 에서 이 semantic 을 produces 하는 tool 찾기
+        producers = graph.producers_of(field.semantic_tag)
+        if not producers:
+            raise UnsatisfiableFieldError(field)
+        
+        # 후보 여러 개면 "ambiguous" 로 분기 (§6 Disambiguation LLM)
+        if len(producers) > 1 and not strictly_better(producers):
+            return AmbiguousPlan(target, candidates=producers)
+        
+        # prerequisite 추가 (재귀)
+        producer = producers[0]
+        if producer.name in visited:
+            raise CyclicDependencyError
+        visited.add(producer.name)
+        
+        step = build_step(producer)
+        plan.steps.insert(0, step)  # 앞쪽에 삽입 (위상 순서)
+        
+        # producer 의 consumes 를 다시 확인
+        pending.extend(producer.consumes.required_only())
+    
+    # target 을 마지막 step 으로 추가
+    plan.steps.append(build_step(target, bindings=resolved))
+    plan.output_binding = f"$.{target.step_id}.body"
+    
+    return plan
+```
+
+### 5.2 "strictly_better" 판단
+
+여러 producer 후보 중:
+- IO Contract confidence 높은 순
+- 경로 짧은 순 (재귀 depth)
+- similar_to weight 높은 순 (requirement 와 가까운)
+- 모두 비슷하면 → Ambiguous 플래그
+
+### 5.3 초기 버전 범위
+
+- **선형 chain** (각 step 1회 호출): 지원
+- **다중 참조** (한 step 이 이전 N개 step 의 출력 조합): 지원
+- **Fan-out** (배열 전체 loop): **초기 범위 밖** — §11.1 확장 포인트
+- **조건 분기** (if/else): **초기 범위 밖**
+
+### 5.4 실패 경로
+
+| 케이스 | 반환 |
+|---|---|
+| 필수 field 해소 불가 | `UnsatisfiableFieldError` — Stage 4 에 그대로 reveal |
+| 순환 의존 | `CyclicDependencyError` — 보고 |
+| 복수 경로 | `AmbiguousPlan` — Disambiguation 발동 |
+
+---
+
+## 6. Disambiguation (조건부)
+
+**발동 조건:** Stage 2 가 `AmbiguousPlan` 반환.
+
+**입력:** 후보 경로 2~N개 각각의 요약
+```
+후보 A: seltSearchProduct → seltProductDetailInfo
+후보 B: getCategoryList → seltSearchProduct → seltProductDetailInfo
+```
+
+**LLM 호출:**
+- system: "요구사항에 가장 맞는 경로 1개를 고르고 이유를 설명"
+- user: requirement + 후보 경로 설명
+- structured output: `{"chosen": "A", "reason": "..."}`
+
+**context:** ~2KB
+
+---
+
+## 7. Stage 3 — Runner
+
+**입력:** 확정 Plan
+
+**동작:**
+```python
+async def run(plan: Plan):
+    context = {}                              # step_id → result
+    trace = ExecutionTrace(plan=plan)
+    
+    for step in topological_order(plan.steps):
+        resolved_args = resolve_bindings(step.args, context)
+        
+        trace.emit("step.start", step_id=step.id, args=resolved_args)
+        
+        try:
+            result = await tool_executor.execute(
+                function_id=step.tool_function_id,
+                args=resolved_args,
+                timeout=step.timeout or 30,
+            )
+        except ToolExecutionError as e:
+            trace.emit("step.error", step_id=step.id, error=str(e))
+            return trace.fail(step.id, e)
+        
+        context[step.id] = result
+        trace.emit("step.done", step_id=step.id, output_preview=preview(result))
+    
+    final = jsonpath_extract(context, plan.output_binding)
+    trace.emit("plan.done", output=final)
+    return trace.success(final)
+```
+
+### 7.1 Argument 바인딩 치환
+
+바인딩 syntax: `${step_id.json_path}` — JsonPath 표준 사용 (jsonpath-ng 라이브러리).
+
+```
+args = {"goodsNo": "${s1.body.goods[0].goodsNo}",
+        "langCd": "ko"}
+context = {"s1": {"body": {"goods": [{"goodsNo": 12345, ...}]}}}
+→ resolved = {"goodsNo": 12345, "langCd": "ko"}
+```
+
+### 7.2 에러 / 재시도 정책 (초기 버전)
+
+| 에러 유형 | 동작 |
+|---|---|
+| HTTP 4xx | fail fast, trace 에 응답 body 포함 |
+| HTTP 5xx | 최대 2회 재시도 (exponential backoff) |
+| 타임아웃 | fail fast |
+| JsonPath 미스 | fail fast — "step sX 의 bindings 가 실제 응답 구조와 불일치: [list of missing paths]" |
+| Schema 검증 실패 | fail fast |
+
+**재계획 (re-plan) 은 v1 범위 밖.** 실패 시 Stage 4 가 사용자에게 설명.
+
+### 7.3 스트리밍
+
+각 step 단위로 이벤트 emit. UI 는 step 단위 진행 상황 표시.
+
+---
+
+## 8. Stage 4 — Response Synthesis
+
+**입력:** requirement + ExecutionTrace
+
+**동작:**
+```python
+def synthesize_response(requirement, trace):
+    if trace.success:
+        # 최종 output 의 관련 필드만 추림 (schema-aware projection)
+        relevant = project_relevant_fields(trace.output, requirement)
+        prompt = f"""
+        요구사항: {requirement}
+        실행 결과 요약: {relevant}
+        사용자에게 자연스럽게 답변.
+        """
+    else:
+        prompt = f"""
+        요구사항: {requirement}
+        실행 중 실패: step={trace.failed_step}, 이유={trace.error}
+        부분 결과: {trace.partial_results}
+        사용자에게 무엇이 됐고 무엇이 안 됐는지 설명.
+        """
+    return llm.complete(prompt)
+```
+
+**context:** 요약된 결과 기준 ~1KB. 전체 response 를 그대로 넘기지 않음 — `project_relevant_fields` 가 requirement 에 관련된 필드만 추림.
+
+---
+
+## 9. 핵심 데이터 계약
+
+### 9.1 Intent Schema (Stage 1 출력)
+
+```typescript
+type Intent = {
+  target: string;                    // function_name
+  confidence: number;                // 0.0 ~ 1.0
+  entities: Record<string, any>;     // {keyword: "...", locale: "ko", ...}
+  output_shape: "single" | "list" | "count";
+  reasoning?: string;                // 디버그용
+}
+```
+
+### 9.2 Plan Schema (Stage 2 출력)
+
+```typescript
+type Plan = {
+  id: string;                         // uuid (캐시 키 포함)
+  goal: string;                       // Intent 의 요약
+  steps: PlanStep[];
+  output_binding: string;             // JsonPath "$.s2.body" 등
+  metadata: {
+    created_at: string;
+    target: string;
+    disambiguation_used: boolean;
+  };
+}
+
+type PlanStep = {
+  id: string;                         // "s1", "s2", ...
+  tool: string;                       // function_name
+  tool_function_id: string;           // DB 룩업용 slug
+  args: Record<string, string>;       // {"goodsNo": "${s1.body.goods[0].goodsNo}", ...}
+  timeout_ms?: number;
+  retryable?: boolean;
+  rationale?: string;                 // "검색 결과로 goodsNo 획득"
+}
+```
+
+### 9.3 ExecutionTrace Schema (Stage 3 출력)
+
+```typescript
+type ExecutionTrace = {
+  plan_id: string;
+  success: boolean;
+  steps: StepTrace[];
+  output?: any;                       // 성공 시
+  failed_step?: string;               // 실패 시
+  error?: ErrorDetail;                // 실패 시
+  duration_ms: number;
+  started_at: string;
+  ended_at: string;
+}
+
+type StepTrace = {
+  id: string;
+  tool: string;
+  args: Record<string, any>;          // resolved (바인딩 치환 후)
+  output?: any;
+  error?: ErrorDetail;
+  duration_ms: number;
+  retries: number;
+}
+```
+
+---
+
+## 10. 하드코딩 제거 매핑표
+
+| 현 하드코딩 | 제거 방법 | 대체 메커니즘 |
+|---|---|---|
+| `_SYNONYM_CLUSTERS` (goods↔product) | 제거 | Pass 2 `primary_resource` + `semantic_tag` (LLM per-tool enrichment) |
+| `selt`, `sel` verb 특수 케이스 | 제거 | Pass 2 `canonical_action` (LLM 이 context 읽고 분류) |
+| `*Id/*No/*Seq` 접미사 heuristic | 제거 | Pass 1 field name + type exact match (동일 swagger 안에선 충분) + 필요시 Pass 2 semantic_tag |
+| `search-bridge` 예외 | 제거 | Pass 2 `pairs_well_with` + `canonical_action = search` |
+| `_is_single_resource_path` 필터 | 제거 | IO Contract 의 produces/consumes 가 판단 |
+| `_VERB_TO_INTENT` CRUD 사전 | **유지** (Pass 1 fallback) | Pass 2 가 LLM 으로 action 태깅 담당. Pass 2 생략 시 이 사전이 fallback |
+
+---
+
+## 11. 확장 포인트
+
+### 11.1 Fan-out (foreach)
+
+**시나리오:** "카트의 모든 상품 상세 보여줘"
+
+**Plan schema 확장:**
+```typescript
+type PlanStep = {
+  // ... 기존 필드
+  foreach?: {
+    source: string;                 // "${s1.body.items[*]}"
+    item_alias: string;             // "item"
+  };
+  // args 안에서 `${item.goodsNo}` 참조 가능
+}
+```
+
+**Runner 확장:** foreach step 은 N회 호출 후 결과를 배열로 묶어 context 에 저장.
+
+### 11.2 조건 분기 (if/else)
+
+**Plan schema 확장:** step 에 `condition` 필드 (JsonPath 기반 부울 식). Runner 가 evaluate 후 skip/execute.
+
+### 11.3 Workflow Template Library
+
+- 성공한 Plan 을 `workflow_templates` 테이블에 승격
+- 새 requirement → embedding 기반 template match → 재사용
+- Stage 1~2 skip 가능 → 더 빠름
+- Intent 유사 판정 임계값 튜닝 필요
+
+### 11.4 Interactive Refinement
+
+- Runner 가 특정 step 에서 `user_input_required` 이벤트 발행
+- UI 가 사용자에게 선택지 제시
+- 응답 받아 Runner 재개 (suspend/resume)
+- 민감 액션 (결제, 삭제) 에 필수
+
+### 11.5 Self-healing Re-plan
+
+- Runner 실패 시 ExecutionTrace + 에러를 Stage 1~2 에 다시 넘겨 1회 re-plan
+- 예: "빈 배열 반환 → 검색 키워드 재조정" 같은 케이스
+
+---
+
+## 12. 마이그레이션
+
+### 12.1 기존 자산 활용
+
+- `graph_tool_call.analyze.dependency.detect_dependencies`: **유지**. IO Contract 가 못 잡는 구조적 엣지는 여전히 여기서. 단 반응성 패치 (`selt`, `_SYNONYM_CLUSTERS`, `*No/*Seq`, `search-bridge`) 는 Pass 2 enrichment 정착 시 **단계적 제거**.
+- `graph_tool_call.retrieval`: **유지**. Stage 1 의 prefilter 로 그대로 활용 (BM25 + graph).
+- `graph_tool_call.ontology.llm_provider`: **유지**. Pass 2 enrichment 의 `enrich_tool_semantics` 메서드 추가.
+- `tool_executor.execute_collection_tool`: **유지**. Stage 3 Runner 가 호출.
+- `APICollectionLoader` Canvas 노드: **유지** (그래프 + ai_metadata 로드 역할).
+- `Agent Xgen` 노드: **유지** (범용 ReAct / 일반 채팅 용도). API collection 시나리오에 쓰일 땐 `Agent Planflow` 로 대체 권장.
+
+### 12.2 Canvas 노드 구성 변경
+
+```
+기존:  Input → APICollectionLoader → Agent Xgen → Output
+신규:  Input → APICollectionLoader → Agent Planflow → Output
+              (graph/ai_metadata/io_contracts 로드)  (Stage 1~4 통합)
+```
+
+`Agent Planflow` 내부 구조:
+```
+┌── Stage 1: retrieval + target pick  (LLM 1회)
+├── Stage 2: path synthesizer           (결정론, DAG)
+├── (conditional) disambiguation        (LLM 조건부)
+├── Stage 3: runner (streaming)          (결정론, HTTP)
+└── Stage 4: response synthesis          (LLM 1회, streaming)
+```
+
+설정 UI 는 `Agent Xgen` 과 공용 컴포넌트 재사용 (provider/model/api_key/temperature/max_tokens). 전용 파라미터 (`enable_disambiguation`, `max_plan_steps`) 만 추가.
+
+### 12.3 점진 마이그레이션 전략
+
+1. **Phase A:** L0 Knowledge Base 구축 — IO Contract 추출 (결정론) + `OntologyLLM.enrich_tool_semantics` 메서드 추가. 기존 graph 와 공존.
+2. **Phase B:** Stage 3 Runner 독립 구현 (plan fixture 로 단위 테스트).
+3. **Phase C:** Stage 2 Path Synthesizer — DAG + exact field match + semantic_tag 보강.
+4. **Phase D:** Stage 1 + 4 LLM 호출 구현 (structured output). 기존 `retrieve_with_scores` 를 Stage 1 prefilter 로 연결.
+5. **Phase E:** Canvas 노드 `Agent Planflow` 개발. 설정 UI 는 `Agent Xgen` 컴포넌트 재사용.
+6. **Phase F:** 평가 세트로 A/B 측정. 안정화 후 기존 반응성 패치 (`selt`, synonym 등) 제거.
+
+---
+
+## 13. 운영 리스크 및 완화
+
+| 리스크 | 영향 | 완화 |
+|---|---|---|
+| IO Contract semantic_tag 오태깅 | Stage 2 가 틀린 path 생성 | ingest 시 LLM 태깅 → 관리자 UI 검수/오버라이드 |
+| Stage 1 target 오선정 | 전혀 다른 도구 실행 | confidence threshold → 낮으면 disambiguation 강제 |
+| Stage 2 Ambiguous 빈발 | 매 요청 LLM 추가 호출 | IO Contract 개선으로 장기적으로 완화. 초기엔 허용 |
+| Runner JsonPath miss | 실행 실패 | plan validate 단계에서 response schema 와 bindings 교차 검증 (Stage 2 출력 직후) |
+| HTTP 외부 장애 | 사용자 체감 실패 | retry + 명확한 trace + Stage 4 에서 "일부 성공/실패" 구분 |
+| Embedding API 비용 (v1 미사용 — embedding 채널 연결 시) | ingest 비용↑ | ingest 시 1회만. 요청당 embed 는 requirement 1회만 |
+| LLM structured output 깨짐 | Stage 1 파싱 실패 | 1회 retry → 실패 시 retrieval (BM25 + graph) top-1 결과 fallback |
+
+---
+
+## 14. 측정 지표 (성공 기준)
+
+### 14.1 성능
+
+- Latency p50 / p95 (목표: p50 ≤ 3s, p95 ≤ 6s)
+- LLM 호출 수 / 요청 (목표: ≤ 2.5 평균)
+- Context 총량 / 요청 (목표: ≤ 3KB 평균)
+
+### 14.2 품질
+
+평가 세트: 요구사항 20~50개 (각 collection 당).
+
+- **Stage 1 target 정확도:** 고른 target 이 사람 판단과 일치하는 비율
+- **Stage 2 path 정확도:** 생성된 plan 이 유효한 실행 시퀀스인 비율
+- **End-to-end 성공률:** 사용자 요구사항 → 의미 있는 답변까지 성공한 비율
+- **Ambiguity rate:** Disambiguation 발동 빈도 (낮을수록 graph 품질 좋음)
+
+### 14.3 비용
+
+- OpenAI 토큰 소비 / 요청 (입력/출력 분리)
+- Embedding 호출 수 (ingest + 요청별 1회) — v1 은 embedding 미사용 (§16 #3), 채널 연결 후 측정
+
+### 14.4 감사성
+
+- 모든 Plan artifact 조회 가능
+- 실패 시 failed_step + error + partial_results 복원 가능
+
+---
+
+## 15. 비전과의 정합성
+
+사용자가 그린 그림:
+
+> Swagger → tool list 정의 → 사전 graph 관계 구축 →
+> 워크플로우에서 컬렉션 노드 연결 + 요구사항 입력 →
+> 필요한 API 들 찾아 req/res 세팅 후 순서대로 호출 → 결과 반환
+
+이 아키텍처의 대응:
+
+| 사용자 의도 | 이 설계에서 |
+|---|---|
+| "사전 graph 관계 구축" | L0 Knowledge Base (Pass 1 구조적 + Pass 2 LLM 의미 주석) |
+| "요구사항 입력" | Stage 1 입력 |
+| "필요한 API 찾기" | Stage 1 (retrieval + target pick) + Stage 2 (DAG 구성) |
+| "req/res 세팅" | Stage 2 의 argument bindings (exact field match + semantic_tag) |
+| "순서대로 호출" | Stage 3 Runner (DAG topological) |
+| "결과 반환" | Stage 4 Response Synthesis |
+
+**정합성 완전.** LLM 은 의미 해석이 필요한 지점에만 최소한으로 사용 (아래 3곳 + §6 의 조건부 Disambiguation):
+- **Ingest 시 Pass 2** — description 을 읽고 의미 주석 (1회, 영속 저장)
+- **Runtime Stage 1** — 사용자 자연어 → target tool + entities
+- **Runtime Stage 4** — 실행 결과 → 자연어 응답
+
+Request/response schema 는 LLM 이 일절 건드리지 않음 (swagger 가 source of truth).
+
+---
+
+## 16. 결정 사항
+
+### 해결된 항목 (2026-04-23)
+
+| # | 주제 | 결정 | 근거 |
+|---|---|---|---|
+| 1 | Field semantic 매칭 방식 | **Pass 1 exact match (기본) + Pass 2 LLM semantic_tag (보강)**. embedding clustering 불필요 | 동일 swagger 안에선 field 이름 일관. cross-convention 은 LLM 이 해결 |
+| 2 | LLM 모델 선택 | **xgen agent 노드 config 재사용**. Stage 1/4 는 사용자 노드 설정 상속. Pass 2 는 컬렉션별 별도 설정 (기본 gpt-4.1-mini) | UX 일관성, 기존 provider/key 관리 재사용 |
+| 3 | Ingest embedding 모델 | **사용 안 함 (v1)**. 필요시 `text-embedding-3-small` 추후 연결 | BM25 + graph 가 Stage 1 top-20 recall 확보 (실측) |
+| 4 | Plan / ExecutionTrace 영속성 | **로그 기반 (DB 테이블 없음)**. 구조화 JSON 이벤트로 plan 생명주기 기록 | YAGNI. 필요 기능 (history UI, template auto-promotion) 생길 때 해당 테이블 추가 |
+| 5 | Canvas 노드 구성 | **신규 노드 `Agent Planflow`**. `Agent Xgen` 은 유지 (범용 ReAct), `Agent Planflow` 는 API collection 전용 Plan-and-Execute. 설정 UI 공용화 (provider/model/key) | 기존 자산 유지 + 특화 경로 분리. 코드 간결성 |
+| 6 | Plan 실행 범위 (v1) | **선형 chain 만**. Fan-out / 조건 분기 / parallel / re-plan 은 v2+. Plan schema 는 optional 필드로 **확장 가능하게 설계** | v1 목표 (30s→5s + 정확도) 는 선형으로 달성. 복잡 케이스는 사용자에게 명시적 에러 |
+
+### 미결 항목
+
+모두 해결됨 (2026-04-23).
+
+---
+
+## 17. 참고 문서
+
+- [pathfinder-plan.md](./pathfinder-plan.md) — 기존 로드맵 (이 문서 확정 후 섹션 3.7 업데이트 필요)
+- [pathfinder-bug-analysis.md](./pathfinder-bug-analysis.md) — ingest 파이프라인 과거 이슈
+- [xgen-ai-chat-architecture.md](./xgen-ai-chat-architecture.md) — AI chat / 사이드패널 / canvas 통합
+
+---
diff --git a/graph_tool_call/analyze/dependency.py b/graph_tool_call/analyze/dependency.py
index 709c9de..28864fa 100644
--- a/graph_tool_call/analyze/dependency.py
+++ b/graph_tool_call/analyze/dependency.py
@@ -79,6 +79,7 @@ def detect_dependencies(
     relations.extend(_detect_structural(tools, spec))
     relations.extend(_detect_name_based(tools))
     relations.extend(_detect_cross_resource(tools))
+    relations.extend(_detect_rpc_patterns(tools))
     relations = _deduplicate(relations)
     relations = [r for r in relations if r.confidence >= min_confidence]
     relations.sort(key=lambda r: r.confidence, reverse=True)
@@ -131,17 +132,56 @@ def _is_single_resource_path(path: str) -> bool:
 def _group_by_resource(tools: list[ToolSchema]) -> dict[str, list[ToolSchema]]:
     """Group tools that have ``method`` and ``path`` metadata by their base resource.
 
-    The base resource is the first non-param path segment (e.g. ``/pets``).
+    The base resource is the first *meaningful* non-param path segment.
+    Leading segments are skipped as prefixes (scanning at most 4 levels) while
+    the most common segment at that depth covers more than ``prefix_threshold``
+    (a 0-1 fraction, not a percent) of all tools — e.g. ``/v1``, ``/api`` — so
+    no hardcoded list is needed. Shorter paths fall back to their last segment.
     """
+    prefix_threshold = 0.4  # if a segment covers >40% of tools, it's a prefix
+
+    api_tools = [t for t in tools if t.metadata.get("path") and t.metadata.get("method")]
+    if not api_tools:
+        return {}
+
+    total = len(api_tools)
+
+    # Collect static segments per tool
+    tool_segments: list[tuple[ToolSchema, list[str]]] = []
+    for tool in api_tools:
+        segs = [s for s in tool.metadata["path"].split("/") if s and not s.startswith("{")]
+        tool_segments.append((tool, segs))
+
+    # Determine max depth to scan for prefixes (usually 1-2 levels)
+    max_depth = max((len(segs) for _, segs in tool_segments), default=1)
+
+    # Find how many prefix levels to skip:
+    # walk from depth 0 and keep skipping while the segment at that depth
+    # covers >threshold of all tools
+    skip_depth = 0
+    for depth in range(min(max_depth, 4)):  # cap at 4 to avoid pathological cases
+        counter: dict[str, int] = {}
+        for _, segs in tool_segments:
+            if depth < len(segs):
+                counter.setdefault(segs[depth], 0)
+                counter[segs[depth]] += 1
+        if not counter:
+            break
+        most_common_count = max(counter.values())
+        if most_common_count / total > prefix_threshold:
+            skip_depth = depth + 1
+        else:
+            break
+
+    # Group by the segment at skip_depth
     groups: dict[str, list[ToolSchema]] = {}
-    for tool in tools:
-        path = tool.metadata.get("path")
-        method = tool.metadata.get("method")
-        if not path or not method:
-            continue
-        # base resource = first static segment of the path
-        segments = [s for s in path.split("/") if s and not s.startswith("{")]
-        base = "/" + segments[0] if segments else "/"
+    for tool, segs in tool_segments:
+        if skip_depth < len(segs):
+            base = "/" + segs[skip_depth]
+        elif segs:
+            base = "/" + segs[-1]
+        else:
+            base = "/"
         groups.setdefault(base, []).append(tool)
     return groups
 
@@ -607,6 +647,257 @@ def _detect_cross_resource(tools: list[ToolSchema]) -> list[DetectedRelation]:
     return relations
 
 
+# ---------------------------------------------------------------------------
+# Layer 4: RPC-style method name & DTO pattern detection
+# ---------------------------------------------------------------------------
+
+# Maps leading verb in an RPC method name to a CRUD intent category.
+_VERB_TO_INTENT: dict[str, str] = {
+    # read
+    "get": "read",
+    "find": "read",
+    "fetch": "read",
+    "list": "read",
+    "search": "read",
+    "select": "read",
+    "load": "read",
+    "read": "read",
+    "download": "read",
+    # write (create)
+    "save": "write",
+    "create": "write",
+    "add": "write",
+    "insert": "write",
+    "register": "write",
+    "regist": "write",
+    "reg": "write",  # camelCase abbreviation (e.g. regGoodsApprove)
+    # update
+    "modify": "update",
+    "update": "update",
+    "edit": "update",
+    "change": "update",
+    "patch": "update",
+    # delete
+    "delete": "delete",
+    "remove": "delete",
+    "cancel": "delete",
+    "withdraw": "delete",
+    # action (side-effect operations)
+    "process": "action",
+    "execute": "action",
+    "apply": "action",
+    "approve": "action",
+    "reject": "action",
+    "confirm": "action",
+    "accept": "action",
+    "send": "action",
+    "upload": "action",
+    "export": "action",
+}
+
+# View-describing tokens (not part of the resource); dropped anywhere after the verb.
+_NAME_SUFFIXES: frozenset[str] = frozenset(
+    {
+        "list",
+        "detail",
+        "details",
+        "info",
+        "count",
+        "excel",
+        "popup",
+        "summary",
+        "check",
+        "data",
+        "total",
+        "all",
+        "page",
+        "download",
+    }
+)
+
+# Common DTO class-name suffixes that are not part of the resource identity.
+_DTO_SUFFIXES: frozenset[str] = frozenset(
+    {
+        "request",
+        "response",
+        "dto",
+        "entity",
+        "info",
+        "base",
+        "api",
+        "vo",
+        "model",
+        "form",
+        "param",
+        "result",
+        "ml",
+    }
+)
+
+# CRUD workflow rules: (source_intent, target_intent, relation, same_ctrl_conf, cross_ctrl_conf)
+# ``None`` for cross_ctrl_conf means the rule is skipped across controllers.
+_WORKFLOW_RULES: list[tuple[str, str, RelationType, float, float | None]] = [
+    ("read", "write", RelationType.REQUIRES, 0.9, 0.8),
+    ("update", "read", RelationType.REQUIRES, 0.85, 0.75),
+    ("delete", "read", RelationType.REQUIRES, 0.85, 0.75),
+    ("action", "read", RelationType.REQUIRES, 0.75, None),
+]
+
+
+def _same_controller(a: ToolSchema, b: ToolSchema) -> bool:
+    """Return True if both tools belong to the same (non-empty) controller."""
+    ctrl_a = a.metadata.get("controller") or ""
+    ctrl_b = b.metadata.get("controller") or ""
+    return ctrl_a == ctrl_b != ""
+
+
+def _extract_verb_and_resource(name: str) -> tuple[str, str]:
+    """Extract (verb, resource) from an RPC-style method name.
+
+    ``getGoodsList`` → ``("get", "goods")``
+    ``saveOptionCategoryList`` → ``("save", "optioncategory")``
+    """
+    tokens = _normalize_name(name)
+    if not tokens:
+        return "", ""
+
+    verb = ""
+    resource_start = 0
+    for i, tok in enumerate(tokens):
+        if tok in _VERB_TO_INTENT:
+            verb = tok
+            resource_start = i + 1
+            break
+
+    resource = "".join(t for t in tokens[resource_start:] if t not in _NAME_SUFFIXES)
+    return verb, resource
+
+
+def _extract_dto_resource(type_name: str | None) -> str:
+    """Extract the resource root from a DTO class name.
+
+    ``GoodsMgmtApiResponse`` → ``goodsmgmt``
+    ``ClaimTargetRequest``   → ``claimtarget``
+    """
+    if not type_name:
+        return ""
+    tokens = _normalize_name(type_name)
+    return "".join(t for t in tokens if t not in _DTO_SUFFIXES)
+
+
+def _detect_rpc_patterns(tools: list[ToolSchema]) -> list[DetectedRelation]:
+    """Detect relations for RPC-style APIs (Layer 4).
+
+    Handles non-RESTful endpoints (e.g. ``/v1/goods/goodsMgmtApi/getGoodsList``)
+    where structural path analysis is ineffective.
+
+    Two strategies:
+      1. **Verb-resource grouping** — methods sharing the same resource token
+         form CRUD workflows with controller-scoped confidence.
+      2. **DTO type matching** — methods sharing a request/response type across
+         controllers are marked COMPLEMENTARY.
+    """
+    relations: list[DetectedRelation] = []
+    relations.extend(_detect_rpc_crud_workflows(tools))
+    relations.extend(_detect_rpc_dto_links(tools))
+    return relations
+
+
+def _detect_rpc_crud_workflows(tools: list[ToolSchema]) -> list[DetectedRelation]:
+    """Build CRUD workflow relations from verb-resource analysis."""
+    relations: list[DetectedRelation] = []
+
+    # Group tools by extracted resource token.
+    resource_groups: dict[str, list[tuple[str, ToolSchema]]] = {}
+    for tool in tools:
+        verb, resource = _extract_verb_and_resource(tool.name)
+        if verb and resource:
+            resource_groups.setdefault(resource, []).append((verb, tool))
+
+    for resource, members in resource_groups.items():
+        if len(members) < 2:
+            continue
+
+        # Classify members by CRUD intent.
+        by_intent: dict[str, list[ToolSchema]] = {}
+        for verb, tool in members:
+            intent = _VERB_TO_INTENT.get(verb, "other")
+            by_intent.setdefault(intent, []).append(tool)
+
+        # Apply workflow rules.
+        for src_intent, tgt_intent, rel_type, same_conf, cross_conf in _WORKFLOW_RULES:
+            for src in by_intent.get(src_intent, []):
+                for tgt in by_intent.get(tgt_intent, []):
+                    if src.name == tgt.name:
+                        continue
+                    same = _same_controller(src, tgt)
+                    if not same and cross_conf is None:
+                        continue
+                    relations.append(
+                        DetectedRelation(
+                            source=src.name,
+                            target=tgt.name,
+                            relation_type=rel_type,
+                            confidence=same_conf if same else cross_conf,  # type: ignore[arg-type]
+                            evidence=(
+                                f"{src.name} ({src_intent}) → {tgt.name} ({tgt_intent})"
+                                f" — resource '{resource}'"
+                            ),
+                            layer=4,
+                        )
+                    )
+
+        # Readers within same controller are SIMILAR_TO.
+        readers = by_intent.get("read", [])
+        for i, r1 in enumerate(readers):
+            for r2 in readers[i + 1 :]:
+                if r1.name != r2.name and _same_controller(r1, r2):
+                    relations.append(
+                        DetectedRelation(
+                            source=r1.name,
+                            target=r2.name,
+                            relation_type=RelationType.SIMILAR_TO,
+                            confidence=0.8,
+                            evidence=f"{r1.name} ↔ {r2.name} — similar reads for '{resource}'",
+                            layer=4,
+                        )
+                    )
+
+    return relations
+
+
+def _detect_rpc_dto_links(tools: list[ToolSchema]) -> list[DetectedRelation]:
+    """Link tools that share a DTO type across controllers (COMPLEMENTARY)."""
+    relations: list[DetectedRelation] = []
+
+    # Group by normalised DTO name (a tool is added twice if both its types normalise alike).
+    dto_groups: dict[str, list[ToolSchema]] = {}
+    for tool in tools:
+        for type_name in (tool.metadata.get("request_type"), tool.metadata.get("response_type")):
+            dto_res = _extract_dto_resource(type_name)
+            if len(dto_res) >= 4:
+                dto_groups.setdefault(dto_res, []).append(tool)
+
+    for dto_res, members in dto_groups.items():
+        if not 2 <= len(members) <= 20:
+            continue
+        for i, a in enumerate(members):
+            for b in members[i + 1 :]:
+                if a.name != b.name and not _same_controller(a, b):
+                    relations.append(
+                        DetectedRelation(
+                            source=a.name,
+                            target=b.name,
+                            relation_type=RelationType.COMPLEMENTARY,
+                            confidence=0.75,
+                            evidence=f"{a.name} ↔ {b.name} — shared DTO '{dto_res}'",
+                            layer=4,
+                        )
+                    )
+
+    return relations
+
+
 # ---------------------------------------------------------------------------
 # De-duplication
 # ---------------------------------------------------------------------------
diff --git a/graph_tool_call/core/tool.py b/graph_tool_call/core/tool.py
index 25df150..b3e9d71 100644
--- a/graph_tool_call/core/tool.py
+++ b/graph_tool_call/core/tool.py
@@ -408,6 +408,26 @@ def parse_tool(tool: Any) -> ToolSchema:
         destructive_hint=False,
         idempotent_hint=False,
     ),
+    "insert": MCPAnnotations(
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
+    "register": MCPAnnotations(
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
+    "regist": MCPAnnotations(  # abbreviation used in some codebases (regUser, registOrder)
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
+    "reg": MCPAnnotations(  # short camelCase abbreviation (regGoodsApprove)
+        read_only_hint=False,
+        destructive_hint=False,
+        idempotent_hint=False,
+    ),
     # update verbs
     "update": MCPAnnotations(
         read_only_hint=False,
diff --git a/graph_tool_call/execute/http_executor.py b/graph_tool_call/execute/http_executor.py
index 32859fa..55e5126 100644
--- a/graph_tool_call/execute/http_executor.py
+++ b/graph_tool_call/execute/http_executor.py
@@ -77,7 +77,12 @@ def build_request(
         for k, v in path_params.items():
             path = path.replace(f"{{{k}}}", urllib.parse.quote(str(v), safe=""))
 
-        url = f"{self._base_url}{path}"
+        # Prefer the tool's own base_url (derived from spec.servers) when it has one: if a
+        # collection mixes sources served from different hosts (common/product/member, ...),
+        # each request is routed to its source's host. Otherwise use the executor's base_url.
+        tool_base = (metadata.get("base_url") or "").rstrip("/")
+        base = tool_base or self._base_url
+        url = f"{base}{path}"
         if query_params:
             url += "?" + urllib.parse.urlencode(query_params, doseq=True)
 
diff --git a/graph_tool_call/graphify/__init__.py b/graph_tool_call/graphify/__init__.py
new file mode 100644
index 0000000..98bbbce
--- /dev/null
+++ b/graph_tool_call/graphify/__init__.py
@@ -0,0 +1,39 @@
+"""graphify-mode: deterministic edge extraction + zero-vector retrieval.
+
+Inspired by the graphify project (https://github.com/safishamsi/graphify).
+The core idea: every edge carries a Confidence label, retrieval is a
+keyword-seeded BFS over confidence-weighted edges, and the result is a
+token-budgeted text rendering of the matched subgraph — no embeddings,
+no wRRF fusion, no MMR reranking.
+
+Public API:
+  - ingest_openapi_graphify(schemas) -> (ToolGraph, edge_stats)
+  - retrieve_graphify(tg, query, ...) -> {results, subgraph_text, intent, stats}
+  - render_subgraph_text(tg, nodes, edges, budget) -> str
+"""
+
+from graph_tool_call.graphify.ingest import (
+    DEFAULT_CONF_AMBIGUOUS,
+    DEFAULT_CONF_EXTRACTED,
+    DEFAULT_CONF_INFERRED,
+    _apply_pair_hints,
+    bucket_confidence,
+    ingest_openapi_graphify,
+    preserve_refs_for_detection,
+)
+from graph_tool_call.graphify.retrieval import (
+    render_subgraph_text,
+    retrieve_graphify,
+)
+
+__all__ = [
+    "DEFAULT_CONF_AMBIGUOUS",
+    "DEFAULT_CONF_EXTRACTED",
+    "DEFAULT_CONF_INFERRED",
+    "_apply_pair_hints",
+    "bucket_confidence",
+    "ingest_openapi_graphify",
+    "preserve_refs_for_detection",
+    "render_subgraph_text",
+    "retrieve_graphify",
+]
diff --git a/graph_tool_call/graphify/ingest.py b/graph_tool_call/graphify/ingest.py
new file mode 100644
index 0000000..afa23f3
--- /dev/null
+++ b/graph_tool_call/graphify/ingest.py
@@ -0,0 +1,437 @@
+"""Deterministic ingest: ToolSchema list -> ToolGraph with confidence labels.
+
+Pipeline (no LLM, no embeddings):
+  1. ``detect_dependencies`` runs all four layers (path-hierarchy, CRUD,
+     shared $ref, name/RPC/cross-resource) at threshold 0.0.
+  2. Each ``DetectedRelation`` is bucketed by (layer, conf_score) into one of
+     EXTRACTED / INFERRED / AMBIGUOUS / dropped.
+  3. Edges are added to a fresh ``ToolGraph`` with the bucket as ``confidence``
+     attr, plus ``conf_score`` / ``layer`` / ``evidence`` for transparency.
+  4. ``edge_stats`` summarises bucket counts, per-relation counts, and the
+     count of cross-source edges (different ``source_label`` on each end —
+     the key signal that adding a new source linked into the existing graph).
+
+For specs that use a lot of $ref pointers (typical of Swagger/OpenAPI 3.x
+generators like SpringDoc), pass the raw spec dict to
+``preserve_refs_for_detection`` BEFORE calling ``ingest_openapi_graphify`` so
+``detect_dependencies._detect_shared_schemas`` can fire — without this step
+the library's ``ingest_openapi`` resolves refs inline and the shared-schema
+signal is lost. ``ingest_openapi_graphify`` accepts the raw spec directly via
+``raw_spec=`` and runs preservation automatically.
+
+This is the ONLY ingest path used by xgen-workflow. The legacy 14-stage
+``RetrievalEngine`` plumbing in graph_tool_call.retrieval is left intact
+for benchmark/example users but is not invoked from this module.
+"""
+
+from __future__ import annotations
+
+from collections import Counter
+from typing import Any
+
+from graph_tool_call.analyze.dependency import (
+    DetectedRelation,
+    detect_dependencies,
+)
+from graph_tool_call.core.tool import ToolSchema
+from graph_tool_call.ontology.schema import Confidence, RelationType
+from graph_tool_call.tool_graph import ToolGraph
+
+# Thresholds — same numbers graphify uses for INFERRED vs AMBIGUOUS.
+# EXTRACTED additionally requires layer == 1 (deterministic structural).
+DEFAULT_CONF_EXTRACTED = 0.85
+DEFAULT_CONF_INFERRED = 0.85
+DEFAULT_CONF_AMBIGUOUS = 0.70
+
+
+def bucket_confidence(
+    layer: int,
+    conf_score: float,
+    *,
+    extracted_min: float = DEFAULT_CONF_EXTRACTED,
+    inferred_min: float = DEFAULT_CONF_INFERRED,
+    ambiguous_min: float = DEFAULT_CONF_AMBIGUOUS,
+) -> Confidence | None:
+    """Map a relation's (layer, conf_score) onto a Confidence bucket; first match wins.
+
+    conf_score >= extracted_min AND layer == 1   -> EXTRACTED
+    conf_score >= inferred_min                   -> INFERRED
+    conf_score >= ambiguous_min                  -> AMBIGUOUS, otherwise None (dropped)
+    """
+    if conf_score >= extracted_min and layer == 1:
+        return Confidence.EXTRACTED
+    # Layer no longer matters below: the first floor the score clears decides the bucket.
+    floors = (
+        (inferred_min, Confidence.INFERRED),
+        (ambiguous_min, Confidence.AMBIGUOUS),
+    )
+    return next((label for floor, label in floors if conf_score >= floor), None)
+
+
+# ---------------------------------------------------------------------------
+# $ref preservation
+#
+# Library ``ingest_openapi`` calls ``_resolve_refs`` which inlines every
+# ``$ref`` pointer into its target schema. That makes life easier for runtime
+# users (they get full schemas, no traversal needed) but it ERASES the signal
+# ``_detect_shared_schemas`` relies on — that detector walks metadata looking
+# for literal ``$ref`` strings to spot tools sharing a DTO.
+#
+# This helper rescans the raw spec, captures refs per operation BEFORE they're
+# resolved, applies a frequency filter (drop common wrappers + singletons),
+# and re-injects them as ``__refs__`` markers into each tool's metadata so
+# ``_collect_refs`` finds them. Identical algorithm to xgen-workflow's
+# ``swagger_tool_generator._collect_operation_refs``.
+# ---------------------------------------------------------------------------
+
+_HTTP_METHODS = ("get", "post", "put", "patch", "delete", "head", "options")
+
+
+def _scan_refs(obj: Any) -> set[str]:
+    """Gather every string-valued ``$ref`` pointer nested anywhere inside *obj*."""
+    if isinstance(obj, list):
+        return set().union(*(_scan_refs(child) for child in obj))
+    if not isinstance(obj, dict):
+        return set()  # scalars / None hold no references
+    found: set[str] = set()
+    for key, value in obj.items():
+        if key == "$ref" and isinstance(value, str):
+            found.add(value)
+        else:
+            found |= _scan_refs(value)  # includes a non-string "$ref" value
+    return found
+
+
+def preserve_refs_for_detection(
+    tools: list[ToolSchema],
+    raw_spec: dict[str, Any],
+    *,
+    min_freq: int = 2,
+    max_freq_ratio: float = 0.3,
+) -> int:
+    """Inject ``__refs__`` markers into tool metadata so shared-schema detection fires.
+
+    Walk ``raw_spec`` BEFORE resolve, find $refs per operation, filter to the
+    "domain DTO" sweet spot (>=min_freq references, <=max_freq_ratio of all ops),
+    and re-inject them into each tool's ``metadata.response_schema.__refs__`` and
+    ``metadata.request_body_refs``. Tools are matched to operations by their
+    lower-cased ``metadata.method`` and exact ``metadata.path``.
+
+    Why filter:
+      - Common wrappers like ``ApiResponse`` show up in nearly every operation;
+        leaving them in produces a fully-connected COMPLEMENTARY graph (noise).
+      - Singletons show up once and can't form edges anyway.
+
+    Returns the number of tools that received at least one marker (mutated in place).
+    """
+    paths = raw_spec.get("paths") or {}
+    if not isinstance(paths, dict):
+        return 0
+
+    raw_per_op: dict[tuple[str, str], tuple[set[str], set[str]]] = {}
+    freq: Counter[str] = Counter()
+
+    for path, item in paths.items():
+        if not isinstance(item, dict):
+            continue
+        for method in _HTTP_METHODS:
+            op = item.get(method)
+            if not isinstance(op, dict):
+                continue
+            req = _scan_refs(op.get("requestBody")) | _scan_refs(op.get("parameters"))
+            resp = _scan_refs(op.get("responses"))
+            if not (req or resp):
+                continue
+            raw_per_op[(method, path)] = (req, resp)
+            for r in req | resp:  # each ref counts once per operation
+                freq[r] += 1
+
+    if not raw_per_op:
+        return 0
+
+    total_ops = len(raw_per_op)  # only operations that reference at least one $ref
+    ceiling = max(min_freq, int(total_ops * max_freq_ratio))
+
+    def _useful(r: str) -> bool:
+        return min_freq <= freq[r] <= ceiling
+
+    op_refs: dict[tuple[str, str], tuple[list[str], list[str]]] = {}
+    for k, (req, resp) in raw_per_op.items():
+        rq = sorted(r for r in req if _useful(r))
+        rp = sorted(r for r in resp if _useful(r))
+        if rq or rp:
+            op_refs[k] = (rq, rp)
+
+    updated = 0
+    for tool in tools:
+        md = tool.metadata or {}
+        method = str(md.get("method") or "").lower()
+        path = str(md.get("path") or "")
+        refs = op_refs.get((method, path))
+        if not refs:
+            continue
+        rq, rp = refs
+        rs = md.get("response_schema") or {}
+        inject_resp = bool(rp) and isinstance(rs, dict)
+        if inject_resp:
+            # Shallow-copy rather than mutate the existing schema dict in place.
+            md["response_schema"] = {**rs, "__refs__": [{"$ref": r} for r in rp]}
+        if rq:
+            md["request_body_refs"] = [{"$ref": r} for r in rq]
+        if inject_resp or rq:  # count only tools that actually received a marker
+            tool.metadata = md
+            updated += 1
+
+    return updated
+
+
+# ---------------------------------------------------------------------------
+# ai_metadata.pairs_well_with → graphify edge derivation
+#
+# ``ai_metadata`` is the source-of-truth (LLM Pass 2 fills it; the operator
+# can hand-edit it via ToolGraphView). On every rebuild we derive the
+# corresponding workflow edges into the graphify graph so ``_find_producer``
+# can score them as a first-class signal — no separate lookup, no two-system
+# sync drift. The frontend keeps reading ``ai_metadata.pairs_well_with``
+# directly (single read path, no UI churn).
+#
+# Confidence mapping reflects the trust we place in each source:
+#   PairHint.source == "manual" → EXTRACTED  (operator deliberately curated)
+#   PairHint.source == "auto"   → INFERRED   (LLM Pass 2 high-confidence)
+#   anything else / missing     → INFERRED   (legacy entries default safe)
+#
+# Layer is set to 2 because pair hints are not structural (path/$ref/CRUD)
+# even when curated — they encode workflow semantics, which sits one level
+# above structural inference in the graphify confidence model.
+# ---------------------------------------------------------------------------
+
+
+def _apply_pair_hints(
+    tg: ToolGraph,
+    schemas: list[ToolSchema],
+) -> dict[str, int]:
+    """Convert ``metadata.ai_metadata.pairs_well_with`` into COMPLEMENTARY edges on ``tg``.
+
+    Skipped pairs: empty or self targets, targets missing from ``tg.tools``, and —
+    unless the hint's ``source`` is "manual" — pairs for which ``tg.graph.has_edge``
+    is already true. Manual hints become EXTRACTED edges, all others INFERRED (layer 2).
+
+    Returns a counter dict: edges added ("manual" / "auto") and skips by reason.
+    """
+    stats = {
+        "manual": 0,
+        "auto": 0,
+        "skipped_target_missing": 0,
+        "skipped_self": 0,
+        "skipped_existing_structural": 0,
+    }
+    tool_names = set(tg.tools.keys())
+
+    for s in schemas:
+        ai = (s.metadata or {}).get("ai_metadata") or {}
+        pairs = ai.get("pairs_well_with") or []
+        if not isinstance(pairs, list):
+            continue
+        for p in pairs:
+            if not isinstance(p, dict):
+                continue
+            target = str(p.get("tool") or "").strip()
+            if not target:
+                continue
+            if target == s.name:
+                stats["skipped_self"] += 1
+                continue
+            if target not in tool_names:
+                stats["skipped_target_missing"] += 1
+                continue
+
+            source = str(p.get("source") or "auto").strip().lower()
+            is_manual = source == "manual"
+            confidence = Confidence.EXTRACTED if is_manual else Confidence.INFERRED
+            reason = str(p.get("reason") or "")[:200]  # evidence text capped at 200 chars
+
+            # Existing-edge policy: any edge already present between the pair is
+            # kept as-is unless the hint is operator-curated ("manual").
+            if tg.graph.has_edge(s.name, target):
+                if not is_manual:
+                    stats["skipped_existing_structural"] += 1
+                    continue
+
+            try:
+                tg.add_relation(
+                    s.name,
+                    target,
+                    RelationType.COMPLEMENTARY,
+                    confidence=confidence,
+                    layer=2,
+                    evidence=f"pair[{source}]: {reason}" if reason else f"pair[{source}]",
+                )
+                stats["manual" if is_manual else "auto"] += 1
+            except (KeyError, ValueError):
+                stats["skipped_target_missing"] += 1  # add_relation refused the edge
+
+    return stats
+
+
+def _source_label(schema: ToolSchema) -> str:
+    """Name the OpenAPI source a tool belongs to.
+
+    Prefers a truthy ``metadata.source_label``; otherwise uses the first non-empty
+    path segment that is not a ``{param}`` placeholder ("" when there is none).
+    """
+    meta = schema.metadata or {}
+    explicit = meta.get("source_label")
+    if explicit:
+        return str(explicit)
+    for segment in str(meta.get("path") or "").split("/"):
+        if segment and not segment.startswith("{"):
+            return segment
+    return ""
+
+
+def ingest_openapi_graphify(
+    schemas: list[ToolSchema],
+    *,
+    extracted_min: float = DEFAULT_CONF_EXTRACTED,
+    inferred_min: float = DEFAULT_CONF_INFERRED,
+    ambiguous_min: float = DEFAULT_CONF_AMBIGUOUS,
+    spec: dict[str, Any] | None = None,
+    raw_spec: dict[str, Any] | None = None,
+) -> tuple[ToolGraph, dict[str, Any]]:
+    """Build a graphify-style ToolGraph from a list of ToolSchemas.
+
+    Parameters
+    ----------
+    schemas:
+        Tools to ingest. Pre-existing ``metadata.source_label`` enables
+        cross-source edge tracking.
+    extracted_min / inferred_min / ambiguous_min:
+        Confidence bucket thresholds (see ``bucket_confidence``).
+    spec:
+        Optional normalized spec dict, forwarded to ``detect_dependencies``.
+        Currently unused by the detector but kept for forward compat.
+    raw_spec:
+        Optional ORIGINAL OpenAPI/Swagger spec dict (BEFORE $ref resolution).
+        When supplied, runs ``preserve_refs_for_detection`` so the layer-1
+        shared-schema detector can fire on heavily $ref-using specs (typical
+        of SpringDoc-generated OpenAPI). xgen-workflow callers who already
+        bake refs into tool metadata via swagger_tool_generator can leave
+        this None.
+
+    Returns
+    -------
+    (ToolGraph, edge_stats):
+        ``edge_stats`` keys:
+          EXTRACTED, INFERRED, AMBIGUOUS, dropped:  int counts
+          by_relation:                              {relation_value: int}
+          cross_source:                             int  (edges across labels)
+          tool_count, edge_count:                   int
+          refs_preserved:                           int  (preserve_refs_for_detection result)
+          pair_edges:                               dict (pair-hint counters; only if >= 2 tools)
+    """
+    tg = ToolGraph()
+    for s in schemas:
+        tg.add_tool(s)
+
+    label_by_name = {s.name: _source_label(s) for s in schemas}
+
+    stats: dict[str, Any] = {
+        "EXTRACTED": 0,
+        "INFERRED": 0,
+        "AMBIGUOUS": 0,
+        "dropped": 0,
+        "by_relation": {},
+        "cross_source": 0,
+        "tool_count": len(schemas),
+        "edge_count": 0,
+        "refs_preserved": 0,
+    }
+
+    if len(schemas) < 2:
+        return tg, stats
+
+    # Optional: rescue layer-1 shared-schema signal that ingest_openapi inlined.
+    if raw_spec is not None:
+        stats["refs_preserved"] = preserve_refs_for_detection(schemas, raw_spec)
+
+    # min_confidence=0.0 so we see every candidate; we re-bucket here.
+    relations: list[DetectedRelation] = detect_dependencies(schemas, spec, min_confidence=0.0)
+
+    seen: set[tuple[str, str, str]] = set()  # (src, tgt, relation_value)
+    for rel in relations:
+        bucket = bucket_confidence(
+            rel.layer,
+            rel.confidence,
+            extracted_min=extracted_min,
+            inferred_min=inferred_min,
+            ambiguous_min=ambiguous_min,
+        )
+        if bucket is None:
+            stats["dropped"] += 1
+            continue
+
+        rel_value = (
+            rel.relation_type.value
+            if hasattr(rel.relation_type, "value")
+            else str(rel.relation_type)
+        )
+        key = (rel.source, rel.target, rel_value)
+        if key in seen:
+            # detect_dependencies already de-duplicates, but be defensive.
+            continue
+        seen.add(key)
+
+        try:
+            tg.add_relation(
+                rel.source,
+                rel.target,
+                rel.relation_type,
+                confidence=bucket,
+                conf_score=rel.confidence,
+                layer=rel.layer,
+                evidence=rel.evidence,
+            )
+        except (KeyError, ValueError):
+            # Endpoint not in graph (shouldn't happen — tools were just added) — skip.
+            stats["dropped"] += 1
+            continue
+
+        stats[bucket.value] += 1
+        stats["by_relation"][rel_value] = stats["by_relation"].get(rel_value, 0) + 1
+
+        src_label = label_by_name.get(rel.source, "")
+        tgt_label = label_by_name.get(rel.target, "")
+        if src_label and tgt_label and src_label != tgt_label:
+            stats["cross_source"] += 1
+
+    # Derive workflow edges from ai_metadata.pairs_well_with — single
+    # source-of-truth lives on each tool's metadata, edges are regenerated
+    # on every rebuild so operator/LLM curation flows in automatically.
+    # Remember which cross-label pairs have NO edge yet: only those can add to
+    # ``cross_source`` (edges that already exist were counted in the loop above).
+    new_cross: set[tuple[str, str]] = set()
+    for s in schemas:
+        ai = (s.metadata or {}).get("ai_metadata") or {}
+        for p in ai.get("pairs_well_with") or []:
+            if not isinstance(p, dict):
+                continue
+            tgt = str(p.get("tool") or "").strip()
+            if not tgt or tgt == s.name or tgt not in tg.tools:
+                continue
+            src_lab = label_by_name.get(s.name, "")
+            tgt_lab = label_by_name.get(tgt, "")
+            if src_lab and tgt_lab and src_lab != tgt_lab and not tg.graph.has_edge(s.name, tgt):
+                new_cross.add((s.name, tgt))
+
+    pair_stats = _apply_pair_hints(tg, schemas)
+    stats["pair_edges"] = pair_stats
+    # Roll the pair edges into the global confidence/by_relation counters.
+    added = pair_stats.get("manual", 0) + pair_stats.get("auto", 0)
+    stats["EXTRACTED"] += pair_stats.get("manual", 0)
+    stats["INFERRED"] += pair_stats.get("auto", 0)
+    if added:
+        stats["by_relation"]["complementary"] = stats["by_relation"].get("complementary", 0) + added
+        stats["cross_source"] += sum(1 for u, v in new_cross if tg.graph.has_edge(u, v))
+
+    stats["edge_count"] = tg.graph.edge_count()
+    return tg, stats
diff --git a/graph_tool_call/graphify/retrieval.py b/graph_tool_call/graphify/retrieval.py
new file mode 100644
index 0000000..f15e4bc
--- /dev/null
+++ b/graph_tool_call/graphify/retrieval.py
@@ -0,0 +1,467 @@
+"""Zero-vector retrieval over a graphify-style ToolGraph.
+
+Algorithm (mirrors graphify/serve.py):
+  1. seed = top-5 of BM25(query)  (substring fallback if BM25 returns empty)
+  2. weights = INTENT_RELATION_WEIGHTS[dominant_intent] or DEFAULT
+  3. score = parent_score * rel_weight[rel] * CONF_FACTOR[confidence] * decay(depth)
+     CONF_FACTOR = {EXTRACTED: 1.0, INFERRED: 0.7, AMBIGUOUS: 0.4, None: 0.5}
+     decay(d)   = 1 / (0.5*d + 1)
+  4. BFS from seeds, depth=2, accumulate max score per neighbour
+  5. history-aware demote (used tools * 0.6)
+  6. render_subgraph_text(top_k nodes + edges, token_budget)
+
+Why this works without embeddings:
+  - The graph carries the semantic signal (CRUD chains, $ref data flow,
+    cross-resource matches) — once a relationship is in the graph, traversal
+    finds it.
+  - Confidence labels let the score down-weight guesses without dropping them;
+    AMBIGUOUS edges still appear, just behind EXTRACTED ones.
+  - Token-budgeted rendering means an LLM gets a compact, structured context
+    (not a list of tool JSON blobs) and can decide chains via the EDGE lines.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from typing import Any
+
+from graph_tool_call.core.protocol import GraphEngine
+from graph_tool_call.core.tool import ToolSchema
+from graph_tool_call.ontology.schema import (
+    DEFAULT_RELATION_WEIGHTS,
+    INTENT_RELATION_WEIGHTS,
+    NodeType,
+    RelationType,
+)
+from graph_tool_call.retrieval.intent import classify_intent
+from graph_tool_call.tool_graph import ToolGraph
+
+# Score multiplier per confidence bucket. EXTRACTED edges are deterministic
+# (path/CRUD/$ref) and trusted at 1.0; INFERRED is heuristic but still
+# high-confidence; AMBIGUOUS gets a strong penalty so it's surfaced for
+# review without dominating EXTRACTED chains.
+#
+# Edges added by callers without a confidence attr (e.g. legacy code paths)
+# get the same weight as the no-bucket fallback (0.5) — neither rewarded
+# nor heavily penalised.
+CONF_FACTOR: dict[str | None, float] = {
+    "EXTRACTED": 1.0,
+    "INFERRED": 0.7,
+    "AMBIGUOUS": 0.4,
+    None: 0.5,
+}
+
+_DEFAULT_DEPTH = 2
+_DEFAULT_TOP_K = 10
+_DEFAULT_BUDGET = 2000
+_HISTORY_DEMOTE = 0.6
+
+
+# ---------------------------------------------------------------------------
+# Seed selection
+# ---------------------------------------------------------------------------
+
+
+def _strip_diacritics(text: str) -> str:
+    chars = filter(lambda ch: not unicodedata.combining(ch), unicodedata.normalize("NFKD", text))
+    return "".join(chars)  # NFKD splits accents into combining marks, which the filter drops
+
+
+def _substring_seeds(
+    tools: dict[str, ToolSchema],
+    query: str,
+    *,
+    limit: int = 5,
+) -> list[tuple[str, float]]:
+    """Rank tools by query-term substring hits in name/description (fallback for empty BM25)."""
+    needle = _strip_diacritics(query).lower()
+    terms = [tok for tok in re.split(r"[\s_\-/.,;:!?()]+", needle) if len(tok) > 1]
+    hits: list[tuple[str, float]] = []
+    for name, tool in tools.items():
+        haystack_name = _strip_diacritics(name).lower()
+        haystack_desc = _strip_diacritics(tool.description or "").lower()
+        in_name = sum(tok in haystack_name for tok in terms)
+        in_desc = sum(tok in haystack_desc for tok in terms)
+        if in_name or in_desc:  # a name hit weighs 1.0, a description hit 0.5
+            hits.append((name, in_name + 0.5 * in_desc))
+    return sorted(hits, key=lambda pair: -pair[1])[:limit]
+
+
+def _bm25_seeds(tg: ToolGraph, query: str, *, limit: int = 5) -> list[tuple[str, float]]:
+    """Return up to *limit* (tool, score) pairs from the engine's BM25 index, best first."""
+    try:
+        bm25 = tg._get_retrieval_engine()._get_bm25()  # noqa: SLF001
+    except Exception:
+        # Best effort: any failure to obtain the index means "no BM25 seeds".
+        return []
+    scores = bm25.score(query)
+    if not scores:
+        return []
+    best_first = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
+    return [(name, score) for name, score in best_first[:limit]]
+
+
+def _select_seeds(
+    tg: ToolGraph,
+    query: str,
+    *,
+    limit: int = 5,
+) -> list[tuple[str, float]]:
+    """Seed with BM25 hits, falling back to substring matching when BM25 yields none."""
+    primary = _bm25_seeds(tg, query, limit=limit)
+    # An empty BM25 result triggers the substring scan over all tools.
+    return primary if primary else _substring_seeds(tg.tools, query, limit=limit)
+
+
+# ---------------------------------------------------------------------------
+# BFS traversal
+# ---------------------------------------------------------------------------
+
+
+def _intent_weights(query: str) -> tuple[dict[str, float], str]:
+    """Choose the relation-weight table matching the query's strongest intent.
+
+    Returns (weights_map, label). label is 'read'/'write'/'delete', or 'neutral' (with
+    the default weights) when the intent is neutral or its strongest dimension is < 0.5.
+    """
+    intent = classify_intent(query)
+    if intent.is_neutral:
+        return DEFAULT_RELATION_WEIGHTS, "neutral"
+    strengths = [
+        ("read", intent.read_intent),
+        ("write", intent.write_intent),
+        ("delete", intent.delete_intent),
+    ]
+    # max() keeps the first maximum, so ties resolve in read > write > delete order.
+    dominant, strength = max(strengths, key=lambda pair: pair[1])
+    if strength < 0.5:
+        return DEFAULT_RELATION_WEIGHTS, "neutral"
+    return INTENT_RELATION_WEIGHTS.get(dominant, DEFAULT_RELATION_WEIGHTS), dominant
+
+
+def _normalize_relation_key(rel: Any) -> Any:
+    """Coerce a plain-string relation to ``RelationType``; return anything else unchanged."""
+    # Enum members (checked first, in case the enum also subclasses str) and
+    # non-string values are returned exactly as given.
+    if isinstance(rel, RelationType) or not isinstance(rel, str):
+        return rel
+    try:
+        return RelationType(rel)
+    except ValueError:
+        return rel  # unknown relation name: keep the raw string as the lookup key
+
+
+def _bfs_from_seeds(
+    graph: GraphEngine,
+    seed_scores: list[tuple[str, float]],
+    *,
+    depth: int,
+    rel_weights: dict[str, float],
+) -> tuple[dict[str, float], list[tuple[str, str]]]:
+    """Confidence-weighted BFS. Returns (scores, edges_visited).
+
+    Score policy:
+      seeds:        normalized BM25 score (top seed = 1.0, others scaled)
+      neighbour at depth d via edge of weight w and confidence c:
+        score(neighbour) = max(prev,  parent_score * w * CONF_FACTOR[c] * 1/(0.5*d + 1))
+
+    The max is taken over every edge that reaches a tool, including tools that
+    were already discovered (or are seeds): they are re-scored when a later
+    edge offers more, but are queued for expansion only once. Seeds are
+    normalized by ``score / max_seed_score`` so BM25's relative ranking is
+    preserved instead of every seed starting at a flat 1.0.
+
+    Tools nodes are scored; CATEGORY/DOMAIN nodes are passthrough so we can
+    reach sibling tools on the next hop.
+    """
+    if not seed_scores:
+        return {}, []
+
+    max_seed = max((s for _, s in seed_scores), default=1.0) or 1.0
+    scores: dict[str, float] = {n: s / max_seed for n, s in seed_scores if graph.has_node(n)}
+    visited: set[str] = set(scores)
+    frontier: list[str] = list(scores)
+    edges_visited: list[tuple[str, str]] = []
+
+    for d in range(1, depth + 1):
+        decay = 1.0 / (0.5 * d + 1)
+        next_frontier: list[str] = []
+        for node in frontier:
+            # Passthrough hubs are never scored, so paths continuing through one carry 0.0.
+            parent_score = scores.get(node, 0.0)
+            try:
+                edges = graph.get_edges_from(node, direction="both")
+            except (KeyError, ValueError):
+                continue
+            for src, tgt, attrs in edges:
+                neighbour = tgt if src == node else src
+                if neighbour == node:
+                    continue  # self-loop: nothing to propagate
+                first_visit = neighbour not in visited
+                neighbour_type = graph.get_node_attrs(neighbour).get("node_type")
+
+                if neighbour_type == NodeType.TOOL:
+                    rel_key = _normalize_relation_key(attrs.get("relation"))
+                    rel_w = rel_weights.get(rel_key, 0.3)
+                    # Unwrap enum labels so both "EXTRACTED" and an enum member match.
+                    conf = attrs.get("confidence")
+                    conf_factor = CONF_FACTOR.get(getattr(conf, "value", conf), CONF_FACTOR[None])
+                    # Propagate parent's score so a high-BM25 seed lifts its
+                    # neighbours more than a low-BM25 seed does.
+                    score = parent_score * rel_w * conf_factor * decay
+                    # Keep the best score over ALL edges: an already-visited tool is
+                    # re-scored (but never re-queued) when this edge beats its score.
+                    if first_visit or score > scores.get(neighbour, 0.0):
+                        scores[neighbour] = max(scores.get(neighbour, 0.0), score)
+                        edges_visited.append((src, tgt))
+                elif neighbour_type not in (NodeType.CATEGORY, NodeType.DOMAIN):
+                    continue  # other node kinds are neither scored nor traversed
+                # Tools and passthrough hubs are expanded exactly once.
+                if first_visit:
+                    next_frontier.append(neighbour)
+                    visited.add(neighbour)
+        frontier = next_frontier
+        if not frontier:
+            break
+
+    return scores, edges_visited
+
+
+# ---------------------------------------------------------------------------
+# Subgraph rendering
+# ---------------------------------------------------------------------------
+
+
+def _node_line(name: str, tool: ToolSchema | None, attrs: dict) -> str:
+    """Render one NODE line: name plus optional [METHOD path]/[source=]/[community=] tags."""
+    md = (tool.metadata if tool else {}) or {}
+    method = str(md.get("method") or "").upper()
+    path = str(md.get("path") or "")
+    src_label = str(md.get("source_label") or "")
+    community = attrs.get("community")
+    parts = [name]
+    if method or path:
+        parts.append("[" + f"{method} {path}".strip() + "]")  # trim inside the brackets
+    if src_label:
+        parts.append(f"[source={src_label}]")
+    if community is not None:
+        parts.append(f"[community={community}]")
+    return "NODE " + " ".join(p for p in parts if p)
+
+
+def _edge_line(
+    u: str,
+    v: str,
+    attrs: dict,
+) -> str:
+    """One EDGE line: ``EDGE u --relation [confidence]--> v   (evidence)``."""
+    rel = attrs.get("relation")
+    rel_str = rel.value if hasattr(rel, "value") else str(rel)
+    conf = attrs.get("confidence", "")
+    conf_str = f" [{getattr(conf, 'value', conf)}]" if conf else ""  # unwrap enum labels too
+    line = f"EDGE {u} --{rel_str}{conf_str}--> {v}"
+    evidence = attrs.get("evidence")
+    if evidence:
+        line += f"   ({evidence})"
+    return line
+
+
+def render_subgraph_text(
+    tg: ToolGraph,
+    nodes: set[str] | list[str],
+    edges: list[tuple[str, str]] | None = None,
+    *,
+    token_budget: int = _DEFAULT_BUDGET,
+    sort_by_score: dict[str, float] | None = None,
+) -> str:
+    """Render the matched subgraph as ``NODE ...`` / ``EDGE ...`` lines.
+
+    Approx 3 chars per token is the budget conversion. When the rendering
+    overflows the budget, the tail is cut and a ``... (truncated ...)`` line
+    is appended.
+
+    sort_by_score: if provided, NODE lines are emitted in descending score
+    order (ties broken by name); otherwise they are sorted by name.
+
+    edges: optional BFS hint. NOTE(review): this function currently does not
+    read it — edge lines always follow node order. ALL graph edges between
+    any pair of chosen nodes are emitted so the LLM sees the full local
+    structure (matching graphify's behaviour).
+    """
+    char_budget = token_budget * 3
+    node_set: set[str] = set(nodes)
+
+    # Order nodes: by retrieval score (desc) if known, else by name.
+    if sort_by_score:
+        node_order = sorted(node_set, key=lambda n: (-sort_by_score.get(n, 0.0), n))
+    else:
+        node_order = sorted(node_set)
+
+    lines: list[str] = []
+    for n in node_order:
+        if not tg.graph.has_node(n):
+            continue
+        attrs = tg.graph.get_node_attrs(n)
+        tool = tg.tools.get(n)
+        lines.append(_node_line(n, tool, attrs))
+
+    # Walk all graph edges between chosen nodes (not just BFS visited ones)
+    # so the LLM gets the complete local structure. Edge lines are emitted in
+    # node order (outgoing edges of each node); no separate sort is applied.
+    seen_edges: set[tuple[str, str]] = set()
+    edge_lines: list[str] = []
+    for u in node_order:
+        if not tg.graph.has_node(u):
+            continue
+        try:
+            outgoing = tg.graph.get_edges_from(u, direction="out")
+        except (KeyError, ValueError):
+            continue
+        for src, tgt, attrs in outgoing:
+            if tgt not in node_set:
+                continue
+            key = (src, tgt)
+            if key in seen_edges:
+                continue
+            seen_edges.add(key)
+            edge_lines.append(_edge_line(src, tgt, attrs))
+
+    lines.extend(edge_lines)
+
+    output = "\n".join(lines)
+    if len(output) > char_budget:
+        # Cut at the last newline inside the budget window (if the window has no
+        # newline the partial first line is kept), then append a marker. The marker
+        # may push us slightly over the char budget — the token budget is a soft cap.
+        cut = output[:char_budget].rsplit("\n", 1)[0]
+        output = cut + f"\n... (truncated to ~{token_budget} token budget)"
+    return output
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def retrieve_graphify(
+    tg: ToolGraph,
+    query: str,
+    *,
+    top_k: int = _DEFAULT_TOP_K,
+    depth: int = _DEFAULT_DEPTH,
+    token_budget: int = _DEFAULT_BUDGET,
+    history: list[str] | None = None,
+) -> dict[str, Any]:
+    """Retrieve tools for a natural-language query using graph traversal only.
+
+    Parameters
+    ----------
+    tg:
+        A graphify-style ``ToolGraph``. Edges should carry ``confidence``
+        attrs (EXTRACTED/INFERRED/AMBIGUOUS); edges without one get the
+        neutral 0.5 multiplier.
+    query:
+        Natural-language search.
+    top_k:
+        Maximum tools in the result set (and the rendered subgraph). Values
+        below zero are treated as zero.
+    depth:
+        BFS depth from seeds. 2 is graphify's default and works for most
+        workflow chains (createX -> getX -> doSomethingWithX).
+    token_budget:
+        Token budget for the rendered text (converted at ~3 chars/token).
+    history:
+        Tool names already called in this session — their scores are scaled
+        by ``_HISTORY_DEMOTE`` to favour progress over re-suggesting.
+
+    Returns
+    -------
+    dict with keys:
+      - results:        list of {name, score, tool: {...}} sorted desc.
+      - subgraph_text:  the LLM-ready NODE/EDGE rendering.
+      - intent:         {dominant: 'read'|'write'|'delete'|'neutral', read, write, delete}
+      - stats:          {seeds: [...], visited_nodes: int, visited_edges: int}
+
+    An empty query, an empty tool set, or a query that yields no seeds returns
+    this same shape with empty results and a ``neutral`` intent.
+
+    Note: prerequisite chain construction (e.g. listOrders → getOrder → cancelOrder)
+    is NOT this function's job — it lives in Stage 2 ``synthesize_plan`` which
+    consumes the graph this module produces. retrieve_graphify only finds the
+    primary candidates; chain assembly is downstream.
+    """
+    # Shared shape for every "nothing to retrieve" outcome.
+    empty: dict[str, Any] = {
+        "results": [],
+        "subgraph_text": "",
+        "intent": {"dominant": "neutral", "read": 0.0, "write": 0.0, "delete": 0.0},
+        "stats": {"seeds": [], "visited_nodes": 0, "visited_edges": 0},
+    }
+    if not query or not tg.tools:
+        return empty
+
+    # 1) Seeds
+    seeds_with_scores = _select_seeds(tg, query, limit=5)
+    seed_names = [s for s, _ in seeds_with_scores]
+    if not seed_names:
+        return empty
+
+    # 2) Intent → relation weight map
+    rel_weights, dominant = _intent_weights(query)
+    from graph_tool_call.retrieval.intent import classify_intent  # noqa: I001 (re-import OK)
+
+    intent_obj = classify_intent(query)
+
+    # 3) BFS — pass full (name, score) pairs so seed scores reflect BM25 ranking
+    scores, edges_visited = _bfs_from_seeds(
+        tg.graph,
+        seeds_with_scores,
+        depth=depth,
+        rel_weights=rel_weights,
+    )
+
+    # 4) History demote
+    for h in history or ():
+        if h in scores:
+            scores[h] *= _HISTORY_DEMOTE
+
+    # 5) Filter to TOOL nodes only and rank. Clamp top_k: a negative value
+    #    would otherwise slice from the end and return "all but the last N".
+    tool_scores: dict[str, float] = {n: s for n, s in scores.items() if n in tg.tools}
+    ranked = sorted(tool_scores.items(), key=lambda x: x[1], reverse=True)[: max(top_k, 0)]
+    chosen_names: set[str] = {n for n, _ in ranked}
+
+    # 6) Render
+    subgraph_text = render_subgraph_text(
+        tg,
+        chosen_names,
+        edges_visited,
+        token_budget=token_budget,
+        sort_by_score=tool_scores,
+    )
+
+    results = [
+        {
+            "name": name,
+            "score": round(score, 4),
+            "tool": tg.tools[name].to_dict(),
+        }
+        for name, score in ranked
+    ]
+
+    return {
+        "results": results,
+        "subgraph_text": subgraph_text,
+        "intent": {
+            "dominant": dominant,
+            "read": round(intent_obj.read_intent, 3),
+            "write": round(intent_obj.write_intent, 3),
+            "delete": round(intent_obj.delete_intent, 3),
+        },
+        "stats": {
+            "seeds": seed_names,
+            "visited_nodes": len(scores),
+            "visited_edges": len(edges_visited),
+        },
+    }
diff --git a/graph_tool_call/ingest/io_contract.py b/graph_tool_call/ingest/io_contract.py
new file mode 100644
index 0000000..90bf308
--- /dev/null
+++ b/graph_tool_call/ingest/io_contract.py
@@ -0,0 +1,349 @@
+"""Field-level IO contract extraction from OpenAPI / Swagger schemas.
+
+Used by L0 Knowledge Base — **Pass 1, deterministic**. Walks request and
+response schemas and emits leaf field descriptors with JsonPath. The output
+feeds:
+
+  - Tool Graph: produces × consumes field-name match → ``produces_for`` edge
+  - Pass 2 enrichment: provides field list to LLM for ``semantic_tag`` assignment
+  - Stage 3 Runner: bindings reference these json_paths
+
+This module assumes the input schema is **already $ref-resolved** (caller
+runs ``_resolve_refs`` from ``graph_tool_call.ingest.openapi``).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class FieldLeaf:
+    """Descriptor for one leaf field found while walking a JSON Schema.
+
+    For produces, ``json_path`` is the dotted JSONPath from the schema root
+    with ``[*]`` as the array wildcard. For consumes, callers usually flatten
+    it to ``field_name`` because bindings are keyed by name rather than path.
+    """
+
+    json_path: str  # e.g. ``$.body.goods[*].goodsNo``
+    field_name: str  # leaf name, e.g. ``goodsNo``
+    field_type: str  # JSON Schema type name
+    required: bool = False
+    description: str = ""
+    enum: list[Any] = field(default_factory=list)  # allowed values, if declared
+
+
+# ---------------------------------------------------------------------------
+# Schema walker
+# ---------------------------------------------------------------------------
+
+
+_DEFAULT_MAX_DEPTH = 8
+
+
+def extract_leaves(
+    schema: Any,
+    *,
+    base_path: str = "$",
+    parent_required: bool = False,
+    max_depth: int = _DEFAULT_MAX_DEPTH,
+    _depth: int = 0,
+) -> list[FieldLeaf]:
+    """Collect leaf field descriptors by recursively walking a JSON Schema.
+
+    Parameters
+    ----------
+    schema:
+        JSON Schema dict, expected to be $ref-resolved already. Non-dict input
+        yields an empty list.
+    base_path:
+        JSONPath of this subtree (e.g. ``$``, ``$.body``).
+    parent_required:
+        Whether the containing field is required by its parent; copied onto
+        primitive leaves so callers can filter required-only consumes.
+    max_depth:
+        Hard recursion limit that stops cyclic or pathologically nested schemas.
+
+    Returns
+    -------
+    list[FieldLeaf]
+        One entry per primitive leaf reachable; array items carry a ``[*]``
+        path suffix and descriptions are cut to 200 characters. Combinators
+        are flattened by ``_resolve_combinators`` before the type is inspected.
+    """
+    if not isinstance(schema, dict) or _depth > max_depth:
+        return []
+
+    resolved = _resolve_combinators(schema)
+    kind = _normalize_type(resolved.get("type"))
+
+    # Objects (declared, or implied by ``properties``) delegate to the walker.
+    if kind == "object" or "properties" in resolved:
+        return _walk_object(resolved, base_path, max_depth, _depth)
+
+    # Arrays recurse into ``items`` with the ``[*]`` wildcard appended.
+    if kind == "array":
+        return extract_leaves(
+            resolved.get("items") or {},
+            base_path=f"{base_path}[*]",
+            parent_required=parent_required,
+            max_depth=max_depth,
+            _depth=_depth + 1,
+        )
+
+    # Anything else is a primitive named after the trailing path segment.
+    # A nameless root has nothing useful to report.
+    name = _last_path_segment(base_path)
+    if not name:
+        return []
+    leaf = FieldLeaf(
+        json_path=base_path,
+        field_name=name,
+        field_type=kind or "string",
+        required=parent_required,
+        description=str(resolved.get("description") or "")[:200],
+        enum=list(resolved.get("enum") or []),
+    )
+    return [leaf]
+
+
+def _walk_object(
+    schema: dict[str, Any],
+    base_path: str,
+    max_depth: int,
+    depth: int,
+) -> list[FieldLeaf]:
+    """Walk ``properties`` of an object schema; childless properties become one generic leaf."""
+    properties = schema.get("properties") or {}
+    if not isinstance(properties, dict):
+        return []
+    # ``required`` should be a list of names; specs in the wild also carry a
+    # boolean there. Treat any non-list as "none required" instead of raising.
+    declared = schema.get("required")
+    required_set = set(declared) if isinstance(declared, (list, tuple)) else set()
+
+    leaves: list[FieldLeaf] = []
+    for prop_name, prop_schema in properties.items():
+        child_path = f"{base_path}.{prop_name}"
+        is_required = prop_name in required_set
+        found = extract_leaves(
+            prop_schema,
+            base_path=child_path,
+            parent_required=is_required,
+            max_depth=max_depth,
+            _depth=depth + 1,
+        )
+        if found:
+            leaves.extend(found)
+            continue
+        # No resolvable children: keep a placeholder so downstream knows the field exists.
+        info = prop_schema if isinstance(prop_schema, dict) else {}
+        leaves.append(
+            FieldLeaf(
+                json_path=child_path,
+                field_name=prop_name,
+                field_type=_schema_type(prop_schema) or "object",
+                required=is_required,
+                description=str(info.get("description") or "")[:200],
+            )
+        )
+    return leaves
+
+
+def _resolve_combinators(schema: dict[str, Any]) -> dict[str, Any]:
+    """Flatten ``allOf`` / pick the first ``oneOf`` / ``anyOf`` candidate.
+
+    Best-effort v1, not full JSON Schema semantics. ``allOf`` branches merge
+    into one object, except when they merely wrap a non-object schema (e.g. a
+    string enum given a description): its type/enum are kept rather than
+    being forced to "object". For ``oneOf``/``anyOf`` parent keys override.
+    """
+    branches = schema.get("allOf")
+    if isinstance(branches, list):
+        subs = [s for s in branches if isinstance(s, dict)]
+        props: dict[str, Any] = dict(schema.get("properties") or {})
+        required: list[str] = list(schema.get("required") or [])
+        for sub in subs:
+            props.update(sub.get("properties") or {})
+            for r in sub.get("required") or []:
+                if r not in required:
+                    required.append(r)
+        own = {k: v for k, v in schema.items() if k != "allOf"}
+        prim = next((s for s in subs if s.get("type") not in (None, "object")), None)
+        if prim and not props and own.get("type") in (None, prim["type"]):
+            return {**prim, **own}
+        return {**schema, "type": "object", "properties": props, "required": required}
+
+    for key in ("oneOf", "anyOf"):
+        candidates = schema.get(key)
+        if isinstance(candidates, list):
+            first = next((c for c in candidates if isinstance(c, dict)), None)
+            if first is not None:
+                # Candidate is the base; the parent's own fields win on conflict.
+                return {**first, **{k: v for k, v in schema.items() if k != key}}
+    return schema
+
+
+def _normalize_type(t: Any) -> str:
+    """Reduce a JSON Schema ``type`` (str or list) to one name, skipping "null"."""
+    if not isinstance(t, list):
+        return t or ""
+    return next((name for name in t if name and name != "null"), "")
+
+
+def _schema_type(schema: Any) -> str:
+    """Normalized ``type`` of *schema*, or "" when *schema* is not a dict."""
+    is_mapping = isinstance(schema, dict)
+    return _normalize_type(schema.get("type")) if is_mapping else ""
+
+
+def _last_path_segment(path: str) -> str:
+    """Trailing field name of a JsonPath (``$.body.goods[*].goodsNo`` -> ``goodsNo``)."""
+    last = (path or "").rsplit(".", 1)[-1]
+    # Nested arrays append several wildcards (``$.rows[*][*]``): strip them all.
+    while last.endswith("[*]"):
+        last = last[:-3]
+    # A bare root (``$`` / ``$[*]``) carries no field name.
+    return "" if last == "$" else last
+
+
+# ---------------------------------------------------------------------------
+# Operation-level extraction (combines body + parameters)
+# ---------------------------------------------------------------------------
+
+
+def extract_produces_for_operation(
+    operation: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> list[FieldLeaf]:
+    """Return the JsonPath leaves of the operation's success response schema."""
+    schema = _pick_response_schema(operation, is_swagger2=is_swagger2)
+    # No (or empty) success schema means the operation produces nothing.
+    leaves = extract_leaves(schema, base_path="$") if schema else []
+    return leaves
+
+
+def extract_consumes_for_operation(
+    operation: dict[str, Any],
+    path_item: dict[str, Any] | None = None,
+    *,
+    is_swagger2: bool = False,
+    required_only: bool = True,
+) -> list[FieldLeaf]:
+    """Combine query/path/header parameters and request body into a flat
+    consume list.
+
+    Names are de-duplicated in reading order: operation parameters, path-item
+    parameters, then body fields. Path parameters always count as required.
+    Body fields are flattened to field-name level (the LLM-visible name) —
+    binding keys by name in Stage 2/3, not by nested path; the nested
+    structure for HTTP injection is handled separately via the existing
+    ``leaf_path_map`` mechanism on the tool row.
+    """
+    leaves: list[FieldLeaf] = []
+    seen_names: set[str] = set()
+
+    # query / path / header parameters
+    all_params = (operation.get("parameters") or []) + ((path_item or {}).get("parameters") or [])
+    for p in all_params:
+        if not isinstance(p, dict) or "name" not in p:
+            continue
+        loc = p.get("in")
+        if loc not in ("query", "path", "header"):
+            continue
+        # A path parameter fills a URL placeholder, so it is required even when
+        # the spec omits or negates the flag (same rule as the OpenAPI ingest).
+        is_required = loc == "path" or bool(p.get("required", False))
+        if (required_only and not is_required) or p["name"] in seen_names:
+            continue
+        seen_names.add(p["name"])
+        if is_swagger2:
+            # Swagger 2.0 — type and enum live directly on the parameter object.
+            ftype = p.get("type") or "string"
+            enum_vals = p.get("enum") or []
+        else:
+            # OpenAPI 3.x — type and enum live under ``schema``.
+            param_schema = p.get("schema") or {}
+            ftype = _schema_type(param_schema) or "string"
+            enum_vals = (param_schema.get("enum") or []) if isinstance(param_schema, dict) else []
+        leaves.append(
+            FieldLeaf(
+                json_path=p["name"],  # flat for consumes
+                field_name=p["name"],
+                field_type=ftype,
+                required=is_required,
+                description=str(p.get("description") or "")[:200],
+                enum=list(enum_vals),
+            )
+        )
+
+    # request body (flattened)
+    body_schema = _pick_request_body_schema(operation, is_swagger2=is_swagger2)
+    if body_schema:
+        for leaf in extract_leaves(body_schema, base_path="$"):
+            if (required_only and not leaf.required) or leaf.field_name in seen_names:
+                continue
+            seen_names.add(leaf.field_name)
+            leaves.append(
+                FieldLeaf(
+                    json_path=leaf.field_name,  # flat for consumes
+                    field_name=leaf.field_name,
+                    field_type=leaf.field_type,
+                    required=leaf.required,
+                    description=leaf.description,
+                    enum=leaf.enum,
+                )
+            )
+
+    return leaves
+
+
+def _pick_response_schema(
+    operation: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> dict[str, Any] | None:
+    """Schema of the first 200/201/default response (JSON media types preferred), else ``None``."""
+    for code in ("200", "201", "default"):
+        resp = (operation.get("responses") or {}).get(code)
+        if not isinstance(resp, dict):
+            continue
+        if "schema" in resp:  # Swagger 2.0 keeps the schema on the response itself
+            return resp["schema"]
+        media = {str(k): v for k, v in (resp.get("content") or {}).items() if isinstance(v, dict)}
+        for suffix in ("application/json", "+json", "*/*", ""):  # preference; "" matches any type
+            for ct, body in media.items():
+                if ct.endswith(suffix) and body.get("schema"):
+                    return body["schema"]
+    return None
+
+
+def _pick_request_body_schema(
+    operation: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> dict[str, Any] | None:
+    """Request-body schema: Swagger 2.0 ``in: body`` parameter or OpenAPI 3.x ``requestBody``."""
+    if is_swagger2:
+        declared = operation.get("parameters") or []
+        hit = next((p for p in declared if isinstance(p, dict) and p.get("in") == "body"), None)
+        return hit.get("schema") if hit is not None else None
+    # OpenAPI 3.x: prefer JSON, otherwise settle for the first declared media type.
+    content = (operation.get("requestBody") or {}).get("content") or {}
+    if "application/json" in content:
+        return content["application/json"].get("schema")
+    for media in content.values():
+        # Only the first entry is ever inspected.
+        return media.get("schema") if isinstance(media, dict) else None
+    return None
+
+
+__all__ = [
+    "FieldLeaf",
+    "extract_leaves",
+    "extract_produces_for_operation",
+    "extract_consumes_for_operation",
+]
diff --git a/graph_tool_call/ingest/openapi.py b/graph_tool_call/ingest/openapi.py
index 90399dd..8f53173 100644
--- a/graph_tool_call/ingest/openapi.py
+++ b/graph_tool_call/ingest/openapi.py
@@ -134,6 +134,41 @@ def _schema_type(schema: dict[str, Any]) -> str:
     return _TYPE_MAP.get(schema.get("type", "string"), "string")
 
 
+def _pick_content_schema(content: dict[str, Any]) -> dict[str, Any]:
+    """Pick a usable schema from an OpenAPI ``content`` object.
+
+    OpenAPI 3.x lets a request body / response declare schemas under any
+    media-type key. Candidates are tried in this order and the first one
+    that carries a non-empty dict ``schema`` wins:
+
+      1. ``application/json``                 — most common
+      2. ``application/*+json`` (e.g. hal+json) — JSON variants
+      3. ``*/*``                                — Spring/SpringDoc default when
+                                                  the operation doesn't pin a
+                                                  specific content type
+      4. first available media-type            — last resort
+
+    Media-type parameters and case are ignored for ranking, so
+    ``application/json;charset=UTF-8`` counts as JSON. An entry without a
+    schema never shadows a lower-ranked one; ``{}`` means nothing was usable.
+    """
+    if not isinstance(content, dict):
+        return {}
+
+    def rank(media_type: Any) -> int:
+        base = str(media_type).split(";", 1)[0].strip().lower()
+        if base == "application/json":
+            return 0
+        return 1 if base.endswith("+json") else 2 if base == "*/*" else 3
+
+    # sorted() is stable, so equally ranked entries keep declaration order.
+    for key in sorted(content, key=rank):
+        schema = content[key].get("schema") if isinstance(content[key], dict) else None
+        if isinstance(schema, dict) and schema:
+            return schema
+    return {}
+
+
 # ---------------------------------------------------------------------------
 # Operation -> ToolSchema
 # ---------------------------------------------------------------------------
@@ -167,6 +202,11 @@ def _extract_params_swagger2(
                 )
         else:
             is_required = p.get("required", False)
+            # OpenAPI 3.x / Swagger 2.0: path 파라미터는 본질적으로 required.
+            # 많은 spec이 명시 안 해도 URL placeholder라 호출 시 반드시 값이 있어야 함.
+            # synthesizer가 required 안 보고 빈 entity로 plan 생성 → HTTP 호출 실패 케이스 차단.
+            if location == "path":
+                is_required = True
             if required_only and not is_required:
                 continue
             params.append(
@@ -181,48 +221,190 @@ def _extract_params_swagger2(
     return params
 
 
+def _summarize_object_schema(schema: dict[str, Any], *, max_depth: int = 2) -> str:
+    """Summarize the nested properties of an object/array schema as readable text.
+
+    When a parameter's type is 'object'/'array', its inner field names are not
+    visible on the ToolParameter and the LLM has to guess them. This builds an
+    indented list (name, ``*`` if required, type, description or example) that
+    callers append to the parameter description. Returns "" when there is
+    nothing to list, e.g. for non-dict input or an array of primitives.
+    """
+    if not isinstance(schema, dict):
+        return ""
+
+    def _walk(s: dict[str, Any], depth: int, indent: int) -> list[str]:
+        if depth > max_depth or not isinstance(s, dict):
+            return []
+        prefix = "  " * indent
+
+        # Array: describe the element schema under an "[array of:]" header.
+        # The header is dropped when the items have no fields to show, so a
+        # plain list of primitives does not produce a dangling header.
+        if s.get("type") == "array":
+            inner = _walk(s.get("items") or {}, depth + 1, indent + 1)
+            return [f"{prefix}[array of:]", *inner] if inner else []
+
+        out: list[str] = []
+        declared = s.get("required")
+        required = set(declared) if isinstance(declared, list) else set()
+        for name, prop in (s.get("properties") or {}).items():
+            if not isinstance(prop, dict):
+                continue
+            ptype = _schema_type(prop)
+            desc = (prop.get("description") or "").strip()
+            line = f"{prefix}- {name}{'*' if name in required else ''} ({ptype})"
+            if desc:
+                line += f": {desc}"
+            elif prop.get("example") is not None:
+                # The example is only a fallback for an undocumented field.
+                line += f"  e.g. {prop['example']}"
+            out.append(line)
+            # Expand nested objects / structured array items one level deeper.
+            if depth < max_depth and ptype == "object":
+                out.extend(_walk(prop, depth + 1, indent + 1))
+            elif depth < max_depth and ptype == "array":
+                items = prop.get("items")
+                # ``items`` may be absent or not a schema object; only dicts are walked.
+                if isinstance(items, dict) and (
+                    items.get("properties") or items.get("type") in ("object", "array")
+                ):
+                    out.extend(_walk(items, depth + 1, indent + 1))
+        return out
+
+    return "\n".join(_walk(schema, 0, 0))
+
+
 def _extract_params_openapi3(
     operation: dict[str, Any],
     resolved_spec: dict[str, Any],
     *,
     required_only: bool = False,
 ) -> list[ToolParameter]:
-    """Extract parameters from an OpenAPI 3.x operation."""
+    """Extract parameters from an OpenAPI 3.x operation.
+
+    Spring/SpringDoc gotcha: when a controller takes a `@ModelAttribute`
+    DTO via query string, the spec sometimes lists BOTH the wrapper
+    object AND its inner fields as separate query parameters
+    (``regularOrderDetailRequest`` ``in=query`` ``type=object`` AND
+    ``rglrDeliNo`` ``in=query`` ``type=string``). Treating the wrapper
+    as a real input field poisons downstream producer matching: nothing
+    in the API ever returns a value named after the wrapper class, so
+    PathSynthesizer raises ``UnsatisfiableField`` on a phantom field.
+
+    Strategy: drop wrapper parameters when their inner properties are
+    already exposed as siblings; otherwise expand the wrapper into its
+    leaf properties so callers see the real input names.
+    """
     params: list[ToolParameter] = []
 
+    raw_parameters = list(operation.get("parameters", []))
+    # Pre-collect names from non-object parameters — used to detect when
+    # a wrapper's inner property is already exposed alongside it.
+    sibling_names: set[str] = {
+        str(p.get("name") or "")
+        for p in raw_parameters
+        if isinstance(p, dict) and _schema_type(p.get("schema", {}) or {}) not in ("object",)
+    }
+
     # Path / query / header / cookie parameters
-    for p in operation.get("parameters", []):
+    for p in raw_parameters:
         if "name" not in p:
             continue  # skip malformed parameters (missing required 'name' field)
         schema = p.get("schema", {})
         is_required = p.get("required", False)
+        # OpenAPI 3.x: path 파라미터는 본질적으로 required (URL placeholder 채우려면 필수).
+        # 많은 spec이 명시 안 해도 강제로 required 처리해야 synthesizer가 빈 entity를
+        # UnsatisfiableFieldError로 raise → question.required popup으로 사용자에게 묻는다.
+        if p.get("in") == "path":
+            is_required = True
+        ptype = _schema_type(schema)
+
+        # Wrapper-object/array query parameter handling.
+        # type=object → wrapper itself (Spring @ModelAttribute style).
+        # type=array of objects → wrapper used to send a list of structured
+        # records (less common but seen in some Spring specs); we expand the
+        # element schema's properties. Primitive arrays (array of integers /
+        # strings) are real list inputs and are NOT expanded here — those
+        # belong to the caller as a single multi-value field.
+        if ptype in ("object", "array") and p.get("in") == "query":
+            wrapper_props: dict[str, Any] = {}
+            wrapper_required: set[str] = set()
+            if ptype == "object":
+                wrapper_props = (schema.get("properties") or {}) if isinstance(schema, dict) else {}
+                wrapper_required = set(schema.get("required") or [])
+            else:  # array
+                items = (schema.get("items") or {}) if isinstance(schema, dict) else {}
+                if isinstance(items, dict) and items.get("type") == "object":
+                    wrapper_props = items.get("properties") or {}
+                    wrapper_required = set(items.get("required") or [])
+                # else: primitive-element array — don't expand, treat as real input
+            if wrapper_props:
+                # If every inner property is already a sibling parameter,
+                # drop the wrapper entirely (deduplication).
+                if all(prop in sibling_names for prop in wrapper_props):
+                    continue
+                # Otherwise expand the wrapper into individual leaves so
+                # producer matching has real field names to chase.
+                for prop_name, prop_schema in wrapper_props.items():
+                    if prop_name in sibling_names:
+                        continue  # don't double-list ones already exposed
+                    inner_required = prop_name in wrapper_required
+                    if required_only and not inner_required:
+                        continue
+                    inner_type = _schema_type(prop_schema or {})
+                    inner_desc = (prop_schema or {}).get("description", "") or ""
+                    params.append(
+                        ToolParameter(
+                            name=prop_name,
+                            type=inner_type,
+                            description=inner_desc,
+                            required=inner_required,
+                            enum=(prop_schema or {}).get("enum"),
+                        )
+                    )
+                continue  # wrapper itself is not added
+
         if required_only and not is_required:
             continue
+        desc = p.get("description", "") or ""
+        # object/array 타입이면 nested fields를 description에 펼쳐서
+        # LLM이 정확한 필드명(예: searchWord)을 알 수 있게 한다.
+        if ptype in ("object", "array"):
+            nested = _summarize_object_schema(schema)
+            if nested:
+                desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}"
         params.append(
             ToolParameter(
                 name=p["name"],
-                type=_schema_type(schema),
-                description=p.get("description", ""),
+                type=ptype,
+                description=desc,
                 required=is_required,
                 enum=schema.get("enum"),
             )
         )
 
-    # requestBody
+    # requestBody — pick the most specific schema across declared media types
+    # (Spring/SpringDoc commonly emits */* — see _pick_content_schema notes).
     request_body = operation.get("requestBody", {})
     content = request_body.get("content", {})
-    json_content = content.get("application/json", {})
-    body_schema = json_content.get("schema", {})
+    body_schema = _pick_content_schema(content)
     body_required = set(body_schema.get("required", []))
     for prop_name, prop_schema in body_schema.get("properties", {}).items():
         is_required = prop_name in body_required
         if required_only and not is_required:
             continue
+        desc = prop_schema.get("description") or ""
+        # nested object/array는 한 단계 더 펼치기
+        if _schema_type(prop_schema) in ("object", "array"):
+            nested = _summarize_object_schema(prop_schema)
+            if nested:
+                desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}"
         params.append(
             ToolParameter(
                 name=prop_name,
                 type=_schema_type(prop_schema),
-                description=prop_schema.get("description", ""),
+                description=desc,
                 required=is_required,
             )
         )
@@ -304,6 +486,34 @@ def _enrich_description(description: str, method: str, path: str) -> str:
     return description
 
 
+def _resolve_server_url(
+    operation: dict[str, Any],
+    path_item: dict[str, Any] | None,
+    spec: dict[str, Any],
+    *,
+    is_swagger2: bool = False,
+) -> str | None:
+    """Resolve the base URL for one operation, or ``None`` if none is declared.
+
+    OpenAPI precedence is operation.servers > path.servers > spec.servers: the
+    first level whose first server entry has a ``url`` wins. Swagger 2.0 joins
+    ``schemes`` (default ``https``), ``host`` and ``basePath``. Trailing ``/`` is stripped.
+    """
+    if is_swagger2:
+        host = spec.get("host")
+        schemes = spec.get("schemes") or ["https"]
+        prefix = spec.get("basePath") or ""
+        return f"{schemes[0]}://{host}{prefix}".rstrip("/") if host else None
+
+    for scope in (operation, path_item or {}, spec):
+        servers = scope.get("servers") if isinstance(scope, dict) else None
+        if isinstance(servers, list) and servers:
+            url = (servers[0] or {}).get("url")
+            if url:
+                return str(url).rstrip("/")
+    return None
+
+
 def _operation_to_tool(
     operation_id: str,
     operation: dict[str, Any],
@@ -313,6 +523,7 @@ def _operation_to_tool(
     *,
     is_swagger2: bool = False,
     required_only: bool = False,
+    path_item: dict[str, Any] | None = None,
 ) -> ToolSchema:
     """Convert a single OpenAPI operation into a ToolSchema."""
     description = operation.get("summary") or operation.get("description", "")
@@ -333,21 +544,24 @@ def _operation_to_tool(
     else:
         parameters = _extract_params_openapi3(operation, resolved_spec, required_only=required_only)
 
-    # Build response schema metadata
+    # Build response schema metadata. Walk responses in success-code order
+    # and use _pick_content_schema so we don't drop schemas declared under
+    # */*, application/*+json, or other non-JSON media types.
     responses = operation.get("responses", {})
     response_schema: dict[str, Any] = {}
     for code in ("200", "201", "default"):
-        if code in responses:
-            resp = responses[code]
-            # Swagger 2.0
-            if "schema" in resp:
-                response_schema = resp["schema"]
-                break
-            # OpenAPI 3.x
-            resp_content = resp.get("content", {})
-            if "application/json" in resp_content:
-                response_schema = resp_content["application/json"].get("schema", {})
-                break
+        if code not in responses:
+            continue
+        resp = responses[code] or {}
+        # Swagger 2.0 puts the schema directly on the response object.
+        if "schema" in resp and isinstance(resp.get("schema"), dict):
+            response_schema = resp["schema"]
+            break
+        # OpenAPI 3.x: inspect the content map.
+        picked = _pick_content_schema(resp.get("content") or {})
+        if picked:
+            response_schema = picked
+            break
 
     metadata: dict[str, Any] = {
         "source": "openapi",
@@ -357,6 +571,13 @@ def _operation_to_tool(
     if response_schema:
         metadata["response_schema"] = response_schema
 
+    # Derive a per-tool base_url from the spec/path/operation-level servers field.
+    # When one collection mixes sources with different hosts, this lets the
+    # executor call each tool with the matching base_url.
+    server_url = _resolve_server_url(operation, path_item, resolved_spec, is_swagger2=is_swagger2)
+    if server_url:
+        metadata["base_url"] = server_url
+
     return ToolSchema(
         name=operation_id,
         description=description,
@@ -459,6 +680,7 @@ def ingest_openapi(
                 resolved_raw,
                 is_swagger2=is_swagger2,
                 required_only=required_only,
+                path_item=path_item,
             )
             tools.append(tool)
 
diff --git a/graph_tool_call/langchain/gateway.py b/graph_tool_call/langchain/gateway.py
index cfde75e..a570589 100644
--- a/graph_tool_call/langchain/gateway.py
+++ b/graph_tool_call/langchain/gateway.py
@@ -66,6 +66,89 @@ def _extract_parameters_info(tool: Any) -> list[dict[str, Any]] | None:
     return None
 
 
+def _summarize_response_schema(schema: dict[str, Any]) -> str | None:
+    """Produce a one-line summary of an OpenAPI response schema for the LLM.
+
+    Lists up to 12 top-level field names + types so the model can plan
+    parameter extraction for the next call. ``type`` lists (OpenAPI 3.1,
+    e.g. ``["string", "null"]``) are rendered as ``string|null``.
+    """
+    if not isinstance(schema, dict):
+        return None
+
+    def _label(node: dict[str, Any], default: str | None) -> str | None:
+        # Prefer ``type``; fall back to the last ``$ref`` segment, then default.
+        t = node.get("type") or str(node.get("$ref") or "").rsplit("/", 1)[-1]
+        return "|".join(map(str, t)) if isinstance(t, list) else (t or default)
+
+    # Unwrap arrays so the element shape is what gets summarised.
+    is_array = schema.get("type") == "array" and isinstance(schema.get("items"), dict)
+    container = schema["items"] if is_array else schema
+
+    props = container.get("properties")
+    if not isinstance(props, dict) or not props:
+        # No properties: fall back to a bare type description.
+        t = _label(container, None)
+        return f"array of {t}" if is_array and t else t
+
+    fields = [
+        f"{name}:{_label(info, 'object')}" if isinstance(info, dict) else name
+        for name, info in list(props.items())[:12]
+    ]
+    summary = "{" + ", ".join(fields) + "}"
+    return f"array of {summary}" if is_array else summary
+
+
+def _enrich_from_graph(name: str, graph: Any | None) -> dict[str, Any]:
+    """Collect optional ToolGraph hints for *name*: ``source``, ``http``,
+    ``returns`` and ``next_candidates``. Best-effort: returns an empty or
+    partial dict when the graph or tool is unavailable; all keys are optional.
+    """
+    if graph is None:
+        return {}
+
+    enrichment: dict[str, Any] = {}
+
+    # A graph without a usable ``tools`` mapping yields no hints at all.
+    try:
+        tool_schema = graph.tools.get(name)
+    except Exception:
+        return enrichment
+
+    meta = getattr(tool_schema, "metadata", None)
+    if meta:
+        if meta.get("source_label"):
+            enrichment["source"] = meta["source_label"]
+        if meta.get("method") and meta.get("path"):
+            enrichment["http"] = f"{str(meta['method']).upper()} {meta['path']}"
+        rs = meta.get("response_schema")
+        if isinstance(rs, dict):
+            summary = _summarize_response_schema(rs)
+            if summary:
+                enrichment["returns"] = summary
+
+    # Outgoing edges → chain hints (at most 5 are reported).
+    try:
+        chains: list[str] = []
+        for _src, target, attrs in graph.graph.get_edges_from(name, direction="out"):
+            relation = attrs.get("relation")
+            if relation is None:
+                continue
+            relation_name = relation.value if hasattr(relation, "value") else str(relation)
+            # Skip purely structural BELONGS_TO edges
+            if relation_name in ("belongs_to", "BELONGS_TO"):
+                continue
+            chains.append(f"{relation_name}→{target}")
+            if len(chains) >= 5:
+                break
+        if chains:
+            enrichment["next_candidates"] = chains
+    except Exception:
+        pass
+
+    return enrichment
+
+
 def create_gateway_tools(
     tools: list[Any],
     *,
@@ -111,12 +194,15 @@ def create_gateway_tools(
     total = len(tool_map)
     call_history: list[str] = []
 
+    underlying_graph = getattr(toolkit, "graph", None)
+
     @langchain_tool
     def search_tools(query: str, top_k: int | None = None) -> str:
         """Search available tools by natural language query.
 
         Use this FIRST to find which tools are available for the task.
-        Returns tool names, descriptions, and required parameters.
+        Returns tool names, descriptions, parameters, response shape, and
+        ``next_candidates`` (related tools you may want to call afterwards).
 
         Args:
             query: Natural language search query (e.g. "cancel order", "send email")
@@ -135,11 +221,12 @@ def search_tools(query: str, top_k: int | None = None) -> str:
                 desc = t.get("description", "")
             entry: dict[str, Any] = {
                 "name": name,
-                "description": desc[:200],
+                "description": desc[:300],
             }
             params = _extract_parameters_info(t)
             if params:
                 entry["parameters"] = params
+            entry.update(_enrich_from_graph(name, underlying_graph))
             matched.append(entry)
 
         output = {
@@ -148,8 +235,10 @@ def search_tools(query: str, top_k: int | None = None) -> str:
             "total_tools": total,
             "tools": matched,
             "hint": (
-                "Use call_tool to execute a tool. "
-                "Pass tool_name and arguments as a dict matching the parameters above."
+                "Use call_tool to execute a tool. Pass tool_name and arguments "
+                "as a dict matching the parameters above. The 'returns' field "
+                "shows the response shape — extract values from there to build "
+                "arguments for the next call (see 'next_candidates')."
             ),
         }
 
diff --git a/graph_tool_call/net.py b/graph_tool_call/net.py
index dfe1c35..466ae30 100644
--- a/graph_tool_call/net.py
+++ b/graph_tool_call/net.py
@@ -44,8 +44,23 @@ def redirect_request(
         return super().redirect_request(req, fp, code, msg, headers, newurl)
 
 
-def _open_url(request: urllib.request.Request | str, *, timeout: int, max_redirects: int) -> Any:
-    opener = urllib.request.build_opener(_LimitedRedirectHandler(max_redirects))
+def _open_url(
+    request: urllib.request.Request | str,
+    *,
+    timeout: int,
+    max_redirects: int,
+    verify_ssl: bool = True,
+) -> Any:
+    """Open *request* via urllib; verify_ssl=False accepts self-signed / internal-CA certs."""
+    handlers: list[Any] = [_LimitedRedirectHandler(max_redirects)]
+    if not verify_ssl:
+        import ssl
+
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False  # has to be off before CERT_NONE may be set
+        ctx.verify_mode = ssl.CERT_NONE  # no certificate validation at all
+        handlers.append(urllib.request.HTTPSHandler(context=ctx))
+    opener = urllib.request.build_opener(*handlers)
     return opener.open(request, timeout=timeout)
 
 
@@ -128,13 +143,27 @@ def fetch_url_text(
     allowed_content_types: tuple[str, ...] = _DEFAULT_ALLOWED_CONTENT_TYPES,
     allow_private_hosts: bool = False,
     max_redirects: int = _DEFAULT_MAX_REDIRECTS,
+    verify_ssl: bool | None = None,
 ) -> str:
-    """Fetch UTF-8 text from a remote URL with basic SSRF protections."""
+    """Fetch UTF-8 text from a remote URL with basic SSRF protections.
+
+    ``verify_ssl`` — when None, it is derived from ``allow_private_hosts``
+    (internal hosts commonly use self-signed CAs, so verification defaults to off).
+    """
     validate_remote_url(url, allow_private_hosts=allow_private_hosts)
 
+    if verify_ssl is None:
+        # allow_private_hosts=True usually means an internal network; accept internal CAs.
+        verify_ssl = not allow_private_hosts
+
     req = urllib.request.Request(url, headers=headers or {})
     try:
-        with _open_url(req, timeout=timeout, max_redirects=max_redirects) as resp:
+        with _open_url(
+            req,
+            timeout=timeout,
+            max_redirects=max_redirects,
+            verify_ssl=verify_ssl,
+        ) as resp:
             final_url = url
             if hasattr(resp, "geturl"):
                 candidate = resp.geturl()
diff --git a/graph_tool_call/ontology/builder.py b/graph_tool_call/ontology/builder.py
index f6fb1a7..517d730 100644
--- a/graph_tool_call/ontology/builder.py
+++ b/graph_tool_call/ontology/builder.py
@@ -5,7 +5,7 @@
 from graph_tool_call.core.dict_graph import DictGraph
 from graph_tool_call.core.protocol import GraphEngine
 from graph_tool_call.core.tool import ToolSchema
-from graph_tool_call.ontology.schema import NodeType, RelationType
+from graph_tool_call.ontology.schema import Confidence, NodeType, RelationType
 
 
 class OntologyBuilder:
@@ -64,11 +64,36 @@ def add_relation(
         target: str,
         relation: str | RelationType,
         weight: float = 1.0,
+        *,
+        confidence: str | Confidence | None = None,
+        conf_score: float | None = None,
+        layer: int | None = None,
+        evidence: str | None = None,
     ) -> None:
-        """Add a directed relation between two nodes."""
+        """Add a directed relation between two nodes.
+
+        Optional graphify-style attrs (all default None — existing callers
+        unaffected):
+
+        confidence:  Confidence label (EXTRACTED / INFERRED / AMBIGUOUS).
+        conf_score:  Raw 0.0–1.0 score from the upstream detector.
+        layer:       1=structural (path/CRUD/$ref), 2=heuristic (name/RPC).
+        evidence:    Human-readable reason; capped at 200 chars to avoid bloat.
+        """
         if isinstance(relation, str):
             relation = RelationType(relation)
-        self._graph.add_edge(source, target, relation=relation, weight=weight)
+        if isinstance(confidence, Confidence):
+            confidence = confidence.value
+        attrs: dict = {"relation": relation, "weight": weight}
+        if confidence is not None:
+            attrs["confidence"] = confidence
+        if conf_score is not None:
+            attrs["conf_score"] = float(conf_score)
+        if layer is not None:
+            attrs["layer"] = int(layer)
+        if evidence:
+            attrs["evidence"] = evidence[:200]
+        self._graph.add_edge(source, target, **attrs)
 
     # --- queries ---
 
diff --git a/graph_tool_call/ontology/llm_provider.py b/graph_tool_call/ontology/llm_provider.py
index 7897554..8d19923 100644
--- a/graph_tool_call/ontology/llm_provider.py
+++ b/graph_tool_call/ontology/llm_provider.py
@@ -5,7 +5,7 @@
 import json
 import urllib.request
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any
 
 from graph_tool_call.ontology.schema import RelationType
@@ -13,11 +13,20 @@
 
 @dataclass
 class ToolSummary:
-    """Lightweight tool representation for LLM prompts."""
+    """Lightweight tool representation for LLM prompts.
+
+    The optional fields (``method``, ``path``, ``response_fields``) extend the
+    summary for semantic enrichment (``enrich_tool_semantics``). They are
+    ignored by methods that don't need them, preserving backward compat.
+    """
 
     name: str
     description: str
     parameters: list[str]  # just parameter names
+    # Extended context for semantic enrichment (optional)
+    method: str = ""  # HTTP verb; upper-cased when rendered in the enrichment prompt
+    path: str = ""  # request path; only rendered when ``method`` is also set
+    response_fields: list[str] = field(default_factory=list)  # response field names
 
 
 @dataclass
@@ -31,6 +40,76 @@ class InferredRelation:
     reason: str
 
 
+@dataclass
+class FieldSemantic:
+    """A field annotated with its semantic identifier.
+
+    Used on both produces (what a tool outputs) and consumes (what it
+    requires). ``json_path`` is set on produces; ``field`` is set on consumes.
+
+    ``kind`` (consumes only) distinguishes two roles:
+      - ``"data"``    — true data dependency (e.g. a business identifier
+                        needed to address the operation). PathSynthesizer
+                        will chain to a producer for this field.
+      - ``"context"`` — ambient config (locale, site, pagination). Must be
+                        supplied as an entity or collection default; the
+                        synthesizer will NOT build a prerequisite chain
+                        just to fetch it.
+
+    The default ``"data"`` matches pre-kind behavior (safe for tools whose
+    enrichment predates this schema change).
+    """
+
+    semantic: str  # canonical semantic id, e.g. "product_id"
+    json_path: str = ""  # produces only: location of the value in the response
+    field: str = ""  # consumes only; NOTE: shadows dataclasses.field in this class body
+    kind: str = "data"  # consumes only: "data" or "context"
+
+
+@dataclass
+class PairHint:
+    """A tool that pairs with the current tool in a workflow.
+
+    ``source`` distinguishes ownership so re-running auto enrichment doesn't
+    overwrite operator curation:
+      - ``"auto"``   — produced by Pass 2a (per-tool batch) or Pass 2b
+                       (cross-batch). Replaced on every Pass 2b re-run.
+      - ``"manual"`` — added by an operator through the UI. Never overwritten
+                       by automatic enrichment.
+
+    Default ``"manual"`` is intentional: legacy data without a ``source``
+    field gets the safer label, so a Pass 2b re-run does not silently delete
+    pre-existing entries that may have been hand-curated.
+    """
+
+    tool: str  # name of the paired tool
+    reason: str = ""  # short free-text justification; may be empty
+    source: str = "manual"  # "auto" or "manual"; the default is deliberately "manual"
+
+
+@dataclass
+class ToolEnrichment:
+    """Per-tool semantic annotation produced by ``enrich_tool_semantics``.
+
+    This is the Pass 2 output of the Plan-and-Execute L0 knowledge base.
+    Used downstream by:
+      - Stage 1 target selection (``when_to_use`` in catalog)
+      - Stage 2 path synthesis (``produces_semantics`` / ``consumes_semantics``
+        replace hardcoded synonym tables)
+      - Graph edges (``pairs_well_with`` becomes semantic edges)
+    """
+
+    # canonical_action: search | read | create | update | delete | action
+    canonical_action: str
+    primary_resource: str  # e.g. "product"
+    one_line_summary: str  # short summary; the prompt asks for <=60 chars
+    when_to_use: str  # trigger condition (1-2 sentences requested)
+    when_not_to_use: str = ""  # optional; may be empty
+    produces_semantics: list[FieldSemantic] = field(default_factory=list)  # outputs
+    consumes_semantics: list[FieldSemantic] = field(default_factory=list)  # required inputs
+    pairs_well_with: list[PairHint] = field(default_factory=list)  # workflow neighbours
+
+
 # ---------------------------------------------------------------------------
 # Prompt templates
 # ---------------------------------------------------------------------------
@@ -124,6 +203,103 @@ class InferredRelation:
 [{{"source":"toolA","target":"toolB","relation":"PRECEDES","confidence":0.9,"reason":"..."}}]"""
 
 
+_ENRICH_SEMANTICS_PROMPT = """\
+You are annotating API tools for a plan-and-execute planning system.
+Produce structured metadata that downstream components use to (1) pick the
+right tool for a user's goal, (2) synthesize execution plans, and (3) wire
+one tool's output to another tool's input.
+{reference_block}{vocab_block}
+TOOLS TO ANNOTATE (this batch):
+{batch_detailed}
+
+For each tool in the batch, output a JSON object with these fields:
+  - canonical_action: one of "search" | "read" | "create" | "update" | "delete" | "action"
+  - primary_resource: one lowercase noun (e.g. "product", "order", "user", "shop", "category")
+  - one_line_summary: short natural-language summary (<=60 chars)
+  - when_to_use: 1-2 sentences describing the trigger condition
+  - when_not_to_use: optional 1 sentence (can be empty) — alternative tool cases
+  - produces_semantics: array of {{"semantic": "canonical_id", "json_path": "$.body..."}}
+      * Include only MEANINGFUL fields (IDs, names, key metrics).
+      * Skip pagination, headers, status codes.
+      * Use CONSISTENT semantic ids across tools. If two tools both return a
+        product identifier (one calls it "goodsNo", another "productId"),
+        use the same semantic like "product_id".
+  - consumes_semantics: array of {{"semantic": "canonical_id",
+                                    "field": "paramName",
+                                    "kind": "data" | "context"}}
+      * REQUIRED inputs only. Skip optional filters, pagination.
+      * Same semantic id conventions as produces.
+      * kind="data" — business-data dependency: an identifier or value that
+        addresses a specific record (e.g. product_id, order_id, user_id,
+        search_keyword). A prior step in a plan normally produces it.
+      * kind="context" — ambient/environmental config shared across the
+        workflow (locale, site_no, tenant, pagination cursors, flag switches).
+        The user or the caller supplies it as a default — it is NOT produced
+        by a prior step. Use this for anything a plain UI user would set
+        once per session, not per request.
+  - pairs_well_with: array of {{"tool": "tool_name_from_available_list",
+                                "reason": "brief reason"}}
+      * 2-4 tools that typically precede or follow this tool.
+      * Names MUST match the available list exactly. Do not invent.
+
+OUTPUT FORMAT (strict):
+{{
+  "tool_name_1": {{...fields...}},
+  "tool_name_2": {{...fields...}}
+}}
+
+STRICT RULES:
+  - You MUST produce one entry for EVERY tool in the batch.
+  - Do NOT skip tools with unclear descriptions — make your best guess.
+  - Keep fields concise (short sentences) so all tools fit in the output.
+  - Return JSON only. No markdown fences, no prose, no comments."""
+
+
+# Pass 2b — cross-batch workflow pairing.
+#
+# Per-tool enrichment (Pass 2a) only sees one batch at a time, so it cannot
+# spot pairs whose other half lives in a different batch. This prompt shows
+# the entire collection's 1-line summaries so the LLM can suggest workflow
+# successors that span resources.
+#
+# The output is batched (subset of tools per call) to stay within the
+# response token budget — input stays full, output stays small.
+_PAIRS_PROMPT = """\
+You are reviewing an API tool collection to suggest workflow pairs.
+
+For EACH tool in the OUTPUT BATCH, suggest 2-4 OTHER tools from the FULL
+TOOL LIST that are commonly invoked just before or just after this tool in
+a real-world workflow. Pairs SHOULD cross resource boundaries when there is
+a natural business sequence (e.g. product detail → add to cart → checkout).
+
+Pair quality matters more than quantity — only suggest tools you are
+confident about. If a tool has no good pair candidates, return an empty
+array for it.
+
+FULL TOOL LIST (all available tools — pick pairs only from this list):
+{full_list}
+
+OUTPUT BATCH (suggest pairs ONLY for these tools):
+{batch_list}
+
+OUTPUT FORMAT (strict JSON):
+{{
+  "tool_name_1": [
+    {{"tool": "other_tool_name", "reason": "short reason"}},
+    ...
+  ],
+  "tool_name_2": [...],
+  ...
+}}
+
+STRICT RULES:
+  - You MUST include one entry for EVERY tool in the OUTPUT BATCH (use
+    empty array if no good pairs).
+  - Pair tool names MUST exactly match a name in the FULL TOOL LIST.
+  - Do NOT pair a tool with itself.
+  - Return JSON only. No markdown fences, no prose, no comments."""
+
+
 def _format_tools_list(tools: list[ToolSummary]) -> str:
     lines = []
     for i, t in enumerate(tools, 1):
@@ -132,6 +308,107 @@ def _format_tools_list(tools: list[ToolSummary]) -> str:
     return "\n".join(lines)
 
 
+def _format_tools_brief(tools: list[ToolSummary]) -> str:
+    """Render *tools* as a bare ``- name`` bullet list, one tool per line.
+
+    Descriptions are deliberately omitted: this list is repeated in every
+    batch prompt, so only the names are emitted.
+    """
+    bullets = ["- " + str(tool.name) for tool in tools]
+    return "\n".join(bullets)
+
+
+def _format_tools_for_pairs(tools: list[ToolSummary]) -> str:
+    """Render ``- name: summary`` bullets for the Pass 2b prompts.
+
+    The summary is the tool's ``description`` flattened to one line and
+    clipped to 100 characters (97 + ``...``); tools without one get a bare
+    ``- name`` bullet.
+    """
+
+    def _bullet(tool: ToolSummary) -> str:
+        text = (tool.description or "").strip().replace("\n", " ")
+        clipped = text if len(text) <= 100 else text[:97] + "..."
+        return f"- {tool.name}: {clipped}" if clipped else f"- {tool.name}"
+
+    return "\n".join(_bullet(tool) for tool in tools)
+
+
+def _format_tools_for_enrichment(tools: list[ToolSummary]) -> str:
+    """Render one ``== name ==`` section per tool, blank-line separated."""
+
+    def _section(tool: ToolSummary) -> str:
+        # Optional rows are emitted only when the underlying value is truthy.
+        rows = [f"== {tool.name} =="]
+        if tool.method and tool.path:
+            rows.append(f"HTTP: {tool.method.upper()} {tool.path}")
+        if tool.description:
+            rows.append(f"Description: {tool.description.strip()[:400]}")
+        if tool.parameters:
+            rows.append(f"Request fields: {', '.join(tool.parameters[:25])}")
+        if tool.response_fields:
+            shown = ", ".join(tool.response_fields[:25])
+            rows.append(f"Response fields: {shown}")
+        return "\n".join(rows)
+
+    return "\n\n".join(_section(tool) for tool in tools)
+
+
+def _parse_enrichment(data: Any) -> ToolEnrichment | None:
+    """Convert one tool's LLM JSON object into a ``ToolEnrichment``.
+
+    Absent keys fall back to empty strings / lists. ``None`` is returned when
+    *data* is not a dict or a malformed value raises a handled error.
+    """
+    if not isinstance(data, dict):
+        return None
+
+    def _text(obj: dict, key: str, default: str = "") -> str:
+        return str(obj.get(key, default)).strip()
+
+    try:
+        produces = []
+        for entry in data.get("produces_semantics") or []:
+            if isinstance(entry, dict) and _text(entry, "semantic"):
+                produces.append(
+                    FieldSemantic(
+                        semantic=_text(entry, "semantic"),
+                        json_path=_text(entry, "json_path"),
+                    )
+                )
+        consumes = []
+        for entry in data.get("consumes_semantics") or []:
+            if isinstance(entry, dict) and _text(entry, "semantic"):
+                # Unknown ``kind`` values fall back to the default "data".
+                kind = _text(entry, "kind", "data").lower()
+                consumes.append(
+                    FieldSemantic(
+                        semantic=_text(entry, "semantic"),
+                        field=_text(entry, "field"),
+                        kind=kind if kind in ("data", "context") else "data",
+                    )
+                )
+        # Pairs produced here are batch-scoped, so they carry source="auto":
+        # a Pass 2b run may replace them, unlike source="manual" entries.
+        pairs = [
+            PairHint(tool=_text(raw, "tool"), reason=_text(raw, "reason"), source="auto")
+            for raw in (data.get("pairs_well_with") or [])
+            if isinstance(raw, dict) and _text(raw, "tool")
+        ]
+        return ToolEnrichment(
+            canonical_action=_text(data, "canonical_action").lower(),
+            primary_resource=_text(data, "primary_resource").lower(),
+            one_line_summary=_text(data, "one_line_summary"),
+            when_to_use=_text(data, "when_to_use"),
+            when_not_to_use=_text(data, "when_not_to_use"),
+            produces_semantics=produces,
+            consumes_semantics=consumes,
+            pairs_well_with=pairs,
+        )
+    except (KeyError, TypeError, ValueError, AttributeError):
+        return None
+
+
 def _parse_relation_type(s: str) -> RelationType | None:
     mapping = {
         "REQUIRES": RelationType.REQUIRES,
@@ -424,6 +701,157 @@ def generate_example_queries(
 
         return all_queries
 
+    def enrich_pairs(
+        self,
+        tools: list[ToolSummary],
+        batch_size: int = 30,
+    ) -> dict[str, list[PairHint]]:
+        """Pass 2b — cross-batch workflow pair suggestion.
+
+        Unlike Pass 2a (``enrich_tool_semantics``) which sees only the
+        current batch, this pass shows the LLM the full collection's 1-line
+        summaries so it can suggest pairs that cross resource boundaries
+        (e.g. ``getProductDetail → addToCart`` even when the two tools live
+        in different swagger sources).
+
+        Batching is on the OUTPUT axis only: every call receives the full
+        tool list but is asked for pairs of ``batch_size`` tools, which keeps
+        each response small enough to avoid output-token truncation.
+
+        Tools should arrive with ``description`` set to ai_metadata
+        ``one_line_summary`` when available (Pass 2a output) so pairing can
+        rely on workflow meaning, not just tool names.
+
+        Entries for tools or targets that are not in *tools* are dropped
+        (LLM hallucination guard).
+
+        Returns: {tool_name: [PairHint(source="auto"), ...]}
+        """
+        results: dict[str, list[PairHint]] = {}
+        if not tools:
+            return results
+
+        # Names the LLM may legitimately reference (hallucination guard).
+        known = {t.name for t in tools}
+        full_list = _format_tools_for_pairs(tools)
+
+        for i in range(0, len(tools), batch_size):
+            batch = tools[i : i + batch_size]
+            batch_list = _format_tools_for_pairs(batch)
+            prompt = _PAIRS_PROMPT.format(full_list=full_list, batch_list=batch_list)
+            response = self.generate(prompt)
+
+            try:
+                parsed = _extract_json(response)
+                if not isinstance(parsed, dict):
+                    continue
+                for name, raw_pairs in parsed.items():
+                    if name not in known or not isinstance(raw_pairs, list):
+                        continue
+                    pair_list: list[PairHint] = []
+                    for p in raw_pairs:
+                        if not isinstance(p, dict):
+                            continue
+                        target = str(p.get("tool", "")).strip()
+                        # Drop empty names, self-pairs and unknown targets.
+                        if not target or target == name or target not in known:
+                            continue
+                        reason = str(p.get("reason", "")).strip()
+                        pair_list.append(PairHint(tool=target, reason=reason, source="auto"))
+                    results[str(name)] = pair_list
+            except (json.JSONDecodeError, KeyError, TypeError):
+                continue
+
+        return results
+
+    def enrich_tool_semantics(
+        self,
+        tools: list[ToolSummary],
+        batch_size: int = 10,
+        *,
+        reference_tools: list[ToolSummary] | None = None,
+        existing_vocab: list[str] | None = None,
+        valid_tool_names: set[str] | None = None,
+    ) -> dict[str, ToolEnrichment]:
+        """Per-tool semantic annotation for Plan-and-Execute architecture.
+
+        ``tools`` = the batch(es) to produce detailed enrichment for.
+
+        ``reference_tools`` (optional, default ``None``) — when supplied,
+        rendered as a name-only tool list in the prompt so the LLM can pick
+        ``pairs_well_with`` from valid names. **Streaming callers should
+        usually pass ``None``** — Pass 2b handles pairs in a separate
+        cross-batch call, and skipping the reference block saves ~50%
+        prompt tokens. The pair list emitted in this pass is post-validated
+        against ``valid_tool_names`` instead.
+
+        ``existing_vocab`` (optional) — accumulated semantic ids decided in
+        previous batches of the same enrichment run. The LLM is asked to
+        reuse these labels when applicable, which keeps cross-batch vocab
+        consistent (avoids ``product_id`` vs ``productId`` divergence).
+        Streaming callers should pass the unique semantics seen so far.
+
+        ``valid_tool_names`` (optional) — full set of tool names in the
+        collection. When supplied, ``pairs_well_with`` entries that point
+        outside this set, or at the tool itself, are dropped silently
+        (LLM hallucination guard for batches run without a reference list).
+
+        Returns ``{tool_name: ToolEnrichment}``; unparseable batches are skipped.
+        """
+        results: dict[str, ToolEnrichment] = {}
+        if not tools:
+            return results
+
+        ref_block = ""
+        if reference_tools:
+            ref_block = (
+                "\nAVAILABLE TOOLS IN THE COLLECTION (names only, for "
+                "pairs_well_with reference):\n"
+                + _format_tools_brief(reference_tools)
+                + "\n"
+            )
+
+        vocab_block = ""
+        if existing_vocab:
+            vocab_block = (
+                "\nEXISTING SEMANTIC VOCABULARY (reuse these canonical ids "
+                "when the field has the same meaning — keeps cross-batch "
+                "labels consistent):\n"
+                + "\n".join(f"- {s}" for s in sorted(set(existing_vocab)))
+                + "\n"
+            )
+
+        for i in range(0, len(tools), batch_size):
+            batch = tools[i : i + batch_size]
+            prompt = _ENRICH_SEMANTICS_PROMPT.format(
+                reference_block=ref_block,
+                vocab_block=vocab_block,
+                batch_detailed=_format_tools_for_enrichment(batch),
+            )
+            response = self.generate(prompt)
+
+            try:
+                parsed = _extract_json(response)
+                if not isinstance(parsed, dict):
+                    continue
+                for name, data in parsed.items():
+                    enrichment = _parse_enrichment(data)
+                    if enrichment is None or not enrichment.canonical_action:
+                        continue
+                    # Hallucination guard for pairs_well_with — drop entries
+                    # whose target name is not in the catalog.
+                    if valid_tool_names is not None:
+                        enrichment.pairs_well_with = [
+                            p
+                            for p in enrichment.pairs_well_with
+                            if p.tool in valid_tool_names and p.tool != str(name)
+                        ]
+                    results[str(name)] = enrichment
+            except (json.JSONDecodeError, KeyError, TypeError):
+                continue
+
+        return results
+
 
 # ---------------------------------------------------------------------------
 # Ollama Provider
@@ -476,18 +904,25 @@ def __init__(
         model: str = "gpt-4o-mini",
         base_url: str = "https://api.openai.com/v1",
         api_key: str = "",
+        max_tokens: int = 8192,
+        timeout: int = 300,
     ) -> None:
         self.model = model
         self.base_url = base_url.rstrip("/")
         self.api_key = api_key
+        self.max_tokens = max_tokens
+        self.timeout = timeout
 
     def generate(self, prompt: str) -> str:
         url = f"{self.base_url}/chat/completions"
+        # Without an explicit max_tokens the provider default (4096 for some
+        # models) cuts the batch-enrichment JSON off midway → tools go missing.
         payload = json.dumps(
             {
                 "model": self.model,
                 "messages": [{"role": "user", "content": prompt}],
                 "temperature": 0.1,
+                "max_tokens": self.max_tokens,
             }
         ).encode()
 
@@ -496,7 +931,7 @@ def generate(self, prompt: str) -> str:
             headers["Authorization"] = f"Bearer {self.api_key}"
 
         req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
-        with urllib.request.urlopen(req, timeout=120) as resp:  # noqa: S310
+        with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # noqa: S310
             result = json.loads(resp.read().decode())
             choices = result.get("choices", [])
             if choices:
diff --git a/graph_tool_call/ontology/schema.py b/graph_tool_call/ontology/schema.py
index 04086fb..2a67290 100644
--- a/graph_tool_call/ontology/schema.py
+++ b/graph_tool_call/ontology/schema.py
@@ -24,6 +24,26 @@ class NodeType(str, Enum):
     DOMAIN = "domain"
 
 
+class Confidence(str, Enum):
+    """Edge confidence label, graphify-style.
+
+    Every edge in a graphify-style ToolGraph carries one of three labels so
+    downstream consumers (LLM agents, retrieval scoring, UI) can distinguish
+    deterministic facts from heuristic guesses.
+
+    EXTRACTED  — derived deterministically from the spec (path hierarchy,
+                 shared $ref, CRUD pattern). conf_score >= 0.85 AND layer == 1.
+    INFERRED   — heuristic match (name-based, RPC pattern, cross-resource).
+                 conf_score >= 0.85 but not strictly structural.
+    AMBIGUOUS  — low-confidence heuristic (0.70 <= conf_score < 0.85).
+                 Surface in UI for review; retrieval applies a score penalty.
+    """
+
+    EXTRACTED = "EXTRACTED"
+    INFERRED = "INFERRED"
+    AMBIGUOUS = "AMBIGUOUS"
+
+
 # Weights for relation types during retrieval scoring
 DEFAULT_RELATION_WEIGHTS: dict[str, float] = {
     RelationType.SIMILAR_TO: 0.8,
diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py
new file mode 100644
index 0000000..dbab1f3
--- /dev/null
+++ b/graph_tool_call/plan/__init__.py
@@ -0,0 +1,95 @@
+"""Plan-and-Execute primitives: schemas, binding resolver, runner.
+
+The ``plan`` package is deliberately transport-agnostic. It knows nothing
+about HTTP, authentication, or xgen internals — it only defines how a
+Plan looks, how string bindings are resolved against step outputs, and how
+to drive execution via an injected callable.
+
+Typical use (from an integration layer like xgen-workflow):
+
+    from graph_tool_call.plan import Plan, PlanStep, PlanRunner
+
+    plan = Plan(id="...", goal="...", steps=[PlanStep(...), ...])
+
+    def call_tool(tool_name, args):
+        return my_http_executor.execute(tool_name, args)
+
+    runner = PlanRunner(call_tool)
+    for event in runner.run(plan):
+        # event: StepStarted | StepCompleted | StepFailed | PlanCompleted
+        ...
+"""
+
+from graph_tool_call.plan.binding import (
+    BindingError,
+    resolve_bindings,
+)
+from graph_tool_call.plan.intent import (
+    IntentParseError,
+    ParsedIntent,
+    ToolCatalogEntry,
+    parse_intent,
+)
+from graph_tool_call.plan.response import (
+    synthesize_failure_response,
+    synthesize_success_response,
+)
+from graph_tool_call.plan.runner import (
+    PlanAborted,
+    PlanCompleted,
+    PlanEvent,
+    PlanRunner,
+    PlanStarted,
+    StepCompleted,
+    StepFailed,
+    StepStarted,
+)
+from graph_tool_call.plan.schema import (
+    ExecutionTrace,
+    Plan,
+    PlanStep,
+    StepTrace,
+)
+from graph_tool_call.plan.synthesizer import (
+    CyclicDependencyError,
+    DynamicOptionRequired,
+    MaxDepthExceededError,
+    PathSynthesizer,
+    PlanSynthesisError,
+    UnsatisfiableFieldError,
+)
+
+__all__ = [
+    # schema
+    "Plan",
+    "PlanStep",
+    "ExecutionTrace",
+    "StepTrace",
+    # binding
+    "BindingError",
+    "resolve_bindings",
+    # runner + events
+    "PlanRunner",
+    "PlanEvent",
+    "PlanStarted",
+    "StepStarted",
+    "StepCompleted",
+    "StepFailed",
+    "PlanCompleted",
+    "PlanAborted",
+    # synthesizer
+    "PathSynthesizer",
+    "PlanSynthesisError",
+    "UnsatisfiableFieldError",
+    "CyclicDependencyError",
+    "MaxDepthExceededError",
+    "DynamicOptionRequired",
+    # intent
+    "ToolCatalogEntry",
+    "ParsedIntent",
+    "IntentParseError",
+    "parse_intent",
+    # response
+    "synthesize_success_response",
+    "synthesize_failure_response",
+]
diff --git a/graph_tool_call/plan/binding.py b/graph_tool_call/plan/binding.py
new file mode 100644
index 0000000..2ae6a50
--- /dev/null
+++ b/graph_tool_call/plan/binding.py
@@ -0,0 +1,161 @@
+"""Binding resolver for Plan args.
+
+Substitutes ``${source.dotted.path}`` placeholders in step arguments with
+actual values drawn from the runtime context. The context is a dict mapping
+source names (``"s1"``, ``"s2"``, ``"input"``, ...) to arbitrary JSON-like
+objects.
+
+v1 path syntax (kept deliberately small):
+
+  - dotted keys          : ``s1.body.goods`` → ``ctx["s1"]["body"]["goods"]``
+  - array index          : ``s1.body.goods[0].goodsNo``
+  - whole-source         : ``s1`` → entire result dict of step s1
+  - input alias          : ``input.keyword`` — caller injects a special
+                           ``"input"`` entry at runtime for user-provided
+                           entities extracted by Stage 1.
+
+Explicitly NOT supported in v1:
+
+  - wildcard ``[*]`` (fan-out) — see §11.1 of the design doc
+  - filter expressions (JSONPath ``[?(...)]``)
+  - functions / casts (``int(...)``, ``default(...)``)
+
+Behavior rules:
+
+  1. If a string argument is **entirely** one binding (``"${s1.id}"``) the
+     resolved value keeps its native type (int, dict, list, ...). This is
+     important so integer IDs aren't accidentally stringified.
+  2. If a string contains bindings mixed with literal text
+     (``"prefix-${s1.id}"``) each binding is ``str()``-cast during
+     interpolation. The result is always a string.
+  3. Unresolved bindings raise ``BindingError`` — callers should treat
+     this as a plan validation failure, not a tool execution error.
+  4. ``dict`` and ``list`` values are walked recursively.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+
+class BindingError(ValueError):
+    """Raised when a ``${...}`` expression cannot be resolved."""
+
+
+# Matches one ``${...}`` placeholder. Accepts empty body so ``${}`` triggers
+# a clear BindingError downstream instead of passing through as a literal.
+# ``$``, ``{`` and ``}`` inside a binding are not supported in v1.
+_BINDING_RE = re.compile(r"\$\{([^${}]*)\}")
+
+
+def resolve_bindings(value: Any, context: dict[str, Any]) -> Any:
+    """Recursively resolve bindings in *value* against *context*.
+
+    Dicts and lists are rebuilt with resolved members (dict keys are not
+    resolved); strings are interpolated. Other types pass through unchanged.
+    """
+    if isinstance(value, dict):
+        return {k: resolve_bindings(v, context) for k, v in value.items()}
+    if isinstance(value, list):
+        return [resolve_bindings(v, context) for v in value]
+    if isinstance(value, str):
+        return _resolve_string(value, context)
+    return value  # tuples, numbers, None, ... are returned untouched
+
+
+def _resolve_string(s: str, context: dict[str, Any]) -> Any:
+    """Resolve a string value.
+
+    If the stripped string is exactly one binding (``${path}``), returns the
+    native value. Otherwise each match becomes ``str(value)`` (``""`` for None).
+    """
+    # Whole-string binding → native type (surrounding whitespace is ignored)
+    m = _BINDING_RE.fullmatch(s.strip())
+    if m:
+        return _lookup(m.group(1).strip(), context)
+
+    # Mixed / multi-binding → string interpolation
+    def _sub(match: re.Match[str]) -> str:
+        val = _lookup(match.group(1).strip(), context)
+        return "" if val is None else str(val)  # None renders as "", not "None"
+
+    return _BINDING_RE.sub(_sub, s)  # a string without bindings comes back as-is
+
+
+def _lookup(expr: str, context: dict[str, Any]) -> Any:
+    """Walk a dotted path with optional ``[N]`` indices; raise BindingError on any miss."""
+    tokens = _tokenize(expr)
+    if not tokens:
+        raise BindingError(f"empty binding expression: {expr!r}")
+
+    head = tokens[0]  # first token names the context source
+    if head not in context:
+        raise BindingError(
+            f"unknown source {head!r} in binding ${{...}}: context has {sorted(context)!r}"
+        )
+    node: Any = context[head]
+
+    for tok in tokens[1:]:
+        if tok.startswith("[") and tok.endswith("]"):
+            # array index — negative values count from the end
+            try:
+                idx = int(tok[1:-1])
+            except ValueError as exc:
+                raise BindingError(f"non-numeric array index {tok!r} in binding {expr!r}") from exc
+            if not isinstance(node, (list, tuple)):
+                raise BindingError(
+                    f"indexing {tok} on non-list type {type(node).__name__} (expr={expr!r})"
+                )
+            try:
+                node = node[idx]
+            except IndexError as exc:
+                raise BindingError(f"index {idx} out of range in binding {expr!r}") from exc
+        else:
+            if not isinstance(node, dict):
+                raise BindingError(
+                    f"cannot descend into .{tok} on non-dict type {type(node).__name__} "
+                    f"(expr={expr!r})"
+                )
+            if tok not in node:  # keys are str()-ed below so sorted() never raises TypeError
+                raise BindingError(
+                    f"key {tok!r} not found in binding {expr!r} "
+                    f"(available: {sorted(map(str, node))[:8]!r}...)"
+                )
+            node = node[tok]
+
+    return node
+
+
+def _tokenize(expr: str) -> list[str]:
+    """Tokenize a dotted path with ``[N]`` indices.
+
+    ``s1.body.goods[0].goodsNo`` → ``["s1", "body", "goods", "[0]", "goodsNo"]``
+    """
+    tokens: list[str] = []
+    buf = []  # characters of the key currently being read
+    i = 0
+    while i < len(expr):
+        ch = expr[i]
+        if ch == ".":
+            if buf:  # empty segments ("a..b", a leading ".") are skipped silently
+                tokens.append("".join(buf))
+                buf = []
+        elif ch == "[":
+            if buf:  # flush the key that precedes the index
+                tokens.append("".join(buf))
+                buf = []
+            end = expr.find("]", i)
+            if end == -1:
+                raise BindingError(f"unclosed '[' in binding {expr!r}")
+            tokens.append(expr[i : end + 1])  # index token keeps its brackets: "[0]"
+            i = end  # the shared i += 1 below then steps past "]"
+        else:
+            buf.append(ch)
+        i += 1
+    if buf:  # trailing key after the last separator
+        tokens.append("".join(buf))
+    return tokens
+
+
+__all__ = ["BindingError", "resolve_bindings"]
diff --git a/graph_tool_call/plan/intent.py b/graph_tool_call/plan/intent.py
new file mode 100644
index 0000000..c62d396
--- /dev/null
+++ b/graph_tool_call/plan/intent.py
@@ -0,0 +1,361 @@
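+"""Stage 1 — Intent Parser.
+
+Converts a natural-language requirement into the structured ``{target, entities}``
+form that Stage 2 (PathSynthesizer) consumes. One LLM call, small context.
+
+Catalog construction principles (design §4):
+  - only the top-K tools pre-selected by retrieval are passed in (not the full catalog)
+  - each tool is name + one_line_summary + when_to_use + key semantic tags
+  - when Pass 2 enrichment has filled ai_metadata, that information is preferred;
+    otherwise fall back to a shortened description
+
+The LLM returns structured JSON only — a parse failure is surfaced clearly to the
+caller as an exception, in the same manner as BindingError.
+"""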
+
+from __future__ import annotations
+
+import difflib
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+from graph_tool_call.ontology.llm_provider import OntologyLLM, _extract_json
+
+# Minimum SequenceMatcher ratio for treating an LLM-emitted entity key as
+# a typo/expansion of a real vocab entry. 0.8 catches "search_keyword_name"
+# vs "search_keyword" (~0.85) while rejecting unrelated pairs like
+# "search_keyword" vs "search_query" (~0.54).
+_VOCAB_FUZZY_CUTOFF = 0.8
+
+
+# ---------------------------------------------------------------------------
+# data shape
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ToolCatalogEntry:
+    """Condensed tool view for intent-parsing prompt — under ~150 chars each."""
+
+    name: str
+    summary: str = ""  # one_line_summary from ai_metadata
+    when_to_use: str = ""  # ai_metadata.when_to_use
+    consumes_tags: list[str] = field(default_factory=list)  # required semantic ids
+    canonical_action: str = ""  # "read" | "search" | "create" | ...
+    primary_resource: str = ""  # "product" | ...
+
+
+@dataclass
+class ParsedIntent:
+    """Stage 1 output — consumed by Stage 2 PathSynthesizer."""
+
+    target: str  # tool name picked by LLM
+    entities: dict[str, Any] = field(default_factory=dict)
+    confidence: float = 0.0  # 0.0 ~ 1.0
+    output_shape: str = "single"  # "single" | "list" | "count"
+    reasoning: str = ""
+
+
+class IntentParseError(Exception):
+    """Raised when the LLM output can't be mapped to a valid ParsedIntent."""
+
+
+# ---------------------------------------------------------------------------
+# prompt
+# ---------------------------------------------------------------------------
+
+
+_INTENT_PROMPT = """\
+You pick the right API tool and extract entity values for a planning system.
+
+User requirement:
+{requirement}
+
+Candidate tools (shortlisted by retrieval — includes the target's
+prerequisite producers so every key you need should appear in some
+tool's "needs:" line below):
+{catalog}
+{vocabulary_block}{enum_block}{seed_block}
+HARD CONSTRAINTS — violating any of these is a planning error, not a
+stylistic choice. Re-check the constraints before you emit JSON.
+
+  HC1. DO NOT put a value into an identifier-style field (a field name
+       ending in "No" / "Id" / "Idx" / "Code" / "id") if the value
+       contains spaces, Korean/Chinese/Japanese letters, or category
+       words ("티셔츠", "신발", "shoes", a brand or model name).
+       Identifier fields accept short alphanumeric record locators
+       only ("G12345", "10293"). A descriptive phrase placed in such
+       a field is always wrong.
+  HC2. DO NOT invent field names. Every entity key MUST appear in one
+       of the candidate tools' "needs:" lines. If no listed field can
+       carry the user's value without violating HC1, omit the entity —
+       empty entities are fine; the downstream synthesizer chains
+       through a producer.
+  HC3. DO NOT put the same value into more than one field. Each value
+       goes into zero or exactly one field.
+  HC4. DO NOT translate, normalize, paraphrase, or expand the user's
+       value. Copy it byte-for-byte as written in the requirement.
+  HC5. For fields that have an enum mapping below, the entity value
+       MUST be one of the listed CODES (left side), never the label
+       (right side) and never the user's original phrase. Pick the
+       code whose label best matches the user's intent. If nothing
+       matches clearly, omit that entity.
+
+Selection guidance (apply only after the constraints hold):
+  - Pick exactly ONE tool — the final-goal tool. Do not plan the chain;
+    the downstream system builds prerequisite steps automatically.
+  - Free-text values (descriptive phrases like "quarzen 티셔츠",
+    "black hoodie") match fields named "searchWord", "query",
+    "keyword", or names ending in "Nm" / "Name".
+  - When several fields could carry the value without violating HC1,
+    prefer one a candidate's "needs:" line lists — that is a field a
+    tool you already considered actually accepts.
+  - output_shape: "single" / "list" / "count".
+  - confidence: 0.0~1.0 — your certainty in the tool pick.
+  - reasoning: one short sentence for audit logs.
+
+Output JSON only — no markdown, no prose. Schema:
+{{
+  "target": "<tool_name>",
+  "entities": {{...}},
+  "confidence": 0.0,
+  "output_shape": "single" | "list" | "count",
+  "reasoning": "..."
+}}
+"""
+
+
+def _coerce_entity_keys(
+    entities: dict[str, Any],
+    vocab: list[str],
+) -> dict[str, Any]:
+    """Map LLM-emitted entity keys onto the vocabulary.
+
+    Exact match → kept. Close match above ``_VOCAB_FUZZY_CUTOFF`` → coerced
+    to the canonical vocab entry. Otherwise the entry is dropped — silently
+    passing an invented key downstream causes producer-chain failures or
+    cycle detection (the vocab miss is the failure, not the symptom).
+    """
+    vocab_set = set(vocab)
+    out: dict[str, Any] = {}
+    for key, value in entities.items():
+        key_str = str(key)
+        if key_str in vocab_set:
+            out[key_str] = value
+            continue
+        match = difflib.get_close_matches(
+            key_str,
+            vocab,
+            n=1,
+            cutoff=_VOCAB_FUZZY_CUTOFF,
+        )
+        if match:
+            # If multiple LLM keys collapse onto the same vocab entry, the
+            # later one wins. Acceptable: same canonical key with two
+            # values is already a degenerate LLM output.
+            out[match[0]] = value
+    return out
+
+
+def _format_seed_block(seed_entities: dict[str, Any] | None) -> str:
+    """Render a 'carry forward' section for entities the caller already
+    decided in a previous turn.
+
+    Multi-turn flow: when a previous synthesize attempt asked the user to
+    pick a value (e.g. via a popup of enum options), the chosen pairs are
+    fed back as ``seed_entities``. The LLM should keep them as-is unless
+    the new requirement explicitly contradicts a value, and only EXTRACT
+    NEW entities to add. Empty / None ⇒ section omitted.
+    """
+    if not seed_entities:
+        return ""
+    lines = "\n".join(
+        f"  - {k}: {json.dumps(v, ensure_ascii=False)}" for k, v in seed_entities.items()
+    )
+    return (
+        "\n\nExisting entities (carried over from prior turns — keep these "
+        "values exactly unless the user's new requirement explicitly "
+        "overrides one. You only need to extract additional entities that "
+        "the new requirement introduces):\n"
+        f"{lines}"
+    )
+
+
+def _format_enum_block(enum_mappings: dict[str, dict[str, str]] | None) -> str:
+    """Render the optional enum-mapping section of the prompt.
+
+    ``enum_mappings`` shape: ``{field_name: {code: label}}`` — operator-
+    registered code lookups for backend enum fields whose values aren't
+    in the swagger schema (e.g. "10" -> "비회원" for a basket type code).
+    The LLM picks the code whose label matches the user's natural-language
+    intent. Empty / None ⇒ section omitted entirely.
+    """
+    if not enum_mappings:
+        return ""
+    lines: list[str] = []
+    for field_name, codes in enum_mappings.items():
+        if not isinstance(codes, dict) or not codes:
+            continue
+        lines.append(f"  - {field_name}:")
+        for code, label in codes.items():
+            lines.append(f'      "{code}" → {label}')
+    if not lines:
+        return ""
+    body = "\n".join(lines)
+    return (
+        "\n\nEnum code mappings (operator-registered — when one of these "
+        "fields needs a value, pick the CODE whose label matches the "
+        "user's intent):\n"
+        f"{body}"
+    )
+
+
+def _format_vocabulary_block(tags: list[str]) -> str:
+    """Render the optional vocabulary section of the prompt.
+
+    Returns an empty string when no vocab is provided so the prompt
+    stays focused on ``catalog``. Callers that want LLM access to
+    field names beyond the catalog (e.g. when retrieval failed to pull
+    in producers) can pass a non-empty list.
+    """
+    if not tags:
+        return ""
+    lines = "\n".join(f"  - {t}" for t in tags)
+    return (
+        "\n\nAvailable entity field names — backup vocabulary used only when "
+        "no candidate tool's \"needs:\" line carries the user's value:\n"
+        f"{lines}"
+    )
+
+
+def _format_catalog(entries: list[ToolCatalogEntry]) -> str:
+    lines: list[str] = []  # rendered prompt lines; each tool gets a numbered header line
+    for i, e in enumerate(entries, start=1):
+        parts = [f"{i}. {e.name}"]
+        if e.canonical_action or e.primary_resource:  # NOTE(review): strip() also drops the [ ]
+            parts.append(f"[{e.canonical_action}/{e.primary_resource}]".strip("[/]"))
+        if e.summary:
+            parts.append(f"— {e.summary}")
+        lines.append(" ".join(p for p in parts if p))
+        if e.when_to_use:
+            lines.append(f"   when: {e.when_to_use[:140]}")  # capped at 140 chars
+        if e.consumes_tags:
+            lines.append(f"   needs: {', '.join(e.consumes_tags[:6])}")  # first 6 tags only
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# public API
+# ---------------------------------------------------------------------------
+
+
+def parse_intent(
+    requirement: str,
+    catalog: list[ToolCatalogEntry],
+    llm: OntologyLLM,
+    *,
+    vocabulary: list[str] | None = None,
+    enum_mappings: dict[str, dict[str, str]] | None = None,
+    seed_entities: dict[str, Any] | None = None,
+) -> ParsedIntent:
+    """Call the LLM once to produce a ParsedIntent.
+
+    ``catalog`` is the retrieval-shortlisted candidate tools (keep it small,
+    ~10 entries, to control prompt size). ``vocabulary`` is the full set of
+    ``kind=data`` semantic ids in the graph; when omitted it is derived from
+    the catalog's ``consumes_tags``. ``enum_mappings`` is operator-registered
+    ``{field_name: {code: label}}`` lookups for backend enum fields whose
+    values aren't in the swagger schema (callers should pre-filter to the
+    catalog's consumes fields). ``seed_entities`` carries entities decided in
+    earlier turns; this turn's entities are merged over them, so a re-emitted
+    key overrides the seed. ``llm`` is any OntologyLLM-compatible provider.
+
+    Raises ``IntentParseError`` for an empty catalog, unparseable or non-object
+    LLM output, a missing target, or a target that is not in the catalog.
+    """
+    if not catalog:
+        raise IntentParseError("empty catalog — cannot pick a target")
+
+    vocab = vocabulary or []
+    if not vocab:
+        # Fallback: derive from catalog. Same-domain narrowing only —
+        # callers that supply the full graph vocab get better accuracy.
+        seen: set[str] = set()
+        for e in catalog:
+            for tag in e.consumes_tags:
+                if tag and tag not in seen:
+                    seen.add(tag)
+                    vocab.append(tag)
+
+    prompt = _INTENT_PROMPT.format(
+        requirement=requirement.strip(),
+        catalog=_format_catalog(catalog),
+        vocabulary_block=_format_vocabulary_block(vocab),
+        enum_block=_format_enum_block(enum_mappings),
+        seed_block=_format_seed_block(seed_entities),
+    )
+    raw = llm.generate(prompt)
+
+    try:
+        parsed = _extract_json(raw)
+    except json.JSONDecodeError as exc:
+        raise IntentParseError(f"LLM output not parseable JSON: {exc}") from exc
+
+    if not isinstance(parsed, dict):
+        raise IntentParseError(f"expected JSON object, got {type(parsed).__name__}")
+
+    target = str(parsed.get("target") or "").strip()
+    if not target:
+        raise IntentParseError("target missing from LLM output")
+
+    # Validate target is in the catalog — guard against hallucinated names
+    allowed = {e.name for e in catalog}
+    if target not in allowed:
+        raise IntentParseError(
+            f"target {target!r} not in catalog (candidates: {sorted(allowed)[:5]!r}...)"
+        )
+
+    entities_raw = parsed.get("entities")
+    entities = entities_raw if isinstance(entities_raw, dict) else {}
+
+    # Validate entity keys against the vocabulary. The LLM regularly emits
+    # a slightly-elaborated key ("search_keyword_name" instead of
+    # "search_keyword") that nothing downstream can match — coerce the
+    # close ones, drop the rest. A wrong key triggers worse downstream
+    # behavior than no key.
+    if vocab and entities:
+        entities = _coerce_entity_keys(entities, vocab)
+
+    # Multi-turn safety net: even if the LLM ignored the carry-forward
+    # instructions, prior-turn entities must persist. New entities from
+    # this turn override on conflict (later turn wins for explicit
+    # contradictions in the requirement).
+    if seed_entities:
+        entities = {**seed_entities, **entities}
+
+    try:
+        confidence = float(parsed.get("confidence") or 0.0)
+    except (TypeError, ValueError):
+        confidence = 0.0
+    confidence = min(1.0, max(0.0, confidence))  # max() first: NaN collapses to 0.0, not 1.0
+
+    shape = str(parsed.get("output_shape") or "single").strip().lower()
+    if shape not in ("single", "list", "count"):
+        shape = "single"
+
+    return ParsedIntent(
+        target=target,
+        entities=entities,
+        confidence=confidence,
+        output_shape=shape,
+        reasoning=str(parsed.get("reasoning") or "").strip(),
+    )
+
+
+__all__ = [
+    "ToolCatalogEntry",
+    "ParsedIntent",
+    "IntentParseError",
+    "parse_intent",
+]
diff --git a/graph_tool_call/plan/response.py b/graph_tool_call/plan/response.py
new file mode 100644
index 0000000..4eefdfc
--- /dev/null
+++ b/graph_tool_call/plan/response.py
@@ -0,0 +1,136 @@
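+"""Stage 4 — Response Synthesizer.
+
+Turns an ExecutionTrace into a user-friendly natural-language response. One LLM
+call; the context is a summary of the execution result + the original requirement.
+
+Handles both the success and the failure case:
+  - success: plan.output (final step body) + requirement → answer
+  - failure: failed_step + error + partial results → what worked and what did not
+
+The execution result can be a large JSON, so callers are advised to project /
+compress it before passing it in (this module only JSON-serializes and truncates it).
+"""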
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from graph_tool_call.ontology.llm_provider import OntologyLLM
+
+# ---------------------------------------------------------------------------
+# prompts
+# ---------------------------------------------------------------------------
+
+
+_SUCCESS_PROMPT = """\
+You turn API execution results into a natural answer for the user.
+
+User asked:
+{requirement}
+
+Execution result (from the last step):
+{result}
+
+Respond in Korean unless the user's question is clearly in another language.
+Keep it concise — 1~3 sentences for simple answers, short bullet list for
+multi-item results. Do not invent data not present in the result.
+
+CRITICAL — count/total claims:
+- The result above may be **truncated** for length. The list you see is NOT
+  necessarily the complete list.
+- If the result contains an explicit total field (e.g. ``totalCount``,
+  ``totalElements``, ``total``, ``count``, ``size`` at top-level or inside
+  ``payload`` / ``data``), USE THAT NUMBER as the actual count and say
+  "총 N개 중 일부" or similar.
+- If no total field exists, do NOT claim a specific count. Avoid phrases like
+  "현재 1개 등록되어 있습니다" — instead say "조회된 리뷰" or
+  "응답에 포함된 항목". Counting visible list items as the absolute total
+  is forbidden.
+"""
+
+
+_FAILURE_PROMPT = """\
+You explain an API execution failure to the user.
+
+User asked:
+{requirement}
+
+Plan aborted at step {failed_step!r}.
+Error: {error}
+
+Partial results collected before the failure:
+{partial}
+
+Tell the user clearly in Korean (unless the question is another language):
+  - what they asked for
+  - what was attempted
+  - where and why it failed (in plain language — do not dump stack traces)
+  - what they can try next, if obvious
+Keep it short and helpful — 2~4 sentences.
+"""
+
+
+# ---------------------------------------------------------------------------
+# public API
+# ---------------------------------------------------------------------------
+
+
+def synthesize_success_response(
+    *,
+    requirement: str,
+    result: Any,
+    llm: OntologyLLM,
+    result_char_limit: int = 4000,
+) -> str:
+    """Success case — plan completed, convert output to NL answer."""
+    prompt = _SUCCESS_PROMPT.format(
+        requirement=requirement.strip(),
+        result=_render(result, result_char_limit),
+    )
+    return llm.generate(prompt).strip()
+
+
+def synthesize_failure_response(
+    *,
+    requirement: str,
+    failed_step: str,
+    error: Any,
+    partial_results: Any = None,  # any falsy value is rendered as "(none)"
+    llm: OntologyLLM,
+    partial_char_limit: int = 1000,
+) -> str:
+    """Failure case — build the failure prompt and return the LLM's stripped explanation."""
+    prompt = _FAILURE_PROMPT.format(
+        requirement=requirement.strip(),
+        failed_step=failed_step,  # the template renders this with !r
+        error=_render(error, 300),  # error text is hard-capped at 300 chars
+        partial=_render(partial_results, partial_char_limit) if partial_results else "(none)",
+    )
+    return llm.generate(prompt).strip()
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _render(value: Any, char_limit: int) -> str:
+    """Serialize *value* for prompt use, cut to *char_limit* chars plus "…" when truncated."""
+    if value is None:
+        return "(none)"
+    if isinstance(value, str):  # strings are not JSON-quoted, only truncated
+        return value[:char_limit] + ("…" if len(value) > char_limit else "")
+    try:
+        text = json.dumps(value, ensure_ascii=False, indent=2)
+    except (TypeError, ValueError):  # json.dumps could not serialize it → fall back to str()
+        text = str(value)
+    if len(text) <= char_limit:
+        return text
+    return text[:char_limit] + "…"
+
+
+__all__ = [
+    "synthesize_success_response",
+    "synthesize_failure_response",
+]
diff --git a/graph_tool_call/plan/runner.py b/graph_tool_call/plan/runner.py
new file mode 100644
index 0000000..73038de
--- /dev/null
+++ b/graph_tool_call/plan/runner.py
@@ -0,0 +1,414 @@
+"""PlanRunner — deterministic executor for Plan artifacts.
+
+The runner is transport-agnostic: it takes a ``call_tool`` callable that
+actually performs each step. This decouples ``graph_tool_call`` (pure
+plan/graph logic) from integration concerns (HTTP, auth, retries —
+handled by the caller's adapter).
+
+The runner emits structured events as it progresses — callers can relay
+these over SSE, logs, or progress UIs.
+
+v1 scope reminder: **linear execution, no fan-out, no conditionals, no
+automatic re-planning**. Failures abort the run and return a trace.
+"""
+
+from __future__ import annotations
+
+import time
+from collections.abc import Callable, Iterator
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+from graph_tool_call.plan.binding import BindingError, resolve_bindings
+from graph_tool_call.plan.schema import (
+    ExecutionTrace,
+    Plan,
+    StepTrace,
+)
+
+# ---------------------------------------------------------------------------
+# Event types — structured so callers can pattern-match by ``type`` field
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class PlanStarted:  # first event yielded by PlanRunner.run_stream
+    type: str = "plan.started"  # event discriminator
+    plan_id: str = ""  # ``Plan.id`` of the plan being run
+    goal: str = ""  # ``Plan.goal``
+    step_count: int = 0  # ``len(plan.steps)``
+
+
+@dataclass
+class StepStarted:  # yielded after a step's bindings resolve, before its tool is called
+    type: str = "step.started"  # event discriminator
+    step_id: str = ""
+    tool: str = ""
+    args_resolved: dict[str, Any] = field(default_factory=dict)  # args after binding resolution
+    index: int = 0  # 1-based position of the step in the plan
+    total: int = 0  # ``len(plan.steps)``
+
+
+@dataclass
+class StepCompleted:  # yielded after a step's tool call returned
+    type: str = "step.completed"  # event discriminator
+    step_id: str = ""
+    tool: str = ""
+    duration_ms: int = 0  # binding resolution + tool call, in whole milliseconds
+    output_preview: Any = None  # truncated output for UI
+    output_size: int = 0  # size of the JSON-serialized output (0 if unserializable)
+
+
+@dataclass
+class StepFailed:  # yielded when binding resolution or the tool call raised
+    type: str = "step.failed"  # event discriminator
+    step_id: str = ""
+    tool: str = ""
+    error: dict[str, Any] = field(default_factory=dict)  # has "kind" and "message" keys
+    duration_ms: int = 0
+
+
+@dataclass
+class PlanCompleted:  # terminal event of a successful run
+    type: str = "plan.completed"  # event discriminator
+    plan_id: str = ""
+    output: Any = None  # resolved ``output_binding``, else the last step's output
+    total_duration_ms: int = 0
+    # Accumulated step traces — non-streaming ``run()`` copies them into ExecutionTrace.steps.
+    trace_steps: list[StepTrace] = field(default_factory=list)
+
+
+@dataclass
+class PlanAborted:  # terminal event of a failed run
+    type: str = "plan.aborted"  # event discriminator
+    plan_id: str = ""
+    failed_step: str = ""  # step id, or "<output_binding>" if the final binding failed
+    error: dict[str, Any] = field(default_factory=dict)  # has "kind" and "message" keys
+    total_duration_ms: int = 0
+    trace_steps: list[StepTrace] = field(default_factory=list)  # traces recorded before abort
+
+
+PlanEvent = PlanStarted | StepStarted | StepCompleted | StepFailed | PlanCompleted | PlanAborted
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+
+# ToolCaller signature: (tool_name, resolved_args) -> output_dict
+ToolCaller = Callable[[str, dict[str, Any]], Any]
+
+
+class PlanRunner:
+    """Execute a Plan step-by-step using a caller-provided tool invoker.
+
+    Usage::
+
+        def call_tool(name: str, args: dict) -> dict:
+            return my_http_executor.execute(name, args)
+
+        runner = PlanRunner(call_tool)
+        trace = runner.run(plan)                  # run to completion, return trace
+        # or — streaming:
+        for event in runner.run_stream(plan):
+            send_over_sse(event)
+    """
+
+    def __init__(
+        self,
+        call_tool: ToolCaller,
+        *,
+        output_preview_limit: int = 512,
+        on_error: str = "abort",  # 'abort' only in v1
+    ) -> None:
+        self._call_tool = call_tool
+        self._preview_limit = output_preview_limit  # char budget handed to _preview()
+        if on_error != "abort":
+            raise ValueError("v1 PlanRunner only supports on_error='abort'")
+
+    # ----------------------------------------------------------------------
+    # Streaming interface — yields PlanEvent instances
+    # ----------------------------------------------------------------------
+
+    def run_stream(
+        self,
+        plan: Plan,
+        *,
+        input_context: dict[str, Any] | None = None,
+    ) -> Iterator[PlanEvent]:
+        """Execute *plan* and yield events as each step progresses.
+
+        ``input_context`` supplies values for ``${input.xxx}`` and
+        ``${user_input.xxx}`` bindings (both keys resolve to the same dict,
+        kept as aliases because the synthesizer emits ``user_input`` for
+        F2/Cycle-policy fallbacks and historical entity-injection paths use
+        ``input``). Typically the entities extracted by Stage 1 (intent
+        parser) plus any operator-supplied seed values.
+        """
+        plan_start = time.monotonic()
+
+        yield PlanStarted(
+            plan_id=plan.id,
+            goal=plan.goal,
+            step_count=len(plan.steps),
+        )
+
+        # step_id -> output (runtime context for binding resolution).
+        # ``input`` and ``user_input`` are aliases — same dict, both names —
+        # so binding ``${input.x}`` and ``${user_input.x}`` both resolve.
+        context: dict[str, Any] = {}
+        if input_context:
+            input_dict = dict(input_context)
+            context["input"] = input_dict
+            context["user_input"] = input_dict
+
+        trace_steps: list[StepTrace] = []
+
+        for idx, step in enumerate(plan.steps, start=1):
+            step_trace = StepTrace(id=step.id, tool=step.tool)
+            step_start = time.monotonic()
+
+            # 1. Resolve bindings
+            try:
+                resolved = resolve_bindings(step.args, context)
+            except BindingError as exc:
+                err = {
+                    "kind": "binding",
+                    "message": str(exc),
+                }
+                step_trace.error = err
+                step_trace.duration_ms = _ms_since(step_start)
+                trace_steps.append(step_trace)
+                yield StepFailed(
+                    step_id=step.id,
+                    tool=step.tool,
+                    error=err,
+                    duration_ms=step_trace.duration_ms,
+                )
+                yield PlanAborted(
+                    plan_id=plan.id,
+                    failed_step=step.id,
+                    error=err,
+                    total_duration_ms=_ms_since(plan_start),
+                    trace_steps=list(trace_steps),
+                )
+                return  # abort: the remaining steps are not executed
+
+            step_trace.args_resolved = resolved
+            yield StepStarted(
+                step_id=step.id,
+                tool=step.tool,
+                args_resolved=resolved,
+                index=idx,
+                total=len(plan.steps),
+            )
+
+            # 2. Execute via caller's tool invoker
+            try:
+                output = self._call_tool(step.tool, resolved)
+            except Exception as exc:  # noqa: BLE001 — caller-defined
+                err = {
+                    "kind": "tool",
+                    "message": str(exc),
+                    "exception_type": type(exc).__name__,
+                }
+                step_trace.error = err
+                step_trace.duration_ms = _ms_since(step_start)
+                trace_steps.append(step_trace)
+                yield StepFailed(
+                    step_id=step.id,
+                    tool=step.tool,
+                    error=err,
+                    duration_ms=step_trace.duration_ms,
+                )
+                yield PlanAborted(
+                    plan_id=plan.id,
+                    failed_step=step.id,
+                    error=err,
+                    total_duration_ms=_ms_since(plan_start),
+                    trace_steps=list(trace_steps),
+                )
+                return  # abort: the remaining steps are not executed
+
+            # 2a. Unwrap a single-level envelope when the response shape
+            # diverges from the schema in the canonical "{code, message,
+            # <wrapper>: {...}, timestamp}" pattern. One detect per step,
+            # not per binding — every binding for this step then resolves
+            # against the unwrapped dict naturally.
+            output = _maybe_unwrap_envelope(output, step.response_root_keys)
+
+            step_trace.output = output
+            step_trace.duration_ms = _ms_since(step_start)
+            trace_steps.append(step_trace)
+
+            # 3. Store output in context for later bindings
+            context[step.id] = output
+
+            yield StepCompleted(
+                step_id=step.id,
+                tool=step.tool,
+                duration_ms=step_trace.duration_ms,
+                output_preview=_preview(output, self._preview_limit),
+                output_size=_output_size(output),
+            )
+
+        # 4. Resolve output_binding for final answer
+        try:
+            final = (
+                resolve_bindings(plan.output_binding, context)
+                if plan.output_binding
+                else (context[plan.steps[-1].id] if plan.steps else None)
+            )
+        except BindingError as exc:
+            err = {"kind": "output_binding", "message": str(exc)}
+            yield PlanAborted(
+                plan_id=plan.id,
+                failed_step="<output_binding>",
+                error=err,
+                total_duration_ms=_ms_since(plan_start),
+                trace_steps=list(trace_steps),
+            )
+            return
+
+        yield PlanCompleted(
+            plan_id=plan.id,
+            output=final,
+            total_duration_ms=_ms_since(plan_start),
+            trace_steps=list(trace_steps),
+        )
+
+    # ----------------------------------------------------------------------
+    # Non-streaming interface — returns final ExecutionTrace
+    # ----------------------------------------------------------------------
+
+    def run(
+        self,
+        plan: Plan,
+        *,
+        input_context: dict[str, Any] | None = None,
+    ) -> ExecutionTrace:
+        """Execute *plan* and return an ExecutionTrace aggregating events.
+
+        ``trace_steps`` is taken as-is from the terminal event (``PlanCompleted`` /
+        ``PlanAborted``) — the StepTrace objects accumulated per step inside
+        run_stream go straight into ExecutionTrace.steps.
+        """
+        started_at = _now_iso()
+        started = time.monotonic()
+        trace_steps: list[StepTrace] = []
+        success = False
+        failed_step: str | None = None
+        output: Any = None
+
+        for event in self.run_stream(plan, input_context=input_context):
+            etype = event.type
+            if etype == "plan.completed":
+                success = True
+                output = event.output  # type: ignore[union-attr]
+                trace_steps = list(event.trace_steps)  # type: ignore[union-attr]
+            elif etype == "plan.aborted":
+                failed_step = event.failed_step  # type: ignore[union-attr]
+                trace_steps = list(event.trace_steps)  # type: ignore[union-attr]
+
+        return ExecutionTrace(
+            plan_id=plan.id,
+            success=success,
+            steps=trace_steps,
+            output=output,
+            failed_step=failed_step,
+            total_duration_ms=_ms_since(started),
+            started_at=started_at,
+            ended_at=_now_iso(),
+        )
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _ms_since(start_monotonic: float) -> int:
+    return int(1000 * (time.monotonic() - start_monotonic))  # elapsed whole ms, truncated
+
+
+def _now_iso() -> str:
+    return datetime.now(tz=timezone.utc).isoformat()  # tz-aware UTC timestamp, ISO 8601
+
+
+def _preview(value: Any, limit: int) -> Any:
+    """Return *value* itself when small, else a clipped stand-in for UI display."""
+    if isinstance(value, str):
+        return value[:limit] + "…" if len(value) > limit else value
+    if not isinstance(value, (dict, list)):
+        return value  # scalars and other types pass through untouched
+    import json as _json
+
+    try:
+        dumped = _json.dumps(value, ensure_ascii=False)
+    except (TypeError, ValueError):
+        return {"_preview": f"<unserializable {type(value).__name__}>"}
+    if len(dumped) > limit:
+        return {"_preview": dumped[:limit] + "…", "_truncated": True}
+    return value
+
+
+def _maybe_unwrap_envelope(
+    output: Any,
+    expected_root_keys: list[str],
+) -> Any:
+    """Return the dict nested one level inside an envelope-shaped response.
+
+    The nested dict is returned only when every check below passes; in all
+    other cases ``output`` is handed back untouched:
+
+      1. ``expected_root_keys`` is non-empty — with no schema hint there is
+         nothing to validate a wrapper candidate against.
+      2. ``output`` is a dict with at least two root keys (a lone
+         ``{"payload": ...}`` is more plausibly real data than an envelope).
+      3. Exactly one root value is a dict — the wrapper candidate.
+      4. No root value is a list (envelope siblings are expected to be
+         status/code/message/timestamp style scalars or null, not
+         business collections).
+      5. None of ``expected_root_keys`` already sits at the response root
+         (such a response is already schema-shaped).
+      6. At least one of ``expected_root_keys`` is a key of the wrapper
+         candidate (otherwise that dict is unrelated business data and
+         unwrapping would lose information).
+
+    The wrapper's own key name is never looked at, so ``payload``,
+    ``data``, ``result`` or any other naming convention behaves the same.
+    """
+    if not expected_root_keys or not isinstance(output, dict) or len(output) < 2:
+        return output
+
+    nested_keys = [key for key, val in output.items() if isinstance(val, dict)]
+    if len(nested_keys) != 1:
+        return output
+
+    # Only one value is a dict, so a dict-or-list sibling can only be a list.
+    if any(isinstance(val, list) for val in output.values()):
+        return output
+
+    # An expected key at the root means the response is already schema-shaped.
+    wanted = set(expected_root_keys)
+    if not wanted.isdisjoint(output):
+        return output
+
+    # The nested dict only counts as the payload if it holds an expected key.
+    candidate = output[nested_keys[0]]
+    if wanted.isdisjoint(candidate):
+        return output
+
+    return candidate
+
+
+def _output_size(value: Any) -> int:
+    """Return the UTF-8 byte size of *value* serialized as JSON (0 if that fails)."""
+    import json as _json
+
+    try:
+        return len(_json.dumps(value, ensure_ascii=False).encode("utf-8"))  # bytes, not chars
+    except (TypeError, ValueError):  # unserializable, or lone surrogate (UnicodeEncodeError)
+        return 0
diff --git a/graph_tool_call/plan/schema.py b/graph_tool_call/plan/schema.py
new file mode 100644
index 0000000..9fff497
--- /dev/null
+++ b/graph_tool_call/plan/schema.py
@@ -0,0 +1,86 @@
+"""Plan and ExecutionTrace dataclasses.
+
+``Plan`` is the artifact produced by Stage 2 (Path Synthesizer) of the
+Plan-and-Execute architecture. It's consumed by ``PlanRunner`` (Stage 3).
+Both are intentionally plain dataclasses — serializable, introspectable,
+easy to hand-craft for testing.
+
+The schema explicitly does NOT include fan-out / conditional branching in
+v1 (per design doc §16 decision 6). Future versions can add optional
+fields (``foreach``, ``condition``) on ``PlanStep``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class PlanStep:
+    """A single step in a Plan.
+
+    ``args`` may contain binding placeholders of the form
+    ``${step_id.json.path}``, ``${input.keyword}`` or ``${user_input.keyword}``.
+    They are resolved at runtime by ``resolve_bindings`` against the step context.
+    """
+
+    id: str  # "s1", "s2", ...
+    tool: str  # function_name (graph node name)
+    args: dict[str, Any] = field(default_factory=dict)  # raw args, placeholders unresolved
+    rationale: str = ""  # why this step exists (for audit)
+    timeout_ms: int | None = None  # NOTE(review): PlanRunner.run_stream never reads this
+    retryable: bool = False  # reserved for v1.1 retry policy
+    # Top-level keys the synthesizer expects in this tool's response,
+    # derived from ``produces[].json_path``. Used by PlanRunner to detect
+    # envelope wrappers (e.g. ``{code, message, payload: {...}}``) when the
+    # ingest captured the wrapped fields without the wrapper itself. Empty
+    # list means "no hint" — the runner then leaves the response untouched.
+    response_root_keys: list[str] = field(default_factory=list)
+
+
+@dataclass
+class Plan:
+    """Executable plan — ordered steps with binding references.
+
+    v1 scope: **linear execution only**. Steps run in listed order. No
+    fan-out, no conditional branching, no parallelism. Each step may
+    reference earlier step outputs via ``${sN.path}`` bindings.
+
+    ``output_binding`` designates which step's (or sub-path's) result is the
+    final answer. If unset, runner returns the last step's result (or None).
+    """
+
+    id: str  # uuid
+    goal: str  # user requirement summary
+    steps: list[PlanStep] = field(default_factory=list)  # executed in list order
+    output_binding: str | None = None  # e.g. "${s2}" (whole step output) or "${s2.body}"
+    created_at: str = ""  # ISO8601
+    metadata: dict[str, Any] = field(default_factory=dict)  # free-form, e.g. synthesis details
+
+
+@dataclass
+class StepTrace:
+    """Record of a single step execution."""
+
+    id: str  # PlanStep.id
+    tool: str  # PlanStep.tool
+    args_resolved: dict[str, Any] = field(default_factory=dict)  # empty if binding failed
+    output: Any = None  # set on success
+    error: dict[str, Any] | None = None  # set on failure
+    duration_ms: int = 0  # whole milliseconds, measured from before binding resolution
+    retries: int = 0  # NOTE(review): never incremented by PlanRunner in this file
+
+
+@dataclass
+class ExecutionTrace:
+    """Result of a full Plan execution."""
+
+    plan_id: str
+    success: bool  # True only if the run ended with a ``plan.completed`` event
+    steps: list[StepTrace] = field(default_factory=list)  # taken from the terminal event
+    output: Any = None  # plan.output_binding resolved; None when the run aborted
+    failed_step: str | None = None  # step id or "<output_binding>"; None on success
+    total_duration_ms: int = 0
+    started_at: str = ""  # ISO 8601, UTC (set by PlanRunner.run)
+    ended_at: str = ""  # ISO 8601, UTC (set by PlanRunner.run)
diff --git a/graph_tool_call/plan/synthesizer.py b/graph_tool_call/plan/synthesizer.py
new file mode 100644
index 0000000..ac8f2de
--- /dev/null
+++ b/graph_tool_call/plan/synthesizer.py
@@ -0,0 +1,1045 @@
+"""PathSynthesizer — Stage 2 of Plan-and-Execute.
+
+Given a target tool and user-provided entities, walk the ToolGraph's
+produces/consumes metadata backwards to construct a Plan (ordered steps +
+bindings) that, when executed by PlanRunner, satisfies the target.
+
+This module is transport-agnostic. It consumes a plain ``graph`` dict (the
+shape persisted as ``api_tool_collections.graph.graph``) — no DB, no HTTP.
+
+v1 scope (per design §16.6):
+  - Linear chain only — no fan-out, no parallel, no branching.
+  - Max recursion depth = 5 (guard against cyclic or pathological graphs).
+
+Matching order for each required consume field:
+  1. User ``entities`` (Stage 1 output) — preferred, no extra step.
+  2. Another tool's ``produces`` with the same ``semantic_tag``
+     (Pass 2 LLM enrichment quality).
+  3. Another tool's ``produces`` with the same ``field_name``
+     (Pass 1 deterministic extraction, fallback).
+
+Producer selection is ranked by Pass 2 metadata signals — no hardcoded
+domain or field rules:
+  - Entity affinity: producer consumes an entity the user supplied,
+    so chaining through it actually uses that entity.
+  - Pair hint: target's ``pairs_well_with`` includes this producer.
+  - Action preference: ``canonical_action`` = search/read fits a
+    prerequisite role better than create/update/delete.
+
+``consumes[].kind`` ("data" | "context", set by Pass 2):
+  - "data" — chain to a producer if entity doesn't match.
+  - "context" — ambient config (locale, site, tenant). Never chained;
+    must come from entity or skipped (runtime uses API default).
+"""
+
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+from graph_tool_call.plan.schema import Plan, PlanStep
+
+
+class PlanSynthesisError(Exception):
+    """Base class for synthesis failures; also raised directly for an unknown target tool."""
+
+
+class UnsatisfiableFieldError(PlanSynthesisError):
+    """A required field cannot be filled from entities or a producer (e.g. enum fields)."""
+
+
+class CyclicDependencyError(PlanSynthesisError):
+    """Resolution re-entered a tool that is still being resolved (it is in ``visiting``)."""
+
+
+class MaxDepthExceededError(PlanSynthesisError):
+    """Recursion went deeper than ``max_depth`` — likely a misshapen graph."""
+
+
+class DynamicOptionRequired(UnsatisfiableFieldError):  # noqa: N818
+    """Signal that the user should pick this required field from live options.
+
+    Raised for a required data field whose single-hop producer can be called
+    right away with the user's entities + context_defaults. Rather than
+    weaving that producer into the chain, the caller fetches the option
+    list and asks the user to choose — the popup-driven UX for fields like
+    ``itmNo`` (single-item option) whose choices are dynamic per request.
+
+    Carried for the caller: ``producer_name`` (which producer to call),
+    ``options_path`` (where the option array sits in its response) and
+    ``label_field_hints`` (candidate label fields next to each code).
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        field_name: str,
+        semantic_tag: str,
+        producer_name: str,
+        options_path: str,
+        label_field_hints: list[str],
+    ) -> None:
+        super().__init__(message)
+        self.label_field_hints = [*label_field_hints]  # own copy, detached from the caller's list
+        self.options_path = options_path
+        self.producer_name = producer_name
+        self.semantic_tag = semantic_tag
+        self.field_name = field_name
+
+
+def _normalize_field_name(name: str) -> str:
+    """Fold *name* to lowercase alphanumerics for loose field-name matching.
+
+    Only letter case and separator characters are erased:
+      ``ordNo`` / ``ord_no`` / ``ORD-NO``  →  ``ordno``
+
+    Different token roots deliberately stay different:
+      ``ordNo`` → ``ordno``   but   ``orderNo`` → ``orderno``
+
+    Token-level synonyms (``ord`` ↔ ``order``) are domain-specific and are
+    NOT mapped here; callers must bridge those some other way. A falsy
+    *name* yields ``""``.
+    """
+    # ``name or ""`` makes None / empty input fall through to an empty result.
+    # Each kept character is lowercased on its own, not the string as a whole.
+    kept = (ch for ch in (name or "") if ch.isalnum())
+    return "".join(ch.lower() for ch in kept)
+
+
+@dataclass
+class _PartialStep:
+    """In-progress step being built during bottom-up synthesis."""
+
+    tool: str  # graph tool name; becomes PlanStep.tool
+    args: dict[str, Any] = field(default_factory=dict)  # entity values / binding placeholders
+    rationale: str = ""  # copied to PlanStep.rationale
+    step_id: str = ""  # "s1".."sN", assigned by synthesize() in insertion order
+
+
+class PathSynthesizer:
+    """Deterministic plan builder driven by graph ``produces``/``consumes``.
+
+    Usage::
+
+        syn = PathSynthesizer(graph_dict)
+        plan = syn.synthesize(
+            target="seltProductDetailInfo",
+            entities={"search_keyword": "quarzen 티셔츠"},
+        )
+    """
+
+    def __init__(
+        self,
+        graph: dict[str, Any],
+        *,
+        max_depth: int = 5,
+        context_defaults: dict[str, Any] | None = None,
+        enum_field_names: set[str] | None = None,
+    ) -> None:
+        self._tools: dict[str, dict[str, Any]] = dict(graph.get("tools") or {})
+        self._max_depth = max_depth  # depth bound enforced by _resolve (MaxDepthExceededError)
+        # Collection-level ambient values (locale, tenant id, site id, ...) the
+        # operator registers once per collection. Filled into ``kind=context``
+        # consume fields when the user's entities don't supply them — avoids
+        # repeating env-style args in every requirement and avoids leaking
+        # backend-specific defaults into library code. Lookup precedence:
+        # entities > context_defaults > skip.
+        self._context_defaults: dict[str, Any] = dict(context_defaults or {})
+        # Field names the operator registered an enum mapping for. When a
+        # required-data field of this kind can't be filled by an entity,
+        # the synthesizer raises UnsatisfiableFieldError instead of
+        # producer-chaining — the caller (service layer) is expected to
+        # surface a popup to the user rather than weaving an awkward
+        # producer chain that pulls in unrelated tools just to source a
+        # code value. User intent (popup choice) wins over chain depth.
+        self._enum_field_names: set[str] = set(enum_field_names or ())
+        # semantic_tag -> [tool_name], insertion order preserved
+        self._producers_by_semantic: dict[str, list[str]] = {}
+        self._producers_by_field: dict[str, list[str]] = {}  # field_name -> [tool_name]
+        # Loose-field index: normalised field name → [tool_name].
+        # Lets ``ordNo`` match producers of ``ordno`` / ``ord_no`` / ``ORDNO``.
+        # Conservative — only normalises case + separators, never strips
+        # tokens (so ``ordNo`` ≠ ``orderNo`` — those need the graph fallback).
+        self._producers_by_loose_field: dict[str, list[str]] = {}
+        # graphify-mode adjacency: ``tool_name -> [edge_dict]`` for outgoing
+        # workflow edges (REQUIRES / PRECEDES / COMPLEMENTARY). Used as a
+        # fallback in ``_find_producer`` when neither semantic_tag nor
+        # field_name match — we walk the graph the user/extractor built
+        # rather than failing on field-name divergence.
+        self._workflow_edges_out: dict[str, list[dict[str, Any]]] = {}
+        self._index_workflow_edges(graph)  # NOTE(review): presumably fills _workflow_edges_out
+        self._build_producer_indexes()  # NOTE(review): presumably fills the _producers_by_* maps
+
+    # ------------------------------------------------------------------
+    # public API
+    # ------------------------------------------------------------------
+
+    def synthesize(
+        self,
+        *,
+        target: str,
+        entities: dict[str, Any] | None = None,
+        goal: str = "",
+    ) -> Plan:
+        """Synthesize a Plan whose final step is ``target``.
+
+        Args of every step are filled by ``_resolve`` from ``entities`` and
+        from prerequisite steps.
+
+        Raises ``PlanSynthesisError`` if ``target`` is not in the graph; errors
+        raised during resolution (e.g. ``UnsatisfiableFieldError`` for a
+        required field without entity or producer) propagate unchanged.
+        """
+        if target not in self._tools:
+            raise PlanSynthesisError(f"target tool not in graph: {target!r}")
+
+        supplied = entities or {}
+        partials: dict[str, _PartialStep] = {}
+
+        # Recursive resolution fills ``partials``; the target's own step is
+        # inserted last.
+        self._resolve(
+            tool_name=target,
+            entities=supplied,
+            steps_by_tool=partials,
+            visiting=set(),
+            depth=0,
+        )
+
+        # Step ids s1..sN follow the dict's insertion order.
+        for position, partial in enumerate(partials.values(), start=1):
+            partial.step_id = f"s{position}"
+
+        # Build the final steps, rewriting tool-name bindings inside args into
+        # step-id bindings.
+        final_steps: list[PlanStep] = [
+            PlanStep(
+                id=partial.step_id,
+                tool=partial.tool,
+                args={
+                    key: self._rewrite_tool_refs(val, partials)
+                    for key, val in partial.args.items()
+                },
+                rationale=partial.rationale,
+                response_root_keys=self._response_root_keys(tool_name),
+            )
+            for tool_name, partial in partials.items()
+        ]
+
+        # The target's step id doubles as the plan's output binding below.
+        target_step_id = partials[target].step_id
+
+        # Gather every ``${user_input.*}`` slot so the caller can ask for all
+        # missing fields up front (one popup instead of one per step). Each
+        # entry records the step id, its tool and the argument name that
+        # still needs a value.
+        user_input_slots: list[dict[str, Any]] = []
+        for step in final_steps:
+            for arg_name, arg_val in (step.args or {}).items():
+                if not (isinstance(arg_val, str) and arg_val.startswith("${user_input.")):
+                    continue
+                user_input_slots.append(
+                    {"step_id": step.id, "tool": step.tool, "field_name": arg_name}
+                )
+
+        return Plan(
+            id=str(uuid.uuid4()),
+            goal=goal or f"Execute {target}",
+            steps=final_steps,
+            # The PlanRunner adapter exposes the response body as the root of
+            # the step context, so ``${sN}`` alone captures the whole response
+            # dict (the older ``${sN.body}`` form dates from when the adapter
+            # passed ``{status, body}`` through unchanged).
+            output_binding=f"${{{target_step_id}}}",
+            created_at=datetime.now(timezone.utc).isoformat(),
+            metadata={
+                "target": target,
+                "entities": dict(supplied),
+                "synthesized_by": "PathSynthesizer/v1",
+                "user_input_slots": user_input_slots,
+            },
+        )
+
+    # ------------------------------------------------------------------
+    # core recursion
+    # ------------------------------------------------------------------
+
+    def _resolve(
+        self,
+        *,
+        tool_name: str,
+        entities: dict[str, Any],
+        steps_by_tool: dict[str, _PartialStep],
+        visiting: set[str],
+        depth: int,
+    ) -> str:
+        """Ensure ``tool_name`` has a PartialStep with resolved args.
+
+        Returns the tool name itself (used as a placeholder in args until
+        step_ids are assigned by the caller).
+        """
+        if depth > self._max_depth:
+            raise MaxDepthExceededError(
+                f"synthesis exceeded max_depth={self._max_depth} at {tool_name!r}"
+            )
+        if tool_name in steps_by_tool:
+            return tool_name
+        if tool_name in visiting:
+            raise CyclicDependencyError(
+                f"cycle detected at {tool_name!r} (chain: {sorted(visiting)!r})"
+            )
+        visiting.add(tool_name)
+
+        tool = self._tools.get(tool_name) or {}
+        metadata = tool.get("metadata") or {}
+        consumes = metadata.get("consumes") or []
+
+        args: dict[str, Any] = {}
+        rationales: list[str] = []
+
+        for consume in consumes:
+            field_name = consume.get("field_name") or ""
+            semantic = consume.get("semantic_tag") or ""
+            kind = str(consume.get("kind") or "data").strip().lower()
+            is_required = bool(consume.get("required"))
+
+            # 1. Entity match (user-supplied) — applies to both data and
+            #    context, both required and optional. The user's input
+            #    always wins.
+            entity_val = self._match_entity(entities, semantic, field_name)
+            if entity_val is not None:
+                args[field_name] = entity_val
+                continue
+
+            # 2. Context-kind: try collection-level defaults regardless of
+            #    required flag. Context is never chained — ambient config
+            #    must come from entity or operator-registered default
+            #    (chaining through e.g. getSiteInfo would inflate the plan
+            #    with steps that don't produce business value).
+            if kind == "context":
+                default = self._lookup_context_default(semantic, field_name)
+                if default is not None:
+                    args[field_name] = default
+                continue
+
+            # 3. Optional data field: leave out. The caller's backend will
+            #    apply its own defaults — synthesizer has no business
+            #    inventing values for optional business inputs.
+            if not is_required:
+                continue
+
+            # 4. Enum-field popup priority. If the operator registered an
+            #    enum mapping for this field, it's the kind of value the
+            #    user should pick from a popup — NOT something to chain
+            #    through a producer (which often drags in semantically
+            #    unrelated tools just because their response happens to
+            #    contain a code by the same name). Surface
+            #    UnsatisfiableFieldError so the caller can yield a
+            #    question.required event instead.
+            if field_name in self._enum_field_names:
+                raise UnsatisfiableFieldError(
+                    f"tool {tool_name!r} requires {field_name!r} "
+                    f"(semantic={semantic!r}) — enum field, expects user "
+                    f"selection (no producer chain attempted)"
+                )
+
+            # 5. Required data field → rank candidate producers and pick the best.
+            #    Pass ``visiting`` as ``excluded`` so cycle-prone candidates are
+            #    skipped here (Cycle policy A). The chain reroutes around the
+            #    cycle when an alternative producer exists; only when none
+            #    remains does the caller fall through to user-input slot (F2).
+            producer = self._find_producer(
+                semantic=semantic,
+                field_name=field_name,
+                target_tool=tool_name,
+                entities=entities,
+                excluded=visiting,
+            )
+            if producer is None:
+                # F2 + Cycle policy B: gracefully surface the field as a
+                # ``${user_input.<field>}`` placeholder rather than aborting
+                # the entire plan. The runner detects the placeholder at
+                # step-start and asks the user (or its surrounding agent)
+                # to supply the value. The plan's metadata records every
+                # such slot so the caller can pre-collect inputs.
+                placeholder = f"${{user_input.{field_name}}}"
+                args[field_name] = placeholder
+                rationales.append(f"{field_name} ← user_input")
+                continue
+
+            # 5a. Dynamic-option popup priority. Detect "read-detail then
+            #     pick one" patterns where the producer is a single-hop
+            #     read of a product/record whose response carries a
+            #     list of options the user must choose from (e.g.
+            #     ``getProductInfo`` exposes ``$.itmInfo[*].itmNo`` —
+            #     the available SKUs). In that case, defer to the caller
+            #     to fetch options and pop up a question, instead of
+            #     chaining the producer in and binding ``[0]`` blindly.
+            #
+            #     Constrained to ``canonical_action='read'`` because
+            #     ``search`` producers (e.g. seltSearchProduct → goodsNo)
+            #     are exactly the chain idiom we DO want — pick the first
+            #     hit and continue. Without this constraint legitimate
+            #     search→detail chains turn into popups.
+            producer_action = self._producer_action(producer)
+            if producer_action == "read" and self._is_producer_simple_callable(producer, entities):
+                opt_path = self._produces_path_for(
+                    producer,
+                    semantic=semantic,
+                    field_name=field_name,
+                )
+                if opt_path and "[*]" in opt_path:
+                    raise DynamicOptionRequired(
+                        f"tool {tool_name!r} requires {field_name!r} "
+                        f"(semantic={semantic!r}) — dynamic option from "
+                        f"{producer!r}; caller should fetch options and "
+                        f"prompt the user",
+                        field_name=field_name,
+                        semantic_tag=semantic,
+                        producer_name=producer,
+                        options_path=opt_path,
+                        label_field_hints=self._label_hints_for(producer, opt_path),
+                    )
+
+            # Recurse into the producer first so step_id ordering is correct.
+            # Cycle policy B + F2: if the producer's own chain is too deep
+            # or cycles back, we don't abort the whole plan — we drop this
+            # producer and fall back to a user_input slot for the field.
+            # This keeps the surface tool callable when the prerequisite
+            # chain extends beyond what the synthesiser can flatten.
+            try:
+                self._resolve(
+                    tool_name=producer,
+                    entities=entities,
+                    steps_by_tool=steps_by_tool,
+                    visiting=visiting,
+                    depth=depth + 1,
+                )
+            except (MaxDepthExceededError, CyclicDependencyError) as exc:
+                placeholder = f"${{user_input.{field_name}}}"
+                args[field_name] = placeholder
+                rationales.append(
+                    f"{field_name} ← user_input (chain unflattenable: {exc.__class__.__name__})"
+                )
+                continue
+
+            # Build a placeholder binding — will be rewritten after step_ids
+            # are assigned. Format: ${<tool_name>.<jsonpath-sans-root>}
+            prod_path = self._producer_jsonpath(producer, semantic, field_name)
+            args[field_name] = f"${{{producer}.{prod_path}}}"
+            rationales.append(f"{field_name} ← {producer} ({prod_path})")
+
+        steps_by_tool[tool_name] = _PartialStep(
+            tool=tool_name,
+            args=args,
+            rationale="; ".join(rationales) if rationales else "",
+        )
+        visiting.discard(tool_name)
+        return tool_name
+
+    # ------------------------------------------------------------------
+    # helpers
+    # ------------------------------------------------------------------
+
+    def _build_producer_indexes(self) -> None:
+        """Index which tools produce which semantic / field across the graph.
+
+        Fills ``_producers_by_semantic``, ``_producers_by_field`` and
+        ``_producers_by_loose_field`` with tool names, keyed respectively by
+        ``semantic_tag``, raw ``field_name`` and ``_normalize_field_name`` of
+        the field name.
+
+        Echo-back filter: a tool that takes ``ordNo`` as input and echoes it
+        back in its response is NOT a producer of ``ordNo`` in any useful
+        sense — it's just relaying the value the caller already supplied. We
+        skip those entries so the index reflects tools that actually CREATE
+        or DISCOVER the value (``listOrders``, ``createOrder``,
+        ``searchOrders`` etc.) rather than every endpoint that happens to
+        round-trip the field.
+
+        Same rule applied to ``semantic_tag`` for parity with the LLM Pass 2
+        enrichment path. Empty consumes (no input fields) → never echo, so
+        all produces are real producers.
+
+        Malformed (non-dict) ``consumes`` / ``produces`` entries are skipped
+        instead of aborting index construction.
+        """
+        for name, tool in self._tools.items():
+            meta = tool.get("metadata") or {}
+            consumed_fields: set[str] = set()
+            consumed_semantics: set[str] = set()
+            for c in meta.get("consumes") or []:
+                if not isinstance(c, dict):
+                    continue
+                cf = c.get("field_name") or ""
+                cs = c.get("semantic_tag") or ""
+                if cf:
+                    consumed_fields.add(cf)
+                if cs:
+                    consumed_semantics.add(cs)
+
+            for produce in meta.get("produces") or []:
+                # Same shape guard as ``consumes`` above — a stray string or
+                # ``None`` in the list must not crash the whole index build.
+                if not isinstance(produce, dict):
+                    continue
+                sem = produce.get("semantic_tag") or ""
+                fname = produce.get("field_name") or ""
+                # Skip pure echo-back: the field came in, gets relayed out.
+                if fname and fname in consumed_fields:
+                    continue
+                if sem and sem in consumed_semantics:
+                    continue
+                if sem:
+                    self._producers_by_semantic.setdefault(sem, []).append(name)
+                if fname:
+                    self._producers_by_field.setdefault(fname, []).append(name)
+                    # Always register the loose key, even when the field name
+                    # is already in normalised form. The exact index is keyed
+                    # by the PRODUCER's spelling, so a consumer asking for
+                    # ``ord_no`` can only reach a producer spelled ``ordno``
+                    # through this index.
+                    loose = _normalize_field_name(fname)
+                    if loose:
+                        self._producers_by_loose_field.setdefault(loose, []).append(name)
+
+    # ---- graphify edge indexing & traversal ---------------------------------
+
+    # Relation labels (compared lower-cased) that ``_index_workflow_edges``
+    # keeps; edges carrying any other relation are dropped at index time.
+    _WORKFLOW_RELATIONS: frozenset[str] = frozenset({"requires", "precedes", "complementary"})
+    # Numeric ordering of the confidence labels attached to graph edges.
+    # NOTE(review): presumably lower means "more trusted"; no reader of this
+    # table is visible in this part of the file — confirm it is still used.
+    _CONFIDENCE_RANK: dict[str, int] = {
+        "EXTRACTED": 0,
+        "INFERRED": 1,
+        "AMBIGUOUS": 2,
+    }
+
+    def _index_workflow_edges(self, graph: dict[str, Any]) -> None:
+        """Group the graph's workflow edges by their source tool.
+
+        Reads ``graph["graph"]["edges"]`` (DictGraph.to_dict() output) and
+        falls back to the legacy NetworkX-style ``graph["graph"]["links"]``.
+        Endpoints may be spelled ``source``/``target`` or ``from``/``to``.
+        Only edges whose lower-cased relation is in ``_WORKFLOW_RELATIONS``
+        are kept; each is appended to ``_workflow_edges_out[source]``.
+
+        Edges without a confidence label are kept as-is (``confidence`` is
+        stored as ``None``) so graphs built before the graphify ingest
+        landed still work.
+        """
+        inner = graph.get("graph") or {}
+        raw_edges = inner.get("edges") or inner.get("links") or []
+        for edge in raw_edges:
+            if not isinstance(edge, dict):
+                continue
+            source = edge.get("source") or edge.get("from")
+            target = edge.get("target") or edge.get("to")
+
+            # Relation may be an enum-like object (has ``.value``), a plain
+            # string, or missing altogether.
+            relation = edge.get("relation")
+            if hasattr(relation, "value"):
+                relation_text = relation.value
+            elif relation is not None:
+                relation_text = str(relation)
+            else:
+                relation_text = ""
+            relation_text = relation_text.lower()
+
+            if not (source and target and relation_text in self._WORKFLOW_RELATIONS):
+                continue
+
+            bucket = self._workflow_edges_out.setdefault(source, [])
+            bucket.append(
+                {
+                    "target": target,
+                    "relation": relation_text,
+                    "confidence": edge.get("confidence"),
+                    "conf_score": float(edge.get("conf_score") or 0.0),
+                    "evidence": edge.get("evidence") or "",
+                }
+            )
+
+    # Producer-signal score weights. Higher = stronger signal that this
+    # candidate genuinely produces the value the target needs. Weights chosen
+    # so combined signals (e.g. graph EXTRACTED + field exact = 90) beat any
+    # single signal, and graph EXTRACTED alone (50) beats field exact alone
+    # (40) — Path/$ref/CRUD-derived edges are more reliable than coincidental
+    # field-name overlap. ``semantic_exact`` requires LLM Pass 2 enrichment;
+    # when present it's the strongest signal we have.
+    #
+    # Keys are the signal names recorded by ``_find_producer``; graph signals
+    # are spelled ``graph_<confidence label>``. A signal name that is not
+    # listed here contributes 0 to a candidate's score.
+    _SIGNAL_WEIGHTS: dict[str, int] = {
+        "semantic_exact": 100,
+        "graph_EXTRACTED": 50,
+        "field_exact": 40,
+        "graph_INFERRED": 20,
+        "field_loose": 10,
+        "graph_AMBIGUOUS": 5,
+    }
+
+    def _find_producer(
+        self,
+        *,
+        semantic: str,
+        field_name: str,
+        target_tool: str,
+        entities: dict[str, Any],
+        excluded: set[str] | None = None,
+    ) -> str | None:
+        """Pick the best producer using combined graph + schema signals.
+
+        Producer matching is treated as the intersection of two first-class
+        signals (NOT a fallback chain):
+          (a) Schema match — semantic_tag / field_name on ``produces``.
+          (b) Graph traversal — outgoing REQUIRES / PRECEDES / COMPLEMENTARY
+              edges from ``target_tool``, labelled by ``confidence``.
+
+        A candidate accumulates one entry per matching signal. The signal
+        weights live in ``_SIGNAL_WEIGHTS`` and combine additively, so a
+        candidate matched by both graph EXTRACTED and field_exact (90) wins
+        over one matched only by field_exact (40). Candidates are grouped
+        into tiers of equal signal score, best tier first; ONLY inside a
+        tier does the Pass-2 ``_rank_producers`` ordering (entity affinity,
+        pair hint, canonical action) break the tie. ``_is_chain_eligible``
+        still gates the final pick — sparse Pass-2 metadata pass-throughs
+        apply unchanged.
+
+        ``excluded`` is the set of tools currently being resolved (the
+        caller's ``visiting`` set). Producer candidates in this set would
+        re-enter recursion and trigger ``CyclicDependencyError`` — we skip
+        them here so the second-best candidate gets a chance instead. This
+        is the "skip-this-branch" cycle policy: the chain reroutes around
+        the cycle when alternative producers exist; only when all candidates
+        cycle does the caller fall back to user-input slot handling.
+
+        Returns the highest-scoring eligible candidate, or None if no
+        candidate has any signal (or every candidate is ``excluded`` or
+        rejected by ``_is_chain_eligible``).
+        """
+        excluded = excluded or set()
+        candidate_signals: dict[str, set[str]] = {}
+
+        def _record(name: str, signal: str) -> None:
+            if name and name != target_tool:
+                candidate_signals.setdefault(name, set()).add(signal)
+
+        # (a) schema-side: exact semantic / field_name (echo-back already
+        # filtered when the index was built).
+        if semantic:
+            for n in self._producers_by_semantic.get(semantic, []):
+                _record(n, "semantic_exact")
+        if field_name:
+            for n in self._producers_by_field.get(field_name, []):
+                _record(n, "field_exact")
+
+        # (a') schema-side: loose field match — separator/case folded.
+        # ``ordNo`` won't match ``orderNo`` (different roots) but will match
+        # ``ord_no`` / ``ORDNO``. Cross-naming-convention safety net.
+        loose_target = _normalize_field_name(field_name) if field_name else ""
+        if loose_target:
+            for n in self._producers_by_loose_field.get(loose_target, []):
+                if n not in candidate_signals:  # else: already had a stronger signal
+                    _record(n, "field_loose")
+
+        # (b) graph-side: walk outgoing workflow edges, verify each
+        # candidate actually has a matching produces entry.
+        for e in self._workflow_edges_out.get(target_tool) or []:
+            cand = e.get("target")
+            if not cand or cand == target_tool:
+                continue
+            tool = self._tools.get(cand)
+            if not tool:
+                continue
+            cand_meta = tool.get("metadata") or {}
+            cand_consumes = [c for c in cand_meta.get("consumes") or [] if isinstance(c, dict)]
+            cand_consumes_fields = {c.get("field_name", "") for c in cand_consumes}
+            cand_consumes_semantics = {c.get("semantic_tag", "") for c in cand_consumes}
+            for p in cand_meta.get("produces") or []:
+                if not isinstance(p, dict):
+                    continue
+                p_sem = p.get("semantic_tag") or ""
+                p_fname = p.get("field_name") or ""
+                # Echo-back guard for the candidate itself — same rule as
+                # _build_producer_indexes, applied here so graph-edge
+                # discoveries don't sneak in a relayed value.
+                if p_fname and p_fname in cand_consumes_fields:
+                    continue
+                if p_sem and p_sem in cand_consumes_semantics:
+                    continue
+
+                matched = (
+                    (semantic and p_sem == semantic)
+                    or (field_name and p_fname == field_name)
+                    or (loose_target and _normalize_field_name(p_fname) == loose_target)
+                )
+                if not matched:
+                    continue
+
+                # Normalise the label before building the signal name: an
+                # enum member or a lower-case string would otherwise yield a
+                # key missing from ``_SIGNAL_WEIGHTS`` and silently score 0.
+                conf = e.get("confidence")
+                conf = getattr(conf, "value", conf)
+                label = str(conf).strip().upper() if conf else "AMBIGUOUS"
+                _record(cand, f"graph_{label}")
+                break  # one signal per candidate per edge target is enough
+
+        if not candidate_signals:
+            return None
+
+        def _score(signals: set[str]) -> int:
+            return sum(self._SIGNAL_WEIGHTS.get(s, 0) for s in signals)
+
+        # Bucket candidates by signal score, strongest first (name order
+        # inside a bucket keeps the result deterministic). Dict insertion
+        # order preserves the descending-score order of ``scored``.
+        scored = sorted(
+            candidate_signals.items(),
+            key=lambda item: (-_score(item[1]), item[0]),
+        )
+        tiers: dict[int, list[str]] = {}
+        for name, signals in scored:
+            tiers.setdefault(_score(signals), []).append(name)
+
+        # Pass 2 ranking is applied PER TIER so it can only break ties.
+        # Re-sorting the whole list by the Pass-2 key would let it override
+        # the signal score (a 40-point ``search`` tool beating a 90-point
+        # ``read`` tool), contradicting the weighting above.
+        for tier_names in tiers.values():
+            ranked = self._rank_producers(
+                tier_names,
+                target_tool=target_tool,
+                entities=entities,
+            )
+            for cand in ranked:
+                # Cycle filter: skip candidates currently in the resolution
+                # stack so the synthesiser reroutes instead of raising.
+                if cand in excluded:
+                    continue
+                if self._is_chain_eligible(cand, target_tool=target_tool):
+                    return cand
+        return None
+
+    def _producer_action(self, producer_name: str) -> str:
+        """Look up ``ai_metadata.canonical_action`` for ``producer_name``.
+
+        The value is stripped and lower-cased; an unknown tool or a missing
+        action yields ``""``. Callers use it to restrict dynamic-option
+        popups to ``read`` producers — ``search`` producers are the chain
+        idiom (pick first hit), not popup candidates.
+        """
+        entry = self._tools.get(producer_name) or {}
+        metadata = entry.get("metadata") or {}
+        ai_meta = metadata.get("ai_metadata") or {}
+        action = ai_meta.get("canonical_action")
+        if not action:
+            return ""
+        return str(action).strip().lower()
+
+    def _is_producer_simple_callable(
+        self,
+        producer_name: str,
+        entities: dict[str, Any],
+    ) -> bool:
+        """Tell whether the producer needs no further chaining to be called.
+
+        Every *required* consume of the producer must be satisfiable from
+        the user's ``entities`` or — for ``kind == "context"`` fields only —
+        from a registered context default. The first required field that is
+        covered by neither makes the answer ``False``.
+
+        Used to detect "single-hop dynamic option" cases: instead of
+        chaining the producer into the plan, the caller fetches it once
+        and pops up the resulting list to the user (e.g. itmNo from
+        getProductInfo when the user already supplied goodsNo).
+        """
+        spec = self._tools.get(producer_name) or {}
+        consumes = (spec.get("metadata") or {}).get("consumes") or []
+        for entry in consumes:
+            # Only well-formed, required inputs can block a direct call.
+            if not (isinstance(entry, dict) and entry.get("required")):
+                continue
+            name = entry.get("field_name") or ""
+            tag = entry.get("semantic_tag") or ""
+            is_context = str(entry.get("kind") or "data").strip().lower() == "context"
+
+            supplied_by_user = self._match_entity(entities, tag, name) is not None
+            if supplied_by_user:
+                continue
+            if is_context and self._lookup_context_default(tag, name) is not None:
+                continue
+            return False
+        return True
+
+    def _produces_path_for(
+        self,
+        producer_name: str,
+        *,
+        semantic: str,
+        field_name: str,
+    ) -> str:
+        """Return the ``json_path`` under which the producer emits a field.
+
+        The first ``produces`` entry whose ``semantic_tag`` equals
+        ``semantic`` wins; only when no entry matches semantically is the
+        first entry whose ``field_name`` equals ``field_name`` used. The
+        result is the option location in the response (e.g.
+        ``$.itmInfo[*].itmNo``), or ``""`` when nothing matches or the
+        matching entry carries no path.
+        """
+        spec = self._tools.get(producer_name) or {}
+        entries = [
+            entry
+            for entry in (spec.get("metadata") or {}).get("produces") or []
+            if isinstance(entry, dict)
+        ]
+        if semantic:
+            for entry in entries:
+                if entry.get("semantic_tag") == semantic:
+                    return str(entry.get("json_path") or "")
+        # Fallback: match by field_name when semantic missing/mismatched
+        if field_name:
+            for entry in entries:
+                if entry.get("field_name") == field_name:
+                    return str(entry.get("json_path") or "")
+        return ""
+
+    def _label_hints_for(
+        self,
+        producer_name: str,
+        options_path: str,
+    ) -> list[str]:
+        """List label-like sibling fields of an option-code path.
+
+        For ``options_path`` such as ``$.itmInfo[*].itmNo`` the sibling
+        prefix is everything up to and including the last dot
+        (``$.itmInfo[*].``). Every ``produces`` entry of the producer whose
+        ``json_path`` starts with that prefix and whose ``field_name`` ends
+        (case-insensitively) in ``nm`` / ``name`` / ``label`` contributes
+        its field name, once, in declaration order. A path without any dot
+        yields an empty list.
+        """
+        parent, dot, _leaf = options_path.rpartition(".")
+        if not dot:
+            return []
+        sibling_prefix = parent + dot
+        label_suffixes = ("nm", "name", "label")
+
+        spec = self._tools.get(producer_name) or {}
+        labels: list[str] = []
+        for entry in (spec.get("metadata") or {}).get("produces") or []:
+            if not isinstance(entry, dict):
+                continue
+            if not str(entry.get("json_path") or "").startswith(sibling_prefix):
+                continue
+            candidate = str(entry.get("field_name") or "")
+            if not candidate or candidate in labels:
+                continue
+            if candidate.lower().endswith(label_suffixes):
+                labels.append(candidate)
+        return labels
+
+    def _is_chain_eligible(self, producer_name: str, *, target_tool: str) -> bool:
+        """Decide whether ``producer_name`` may join the prerequisite chain
+        of ``target_tool``.
+
+        Two Pass 2 ``ai_metadata`` signals are consulted, in this order:
+
+          1. ``canonical_action`` must be ``search`` or ``read``. Anything
+             else (create/update/delete/action) performs side effects and
+             is rejected.
+          2. ``primary_resource`` — or its prefix before the first ``_`` —
+             must belong to the target's domain set: the target's own
+             ``primary_resource`` (plus its prefix) and the prefix of every
+             consume's ``semantic_tag`` (``product_id`` ⇒ ``product``).
+
+        Sparse metadata passes through: a producer without an action or
+        without a resource is eligible, and so is any producer when the
+        target yields an empty domain set. Operators who have not enriched
+        the graph keep the previous behaviour; filtering starts once the
+        metadata is present.
+        """
+
+        def _stem(tag: str) -> str:
+            # Text before the first underscore; the whole tag if there is none.
+            return tag.split("_", 1)[0]
+
+        producer_meta = (self._tools.get(producer_name) or {}).get("metadata") or {}
+        producer_ai = producer_meta.get("ai_metadata") or {}
+
+        action = str(producer_ai.get("canonical_action") or "").strip().lower()
+        if not action:
+            return True
+        if action != "search" and action != "read":
+            return False
+
+        resource = str(producer_ai.get("primary_resource") or "").strip().lower()
+        if not resource:
+            return True
+
+        target_meta = (self._tools.get(target_tool) or {}).get("metadata") or {}
+        target_ai = target_meta.get("ai_metadata") or {}
+        target_resource = str(target_ai.get("primary_resource") or "").strip().lower()
+
+        domain: set[str] = set()
+        if target_resource:
+            domain.update((target_resource, _stem(target_resource)))
+        for consume in target_meta.get("consumes") or []:
+            if isinstance(consume, dict):
+                tag = str(consume.get("semantic_tag") or "").strip().lower()
+                if tag:
+                    domain.add(_stem(tag))
+
+        if not domain:
+            return True
+        return not domain.isdisjoint((resource, _stem(resource)))
+
+    def _rank_producers(
+        self,
+        candidates: list[str],
+        *,
+        target_tool: str,
+        entities: dict[str, Any],
+    ) -> list[str]:
+        """Rank candidates by Pass 2 metadata signals.
+
+        Order:
+          1. Entity affinity — producer consumes a field the user already
+             supplied (so the chain actually uses user input).
+          2. Pair hint — target's ``pairs_well_with`` names this producer.
+          3. Action preference — ``search`` > ``read`` > ``action`` > others
+             as a prerequisite role.
+        Ties fall back to insertion order (stable sort).
+
+        No hardcoded names / regexes. Every signal is a per-tool Pass 2
+        field the LLM filled at ingest time.
+
+        Returns a new list; ``candidates`` itself is not modified. Malformed
+        (non-dict) ``consumes`` entries are ignored, matching how the other
+        helpers of this class treat them.
+        """
+        target_meta = (self._tools.get(target_tool) or {}).get("metadata") or {}
+        target_ai = target_meta.get("ai_metadata") or {}
+        pair_names = {
+            str(p.get("tool") or "").strip()
+            for p in (target_ai.get("pairs_well_with") or [])
+            if isinstance(p, dict)
+        }
+        pair_names.discard("")
+        entity_keys = {str(k) for k in (entities or {})}
+
+        action_score = {"search": 3, "read": 2, "action": 1}
+
+        def _score(name: str) -> tuple[int, int, int]:
+            tool = self._tools.get(name) or {}
+            meta = tool.get("metadata") or {}
+            ai = meta.get("ai_metadata") or {}
+
+            affinity = 0
+            for c in meta.get("consumes") or []:
+                # A stray string / ``None`` entry used to raise
+                # AttributeError here and abort the whole ranking.
+                if not isinstance(c, dict):
+                    continue
+                tag = c.get("semantic_tag") or ""
+                fname = c.get("field_name") or ""
+                if (tag and tag in entity_keys) or (fname and fname in entity_keys):
+                    affinity += 1
+
+            pair_bonus = 1 if name in pair_names else 0
+            action = str(ai.get("canonical_action") or "").strip().lower()
+            return (affinity, pair_bonus, action_score.get(action, 0))
+
+        # Python's sort is stable; higher score wins, ties keep insertion order.
+        return sorted(candidates, key=_score, reverse=True)
+
+    def _response_root_keys(self, tool_name: str) -> list[str]:
+        """Top-level keys of the tool's response, taken from ``produces``.
+
+        Each ``produces[].json_path`` (e.g. ``$.searchDataList[*].goodsNo``)
+        contributes its first dotted segment (``searchDataList``). Used by
+        PlanRunner as a schema hint for envelope detection — when the
+        actual response is missing every hint at root but a single nested
+        dict contains them, the wrapper is peeled away.
+
+        Keys are returned de-duplicated, in first-seen order. Entries that
+        are not dicts, or whose path has no usable head, are skipped.
+        """
+        tool = self._tools.get(tool_name) or {}
+        produces = (tool.get("metadata") or {}).get("produces") or []
+        heads = (
+            _jsonpath_head(p.get("json_path") or "")
+            # Guard against malformed entries — ``"junk".get`` would raise.
+            for p in produces
+            if isinstance(p, dict)
+        )
+        # dict.fromkeys keeps insertion order, giving an ordered de-dupe.
+        return list(dict.fromkeys(head for head in heads if head))
+
+    def _producer_jsonpath(
+        self,
+        producer: str,
+        semantic: str,
+        field_name: str,
+    ) -> str:
+        """Return a dotted path under the producer's response that yields
+        the desired field. Converts ``$.a.b[*].c`` → ``a.b[0].c`` (v1 picks
+        the first array element when a wildcard is present).
+
+        The ``produces`` entry is chosen by ``semantic_tag`` first and by
+        ``field_name`` second. Falls back to ``body.<field_name>`` (or plain
+        ``body``) when no entry matches OR the matching entry has no usable
+        ``json_path`` — returning ``""`` there would make the caller emit
+        the malformed binding ``${<producer>.}``.
+        """
+        tool = self._tools.get(producer) or {}
+        produces = [
+            p
+            for p in (tool.get("metadata") or {}).get("produces") or []
+            if isinstance(p, dict)  # tolerate malformed entries
+        ]
+        match = None
+        if semantic:
+            match = next(
+                (p for p in produces if p.get("semantic_tag") == semantic),
+                None,
+            )
+        if match is None and field_name:
+            match = next(
+                (p for p in produces if p.get("field_name") == field_name),
+                None,
+            )
+
+        raw = (match or {}).get("json_path") or ""
+        path = _normalize_jsonpath_for_binding(raw) if raw else ""
+        if path:
+            return path
+        return f"body.{field_name}" if field_name else "body"
+
+    def _lookup_context_default(
+        self,
+        semantic: str,
+        field_name: str,
+    ) -> Any | None:
+        """Fetch the operator-registered context default for a consume field.
+
+        Keys are tried in the same order as ``_match_entity``: the semantic
+        tag (Pass 2 canonical id) first, the raw field name (Pass 1) second.
+        ``None`` means nothing is registered under either key, or no
+        defaults are configured at all.
+        """
+        defaults = self._context_defaults
+        if not defaults:
+            return None
+        for key in (semantic, field_name):
+            if key and key in defaults:
+                return defaults[key]
+        return None
+
+    def _match_entity(
+        self,
+        entities: dict[str, Any],
+        semantic: str,
+        field_name: str,
+    ) -> Any | None:
+        """Return the user-supplied entity stored under the semantic tag, or
+        failing that under the field name; ``None`` when neither key exists.
+        """
+        for key in (semantic, field_name):
+            if key and key in entities:
+                return entities[key]
+        return None
+
+    def _rewrite_tool_refs(
+        self,
+        value: Any,
+        steps_by_tool: dict[str, _PartialStep],
+    ) -> Any:
+        """Recursively rewrite ``${<tool_name>.<path>}`` → ``${sN.<path>}``.
+
+        Dicts and lists are rebuilt with every value rewritten; non-string
+        leaves and strings that are not a single full ``${...}`` binding are
+        returned unchanged, as are bindings whose head is not a known tool
+        (e.g. ``${user_input.<field>}``).
+
+        Tool names may themselves contain dots — ``ingest_openapi`` with
+        ``on_conflict="prefix"`` registers ``<source_label>.<name>`` — so
+        when the first dotted segment is not a known tool, the longest
+        registered tool name that prefixes the binding is used instead.
+        """
+        if isinstance(value, dict):
+            return {k: self._rewrite_tool_refs(v, steps_by_tool) for k, v in value.items()}
+        if isinstance(value, list):
+            return [self._rewrite_tool_refs(v, steps_by_tool) for v in value]
+        if not isinstance(value, str):
+            return value
+        # Only rewrite full-string bindings that we inserted. Entities
+        # supplied by the caller are left alone (no ${...} wrapping).
+        if not (value.startswith("${") and value.endswith("}")):
+            return value
+        inner = value[2:-1]
+        head, _, tail = inner.partition(".")
+        if head not in steps_by_tool:
+            # First segment is not a tool: look for a dotted tool name.
+            # The first-segment match above keeps precedence so existing
+            # bindings resolve exactly as before.
+            dotted = [
+                name
+                for name in steps_by_tool
+                if name and (inner == name or inner.startswith(f"{name}."))
+            ]
+            if not dotted:
+                return value
+            head = max(dotted, key=len)
+            tail = inner[len(head) + 1 :]
+        step_id = steps_by_tool[head].step_id
+        rest = f".{tail}" if tail else ""
+        return f"${{{step_id}{rest}}}"
+
+
+def _jsonpath_head(raw: str) -> str:
+    """Name of the first segment of a JSONPath.
+
+    One leading ``$`` and one leading ``.`` are dropped, then the text is
+    cut at the first ``.`` or ``[``.
+
+    ``$.payload.searchDataList[*].goodsNo`` → ``"payload"``.
+    ``$.totalCount`` → ``"totalCount"``.
+    Empty input, or a path that begins with a separator after stripping
+    (``$[*].x``, ``$..x``), gives ``""``.
+    """
+    path = raw or ""
+    for lead in ("$", "."):
+        if path.startswith(lead):
+            path = path[1:]
+    # Text before the first "." and, within that, before the first "[".
+    before_dot = path.partition(".")[0]
+    return before_dot.partition("[")[0]
+
+
+def _normalize_jsonpath_for_binding(raw: str) -> str:
+    """Turn a JSONPath into the dotted form used inside ``${...}`` bindings.
+
+    ``$.body.goods[*].goodsNo`` → ``body.goods[0].goodsNo``: one leading
+    ``$`` and one leading ``.`` are removed and every ``[*]`` wildcard
+    becomes ``[0]``. Empty input gives ``""``.
+
+    v1 always picks index 0 for arrays. Fan-out is v2 (design §11.1).
+    """
+    if not raw:
+        return ""
+    trimmed = raw[1:] if raw.startswith("$") else raw
+    trimmed = trimmed[1:] if trimmed.startswith(".") else trimmed
+    return "[0]".join(trimmed.split("[*]"))
+
+
+# Public API of this module: the synthesizer class plus the exception types
+# it raises, so callers can import and catch them by name.
+__all__ = [
+    "PathSynthesizer",
+    "PlanSynthesisError",
+    "UnsatisfiableFieldError",
+    "CyclicDependencyError",
+    "MaxDepthExceededError",
+    "DynamicOptionRequired",
+]
diff --git a/graph_tool_call/serialization.py b/graph_tool_call/serialization.py
index cfa56ea..81e56b6 100644
--- a/graph_tool_call/serialization.py
+++ b/graph_tool_call/serialization.py
@@ -52,7 +52,10 @@ def save_graph(
     path = Path(path)
     try:
         path.parent.mkdir(parents=True, exist_ok=True)
-        path.write_text(json.dumps(data, indent=2, ensure_ascii=False, default=str))
+        path.write_text(
+            json.dumps(data, indent=2, ensure_ascii=False, default=str),
+            encoding="utf-8",
+        )
     except PermissionError:
         msg = f"Permission denied: {path}. Check directory permissions."
         raise PermissionError(msg) from None
diff --git a/graph_tool_call/tool_graph.py b/graph_tool_call/tool_graph.py
index e784ded..e415368 100644
--- a/graph_tool_call/tool_graph.py
+++ b/graph_tool_call/tool_graph.py
@@ -16,7 +16,7 @@
 from graph_tool_call.core.protocol import GraphEngine
 from graph_tool_call.core.tool import ToolSchema, normalize_tool, parse_tool
 from graph_tool_call.ontology.builder import OntologyBuilder
-from graph_tool_call.ontology.schema import RelationType
+from graph_tool_call.ontology.schema import Confidence, RelationType
 
 
 def _encode_spec_url(base: str, raw_url: str) -> str:
@@ -289,6 +289,9 @@ def ingest_openapi(
         min_confidence: float = 0.7,
         allow_private_hosts: bool = False,
         max_response_bytes: int = 5_000_000,
+        source_label: str | None = None,
+        on_conflict: str = "overwrite",
+        relink_existing: bool = True,
     ) -> list[ToolSchema]:
         """Ingest an OpenAPI/Swagger spec, register tools, and auto-detect relations.
 
@@ -304,11 +307,29 @@ def ingest_openapi(
             If True (default), run automatic dependency detection.
         min_confidence:
             Minimum confidence threshold for detected relations.
+        source_label:
+            Optional origin tag stored on each tool's ``metadata["source_label"]``.
+            Enables :meth:`list_sources` / :meth:`remove_source` and is used
+            to derive the namespace prefix when ``on_conflict="prefix"``.
+        on_conflict:
+            How to handle a name collision with an already-registered tool.
+
+            - ``"overwrite"`` (default): replace the existing tool.
+            - ``"prefix"``: rename incoming as ``{source_label}.{name}`` (or
+              ``incoming.{name}`` if no label provided). Subsequent collisions
+              after prefixing fall back to ``overwrite``.
+            - ``"skip"``: keep the existing tool, drop the incoming one.
+            - ``"error"``: raise ``ValueError`` on the first collision.
+        relink_existing:
+            When True (default), after adding the new batch, dependency
+            detection is re-run across **new ↔ existing** tools so that
+            cross-source edges are discovered. Has no effect when this is
+            the first ingest or ``detect_dependencies=False``.
 
         Returns
         -------
         list[ToolSchema]
-            The ingested tool schemas.
+            The ingested tool schemas (with any prefix-rename applied).
         """
         from graph_tool_call.ingest.openapi import ingest_openapi
 
@@ -319,13 +340,16 @@ def ingest_openapi(
             allow_private_hosts=allow_private_hosts,
             max_response_bytes=max_response_bytes,
         )
-        self._register_tools_batch(
+        registered = self._register_tools_batch(
             tools,
             detect_dependencies=detect_dependencies,
             min_confidence=min_confidence,
             spec=spec.raw,
+            source_label=source_label,
+            on_conflict=on_conflict,
+            relink_existing=relink_existing,
         )
-        return tools
+        return registered
 
     def ingest_mcp_tools(
         self,
@@ -464,9 +488,27 @@ def add_relation(
         target: str,
         relation: str | RelationType,
         weight: float = 1.0,
+        *,
+        confidence: str | Confidence | None = None,
+        conf_score: float | None = None,
+        layer: int | None = None,
+        evidence: str | None = None,
     ) -> None:
-        """Add a relation between two tools."""
-        self._builder.add_relation(source, target, relation, weight)
+        """Add a relation between two tools.
+
+        Optional graphify-style attrs are forwarded to ``OntologyBuilder``;
+        see ``OntologyBuilder.add_relation`` for semantics.
+        """
+        self._builder.add_relation(
+            source,
+            target,
+            relation,
+            weight,
+            confidence=confidence,
+            conf_score=conf_score,
+            layer=layer,
+            evidence=evidence,
+        )
         self._invalidate_retrieval()
 
     def add_domain(self, domain: str, description: str = "") -> None:
@@ -923,33 +965,92 @@ def _register_tools_batch(
         detect_dependencies: bool = True,
         min_confidence: float = 0.7,
         spec: dict | None = None,
-    ) -> None:
+        source_label: str | None = None,
+        on_conflict: str = "overwrite",
+        relink_existing: bool = True,
+    ) -> list[ToolSchema]:
         """Register tools, assign categories, and detect dependencies.
 
         Shared logic for ingest_openapi, ingest_mcp_tools, and ingest_functions.
+        Returns the list of tools that were actually registered (after any
+        conflict-driven rename or skip).
         """
+        had_existing = bool(self._tools)
+        registered: list[ToolSchema] = []
         categories_seen: set[str] = set()
+
         for tool in tools:
-            self._tools[tool.name] = tool
-            self._builder.add_tool(tool)
-            if tool.domain:
-                if tool.domain not in categories_seen:
-                    if not self._graph.has_node(tool.domain):
-                        self._builder.add_category(tool.domain)
-                    categories_seen.add(tool.domain)
-                self._builder.assign_category(tool.name, tool.domain)
-
-        if detect_dependencies and len(tools) >= 2:
+            resolved = self._resolve_conflict(tool, on_conflict, source_label)
+            if resolved is None:
+                continue
+            if source_label:
+                resolved.metadata["source_label"] = source_label
+            self._tools[resolved.name] = resolved
+            self._builder.add_tool(resolved)
+            if resolved.domain:
+                if resolved.domain not in categories_seen:
+                    if not self._graph.has_node(resolved.domain):
+                        self._builder.add_category(resolved.domain)
+                    categories_seen.add(resolved.domain)
+                self._builder.assign_category(resolved.name, resolved.domain)
+            registered.append(resolved)
+
+        if detect_dependencies and registered:
             from graph_tool_call.analyze.dependency import detect_dependencies as _detect
 
-            kwargs: dict = {"min_confidence": min_confidence}
-            if spec:
-                kwargs["spec"] = spec
-            relations = _detect(tools, **kwargs)
-            for rel in relations:
-                self._builder.add_relation(rel.source, rel.target, rel.relation_type)
+            # Scope of detection:
+            #   - First ingest, or relink disabled  → only the new batch.
+            #   - Incremental + relink_existing     → union of new + all existing,
+            #     so cross-source edges (e.g. order.* ↔ claim.*) are discovered.
+            if had_existing and relink_existing and len(self._tools) >= 2:
+                scope = list(self._tools.values())
+            else:
+                scope = registered
+
+            if len(scope) >= 2:
+                kwargs: dict = {"min_confidence": min_confidence}
+                if spec:
+                    kwargs["spec"] = spec
+                relations = _detect(scope, **kwargs)
+                for rel in relations:
+                    self._builder.add_relation(rel.source, rel.target, rel.relation_type)
 
         self._invalidate_retrieval()
+        return registered
+
+    def _resolve_conflict(
+        self,
+        tool: ToolSchema,
+        on_conflict: str,
+        source_label: str | None,
+    ) -> ToolSchema | None:
+        """Apply the *on_conflict* policy. Returns the tool to register, or None to skip.
+
+        Mutates ``tool.name`` when prefix-renaming.
+        """
+        if tool.name not in self._tools:
+            return tool
+
+        if on_conflict == "overwrite":
+            return tool
+        if on_conflict == "skip":
+            return None
+        if on_conflict == "error":
+            raise ValueError(
+                f"Tool '{tool.name}' already exists "
+                f"(on_conflict='error', incoming source_label={source_label!r})"
+            )
+        if on_conflict == "prefix":
+            prefix = source_label or "incoming"
+            new_name = f"{prefix}.{tool.name}"
+            # If the prefixed name also collides, fall through to overwrite —
+            # the caller has already chosen prefix as the deconfliction strategy.
+            tool.name = new_name
+            return tool
+        raise ValueError(
+            f"Unknown on_conflict policy: {on_conflict!r} "
+            "(expected 'overwrite' | 'prefix' | 'skip' | 'error')"
+        )
 
     # --- from_url ---
 
@@ -1167,6 +1268,60 @@ def apply_conflicts(self, conflicts: list | None = None, *, min_confidence: floa
             self._invalidate_retrieval()
         return added
 
+    # --- source management (incremental ingest) ---
+
+    def list_sources(self) -> list[str]:
+        """Return distinct ``source_label`` values across all registered tools."""
+        seen: dict[str, None] = {}
+        for tool in self._tools.values():
+            label = tool.metadata.get("source_label") if tool.metadata else None
+            if label and label not in seen:
+                seen[label] = None
+        return list(seen.keys())
+
+    def tools_by_source(self, source_label: str) -> list[ToolSchema]:
+        """Return all tools tagged with the given ``source_label``."""
+        return [
+            t
+            for t in self._tools.values()
+            if t.metadata and t.metadata.get("source_label") == source_label
+        ]
+
+    def remove_source(self, source_label: str) -> int:
+        """Remove every tool tagged with *source_label* and its incident edges.
+
+        Returns the number of tools removed.
+        """
+        victims = [t.name for t in self.tools_by_source(source_label)]
+        for name in victims:
+            self._tools.pop(name, None)
+            if self._graph.has_node(name):
+                self._graph.remove_node(name)
+        if victims:
+            self._invalidate_retrieval()
+        return len(victims)
+
+    def relink(self, *, min_confidence: float = 0.7) -> int:
+        """Re-run dependency detection across all currently registered tools.
+
+        New relations are added to the existing graph. Existing edges are
+        preserved (the underlying graph engine deduplicates edges by
+        ``(source, target, relation)``).
+
+        Returns the number of detected relations applied (including
+        previously known ones — use this as an upper bound, not a delta).
+        """
+        if len(self._tools) < 2:
+            return 0
+        from graph_tool_call.analyze.dependency import detect_dependencies as _detect
+
+        relations = _detect(list(self._tools.values()), min_confidence=min_confidence)
+        for rel in relations:
+            self._builder.add_relation(rel.source, rel.target, rel.relation_type)
+        if relations:
+            self._invalidate_retrieval()
+        return len(relations)
+
     def analyze(
         self,
         *,
@@ -1397,17 +1552,28 @@ def search_tools(query: str, top_k: int | None = None) -> str:
             """Search available tools by natural language query.
 
             Use this FIRST to find which tools are available for the task.
-            Returns tool names, descriptions, and required parameters.
+            Returns tool names, descriptions, required parameters, and
+            **dependency hints** (``prerequisites`` for tools that must be
+            called first, ``relations`` for tools used together or in order).
+
+            Planning rule:
+              - Pick the single tool that best matches the user's goal.
+              - If its ``prerequisites`` are non-empty, call those first and
+                feed their results into the target tool's arguments.
+              - ``relations`` with type=precedes/requires imply call order.
 
             Args:
                 query: Natural language search query (e.g. "add numbers", "get weather")
                 top_k: Max number of results (optional)
             """
             k = top_k if top_k is not None else default_top_k
-            results = graph_ref.retrieve(query, top_k=k)
+            # Use retrieve_with_scores so the relations/prerequisites populated by
+            # _enrich_relations survive; retrieve() returns only ToolSchema and drops them.
+            results = graph_ref.retrieve_with_scores(query, top_k=k)
 
             matched = []
-            for schema in results:
+            for result in results:
+                schema = result.tool
                 entry: dict[str, Any] = {
                     "name": schema.name,
                     "description": (schema.description or "")[:200],
@@ -1422,6 +1588,22 @@ def search_tools(query: str, top_k: int | None = None) -> str:
                         }
                         for p in schema.parameters
                     ]
+                # Dependency / ordering hints from graph edges.
+                # prerequisites: REQUIRES targets not in the result set — LLM
+                # should call these first. relations: edges among result set
+                # members, carrying human-readable hint strings.
+                if result.prerequisites:
+                    entry["prerequisites"] = list(result.prerequisites)
+                if result.relations:
+                    entry["relations"] = [
+                        {
+                            "target": rel.target,
+                            "type": rel.type,
+                            "direction": rel.direction,
+                            "hint": rel.hint,
+                        }
+                        for rel in result.relations
+                    ]
                 matched.append(entry)
 
             output = {
@@ -1430,8 +1612,10 @@ def search_tools(query: str, top_k: int | None = None) -> str:
                 "total_tools": len(graph_ref._tools),
                 "tools": matched,
                 "hint": (
-                    "Use call_tool to execute a tool. "
-                    "Pass tool_name and arguments as a dict matching the parameters above."
+                    "Pick ONE tool matching the user's goal. If its "
+                    "'prerequisites' list is non-empty, call those tools "
+                    "first and use their results to fill the target tool's "
+                    "arguments. Then call_tool the target."
                 ),
             }
             return json.dumps(output, ensure_ascii=False, indent=2)
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..ff68b3a
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1,5 @@
+"""Internal scripts package — referenced by tests/test_release_script.py.
+
+Empty marker so Python treats ``scripts/`` as an importable package.
+Not included in the published wheel (see ``pyproject.toml`` ``packages``).
+"""
diff --git a/tests/test_dependency_verbs.py b/tests/test_dependency_verbs.py
new file mode 100644
index 0000000..756e8a4
--- /dev/null
+++ b/tests/test_dependency_verbs.py
@@ -0,0 +1,45 @@
+"""Unit tests for ``graph_tool_call.analyze.dependency`` verb mapping.
+
+Checks in particular that the 'reg' abbreviation maps to the 'write' intent (review 🟢 item).
+"""
+
+from __future__ import annotations
+
+from graph_tool_call.analyze.dependency import _VERB_TO_INTENT
+
+
+def test_reg_abbrev_maps_to_write():
+    """'reg' must also map to write, for camelCase abbreviations like ``regGoodsApprove``."""
+    assert _VERB_TO_INTENT.get("reg") == "write"
+
+
+def test_register_full_form_still_maps_to_write():
+    assert _VERB_TO_INTENT.get("register") == "write"
+    assert _VERB_TO_INTENT.get("regist") == "write"
+
+
+def test_basic_verbs_unchanged():
+    """Guard against regressions in the existing verb mapping."""
+    assert _VERB_TO_INTENT.get("get") == "read"
+    assert _VERB_TO_INTENT.get("create") == "write"
+    assert _VERB_TO_INTENT.get("update") == "update"
+    assert _VERB_TO_INTENT.get("delete") == "delete"
+
+
+# ─── _ANNOTATION_BY_VERB sibling consistency (latent defect) ──
+
+
+def test_annotation_by_verb_covers_register_family():
+    """``_ANNOTATION_BY_VERB`` must also cover the register family (sibling of _VERB_TO_INTENT).
+
+    Tools such as ``registerUser`` / ``insertOrder`` / ``regGoodsApprove`` must be
+    able to receive MCP annotations (read_only_hint=False, ...).
+    """
+    from graph_tool_call.core.tool import _ANNOTATION_BY_VERB
+
+    for verb in ("register", "regist", "reg", "insert"):
+        assert verb in _ANNOTATION_BY_VERB, (
+            f"verb {verb!r} 누락 — _VERB_TO_INTENT 와 sibling vocabulary 불일치"
+        )
+        assert _ANNOTATION_BY_VERB[verb].read_only_hint is False
+        assert _ANNOTATION_BY_VERB[verb].destructive_hint is False
diff --git a/tests/test_gateway_xgen_workflow.py b/tests/test_gateway_xgen_workflow.py
index d9a0f91..8517b0f 100644
--- a/tests/test_gateway_xgen_workflow.py
+++ b/tests/test_gateway_xgen_workflow.py
@@ -473,7 +473,10 @@ def api_get_customer_info(customer_id: str) -> str:
 
 @tool
 def api_submit_approval(document_id: str, action: str) -> str:
-    """결재: 문서 결재를 승인 또는 반려합니다. Approve or reject a document in the approval workflow."""
+    """결재: 문서 결재를 승인 또는 반려합니다.
+
+    Approve or reject a document in the approval workflow.
+    """
     return json.dumps({"document_id": document_id, "action": action, "result": "processed"})
 
 
diff --git a/tests/test_io_contract.py b/tests/test_io_contract.py
new file mode 100644
index 0000000..865b646
--- /dev/null
+++ b/tests/test_io_contract.py
@@ -0,0 +1,171 @@
+"""Unit tests for ``graph_tool_call.ingest.io_contract``.
+
+Focuses on enum extraction for query/path parameters (missed in an earlier review).
+"""
+
+from __future__ import annotations
+
+from graph_tool_call.ingest.io_contract import (
+    extract_consumes_for_operation,
+    extract_leaves,
+    extract_produces_for_operation,
+)
+
+# ─── extract_leaves ──
+
+
+def test_extract_leaves_object_with_primitives():
+    schema = {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+        },
+        "required": ["name"],
+    }
+    leaves = extract_leaves(schema, base_path="$")
+    by_name = {leaf.field_name: leaf for leaf in leaves}
+    assert by_name["name"].required is True
+    assert by_name["name"].field_type == "string"
+    assert by_name["age"].required is False
+
+
+def test_extract_leaves_array_of_objects():
+    schema = {
+        "type": "array",
+        "items": {
+            "type": "object",
+            "properties": {"id": {"type": "string"}},
+        },
+    }
+    leaves = extract_leaves(schema, base_path="$.body")
+    paths = {leaf.json_path for leaf in leaves}
+    assert any("[*]" in p for p in paths), "array → [*] wildcard 경로"
+
+
+def test_extract_leaves_captures_enum():
+    schema = {
+        "type": "object",
+        "properties": {
+            "status": {"type": "string", "enum": ["pending", "shipped"]},
+        },
+    }
+    leaves = extract_leaves(schema, base_path="$")
+    status = next(leaf for leaf in leaves if leaf.field_name == "status")
+    assert status.enum == ["pending", "shipped"]
+
+
+# ─── consumes — enum extraction regression (review 🟢 item) ──
+
+
+def test_query_param_enum_extracted_openapi3():
+    """An OpenAPI 3.x query param's schema.enum must end up in FieldLeaf.enum."""
+    operation = {
+        "parameters": [
+            {
+                "name": "sort",
+                "in": "query",
+                "required": True,
+                "schema": {"type": "string", "enum": ["asc", "desc"]},
+            },
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation)
+    by_name = {leaf.field_name: leaf for leaf in leaves}
+    assert "sort" in by_name
+    assert by_name["sort"].enum == ["asc", "desc"]
+
+
+def test_query_param_enum_extracted_swagger2():
+    """A Swagger 2.0 query param's parameter-level enum must be captured too."""
+    operation = {
+        "parameters": [
+            {
+                "name": "type",
+                "in": "query",
+                "required": True,
+                "type": "string",
+                "enum": ["A", "B", "C"],
+            },
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation, is_swagger2=True)
+    type_leaf = next(leaf for leaf in leaves if leaf.field_name == "type")
+    assert type_leaf.enum == ["A", "B", "C"]
+
+
+def test_path_param_enum_extracted():
+    """Same for a path param's enum."""
+    operation = {
+        "parameters": [
+            {
+                "name": "kind",
+                "in": "path",
+                "required": True,
+                "schema": {"type": "string", "enum": ["x", "y"]},
+            },
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation)
+    kind = next(leaf for leaf in leaves if leaf.field_name == "kind")
+    assert kind.enum == ["x", "y"]
+
+
+def test_param_without_enum_has_empty_list():
+    """A param without an enum must get enum=[] (not None)."""
+    operation = {
+        "parameters": [
+            {"name": "page", "in": "query", "schema": {"type": "integer"}},
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation, required_only=False)
+    page = next(leaf for leaf in leaves if leaf.field_name == "page")
+    assert page.enum == []
+
+
+# ─── produces ──
+
+
+def test_extract_produces_walks_response_body():
+    operation = {
+        "responses": {
+            "200": {
+                "content": {
+                    "application/json": {
+                        "schema": {
+                            "type": "object",
+                            "properties": {
+                                "data": {
+                                    "type": "object",
+                                    "properties": {
+                                        "id": {"type": "string"},
+                                    },
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+        },
+    }
+    leaves = extract_produces_for_operation(operation)
+    paths = {leaf.json_path for leaf in leaves}
+    assert "$.data.id" in paths
+
+
+def test_consumes_skips_optional_when_required_only():
+    operation = {
+        "parameters": [
+            {"name": "must", "in": "query", "required": True, "schema": {"type": "string"}},
+            {"name": "maybe", "in": "query", "required": False, "schema": {"type": "string"}},
+        ],
+        "responses": {"200": {"description": "OK"}},
+    }
+    leaves = extract_consumes_for_operation(operation)
+    names = {leaf.field_name for leaf in leaves}
+    assert "must" in names
+    assert "maybe" not in names
diff --git a/tests/test_plan_binding.py b/tests/test_plan_binding.py
new file mode 100644
index 0000000..eee0ae9
--- /dev/null
+++ b/tests/test_plan_binding.py
@@ -0,0 +1,71 @@
+"""Unit tests for ``graph_tool_call.plan.binding``.
+
+Binding placeholder resolution and error behavior.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from graph_tool_call.plan.binding import BindingError, resolve_bindings
+
+
+def test_literal_passes_through():
+    assert resolve_bindings("hello", {}) == "hello"
+    assert resolve_bindings(42, {}) == 42
+    assert resolve_bindings(None, {}) is None
+
+
+def test_simple_lookup():
+    ctx = {"s1": {"foo": "BAR"}}
+    assert resolve_bindings("${s1.foo}", ctx) == "BAR"
+
+
+def test_full_step_object():
+    ctx = {"s1": {"a": 1, "b": 2}}
+    assert resolve_bindings("${s1}", ctx) == {"a": 1, "b": 2}
+
+
+def test_array_index():
+    ctx = {"s1": {"items": [{"id": "A"}, {"id": "B"}]}}
+    assert resolve_bindings("${s1.items[0].id}", ctx) == "A"
+    assert resolve_bindings("${s1.items[1].id}", ctx) == "B"
+
+
+def test_array_negative_index():
+    ctx = {"s1": [10, 20, 30]}
+    assert resolve_bindings("${s1[-1]}", ctx) == 30
+
+
+def test_unknown_source_raises():
+    with pytest.raises(BindingError, match="unknown source"):
+        resolve_bindings("${ghost.x}", {"s1": {}})
+
+
+def test_dict_walks_recursively():
+    ctx = {"s1": {"v": 9}}
+    out = resolve_bindings(
+        {"a": "${s1.v}", "b": "literal", "nested": {"c": "${s1.v}"}},
+        ctx,
+    )
+    assert out == {"a": 9, "b": "literal", "nested": {"c": 9}}
+
+
+def test_list_walks_recursively():
+    ctx = {"s1": {"v": "X"}}
+    out = resolve_bindings(["${s1.v}", "lit", {"k": "${s1.v}"}], ctx)
+    assert out == ["X", "lit", {"k": "X"}]
+
+
+def test_oob_index_raises():
+    ctx = {"s1": [1, 2]}
+    with pytest.raises(BindingError, match="out of range"):
+        resolve_bindings("${s1[5]}", ctx)
+
+
+def test_input_alias_lookup():
+    """Case where the caller registers input and user_input to point at the same value."""
+    shared = {"keyword": "shoes"}
+    ctx = {"input": shared, "user_input": shared}
+    assert resolve_bindings("${input.keyword}", ctx) == "shoes"
+    assert resolve_bindings("${user_input.keyword}", ctx) == "shoes"
diff --git a/tests/test_plan_runner.py b/tests/test_plan_runner.py
new file mode 100644
index 0000000..a4cf216
--- /dev/null
+++ b/tests/test_plan_runner.py
@@ -0,0 +1,217 @@
+"""Unit tests for ``graph_tool_call.plan.runner``.
+
+Regression guards for review CRITICAL #1 and #2, plus coverage of core behavior.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from graph_tool_call.plan import (
+    Plan,
+    PlanRunner,
+    PlanStep,
+)
+from graph_tool_call.plan.runner import (
+    PlanAborted,
+    PlanCompleted,
+)
+
+
+def _echo(name: str, args: dict[str, Any]) -> dict[str, Any]:
+    return {"echoed": args, "tool": name}
+
+
+# ─── CRITICAL #1: input_context resolves both ${user_input.x} and ${input.x} ──
+
+
+def test_user_input_alias_resolves():
+    """``${user_input.foo}`` must resolve to input_context["foo"].
+
+    Previously the synthesizer emitted ${user_input.x} while the runner only seeded
+    context["input"], so the very first step aborted with a BindingError.
+    """
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="echo", args={"foo": "${user_input.foo}"}),
+        ],
+        output_binding="${s1}",
+    )
+    trace = PlanRunner(_echo).run(plan, input_context={"foo": "BAR"})
+    assert trace.success, f"plan should succeed, got: {trace.failed_step}"
+    assert trace.steps[0].args_resolved == {"foo": "BAR"}
+
+
+def test_input_alias_resolves_too():
+    """``${input.foo}`` must point at the same dict too (backward compat)."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="echo", args={"foo": "${input.foo}"}),
+        ],
+        output_binding="${s1}",
+    )
+    trace = PlanRunner(_echo).run(plan, input_context={"foo": "BAR"})
+    assert trace.success
+    assert trace.steps[0].args_resolved == {"foo": "BAR"}
+
+
+def test_mixed_input_user_input_in_same_step():
+    """Both resolve even when one step mixes ${input.x} and ${user_input.y}."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(
+                id="s1",
+                tool="echo",
+                args={"a": "${input.x}", "b": "${user_input.y}"},
+            ),
+        ],
+    )
+    trace = PlanRunner(_echo).run(plan, input_context={"x": "X", "y": "Y"})
+    assert trace.success
+    assert trace.steps[0].args_resolved == {"a": "X", "b": "Y"}
+
+
+# ─── CRITICAL #2: ExecutionTrace.steps accumulates ──
+
+
+def test_execution_trace_accumulates_steps():
+    """The ExecutionTrace.steps returned by run() must not be an empty list.
+
+    Previously it was always [] because of a ``pass`` at runner.py:289.
+    """
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="echo", args={"x": "hello"}),
+            PlanStep(id="s2", tool="echo", args={"y": "${s1.echoed.x}"}),
+        ],
+        output_binding="${s2}",
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success
+    assert len(trace.steps) == 2, "두 step 모두 trace 에 누적돼야 함"
+    assert trace.steps[0].id == "s1"
+    assert trace.steps[1].id == "s2"
+    assert trace.steps[0].output == {"echoed": {"x": "hello"}, "tool": "echo"}
+    assert trace.steps[1].args_resolved == {"y": "hello"}, "이전 step 출력 binding"
+
+
+def test_execution_trace_includes_failed_step():
+    """On failure the trace still includes the failed step and the steps before it."""
+
+    def flaky(name: str, args: dict[str, Any]) -> dict[str, Any]:
+        if name == "boom":
+            raise RuntimeError("simulated")
+        return {"ok": True}
+
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[
+            PlanStep(id="s1", tool="ok"),
+            PlanStep(id="s2", tool="boom"),
+            PlanStep(id="s3", tool="never_called"),
+        ],
+    )
+    trace = PlanRunner(flaky).run(plan)
+    assert trace.success is False
+    assert trace.failed_step == "s2"
+    assert len(trace.steps) == 2, "실패까지의 step 만 누적 (s3 는 도달 안 함)"
+    assert trace.steps[0].id == "s1"
+    assert trace.steps[0].error is None
+    assert trace.steps[1].id == "s2"
+    assert trace.steps[1].error is not None
+    assert "simulated" in trace.steps[1].error["message"]
+
+
+# ─── General behavior ──
+
+
+def test_run_stream_yields_expected_events_in_order():
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "hi"})],
+    )
+    events = list(PlanRunner(_echo).run_stream(plan))
+    types = [e.type for e in events]
+    assert types[0] == "plan.started"
+    assert types[-1] == "plan.completed"
+    assert "step.started" in types
+    assert "step.completed" in types
+
+
+def test_plan_completed_carries_trace_steps():
+    """PlanCompleted from run_stream must carry trace_steps so that run() can read them."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "hi"})],
+    )
+    completed = next(e for e in PlanRunner(_echo).run_stream(plan) if isinstance(e, PlanCompleted))
+    assert len(completed.trace_steps) == 1
+    assert completed.trace_steps[0].id == "s1"
+
+
+def test_plan_aborted_carries_trace_steps():
+    """On abort, PlanAborted must also carry the trace_steps collected so far."""
+
+    def fail(name: str, args: dict[str, Any]) -> dict[str, Any]:
+        raise RuntimeError("boom")
+
+    plan = Plan(id="t", goal="g", steps=[PlanStep(id="s1", tool="x")])
+    aborted = next(e for e in PlanRunner(fail).run_stream(plan) if isinstance(e, PlanAborted))
+    assert len(aborted.trace_steps) == 1
+    assert aborted.trace_steps[0].error is not None
+
+
+def test_binding_to_unknown_source_aborts():
+    """Referencing a nonexistent step id → BindingError → abort."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "${ghost.foo}"})],
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success is False
+    assert trace.failed_step == "s1"
+    assert trace.steps[0].error["kind"] == "binding"
+
+
+def test_output_binding_resolves_nested_path():
+    """output_binding must be able to point at a nested path inside a step's response."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"v": 42})],
+        output_binding="${s1.echoed.v}",
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success
+    assert trace.output == 42
+
+
+def test_no_input_context_works_when_plan_has_no_input_binding():
+    """Works without input_context as long as the plan does not use ${input.x}."""
+    plan = Plan(
+        id="t",
+        goal="g",
+        steps=[PlanStep(id="s1", tool="echo", args={"x": "literal"})],
+    )
+    trace = PlanRunner(_echo).run(plan)
+    assert trace.success
+
+
+def test_v1_only_supports_abort_on_error():
+    """v1 PlanRunner only accepts on_error='abort' — any other value raises ValueError."""
+    with pytest.raises(ValueError):
+        PlanRunner(_echo, on_error="continue")
diff --git a/tests/test_plan_synthesizer.py b/tests/test_plan_synthesizer.py
new file mode 100644
index 0000000..d1793b9
--- /dev/null
+++ b/tests/test_plan_synthesizer.py
@@ -0,0 +1,168 @@
+"""Unit tests for ``graph_tool_call.plan.synthesizer``.
+
+Core synthesis scenarios + the user_input placeholder output of the Cycle/F2 fallback.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from graph_tool_call.plan.synthesizer import (
+    PathSynthesizer,
+    PlanSynthesisError,
+    _normalize_field_name,
+)
+
+
+def _basic_graph() -> dict:
+    """Contains:
+    - 'searchProduct': input=keyword, output=goodsNo (semantic=goods.id)
+    - 'getProductDetail': input=goodsNo (semantic=goods.id) → depends on it
+    """
+    return {
+        "tools": {
+            "searchProduct": {
+                "metadata": {
+                    "method": "GET",
+                    "path": "/api/v1/products",
+                    "consumes": [{"field_name": "keyword", "kind": "data", "required": True}],
+                    "produces": [
+                        {
+                            "field_name": "goodsNo",
+                            "json_path": "$.body.items[*].goodsNo",
+                            "semantic_tag": "goods.id",
+                        }
+                    ],
+                    "ai_metadata": {
+                        "canonical_action": "search",
+                        "primary_resource": "product",
+                    },
+                },
+            },
+            "getProductDetail": {
+                "metadata": {
+                    "method": "GET",
+                    "path": "/api/v1/products/{goodsNo}",
+                    "consumes": [
+                        {
+                            "field_name": "goodsNo",
+                            "semantic_tag": "goods.id",
+                            "kind": "data",
+                            "required": True,
+                        }
+                    ],
+                    "produces": [{"field_name": "name", "json_path": "$.body.name"}],
+                    "ai_metadata": {
+                        "canonical_action": "read",
+                        "primary_resource": "product",
+                    },
+                },
+            },
+        },
+    }
+
+
+# ─── normalize_field_name ──
+
+
+def test_normalize_field_name_collapses_separators():
+    assert _normalize_field_name("ord_no") == "ordno"  # underscore dropped
+    assert _normalize_field_name("ORD-NO") == "ordno"  # hyphen dropped, upper case lowered
+    assert _normalize_field_name("ordNo") == "ordno"  # camelCase lowered
+
+
+def test_normalize_field_name_keeps_token_roots_distinct():
+    """ord ≠ order — no token-level synonym mapping is performed."""
+    assert _normalize_field_name("ordNo") != _normalize_field_name("orderNo")
+
+
+def test_normalize_field_name_empty():
+    assert _normalize_field_name("") == ""  # empty string stays empty
+    assert _normalize_field_name(None) == ""  # type: ignore[arg-type]
+
+
+# ─── synthesizer core behavior ──
+
+
+def test_synthesize_uses_entity_when_available():
+    """If the user supplied keyword as an entity, a single search step must suffice."""
+    syn = PathSynthesizer(_basic_graph())
+    plan = syn.synthesize(target="searchProduct", entities={"keyword": "shoes"})
+    assert len(plan.steps) == 1
+    assert plan.steps[0].tool == "searchProduct"
+    assert plan.steps[0].args == {"keyword": "shoes"}
+
+
+def test_synthesize_chains_producer_when_entity_missing():
+    """Calling getProductDetail needs goodsNo — searchProduct is its producer.
+
+    With only keyword given as an entity, the chain is searchProduct → getProductDetail.
+    After synthesis, step ids are ordered as ``s1``/``s2`` and bindings are rewritten to match.
+    """
+    syn = PathSynthesizer(_basic_graph())
+    plan = syn.synthesize(
+        target="getProductDetail",
+        entities={"keyword": "shoes"},
+    )
+    assert len(plan.steps) == 2, "검색 + 상세조회 2-step chain"
+    assert plan.steps[0].tool == "searchProduct"
+    assert plan.steps[1].tool == "getProductDetail"
+    binding = plan.steps[1].args.get("goodsNo", "")
+    # after step ids are ordered, the binding is rewritten to ${s1...} — it points at the first step's output
+    assert binding.startswith("${"), "binding placeholder 형식이어야"
+    assert "s1" in binding, f"첫 step (s1) 출력 binding 이어야, got {binding}"
+    assert "goodsNo" in binding, "produces 필드 경로 포함"
+
+
+def test_synthesize_falls_back_to_user_input_placeholder():
+    """A required field with no entity and no producer falls back to ``${user_input.X}``.
+
+    Core behavior of F2 + Cycle policy B — surface the slot to the caller instead of aborting.
+    Because the runner registers it in input_context under the ``user_input`` alias,
+    the plan itself is still synthesized and works once the caller supplies the value at run time.
+    """
+    g = {
+        "tools": {
+            "needsX": {
+                "metadata": {
+                    "consumes": [{"field_name": "mysteryField", "kind": "data", "required": True}],
+                    "produces": [],
+                    "ai_metadata": {"canonical_action": "read"},
+                },
+            },
+        },
+    }
+    syn = PathSynthesizer(g)
+    plan = syn.synthesize(target="needsX", entities={})
+    assert len(plan.steps) == 1
+    assert plan.steps[0].args == {"mysteryField": "${user_input.mysteryField}"}
+
+
+def test_synthesize_unknown_target_raises():
+    syn = PathSynthesizer(_basic_graph())
+    with pytest.raises(PlanSynthesisError):
+        syn.synthesize(target="ghostTool", entities={})  # "ghostTool" is not a tool key in _basic_graph()
+
+
+def test_synthesize_context_field_uses_collection_default():
+    """A kind=context field is filled from context_defaults when no entity is given."""
+    g = {
+        "tools": {
+            "needsLocale": {
+                "metadata": {
+                    "consumes": [
+                        {
+                            "field_name": "locale",
+                            "kind": "context",
+                            "required": True,
+                        }
+                    ],
+                    "produces": [],
+                    "ai_metadata": {"canonical_action": "read"},
+                },
+            },
+        },
+    }
+    syn = PathSynthesizer(g, context_defaults={"locale": "ko_KR"})
+    plan = syn.synthesize(target="needsLocale", entities={})
+    assert plan.steps[0].args == {"locale": "ko_KR"}