diff --git a/.gitignore b/.gitignore index b8d4330..2dceec9 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,6 @@ htmlcov/ # Personal memo memo/ + +# Benchmark output (timestamped, auto-generated) +benchmarks/results/benchmark_*.json diff --git a/benchmarks/results/benchmark_e2e_20260407_014809.json b/benchmarks/results/benchmark_e2e_20260407_014809.json deleted file mode 100644 index 3f72158..0000000 --- a/benchmarks/results/benchmark_e2e_20260407_014809.json +++ /dev/null @@ -1,740 +0,0 @@ -{ - "timestamp": "2026-04-07T01:45:56.543757+00:00", - "model": "Bonsai-8B.gguf", - "mode": "e2e", - "top_k": 5, - "datasets": [ - { - "name": "Petstore 3.0", - "tool_count": 19, - "query_count": 23, - "queries": [ - { - "query": "Find all available pets", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "findPetsByStatus" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "findPetsByStatus", - "findPetsByTags", - "getPetById", - "getOrderById", - "getInventory" - ], - "score_breakdown": {}, - "baseline_tool": "findPetsByStatus", - "baseline_correct": true, - "baseline_latency_ms": 10839.814374921843, - "baseline_input_tokens": 1873, - "retrieve_tool": "findPetsByStatus", - "retrieve_correct": true, - "retrieve_latency_ms": 2766.176208970137, - "retrieve_input_tokens": 462, - "error": null - }, - { - "query": "Add a new dog to the pet store", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "addPet" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "addPet", - "placeOrder", - "updatePetWithForm", - "getInventory", - "uploadFile" - ], - "score_breakdown": {}, - "baseline_tool": "addPet", - "baseline_correct": true, - "baseline_latency_ms": 2812.666874960996, - "baseline_input_tokens": 1877, - "retrieve_tool": null, - "retrieve_correct": false, - 
"retrieve_latency_ms": 5412.9798751091585, - "retrieve_input_tokens": 658, - "error": null - }, - { - "query": "Get pet with ID 42", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "getPetById" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "getPetById", - "getOrderById", - "getInventory", - "findPetsByStatus", - "findPetsByTags" - ], - "score_breakdown": {}, - "baseline_tool": "getPetById", - "baseline_correct": true, - "baseline_latency_ms": 1094.6545000188053, - "baseline_input_tokens": 1876, - "retrieve_tool": "getPetById", - "retrieve_correct": true, - "retrieve_latency_ms": 2944.9569159187376, - "retrieve_input_tokens": 465, - "error": null - }, - { - "query": "Update the name of my pet", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "updatePet", - "updatePetWithForm" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "updatePetWithForm", - "addPet", - "createUser", - "updatePet", - "uploadFile" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 1457.0880000246689, - "baseline_input_tokens": 1875, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 4513.7791249435395, - "retrieve_input_tokens": 774, - "error": null - }, - { - "query": "Delete pet number 7", - "category": "delete", - "difficulty": "easy", - "expected_tools": [ - "deletePet" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "deletePet", - "deleteUser", - "deleteOrder", - "getPetById", - "updatePetWithForm" - ], - "score_breakdown": {}, - "baseline_tool": "deletePet", - "baseline_correct": true, - "baseline_latency_ms": 1022.0272920560092, - "baseline_input_tokens": 1874, - "retrieve_tool": "deletePet", - 
"retrieve_correct": true, - "retrieve_latency_ms": 3215.026624966413, - "retrieve_input_tokens": 533, - "error": null - }, - { - "query": "Search pets by their tags", - "category": "read", - "difficulty": "medium", - "expected_tools": [ - "findPetsByTags" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "findPetsByTags", - "findPetsByStatus", - "getPetById", - "updatePet", - "addPet" - ], - "score_breakdown": {}, - "baseline_tool": "findPetsByTags", - "baseline_correct": true, - "baseline_latency_ms": 1254.0720420656726, - "baseline_input_tokens": 1874, - "retrieve_tool": "findPetsByTags", - "retrieve_correct": true, - "retrieve_latency_ms": 3970.8062920253724, - "retrieve_input_tokens": 630, - "error": null - }, - { - "query": "Upload a photo of my pet", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "uploadFile" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "uploadFile", - "addPet", - "updatePetWithForm", - "placeOrder", - "updatePet" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 1132.3371669277549, - "baseline_input_tokens": 1875, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 4183.0666669411585, - "retrieve_input_tokens": 750, - "error": null - }, - { - "query": "Check the store inventory", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "getInventory" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "getInventory", - "getOrderById", - "deleteOrder", - "placeOrder", - "addPet" - ], - "score_breakdown": {}, - "baseline_tool": "getInventory", - "baseline_correct": true, - "baseline_latency_ms": 891.8864169390872, - "baseline_input_tokens": 1873, - "retrieve_tool": 
"getInventory", - "retrieve_correct": true, - "retrieve_latency_ms": 3237.7606669906527, - "retrieve_input_tokens": 589, - "error": null - }, - { - "query": "Place an order to buy a pet", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "placeOrder" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "placeOrder", - "getOrderById", - "deleteOrder", - "getPetById", - "deletePet" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 3977.326374966651, - "baseline_input_tokens": 1876, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 3718.421708093956, - "retrieve_input_tokens": 551, - "error": null - }, - { - "query": "Look up order number 5", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "getOrderById" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "getOrderById", - "deleteOrder", - "placeOrder", - "getPetById", - "addPet" - ], - "score_breakdown": {}, - "baseline_tool": "getOrderById", - "baseline_correct": true, - "baseline_latency_ms": 954.2415420291945, - "baseline_input_tokens": 1875, - "retrieve_tool": "getOrderById", - "retrieve_correct": true, - "retrieve_latency_ms": 3530.3365000290796, - "retrieve_input_tokens": 613, - "error": null - }, - { - "query": "Cancel my order", - "category": "delete", - "difficulty": "easy", - "expected_tools": [ - "deleteOrder" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "deleteOrder", - "getOrderById", - "placeOrder", - "deletePet", - "deleteUser" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 927.9855410568416, - "baseline_input_tokens": 1872, - "retrieve_tool": null, - 
"retrieve_correct": false, - "retrieve_latency_ms": 3115.135750034824, - "retrieve_input_tokens": 545, - "error": null - }, - { - "query": "Create a new user account", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "createUser" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "createUser", - "createUsersWithListInput", - "addPet", - "getUserByName", - "deleteUser" - ], - "score_breakdown": {}, - "baseline_tool": "createUser", - "baseline_correct": true, - "baseline_latency_ms": 1530.5797919863835, - "baseline_input_tokens": 1874, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 7155.023292056285, - "retrieve_input_tokens": 619, - "error": null - }, - { - "query": "Sign in with username and password", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "loginUser" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "updateUser", - "loginUser", - "createUsersWithListInput", - "createUser", - "updatePetWithForm" - ], - "score_breakdown": {}, - "baseline_tool": "loginUser", - "baseline_correct": true, - "baseline_latency_ms": 1160.5052499799058, - "baseline_input_tokens": 1875, - "retrieve_tool": "loginUser", - "retrieve_correct": true, - "retrieve_latency_ms": 4270.270042004995, - "retrieve_input_tokens": 703, - "error": null - }, - { - "query": "Log out of my account", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "logoutUser" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "logoutUser", - "loginUser", - "getUserByName", - "createUser" - ], - "score_breakdown": {}, - "baseline_tool": "logoutUser", - "baseline_correct": true, - "baseline_latency_ms": 770.2793339267373, - "baseline_input_tokens": 1874, - "retrieve_tool": 
"logoutUser", - "retrieve_correct": true, - "retrieve_latency_ms": 2691.7580839945003, - "retrieve_input_tokens": 491, - "error": null - }, - { - "query": "View user profile for john123", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "getUserByName" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "getUserByName", - "loginUser", - "logoutUser", - "deleteUser", - "createUser" - ], - "score_breakdown": {}, - "baseline_tool": "getUserByName", - "baseline_correct": true, - "baseline_latency_ms": 1026.410625083372, - "baseline_input_tokens": 1877, - "retrieve_tool": "getUserByName", - "retrieve_correct": true, - "retrieve_latency_ms": 3224.2564579937607, - "retrieve_input_tokens": 560, - "error": null - }, - { - "query": "Change user email address", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "updateUser" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "createUser", - "createUsersWithListInput", - "updateUser", - "getUserByName", - "deleteUser" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 963.7468330329284, - "baseline_input_tokens": 1873, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 3086.039707995951, - "retrieve_input_tokens": 635, - "error": null - }, - { - "query": "Remove user john123", - "category": "delete", - "difficulty": "easy", - "expected_tools": [ - "deleteUser" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "deleteUser", - "getUserByName", - "createUser", - "loginUser", - "updateUser" - ], - "score_breakdown": {}, - "baseline_tool": "deleteUser", - "baseline_correct": true, - "baseline_latency_ms": 1020.6558749778196, - "baseline_input_tokens": 1875, - 
"retrieve_tool": "deleteUser", - "retrieve_correct": true, - "retrieve_latency_ms": 3887.1555000077933, - "retrieve_input_tokens": 671, - "error": null - }, - { - "query": "Create multiple user accounts at once", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "createUsersWithListInput" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "createUser", - "createUsersWithListInput", - "deleteUser", - "deletePet", - "deleteOrder" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 1033.013999927789, - "baseline_input_tokens": 1875, - "retrieve_tool": "createUsersWithListInput", - "retrieve_correct": true, - "retrieve_latency_ms": 2155.3955409908667, - "retrieve_input_tokens": 553, - "error": null - }, - { - "query": "Show me sold pets", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "findPetsByStatus" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "findPetsByStatus", - "findPetsByTags", - "getPetById", - "getInventory", - "deletePet" - ], - "score_breakdown": {}, - "baseline_tool": "findPetsByStatus", - "baseline_correct": true, - "baseline_latency_ms": 978.3695839578286, - "baseline_input_tokens": 1873, - "retrieve_tool": "findPetsByStatus", - "retrieve_correct": true, - "retrieve_latency_ms": 1738.3150000823662, - "retrieve_input_tokens": 472, - "error": null - }, - { - "query": "I want to adopt a pet — find one, check details, then buy it", - "category": "workflow", - "difficulty": "hard", - "expected_tools": [ - "findPetsByStatus", - "getPetById", - "placeOrder" - ], - "recall_at_k": 0.6666666666666666, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "findPetsByStatus", - "findPetsByTags", - "getPetById", - "getOrderById", - 
"getInventory" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 3996.3776669465005, - "baseline_input_tokens": 1885, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 4942.036333028227, - "retrieve_input_tokens": 474, - "error": null - }, - { - "query": "Update pet using form data, not JSON body", - "category": "write", - "difficulty": "hard", - "expected_tools": [ - "updatePetWithForm" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "updatePetWithForm", - "addPet", - "uploadFile", - "placeOrder", - "updatePet" - ], - "score_breakdown": {}, - "baseline_tool": "updatePetWithForm", - "baseline_correct": true, - "baseline_latency_ms": 1568.5455829370767, - "baseline_input_tokens": 1878, - "retrieve_tool": "updatePetWithForm", - "retrieve_correct": true, - "retrieve_latency_ms": 3606.6056670388207, - "retrieve_input_tokens": 753, - "error": null - }, - { - "query": "What pets are in the store?", - "category": "read", - "difficulty": "medium", - "expected_tools": [ - "findPetsByStatus" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "findPetsByStatus", - "findPetsByTags", - "getInventory", - "placeOrder", - "updatePetWithForm" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 2117.048083106056, - "baseline_input_tokens": 1876, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 6635.67674998194, - "retrieve_input_tokens": 581, - "error": null - }, - { - "query": "Remove a pet listing and also delete its order", - "category": "delete", - "difficulty": "hard", - "expected_tools": [ - "deletePet", - "deleteOrder" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - 
"retrieved_tools": [ - "deleteOrder", - "deletePet", - "deleteUser", - "placeOrder", - "getOrderById" - ], - "score_breakdown": {}, - "baseline_tool": "deletePet", - "baseline_correct": true, - "baseline_latency_ms": 1092.8001669235528, - "baseline_input_tokens": 1878, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 5173.107750015333, - "retrieve_input_tokens": 551, - "error": null - } - ], - "avg_recall_at_k": 0.9855072463768116, - "avg_mrr": 0.0, - "avg_map": 0.0, - "avg_ndcg_at_k": 0.0, - "avg_latency_ms": 0.0, - "stdev_recall": 0.0695048046856916, - "stdev_mrr": 0.0, - "ci_recall": [ - 0.9565217391304348, - 1.0 - ], - "ci_mrr": [ - 0.0, - 0.0 - ], - "miss_rate": 0.0, - "hit_rate": 1.0, - "recall_at_3": 0.0, - "recall_at_10": 0.0, - "avg_keyword_contribution": 0.0, - "avg_graph_contribution": 0.0, - "avg_embedding_contribution": 0.0, - "avg_annotation_contribution": 0.0, - "baseline_accuracy": 0.6521739130434783, - "retrieve_accuracy": 0.5652173913043478, - "avg_token_reduction": 0.6839270983265007, - "avg_baseline_latency_ms": 1896.627518206673, - "avg_retrieve_latency_ms": 3877.1341938788637, - "token_efficiency_baseline": 0.3477615746644101, - "token_efficiency_retrieve": 0.9535685469082372, - "t_statistic": -1.0, - "p_value": 0.3282693794948748 - } - ] -} \ No newline at end of file diff --git a/benchmarks/results/benchmark_e2e_20260407_015032.json b/benchmarks/results/benchmark_e2e_20260407_015032.json deleted file mode 100644 index db37683..0000000 --- a/benchmarks/results/benchmark_e2e_20260407_015032.json +++ /dev/null @@ -1,947 +0,0 @@ -{ - "timestamp": "2026-04-07T01:48:15.598411+00:00", - "model": "Bonsai-8B.gguf", - "mode": "e2e", - "top_k": 5, - "datasets": [ - { - "name": "Mixed MCP Servers", - "tool_count": 38, - "query_count": 30, - "queries": [ - { - "query": "Read the contents of config.yaml", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "read_file" - ], - "recall_at_k": 1.0, - "mrr": 0.0, 
- "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "read_file", - "read_multiple_files", - "get_file_contents", - "write_file", - "create_or_update_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "read_file", - "retrieve_correct": true, - "retrieve_latency_ms": 3737.8421670291573, - "retrieve_input_tokens": 656, - "error": null - }, - { - "query": "Write a new configuration file", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "write_file" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "write_file", - "edit_file", - "read_file", - "get_file_info", - "create_or_update_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "write_file", - "retrieve_correct": true, - "retrieve_latency_ms": 4167.178874951787, - "retrieve_input_tokens": 667, - "error": null - }, - { - "query": "List all files in the src directory", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "list_directory" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "list_directory", - "get_pull_request_files", - "directory_tree", - "push_files", - "read_multiple_files" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "list_directory", - "retrieve_correct": true, - "retrieve_latency_ms": 3458.496374893002, - "retrieve_input_tokens": 604, - "error": null - }, - { - "query": "Create the output directory", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "create_directory" - ], - 
"recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_directory", - "create_repository", - "create_branch", - "list_directory", - "directory_tree" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "create_directory", - "retrieve_correct": true, - "retrieve_latency_ms": 3271.133041009307, - "retrieve_input_tokens": 589, - "error": null - }, - { - "query": "Find all Python files in the project", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "search_files" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "push_files", - "read_multiple_files", - "search_files", - "get_pull_request_files", - "list_directory" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "search_files", - "retrieve_correct": true, - "retrieve_latency_ms": 3595.6752080237493, - "retrieve_input_tokens": 622, - "error": null - }, - { - "query": "Move the old log file to archive", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "move_file" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "move_file", - "edit_file", - "read_file", - "write_file", - "get_file_info" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "move_file", - "retrieve_correct": true, - "retrieve_latency_ms": 3558.0179590033367, - "retrieve_input_tokens": 582, - "error": null - }, - { - "query": "Check the file size and permissions", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - 
"get_file_info" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "get_file_info", - "read_file", - "write_file", - "create_or_update_file", - "get_file_contents" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "get_file_info", - "retrieve_correct": true, - "retrieve_latency_ms": 3864.9533330462873, - "retrieve_input_tokens": 663, - "error": null - }, - { - "query": "Show the directory tree structure", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "directory_tree" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "directory_tree", - "create_directory", - "list_directory", - "get_file_info", - "get_file_contents" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "directory_tree", - "retrieve_correct": true, - "retrieve_latency_ms": 3439.5341670606285, - "retrieve_input_tokens": 565, - "error": null - }, - { - "query": "Edit the import statement in main.py", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "edit_file" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "edit_file", - "update_issue", - "list_pull_requests", - "list_issues", - "write_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "edit_file", - "retrieve_correct": true, - "retrieve_latency_ms": 6146.625166991726, - "retrieve_input_tokens": 972, - "error": null - }, - { - "query": "Read multiple config files at once", - "category": "read", - "difficulty": "medium", 
- "expected_tools": [ - "read_multiple_files" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "read_multiple_files", - "push_files", - "search_files", - "get_pull_request_files", - "read_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "read_multiple_files", - "retrieve_correct": true, - "retrieve_latency_ms": 3947.7652920177206, - "retrieve_input_tokens": 619, - "error": null - }, - { - "query": "Create a new issue for the bug I found", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "create_issue" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_issue", - "create_repository", - "create_branch", - "create_directory", - "create_pull_request" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 8379.22374997288, - "retrieve_input_tokens": 760, - "error": null - }, - { - "query": "Open a pull request for my changes", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "create_pull_request" - ], - "recall_at_k": 0.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "update_pull_request_branch", - "get_pull_request_files", - "get_pull_request", - "merge_pull_request", - "get_pull_request_reviews" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 4580.256165936589, - "retrieve_input_tokens": 709, - "error": null - }, - { - "query": 
"Search for repositories about machine learning", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "search_repositories" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "search_repositories", - "search_code", - "search_users", - "search_files", - "search_issues" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "search_repositories", - "retrieve_correct": true, - "retrieve_latency_ms": 4150.481750024483, - "retrieve_input_tokens": 651, - "error": null - }, - { - "query": "Fork the upstream repository", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "fork_repository" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "fork_repository", - "create_repository", - "search_repositories", - "create_branch", - "list_branches" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 3917.6195840118453, - "retrieve_input_tokens": 638, - "error": null - }, - { - "query": "List all open issues with bug label", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "list_issues" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "list_issues", - "search_issues", - "list_directory", - "list_branches", - "list_commits" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "list_issues", - "retrieve_correct": true, - "retrieve_latency_ms": 5854.736082954332, - "retrieve_input_tokens": 770, 
- "error": null - }, - { - "query": "Get the README from the GitHub repo", - "category": "read", - "difficulty": "medium", - "expected_tools": [ - "get_file_contents" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "get_file_contents", - "get_issue", - "get_pull_request", - "get_pull_request_reviews", - "get_pull_request_comments" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "get_file_contents", - "retrieve_correct": true, - "retrieve_latency_ms": 4693.792750011198, - "retrieve_input_tokens": 647, - "error": null - }, - { - "query": "Merge the feature branch PR", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "merge_pull_request" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "merge_pull_request", - "create_branch", - "update_pull_request_branch", - "list_branches", - "list_commits" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 4134.696291992441, - "retrieve_input_tokens": 750, - "error": null - }, - { - "query": "Comment on the pull request with review feedback", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "add_issue_comment" - ], - "recall_at_k": 0.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_pull_request_review", - "get_pull_request_comments", - "get_pull_request_reviews", - "update_pull_request_branch", - "get_pull_request" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - 
"retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 4508.81649996154, - "retrieve_input_tokens": 717, - "error": null - }, - { - "query": "Create a new branch for the feature", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "create_branch" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_branch", - "create_repository", - "create_directory", - "create_issue", - "create_or_update_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "create_branch", - "retrieve_correct": true, - "retrieve_latency_ms": 5065.368999959901, - "retrieve_input_tokens": 750, - "error": null - }, - { - "query": "Push the updated files to GitHub", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "push_files" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "push_files", - "read_multiple_files", - "search_files", - "get_pull_request_files", - "move_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 3782.872375100851, - "retrieve_input_tokens": 629, - "error": null - }, - { - "query": "Search code for the function definition", - "category": "read", - "difficulty": "medium", - "expected_tools": [ - "search_code" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "search_code", - "search_repositories", - "search_users", - "search_files", - "search_issues" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - 
"baseline_input_tokens": 0, - "retrieve_tool": "search_code", - "retrieve_correct": true, - "retrieve_latency_ms": 4113.567291060463, - "retrieve_input_tokens": 651, - "error": null - }, - { - "query": "Which directories can the file server access?", - "category": "read", - "difficulty": "hard", - "expected_tools": [ - "list_allowed_directories" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "list_allowed_directories", - "move_file", - "read_file", - "write_file", - "search_files" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "list_allowed_directories", - "retrieve_correct": true, - "retrieve_latency_ms": 3236.2780830590054, - "retrieve_input_tokens": 549, - "error": null - }, - { - "query": "Check details of PR number 55", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "get_pull_request" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "get_pull_request", - "get_issue", - "get_pull_request_status", - "merge_pull_request", - "get_pull_request_reviews" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "get_pull_request", - "retrieve_correct": true, - "retrieve_latency_ms": 4608.008833019994, - "retrieve_input_tokens": 690, - "error": null - }, - { - "query": "Approve the pull request after review", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "create_pull_request_review" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_pull_request_review", - "get_pull_request_reviews", - "get_pull_request_comments", - "get_pull_request", - 
"merge_pull_request" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": null, - "retrieve_correct": false, - "retrieve_latency_ms": 3880.9942500665784, - "retrieve_input_tokens": 749, - "error": null - }, - { - "query": "View the commit history", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "list_commits" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "list_commits", - "merge_pull_request", - "directory_tree", - "push_files", - "read_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "list_commits", - "retrieve_correct": true, - "retrieve_latency_ms": 5402.960458071902, - "retrieve_input_tokens": 707, - "error": null - }, - { - "query": "Create a new GitHub repo and initialize it", - "category": "write", - "difficulty": "easy", - "expected_tools": [ - "create_repository" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_branch", - "create_issue", - "create_repository", - "create_pull_request", - "create_or_update_file" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "create_repository", - "retrieve_correct": true, - "retrieve_latency_ms": 6041.763041983359, - "retrieve_input_tokens": 852, - "error": null - }, - { - "query": "Update the issue title and close it", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "update_issue" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "update_issue", - "create_issue", - 
"create_or_update_file", - "update_pull_request_branch", - "get_issue" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "update_issue", - "retrieve_correct": true, - "retrieve_latency_ms": 6064.246875001118, - "retrieve_input_tokens": 872, - "error": null - }, - { - "query": "See what files were changed in pull request 10", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "get_pull_request_files" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "get_pull_request_files", - "update_pull_request_branch", - "get_pull_request", - "merge_pull_request", - "get_pull_request_reviews" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "get_pull_request_files", - "retrieve_correct": true, - "retrieve_latency_ms": 4493.489707936533, - "retrieve_input_tokens": 713, - "error": null - }, - { - "query": "Find all TypeScript files matching *.test.ts", - "category": "read", - "difficulty": "easy", - "expected_tools": [ - "search_files" - ], - "recall_at_k": 1.0, - "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "search_files", - "push_files", - "read_multiple_files", - "get_pull_request_files", - "list_directory" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "search_files", - "retrieve_correct": true, - "retrieve_latency_ms": 3920.63737497665, - "retrieve_input_tokens": 623, - "error": null - }, - { - "query": "Create a file on GitHub with the deployment config", - "category": "write", - "difficulty": "medium", - "expected_tools": [ - "create_or_update_file" - ], - "recall_at_k": 1.0, 
- "mrr": 0.0, - "average_precision": 0.0, - "ndcg_at_k": 0.0, - "latency_ms": 0.0, - "retrieved_tools": [ - "create_or_update_file", - "write_file", - "create_pull_request_review", - "create_repository", - "create_branch" - ], - "score_breakdown": {}, - "baseline_tool": null, - "baseline_correct": false, - "baseline_latency_ms": 0.0, - "baseline_input_tokens": 0, - "retrieve_tool": "create_repository", - "retrieve_correct": false, - "retrieve_latency_ms": 5247.74162506219, - "retrieve_input_tokens": 782, - "error": null - } - ], - "avg_recall_at_k": 0.9333333333333333, - "avg_mrr": 0.0, - "avg_map": 0.0, - "avg_ndcg_at_k": 0.0, - "avg_latency_ms": 0.0, - "stdev_recall": 0.2537081317024624, - "stdev_mrr": 0.0, - "ci_recall": [ - 0.8333333333333334, - 1.0 - ], - "ci_mrr": [ - 0.0, - 0.0 - ], - "miss_rate": 0.06666666666666667, - "hit_rate": 0.9333333333333333, - "recall_at_3": 0.0, - "recall_at_10": 0.0, - "avg_keyword_contribution": 0.0, - "avg_graph_contribution": 0.0, - "avg_embedding_contribution": 0.0, - "avg_annotation_contribution": 0.0, - "baseline_accuracy": 0.0, - "retrieve_accuracy": 0.7333333333333333, - "avg_token_reduction": 0.0, - "avg_baseline_latency_ms": 0.0, - "avg_retrieve_latency_ms": 4508.825779139685, - "token_efficiency_baseline": 0.0, - "token_efficiency_retrieve": 1.0603431656063234, - "t_statistic": 8.930285549745875, - "p_value": 8.378144844556346e-10 - } - ] -} \ No newline at end of file diff --git a/docs/architecture-plan-and-execute.md b/docs/architecture-plan-and-execute.md new file mode 100644 index 0000000..caca509 --- /dev/null +++ b/docs/architecture-plan-and-execute.md @@ -0,0 +1,830 @@ +# Plan-and-Execute Architecture + +> 작성: 2026-04-22, 업데이트: 2026-04-23 +> 상태: 확정 (설계) / 미구현 +> 범위: graph-tool-call 라이브러리 + xgen-workflow 통합 + +## 변경 이력 + +- **2026-04-23**: 설계 간소화 + - Ingest 시 embedding + Qdrant 저장 **삭제** (YAGNI). Field 이름 exact match 로 충분, cross-field synonym 은 LLM enrichment 가 해결 + - L0 에 **LLM per-tool enrichment (Pass 2)** 도입. 
graph-tool-call 이 이미 보유한 `OntologyLLM` 추상화 활용 + - Stage 1 retrieval 은 기존 BM25 + graph (graph-tool-call retrieval) 재사용. embedding prefilter 생략 + - Knowledge Base 가 **두 층** 으로 명확화: (A) 결정론적 파서 / (B) LLM semantic enrichment + +--- + +## 0. 한 쪽 요약 + +**문제:** 현재 LLM-as-orchestrator (ReAct) 는 요청당 15 iteration × ~15KB context = **30초, 225KB 토큰**. 비용·지연·품질 모두 구조적 한계. + +**해결:** **사전 지식 (graph + schemas + ingest 시 LLM 의미 주석)** 을 최대한 활용하고, runtime LLM 은 자연어 ↔ 구조 변환에만 사용하는 **5-layer 아키텍처** (L0 Knowledge Base + Stage 1~4 Runtime). + +**기대 효과:** +- LLM 호출 15 → 2~3회 +- Context 225KB → ~2~3KB (**~75배 감소**) +- Latency 30초 → 2~5초 (**~10배 개선**) +- 실행 단계 재현성, 감사 가능성 확보 +- 확장 축 확보 (fan-out, template, interactive) + +--- + +## 1. 설계 원칙 + +| # | 원칙 | 의미 | +|---|---|---| +| 1 | 사전 지식 최대 활용 | graph, schemas, embeddings 는 offline 구축 후 영속. 요청 처리 시 재계산 금지 | +| 2 | LLM 은 semantic bridge 에만 | 자연어 이해 / 의미 추출 / 자연어 생성 — 그 외 결정론 | +| 3 | 결정 가능한 것은 결정론적으로 | 매칭·순서·바인딩은 알고리즘. LLM 폴백은 **실패한 결정론의 보완** | +| 4 | 각 단계는 독립 입출력 계약 | 테스트·캐싱·디버깅·부분 교체 가능 | +| 5 | 하드코딩은 "학습된 지식" 으로 대체 | synonym → embedding cluster, verb → intent classifier | +| 6 | Failure mode 관측 가능 | 어느 stage 에서 왜 실패했는지 항상 명확해야 함 | + +--- + +## 2. 시스템 개요 + +``` +╔═══════════════════════════════════════════════════════════════╗ +║ OFFLINE / INGEST TIME ║ +║ ┌─────────────────────────────────────────────────────────┐ ║ +║ │ L0. 
KNOWLEDGE BASE │ ║ +║ │ │ ║ +║ │ Swagger → ToolSchema + Tool Embeddings + │ ║ +║ │ IO Contract + Tool Graph │ ║ +║ │ │ ║ +║ │ 저장: api_tool_collections.graph (JSONB) │ ║ +║ │ api_tool_collections.embeddings (pgvector) │ ║ +║ │ api_tool_collections.io_contracts (JSONB) │ ║ +║ └─────────────────────────────────────────────────────────┘ ║ +╚═══════════════════════════════════════════════════════════════╝ + │ + ▼ (요청 도착) +╔═══════════════════════════════════════════════════════════════╗ +║ REQUEST TIME PIPELINE ║ +║ ║ +║ requirement (자연어) ║ +║ │ ║ +║ ▼ ║ +║ ┌──────────────────────────────────────────────────────┐ ║ +║ │ STAGE 1. RETRIEVAL + TARGET SELECTION │ ║ +║ │ (a) embedding prefilter: 108 → top-20 │ ║ +║ │ (b) LLM pick: 20개 catalog → target + entities │ ║ +║ │ context: ~1KB │ LLM: 1회 │ ║ +║ └────────────────┬─────────────────────────────────────┘ ║ +║ │ ║ +║ ▼ ║ +║ ┌──────────────────────────────────────────────────────┐ ║ +║ │ STAGE 2. PATH SYNTHESIZER │ ║ +║ │ (결정론) target 의 consumes → IO Contract 역추적 │ ║ +║ │ → DAG 구성 + argument bindings │ ║ +║ │ context: — │ LLM: 0회 │ ║ +║ └────────────────┬─────────────────────────────────────┘ ║ +║ │ ║ +║ ┌─────────┴─────────┐ ║ +║ │ │ ║ +║ 확정 plan 모호 (2+ 경로) ║ +║ │ │ ║ +║ │ ▼ ║ +║ │ ┌────────────────────────────────────────┐ ║ +║ │ │ (조건부) DISAMBIGUATION │ ║ +║ │ │ context: ~2KB (후보만) │ LLM: 1회 │ ║ +║ │ └────────────┬───────────────────────────┘ ║ +║ │ │ ║ +║ └───────────────────┘ ║ +║ │ ║ +║ ▼ ║ +║ ┌──────────────────────────────────────────────────────┐ ║ +║ │ STAGE 3. RUNNER │ ║ +║ │ (결정론) DAG topological 실행 │ ║ +║ │ JsonPath 치환 + tool_executor HTTP │ ║ +║ │ step 단위 streaming event │ ║ +║ │ context: — │ LLM: 0회 │ ║ +║ └────────────────┬─────────────────────────────────────┘ ║ +║ │ ║ +║ ▼ ║ +║ ┌──────────────────────────────────────────────────────┐ ║ +║ │ STAGE 4. 
RESPONSE SYNTHESIS │ ║ +║ │ execution trace (요약) → 자연어 응답 │ ║ +║ │ context: ~1KB │ LLM: 1회 │ ║ +║ └────────────────┬─────────────────────────────────────┘ ║ +║ │ ║ +║ ▼ ║ +║ 최종 답변 ║ +╚═══════════════════════════════════════════════════════════════╝ +``` + +**일반 케이스 예산:** LLM 2회, context ~2KB, 2~4초. +**모호 케이스:** LLM 3회, context ~4KB, 4~6초. + +--- + +## 3. L0 — Knowledge Base + +ingest 1회. 영속 저장. 요청 처리에서 재계산 금지. + +**두 층 구조:** +- **Pass 1 — Deterministic parser**: Swagger 의 구조적 사실 (schema, HTTP, dependency) 추출. LLM 금지. +- **Pass 2 — Semantic enrichment**: Description 등을 LLM 이 읽고 의미 주석 (언제 써, 무엇을 내놓는다, 누구와 쌍을 이룬다). graph-tool-call 의 `OntologyLLM` 추상화 재사용. + +### 3.1 ToolSchema (Pass 1, 기존 확장) + +기존 `tools` 테이블. 추가 필드는 아래 섹션들이 채움. + +| 필드 | 설명 | 출처 | +|---|---|---| +| `function_id` | 컬렉션 범위 고유 slug | 파서 | +| `function_name` | 원본 operationId | 파서 | +| `description` | summary + description + tags | 파서 | +| `api_url`, `api_method`, `api_header`, `api_body` | 실행용 | 파서 | +| `metadata` | method/path/base_url/tags/response_schema/controller/request_type/response_type | 파서 | +| `ai_metadata` | canonical_action, primary_resource, when_to_use, pairs_well_with 등 | **Pass 2 (LLM)** | + +### 3.2 IO Contract (Pass 1, 결정론) + +각 tool 의 **필드 수준 produces/consumes** 를 swagger schema 에서 기계적으로 추출. 
+ +**저장:** 신규 테이블 `tool_io_contracts`: +```sql +CREATE TABLE tool_io_contracts ( + tool_id VARCHAR(100) REFERENCES tools(function_id), + direction VARCHAR(10) CHECK (direction IN ('produces', 'consumes')), + json_path TEXT, -- $.body.goods[*].goodsNo (produces) + -- goodsNo (consumes) + field_name VARCHAR(100), -- goodsNo + field_type VARCHAR(40), -- integer, string, object + required BOOLEAN, -- consumes 에 한함 + semantic_tag VARCHAR(80) -- Pass 2 LLM 이 채움 (빈 값 허용) +); +``` + +**추출 프로세스 (LLM 없음):** +``` +for each tool in schemas: + request_leaves = walk_schema_leaves(tool.request_schema) + response_leaves = walk_schema_leaves(tool.response_schema) + + for each leaf in request_leaves: + insert consumes (field_name, type, required) + + for each leaf in response_leaves: + insert produces (json_path, field_name, type) +``` + +**1차 매칭: exact field name + type** — 동일 swagger 내 field 이름 규약 보통 일관. 이걸로 대부분의 엣지 생성. + +```python +# 결정론적 field match edge +for A in tools: + for p in A.produces: + for B in tools: + if A == B: continue + for c in B.consumes.required: + if p.field_name == c.field_name and p.type == c.type: + graph.add_edge(A, B, "produces_for", + binding={c.field_name: p.json_path}) +``` + +### 3.3 Semantic Enrichment (Pass 2, LLM) + +**목적:** Description 등의 비정형 정보를 LLM 이 해석해 의미 주석 추가. 하드코딩된 verb 사전 / synonym 테이블 **완전 대체**. + +**인프라:** graph-tool-call 에 이미 있는 `OntologyLLM` 활용 ([graph_tool_call/ontology/llm_provider.py](graph_tool_call/ontology/llm_provider.py)). 
+ +**이미 제공되는 메서드:** +- `infer_relations(tools)` — LLM 기반 관계 추론 +- `suggest_categories(tools)` — 카테고리 그룹핑 +- `verify_relations(relations, tools)` — 휴리스틱 엣지 검증 / 거르기 +- `suggest_missing(tools, existing)` — 빠진 엣지 제안 +- `enrich_keywords(tools)` — BM25 향상용 키워드 +- `generate_example_queries(tools)` — 임베딩 매칭용 예시 쿼리 + +**신규 메서드 (추가 구현):** +```python +class OntologyLLM: + def enrich_tool_semantics( + self, tools: list[ToolSummary], batch_size: int = 10, + ) -> dict[str, ToolEnrichment]: + """Per-tool 의미 주석 (action, resource, use-when, semantic tags, pairs).""" +``` + +**ToolEnrichment 스키마:** +```typescript +type ToolEnrichment = { + canonical_action: "search" | "read" | "create" | "update" | "delete" | "action"; + primary_resource: string; // 정규화 리소스명 (예: "product") + one_line_summary: string; // 한 줄 요약 (Stage 1 catalog 용) + when_to_use: string; // 언제 쓰는지 + when_not_to_use?: string; // 쓰면 안 되는 경우 + produces_semantics: Array<{ // 의미 태깅된 produces + semantic: string; // "product_id" 같은 canonical + json_path: string; // 실제 경로 + }>; + consumes_semantics: Array<{ + semantic: string; + field: string; + }>; + pairs_well_with: Array<{ // 함께 / 순서대로 쓰이는 도구들 + tool: string; + reason: string; + }>; +} +``` + +**Prompt 예시:** +``` +You are annotating an API tool for a planning system. + +Tool: seltSearchProduct +Summary: 상품 검색 +Description: 키워드로 상품을 검색하는 API입니다. ... +HTTP: GET /v1/search/product +Request fields: [searchWord, langCd, siteNo, sort, ...] +Response fields: [$.body.goods[*].goodsNo, $.body.goods[*].goodsName, ...] + +Produce JSON with: +- canonical_action (search|read|create|update|delete|action) +- primary_resource (one word like "product", "order", "user") +- one_line_summary (Korean, within 40 chars) +- when_to_use (1~2 sentences) +- produces_semantics: map internal field names to semantic ids like "product_id" +- pairs_well_with: 2~3 related tools with brief reason + +Output JSON only. 
+``` + +**저장:** +- `tools.ai_metadata` JSONB 컬럼 (전체 enrichment 덤프) +- `tool_io_contracts.semantic_tag` (produces_semantics / consumes_semantics 의 semantic 을 해당 row 에 매핑) + +**재실행 조건:** swagger 변경, LLM 모델 업그레이드, 관리자 강제 재생성. 일상 요청 처리와 **분리**. + +### 3.4 Tool Graph (재정의) + +엣지 타입: + +| 엣지 | 근거 | 신뢰도 | 용도 | +|---|---|---|---| +| `produces_for` (exact) | Pass 1 — field name + type 일치 | high | Stage 2 주 신호 | +| `produces_for` (semantic) | Pass 2 — `semantic_tag` 일치 | medium | Pass 1 이 못 잡는 교차 명명 (cross-collection 등) | +| `pairs_with` | Pass 2 — `pairs_well_with` 에서 | medium | Stage 1 catalog 힌트, Stage 2 보조 | +| `similar_to` | 구조적 (같은 controller / tag / CRUD 역할) | low | Disambiguation 후보 확장 | +| `precedes` | 구조적 (POST → GET single 등) | low | 레거시 엣지, 보조 힌트 | + +**기존 하드코딩 반응성 패치 (selt, synonym clusters, *No/*Seq heuristic, search-bridge exception) 는 Pass 2 완성 시 모두 제거.** Pass 1 field exact match + Pass 2 LLM enrichment 가 그 역할을 대체. + +### 3.5 Ingest 파이프라인 + +```python +# xgen-workflow 측 +def ingest_collection(collection_id, spec_source, llm_config): + from graph_tool_call.ontology.llm_provider import wrap_llm + from graph_tool_call.ingest.openapi import parse_operations + + # Pass 1: 결정론 + schemas = parse_operations(spec_source) + io_contracts = extract_io_contracts(schemas) # 3.2 + graph = build_structural_edges(schemas, io_contracts) # 3.4 + + # Pass 2: LLM (옵션) + if llm_config.enabled: + llm = wrap_llm(build_llm_spec(llm_config)) + enrichments = llm.enrich_tool_semantics(schemas) + apply_semantic_tags(io_contracts, enrichments) # semantic_tag 채움 + graph = augment_with_semantic_edges(graph, enrichments) + + store_all(schemas, io_contracts, graph, enrichments) +``` + +**옵션:** Pass 2 는 `llm_config.enabled=False` 로 **생략 가능**. Pass 1 만으로도 기본 동작은 가능 (품질은 낮음). + +### 3.6 xgen-workflow 통합 + +xgen 은 이미 agent 노드에서 provider/model/api_key 선택 지원. 
Ingest 시에도 동일 config 재사용: + +```python +# xgen-workflow: api_tool_collection/service.py +def refresh_with_enrichment(collection_id, llm_settings): + llm_spec = f"{llm_settings.provider}/{llm_settings.model}" + # "openai/gpt-4.1-mini" + + # api_key 는 env 또는 xgen secret store 에서 + os.environ["OPENAI_API_KEY"] = xgen_secret.get(user_id, "openai") + + ingest_collection(collection_id, spec_source, LLMConfig( + enabled=True, + spec=llm_spec, + )) +``` + +graph-tool-call 은 xgen 에 의존하지 않음. xgen 이 config 주는 쪽, graph-tool-call 이 받는 쪽. + +--- + +## 4. Stage 1 — Retrieval + Target Selection + +**입력:** `requirement: str` + +**출력:** +```json +{ + "target": "seltProductDetailInfo", + "confidence": 0.92, + "entities": { + "keyword": "quarzen 티셔츠", + "locale": "ko" + }, + "output_shape": "single", + "reasoning": "..." +} +``` + +### 4.1 알고리즘 + +**(a) Retrieval prefilter (결정론):** graph-tool-call 의 기존 `retrieve_with_scores()` 그대로 사용. +```python +candidates = tg.retrieve_with_scores(requirement, top_k=20) +# BM25 + graph + (optional) annotation 채널 +``` +embedding prefilter 는 생략. 기존 BM25 + graph 가 top-20 recall 을 충분히 내는 것을 실측으로 확인 (x2bee `"product search"` → `seltSearchProduct` top-10 안에 들어옴). + +향후 recall 부족 증거가 나오면 embedding 채널을 **그때** 연결. 지금은 YAGNI. + +**(b) LLM structured pick:** +- 20개의 catalog 에 **ai_metadata 포함**: + ``` + { + function_name, + description[:80], + one_line_summary, // Pass 2 에서 생성 + when_to_use, // Pass 2 + pairs_well_with // Pass 2 (이름만) + } + ``` +- system prompt: "고른 target 1개와 추출한 entities 를 반환" +- OpenAI structured output (JSON schema 강제) + +**context 크기:** 20 × 200자 ≈ 4KB (ai_metadata 포함 확장). ai_metadata 없을 땐 20 × 100자 ≈ 2KB. + +### 4.2 오류 처리 + +- Retrieval 이 top-20 모두 low score 면 → "적합한 도구 없음" 에러. 사용자 재질의 유도. +- LLM 이 JSON schema 위반 시 → 1회 retry. 실패하면 fallback: top-1 retrieval (BM25 + graph) 결과로 진행 (entities 는 빈 dict). + +### 4.3 Stage 1 의 성능 지표 +- Target 정확도 (샘플 요구사항 N개에 대해 "맞는 target 선정" 비율) +- Entity 추출 재현율 +- LLM 응답 latency p50/p95 + +--- + +## 5.
Stage 2 — Path Synthesizer + +**입력:** Stage 1 output (`target`, `entities`) +**출력:** Plan (Plan 스키마는 §9 참조) OR "ambiguous" 플래그 (Disambiguation 발동) + +### 5.1 DAG 구성 알고리즘 (Bottom-up) + +```python +def synthesize(target, entities, collection_defaults): + plan = {"steps": [], "output_binding": None} + context = entities | collection_defaults # 이미 아는 값들 + + needed = target.consumes.required_only() # 필수 입력만 먼저 + resolved = {} # {field: source_step_id} + pending = list(needed) + visited = set() + + while pending: + field = pending.pop(0) + if field.semantic_tag in available_tags(context, resolved): + resolved[field.name] = bind_from_available(field, context, resolved) + continue + + # graph 에서 이 semantic 을 produces 하는 tool 찾기 + producers = graph.producers_of(field.semantic_tag) + if not producers: + raise UnsatisfiableFieldError(field) + + # 후보 여러 개면 "ambiguous" 로 분기 (§6 Disambiguation LLM) + if len(producers) > 1 and not strictly_better(producers): + return AmbiguousPlan(target, candidates=producers) + + # prerequisite 추가 (재귀) + producer = producers[0] + if producer.name in visited: + raise CyclicDependencyError + visited.add(producer.name) + + step = build_step(producer) + plan.steps.insert(0, step) # 앞쪽에 삽입 (위상 순서) + + # producer 의 consumes 를 다시 확인 + pending.extend(producer.consumes.required_only()) + + # target 을 마지막 step 으로 추가 + plan.steps.append(build_step(target, bindings=resolved)) + plan.output_binding = f"$.{target.step_id}.body" + + return plan +``` + +### 5.2 "strictly_better" 판단 + +여러 producer 후보 중: +- IO Contract confidence 높은 순 +- 경로 짧은 순 (재귀 depth) +- similar_to weight 높은 순 (requirement 와 가까운) +- 모두 비슷하면 → Ambiguous 플래그 + +### 5.3 초기 버전 범위 + +- **선형 chain** (각 step 1회 호출): 지원 +- **다중 참조** (한 step 이 이전 N개 step 의 출력 조합): 지원 +- **Fan-out** (배열 전체 loop): **초기 범위 밖** — §11 확장 포인트 +- **조건 분기** (if/else): **초기 범위 밖** + +### 5.4 실패 경로 + +| 케이스 | 반환 | +|---|---| +| 필수 field 해소 불가 | `UnsatisfiableFieldError` — Stage 4 에 그대로 reveal | +| 순환 의존 | `CyclicDependencyError` — 보고 | 
+| 복수 경로 | `AmbiguousPlan` — Disambiguation 발동 | + +--- + +## 6. Disambiguation (조건부) + +**발동 조건:** Stage 2 가 `AmbiguousPlan` 반환. + +**입력:** 후보 경로 2~N개 각각의 요약 +``` +후보 A: seltSearchProduct → seltProductDetailInfo +후보 B: getCategoryList → seltSearchProduct → seltProductDetailInfo +``` + +**LLM 호출:** +- system: "요구사항에 가장 맞는 경로 1개를 고르고 이유를 설명" +- user: requirement + 후보 경로 설명 +- structured output: `{"chosen": "A", "reason": "..."}` + +**context:** ~2KB + +--- + +## 7. Stage 3 — Runner + +**입력:** 확정 Plan + +**동작:** +```python +async def run(plan: Plan): + context = {} # step_id → result + trace = ExecutionTrace(plan=plan) + + for step in topological_order(plan.steps): + resolved_args = resolve_bindings(step.args, context) + + trace.emit("step.start", step_id=step.id, args=resolved_args) + + try: + result = await tool_executor.execute( + function_id=step.tool_function_id, + args=resolved_args, + timeout=step.timeout or 30, + ) + except ToolExecutionError as e: + trace.emit("step.error", step_id=step.id, error=str(e)) + return trace.fail(step.id, e) + + context[step.id] = result + trace.emit("step.done", step_id=step.id, output_preview=preview(result)) + + final = jsonpath_extract(context, plan.output_binding) + trace.emit("plan.done", output=final) + return trace.success(final) +``` + +### 7.1 Argument 바인딩 치환 + +바인딩 syntax: `${step_id.json_path}` — JsonPath 표준 사용 (jsonpath-ng 라이브러리). + +``` +args = {"goodsNo": "${s1.body.goods[0].goodsNo}", + "langCd": "ko"} +context = {"s1": {"body": {"goods": [{"goodsNo": 12345, ...}]}}} +→ resolved = {"goodsNo": 12345, "langCd": "ko"} +``` + +### 7.2 에러 / 재시도 정책 (초기 버전) + +| 에러 유형 | 동작 | +|---|---| +| HTTP 4xx | fail fast, trace 에 응답 body 포함 | +| HTTP 5xx | 최대 2회 재시도 (exponential backoff) | +| 타임아웃 | fail fast | +| JsonPath 미스 | fail fast — "step sX 의 bindings 가 실제 응답 구조와 불일치: [list of missing paths]" | +| Schema 검증 실패 | fail fast | + +**재계획 (re-plan) 은 v1 범위 밖.** 실패 시 Stage 4 가 사용자에게 설명. + +### 7.3 스트리밍 + +각 step 단위로 이벤트 emit. 
UI 는 step 단위 진행 상황 표시. + +--- + +## 8. Stage 4 — Response Synthesis + +**입력:** requirement + ExecutionTrace + +**동작:** +```python +def synthesize_response(requirement, trace): + if trace.success: + # 최종 output 의 관련 필드만 추림 (schema-aware projection) + relevant = project_relevant_fields(trace.output, requirement) + prompt = f""" + 요구사항: {requirement} + 실행 결과 요약: {relevant} + 사용자에게 자연스럽게 답변. + """ + else: + prompt = f""" + 요구사항: {requirement} + 실행 중 실패: step={trace.failed_step}, 이유={trace.error} + 부분 결과: {trace.partial_results} + 사용자에게 무엇이 됐고 무엇이 안 됐는지 설명. + """ + return llm.complete(prompt) +``` + +**context:** 요약된 결과 기준 ~1KB. 전체 response 를 그대로 넘기지 않음 — `project_relevant_fields` 가 requirement 에 관련된 필드만 추림. + +--- + +## 9. 핵심 데이터 계약 + +### 9.1 Intent Schema (Stage 1 출력) + +```typescript +type Intent = { + target: string; // function_name + confidence: number; // 0.0 ~ 1.0 + entities: Record<string, any>; // {keyword: "...", locale: "ko", ...} + output_shape: "single" | "list" | "count"; + reasoning?: string; // 디버그용 +} +``` + +### 9.2 Plan Schema (Stage 2 출력) + +```typescript +type Plan = { + id: string; // uuid (캐시 키 포함) + goal: string; // Intent 의 요약 + steps: PlanStep[]; + output_binding: string; // JsonPath "$.s2.body" 등 + metadata: { + created_at: string; + target: string; + disambiguation_used: boolean; + }; +} + +type PlanStep = { + id: string; // "s1", "s2", ... 
+ tool: string; // function_name + tool_function_id: string; // DB 룩업용 slug + args: Record<string, any>; // {"goodsNo": "${s1.body.goods[0].goodsNo}", ...} + timeout_ms?: number; + retryable?: boolean; + rationale?: string; // "검색 결과로 goodsNo 획득" +} +``` + +### 9.3 ExecutionTrace Schema (Stage 3 출력) + +```typescript +type ExecutionTrace = { + plan_id: string; + success: boolean; + steps: StepTrace[]; + output?: any; // 성공 시 + failed_step?: string; // 실패 시 + error?: ErrorDetail; // 실패 시 + duration_ms: number; + started_at: string; + ended_at: string; +} + +type StepTrace = { + id: string; + tool: string; + args: Record<string, any>; // resolved (바인딩 치환 후) + output?: any; + error?: ErrorDetail; + duration_ms: number; + retries: number; +} +``` + +--- + +## 10. 하드코딩 제거 매핑표 + +| 현 하드코딩 | 제거 방법 | 대체 메커니즘 | +|---|---|---| +| `_SYNONYM_CLUSTERS` (goods↔product) | 제거 | Pass 2 `primary_resource` + `semantic_tag` (LLM per-tool enrichment) | +| `selt`, `sel` verb 특수 케이스 | 제거 | Pass 2 `canonical_action` (LLM 이 context 읽고 분류) | +| `*Id/*No/*Seq` 접미사 heuristic | 제거 | Pass 1 field name + type exact match (동일 swagger 안에선 충분) + 필요시 Pass 2 semantic_tag | +| `search-bridge` 예외 | 제거 | Pass 2 `pairs_well_with` + `canonical_action = search` | +| `_is_single_resource_path` 필터 | 제거 | IO Contract 의 produces/consumes 가 판단 | +| `_VERB_TO_INTENT` CRUD 사전 | **유지** (Pass 1 fallback) | Pass 2 가 LLM 으로 action 태깅 담당. Pass 2 생략 시 이 사전이 fallback | + +--- + +## 11. 확장 포인트 + +### 11.1 Fan-out (foreach) + +**시나리오:** "카트의 모든 상품 상세 보여줘" + +**Plan schema 확장:** +```typescript +type PlanStep = { + // ... 기존 필드 + foreach?: { + source: string; // "${s1.body.items[*]}" + item_alias: string; // "item" + }; + // args 안에서 `${item.goodsNo}` 참조 가능 +} +``` + +**Runner 확장:** foreach step 은 N회 호출 후 결과를 배열로 묶어 context 에 저장. + +### 11.2 조건 분기 (if/else) + +**Plan schema 확장:** step 에 `condition` 필드 (JsonPath 기반 부울 식). Runner 가 evaluate 후 skip/execute. 
+ +### 11.3 Workflow Template Library + +- 성공한 Plan 을 `workflow_templates` 테이블에 승격 +- 새 requirement → embedding 기반 template match → 재사용 +- Stage 1~2 skip 가능 → 더 빠름 +- Intent 유사 판정 임계값 튜닝 필요 + +### 11.4 Interactive Refinement + +- Runner 가 특정 step 에서 `user_input_required` 이벤트 발행 +- UI 가 사용자에게 선택지 제시 +- 응답 받아 Runner 재개 (suspend/resume) +- 민감 액션 (결제, 삭제) 에 필수 + +### 11.5 Self-healing Re-plan + +- Runner 실패 시 ExecutionTrace + 에러를 Stage 1~2 에 다시 넘겨 1회 re-plan +- 예: "빈 배열 반환 → 검색 키워드 재조정" 같은 케이스 + +--- + +## 12. 마이그레이션 + +### 12.1 기존 자산 활용 + +- `graph_tool_call.analyze.dependency.detect_dependencies`: **유지**. IO Contract 가 못 잡는 구조적 엣지는 여전히 여기서. 단 반응성 패치 (`selt`, `_SYNONYM_CLUSTERS`, `*No/*Seq`, `search-bridge`) 는 Pass 2 enrichment 정착 시 **단계적 제거**. +- `graph_tool_call.retrieval`: **유지**. Stage 1 의 prefilter 로 그대로 활용 (BM25 + graph). +- `graph_tool_call.ontology.llm_provider`: **유지**. Pass 2 enrichment 의 `enrich_tool_semantics` 메서드 추가. +- `tool_executor.execute_collection_tool`: **유지**. Stage 3 Runner 가 호출. +- `APICollectionLoader` Canvas 노드: **유지** (그래프 + ai_metadata 로드 역할). +- `Agent Xgen` 노드: **유지** (범용 ReAct / 일반 채팅 용도). API collection 시나리오에 쓰일 땐 `Agent Planflow` 로 대체 권장. + +### 12.2 Canvas 노드 구성 변경 + +``` +기존: Input → APICollectionLoader → Agent Xgen → Output +신규: Input → APICollectionLoader → Agent Planflow → Output + (graph/ai_metadata/io_contracts 로드) (Stage 1~4 통합) +``` + +`Agent Planflow` 내부 구조: +``` +┌── Stage 1: retrieval + target pick (LLM 1회) +├── Stage 2: path synthesizer (결정론, DAG) +├── (conditional) disambiguation (LLM 조건부) +├── Stage 3: runner (streaming) (결정론, HTTP) +└── Stage 4: response synthesis (LLM 1회, streaming) +``` + +설정 UI 는 `Agent Xgen` 과 공용 컴포넌트 재사용 (provider/model/api_key/temperature/max_tokens). 전용 파라미터 (`enable_disambiguation`, `max_plan_steps`) 만 추가. + +### 12.3 점진 마이그레이션 전략 + +1. **Phase A:** L0 Knowledge Base 구축 — IO Contract 추출 (결정론) + `OntologyLLM.enrich_tool_semantics` 메서드 추가. 기존 graph 와 공존. +2. 
**Phase B:** Stage 3 Runner 독립 구현 (plan fixture 로 단위 테스트). +3. **Phase C:** Stage 2 Path Synthesizer — DAG + exact field match + semantic_tag 보강. +4. **Phase D:** Stage 1 + 4 LLM 호출 구현 (structured output). 기존 `retrieve_with_scores` 를 Stage 1 prefilter 로 연결. +5. **Phase E:** Canvas 노드 `Agent Planflow` 개발. 설정 UI 는 `Agent Xgen` 컴포넌트 재사용. +6. **Phase F:** 평가 세트로 A/B 측정. 안정화 후 기존 반응성 패치 (`selt`, synonym 등) 제거. + +--- + +## 13. 운영 리스크 및 완화 + +| 리스크 | 영향 | 완화 | +|---|---|---| +| IO Contract semantic_tag 오태깅 | Stage 2 가 틀린 path 생성 | ingest 시 LLM 태깅 → 관리자 UI 검수/오버라이드 | +| Stage 1 target 오선정 | 전혀 다른 도구 실행 | confidence threshold → 낮으면 disambiguation 강제 | +| Stage 2 Ambiguous 빈발 | 매 요청 LLM 추가 호출 | IO Contract 개선으로 장기적으로 완화. 초기엔 허용 | +| Runner JsonPath miss | 실행 실패 | plan validate 단계에서 response schema 와 bindings 교차 검증 (Stage 2 출력 직후) | +| HTTP 외부 장애 | 사용자 체감 실패 | retry + 명확한 trace + Stage 4 에서 "일부 성공/실패" 구분 | +| Embedding API 비용 (v1 미사용 — embedding 채널 연결 시에만 해당) | ingest 비용↑ | ingest 시 1회만. 요청당 embed 는 requirement 1회만 | +| LLM structured output 깨짐 | Stage 1 파싱 실패 | 1회 retry → 실패 시 top-1 retrieval 결과 fallback | + +--- + +## 14. 측정 지표 (성공 기준) + +### 14.1 성능 + +- Latency p50 / p95 (목표: p50 ≤ 3s, p95 ≤ 6s) +- LLM 호출 수 / 요청 (목표: ≤ 2.5 평균) +- Context 총량 / 요청 (목표: ≤ 3KB 평균) + +### 14.2 품질 + +평가 세트: 요구사항 20~50개 (각 collection 당). + +- **Stage 1 target 정확도:** 고른 target 이 사람 판단과 일치하는 비율 +- **Stage 2 path 정확도:** 생성된 plan 이 유효한 실행 시퀀스인 비율 +- **End-to-end 성공률:** 사용자 요구사항 → 의미 있는 답변까지 성공한 비율 +- **Ambiguity rate:** Disambiguation 발동 빈도 (낮을수록 graph 품질 좋음) + +### 14.3 비용 + +- OpenAI 토큰 소비 / 요청 (입력/출력 분리) +- Embedding 호출 수 (ingest + 요청별 1회) — v1 은 embedding 미사용이므로 채널 연결 이후 측정 + +### 14.4 감사성 + +- 모든 Plan artifact 조회 가능 +- 실패 시 failed_step + error + partial_results 복원 가능 + +--- + +## 15.
비전과의 정합성 + +사용자가 그린 그림: + +> Swagger → tool list 정의 → 사전 graph 관계 구축 → +> 워크플로우에서 컬렉션 노드 연결 + 요구사항 입력 → +> 필요한 API 들 찾아 req/res 세팅 후 순서대로 호출 → 결과 반환 + +이 아키텍처의 대응: + +| 사용자 의도 | 이 설계에서 | +|---|---| +| "사전 graph 관계 구축" | L0 Knowledge Base (Pass 1 구조적 + Pass 2 LLM 의미 주석) | +| "요구사항 입력" | Stage 1 입력 | +| "필요한 API 찾기" | Stage 1 (retrieval + target pick) + Stage 2 (DAG 구성) | +| "req/res 세팅" | Stage 2 의 argument bindings (exact field match + semantic_tag) | +| "순서대로 호출" | Stage 3 Runner (DAG topological) | +| "결과 반환" | Stage 4 Response Synthesis | + +**정합성 완전.** LLM 은 의미 해석이 필요한 지점에만 최소한으로 사용: +- **Ingest 시 Pass 2** — description 을 읽고 의미 주석 (1회, 영속 저장) +- **Runtime Stage 1** — 사용자 자연어 → target tool + entities +- **Runtime Stage 4** — 실행 결과 → 자연어 응답 + +Request/response schema 는 LLM 이 일절 건드리지 않음 (swagger 가 source of truth). + +--- + +## 16. 결정 사항 + +### 해결된 항목 (2026-04-23) + +| # | 주제 | 결정 | 근거 | +|---|---|---|---| +| 1 | Field semantic 매칭 방식 | **Pass 1 exact match (기본) + Pass 2 LLM semantic_tag (보강)**. embedding clustering 불필요 | 동일 swagger 안에선 field 이름 일관. cross-convention 은 LLM 이 해결 | +| 2 | LLM 모델 선택 | **xgen agent 노드 config 재사용**. Stage 1/4 는 사용자 노드 설정 상속. Pass 2 는 컬렉션별 별도 설정 (기본 gpt-4.1-mini) | UX 일관성, 기존 provider/key 관리 재사용 | +| 3 | Ingest embedding 모델 | **사용 안 함 (v1)**. 필요시 `text-embedding-3-small` 추후 연결 | BM25 + graph 가 Stage 1 top-20 recall 확보 (실측) | +| 4 | Plan / ExecutionTrace 영속성 | **로그 기반 (DB 테이블 없음)**. 구조화 JSON 이벤트로 plan 생명주기 기록 | YAGNI. 필요 기능 (history UI, template auto-promotion) 생길 때 해당 테이블 추가 | +| 5 | Canvas 노드 구성 | **신규 노드 `Agent Planflow`**. `Agent Xgen` 은 유지 (범용 ReAct), `Agent Planflow` 는 API collection 전용 Plan-and-Execute. 설정 UI 공용화 (provider/model/key) | 기존 자산 유지 + 특화 경로 분리. 코드 간결성 | +| 6 | Plan 실행 범위 (v1) | **선형 chain 만**. Fan-out / 조건 분기 / parallel / re-plan 은 v2+. Plan schema 는 optional 필드로 **확장 가능하게 설계** | v1 목표 (30s→5s + 정확도) 는 선형으로 달성. 복잡 케이스는 사용자에게 명시적 에러 | + +### 미결 항목 + +모두 해결됨 (2026-04-23). + +--- + +## 17. 
def _group_by_resource(
    tools: list[ToolSchema],
    *,
    prefix_threshold: float = 0.4,
) -> dict[str, list[ToolSchema]]:
    """Group tools that have ``method`` and ``path`` metadata by their base resource.

    The base resource is the first *meaningful* static (non-``{param}``) path
    segment. Leading segments are treated as non-meaningful prefixes and
    skipped while the most common segment at that depth is carried by more
    than ``prefix_threshold`` of the grouped tools. This handles version
    prefixes (``/v1``, ``/v2``) and routing prefixes (``/api``, ``/rest``)
    without a hardcoded list.

    Parameters
    ----------
    tools:
        Candidate tools. Tools whose metadata lacks a truthy ``path`` or
        ``method`` are ignored.
    prefix_threshold:
        Fraction in the 0..1 range (not a percentage). The default ``0.4``
        means "a segment shared by more than 40% of the tools is a prefix".

    Returns
    -------
    Mapping of ``"/<segment>"`` to the tools grouped under it, in input
    order. Empty when no tool carries both ``path`` and ``method``.
    """
    api_tools = [t for t in tools if t.metadata.get("path") and t.metadata.get("method")]
    if not api_tools:
        return {}

    total = len(api_tools)

    # Static segments per tool; ``{param}`` placeholders carry no resource identity.
    tool_segments: list[tuple[ToolSchema, list[str]]] = [
        (tool, [s for s in tool.metadata["path"].split("/") if s and not s.startswith("{")])
        for tool in api_tools
    ]

    max_depth = max((len(segs) for _, segs in tool_segments), default=1)

    # Walk from depth 0 and keep skipping while the dominant segment at that
    # depth covers more than the threshold share of all tools. The scan is
    # capped at 4 levels to avoid pathological cases.
    skip_depth = 0
    for depth in range(min(max_depth, 4)):
        counter: dict[str, int] = {}
        for _, segs in tool_segments:
            if depth < len(segs):
                counter[segs[depth]] = counter.get(segs[depth], 0) + 1
        if not counter:
            break
        if max(counter.values()) / total > prefix_threshold:
            skip_depth = depth + 1
        else:
            break

    groups: dict[str, list[ToolSchema]] = {}
    for tool, segs in tool_segments:
        if skip_depth < len(segs):
            base = "/" + segs[skip_depth]
        elif segs:
            # Path is shorter than the skipped prefix: fall back to its last segment.
            base = "/" + segs[-1]
        else:
            base = "/"
        groups.setdefault(base, []).append(tool)
    return groups
# ---------------------------------------------------------------------------
# Layer 4: RPC-style method name & DTO pattern detection
# ---------------------------------------------------------------------------

# Verbs recognised in an RPC method name, listed under the CRUD intent they signal.
_INTENT_VERBS: tuple[tuple[str, tuple[str, ...]], ...] = (
    (
        "read",
        ("get", "find", "fetch", "list", "search", "select", "load", "read", "download"),
    ),
    # write (create); "reg" is a camelCase abbreviation (regGoodsApprove etc.)
    ("write", ("save", "create", "add", "insert", "register", "regist", "reg")),
    ("update", ("modify", "update", "edit", "change", "patch")),
    ("delete", ("delete", "remove", "cancel", "withdraw")),
    # action (side-effect operations)
    (
        "action",
        (
            "process",
            "execute",
            "apply",
            "approve",
            "reject",
            "confirm",
            "accept",
            "send",
            "upload",
            "export",
        ),
    ),
)

# Maps leading verb in an RPC method name to a CRUD intent category.
_VERB_TO_INTENT: dict[str, str] = {
    verb: intent for intent, verbs in _INTENT_VERBS for verb in verbs
}

# Trailing tokens in method names that describe the *view*, not the resource.
_NAME_SUFFIXES: frozenset[str] = frozenset(
    (
        "list",
        "detail",
        "details",
        "info",
        "count",
        "excel",
        "popup",
        "summary",
        "check",
        "data",
        "total",
        "all",
        "page",
        "download",
    )
)

# Common DTO class-name suffixes that are not part of the resource identity.
_DTO_SUFFIXES: frozenset[str] = frozenset(
    (
        "request",
        "response",
        "dto",
        "entity",
        "info",
        "base",
        "api",
        "vo",
        "model",
        "form",
        "param",
        "result",
        "ml",
    )
)

# CRUD workflow rules: (source_intent, target_intent, relation, same_ctrl_conf, cross_ctrl_conf)
# ``None`` for cross_ctrl_conf means the rule is skipped across controllers.
_WORKFLOW_RULES: list[tuple[str, str, RelationType, float, float | None]] = [
    ("read", "write", RelationType.REQUIRES, 0.9, 0.8),
    ("update", "read", RelationType.REQUIRES, 0.85, 0.75),
    ("delete", "read", RelationType.REQUIRES, 0.85, 0.75),
    ("action", "read", RelationType.REQUIRES, 0.75, None),
]


def _same_controller(a: ToolSchema, b: ToolSchema) -> bool:
    """Return True if both tools belong to the same (non-empty) controller."""
    ctrl_a = a.metadata.get("controller") or ""
    ctrl_b = b.metadata.get("controller") or ""
    return ctrl_a == ctrl_b != ""


def _extract_verb_and_resource(name: str) -> tuple[str, str]:
    """Extract (verb, resource) from an RPC-style method name.

    ``getGoodsList`` → ``("get", "goods")``
    ``saveOptionCategoryList`` → ``("save", "optioncategory")``

    The verb is the first token found in ``_VERB_TO_INTENT``; the resource is
    the concatenation of the tokens after it, minus ``_NAME_SUFFIXES``. When
    no verb is found the verb is ``""`` and every token feeds the resource.
    """
    tokens = _normalize_name(name)
    if not tokens:
        return "", ""

    verb = ""
    resource_start = 0
    for i, tok in enumerate(tokens):
        if tok in _VERB_TO_INTENT:
            verb = tok
            resource_start = i + 1
            break

    resource = "".join(t for t in tokens[resource_start:] if t not in _NAME_SUFFIXES)
    return verb, resource


def _extract_dto_resource(type_name: str | None) -> str:
    """Extract the resource root from a DTO class name.

    ``GoodsMgmtApiResponse`` → ``goodsmgmt``
    ``ClaimTargetRequest`` → ``claimtarget``

    Returns ``""`` for a missing/empty type name.
    """
    if not type_name:
        return ""
    tokens = _normalize_name(type_name)
    return "".join(t for t in tokens if t not in _DTO_SUFFIXES)


def _detect_rpc_patterns(tools: list[ToolSchema]) -> list[DetectedRelation]:
    """Detect relations for RPC-style APIs (Layer 4).

    Handles non-RESTful endpoints (e.g. ``/v1/goods/goodsMgmtApi/getGoodsList``)
    where structural path analysis is ineffective.

    Two strategies:
    1. **Verb-resource grouping** — methods sharing the same resource token
       form CRUD workflows with controller-scoped confidence.
    2. **DTO type matching** — methods sharing a request/response type across
       controllers are marked COMPLEMENTARY.
    """
    relations: list[DetectedRelation] = []
    relations.extend(_detect_rpc_crud_workflows(tools))
    relations.extend(_detect_rpc_dto_links(tools))
    return relations


def _detect_rpc_crud_workflows(tools: list[ToolSchema]) -> list[DetectedRelation]:
    """Build CRUD workflow relations from verb-resource analysis."""
    relations: list[DetectedRelation] = []

    # Group tools by extracted resource token.
    resource_groups: dict[str, list[tuple[str, ToolSchema]]] = {}
    for tool in tools:
        verb, resource = _extract_verb_and_resource(tool.name)
        if verb and resource:
            resource_groups.setdefault(resource, []).append((verb, tool))

    for resource, members in resource_groups.items():
        if len(members) < 2:
            continue

        # Classify members by CRUD intent.
        by_intent: dict[str, list[ToolSchema]] = {}
        for verb, tool in members:
            intent = _VERB_TO_INTENT.get(verb, "other")
            by_intent.setdefault(intent, []).append(tool)

        # Apply workflow rules.
        for src_intent, tgt_intent, rel_type, same_conf, cross_conf in _WORKFLOW_RULES:
            for src in by_intent.get(src_intent, []):
                for tgt in by_intent.get(tgt_intent, []):
                    if src.name == tgt.name:
                        continue
                    same = _same_controller(src, tgt)
                    if not same and cross_conf is None:
                        # Rule is restricted to a single controller.
                        continue
                    relations.append(
                        DetectedRelation(
                            source=src.name,
                            target=tgt.name,
                            relation_type=rel_type,
                            confidence=same_conf if same else cross_conf,  # type: ignore[arg-type]
                            evidence=(
                                f"{src.name} ({src_intent}) → {tgt.name} ({tgt_intent})"
                                f" — resource '{resource}'"
                            ),
                            layer=4,
                        )
                    )

        # Readers within same controller are SIMILAR_TO.
        readers = by_intent.get("read", [])
        for i, r1 in enumerate(readers):
            for r2 in readers[i + 1 :]:
                if r1.name != r2.name and _same_controller(r1, r2):
                    relations.append(
                        DetectedRelation(
                            source=r1.name,
                            target=r2.name,
                            relation_type=RelationType.SIMILAR_TO,
                            confidence=0.8,
                            evidence=f"{r1.name} ↔ {r2.name} — similar reads for '{resource}'",
                            layer=4,
                        )
                    )

    return relations


def _detect_rpc_dto_links(tools: list[ToolSchema]) -> list[DetectedRelation]:
    """Link tools that share a DTO type across controllers (COMPLEMENTARY).

    Tools are grouped by the normalised root of their ``request_type`` and
    ``response_type`` metadata. Roots shorter than 4 characters are ignored,
    and only groups of 2..20 distinct entries produce links.
    """
    relations: list[DetectedRelation] = []

    # Group tools by normalised DTO resource name.
    dto_groups: dict[str, list[ToolSchema]] = {}
    for tool in tools:
        # A tool whose request and response DTOs normalise to the same root
        # (``GoodsRequest`` / ``GoodsResponse`` → ``goods``) must be listed
        # once: a double entry would inflate the group-size guard below and
        # emit every pair involving this tool twice.
        roots_seen: set[str] = set()
        for key in ("request_type", "response_type"):
            dto_res = _extract_dto_resource(tool.metadata.get(key))
            if len(dto_res) < 4 or dto_res in roots_seen:
                continue
            roots_seen.add(dto_res)
            dto_groups.setdefault(dto_res, []).append(tool)

    for dto_res, members in dto_groups.items():
        if not 2 <= len(members) <= 20:
            continue
        for i, a in enumerate(members):
            for b in members[i + 1 :]:
                if a.name != b.name and not _same_controller(a, b):
                    relations.append(
                        DetectedRelation(
                            source=a.name,
                            target=b.name,
                            relation_type=RelationType.COMPLEMENTARY,
                            confidence=0.75,
                            evidence=f"{a.name} ↔ {b.name} — shared DTO '{dto_res}'",
                            layer=4,
                        )
                    )

    return relations
read_only_hint=False, + destructive_hint=False, + idempotent_hint=False, + ), + "regist": MCPAnnotations( # 일부 코드베이스 약어 (regUser, registOrder) + read_only_hint=False, + destructive_hint=False, + idempotent_hint=False, + ), + "reg": MCPAnnotations( # camelCase 짧은 약어 (regGoodsApprove) + read_only_hint=False, + destructive_hint=False, + idempotent_hint=False, + ), # update verbs "update": MCPAnnotations( read_only_hint=False, diff --git a/graph_tool_call/execute/http_executor.py b/graph_tool_call/execute/http_executor.py index 32859fa..55e5126 100644 --- a/graph_tool_call/execute/http_executor.py +++ b/graph_tool_call/execute/http_executor.py @@ -77,7 +77,12 @@ def build_request( for k, v in path_params.items(): path = path.replace(f"{{{k}}}", urllib.parse.quote(str(v), safe="")) - url = f"{self._base_url}{path}" + # tool 자체 base_url(spec.servers 유래)이 있으면 그쪽 우선 — 한 컬렉션에 + # 다른 호스트(common/product/member 등)의 source가 섞여 있을 때 source별 + # 호스트로 라우팅한다. 없으면 executor 기본 base_url 사용. + tool_base = (metadata.get("base_url") or "").rstrip("/") + base = tool_base or self._base_url + url = f"{base}{path}" if query_params: url += "?" + urllib.parse.urlencode(query_params, doseq=True) diff --git a/graph_tool_call/graphify/__init__.py b/graph_tool_call/graphify/__init__.py new file mode 100644 index 0000000..98bbbce --- /dev/null +++ b/graph_tool_call/graphify/__init__.py @@ -0,0 +1,39 @@ +"""graphify-mode: deterministic edge extraction + zero-vector retrieval. + +Inspired by the graphify project (https://github.com/safishamsi/graphify). +The core idea: every edge carries a Confidence label, retrieval is a +keyword-seeded BFS over confidence-weighted edges, and the result is a +token-budgeted text rendering of the matched subgraph — no embeddings, +no wRRF fusion, no MMR reranking. + +Public API: + - ingest_openapi_graphify(schemas) -> (ToolGraph, edge_stats) + - retrieve_graphify(tg, query, ...) 
-> {results, subgraph_text, intent, stats} + - render_subgraph_text(tg, nodes, edges, budget) -> str +""" + +from graph_tool_call.graphify.ingest import ( + DEFAULT_CONF_AMBIGUOUS, + DEFAULT_CONF_EXTRACTED, + DEFAULT_CONF_INFERRED, + _apply_pair_hints, + bucket_confidence, + ingest_openapi_graphify, + preserve_refs_for_detection, +) +from graph_tool_call.graphify.retrieval import ( + render_subgraph_text, + retrieve_graphify, +) + +__all__ = [ + "DEFAULT_CONF_AMBIGUOUS", + "DEFAULT_CONF_EXTRACTED", + "DEFAULT_CONF_INFERRED", + "_apply_pair_hints", + "bucket_confidence", + "ingest_openapi_graphify", + "preserve_refs_for_detection", + "render_subgraph_text", + "retrieve_graphify", +] diff --git a/graph_tool_call/graphify/ingest.py b/graph_tool_call/graphify/ingest.py new file mode 100644 index 0000000..afa23f3 --- /dev/null +++ b/graph_tool_call/graphify/ingest.py @@ -0,0 +1,437 @@ +"""Deterministic ingest: ToolSchema list -> ToolGraph with confidence labels. + +Pipeline (no LLM, no embeddings): + 1. ``detect_dependencies`` runs all four layers (path-hierarchy, CRUD, + shared $ref, name/RPC/cross-resource) at threshold 0.0. + 2. Each ``DetectedRelation`` is bucketed by (layer, conf_score) into one of + EXTRACTED / INFERRED / AMBIGUOUS / dropped. + 3. Edges are added to a fresh ``ToolGraph`` with the bucket as ``confidence`` + attr, plus ``conf_score`` / ``layer`` / ``evidence`` for transparency. + 4. ``edge_stats`` summarises bucket counts, per-relation counts, and the + count of cross-source edges (different ``source_label`` on each end — + the key signal that adding a new source linked into the existing graph). 
# Thresholds — same numbers graphify uses for INFERRED vs AMBIGUOUS.
# EXTRACTED additionally requires layer == 1 (deterministic structural).
DEFAULT_CONF_EXTRACTED = 0.85
DEFAULT_CONF_INFERRED = 0.85
DEFAULT_CONF_AMBIGUOUS = 0.70


def bucket_confidence(
    layer: int,
    conf_score: float,
    *,
    extracted_min: float = DEFAULT_CONF_EXTRACTED,
    inferred_min: float = DEFAULT_CONF_INFERRED,
    ambiguous_min: float = DEFAULT_CONF_AMBIGUOUS,
) -> Confidence | None:
    """Map a detector ``(layer, conf_score)`` pair onto a Confidence label.

    The checks run in this order and the first match wins:

    1. ``layer == 1`` and ``conf_score >= extracted_min``  → EXTRACTED
    2. ``conf_score >= inferred_min``                      → INFERRED
    3. ``conf_score >= ambiguous_min``                     → AMBIGUOUS
    4. anything else                                       → ``None`` (dropped)
    """
    is_structural = layer == 1
    if is_structural and conf_score >= extracted_min:
        label = Confidence.EXTRACTED
    elif conf_score >= inferred_min:
        label = Confidence.INFERRED
    elif conf_score >= ambiguous_min:
        label = Confidence.AMBIGUOUS
    else:
        label = None
    return label


# ---------------------------------------------------------------------------
# $ref preservation
#
# Library ``ingest_openapi`` calls ``_resolve_refs`` which inlines every
# ``$ref`` pointer into its target schema. That makes life easier for runtime
# users (they get full schemas, no traversal needed) but it ERASES the signal
# ``_detect_shared_schemas`` relies on — that detector walks metadata looking
# for literal ``$ref`` strings to spot tools sharing a DTO.
#
# This helper rescans the raw spec, captures refs per operation BEFORE they're
# resolved, applies a frequency filter (drop common wrappers + singletons),
# and re-injects them as ``__refs__`` markers into each tool's metadata so
# ``_collect_refs`` finds them. Identical algorithm to xgen-workflow's
# ``swagger_tool_generator._collect_operation_refs``.
# ---------------------------------------------------------------------------

_HTTP_METHODS = ("get", "post", "put", "patch", "delete", "head", "options")


def _scan_refs(obj: Any) -> set[str]:
    """Recursively collect ``$ref`` pointer strings from a schema fragment.

    Dicts and lists are walked; any other value contributes nothing. A
    ``$ref`` key whose value is not a string is descended into rather than
    collected.
    """
    refs: set[str] = set()
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == "$ref" and isinstance(v, str):
                refs.add(v)
            else:
                refs.update(_scan_refs(v))
    elif isinstance(obj, list):
        for item in obj:
            refs.update(_scan_refs(item))
    return refs


def preserve_refs_for_detection(
    tools: list[ToolSchema],
    raw_spec: dict[str, Any],
    *,
    min_freq: int = 2,
    max_freq_ratio: float = 0.3,
) -> int:
    """Inject ``__refs__`` markers into tool metadata so shared-schema detection fires.

    Walk ``raw_spec`` BEFORE resolve, find $refs per operation, filter to the
    "domain DTO" sweet spot, and re-inject them into each tool's
    ``metadata.response_schema.__refs__`` and ``metadata.request_body_refs``.

    A ref is kept when it is referenced by at least ``min_freq`` operations
    and by at most ``max(min_freq, int(N * max_freq_ratio))`` operations,
    where ``N`` is the number of operations that carry at least one ``$ref``
    (operations without refs are not counted).

    Why filter:
      - Common wrappers like ``ApiResponse`` show up in nearly every operation;
        leaving them in produces a fully-connected COMPLEMENTARY graph (noise).
      - Singletons show up once and can't form edges anyway.

    Tools are matched to operations by lower-cased ``metadata.method`` and
    exact ``metadata.path``.

    Returns the number of tools whose metadata actually received markers.
    Mutates ``tools`` in place.
    """
    paths = raw_spec.get("paths") or {}
    if not isinstance(paths, dict):
        return 0

    # (method, path) -> (request refs, response refs), plus per-ref operation frequency.
    raw_per_op: dict[tuple[str, str], tuple[set[str], set[str]]] = {}
    freq: Counter[str] = Counter()

    for path, item in paths.items():
        if not isinstance(item, dict):
            continue
        for method in _HTTP_METHODS:
            op = item.get(method)
            if not isinstance(op, dict):
                continue
            req = _scan_refs(op.get("requestBody")) | _scan_refs(op.get("parameters"))
            resp = _scan_refs(op.get("responses"))
            if not (req or resp):
                continue
            raw_per_op[(method, path)] = (req, resp)
            # Union first so a ref used in both request and response of one
            # operation counts once for that operation.
            freq.update(req | resp)

    if not raw_per_op:
        return 0

    total_ops = len(raw_per_op)
    ceiling = max(min_freq, int(total_ops * max_freq_ratio))

    def _useful(r: str) -> bool:
        return min_freq <= freq[r] <= ceiling

    op_refs: dict[tuple[str, str], tuple[list[str], list[str]]] = {}
    for k, (req, resp) in raw_per_op.items():
        rq = sorted(r for r in req if _useful(r))
        rp = sorted(r for r in resp if _useful(r))
        if rq or rp:
            op_refs[k] = (rq, rp)

    updated = 0
    for tool in tools:
        md = tool.metadata or {}
        method = str(md.get("method") or "").lower()
        path = str(md.get("path") or "")
        refs = op_refs.get((method, path))
        if not refs:
            continue
        rq, rp = refs
        changed = False
        if rp:
            rs = md.get("response_schema") or {}
            # A non-dict response_schema cannot carry the marker; leave it alone.
            if isinstance(rs, dict):
                rs = dict(rs)  # copy so a schema dict shared between tools is not mutated
                rs["__refs__"] = [{"$ref": r} for r in rp]
                md["response_schema"] = rs
                changed = True
        if rq:
            md["request_body_refs"] = [{"$ref": r} for r in rq]
            changed = True
        tool.metadata = md
        if changed:
            # Count only tools that really received a marker, so the return
            # value matches its documented meaning.
            updated += 1

    return updated
# ---------------------------------------------------------------------------
# ai_metadata.pairs_well_with → graphify edge derivation
#
# ``ai_metadata`` is the source-of-truth (LLM Pass 2 fills it; the operator
# can hand-edit it via ToolGraphView). On every rebuild we derive the
# corresponding workflow edges into the graphify graph so ``_find_producer``
# can score them as a first-class signal — no separate lookup, no two-system
# sync drift. The frontend keeps reading ``ai_metadata.pairs_well_with``
# directly (single read path, no UI churn).
#
# Confidence mapping reflects the trust we place in each source:
#   PairHint.source == "manual" → EXTRACTED   (operator deliberately curated)
#   PairHint.source == "auto"   → INFERRED    (LLM Pass 2 high-confidence)
#   anything else / missing     → INFERRED    (legacy entries default safe)
#
# Layer is set to 2 because pair hints are not structural (path/$ref/CRUD)
# even when curated — they encode workflow semantics, which sits one level
# above structural inference in the graphify confidence model.
# ---------------------------------------------------------------------------


def _derive_pair_edges(
    tg: ToolGraph,
    schemas: list[ToolSchema],
) -> tuple[dict[str, int], list[tuple[str, str]]]:
    """Add pair-hint edges to *tg* and report what happened.

    Returns ``(stats, created)`` where ``stats`` has the same keys
    ``_apply_pair_hints`` returns and ``created`` lists the ``(source, target)``
    pairs for which no edge existed before this call. Manual overrides of an
    existing edge are counted in ``stats["manual"]`` but are not in ``created``.
    """
    stats = {
        "manual": 0,
        "auto": 0,
        "skipped_target_missing": 0,
        "skipped_self": 0,
        "skipped_existing_structural": 0,
    }
    created: list[tuple[str, str]] = []
    tool_names = set(tg.tools.keys())

    for s in schemas:
        ai = (s.metadata or {}).get("ai_metadata") or {}
        pairs = ai.get("pairs_well_with") or []
        if not isinstance(pairs, list):
            continue
        for p in pairs:
            if not isinstance(p, dict):
                continue
            target = str(p.get("tool") or "").strip()
            if not target:
                continue
            if target == s.name:
                stats["skipped_self"] += 1
                continue
            if target not in tool_names:
                stats["skipped_target_missing"] += 1
                continue

            source = str(p.get("source") or "auto").strip().lower()
            is_manual = source == "manual"
            confidence = Confidence.EXTRACTED if is_manual else Confidence.INFERRED
            reason = str(p.get("reason") or "")[:200]

            # Existing-edge policy: if an edge is already present we keep it
            # unless the operator is overriding.
            had_edge = tg.graph.has_edge(s.name, target)
            if had_edge and not is_manual:
                stats["skipped_existing_structural"] += 1
                continue

            try:
                tg.add_relation(
                    s.name,
                    target,
                    RelationType.COMPLEMENTARY,
                    confidence=confidence,
                    layer=2,
                    evidence=f"pair[{source}]: {reason}" if reason else f"pair[{source}]",
                )
            except (KeyError, ValueError):
                stats["skipped_target_missing"] += 1
                continue

            stats["manual" if is_manual else "auto"] += 1
            if not had_edge:
                created.append((s.name, target))

    return stats, created


def _apply_pair_hints(
    tg: ToolGraph,
    schemas: list[ToolSchema],
) -> dict[str, int]:
    """Convert ``metadata.ai_metadata.pairs_well_with`` into graphify edges.

    Skips pairs whose target tool isn't in the current graph (cross-source
    enrichment can list pairs that haven't been ingested yet) and self-pairs.
    Skips when the same (src, tgt) pair already carries an edge UNLESS the
    new pair is operator-curated (``source="manual"``) — operator intent
    overrides automatic detection.

    Returns counters keyed ``manual``, ``auto``, ``skipped_target_missing``,
    ``skipped_self`` and ``skipped_existing_structural``.
    """
    stats, _created = _derive_pair_edges(tg, schemas)
    return stats


def _source_label(schema: ToolSchema) -> str:
    """Return the source label that distinguishes which OpenAPI spec a tool came from.

    Uses ``metadata.source_label`` when set (e.g. "order", "claim").
    Otherwise falls back to the first static path segment so cross-source
    detection still works for callers that do not tag their tools. Returns
    ``""`` when neither is available.
    """
    md = schema.metadata or {}
    label = md.get("source_label")
    if label:
        return str(label)
    path = str(md.get("path") or "")
    segs = [s for s in path.split("/") if s and not s.startswith("{")]
    return segs[0] if segs else ""


def ingest_openapi_graphify(
    schemas: list[ToolSchema],
    *,
    extracted_min: float = DEFAULT_CONF_EXTRACTED,
    inferred_min: float = DEFAULT_CONF_INFERRED,
    ambiguous_min: float = DEFAULT_CONF_AMBIGUOUS,
    spec: dict[str, Any] | None = None,
    raw_spec: dict[str, Any] | None = None,
) -> tuple[ToolGraph, dict[str, Any]]:
    """Build a graphify-style ToolGraph from a list of ToolSchemas.

    Parameters
    ----------
    schemas:
        Tools to ingest. Pre-existing ``metadata.source_label`` enables
        cross-source edge tracking.
    extracted_min / inferred_min / ambiguous_min:
        Confidence bucket thresholds (see ``bucket_confidence``).
    spec:
        Optional normalized spec dict, forwarded to ``detect_dependencies``.
    raw_spec:
        Optional ORIGINAL OpenAPI/Swagger spec dict (BEFORE $ref resolution).
        When supplied, ``preserve_refs_for_detection`` runs first so the
        shared-schema detector can fire on heavily $ref-using specs. Callers
        who already bake refs into tool metadata can leave this None.

    Returns
    -------
    (ToolGraph, edge_stats):
        ``edge_stats`` keys:
          EXTRACTED, INFERRED, AMBIGUOUS, dropped: int counts
          by_relation: {relation_value: int}
          cross_source: int  (edges whose endpoints have different labels)
          tool_count, edge_count: int
          refs_preserved: int  (tools touched by preserve_refs_for_detection)
          pair_edges: dict  (``_apply_pair_hints`` counters; only present when
                             at least two schemas were supplied)
    """
    tg = ToolGraph()
    for s in schemas:
        tg.add_tool(s)

    label_by_name = {s.name: _source_label(s) for s in schemas}

    def _is_cross_source(src: str, tgt: str) -> bool:
        src_label = label_by_name.get(src, "")
        tgt_label = label_by_name.get(tgt, "")
        return bool(src_label and tgt_label and src_label != tgt_label)

    stats: dict[str, Any] = {
        "EXTRACTED": 0,
        "INFERRED": 0,
        "AMBIGUOUS": 0,
        "dropped": 0,
        "by_relation": {},
        "cross_source": 0,
        "tool_count": len(schemas),
        "edge_count": 0,
        "refs_preserved": 0,
    }

    if len(schemas) < 2:
        return tg, stats

    # Optional: rescue layer-1 shared-schema signal that ingest_openapi inlined.
    if raw_spec is not None:
        stats["refs_preserved"] = preserve_refs_for_detection(schemas, raw_spec)

    # min_confidence=0.0 so we see every candidate; we re-bucket here.
    relations: list[DetectedRelation] = detect_dependencies(schemas, spec, min_confidence=0.0)

    seen: set[tuple[str, str, str]] = set()  # (src, tgt, relation_value)
    for rel in relations:
        bucket = bucket_confidence(
            rel.layer,
            rel.confidence,
            extracted_min=extracted_min,
            inferred_min=inferred_min,
            ambiguous_min=ambiguous_min,
        )
        if bucket is None:
            stats["dropped"] += 1
            continue

        rel_value = (
            rel.relation_type.value
            if hasattr(rel.relation_type, "value")
            else str(rel.relation_type)
        )
        key = (rel.source, rel.target, rel_value)
        if key in seen:
            # detect_dependencies already de-duplicates, but be defensive.
            continue
        seen.add(key)

        try:
            tg.add_relation(
                rel.source,
                rel.target,
                rel.relation_type,
                confidence=bucket,
                conf_score=rel.confidence,
                layer=rel.layer,
                evidence=rel.evidence,
            )
        except (KeyError, ValueError):
            # Endpoint not in graph (shouldn't happen — tools were just added) — skip.
            stats["dropped"] += 1
            continue

        stats[bucket.value] += 1
        stats["by_relation"][rel_value] = stats["by_relation"].get(rel_value, 0) + 1
        if _is_cross_source(rel.source, rel.target):
            stats["cross_source"] += 1

    # Derive workflow edges from ai_metadata.pairs_well_with — single
    # source-of-truth lives on each tool's metadata, edges are regenerated
    # on every rebuild so operator/LLM curation flows in automatically.
    pair_stats, new_pair_edges = _derive_pair_edges(tg, schemas)
    stats["pair_edges"] = pair_stats

    # Roll the pair edges into the global confidence/by_relation counters.
    # NOTE(review): a manual hint that overrides an already-counted edge is
    # added here as well, so these counters can exceed ``edge_count`` — confirm
    # whether the overridden edge's earlier bucket should be decremented.
    manual_count = pair_stats.get("manual", 0)
    auto_count = pair_stats.get("auto", 0)
    stats["EXTRACTED"] += manual_count
    stats["INFERRED"] += auto_count
    if manual_count or auto_count:
        stats["by_relation"]["complementary"] = (
            stats["by_relation"].get("complementary", 0) + manual_count + auto_count
        )

    # cross_source counts edges, so only pair hints that created a NEW edge
    # contribute. Skipped hints and manual relabels of an existing edge were
    # either never added or already counted in the loop above.
    stats["cross_source"] += sum(1 for src, tgt in new_pair_edges if _is_cross_source(src, tgt))

    stats["edge_count"] = tg.graph.edge_count()
    return tg, stats
seed = top-5 of BM25(query) (substring fallback if BM25 returns empty) + 2. weights = INTENT_RELATION_WEIGHTS[dominant_intent] or DEFAULT + 3. score = rel_weight[rel] * CONF_FACTOR[confidence] * decay(depth) + CONF_FACTOR = {EXTRACTED: 1.0, INFERRED: 0.7, AMBIGUOUS: 0.4, None: 0.5} + decay(d) = 1 / (0.5*d + 1) + 4. BFS from seeds, depth=2, accumulate max score per neighbour + 5. history-aware demote (used tools * 0.6) + 6. render_subgraph_text(top_k nodes + edges, token_budget) + +Why this works without embeddings: + - The graph carries the semantic signal (CRUD chains, $ref data flow, + cross-resource matches) — once a relationship is in the graph, traversal + finds it. + - Confidence labels let the score down-weight guesses without dropping them; + AMBIGUOUS edges still appear, just behind EXTRACTED ones. + - Token-budgeted rendering means an LLM gets a compact, structured context + (not a list of tool JSON blobs) and can decide chains via the EDGE lines. +""" + +from __future__ import annotations + +import re +import unicodedata +from typing import Any + +from graph_tool_call.core.protocol import GraphEngine +from graph_tool_call.core.tool import ToolSchema +from graph_tool_call.ontology.schema import ( + DEFAULT_RELATION_WEIGHTS, + INTENT_RELATION_WEIGHTS, + NodeType, + RelationType, +) +from graph_tool_call.retrieval.intent import classify_intent +from graph_tool_call.tool_graph import ToolGraph + +# Score multiplier per confidence bucket. EXTRACTED edges are deterministic +# (path/CRUD/$ref) and trusted at 1.0; INFERRED is heuristic but still +# high-confidence; AMBIGUOUS gets a strong penalty so it's surfaced for +# review without dominating EXTRACTED chains. +# +# Edges added by callers without a confidence attr (e.g. legacy code paths) +# get the same weight as the no-bucket fallback (0.5) — neither rewarded +# nor heavily penalised. 
CONF_FACTOR: dict[str | None, float] = {
    "EXTRACTED": 1.0,
    "INFERRED": 0.7,
    "AMBIGUOUS": 0.4,
    None: 0.5,
}

_DEFAULT_DEPTH = 2
_DEFAULT_TOP_K = 10
_DEFAULT_BUDGET = 2000
_HISTORY_DEMOTE = 0.6

# Query-term separators for the substring fallback; compiled once at import.
_TERM_SPLIT_RE = re.compile(r"[\s_\-/.,;:!?()]+")


# ---------------------------------------------------------------------------
# Seed selection
# ---------------------------------------------------------------------------


def _strip_diacritics(text: str) -> str:
    """Return *text* NFKD-decomposed with all combining marks removed."""
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c))


def _substring_seeds(
    tools: dict[str, ToolSchema],
    query: str,
    *,
    limit: int = 5,
) -> list[tuple[str, float]]:
    """Substring fallback when BM25 returns no hits (very short or non-Latin queries).

    Each query term longer than one character scores 1.0 when it occurs in the
    tool name and 0.5 when it occurs in the description (case- and
    diacritic-insensitive). Returns up to ``limit`` ``(name, score)`` pairs
    with a positive score, best first.
    """
    q = _strip_diacritics(query).lower()
    terms = [t for t in _TERM_SPLIT_RE.split(q) if t and len(t) > 1]
    scored: list[tuple[str, float]] = []
    for name, tool in tools.items():
        nname = _strip_diacritics(name).lower()
        ndesc = _strip_diacritics(tool.description or "").lower()
        score = sum(1.0 for t in terms if t in nname) + 0.5 * sum(1.0 for t in terms if t in ndesc)
        if score > 0:
            scored.append((name, score))
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:limit]


def _bm25_seeds(tg: ToolGraph, query: str, *, limit: int = 5) -> list[tuple[str, float]]:
    """Top-N BM25 hits as seeds, taken from the graph's retrieval engine.

    Returns ``[]`` when the engine or its BM25 index cannot be obtained, or
    when BM25 yields no scores, so the caller can fall back to substring seeds.
    """
    try:
        engine = tg._get_retrieval_engine()  # noqa: SLF001
        bm25 = engine._get_bm25()  # noqa: SLF001
    except Exception:
        # Deliberate best-effort: any failure to reach the index means
        # "no BM25 seeds", not a retrieval error.
        return []
    scores = bm25.score(query) or {}
    if not scores:
        return []
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [(name, score) for name, score in ranked[:limit]]


def _select_seeds(
    tg: ToolGraph,
    query: str,
    *,
    limit: int = 5,
) -> list[tuple[str, float]]:
    """Return BM25 seeds, or substring-matched seeds when BM25 finds nothing."""
    seeds = _bm25_seeds(tg, query, limit=limit)
    if seeds:
        return seeds
    return _substring_seeds(tg.tools, query, limit=limit)


# ---------------------------------------------------------------------------
# BFS traversal
# ---------------------------------------------------------------------------


def _intent_weights(query: str) -> tuple[dict[str, float], str]:
    """Pick relation weights based on dominant query intent.

    Returns (weights_map, dominant_label) where label is one of
    'read'/'write'/'delete'/'neutral'. The default weights and 'neutral' are
    returned when the intent is neutral or the strongest dimension is
    below 0.5.
    """
    intent = classify_intent(query)
    if intent.is_neutral:
        return DEFAULT_RELATION_WEIGHTS, "neutral"
    by_dim = {
        "read": intent.read_intent,
        "write": intent.write_intent,
        "delete": intent.delete_intent,
    }
    dominant = max(by_dim, key=lambda k: by_dim[k])
    if by_dim[dominant] < 0.5:
        return DEFAULT_RELATION_WEIGHTS, "neutral"
    weights = INTENT_RELATION_WEIGHTS.get(dominant, DEFAULT_RELATION_WEIGHTS)
    return weights, dominant


def _normalize_relation_key(rel: Any) -> Any:
    """Relation weights are keyed by RelationType. Normalize string attrs to enum.

    Values that are not a known relation string are returned unchanged.
    """
    if isinstance(rel, RelationType):
        return rel
    if isinstance(rel, str):
        try:
            return RelationType(rel)
        except ValueError:
            return rel
    return rel


def _bfs_from_seeds(
    graph: GraphEngine,
    seed_scores: list[tuple[str, float]],
    *,
    depth: int,
    rel_weights: dict[str, float],
) -> tuple[dict[str, float], list[tuple[str, str]]]:
    """Confidence-weighted BFS. Returns (scores, edges_visited).

    Score policy:
      seeds: normalized BM25 score (top seed = 1.0, others scaled)
      neighbour at depth d via edge of weight w and confidence c:
        score(neighbour) = max(prev, parent_score * w * CONF_FACTOR[c] * 1/(0.5*d + 1))

    Why normalize seeds: if all 5 BM25 hits got flat 1.0, top-K shows them in
    arbitrary order with identical scores and BFS-found neighbours never compete.
    Scaling by ``score / max_seed_score`` preserves BM25's relative ranking and
    lets a strongly-matching seed lift its 1-hop neighbours above weakly-matching
    sibling seeds.

    A node's score is fixed once its depth level is complete: while a level is
    being expanded, every frontier edge that reaches the node competes and the
    best one wins. Nodes settled at an earlier depth (seeds included) are never
    rescored. ``edges_visited`` holds, without duplicates, each edge that
    discovered a tool node or raised its score.

    Tool nodes are scored; CATEGORY/DOMAIN nodes are passthrough so sibling
    tools can be reached on the next hop.
    """
    if not seed_scores:
        return {}, []

    max_seed = max((s for _, s in seed_scores), default=1.0) or 1.0
    scores: dict[str, float] = {n: s / max_seed for n, s in seed_scores if graph.has_node(n)}
    settled: set[str] = set(scores)  # nodes whose depth level is finished
    frontier: list[str] = list(scores)
    edges_visited: list[tuple[str, str]] = []
    recorded: set[tuple[str, str]] = set()

    for d in range(1, depth + 1):
        decay = 1.0 / (0.5 * d + 1)
        discovered: set[str] = set()  # nodes first reached at this depth
        next_frontier: list[str] = []
        for node in frontier:
            parent_score = scores.get(node, 0.0)
            try:
                edges = graph.get_edges_from(node, direction="both")
            except (KeyError, ValueError):
                continue
            for src, tgt, attrs in edges:
                neighbour = tgt if src == node else src
                if neighbour in settled:
                    continue
                neighbour_type = graph.get_node_attrs(neighbour).get("node_type")

                if neighbour_type == NodeType.TOOL:
                    rel_key = _normalize_relation_key(attrs.get("relation"))
                    rel_w = rel_weights.get(rel_key, 0.3)
                    conf_factor = CONF_FACTOR.get(attrs.get("confidence"), CONF_FACTOR[None])
                    # Propagate the parent's score so a high-BM25 seed lifts
                    # its neighbours more than a low-BM25 seed does.
                    candidate = parent_score * rel_w * conf_factor * decay
                    first_time = neighbour not in discovered
                    best = max(scores.get(neighbour, 0.0), candidate)
                    improved = first_time or best > scores[neighbour]
                    scores[neighbour] = best
                    if improved and (src, tgt) not in recorded:
                        recorded.add((src, tgt))
                        edges_visited.append((src, tgt))
                    if first_time:
                        # Keep the node open until the level ends so a later,
                        # stronger edge from another frontier node can still
                        # raise its score.
                        discovered.add(neighbour)
                        next_frontier.append(neighbour)
                elif neighbour_type in (NodeType.CATEGORY, NodeType.DOMAIN):
                    # Passthrough — visit but don't score; lets BFS reach
                    # sibling tools via CATEGORY hubs without inflating scores.
                    if neighbour not in discovered:
                        discovered.add(neighbour)
                        next_frontier.append(neighbour)
        settled |= discovered
        frontier = next_frontier
        if not frontier:
            break

    return scores, edges_visited


# ---------------------------------------------------------------------------
# Subgraph rendering
# ---------------------------------------------------------------------------


def _node_line(name: str, tool: ToolSchema | None, attrs: dict) -> str:
    """One NODE line for the subgraph text rendering.

    Format: ``NODE <name> [<METHOD> <path>] [source=<label>] [community=<id>]``;
    each bracketed part is emitted only when its value is available.
    """
    md = (tool.metadata if tool else {}) or {}
    method = str(md.get("method") or "").upper()
    path = str(md.get("path") or "")
    src_label = str(md.get("source_label") or "")
    community = attrs.get("community")
    parts = [name]
    if method or path:
        # Strip INSIDE the brackets so a missing method or path does not
        # leave a stray space ("[ /pets]" / "[GET ]").
        parts.append(f"[{f'{method} {path}'.strip()}]")
    if src_label:
        parts.append(f"[source={src_label}]")
    if community is not None:
        parts.append(f"[community={community}]")
    return "NODE " + " ".join(p for p in parts if p)
confidence in [], evidence in (...).""" + rel = attrs.get("relation") + rel_str = rel.value if hasattr(rel, "value") else str(rel) + conf = attrs.get("confidence", "") + conf_str = f" [{conf}]" if conf else "" + line = f"EDGE {u} --{rel_str}{conf_str}--> {v}" + evidence = attrs.get("evidence") + if evidence: + line += f" ({evidence})" + return line + + +def render_subgraph_text( + tg: ToolGraph, + nodes: set[str] | list[str], + edges: list[tuple[str, str]] | None = None, + *, + token_budget: int = _DEFAULT_BUDGET, + sort_by_score: dict[str, float] | None = None, +) -> str: + """Render the matched subgraph as ``NODE ...`` / ``EDGE ...`` lines. + + Approx 3 chars per token is the budget conversion. When the rendering + overflows the budget, the tail is cut and a ``... (truncated)`` line + is appended. + + sort_by_score: if provided, NODE lines are emitted in descending score + order so the LLM sees the most relevant tools first. + + edges: optional hint listing edges visited during BFS — purely for + ordering. Whether or not this is supplied, ALL graph edges between any + pair of chosen nodes are emitted so the LLM sees the full local + structure (matching graphify's behaviour). + """ + char_budget = token_budget * 3 + node_set: set[str] = set(nodes) + + # Order nodes: by retrieval score (desc) if known, else by name. + if sort_by_score: + node_order = sorted(node_set, key=lambda n: (-sort_by_score.get(n, 0.0), n)) + else: + node_order = sorted(node_set) + + lines: list[str] = [] + for n in node_order: + if not tg.graph.has_node(n): + continue + attrs = tg.graph.get_node_attrs(n) + tool = tg.tools.get(n) + lines.append(_node_line(n, tool, attrs)) + + # Walk all graph edges between chosen nodes (not just BFS visited ones) + # so the LLM gets the complete local structure. BFS-visited edges naturally + # come first when we sort, ensuring no surprise gaps. 
+ seen_edges: set[tuple[str, str]] = set() + edge_lines: list[str] = [] + for u in node_order: + if not tg.graph.has_node(u): + continue + try: + outgoing = tg.graph.get_edges_from(u, direction="out") + except (KeyError, ValueError): + continue + for src, tgt, attrs in outgoing: + if tgt not in node_set: + continue + key = (src, tgt) + if key in seen_edges: + continue + seen_edges.add(key) + edge_lines.append(_edge_line(src, tgt, attrs)) + + lines.extend(edge_lines) + + output = "\n".join(lines) + if len(output) > char_budget: + # Cut at the last newline that fits, then append a marker. Keep the + # marker even if it pushes us slightly over the char budget — the + # token budget is a soft cap. + cut = output[:char_budget].rsplit("\n", 1)[0] + output = cut + f"\n... (truncated to ~{token_budget} token budget)" + return output + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def retrieve_graphify( + tg: ToolGraph, + query: str, + *, + top_k: int = _DEFAULT_TOP_K, + depth: int = _DEFAULT_DEPTH, + token_budget: int = _DEFAULT_BUDGET, + history: list[str] | None = None, +) -> dict[str, Any]: + """Retrieve tools for a natural-language query using graph traversal only. + + Parameters + ---------- + tg: + A graphify-style ``ToolGraph``. Edges should carry ``confidence`` + attrs (EXTRACTED/INFERRED/AMBIGUOUS); edges without one get the + neutral 0.5 multiplier. + query: + Natural-language search. + top_k: + Maximum tools in the result set (and the rendered subgraph). + depth: + BFS depth from seeds. 2 is graphify's default and works for most + workflow chains (createX -> getX -> doSomethingWithX). + token_budget: + Char-budget for the rendered text (~3 chars/token). + history: + Tool names already called in this session — they are demoted (×0.6) + to encourage progress through a workflow rather than re-suggesting. 
+ + Returns + ------- + dict with keys: + - results: list of {name, score, tool: {...}} sorted desc. + - subgraph_text: the LLM-ready NODE/EDGE rendering. + - intent: {dominant: 'read'|'write'|'delete'|'neutral', read, write, delete} + - stats: {seeds: [...], visited_nodes: int, visited_edges: int} + + Note: prerequisite chain construction (e.g. listOrders → getOrder → cancelOrder) + is NOT this function's job — it lives in Stage 2 ``synthesize_plan`` which + consumes the graph this module produces. retrieve_graphify only finds the + primary candidates; chain assembly is downstream. + """ + if not query or not tg.tools: + return { + "results": [], + "subgraph_text": "", + "intent": {"dominant": "neutral", "read": 0.0, "write": 0.0, "delete": 0.0}, + "stats": {"seeds": [], "visited_nodes": 0, "visited_edges": 0}, + } + + # 1) Seeds + seeds_with_scores = _select_seeds(tg, query, limit=5) + seed_names = [s for s, _ in seeds_with_scores] + + if not seed_names: + return { + "results": [], + "subgraph_text": "", + "intent": {"dominant": "neutral", "read": 0.0, "write": 0.0, "delete": 0.0}, + "stats": {"seeds": [], "visited_nodes": 0, "visited_edges": 0}, + } + + # 2) Intent → relation weight map + rel_weights, dominant = _intent_weights(query) + from graph_tool_call.retrieval.intent import classify_intent # noqa: I001 (re-import OK) + + intent_obj = classify_intent(query) + + # 3) BFS — pass full (name, score) pairs so seed scores reflect BM25 ranking + scores, edges_visited = _bfs_from_seeds( + tg.graph, + seeds_with_scores, + depth=depth, + rel_weights=rel_weights, + ) + + # 4) History demote + if history: + for h in history: + if h in scores: + scores[h] *= _HISTORY_DEMOTE + + # 5) Filter to TOOL nodes only and rank + tool_scores: dict[str, float] = {n: s for n, s in scores.items() if n in tg.tools} + ranked = sorted(tool_scores.items(), key=lambda x: x[1], reverse=True)[:top_k] + chosen_names: set[str] = {n for n, _ in ranked} + + # 6) Render + subgraph_text = 
render_subgraph_text( + tg, + chosen_names, + edges_visited, + token_budget=token_budget, + sort_by_score=tool_scores, + ) + + results = [ + { + "name": name, + "score": round(score, 4), + "tool": tg.tools[name].to_dict() if name in tg.tools else None, + } + for name, score in ranked + ] + + return { + "results": results, + "subgraph_text": subgraph_text, + "intent": { + "dominant": dominant, + "read": round(intent_obj.read_intent, 3), + "write": round(intent_obj.write_intent, 3), + "delete": round(intent_obj.delete_intent, 3), + }, + "stats": { + "seeds": seed_names, + "visited_nodes": len(scores), + "visited_edges": len(edges_visited), + }, + } diff --git a/graph_tool_call/ingest/io_contract.py b/graph_tool_call/ingest/io_contract.py new file mode 100644 index 0000000..90bf308 --- /dev/null +++ b/graph_tool_call/ingest/io_contract.py @@ -0,0 +1,349 @@ +"""Field-level IO contract extraction from OpenAPI / Swagger schemas. + +Used by L0 Knowledge Base — **Pass 1, deterministic**. Walks request and +response schemas and emits leaf field descriptors with JsonPath. The output +feeds: + + - Tool Graph: produces × consumes field-name match → ``produces_for`` edge + - Pass 2 enrichment: provides field list to LLM for ``semantic_tag`` assign + - Stage 3 Runner: bindings reference these json_paths + +This module assumes the input schema is **already $ref-resolved** (caller +runs ``_resolve_refs`` from ``graph_tool_call.ingest.openapi``). +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class FieldLeaf: + """A leaf field extracted from a JSON Schema. + + ``json_path`` is the dotted JSONPath from the schema root, with ``[*]`` + used as the array wildcard (for produces). For consumes, callers usually + flatten to ``field_name`` since binding keys by name not path. 
+ """ + + json_path: str + field_name: str + field_type: str + required: bool = False + description: str = "" + enum: list[Any] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Schema walker +# --------------------------------------------------------------------------- + + +_DEFAULT_MAX_DEPTH = 8 + + +def extract_leaves( + schema: Any, + *, + base_path: str = "$", + parent_required: bool = False, + max_depth: int = _DEFAULT_MAX_DEPTH, + _depth: int = 0, +) -> list[FieldLeaf]: + """Recursively walk a JSON Schema, emitting leaf field info. + + Parameters + ---------- + schema: + JSON Schema dict (already $ref-resolved). + base_path: + Starting JSONPath for this subtree (e.g. ``$``, ``$.body``). + parent_required: + Whether the containing field is required by its parent. Propagated to + leaves so the caller can filter ``required-only`` consumes. + max_depth: + Hard recursion limit. Cyclic schemas or pathological nesting stop here. + + Returns + ------- + list[FieldLeaf] + One entry per primitive (or array-of-primitive) leaf reachable. 
+ """ + if not isinstance(schema, dict) or _depth > max_depth: + return [] + + schema = _resolve_combinators(schema) + + schema_type = _normalize_type(schema.get("type")) + + # Object: walk properties + if schema_type == "object" or "properties" in schema: + return _walk_object(schema, base_path, max_depth, _depth) + + # Array: walk items with [*] suffix + if schema_type == "array": + items = schema.get("items") or {} + return extract_leaves( + items, + base_path=f"{base_path}[*]", + parent_required=parent_required, + max_depth=max_depth, + _depth=_depth + 1, + ) + + # Primitive: emit a single leaf using the trailing path segment as name + field_name = _last_path_segment(base_path) + if not field_name: + # At root with no parent name — nothing useful to emit + return [] + return [ + FieldLeaf( + json_path=base_path, + field_name=field_name, + field_type=schema_type or "string", + required=parent_required, + description=str(schema.get("description") or "")[:200], + enum=list(schema.get("enum") or []), + ) + ] + + +def _walk_object( + schema: dict[str, Any], + base_path: str, + max_depth: int, + depth: int, +) -> list[FieldLeaf]: + leaves: list[FieldLeaf] = [] + properties = schema.get("properties") or {} + if not isinstance(properties, dict): + return leaves + required_set = set(schema.get("required") or []) + + for prop_name, prop_schema in properties.items(): + child_path = f"{base_path}.{prop_name}" + is_required = prop_name in required_set + child_leaves = extract_leaves( + prop_schema, + base_path=child_path, + parent_required=is_required, + max_depth=max_depth, + _depth=depth + 1, + ) + if child_leaves: + leaves.extend(child_leaves) + else: + # Object/array with no resolvable children — keep as a generic leaf + # so downstream knows the field exists (e.g. opaque additionalProps). 
+ leaves.append( + FieldLeaf( + json_path=child_path, + field_name=prop_name, + field_type=_schema_type(prop_schema) or "object", + required=is_required, + description=( + str(prop_schema.get("description") or "")[:200] + if isinstance(prop_schema, dict) + else "" + ), + ) + ) + return leaves + + +def _resolve_combinators(schema: dict[str, Any]) -> dict[str, Any]: + """Flatten ``allOf`` / pick first ``oneOf`` / ``anyOf``. + + v1 strategy: best-effort. Doesn't handle JSON Schema combinator semantics + fully — sufficient to surface field shapes for our planning use. + """ + if "allOf" in schema and isinstance(schema["allOf"], list): + merged_props: dict[str, Any] = dict(schema.get("properties") or {}) + merged_required: list[str] = list(schema.get("required") or []) + for sub in schema["allOf"]: + if not isinstance(sub, dict): + continue + merged_props.update(sub.get("properties") or {}) + for r in sub.get("required") or []: + if r not in merged_required: + merged_required.append(r) + out = dict(schema) + out["type"] = "object" + out["properties"] = merged_props + out["required"] = merged_required + return out + + for key in ("oneOf", "anyOf"): + candidates = schema.get(key) + if isinstance(candidates, list) and candidates: + first = next((c for c in candidates if isinstance(c, dict)), None) + if first is not None: + # Merge the candidate as a base, parent fields override + base = dict(first) + base.update({k: v for k, v in schema.items() if k != key}) + return base + return schema + + +def _normalize_type(t: Any) -> str: + """JSON Schema 'type' can be str or list. 
Pick first non-null.""" + if isinstance(t, list): + return next((x for x in t if x and x != "null"), "") + return t or "" + + +def _schema_type(schema: Any) -> str: + if not isinstance(schema, dict): + return "" + return _normalize_type(schema.get("type")) + + +def _last_path_segment(path: str) -> str: + """Extract trailing field name from a JsonPath like ``$.body.goods[*].goodsNo``.""" + if not path or path == "$": + return "" + last = path.rsplit(".", 1)[-1] + if last.endswith("[*]"): + last = last[:-3] + return last + + +# --------------------------------------------------------------------------- +# Operation-level extraction (combines body + parameters) +# --------------------------------------------------------------------------- + + +def extract_produces_for_operation( + operation: dict[str, Any], + *, + is_swagger2: bool = False, +) -> list[FieldLeaf]: + """Walk operation's success response schema → leaf produces with JsonPath.""" + response_schema = _pick_response_schema(operation, is_swagger2=is_swagger2) + if not response_schema: + return [] + return extract_leaves(response_schema, base_path="$") + + +def extract_consumes_for_operation( + operation: dict[str, Any], + path_item: dict[str, Any] | None = None, + *, + is_swagger2: bool = False, + required_only: bool = True, +) -> list[FieldLeaf]: + """Combine query/path/header parameters and request body into a flat + consume list. + + Body fields are flattened to field-name level (the LLM-visible name) — + binding keys by name in Stage 2/3, not by nested path. The original + nested structure for HTTP injection is handled separately via the + existing ``leaf_path_map`` mechanism on the tool row. 
+ """ + leaves: list[FieldLeaf] = [] + seen_names: set[str] = set() + + # query / path / header parameters + all_params = (operation.get("parameters") or []) + ((path_item or {}).get("parameters") or []) + for p in all_params: + if not isinstance(p, dict) or "name" not in p: + continue + loc = p.get("in") + if loc not in ("query", "path", "header"): + continue + is_required = bool(p.get("required", loc == "path")) + if required_only and not is_required: + continue + if is_swagger2: + ftype = p.get("type") or "string" + # Swagger 2.0 — enum lives directly on the parameter object. + enum_vals = p.get("enum") or [] + else: + param_schema = p.get("schema") or {} + ftype = _schema_type(param_schema) or "string" + # OpenAPI 3.x — enum lives under ``schema``. + enum_vals = param_schema.get("enum") or [] if isinstance(param_schema, dict) else [] + if p["name"] in seen_names: + continue + seen_names.add(p["name"]) + leaves.append( + FieldLeaf( + json_path=p["name"], # flat for consumes + field_name=p["name"], + field_type=ftype, + required=is_required, + description=str(p.get("description") or "")[:200], + enum=list(enum_vals), + ) + ) + + # request body (flattened) + body_schema = _pick_request_body_schema(operation, is_swagger2=is_swagger2) + if body_schema: + for leaf in extract_leaves(body_schema, base_path="$"): + if required_only and not leaf.required: + continue + if leaf.field_name in seen_names: + continue + seen_names.add(leaf.field_name) + leaves.append( + FieldLeaf( + json_path=leaf.field_name, # flat for consumes + field_name=leaf.field_name, + field_type=leaf.field_type, + required=leaf.required, + description=leaf.description, + enum=leaf.enum, + ) + ) + + return leaves + + +def _pick_response_schema( + operation: dict[str, Any], + *, + is_swagger2: bool = False, +) -> dict[str, Any] | None: + responses = operation.get("responses") or {} + for code in ("200", "201", "default"): + resp = responses.get(code) + if not isinstance(resp, dict): + continue + # 
Swagger 2.0 + if "schema" in resp: + return resp["schema"] + # OpenAPI 3.x + content = resp.get("content") or {} + if "application/json" in content: + return content["application/json"].get("schema") + return None + + +def _pick_request_body_schema( + operation: dict[str, Any], + *, + is_swagger2: bool = False, +) -> dict[str, Any] | None: + if is_swagger2: + for p in operation.get("parameters") or []: + if isinstance(p, dict) and p.get("in") == "body": + return p.get("schema") + return None + body = operation.get("requestBody") or {} + content = body.get("content") or {} + if "application/json" in content: + return content["application/json"].get("schema") + if content: + first = next(iter(content.values())) + return first.get("schema") if isinstance(first, dict) else None + return None + + +__all__ = [ + "FieldLeaf", + "extract_leaves", + "extract_produces_for_operation", + "extract_consumes_for_operation", +] diff --git a/graph_tool_call/ingest/openapi.py b/graph_tool_call/ingest/openapi.py index 90399dd..8f53173 100644 --- a/graph_tool_call/ingest/openapi.py +++ b/graph_tool_call/ingest/openapi.py @@ -134,6 +134,41 @@ def _schema_type(schema: dict[str, Any]) -> str: return _TYPE_MAP.get(schema.get("type", "string"), "string") +def _pick_content_schema(content: dict[str, Any]) -> dict[str, Any]: + """Pick a usable schema from an OpenAPI ``content`` object. + + OpenAPI 3.x lets a request body / response declare schemas under any + media-type key. The preferred order is: + + 1. ``application/json`` — most common + 2. ``application/*+json`` (e.g. hal+json) — JSON variants + 3. ``*/*`` — Spring/SpringDoc default when + the operation doesn't pin a + specific content type + 4. first available media-type — last resort + + Returning the schema dict (possibly empty). 
The earlier code only + looked at ``application/json`` and silently dropped everything else, + which produced empty ``response_schema`` for every Spring endpoint + that uses the default ``*/*`` (real-world failure: x2bee Order API, + where this caused PathSynthesizer to find zero producers). + """ + if not isinstance(content, dict) or not content: + return {} + if "application/json" in content: + return (content["application/json"] or {}).get("schema") or {} + for ct, val in content.items(): + if isinstance(ct, str) and ct.endswith("+json"): + return (val or {}).get("schema") or {} + if "*/*" in content: + return (content["*/*"] or {}).get("schema") or {} + # Last resort: the first content type with a schema. + for val in content.values(): + if isinstance(val, dict) and val.get("schema"): + return val["schema"] + return {} + + # --------------------------------------------------------------------------- # Operation -> ToolSchema # --------------------------------------------------------------------------- @@ -167,6 +202,11 @@ def _extract_params_swagger2( ) else: is_required = p.get("required", False) + # OpenAPI 3.x / Swagger 2.0: path 파라미터는 본질적으로 required. + # 많은 spec이 명시 안 해도 URL placeholder라 호출 시 반드시 값이 있어야 함. + # synthesizer가 required 안 보고 빈 entity로 plan 생성 → HTTP 호출 실패 케이스 차단. + if location == "path": + is_required = True if required_only and not is_required: continue params.append( @@ -181,48 +221,190 @@ def _extract_params_swagger2( return params +def _summarize_object_schema(schema: dict[str, Any], *, max_depth: int = 2) -> str: + """Object/array schema의 nested properties를 사람/LLM이 읽기 좋게 요약. + + parameter type이 'object'/'array'인데 안의 필드명이 ToolParameter에 안 드러나면 + LLM이 필드명을 추측하게 된다. 이 함수는 properties + required + description을 + description 텍스트로 합쳐서 LLM 컨텍스트에 함께 노출되도록 한다. 
+ """ + if not isinstance(schema, dict): + return "" + + def _walk(s: dict[str, Any], depth: int, indent: int) -> list[str]: + if depth > max_depth or not isinstance(s, dict): + return [] + out: list[str] = [] + prefix = " " * indent + + # Unwrap array → items + if s.get("type") == "array": + items = s.get("items") or {} + out.append(f"{prefix}[array of:]") + out.extend(_walk(items, depth + 1, indent + 1)) + return out + + props = s.get("properties") or {} + if not props: + return out + required = set(s.get("required") or []) + for name, prop in props.items(): + if not isinstance(prop, dict): + continue + ptype = _schema_type(prop) + req = "*" if name in required else "" + desc = (prop.get("description") or "").strip() + example = prop.get("example") + line = f"{prefix}- {name}{req} ({ptype})" + if desc: + line += f": {desc}" + if example is not None and not desc: + line += f" e.g. {example}" + out.append(line) + # Nested object/array 1단계 더 펼치기 + if depth < max_depth: + if ptype == "object": + out.extend(_walk(prop, depth + 1, indent + 1)) + elif ptype == "array": + items = prop.get("items") or {} + if items.get("properties") or items.get("type") in ("object", "array"): + out.extend(_walk(items, depth + 1, indent + 1)) + return out + + lines = _walk(schema, 0, 0) + return "\n".join(lines) + + def _extract_params_openapi3( operation: dict[str, Any], resolved_spec: dict[str, Any], *, required_only: bool = False, ) -> list[ToolParameter]: - """Extract parameters from an OpenAPI 3.x operation.""" + """Extract parameters from an OpenAPI 3.x operation. + + Spring/SpringDoc gotcha: when a controller takes a `@ModelAttribute` + DTO via query string, the spec sometimes lists BOTH the wrapper + object AND its inner fields as separate query parameters + (``regularOrderDetailRequest`` ``in=query`` ``type=object`` AND + ``rglrDeliNo`` ``in=query`` ``type=string``). 
Treating the wrapper + as a real input field poisons downstream producer matching: nothing + in the API ever returns a value named after the wrapper class, so + PathSynthesizer raises ``UnsatisfiableField`` on a phantom field. + + Strategy: drop wrapper parameters when their inner properties are + already exposed as siblings; otherwise expand the wrapper into its + leaf properties so callers see the real input names. + """ params: list[ToolParameter] = [] + raw_parameters = list(operation.get("parameters", [])) + # Pre-collect names from non-object parameters — used to detect when + # a wrapper's inner property is already exposed alongside it. + sibling_names: set[str] = { + str(p.get("name") or "") + for p in raw_parameters + if isinstance(p, dict) and _schema_type(p.get("schema", {}) or {}) not in ("object",) + } + # Path / query / header / cookie parameters - for p in operation.get("parameters", []): + for p in raw_parameters: if "name" not in p: continue # skip malformed parameters (missing required 'name' field) schema = p.get("schema", {}) is_required = p.get("required", False) + # OpenAPI 3.x: path 파라미터는 본질적으로 required (URL placeholder 채우려면 필수). + # 많은 spec이 명시 안 해도 강제로 required 처리해야 synthesizer가 빈 entity를 + # UnsatisfiableFieldError로 raise → question.required popup으로 사용자에게 묻는다. + if p.get("in") == "path": + is_required = True + ptype = _schema_type(schema) + + # Wrapper-object/array query parameter handling. + # type=object → wrapper itself (Spring @ModelAttribute style). + # type=array of objects → wrapper used to send a list of structured + # records (less common but seen in some Spring specs); we expand the + # element schema's properties. Primitive arrays (array of integers / + # strings) are real list inputs and are NOT expanded here — those + # belong to the caller as a single multi-value field. 
+ if ptype in ("object", "array") and p.get("in") == "query": + wrapper_props: dict[str, Any] = {} + wrapper_required: set[str] = set() + if ptype == "object": + wrapper_props = (schema.get("properties") or {}) if isinstance(schema, dict) else {} + wrapper_required = set(schema.get("required") or []) + else: # array + items = (schema.get("items") or {}) if isinstance(schema, dict) else {} + if isinstance(items, dict) and items.get("type") == "object": + wrapper_props = items.get("properties") or {} + wrapper_required = set(items.get("required") or []) + # else: primitive-element array — don't expand, treat as real input + if wrapper_props: + # If every inner property is already a sibling parameter, + # drop the wrapper entirely (deduplication). + if all(prop in sibling_names for prop in wrapper_props): + continue + # Otherwise expand the wrapper into individual leaves so + # producer matching has real field names to chase. + for prop_name, prop_schema in wrapper_props.items(): + if prop_name in sibling_names: + continue # don't double-list ones already exposed + inner_required = prop_name in wrapper_required + if required_only and not inner_required: + continue + inner_type = _schema_type(prop_schema or {}) + inner_desc = (prop_schema or {}).get("description", "") or "" + params.append( + ToolParameter( + name=prop_name, + type=inner_type, + description=inner_desc, + required=inner_required, + enum=(prop_schema or {}).get("enum"), + ) + ) + continue # wrapper itself is not added + if required_only and not is_required: continue + desc = p.get("description", "") or "" + # object/array 타입이면 nested fields를 description에 펼쳐서 + # LLM이 정확한 필드명(예: searchWord)을 알 수 있게 한다. 
+ if ptype in ("object", "array"): + nested = _summarize_object_schema(schema) + if nested: + desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}" params.append( ToolParameter( name=p["name"], - type=_schema_type(schema), - description=p.get("description", ""), + type=ptype, + description=desc, required=is_required, enum=schema.get("enum"), ) ) - # requestBody + # requestBody — pick the most specific schema across declared media types + # (Spring/SpringDoc commonly emits */* — see _pick_content_schema notes). request_body = operation.get("requestBody", {}) content = request_body.get("content", {}) - json_content = content.get("application/json", {}) - body_schema = json_content.get("schema", {}) + body_schema = _pick_content_schema(content) body_required = set(body_schema.get("required", [])) for prop_name, prop_schema in body_schema.get("properties", {}).items(): is_required = prop_name in body_required if required_only and not is_required: continue + desc = prop_schema.get("description") or "" + # nested object/array는 한 단계 더 펼치기 + if _schema_type(prop_schema) in ("object", "array"): + nested = _summarize_object_schema(prop_schema) + if nested: + desc = (desc + "\nFields:\n" + nested).strip() if desc else f"Fields:\n{nested}" params.append( ToolParameter( name=prop_name, type=_schema_type(prop_schema), - description=prop_schema.get("description", ""), + description=desc, required=is_required, ) ) @@ -304,6 +486,34 @@ def _enrich_description(description: str, method: str, path: str) -> str: return description +def _resolve_server_url( + operation: dict[str, Any], + path_item: dict[str, Any] | None, + spec: dict[str, Any], + *, + is_swagger2: bool = False, +) -> str | None: + """OpenAPI 우선순위: operation.servers > path.servers > spec.servers. + + Swagger 2.0은 ``host`` + ``basePath`` + ``schemes`` 조합으로 base_url 구성. 
+ """ + if is_swagger2: + host = spec.get("host") + if not host: + return None + scheme = (spec.get("schemes") or ["https"])[0] + base_path = spec.get("basePath") or "" + return f"{scheme}://{host}{base_path}".rstrip("/") + + for source in (operation, path_item or {}, spec): + servers = source.get("servers") if isinstance(source, dict) else None + if servers and isinstance(servers, list) and servers: + url = (servers[0] or {}).get("url") + if url: + return str(url).rstrip("/") + return None + + def _operation_to_tool( operation_id: str, operation: dict[str, Any], @@ -313,6 +523,7 @@ def _operation_to_tool( *, is_swagger2: bool = False, required_only: bool = False, + path_item: dict[str, Any] | None = None, ) -> ToolSchema: """Convert a single OpenAPI operation into a ToolSchema.""" description = operation.get("summary") or operation.get("description", "") @@ -333,21 +544,24 @@ def _operation_to_tool( else: parameters = _extract_params_openapi3(operation, resolved_spec, required_only=required_only) - # Build response schema metadata + # Build response schema metadata. Walk responses in success-code order + # and use _pick_content_schema so we don't drop schemas declared under + # */*, application/*+json, or other non-JSON media types. responses = operation.get("responses", {}) response_schema: dict[str, Any] = {} for code in ("200", "201", "default"): - if code in responses: - resp = responses[code] - # Swagger 2.0 - if "schema" in resp: - response_schema = resp["schema"] - break - # OpenAPI 3.x - resp_content = resp.get("content", {}) - if "application/json" in resp_content: - response_schema = resp_content["application/json"].get("schema", {}) - break + if code not in responses: + continue + resp = responses[code] or {} + # Swagger 2.0 puts the schema directly on the response object. + if "schema" in resp and isinstance(resp.get("schema"), dict): + response_schema = resp["schema"] + break + # OpenAPI 3.x: inspect the content map. 
+ picked = _pick_content_schema(resp.get("content") or {}) + if picked: + response_schema = picked + break metadata: dict[str, Any] = { "source": "openapi", @@ -357,6 +571,13 @@ def _operation_to_tool( if response_schema: metadata["response_schema"] = response_schema + # spec/path/operation 단위의 servers field → tool 자체 base_url 부여. + # 한 컬렉션에 다른 host를 가진 source들이 섞여 있을 때 executor가 tool마다 + # 알맞은 base_url로 호출할 수 있게 한다. + server_url = _resolve_server_url(operation, path_item, resolved_spec, is_swagger2=is_swagger2) + if server_url: + metadata["base_url"] = server_url + return ToolSchema( name=operation_id, description=description, @@ -459,6 +680,7 @@ def ingest_openapi( resolved_raw, is_swagger2=is_swagger2, required_only=required_only, + path_item=path_item, ) tools.append(tool) diff --git a/graph_tool_call/langchain/gateway.py b/graph_tool_call/langchain/gateway.py index cfde75e..a570589 100644 --- a/graph_tool_call/langchain/gateway.py +++ b/graph_tool_call/langchain/gateway.py @@ -66,6 +66,89 @@ def _extract_parameters_info(tool: Any) -> list[dict[str, Any]] | None: return None +def _summarize_response_schema(schema: dict[str, Any]) -> str | None: + """Produce a one-line summary of an OpenAPI response schema for the LLM. + + Lists top-level field names + types so the model can plan parameter + extraction for the next call. 
+ """ + if not isinstance(schema, dict): + return None + + # Unwrap arrays + container = schema + is_array = False + if container.get("type") == "array" and isinstance(container.get("items"), dict): + container = container["items"] + is_array = True + + props = container.get("properties") + if not isinstance(props, dict) or not props: + # Fall back to a bare type description + t = container.get("type") + return f"array of {t}" if is_array and t else t + + fields = [] + for name, info in list(props.items())[:12]: + if not isinstance(info, dict): + fields.append(name) + continue + t = info.get("type") or info.get("$ref", "object").rsplit("/", 1)[-1] + fields.append(f"{name}:{t}") + summary = "{" + ", ".join(fields) + "}" + return f"array of {summary}" if is_array else summary + + +def _enrich_from_graph(name: str, graph: Any | None) -> dict[str, Any]: + """Pull source_label, method/path, response summary, and outgoing edges + from the underlying ToolGraph for *name*. Returns an empty dict if the + graph or tool is not available — callers should treat all keys as optional. 
+ """ + if graph is None: + return {} + + enrichment: dict[str, Any] = {} + + tool_schema = None + try: + tool_schema = graph.tools.get(name) + except Exception: + return enrichment + + if tool_schema is not None and getattr(tool_schema, "metadata", None): + meta = tool_schema.metadata + if meta.get("source_label"): + enrichment["source"] = meta["source_label"] + if meta.get("method") and meta.get("path"): + enrichment["http"] = f"{meta['method'].upper()} {meta['path']}" + rs = meta.get("response_schema") + if isinstance(rs, dict): + summary = _summarize_response_schema(rs) + if summary: + enrichment["returns"] = summary + + # Outgoing edges → chain hints + try: + engine = graph.graph + edges = engine.get_edges_from(name, direction="out") + chains: list[str] = [] + for _src, target, attrs in edges: + relation = attrs.get("relation") + relation_name = relation.value if hasattr(relation, "value") else str(relation) + # Skip purely structural BELONGS_TO edges + if relation_name in ("belongs_to", "BELONGS_TO"): + continue + chains.append(f"{relation_name}→{target}") + if len(chains) >= 5: + break + if chains: + enrichment["next_candidates"] = chains + except Exception: + pass + + return enrichment + + def create_gateway_tools( tools: list[Any], *, @@ -111,12 +194,15 @@ def create_gateway_tools( total = len(tool_map) call_history: list[str] = [] + underlying_graph = getattr(toolkit, "graph", None) + @langchain_tool def search_tools(query: str, top_k: int | None = None) -> str: """Search available tools by natural language query. Use this FIRST to find which tools are available for the task. - Returns tool names, descriptions, and required parameters. + Returns tool names, descriptions, parameters, response shape, and + ``next_candidates`` (related tools you may want to call afterwards). Args: query: Natural language search query (e.g. 
"cancel order", "send email") @@ -135,11 +221,12 @@ def search_tools(query: str, top_k: int | None = None) -> str: desc = t.get("description", "") entry: dict[str, Any] = { "name": name, - "description": desc[:200], + "description": desc[:300], } params = _extract_parameters_info(t) if params: entry["parameters"] = params + entry.update(_enrich_from_graph(name, underlying_graph)) matched.append(entry) output = { @@ -148,8 +235,10 @@ def search_tools(query: str, top_k: int | None = None) -> str: "total_tools": total, "tools": matched, "hint": ( - "Use call_tool to execute a tool. " - "Pass tool_name and arguments as a dict matching the parameters above." + "Use call_tool to execute a tool. Pass tool_name and arguments " + "as a dict matching the parameters above. The 'returns' field " + "shows the response shape — extract values from there to build " + "arguments for the next call (see 'next_candidates')." ), } diff --git a/graph_tool_call/net.py b/graph_tool_call/net.py index dfe1c35..466ae30 100644 --- a/graph_tool_call/net.py +++ b/graph_tool_call/net.py @@ -44,8 +44,23 @@ def redirect_request( return super().redirect_request(req, fp, code, msg, headers, newurl) -def _open_url(request: urllib.request.Request | str, *, timeout: int, max_redirects: int) -> Any: - opener = urllib.request.build_opener(_LimitedRedirectHandler(max_redirects)) +def _open_url( + request: urllib.request.Request | str, + *, + timeout: int, + max_redirects: int, + verify_ssl: bool = True, +) -> Any: + """urllib opener — verify_ssl=False 시 self-signed/사내 CA 인증서 허용.""" + handlers: list[Any] = [_LimitedRedirectHandler(max_redirects)] + if not verify_ssl: + import ssl + + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + handlers.append(urllib.request.HTTPSHandler(context=ctx)) + opener = urllib.request.build_opener(*handlers) return opener.open(request, timeout=timeout) @@ -128,13 +143,27 @@ def fetch_url_text( allowed_content_types: tuple[str, 
...] = _DEFAULT_ALLOWED_CONTENT_TYPES, allow_private_hosts: bool = False, max_redirects: int = _DEFAULT_MAX_REDIRECTS, + verify_ssl: bool | None = None, ) -> str: - """Fetch UTF-8 text from a remote URL with basic SSRF protections.""" + """Fetch UTF-8 text from a remote URL with basic SSRF protections. + + ``verify_ssl`` — None 이면 ``allow_private_hosts`` 값에 따라 자동 결정 + (사내망 hosts 는 self-signed CA 가 일반적이므로 verify off 가 기본). + """ validate_remote_url(url, allow_private_hosts=allow_private_hosts) + if verify_ssl is None: + # allow_private_hosts=True 사용자는 보통 사내망 hitting. 사내 CA 포용. + verify_ssl = not allow_private_hosts + req = urllib.request.Request(url, headers=headers or {}) try: - with _open_url(req, timeout=timeout, max_redirects=max_redirects) as resp: + with _open_url( + req, + timeout=timeout, + max_redirects=max_redirects, + verify_ssl=verify_ssl, + ) as resp: final_url = url if hasattr(resp, "geturl"): candidate = resp.geturl() diff --git a/graph_tool_call/ontology/builder.py b/graph_tool_call/ontology/builder.py index f6fb1a7..517d730 100644 --- a/graph_tool_call/ontology/builder.py +++ b/graph_tool_call/ontology/builder.py @@ -5,7 +5,7 @@ from graph_tool_call.core.dict_graph import DictGraph from graph_tool_call.core.protocol import GraphEngine from graph_tool_call.core.tool import ToolSchema -from graph_tool_call.ontology.schema import NodeType, RelationType +from graph_tool_call.ontology.schema import Confidence, NodeType, RelationType class OntologyBuilder: @@ -64,11 +64,36 @@ def add_relation( target: str, relation: str | RelationType, weight: float = 1.0, + *, + confidence: str | Confidence | None = None, + conf_score: float | None = None, + layer: int | None = None, + evidence: str | None = None, ) -> None: - """Add a directed relation between two nodes.""" + """Add a directed relation between two nodes. 
+ + Optional graphify-style attrs (all default None — existing callers + unaffected): + + confidence: Confidence label (EXTRACTED / INFERRED / AMBIGUOUS). + conf_score: Raw 0.0–1.0 score from the upstream detector. + layer: 1=structural (path/CRUD/$ref), 2=heuristic (name/RPC). + evidence: Human-readable reason; capped at 200 chars to avoid bloat. + """ if isinstance(relation, str): relation = RelationType(relation) - self._graph.add_edge(source, target, relation=relation, weight=weight) + if isinstance(confidence, Confidence): + confidence = confidence.value + attrs: dict = {"relation": relation, "weight": weight} + if confidence is not None: + attrs["confidence"] = confidence + if conf_score is not None: + attrs["conf_score"] = float(conf_score) + if layer is not None: + attrs["layer"] = int(layer) + if evidence: + attrs["evidence"] = evidence[:200] + self._graph.add_edge(source, target, **attrs) # --- queries --- diff --git a/graph_tool_call/ontology/llm_provider.py b/graph_tool_call/ontology/llm_provider.py index 7897554..8d19923 100644 --- a/graph_tool_call/ontology/llm_provider.py +++ b/graph_tool_call/ontology/llm_provider.py @@ -5,7 +5,7 @@ import json import urllib.request from abc import ABC, abstractmethod -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any from graph_tool_call.ontology.schema import RelationType @@ -13,11 +13,20 @@ @dataclass class ToolSummary: - """Lightweight tool representation for LLM prompts.""" + """Lightweight tool representation for LLM prompts. + + The optional fields (``method``, ``path``, ``response_fields``) extend the + summary for semantic enrichment (``enrich_tool_semantics``). They are + ignored by methods that don't need them, preserving backward compat. 
+ """ name: str description: str parameters: list[str] # just parameter names + # Extended context for semantic enrichment (optional) + method: str = "" + path: str = "" + response_fields: list[str] = field(default_factory=list) @dataclass @@ -31,6 +40,76 @@ class InferredRelation: reason: str +@dataclass +class FieldSemantic: + """A field annotated with its semantic identifier. + + Used on both produces (what a tool outputs) and consumes (what it + requires). ``json_path`` is set on produces; ``field`` is set on consumes. + + ``kind`` (consumes only) distinguishes two roles: + - ``"data"`` — true data dependency (e.g. a business identifier + needed to address the operation). PathSynthesizer + will chain to a producer for this field. + - ``"context"`` — ambient config (locale, site, pagination). Must be + supplied as an entity or collection default; the + synthesizer will NOT build a prerequisite chain + just to fetch it. + + The default ``"data"`` matches pre-kind behavior (safe for tools whose + enrichment predates this schema change). + """ + + semantic: str + json_path: str = "" + field: str = "" + kind: str = "data" + + +@dataclass +class PairHint: + """A tool that pairs with the current tool in a workflow. + + ``source`` distinguishes ownership so re-running auto enrichment doesn't + overwrite operator curation: + - ``"auto"`` — produced by Pass 2a (per-tool batch) or Pass 2b + (cross-batch). Replaced on every Pass 2b re-run. + - ``"manual"`` — added by an operator through the UI. Never overwritten + by automatic enrichment. + + Default ``"manual"`` is intentional: legacy data without a ``source`` + field gets the safer label, so a Pass 2b re-run does not silently delete + pre-existing entries that may have been hand-curated. + """ + + tool: str + reason: str = "" + source: str = "manual" + + +@dataclass +class ToolEnrichment: + """Per-tool semantic annotation produced by ``enrich_tool_semantics``. 
+ + This is the Pass 2 output of the Plan-and-Execute L0 knowledge base. + Used downstream by: + - Stage 1 target selection (``when_to_use`` in catalog) + - Stage 2 path synthesis (``produces_semantics`` / ``consumes_semantics`` + replace hardcoded synonym tables) + - Graph edges (``pairs_well_with`` becomes semantic edges) + """ + + # canonical_action: search | read | create | update | delete | action + canonical_action: str + primary_resource: str # e.g. "product" + one_line_summary: str + when_to_use: str + when_not_to_use: str = "" + produces_semantics: list[FieldSemantic] = field(default_factory=list) + consumes_semantics: list[FieldSemantic] = field(default_factory=list) + pairs_well_with: list[PairHint] = field(default_factory=list) + + # --------------------------------------------------------------------------- # Prompt templates # --------------------------------------------------------------------------- @@ -124,6 +203,103 @@ class InferredRelation: [{{"source":"toolA","target":"toolB","relation":"PRECEDES","confidence":0.9,"reason":"..."}}]""" +_ENRICH_SEMANTICS_PROMPT = """\ +You are annotating API tools for a plan-and-execute planning system. +Produce structured metadata that downstream components use to (1) pick the +right tool for a user's goal, (2) synthesize execution plans, and (3) wire +one tool's output to another tool's input. +{reference_block}{vocab_block} +TOOLS TO ANNOTATE (this batch): +{batch_detailed} + +For each tool in the batch, output a JSON object with these fields: + - canonical_action: one of "search" | "read" | "create" | "update" | "delete" | "action" + - primary_resource: one lowercase noun (e.g. 
"product", "order", "user", "shop", "category") + - one_line_summary: short natural-language summary (<=60 chars) + - when_to_use: 1-2 sentences describing the trigger condition + - when_not_to_use: optional 1 sentence (can be empty) — alternative tool cases + - produces_semantics: array of {{"semantic": "canonical_id", "json_path": "$.body..."}} + * Include only MEANINGFUL fields (IDs, names, key metrics). + * Skip pagination, headers, status codes. + * Use CONSISTENT semantic ids across tools. If two tools both return a + product identifier (one calls it "goodsNo", another "productId"), + use the same semantic like "product_id". + - consumes_semantics: array of {{"semantic": "canonical_id", + "field": "paramName", + "kind": "data" | "context"}} + * REQUIRED inputs only. Skip optional filters, pagination. + * Same semantic id conventions as produces. + * kind="data" — business-data dependency: an identifier or value that + addresses a specific record (e.g. product_id, order_id, user_id, + search_keyword). A prior step in a plan normally produces it. + * kind="context" — ambient/environmental config shared across the + workflow (locale, site_no, tenant, pagination cursors, flag switches). + The user or the caller supplies it as a default — it is NOT produced + by a prior step. Use this for anything a plain UI user would set + once per session, not per request. + - pairs_well_with: array of {{"tool": "tool_name_from_available_list", + "reason": "brief reason"}} + * 2-4 tools that typically precede or follow this tool. + * Names MUST match the available list exactly. Do not invent. + +OUTPUT FORMAT (strict): +{{ + "tool_name_1": {{...fields...}}, + "tool_name_2": {{...fields...}} +}} + +STRICT RULES: + - You MUST produce one entry for EVERY tool in the batch. + - Do NOT skip tools with unclear descriptions — make your best guess. + - Keep fields concise (short sentences) so all tools fit in the output. + - Return JSON only. 
No markdown fences, no prose, no comments.""" + + +# Pass 2b — cross-batch workflow pairing. +# +# Per-tool enrichment (Pass 2a) only sees one batch at a time, so it cannot +# spot pairs whose other half lives in a different batch. This prompt shows +# the entire collection's 1-line summaries so the LLM can suggest workflow +# successors that span resources. +# +# The output is batched (subset of tools per call) to stay within the +# response token budget — input stays full, output stays small. +_PAIRS_PROMPT = """\ +You are reviewing an API tool collection to suggest workflow pairs. + +For EACH tool in the OUTPUT BATCH, suggest 2-4 OTHER tools from the FULL +TOOL LIST that are commonly invoked just before or just after this tool in +a real-world workflow. Pairs SHOULD cross resource boundaries when there is +a natural business sequence (e.g. product detail → add to cart → checkout). + +Pair quality matters more than quantity — only suggest tools you are +confident about. If a tool has no good pair candidates, return an empty +array for it. + +FULL TOOL LIST (all available tools — pick pairs only from this list): +{full_list} + +OUTPUT BATCH (suggest pairs ONLY for these tools): +{batch_list} + +OUTPUT FORMAT (strict JSON): +{{ + "tool_name_1": [ + {{"tool": "other_tool_name", "reason": "short reason"}}, + ... + ], + "tool_name_2": [...], + ... +}} + +STRICT RULES: + - You MUST include one entry for EVERY tool in the OUTPUT BATCH (use + empty array if no good pairs). + - Pair tool names MUST exactly match a name in the FULL TOOL LIST. + - Do NOT pair a tool with itself. + - Return JSON only. No markdown fences, no prose, no comments.""" + + def _format_tools_list(tools: list[ToolSummary]) -> str: lines = [] for i, t in enumerate(tools, 1): @@ -132,6 +308,107 @@ def _format_tools_list(tools: list[ToolSummary]) -> str: return "\n".join(lines) +def _format_tools_brief(tools: list[ToolSummary]) -> str: + """Compact name list for the ``pairs_well_with`` reference. 
+ + Name-only (no descriptions) to keep prompt small — descriptions would + bloat the prompt by N× since every batch prompt contains this list. + Tool names like ``seltSearchProduct`` already encode intent. + """ + return "\n".join(f"- {t.name}" for t in tools) + + +def _format_tools_for_pairs(tools: list[ToolSummary]) -> str: + """Compact ``name: 1-line summary`` block for Pass 2b prompts. + + Uses ``description`` (mapped from ai_metadata.one_line_summary by the + caller for tools that have been Pass 2a annotated) so the LLM can pair + based on workflow meaning, not just tool names. + """ + lines = [] + for t in tools: + summary = (t.description or "").strip().replace("\n", " ") + if len(summary) > 100: + summary = summary[:97] + "..." + lines.append(f"- {t.name}: {summary}" if summary else f"- {t.name}") + return "\n".join(lines) + + +def _format_tools_for_enrichment(tools: list[ToolSummary]) -> str: + """Detailed per-tool block for enrichment prompt input.""" + blocks = [] + for t in tools: + parts = [f"== {t.name} =="] + if t.method and t.path: + parts.append(f"HTTP: {t.method.upper()} {t.path}") + if t.description: + desc = t.description.strip()[:400] + parts.append(f"Description: {desc}") + if t.parameters: + params = ", ".join(t.parameters[:25]) + parts.append(f"Request fields: {params}") + if t.response_fields: + resp = ", ".join(t.response_fields[:25]) + parts.append(f"Response fields: {resp}") + blocks.append("\n".join(parts)) + return "\n\n".join(blocks) + + +def _parse_enrichment(data: Any) -> ToolEnrichment | None: + """Build a ToolEnrichment from LLM JSON output. 
Tolerant of missing keys.""" + if not isinstance(data, dict): + return None + try: + produces = [ + FieldSemantic( + semantic=str(p.get("semantic", "")).strip(), + json_path=str(p.get("json_path", "")).strip(), + ) + for p in (data.get("produces_semantics") or []) + if isinstance(p, dict) and str(p.get("semantic", "")).strip() + ] + consumes = [] + for c in data.get("consumes_semantics") or []: + if not (isinstance(c, dict) and str(c.get("semantic", "")).strip()): + continue + raw_kind = str(c.get("kind", "data")).strip().lower() + kind = raw_kind if raw_kind in ("data", "context") else "data" + consumes.append( + FieldSemantic( + semantic=str(c.get("semantic", "")).strip(), + field=str(c.get("field", "")).strip(), + kind=kind, + ) + ) + # Pairs from per-tool enrichment are batch-scoped (LLM only sees the + # current batch), so quality is lower than cross-batch Pass 2b. + # Marked source="auto" so a Pass 2b run can replace them while + # preserving operator-curated source="manual" entries. 
+ pairs = [ + PairHint( + tool=str(p.get("tool", "")).strip(), + reason=str(p.get("reason", "")).strip(), + source="auto", + ) + for p in (data.get("pairs_well_with") or []) + if isinstance(p, dict) and str(p.get("tool", "")).strip() + ] + action = str(data.get("canonical_action", "")).strip().lower() + resource = str(data.get("primary_resource", "")).strip().lower() + return ToolEnrichment( + canonical_action=action, + primary_resource=resource, + one_line_summary=str(data.get("one_line_summary", "")).strip(), + when_to_use=str(data.get("when_to_use", "")).strip(), + when_not_to_use=str(data.get("when_not_to_use", "")).strip(), + produces_semantics=produces, + consumes_semantics=consumes, + pairs_well_with=pairs, + ) + except (KeyError, TypeError, ValueError, AttributeError): + return None + + def _parse_relation_type(s: str) -> RelationType | None: mapping = { "REQUIRES": RelationType.REQUIRES, @@ -424,6 +701,157 @@ def generate_example_queries( return all_queries + def enrich_pairs( + self, + tools: list[ToolSummary], + batch_size: int = 30, + ) -> dict[str, list[PairHint]]: + """Pass 2b — cross-batch workflow pair suggestion. + + Unlike Pass 2a (``enrich_tool_semantics``) which sees only the + current batch, this pass shows the LLM the full collection's 1-line + summaries so it can suggest pairs that cross resource boundaries + (e.g. ``getProductDetail → addToCart`` even when the two tools live + in different swagger sources). + + Output is batched only on the OUTPUT axis: input list stays full + for every call, output covers ``batch_size`` tools per call. This + keeps the prompt short and avoids the 8k-token output limit + truncating long pair lists. + + Tools should arrive with ``description`` set to ai_metadata + ``one_line_summary`` when available (Pass 2a output) so pairing can + rely on workflow meaning, not just tool names. 
+ + Returns: {tool_name: [PairHint(source="auto"), ...]} + """ + results: dict[str, list[PairHint]] = {} + if not tools: + return results + + full_list = _format_tools_for_pairs(tools) + + for i in range(0, len(tools), batch_size): + batch = tools[i : i + batch_size] + batch_list = _format_tools_for_pairs(batch) + prompt = _PAIRS_PROMPT.format(full_list=full_list, batch_list=batch_list) + response = self.generate(prompt) + + try: + parsed = _extract_json(response) + if not isinstance(parsed, dict): + continue + for name, raw_pairs in parsed.items(): + if not isinstance(raw_pairs, list): + continue + pair_list: list[PairHint] = [] + for p in raw_pairs: + if not isinstance(p, dict): + continue + target = str(p.get("tool", "")).strip() + if not target or target == name: + continue + pair_list.append( + PairHint( + tool=target, + reason=str(p.get("reason", "")).strip(), + source="auto", + ) + ) + results[str(name)] = pair_list + except (json.JSONDecodeError, KeyError, TypeError): + continue + + return results + + def enrich_tool_semantics( + self, + tools: list[ToolSummary], + batch_size: int = 10, + *, + reference_tools: list[ToolSummary] | None = None, + existing_vocab: list[str] | None = None, + valid_tool_names: set[str] | None = None, + ) -> dict[str, ToolEnrichment]: + """Per-tool semantic annotation for Plan-and-Execute architecture. + + ``tools`` = the batch(es) to produce detailed enrichment for. + + ``reference_tools`` (optional, default ``None``) — when supplied, + rendered as a brief tool list in the prompt so the LLM can pick + ``pairs_well_with`` from valid names. **Streaming callers should + usually pass ``None``** — Pass 2b handles pairs in a separate + cross-batch call, and skipping the reference block saves ~50% + prompt tokens. The pair list emitted in this pass is post-validated + against ``valid_tool_names`` instead. + + ``existing_vocab`` (optional) — accumulated semantic ids decided in + previous batches of the same enrichment run. 
The LLM is asked to + reuse these labels when applicable, which keeps cross-batch vocab + consistent (avoids ``product_id`` vs ``productId`` divergence). + Streaming callers should pass the unique semantics seen so far. + + ``valid_tool_names`` (optional) — full set of tool names in the + collection. When supplied, ``pairs_well_with`` entries pointing to + tools outside this set are dropped silently (LLM hallucination + guard). When ``reference_tools`` is None the LLM only knows the + names in the current batch; without this guard it would invent + names for cross-batch pairs. + """ + results: dict[str, ToolEnrichment] = {} + if not tools: + return results + + ref_block = "" + if reference_tools: + ref_block = ( + "\nAVAILABLE TOOLS IN THE COLLECTION (names + 1-line " + "descriptions, for pairs_well_with reference):\n" + + _format_tools_brief(reference_tools) + + "\n" + ) + + vocab_block = "" + if existing_vocab: + vocab_block = ( + "\nEXISTING SEMANTIC VOCABULARY (reuse these canonical ids " + "when the field has the same meaning — keeps cross-batch " + "labels consistent):\n" + + "\n".join(f"- {s}" for s in sorted(set(existing_vocab))) + + "\n" + ) + + for i in range(0, len(tools), batch_size): + batch = tools[i : i + batch_size] + prompt = _ENRICH_SEMANTICS_PROMPT.format( + reference_block=ref_block, + vocab_block=vocab_block, + batch_detailed=_format_tools_for_enrichment(batch), + ) + response = self.generate(prompt) + + try: + parsed = _extract_json(response) + if not isinstance(parsed, dict): + continue + for name, data in parsed.items(): + enrichment = _parse_enrichment(data) + if enrichment is None or not enrichment.canonical_action: + continue + # Hallucination guard for pairs_well_with — drop entries + # whose target name is not in the catalog. 
+ if valid_tool_names is not None: + enrichment.pairs_well_with = [ + p + for p in enrichment.pairs_well_with + if p.tool in valid_tool_names and p.tool != str(name) + ] + results[str(name)] = enrichment + except (json.JSONDecodeError, KeyError, TypeError): + continue + + return results + # --------------------------------------------------------------------------- # Ollama Provider @@ -476,18 +904,25 @@ def __init__( model: str = "gpt-4o-mini", base_url: str = "https://api.openai.com/v1", api_key: str = "", + max_tokens: int = 8192, + timeout: int = 300, ) -> None: self.model = model self.base_url = base_url.rstrip("/") self.api_key = api_key + self.max_tokens = max_tokens + self.timeout = timeout def generate(self, prompt: str) -> str: url = f"{self.base_url}/chat/completions" + # max_tokens 를 명시 지정하지 않으면 provider 기본값 (일부 모델은 4096) + # 으로 잘려서 batch enrichment JSON 이 중간에 truncate → 일부 tool 누락. payload = json.dumps( { "model": self.model, "messages": [{"role": "user", "content": prompt}], "temperature": 0.1, + "max_tokens": self.max_tokens, } ).encode() @@ -496,7 +931,7 @@ def generate(self, prompt: str) -> str: headers["Authorization"] = f"Bearer {self.api_key}" req = urllib.request.Request(url, data=payload, headers=headers, method="POST") - with urllib.request.urlopen(req, timeout=120) as resp: # noqa: S310 + with urllib.request.urlopen(req, timeout=self.timeout) as resp: # noqa: S310 result = json.loads(resp.read().decode()) choices = result.get("choices", []) if choices: diff --git a/graph_tool_call/ontology/schema.py b/graph_tool_call/ontology/schema.py index 04086fb..2a67290 100644 --- a/graph_tool_call/ontology/schema.py +++ b/graph_tool_call/ontology/schema.py @@ -24,6 +24,26 @@ class NodeType(str, Enum): DOMAIN = "domain" +class Confidence(str, Enum): + """Edge confidence label, graphify-style. 
+ + Every edge in a graphify-style ToolGraph carries one of three labels so + downstream consumers (LLM agents, retrieval scoring, UI) can distinguish + deterministic facts from heuristic guesses. + + EXTRACTED — derived deterministically from the spec (path hierarchy, + shared $ref, CRUD pattern). conf_score >= 0.85 AND layer == 1. + INFERRED — heuristic match (name-based, RPC pattern, cross-resource). + conf_score >= 0.85 but not strictly structural. + AMBIGUOUS — low-confidence heuristic (0.70 <= conf_score < 0.85). + Surface in UI for review; retrieval applies a score penalty. + """ + + EXTRACTED = "EXTRACTED" + INFERRED = "INFERRED" + AMBIGUOUS = "AMBIGUOUS" + + # Weights for relation types during retrieval scoring DEFAULT_RELATION_WEIGHTS: dict[str, float] = { RelationType.SIMILAR_TO: 0.8, diff --git a/graph_tool_call/plan/__init__.py b/graph_tool_call/plan/__init__.py new file mode 100644 index 0000000..dbab1f3 --- /dev/null +++ b/graph_tool_call/plan/__init__.py @@ -0,0 +1,95 @@ +"""Plan-and-Execute primitives: schemas, binding resolver, runner. + +The ``plan`` package is deliberately transport-agnostic. It knows nothing +about HTTP, authentication, or xgen internals — it only defines how a +Plan looks, how string bindings are resolved against step outputs, and how +to drive execution via an injected callable. + +Typical use (from an integration layer like xgen-workflow): + + from graph_tool_call.plan import Plan, PlanStep, PlanRunner + + plan = Plan(id="...", goal="...", steps=[PlanStep(...), ...]) + + def call_tool(tool_name, args): + return my_http_executor.execute(tool_name, args) + + runner = PlanRunner(call_tool) + for event in runner.run(plan): + # event: StepStarted | StepCompleted | StepFailed | PlanCompleted + ... 
+""" + +from graph_tool_call.plan.binding import ( + BindingError, + resolve_bindings, +) +from graph_tool_call.plan.intent import ( + IntentParseError, + ParsedIntent, + ToolCatalogEntry, + parse_intent, +) +from graph_tool_call.plan.response import ( + synthesize_failure_response, + synthesize_success_response, +) +from graph_tool_call.plan.runner import ( + PlanAborted, + PlanCompleted, + PlanEvent, + PlanRunner, + PlanStarted, + StepCompleted, + StepFailed, + StepStarted, +) +from graph_tool_call.plan.schema import ( + ExecutionTrace, + Plan, + PlanStep, + StepTrace, +) +from graph_tool_call.plan.synthesizer import ( + CyclicDependencyError, + DynamicOptionRequired, + MaxDepthExceededError, + PathSynthesizer, + PlanSynthesisError, + UnsatisfiableFieldError, +) + +__all__ = [ + # schema + "Plan", + "PlanStep", + "ExecutionTrace", + "StepTrace", + # binding + "BindingError", + "resolve_bindings", + # runner + events + "PlanRunner", + "PlanEvent", + "PlanStarted", + "StepStarted", + "StepCompleted", + "StepFailed", + "PlanCompleted", + "PlanAborted", + # synthesizer + "PathSynthesizer", + "PlanSynthesisError", + "UnsatisfiableFieldError", + "CyclicDependencyError", + "MaxDepthExceededError", + "DynamicOptionRequired", + # intent + "ToolCatalogEntry", + "ParsedIntent", + "IntentParseError", + "parse_intent", + # response + "synthesize_success_response", + "synthesize_failure_response", +] diff --git a/graph_tool_call/plan/binding.py b/graph_tool_call/plan/binding.py new file mode 100644 index 0000000..2ae6a50 --- /dev/null +++ b/graph_tool_call/plan/binding.py @@ -0,0 +1,161 @@ +"""Binding resolver for Plan args. + +Substitutes ``${source.dotted.path}`` placeholders in step arguments with +actual values drawn from the runtime context. The context is a dict mapping +source names (``"s1"``, ``"s2"``, ``"input"``, ...) to arbitrary JSON-like +objects. 
class BindingError(ValueError):
    """Raised when a ``${...}`` expression cannot be resolved."""


# Matches one ``${...}`` placeholder. Accepts empty body so ``${}`` triggers
# a clear BindingError downstream instead of passing through as a literal.
# ``{`` and ``}`` inside a binding are not supported in v1.
_BINDING_RE = re.compile(r"\$\{([^${}]*)\}")

# Maximum number of keys listed in a "key not found" error message.
_MAX_KEYS_IN_ERROR = 8


def resolve_bindings(value: Any, context: dict[str, Any]) -> Any:
    """Recursively resolve ``${...}`` bindings in *value* against *context*.

    Args:
        value: Any JSON-like value. Dicts and lists are walked (dict keys are
            left untouched), strings are interpolated, every other value is
            returned unchanged.
        context: Mapping of source name (step id, ``"input"``, ...) to the
            object bindings are resolved against.

    Returns:
        A new structure with every binding substituted.

    Raises:
        BindingError: If any binding expression cannot be resolved.
    """
    if isinstance(value, dict):
        return {k: resolve_bindings(v, context) for k, v in value.items()}
    if isinstance(value, list):
        return [resolve_bindings(v, context) for v in value]
    if isinstance(value, str):
        return _resolve_string(value, context)
    return value


def _resolve_string(s: str, context: dict[str, Any]) -> Any:
    """Resolve the bindings contained in a single string.

    If the string (ignoring surrounding whitespace) is exactly one binding,
    the looked-up value is returned with its native type. Otherwise every
    binding is replaced by ``str(value)`` (``None`` becomes the empty
    string) and the result is always a string.
    """
    # Whole-string binding → native type
    whole = _BINDING_RE.fullmatch(s.strip())
    if whole:
        return _lookup(whole.group(1).strip(), context)

    # Mixed / multi-binding → string interpolation
    def _sub(match: re.Match[str]) -> str:
        val = _lookup(match.group(1).strip(), context)
        return "" if val is None else str(val)

    return _BINDING_RE.sub(_sub, s)


def _describe_keys(keys: Any) -> str:
    """Render up to ``_MAX_KEYS_IN_ERROR`` keys for an error message.

    Keys are ordered by their ``str()`` form so that a mapping with
    mixed-type keys (e.g. ``{1: ..., "a": ...}``) cannot make the error path
    itself fail with ``TypeError``. A trailing ``...`` is appended only when
    keys were actually left out.
    """
    ordered = sorted(keys, key=str)
    shown = repr(ordered[:_MAX_KEYS_IN_ERROR])
    return f"{shown}..." if len(ordered) > _MAX_KEYS_IN_ERROR else shown


def _lookup(expr: str, context: dict[str, Any]) -> Any:
    """Walk a dotted path with optional ``[N]`` indices against *context*.

    Raises:
        BindingError: For an empty expression, an unknown source, a missing
            key, a non-numeric or out-of-range index, or when the path
            descends into a value of the wrong container type.
    """
    tokens = _tokenize(expr)
    if not tokens:
        raise BindingError(f"empty binding expression: {expr!r}")

    head = tokens[0]
    if head not in context:
        raise BindingError(
            f"unknown source {head!r} in binding ${{...}}: "
            f"context has {sorted(context, key=str)!r}"
        )
    node: Any = context[head]

    for tok in tokens[1:]:
        if tok.startswith("[") and tok.endswith("]"):
            # array index — allow negative too
            try:
                idx = int(tok[1:-1])
            except ValueError as exc:
                raise BindingError(f"non-numeric array index {tok!r} in binding {expr!r}") from exc
            if not isinstance(node, (list, tuple)):
                raise BindingError(
                    f"indexing {tok} on non-list type {type(node).__name__} (expr={expr!r})"
                )
            try:
                node = node[idx]
            except IndexError as exc:
                raise BindingError(f"index {idx} out of range in binding {expr!r}") from exc
        else:
            if not isinstance(node, dict):
                raise BindingError(
                    f"cannot descend into .{tok} on non-dict type {type(node).__name__} "
                    f"(expr={expr!r})"
                )
            if tok not in node:
                raise BindingError(
                    f"key {tok!r} not found in binding {expr!r} "
                    f"(available: {_describe_keys(node)})"
                )
            node = node[tok]

    return node


def _tokenize(expr: str) -> list[str]:
    """Tokenize a dotted path with ``[N]`` indices.

    ``s1.body.goods[0].goodsNo`` → ``["s1", "body", "goods", "[0]", "goodsNo"]``

    Empty segments (leading, trailing or doubled dots) produce no token.

    Raises:
        BindingError: If a ``[`` has no matching ``]``.
    """
    tokens: list[str] = []
    buf: list[str] = []
    i = 0
    while i < len(expr):
        ch = expr[i]
        if ch == ".":
            if buf:
                tokens.append("".join(buf))
                buf = []
        elif ch == "[":
            if buf:
                tokens.append("".join(buf))
                buf = []
            end = expr.find("]", i)
            if end == -1:
                raise BindingError(f"unclosed '[' in binding {expr!r}")
            # Keep the brackets so _lookup can tell an index from a key.
            tokens.append(expr[i : end + 1])
            i = end
        else:
            buf.append(ch)
        i += 1
    if buf:
        tokens.append("".join(buf))
    return tokens


__all__ = ["BindingError", "resolve_bindings"]
_VOCAB_FUZZY_CUTOFF = 0.8


# ---------------------------------------------------------------------------
# data shape
# ---------------------------------------------------------------------------


@dataclass
class ToolCatalogEntry:
    """Condensed tool view for intent-parsing prompt — under ~150 chars each.

    One entry is rendered per candidate tool in the prompt catalog.
    """

    # Tool name; the LLM's chosen target must equal one of these names.
    name: str
    summary: str = ""  # one_line_summary from ai_metadata
    when_to_use: str = ""  # ai_metadata.when_to_use
    # Rendered on the tool's "needs:" line in the prompt.
    consumes_tags: list[str] = field(default_factory=list)  # required semantic ids
    canonical_action: str = ""  # "read" | "search" | "create" | ...
    primary_resource: str = ""  # "product" | ...


@dataclass
class ParsedIntent:
    """Stage 1 output — consumed by Stage 2 PathSynthesizer."""

    target: str  # tool name picked by LLM
    # Entity key -> value pairs extracted from the requirement.
    entities: dict[str, Any] = field(default_factory=dict)
    confidence: float = 0.0  # 0.0 ~ 1.0
    output_shape: str = "single"  # "single" | "list" | "count"
    reasoning: str = ""  # one short sentence for audit logs


class IntentParseError(Exception):
    """Raised when the LLM output can't be mapped to a valid ParsedIntent."""
+ Identifier fields accept short alphanumeric record locators + only ("G12345", "10293"). A descriptive phrase placed in such + a field is always wrong. + HC2. DO NOT invent field names. Every entity key MUST appear in one + of the candidate tools' "needs:" lines. If no listed field can + carry the user's value without violating HC1, omit the entity — + empty entities are fine; the downstream synthesizer chains + through a producer. + HC3. DO NOT put the same value into more than one field. Each value + goes into zero or exactly one field. + HC4. DO NOT translate, normalize, paraphrase, or expand the user's + value. Copy it byte-for-byte as written in the requirement. + HC5. For fields that have an enum mapping below, the entity value + MUST be one of the listed CODES (left side), never the label + (right side) and never the user's original phrase. Pick the + code whose label best matches the user's intent. If nothing + matches clearly, omit that entity. + +Selection guidance (apply only after the constraints hold): + - Pick exactly ONE tool — the final-goal tool. Do not plan the chain; + the downstream system builds prerequisite steps automatically. + - Free-text values (descriptive phrases like "quarzen 티셔츠", + "black hoodie") match fields named "searchWord", "query", + "keyword", or names ending in "Nm" / "Name". + - When several fields could carry the value without violating HC1, + prefer one a candidate's "needs:" line lists — that is a field a + tool you already considered actually accepts. + - output_shape: "single" / "list" / "count". + - confidence: 0.0~1.0 — your certainty in the tool pick. + - reasoning: one short sentence for audit logs. + +Output JSON only — no markdown, no prose. Schema: +{{ + "target": "", + "entities": {{...}}, + "confidence": 0.0, + "output_shape": "single" | "list" | "count", + "reasoning": "..." 
def _coerce_entity_keys(
    entities: dict[str, Any],
    vocab: list[str],
    *,
    cutoff: float | None = None,
) -> dict[str, Any]:
    """Map LLM-emitted entity keys onto the vocabulary.

    Exact match → kept. Close match at or above *cutoff* → coerced to the
    canonical vocab entry. Otherwise the entry is dropped — silently passing
    an invented key downstream causes producer-chain failures or cycle
    detection (the vocab miss is the failure, not the symptom).

    A value supplied under the exact vocabulary key is never overwritten by
    a value that only reached that key through fuzzy coercion. Between two
    fuzzy matches for the same key, the later one wins.

    Args:
        entities: Raw ``{key: value}`` mapping from the LLM output.
        vocab: Allowed entity keys.
        cutoff: Minimum ``difflib`` similarity ratio for a fuzzy match;
            defaults to ``_VOCAB_FUZZY_CUTOFF``.

    Returns:
        A new dict containing only vocabulary keys.
    """
    if cutoff is None:
        cutoff = _VOCAB_FUZZY_CUTOFF
    vocab_set = set(vocab)
    exact_keys: set[str] = set()
    out: dict[str, Any] = {}
    for key, value in entities.items():
        key_str = str(key)
        if key_str in vocab_set:
            out[key_str] = value
            exact_keys.add(key_str)
            continue
        match = difflib.get_close_matches(key_str, vocab, n=1, cutoff=cutoff)
        # An exact key is authoritative: do not let a near-miss spelling of
        # the same field clobber the value the LLM gave under the real name.
        if match and match[0] not in exact_keys:
            out[match[0]] = value
    return out


def _format_seed_block(seed_entities: dict[str, Any] | None) -> str:
    """Render a 'carry forward' section for entities the caller already
    decided in a previous turn.

    Multi-turn flow: when a previous synthesize attempt asked the user to
    pick a value (e.g. via a popup of enum options), the chosen pairs are
    fed back as ``seed_entities``. The LLM should keep them as-is unless
    the new requirement explicitly contradicts a value, and only EXTRACT
    NEW entities to add. Empty / None ⇒ section omitted (empty string).
    Values are rendered with ``json.dumps(..., ensure_ascii=False)``.
    """
    if not seed_entities:
        return ""
    lines = "\n".join(
        f" - {k}: {json.dumps(v, ensure_ascii=False)}" for k, v in seed_entities.items()
    )
    return (
        "\n\nExisting entities (carried over from prior turns — keep these "
        "values exactly unless the user's new requirement explicitly "
        "overrides one. You only need to extract additional entities that "
        "the new requirement introduces):\n"
        f"{lines}"
    )


def _format_enum_block(enum_mappings: dict[str, dict[str, str]] | None) -> str:
    """Render the optional enum-mapping section of the prompt.

    ``enum_mappings`` shape: ``{field_name: {code: label}}`` — operator-
    registered code lookups for backend enum fields whose values aren't
    in the swagger schema (e.g. "10" -> "비회원" for a basket type code).
    The LLM picks the code whose label matches the user's natural-language
    intent. Fields whose mapping is empty or not a dict are skipped; when
    nothing remains the section is omitted entirely (empty string).
    """
    if not enum_mappings:
        return ""
    lines: list[str] = []
    for field_name, codes in enum_mappings.items():
        if not isinstance(codes, dict) or not codes:
            continue
        lines.append(f" - {field_name}:")
        lines.extend(f' "{code}" → {label}' for code, label in codes.items())
    if not lines:
        return ""
    body = "\n".join(lines)
    return (
        "\n\nEnum code mappings (operator-registered — when one of these "
        "fields needs a value, pick the CODE whose label matches the "
        "user's intent):\n"
        f"{body}"
    )


def _format_vocabulary_block(tags: list[str]) -> str:
    """Render the optional vocabulary section of the prompt.

    Returns an empty string when no vocab is provided so the prompt
    stays focused on ``catalog``. Callers that want LLM access to
    field names beyond the catalog (e.g. when retrieval failed to pull
    in producers) can pass a non-empty list.
    """
    if not tags:
        return ""
    lines = "\n".join(f" - {t}" for t in tags)
    return (
        "\n\nAvailable entity field names — backup vocabulary used only when "
        "no candidate tool's \"needs:\" line carries the user's value:\n"
        f"{lines}"
    )


def _format_catalog(entries: list[ToolCatalogEntry]) -> str:
    """Render the numbered candidate-tool list for the prompt.

    Each entry yields a headline (``N. name action/resource — summary``)
    followed by optional ``when:`` (first 140 characters) and ``needs:``
    (first 6 tags) lines.
    """
    lines: list[str] = []
    for i, e in enumerate(entries, start=1):
        parts = [f"{i}. {e.name}"]
        if e.canonical_action or e.primary_resource:
            # strip("[/]") removes any of "[", "/", "]" from both ends, so
            # the brackets and a dangling slash (one side empty) disappear.
            parts.append(f"[{e.canonical_action}/{e.primary_resource}]".strip("[/]"))
        if e.summary:
            parts.append(f"— {e.summary}")
        lines.append(" ".join(p for p in parts if p))
        if e.when_to_use:
            lines.append(f" when: {e.when_to_use[:140]}")
        if e.consumes_tags:
            lines.append(f" needs: {', '.join(e.consumes_tags[:6])}")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------


def parse_intent(
    requirement: str,
    catalog: list[ToolCatalogEntry],
    llm: OntologyLLM,
    *,
    vocabulary: list[str] | None = None,
    enum_mappings: dict[str, dict[str, str]] | None = None,
    seed_entities: dict[str, Any] | None = None,
) -> ParsedIntent:
    """Call the LLM once to produce a ParsedIntent.

    ``catalog`` should be the retrieval-shortlisted candidate tools (keep
    small — ~10 entries — to control prompt size). ``vocabulary`` is the
    full set of ``kind=data`` semantic ids in the graph (so the LLM can
    map free-text inputs to a search-style key even when the matching
    producer wasn't retrieved); when omitted or empty it is derived from
    the catalog's ``consumes_tags``. ``enum_mappings`` is operator-registered
    ``{field_name: {code: label}}`` lookups for backend enum fields whose
    values aren't in the swagger schema — exposed only when relevant
    (caller should pre-filter to the catalog's consumes fields).
    ``seed_entities`` carries entities decided in earlier turns of a
    multi-turn flow (e.g. user clicked an option in a popup); they are
    merged into the result, with entities from this turn winning on
    conflict. ``llm`` is any OntologyLLM-compatible provider.

    Returns:
        The parsed intent. ``confidence`` is clamped to ``[0.0, 1.0]`` (an
        unparseable or NaN value becomes ``0.0``) and an unknown
        ``output_shape`` falls back to ``"single"``.

    Raises:
        IntentParseError: If the catalog is empty, the LLM output is not a
            JSON object, or the target is missing or not in the catalog.
    """
    import math  # local import: only needed for the NaN guard below

    if not catalog:
        raise IntentParseError("empty catalog — cannot pick a target")

    # Fallback when no vocabulary is supplied: derive it from the catalog,
    # de-duplicated in first-seen order. Same-domain narrowing only —
    # callers that supply the full graph vocab get better accuracy.
    vocab = vocabulary or list(
        dict.fromkeys(tag for entry in catalog for tag in entry.consumes_tags if tag)
    )

    prompt = _INTENT_PROMPT.format(
        requirement=requirement.strip(),
        catalog=_format_catalog(catalog),
        vocabulary_block=_format_vocabulary_block(vocab),
        enum_block=_format_enum_block(enum_mappings),
        seed_block=_format_seed_block(seed_entities),
    )
    raw = llm.generate(prompt)

    try:
        parsed = _extract_json(raw)
    except json.JSONDecodeError as exc:
        raise IntentParseError(f"LLM output not parseable JSON: {exc}") from exc

    if not isinstance(parsed, dict):
        raise IntentParseError(f"expected JSON object, got {type(parsed).__name__}")

    target = str(parsed.get("target") or "").strip()
    if not target:
        raise IntentParseError("target missing from LLM output")

    # Validate target is in the catalog — guard against hallucinated names
    allowed = {e.name for e in catalog}
    if target not in allowed:
        raise IntentParseError(
            f"target {target!r} not in catalog (candidates: {sorted(allowed)[:5]!r}...)"
        )

    entities_raw = parsed.get("entities")
    entities = entities_raw if isinstance(entities_raw, dict) else {}

    # Validate entity keys against the vocabulary. The LLM regularly emits
    # a slightly-elaborated key ("search_keyword_name" instead of
    # "search_keyword") that nothing downstream can match — coerce the
    # close ones, drop the rest. A wrong key triggers worse downstream
    # behavior than no key.
    if vocab and entities:
        entities = _coerce_entity_keys(entities, vocab)

    # Multi-turn safety net: even if the LLM ignored the carry-forward
    # instructions, prior-turn entities must persist. New entities from
    # this turn override on conflict (later turn wins for explicit
    # contradictions in the requirement).
    if seed_entities:
        entities = {**seed_entities, **entities}

    try:
        confidence = float(parsed.get("confidence") or 0.0)
    except (TypeError, ValueError):
        confidence = 0.0
    # min(1.0, nan) evaluates to 1.0, so without this guard a NaN confidence
    # would be reported as full certainty.
    if math.isnan(confidence):
        confidence = 0.0
    confidence = max(0.0, min(1.0, confidence))

    shape = str(parsed.get("output_shape") or "single").strip().lower()
    if shape not in ("single", "list", "count"):
        shape = "single"

    return ParsedIntent(
        target=target,
        entities=entities,
        confidence=confidence,
        output_shape=shape,
        reasoning=str(parsed.get("reasoning") or "").strip(),
    )
The list you see is NOT + necessarily the complete list. +- If the result contains an explicit total field (e.g. ``totalCount``, + ``totalElements``, ``total``, ``count``, ``size`` at top-level or inside + ``payload`` / ``data``), USE THAT NUMBER as the actual count and say + "총 N개 중 일부" or similar. +- If no total field exists, do NOT claim a specific count. Avoid phrases like + "현재 1개 등록되어 있습니다" — instead say "조회된 리뷰" or + "응답에 포함된 항목". Counting visible list items as the absolute total + is forbidden. +""" + + +_FAILURE_PROMPT = """\ +You explain an API execution failure to the user. + +User asked: +{requirement} + +Plan aborted at step {failed_step!r}. +Error: {error} + +Partial results collected before the failure: +{partial} + +Tell the user clearly in Korean (unless the question is another language): + - what they asked for + - what was attempted + - where and why it failed (in plain language — do not dump stack traces) + - what they can try next, if obvious +Keep it short and helpful — 2~4 sentences. 
# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------


def synthesize_success_response(
    *,
    requirement: str,
    result: Any,
    llm: OntologyLLM,
    result_char_limit: int = 4000,
) -> str:
    """Turn a completed plan's output into a natural-language answer.

    The result is serialized and truncated to ``result_char_limit``
    characters, placed into the success prompt together with the stripped
    requirement, and the stripped LLM reply is returned.
    """
    rendered_result = _render(result, result_char_limit)
    prompt = _SUCCESS_PROMPT.format(
        requirement=requirement.strip(),
        result=rendered_result,
    )
    answer = llm.generate(prompt)
    return answer.strip()


def synthesize_failure_response(
    *,
    requirement: str,
    failed_step: str,
    error: Any,
    partial_results: Any = None,
    llm: OntologyLLM,
    partial_char_limit: int = 1000,
) -> str:
    """Explain an aborted plan to the user.

    The error is rendered with a 300-character cap. Partial results are
    rendered with ``partial_char_limit``; any falsy value (``None``, empty
    container, ...) is shown as ``(none)``. Returns the stripped LLM reply.
    """
    if partial_results:
        rendered_partial = _render(partial_results, partial_char_limit)
    else:
        rendered_partial = "(none)"
    prompt = _FAILURE_PROMPT.format(
        requirement=requirement.strip(),
        failed_step=failed_step,
        error=_render(error, 300),
        partial=rendered_partial,
    )
    answer = llm.generate(prompt)
    return answer.strip()


# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------


def _render(value: Any, char_limit: int) -> str:
    """Serialize *value* to a short string for prompt use.

    ``None`` renders as ``(none)``. Strings are used as-is; anything else is
    JSON-encoded (2-space indent, non-ASCII kept), falling back to
    ``str(value)`` when it is not JSON-serializable. Text longer than
    ``char_limit`` is cut there and suffixed with ``…``.
    """
    if value is None:
        return "(none)"

    if isinstance(value, str):
        text = value
    else:
        try:
            text = json.dumps(value, ensure_ascii=False, indent=2)
        except (TypeError, ValueError):
            text = str(value)

    if len(text) > char_limit:
        return text[:char_limit] + "…"
    return text


__all__ = [
    "synthesize_success_response",
    "synthesize_failure_response",
]
artifacts. + +The runner is transport-agnostic: it takes a ``call_tool`` callable that +actually performs each step. This decouples ``graph_tool_call`` (pure +plan/graph logic) from integration concerns (HTTP, auth, retries — +handled by the caller's adapter). + +The runner emits structured events as it progresses — callers can relay +these over SSE, logs, or progress UIs. + +v1 scope reminder: **linear execution, no fan-out, no conditionals, no +automatic re-planning**. Failures abort the run and return a trace. +""" + +from __future__ import annotations + +import time +from collections.abc import Callable, Iterator +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + +from graph_tool_call.plan.binding import BindingError, resolve_bindings +from graph_tool_call.plan.schema import ( + ExecutionTrace, + Plan, + StepTrace, +) + +# --------------------------------------------------------------------------- +# Event types — structured so callers can pattern-match by ``type`` field +# --------------------------------------------------------------------------- + + +@dataclass +class PlanStarted: + type: str = "plan.started" + plan_id: str = "" + goal: str = "" + step_count: int = 0 + + +@dataclass +class StepStarted: + type: str = "step.started" + step_id: str = "" + tool: str = "" + args_resolved: dict[str, Any] = field(default_factory=dict) + index: int = 0 + total: int = 0 + + +@dataclass +class StepCompleted: + type: str = "step.completed" + step_id: str = "" + tool: str = "" + duration_ms: int = 0 + output_preview: Any = None # truncated output for UI + output_size: int = 0 + + +@dataclass +class StepFailed: + type: str = "step.failed" + step_id: str = "" + tool: str = "" + error: dict[str, Any] = field(default_factory=dict) + duration_ms: int = 0 + + +@dataclass +class PlanCompleted: + type: str = "plan.completed" + plan_id: str = "" + output: Any = None + total_duration_ms: int = 0 + # 누적 step traces — 비-스트리밍 
``run()`` 이 ExecutionTrace.steps 채울 때 사용. + trace_steps: list[StepTrace] = field(default_factory=list) + + +@dataclass +class PlanAborted: + type: str = "plan.aborted" + plan_id: str = "" + failed_step: str = "" + error: dict[str, Any] = field(default_factory=dict) + total_duration_ms: int = 0 + trace_steps: list[StepTrace] = field(default_factory=list) + + +PlanEvent = PlanStarted | StepStarted | StepCompleted | StepFailed | PlanCompleted | PlanAborted + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- + + +# ToolCaller signature: (tool_name, resolved_args) -> output_dict +ToolCaller = Callable[[str, dict[str, Any]], Any] + + +class PlanRunner: + """Execute a Plan step-by-step using a caller-provided tool invoker. + + Usage:: + + def call_tool(name: str, args: dict) -> dict: + return my_http_executor.execute(name, args) + + runner = PlanRunner(call_tool) + trace = runner.run(plan) # run to completion, return trace + # or — streaming: + for event in runner.run_stream(plan): + send_over_sse(event) + """ + + def __init__( + self, + call_tool: ToolCaller, + *, + output_preview_limit: int = 512, + on_error: str = "abort", # 'abort' only in v1 + ) -> None: + self._call_tool = call_tool + self._preview_limit = output_preview_limit + if on_error != "abort": + raise ValueError("v1 PlanRunner only supports on_error='abort'") + + # ---------------------------------------------------------------------- + # Streaming interface — yields PlanEvent instances + # ---------------------------------------------------------------------- + + def run_stream( + self, + plan: Plan, + *, + input_context: dict[str, Any] | None = None, + ) -> Iterator[PlanEvent]: + """Execute *plan* and yield events as each step progresses. 
+ + ``input_context`` supplies values for ``${input.xxx}`` and + ``${user_input.xxx}`` bindings (both keys resolve to the same dict, + kept as aliases because the synthesizer emits ``user_input`` for + F2/Cycle-policy fallbacks and historical entity-injection paths use + ``input``). Typically the entities extracted by Stage 1 (intent + parser) plus any operator-supplied seed values. + """ + plan_start = time.monotonic() + + yield PlanStarted( + plan_id=plan.id, + goal=plan.goal, + step_count=len(plan.steps), + ) + + # step_id -> output (runtime context for binding resolution). + # ``input`` and ``user_input`` are aliases — same dict, both names — + # so binding ``${input.x}`` and ``${user_input.x}`` both resolve. + context: dict[str, Any] = {} + if input_context: + input_dict = dict(input_context) + context["input"] = input_dict + context["user_input"] = input_dict + + trace_steps: list[StepTrace] = [] + + for idx, step in enumerate(plan.steps, start=1): + step_trace = StepTrace(id=step.id, tool=step.tool) + step_start = time.monotonic() + + # 1. Resolve bindings + try: + resolved = resolve_bindings(step.args, context) + except BindingError as exc: + err = { + "kind": "binding", + "message": str(exc), + } + step_trace.error = err + step_trace.duration_ms = _ms_since(step_start) + trace_steps.append(step_trace) + yield StepFailed( + step_id=step.id, + tool=step.tool, + error=err, + duration_ms=step_trace.duration_ms, + ) + yield PlanAborted( + plan_id=plan.id, + failed_step=step.id, + error=err, + total_duration_ms=_ms_since(plan_start), + trace_steps=list(trace_steps), + ) + return + + step_trace.args_resolved = resolved + yield StepStarted( + step_id=step.id, + tool=step.tool, + args_resolved=resolved, + index=idx, + total=len(plan.steps), + ) + + # 2. 
Execute via caller's tool invoker + try: + output = self._call_tool(step.tool, resolved) + except Exception as exc: # noqa: BLE001 — caller-defined + err = { + "kind": "tool", + "message": str(exc), + "exception_type": type(exc).__name__, + } + step_trace.error = err + step_trace.duration_ms = _ms_since(step_start) + trace_steps.append(step_trace) + yield StepFailed( + step_id=step.id, + tool=step.tool, + error=err, + duration_ms=step_trace.duration_ms, + ) + yield PlanAborted( + plan_id=plan.id, + failed_step=step.id, + error=err, + total_duration_ms=_ms_since(plan_start), + trace_steps=list(trace_steps), + ) + return + + # 2a. Unwrap a single-level envelope when the response shape + # diverges from the schema in the canonical "{code, message, + # : {...}, timestamp}" pattern. One detect per step, + # not per binding — every binding for this step then resolves + # against the unwrapped dict naturally. + output = _maybe_unwrap_envelope(output, step.response_root_keys) + + step_trace.output = output + step_trace.duration_ms = _ms_since(step_start) + trace_steps.append(step_trace) + + # 3. Store output in context for later bindings + context[step.id] = output + + yield StepCompleted( + step_id=step.id, + tool=step.tool, + duration_ms=step_trace.duration_ms, + output_preview=_preview(output, self._preview_limit), + output_size=_output_size(output), + ) + + # 4. 
Resolve output_binding for final answer + try: + final = ( + resolve_bindings(plan.output_binding, context) + if plan.output_binding + else (context[plan.steps[-1].id] if plan.steps else None) + ) + except BindingError as exc: + err = {"kind": "output_binding", "message": str(exc)} + yield PlanAborted( + plan_id=plan.id, + failed_step="", + error=err, + total_duration_ms=_ms_since(plan_start), + trace_steps=list(trace_steps), + ) + return + + yield PlanCompleted( + plan_id=plan.id, + output=final, + total_duration_ms=_ms_since(plan_start), + trace_steps=list(trace_steps), + ) + + # ---------------------------------------------------------------------- + # Non-streaming interface — returns final ExecutionTrace + # ---------------------------------------------------------------------- + + def run( + self, + plan: Plan, + *, + input_context: dict[str, Any] | None = None, + ) -> ExecutionTrace: + """Execute *plan* and return an ExecutionTrace aggregating events. + + ``trace_steps`` 는 종결 이벤트 (``PlanCompleted`` / ``PlanAborted``) 가 + 실어 보내는 것을 그대로 사용 — run_stream 안에서 step 단위로 누적된 + StepTrace 가 그대로 ExecutionTrace.steps 에 들어간다. 
+ """ + started_at = _now_iso() + started = time.monotonic() + trace_steps: list[StepTrace] = [] + success = False + failed_step: str | None = None + output: Any = None + + for event in self.run_stream(plan, input_context=input_context): + etype = event.type + if etype == "plan.completed": + success = True + output = event.output # type: ignore[union-attr] + trace_steps = list(event.trace_steps) # type: ignore[union-attr] + elif etype == "plan.aborted": + failed_step = event.failed_step # type: ignore[union-attr] + trace_steps = list(event.trace_steps) # type: ignore[union-attr] + + return ExecutionTrace( + plan_id=plan.id, + success=success, + steps=trace_steps, + output=output, + failed_step=failed_step, + total_duration_ms=_ms_since(started), + started_at=started_at, + ended_at=_now_iso(), + ) + + +# --------------------------------------------------------------------------- +# helpers +# --------------------------------------------------------------------------- + + +def _ms_since(start_monotonic: float) -> int: + return int((time.monotonic() - start_monotonic) * 1000) + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _preview(value: Any, limit: int) -> Any: + """Trim large outputs for UI previews. Keep small values intact.""" + if isinstance(value, (dict, list)): + import json as _json + + try: + rendered = _json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return {"_preview": f""} + if len(rendered) <= limit: + return value + return {"_preview": rendered[:limit] + "…", "_truncated": True} + if isinstance(value, str) and len(value) > limit: + return value[:limit] + "…" + return value + + +def _maybe_unwrap_envelope( + output: Any, + expected_root_keys: list[str], +) -> Any: + """Peel one envelope layer when the response shape diverges from schema. + + Conservative — unwraps only when ALL of these hold: + + 1. 
``output`` is a dict with two or more root keys + (a bare ``{"payload": ...}`` is more likely real data than envelope). + 2. Exactly one root value is itself a dict — the wrapper candidate. + 3. Every other root value is scalar / null + (envelope siblings are status/code/message/timestamp — not + business collections). + 4. None of ``expected_root_keys`` appears at the response root + (otherwise the response is already in schema-shape). + 5. At least one ``expected_root_keys`` entry appears inside the + wrapper candidate (otherwise the dict-typed sibling is unrelated + business data — unwrapping would lose information). + + The wrapper *key name* is never inspected, so this works for + ``payload`` / ``data`` / ``result`` / any other convention. Without + ``expected_root_keys`` there's no schema signal to validate against, + so the output passes through unchanged. + """ + if not expected_root_keys or not isinstance(output, dict) or len(output) < 2: + return output + + dict_keys = [k for k, v in output.items() if isinstance(v, dict)] + if len(dict_keys) != 1: + return output + + wrapper_key = dict_keys[0] + for k, v in output.items(): + if k == wrapper_key: + continue + if isinstance(v, (dict, list)): + return output + + expected = set(expected_root_keys) + if expected & set(output.keys()): + return output + + wrapper = output[wrapper_key] + if not (expected & set(wrapper.keys())): + return output + + return wrapper + + +def _output_size(value: Any) -> int: + """Approximate serialized byte size (for observability).""" + import json as _json + + try: + return len(_json.dumps(value, ensure_ascii=False)) + except (TypeError, ValueError): + return 0 diff --git a/graph_tool_call/plan/schema.py b/graph_tool_call/plan/schema.py new file mode 100644 index 0000000..9fff497 --- /dev/null +++ b/graph_tool_call/plan/schema.py @@ -0,0 +1,86 @@ +"""Plan and ExecutionTrace dataclasses. 
@dataclass
class PlanStep:
    """A single step in a Plan.

    ``args`` may contain binding placeholders of the form
    ``${step_id.json.path}`` or ``${input.keyword}``. These are resolved
    at runtime by ``resolve_bindings`` using the accumulated step context.
    """

    id: str  # "s1", "s2", ...
    tool: str  # function_name (graph node name)
    # Argument name -> literal value or binding placeholder string.
    args: dict[str, Any] = field(default_factory=dict)
    rationale: str = ""  # why this step exists (for audit)
    # NOTE(review): presumably a per-step timeout in milliseconds, with None
    # meaning "no override" — confirm against PlanRunner.
    timeout_ms: int | None = None
    retryable: bool = False  # reserved for v1.1 retry policy
    # Top-level keys the synthesizer expects in this tool's response,
    # derived from ``produces[].json_path``. Used by PlanRunner to detect
    # envelope wrappers (e.g. ``{code, message, payload: {...}}``) when the
    # ingest captured the wrapped fields without the wrapper itself. Empty
    # list means "no hint" — the runner then leaves the response untouched.
    response_root_keys: list[str] = field(default_factory=list)


@dataclass
class Plan:
    """Executable plan — ordered steps with binding references.

    v1 scope: **linear execution only**. Steps run in listed order. No
    fan-out, no conditional branching, no parallelism. Each step may
    reference earlier step outputs via ``${sN.path}`` bindings.

    ``output_binding`` designates which step's (or sub-path's) result is
    the final answer. If unset, runner returns the last step's result.
    """

    id: str  # uuid
    goal: str  # user requirement summary
    steps: list[PlanStep] = field(default_factory=list)
    output_binding: str | None = None  # e.g. "${s2.body}"
    created_at: str = ""  # ISO8601
    # Free-form bookkeeping attached by whoever built the plan.
    metadata: dict[str, Any] = field(default_factory=dict)


@dataclass
class StepTrace:
    """Record of a single step execution."""

    id: str
    tool: str
    # The step's args after binding placeholders were resolved.
    args_resolved: dict[str, Any] = field(default_factory=dict)
    output: Any = None  # set on success
    error: dict[str, Any] | None = None  # set on failure
    duration_ms: int = 0
    retries: int = 0


@dataclass
class ExecutionTrace:
    """Result of a full Plan execution."""

    plan_id: str
    success: bool
    steps: list[StepTrace] = field(default_factory=list)
    output: Any = None  # plan.output_binding resolved
    # Identifier of the step that aborted the plan; None when nothing failed.
    failed_step: str | None = None
    total_duration_ms: int = 0
    # Timestamp strings; default to "" when not recorded.
    started_at: str = ""
    ended_at: str = ""
Another tool's ``produces`` with the same ``field_name`` + (Pass 1 deterministic extraction, fallback). + +Producer selection is ranked by Pass 2 metadata signals — no hardcoded +domain or field rules: + - Entity affinity: producer consumes an entity the user supplied, + so chaining through it actually uses that entity. + - Pair hint: target's ``pairs_well_with`` includes this producer. + - Action preference: ``canonical_action`` = search/read fits a + prerequisite role better than create/update/delete. + +``consumes[].kind`` ("data" | "context", set by Pass 2): + - "data" — chain to a producer if entity doesn't match. + - "context" — ambient config (locale, site, tenant). Never chained; + must come from entity or skipped (runtime uses API default). +""" + +from __future__ import annotations + +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + +from graph_tool_call.plan.schema import Plan, PlanStep + + +class PlanSynthesisError(Exception): + """Base class for synthesis failures.""" + + +class UnsatisfiableFieldError(PlanSynthesisError): + """A required field cannot be supplied by entities or any producer.""" + + +class CyclicDependencyError(PlanSynthesisError): + """The synthesis trace revisits a tool already in progress.""" + + +class MaxDepthExceededError(PlanSynthesisError): + """Recursion depth exceeded — likely a misshapen graph.""" + + +class DynamicOptionRequired(UnsatisfiableFieldError): # noqa: N818 + """A required data field has a single-hop producer that can be called + immediately with the user's entities + context_defaults. Surface this + so the caller can fetch the option list (instead of weaving a chain) + and ask the user to pick — the popup-driven UX for fields like + ``itmNo`` (single-품목 option) where the choices are dynamic per + request. 
+ + The exception carries enough metadata for the caller to: + * know which producer to call (``producer_name``) + * find the option array in the producer's response (``options_path``) + * pick a sensible label field next to each code (``label_field_hints``) + """ + + def __init__( + self, + message: str, + *, + field_name: str, + semantic_tag: str, + producer_name: str, + options_path: str, + label_field_hints: list[str], + ) -> None: + super().__init__(message) + self.field_name = field_name + self.semantic_tag = semantic_tag + self.producer_name = producer_name + self.options_path = options_path + self.label_field_hints = list(label_field_hints) + + +def _normalize_field_name(name: str) -> str: + """Lowercase + strip non-alphanumerics for loose field-name matching. + + Conservative on purpose: + ``ordNo`` → ``ordno`` ``ord_no`` → ``ordno`` ``ORD-NO`` → ``ordno`` + + Token roots stay distinct: + ``ordNo`` ≠ ``orderNo`` (``ordno`` ≠ ``orderno``) + + Token-level synonym mapping (``ord`` ↔ ``order``) is domain-specific + and intentionally NOT done here — that's the job of the graph-edge + fallback in ``_find_producer``, which uses path/$ref/CRUD signals + instead of name guessing. + """ + if not name: + return "" + return "".join(ch.lower() for ch in name if ch.isalnum()) + + +@dataclass +class _PartialStep: + """In-progress step being built during bottom-up synthesis.""" + + tool: str + args: dict[str, Any] = field(default_factory=dict) + rationale: str = "" + step_id: str = "" # assigned at topological sort + + +class PathSynthesizer: + """Deterministic plan builder driven by graph ``produces``/``consumes``. 
+ + Usage:: + + syn = PathSynthesizer(graph_dict) + plan = syn.synthesize( + target="seltProductDetailInfo", + entities={"search_keyword": "quarzen 티셔츠"}, + ) + """ + + def __init__( + self, + graph: dict[str, Any], + *, + max_depth: int = 5, + context_defaults: dict[str, Any] | None = None, + enum_field_names: set[str] | None = None, + ) -> None: + self._tools: dict[str, dict[str, Any]] = dict(graph.get("tools") or {}) + self._max_depth = max_depth + # Collection-level ambient values (locale, tenant id, site id, ...) the + # operator registers once per collection. Filled into ``kind=context`` + # consume fields when the user's entities don't supply them — avoids + # repeating env-style args in every requirement and avoids leaking + # backend-specific defaults into library code. Lookup precedence: + # entities > context_defaults > skip. + self._context_defaults: dict[str, Any] = dict(context_defaults or {}) + # Field names the operator registered an enum mapping for. When a + # required-data field of this kind can't be filled by an entity, + # the synthesizer raises UnsatisfiableFieldError instead of + # producer-chaining — the caller (service layer) is expected to + # surface a popup to the user rather than weaving an awkward + # producer chain that pulls in unrelated tools just to source a + # code value. User intent (popup choice) wins over chain depth. + self._enum_field_names: set[str] = set(enum_field_names or ()) + # semantic_tag -> [tool_name], insertion order preserved + self._producers_by_semantic: dict[str, list[str]] = {} + self._producers_by_field: dict[str, list[str]] = {} + # Loose-field index: normalised field name → [tool_name]. + # Lets ``ordNo`` match producers of ``ordno`` / ``ord_no`` / ``ORDNO``. + # Conservative — only normalises case + separators, never strips + # tokens (so ``ordNo`` ≠ ``orderNo`` — those need the graph fallback). 
def synthesize(
    self,
    *,
    target: str,
    entities: dict[str, Any] | None = None,
    goal: str = "",
) -> Plan:
    """Build a Plan whose final step is ``target`` with required args
    filled by entities + prerequisite steps.

    Args:
        target: Name of the tool the plan must end with; must be a key of
            the graph's tools.
        entities: User-supplied values used to fill consume fields.
            ``None`` is treated as an empty mapping.
        goal: Human-readable goal stored on the plan. Defaults to
            ``"Execute <target>"`` when empty.

    Returns:
        A ``Plan`` whose steps are numbered ``s1..sN`` in resolution order
        and whose ``output_binding`` points at the target step.
        ``metadata["user_input_slots"]`` lists every argument that still
        holds a ``${user_input.…}`` placeholder.

    Raises:
        PlanSynthesisError: ``target`` is not in the graph.
        UnsatisfiableFieldError: raised by ``_resolve`` for a required
            enum-registered field that no entity fills.
        DynamicOptionRequired: raised by ``_resolve`` when the caller
            should fetch an option list and ask the user to pick.

    A required field with no usable producer does NOT raise: ``_resolve``
    leaves a ``${user_input.<field>}`` placeholder in the step's args.
    """
    if target not in self._tools:
        raise PlanSynthesisError(f"target tool not in graph: {target!r}")

    entities = entities or {}
    steps_by_tool: dict[str, _PartialStep] = {}
    visiting: set[str] = set()

    # Resolve recursively; populates steps_by_tool with target at the end
    self._resolve(
        tool_name=target,
        entities=entities,
        steps_by_tool=steps_by_tool,
        visiting=visiting,
        depth=0,
    )

    # Assign topological ids s1..sN by insertion order. ``_resolve`` stores
    # a tool only after its producers, so prerequisites get lower ids.
    ordered_tools = list(steps_by_tool.keys())
    for idx, tool_name in enumerate(ordered_tools, start=1):
        steps_by_tool[tool_name].step_id = f"s{idx}"

    # Replace tool-name bindings with step-id bindings
    # NOTE(review): ``_rewrite_tool_refs`` is defined outside this view —
    # presumably it maps ``${<tool>.<path>}`` to ``${<sN>.<path>}``; confirm.
    final_steps: list[PlanStep] = []
    for tool_name in ordered_tools:
        partial = steps_by_tool[tool_name]
        args = {k: self._rewrite_tool_refs(v, steps_by_tool) for k, v in partial.args.items()}
        final_steps.append(
            PlanStep(
                id=partial.step_id,
                tool=partial.tool,
                args=args,
                rationale=partial.rationale,
                response_root_keys=self._response_root_keys(tool_name),
            )
        )

    target_step_id = steps_by_tool[target].step_id

    # Collect user_input slots so the runner can prompt the caller in
    # advance and the UI can render a single popup with all missing
    # fields, instead of one popup per step. Each entry records which
    # step and tool need which field; the field's semantic_tag is not
    # recorded here.
    user_input_slots: list[dict[str, Any]] = []
    for step in final_steps:
        for arg_name, arg_val in (step.args or {}).items():
            if isinstance(arg_val, str) and arg_val.startswith("${user_input."):
                user_input_slots.append(
                    {
                        "step_id": step.id,
                        "tool": step.tool,
                        "field_name": arg_name,
                    }
                )

    return Plan(
        id=str(uuid.uuid4()),
        goal=goal or f"Execute {target}",
        steps=final_steps,
        # The PlanRunner adapter exposes the response body as the root of
        # the step context, so ``${sN}`` alone captures the whole response
        # dict (the older ``${sN.body}`` form is a leftover from when the
        # adapter passed ``{status, body}`` through unchanged).
        output_binding=f"${{{target_step_id}}}",
        created_at=datetime.now(timezone.utc).isoformat(),
        metadata={
            "target": target,
            "entities": dict(entities),
            "synthesized_by": "PathSynthesizer/v1",
            "user_input_slots": user_input_slots,
        },
    )
+ """ + if depth > self._max_depth: + raise MaxDepthExceededError( + f"synthesis exceeded max_depth={self._max_depth} at {tool_name!r}" + ) + if tool_name in steps_by_tool: + return tool_name + if tool_name in visiting: + raise CyclicDependencyError( + f"cycle detected at {tool_name!r} (chain: {sorted(visiting)!r})" + ) + visiting.add(tool_name) + + tool = self._tools.get(tool_name) or {} + metadata = tool.get("metadata") or {} + consumes = metadata.get("consumes") or [] + + args: dict[str, Any] = {} + rationales: list[str] = [] + + for consume in consumes: + field_name = consume.get("field_name") or "" + semantic = consume.get("semantic_tag") or "" + kind = str(consume.get("kind") or "data").strip().lower() + is_required = bool(consume.get("required")) + + # 1. Entity match (user-supplied) — applies to both data and + # context, both required and optional. The user's input + # always wins. + entity_val = self._match_entity(entities, semantic, field_name) + if entity_val is not None: + args[field_name] = entity_val + continue + + # 2. Context-kind: try collection-level defaults regardless of + # required flag. Context is never chained — ambient config + # must come from entity or operator-registered default + # (chaining through e.g. getSiteInfo would inflate the plan + # with steps that don't produce business value). + if kind == "context": + default = self._lookup_context_default(semantic, field_name) + if default is not None: + args[field_name] = default + continue + + # 3. Optional data field: leave out. The caller's backend will + # apply its own defaults — synthesizer has no business + # inventing values for optional business inputs. + if not is_required: + continue + + # 4. Enum-field popup priority. 
If the operator registered an + # enum mapping for this field, it's the kind of value the + # user should pick from a popup — NOT something to chain + # through a producer (which often drags in semantically + # unrelated tools just because their response happens to + # contain a code by the same name). Surface + # UnsatisfiableFieldError so the caller can yield a + # question.required event instead. + if field_name in self._enum_field_names: + raise UnsatisfiableFieldError( + f"tool {tool_name!r} requires {field_name!r} " + f"(semantic={semantic!r}) — enum field, expects user " + f"selection (no producer chain attempted)" + ) + + # 5. Required data field → rank candidate producers and pick the best. + # Pass ``visiting`` as ``excluded`` so cycle-prone candidates are + # skipped here (Cycle policy A). The chain reroutes around the + # cycle when an alternative producer exists; only when none + # remains does the caller fall through to user-input slot (F2). + producer = self._find_producer( + semantic=semantic, + field_name=field_name, + target_tool=tool_name, + entities=entities, + excluded=visiting, + ) + if producer is None: + # F2 + Cycle policy B: gracefully surface the field as a + # ``${user_input.}`` placeholder rather than aborting + # the entire plan. The runner detects the placeholder at + # step-start and asks the user (or its surrounding agent) + # to supply the value. The plan's metadata records every + # such slot so the caller can pre-collect inputs. + placeholder = f"${{user_input.{field_name}}}" + args[field_name] = placeholder + rationales.append(f"{field_name} ← user_input") + continue + + # 5a. Dynamic-option popup priority. Detect "read-detail then + # pick one" patterns where the producer is a single-hop + # read of a product/record whose response carries a + # list of options the user must choose from (e.g. + # ``getProductInfo`` exposes ``$.itmInfo[*].itmNo`` — + # the available SKUs). 
In that case, defer to the caller + # to fetch options and pop up a question, instead of + # chaining the producer in and binding ``[0]`` blindly. + # + # Constrained to ``canonical_action='read'`` because + # ``search`` producers (e.g. seltSearchProduct → goodsNo) + # are exactly the chain idiom we DO want — pick the first + # hit and continue. Without this constraint legitimate + # search→detail chains turn into popups. + producer_action = self._producer_action(producer) + if producer_action == "read" and self._is_producer_simple_callable(producer, entities): + opt_path = self._produces_path_for( + producer, + semantic=semantic, + field_name=field_name, + ) + if opt_path and "[*]" in opt_path: + raise DynamicOptionRequired( + f"tool {tool_name!r} requires {field_name!r} " + f"(semantic={semantic!r}) — dynamic option from " + f"{producer!r}; caller should fetch options and " + f"prompt the user", + field_name=field_name, + semantic_tag=semantic, + producer_name=producer, + options_path=opt_path, + label_field_hints=self._label_hints_for(producer, opt_path), + ) + + # Recurse into the producer first so step_id ordering is correct. + # Cycle policy B + F2: if the producer's own chain is too deep + # or cycles back, we don't abort the whole plan — we drop this + # producer and fall back to a user_input slot for the field. + # This keeps the surface tool callable when the prerequisite + # chain extends beyond what the synthesiser can flatten. + try: + self._resolve( + tool_name=producer, + entities=entities, + steps_by_tool=steps_by_tool, + visiting=visiting, + depth=depth + 1, + ) + except (MaxDepthExceededError, CyclicDependencyError) as exc: + placeholder = f"${{user_input.{field_name}}}" + args[field_name] = placeholder + rationales.append( + f"{field_name} ← user_input (chain unflattenable: {exc.__class__.__name__})" + ) + continue + + # Build a placeholder binding — will be rewritten after step_ids + # are assigned. 
Format: ${.} + prod_path = self._producer_jsonpath(producer, semantic, field_name) + args[field_name] = f"${{{producer}.{prod_path}}}" + rationales.append(f"{field_name} ← {producer} ({prod_path})") + + steps_by_tool[tool_name] = _PartialStep( + tool=tool_name, + args=args, + rationale="; ".join(rationales) if rationales else "", + ) + visiting.discard(tool_name) + return tool_name + + # ------------------------------------------------------------------ + # helpers + # ------------------------------------------------------------------ + + def _build_producer_indexes(self) -> None: + """Index which tools produce which semantic / field across the graph. + + Echo-back filter: a tool that takes ``ordNo`` as input and echoes it + back in its response is NOT a producer of ``ordNo`` in any useful + sense — it's just relaying the value the caller already supplied. We + skip those entries so the index reflects tools that actually CREATE + or DISCOVER the value (``listOrders``, ``createOrder``, + ``searchOrders`` etc.) rather than every endpoint that happens to + round-trip the field. + + Same rule applied to ``semantic_tag`` for parity with the LLM Pass 2 + enrichment path. Empty consumes (no input fields) → never echo, so + all produces are real producers. + """ + for name, tool in self._tools.items(): + meta = tool.get("metadata") or {} + consumed_fields: set[str] = set() + consumed_semantics: set[str] = set() + for c in meta.get("consumes") or []: + if not isinstance(c, dict): + continue + cf = c.get("field_name") or "" + cs = c.get("semantic_tag") or "" + if cf: + consumed_fields.add(cf) + if cs: + consumed_semantics.add(cs) + + for produce in meta.get("produces") or []: + sem = produce.get("semantic_tag") or "" + fname = produce.get("field_name") or "" + # Skip pure echo-back: the field came in, gets relayed out. 
+ if fname and fname in consumed_fields: + continue + if sem and sem in consumed_semantics: + continue + if sem: + self._producers_by_semantic.setdefault(sem, []).append(name) + if fname: + self._producers_by_field.setdefault(fname, []).append(name) + loose = _normalize_field_name(fname) + if loose and loose != fname: + self._producers_by_loose_field.setdefault(loose, []).append(name) + + # ---- graphify edge indexing & traversal --------------------------------- + + _WORKFLOW_RELATIONS: frozenset[str] = frozenset({"requires", "precedes", "complementary"}) + _CONFIDENCE_RANK: dict[str, int] = { + "EXTRACTED": 0, + "INFERRED": 1, + "AMBIGUOUS": 2, + } + + def _index_workflow_edges(self, graph: dict[str, Any]) -> None: + """Bucket the graphify graph's outgoing workflow edges by source tool. + + Accepts the same graph dict the rest of the class consumes — looks + for ``graph.graph.edges`` (DictGraph.to_dict() output) or the + legacy NetworkX-style ``graph.graph.links`` if present. Edges + without a confidence label are kept (treated as fallback) so this + also works on graphs built before the graphify ingest landed. + """ + graph_inner = graph.get("graph") or {} + edges = graph_inner.get("edges") or graph_inner.get("links") or [] + for e in edges: + if not isinstance(e, dict): + continue + src = e.get("source") or e.get("from") + tgt = e.get("target") or e.get("to") + rel = e.get("relation") + rel_str = ( + rel.value if hasattr(rel, "value") else str(rel) if rel is not None else "" + ).lower() + if not src or not tgt or rel_str not in self._WORKFLOW_RELATIONS: + continue + self._workflow_edges_out.setdefault(src, []).append( + { + "target": tgt, + "relation": rel_str, + "confidence": e.get("confidence"), + "conf_score": float(e.get("conf_score") or 0.0), + "evidence": e.get("evidence") or "", + } + ) + + # Producer-signal score weights. Higher = stronger signal that this + # candidate genuinely produces the value the target needs. 
Weights chosen + # so combined signals (e.g. graph EXTRACTED + field exact = 90) beat any + # single signal, and graph EXTRACTED alone (50) beats field exact alone + # (40) — Path/$ref/CRUD-derived edges are more reliable than coincidental + # field-name overlap. ``semantic_exact`` requires LLM Pass 2 enrichment; + # when present it's the strongest signal we have. + _SIGNAL_WEIGHTS: dict[str, int] = { + "semantic_exact": 100, + "graph_EXTRACTED": 50, + "field_exact": 40, + "graph_INFERRED": 20, + "field_loose": 10, + "graph_AMBIGUOUS": 5, + } + + def _find_producer( + self, + *, + semantic: str, + field_name: str, + target_tool: str, + entities: dict[str, Any], + excluded: set[str] | None = None, + ) -> str | None: + """Pick the best producer using combined graph + schema signals. + + Producer matching is treated as the intersection of two first-class + signals (NOT a fallback chain): + (a) Schema match — semantic_tag / field_name on ``produces``. + (b) Graph traversal — outgoing REQUIRES / PRECEDES / COMPLEMENTARY + edges from ``target_tool``, ranked by ``confidence``. + + A candidate accumulates one entry per matching signal. The signal + weights live in ``_SIGNAL_WEIGHTS`` and combine additively, so a + candidate matched by both graph EXTRACTED and field_exact (90) wins + over one matched only by field_exact (40). Tie-break uses the + existing Pass-2 ``_rank_producers`` (entity affinity, pair hint, + canonical action), and ``_is_chain_eligible`` still gates the final + pick — sparse Pass-2 metadata pass-throughs apply unchanged. + + ``excluded`` is the set of tools currently being resolved (the + caller's ``visiting`` set). Producer candidates in this set would + re-enter recursion and trigger ``CyclicDependencyError`` — we skip + them here so the second-best candidate gets a chance instead. 
This + is the "skip-this-branch" cycle policy: the chain reroutes around + the cycle when alternative producers exist; only when all candidates + cycle does the caller fall back to user-input slot handling. + + Returns the highest-scoring eligible candidate, or None if no + candidate has any signal (or all signals point to ``excluded`` tools). + """ + excluded = excluded or set() + candidate_signals: dict[str, set[str]] = {} + + def _record(name: str, signal: str) -> None: + if name and name != target_tool: + candidate_signals.setdefault(name, set()).add(signal) + + # (a) schema-side: exact semantic / field_name (echo-back already + # filtered when the index was built). + if semantic: + for n in self._producers_by_semantic.get(semantic, []): + _record(n, "semantic_exact") + if field_name: + for n in self._producers_by_field.get(field_name, []): + _record(n, "field_exact") + + # (a') schema-side: loose field match — separator/case folded. + # ``ordNo`` won't match ``orderNo`` (different roots) but will match + # ``ord_no`` / ``ORDNO``. Cross-naming-convention safety net. + if field_name: + loose = _normalize_field_name(field_name) + if loose: + for n in self._producers_by_loose_field.get(loose, []): + if n in candidate_signals: + continue # already had a stronger signal + _record(n, "field_loose") + + # (b) graph-side: walk outgoing workflow edges, verify each + # candidate actually has a matching produces entry. 
+ edges = self._workflow_edges_out.get(target_tool) or [] + loose_target = _normalize_field_name(field_name) if field_name else "" + for e in edges: + cand = e.get("target") + if not cand or cand == target_tool: + continue + tool = self._tools.get(cand) + if not tool: + continue + cand_consumes_fields = { + (c or {}).get("field_name", "") + for c in (tool.get("metadata") or {}).get("consumes") or [] + if isinstance(c, dict) + } + cand_consumes_semantics = { + (c or {}).get("semantic_tag", "") + for c in (tool.get("metadata") or {}).get("consumes") or [] + if isinstance(c, dict) + } + for p in (tool.get("metadata") or {}).get("produces") or []: + if not isinstance(p, dict): + continue + p_sem = p.get("semantic_tag") or "" + p_fname = p.get("field_name") or "" + # Echo-back guard for the candidate itself — same rule as + # _build_producer_indexes, applied here so graph-edge + # discoveries don't sneak in a relayed value. + if p_fname and p_fname in cand_consumes_fields: + continue + if p_sem and p_sem in cand_consumes_semantics: + continue + + matched = False + if semantic and p_sem == semantic: + matched = True + elif field_name and p_fname == field_name: + matched = True + elif loose_target and _normalize_field_name(p_fname) == loose_target: + matched = True + if not matched: + continue + + conf = e.get("confidence") or "AMBIGUOUS" + _record(cand, f"graph_{conf}") + break # one signal per candidate per edge target is enough + + if not candidate_signals: + return None + + # Score and pre-rank by signal strength (stable for equal scores). + def _score(signals: set[str]) -> int: + return sum(self._SIGNAL_WEIGHTS.get(s, 0) for s in signals) + + scored = sorted( + candidate_signals.items(), + key=lambda item: (-_score(item[1]), item[0]), + ) + sorted_names = [n for n, _ in scored] + + # Pass 2 / chain-eligibility gate — pass-through when ai_metadata + # is sparse, identical behaviour to the previous implementation. 
+ # Cycle filter: skip candidates currently in the resolution stack so + # the synthesiser reroutes around the cycle instead of raising. + ranked = self._rank_producers( + sorted_names, + target_tool=target_tool, + entities=entities, + ) + for cand in ranked: + if cand in excluded: + continue + if self._is_chain_eligible(cand, target_tool=target_tool): + return cand + return None + + def _producer_action(self, producer_name: str) -> str: + """Return the producer's ``ai_metadata.canonical_action`` (lowercased, + empty string if missing). Used to gate dynamic-option popups to + ``read`` producers — search producers are the chain idiom (pick + first hit), not popup candidates. + """ + tool = self._tools.get(producer_name) or {} + ai = (tool.get("metadata") or {}).get("ai_metadata") or {} + return str(ai.get("canonical_action") or "").strip().lower() + + def _is_producer_simple_callable( + self, + producer_name: str, + entities: dict[str, Any], + ) -> bool: + """True iff the producer can be called with only the user's entities + and the collection's context_defaults — i.e. no further producer + chain needed to source its inputs. + + Used to detect "single-hop dynamic option" cases: instead of + chaining the producer into the plan, the caller fetches it once + and pops up the resulting list to the user (e.g. itmNo from + getProductInfo when the user already supplied goodsNo). 
+ """ + producer = self._tools.get(producer_name) or {} + for c in (producer.get("metadata") or {}).get("consumes") or []: + if not isinstance(c, dict) or not c.get("required"): + continue + field = c.get("field_name") or "" + sem = c.get("semantic_tag") or "" + kind = str(c.get("kind") or "data").strip().lower() + if self._match_entity(entities, sem, field) is not None: + continue + if kind == "context" and self._lookup_context_default(sem, field) is not None: + continue + return False + return True + + def _produces_path_for( + self, + producer_name: str, + *, + semantic: str, + field_name: str, + ) -> str: + """Find the producer's json_path that emits the given field — the + location of the option array in the response (e.g. + ``$.itmInfo[*].itmNo``). Empty string if no match. + """ + producer = self._tools.get(producer_name) or {} + for p in (producer.get("metadata") or {}).get("produces") or []: + if not isinstance(p, dict): + continue + if semantic and p.get("semantic_tag") == semantic: + return str(p.get("json_path") or "") + # Fallback: match by field_name when semantic missing/mismatched + for p in (producer.get("metadata") or {}).get("produces") or []: + if not isinstance(p, dict): + continue + if field_name and p.get("field_name") == field_name: + return str(p.get("json_path") or "") + return "" + + def _label_hints_for( + self, + producer_name: str, + options_path: str, + ) -> list[str]: + """Return field names that look like human labels living next to + the option-code field in the producer's response. Heuristic: same + array prefix, name ending in ``Nm`` / ``Name`` / ``Label``. + + ``options_path`` looks like ``$.itmInfo[*].itmNo``; we walk the + producer's other produces entries that share the prefix + ``$.itmInfo[*].`` and pick the ones whose field_name suggests a + label. + """ + producer = self._tools.get(producer_name) or {} + # Compute the array prefix: everything up to the last "." + if "." 
not in options_path: + return [] + prefix = options_path.rsplit(".", 1)[0] + "." + hints: list[str] = [] + seen: set[str] = set() + for p in (producer.get("metadata") or {}).get("produces") or []: + if not isinstance(p, dict): + continue + jp = str(p.get("json_path") or "") + if not jp.startswith(prefix): + continue + field = str(p.get("field_name") or "") + if not field or field in seen: + continue + lower = field.lower() + if lower.endswith("nm") or lower.endswith("name") or lower.endswith("label"): + hints.append(field) + seen.add(field) + return hints + + def _is_chain_eligible(self, producer_name: str, *, target_tool: str) -> bool: + """Return True if ``producer_name`` may be added to the prerequisite + chain for ``target_tool``. + + Two signals from Pass 2 ``ai_metadata`` decide: + + 1. ``canonical_action`` ∈ {search, read} + create/update/delete/action are not prerequisite material — + they perform side effects, never just data lookup. + 2. ``primary_resource`` is in the target's domain set + (target's own resource + the prefix of every consume's + semantic_tag, e.g. ``product_id`` ⇒ ``product``). + + Either signal absent (sparse ``ai_metadata``) ⇒ pass through. + Operators that haven't enriched the graph yet keep the previous + behaviour; once enriched, the policy starts filtering. Also + reverts to pass-through if the target itself has no ``ai_metadata``, + because the "domain set" can't be computed. 
+ """ + producer = self._tools.get(producer_name) or {} + p_meta = (producer.get("metadata") or {}).get("ai_metadata") or {} + p_action = str(p_meta.get("canonical_action") or "").strip().lower() + if not p_action: + return True + if p_action not in ("search", "read"): + return False + + p_resource = str(p_meta.get("primary_resource") or "").strip().lower() + if not p_resource: + return True + + target = self._tools.get(target_tool) or {} + t_meta_full = target.get("metadata") or {} + t_meta = t_meta_full.get("ai_metadata") or {} + t_resource = str(t_meta.get("primary_resource") or "").strip().lower() + + related: set[str] = set() + if t_resource: + related.add(t_resource) + if "_" in t_resource: + related.add(t_resource.split("_", 1)[0]) + + for c in t_meta_full.get("consumes") or []: + if not isinstance(c, dict): + continue + sem = str(c.get("semantic_tag") or "").strip().lower() + if not sem: + continue + related.add(sem.split("_", 1)[0] if "_" in sem else sem) + + if not related: + return True + + p_prefix = p_resource.split("_", 1)[0] if "_" in p_resource else p_resource + return p_resource in related or p_prefix in related + + def _rank_producers( + self, + candidates: list[str], + *, + target_tool: str, + entities: dict[str, Any], + ) -> list[str]: + """Rank candidates by Pass 2 metadata signals. + + Order: + 1. Entity affinity — producer consumes a field the user already + supplied (so the chain actually uses user input). + 2. Pair hint — target's ``pairs_well_with`` names this producer. + 3. Action preference — ``search`` > ``read`` > others as a + prerequisite role. + Ties fall back to insertion order (stable sort). + + No hardcoded names / regexes. Every signal is a per-tool Pass 2 + field the LLM filled at ingest time. 
+ """ + target_meta = (self._tools.get(target_tool) or {}).get("metadata") or {} + target_ai = target_meta.get("ai_metadata") or {} + pair_names = { + str(p.get("tool") or "").strip() + for p in (target_ai.get("pairs_well_with") or []) + if isinstance(p, dict) + } + pair_names.discard("") + entity_keys = {str(k) for k in (entities or {}).keys()} + + action_score = {"search": 3, "read": 2, "action": 1} + + def _score(name: str) -> tuple[int, int, int]: + tool = self._tools.get(name) or {} + meta = tool.get("metadata") or {} + ai = meta.get("ai_metadata") or {} + + affinity = 0 + for c in meta.get("consumes") or []: + tag = c.get("semantic_tag") or "" + fname = c.get("field_name") or "" + if (tag and tag in entity_keys) or (fname and fname in entity_keys): + affinity += 1 + + pair_bonus = 1 if name in pair_names else 0 + action = str(ai.get("canonical_action") or "").strip().lower() + return (affinity, pair_bonus, action_score.get(action, 0)) + + # Python's sort is stable; higher score wins, ties keep insertion order. + return sorted(candidates, key=_score, reverse=True) + + def _response_root_keys(self, tool_name: str) -> list[str]: + """Top-level keys of the tool's response, taken from ``produces``. + + Each ``produces[].json_path`` (e.g. ``$.searchDataList[*].goodsNo``) + contributes its first dotted segment (``searchDataList``). Used by + PlanRunner as a schema hint for envelope detection — when the + actual response is missing every hint at root but a single nested + dict contains them, the wrapper is peeled away. 
+ """ + tool = self._tools.get(tool_name) or {} + produces = (tool.get("metadata") or {}).get("produces") or [] + out: list[str] = [] + seen: set[str] = set() + for p in produces: + raw = p.get("json_path") or "" + head = _jsonpath_head(raw) + if head and head not in seen: + out.append(head) + seen.add(head) + return out + + def _producer_jsonpath( + self, + producer: str, + semantic: str, + field_name: str, + ) -> str: + """Return a dotted path under the producer's response that yields + the desired field. Converts ``$.a.b[*].c`` → ``a.b[0].c`` (v1 picks + the first array element when a wildcard is present). + + Falls back to ``body`` + field_name if we can't locate the produces. + """ + tool = self._tools.get(producer) or {} + produces = (tool.get("metadata") or {}).get("produces") or [] + match = None + if semantic: + match = next( + (p for p in produces if p.get("semantic_tag") == semantic), + None, + ) + if match is None and field_name: + match = next( + (p for p in produces if p.get("field_name") == field_name), + None, + ) + if match is None: + return f"body.{field_name}" if field_name else "body" + + raw = match.get("json_path") or "" + return _normalize_jsonpath_for_binding(raw) + + def _lookup_context_default( + self, + semantic: str, + field_name: str, + ) -> Any | None: + """Pick a registered context default for a consume field. + + Mirrors ``_match_entity`` lookup order — semantic tag first (Pass 2 + canonical id), field name second (Pass 1 raw). Returns ``None`` if + the operator hasn't registered a value for either key. 
+ """ + if not self._context_defaults: + return None + if semantic and semantic in self._context_defaults: + return self._context_defaults[semantic] + if field_name and field_name in self._context_defaults: + return self._context_defaults[field_name] + return None + + def _match_entity( + self, + entities: dict[str, Any], + semantic: str, + field_name: str, + ) -> Any | None: + """Look up user-supplied entity by semantic tag or field name.""" + if semantic and semantic in entities: + return entities[semantic] + if field_name and field_name in entities: + return entities[field_name] + return None + + def _rewrite_tool_refs( + self, + value: Any, + steps_by_tool: dict[str, _PartialStep], + ) -> Any: + """Recursively rewrite ``${.}`` → ``${sN.}``.""" + if isinstance(value, dict): + return {k: self._rewrite_tool_refs(v, steps_by_tool) for k, v in value.items()} + if isinstance(value, list): + return [self._rewrite_tool_refs(v, steps_by_tool) for v in value] + if not isinstance(value, str): + return value + # Only rewrite full-string bindings that we inserted. Entities + # supplied by the caller are left alone (no ${...} wrapping). + if not (value.startswith("${") and value.endswith("}")): + return value + inner = value[2:-1] + head, _, tail = inner.partition(".") + if head in steps_by_tool: + step_id = steps_by_tool[head].step_id + rest = f".{tail}" if tail else "" + return f"${{{step_id}{rest}}}" + return value + + +def _jsonpath_head(raw: str) -> str: + """First dotted segment of a JSONPath, stripping ``$``, ``.`` and ``[…]``. + + ``$.payload.searchDataList[*].goodsNo`` → ``"payload"``. + ``$.totalCount`` → ``"totalCount"``. + Returns ``""`` for empty / unparseable input. + """ + if not raw: + return "" + path = raw[1:] if raw.startswith("$") else raw + if path.startswith("."): + path = path[1:] + # Cut at the first separator (``.`` or ``[``). 
+ for i, ch in enumerate(path): + if ch in ".[": + return path[:i] + return path + + +def _normalize_jsonpath_for_binding(raw: str) -> str: + """``$.body.goods[*].goodsNo`` → ``body.goods[0].goodsNo``. + + v1 always picks index 0 for arrays. Fan-out is v2 (design §11.1). + """ + if not raw: + return "" + path = raw + if path.startswith("$"): + path = path[1:] + if path.startswith("."): + path = path[1:] + return path.replace("[*]", "[0]") + + +__all__ = [ + "PathSynthesizer", + "PlanSynthesisError", + "UnsatisfiableFieldError", + "CyclicDependencyError", + "MaxDepthExceededError", + "DynamicOptionRequired", +] diff --git a/graph_tool_call/serialization.py b/graph_tool_call/serialization.py index cfa56ea..81e56b6 100644 --- a/graph_tool_call/serialization.py +++ b/graph_tool_call/serialization.py @@ -52,7 +52,10 @@ def save_graph( path = Path(path) try: path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(data, indent=2, ensure_ascii=False, default=str)) + path.write_text( + json.dumps(data, indent=2, ensure_ascii=False, default=str), + encoding="utf-8", + ) except PermissionError: msg = f"Permission denied: {path}. Check directory permissions." 
raise PermissionError(msg) from None diff --git a/graph_tool_call/tool_graph.py b/graph_tool_call/tool_graph.py index e784ded..e415368 100644 --- a/graph_tool_call/tool_graph.py +++ b/graph_tool_call/tool_graph.py @@ -16,7 +16,7 @@ from graph_tool_call.core.protocol import GraphEngine from graph_tool_call.core.tool import ToolSchema, normalize_tool, parse_tool from graph_tool_call.ontology.builder import OntologyBuilder -from graph_tool_call.ontology.schema import RelationType +from graph_tool_call.ontology.schema import Confidence, RelationType def _encode_spec_url(base: str, raw_url: str) -> str: @@ -289,6 +289,9 @@ def ingest_openapi( min_confidence: float = 0.7, allow_private_hosts: bool = False, max_response_bytes: int = 5_000_000, + source_label: str | None = None, + on_conflict: str = "overwrite", + relink_existing: bool = True, ) -> list[ToolSchema]: """Ingest an OpenAPI/Swagger spec, register tools, and auto-detect relations. @@ -304,11 +307,29 @@ def ingest_openapi( If True (default), run automatic dependency detection. min_confidence: Minimum confidence threshold for detected relations. + source_label: + Optional origin tag stored on each tool's ``metadata["source_label"]``. + Enables :meth:`list_sources` / :meth:`remove_source` and is used + to derive the namespace prefix when ``on_conflict="prefix"``. + on_conflict: + How to handle a name collision with an already-registered tool. + + - ``"overwrite"`` (default): replace the existing tool. + - ``"prefix"``: rename incoming as ``{source_label}.{name}`` (or + ``incoming.{name}`` if no label provided). Subsequent collisions + after prefixing fall back to ``overwrite``. + - ``"skip"``: keep the existing tool, drop the incoming one. + - ``"error"``: raise ``ValueError`` on the first collision. + relink_existing: + When True (default), after adding the new batch, dependency + detection is re-run across **new ↔ existing** tools so that + cross-source edges are discovered. 
Has no effect when this is + the first ingest or ``detect_dependencies=False``. Returns ------- list[ToolSchema] - The ingested tool schemas. + The ingested tool schemas (with any prefix-rename applied). """ from graph_tool_call.ingest.openapi import ingest_openapi @@ -319,13 +340,16 @@ def ingest_openapi( allow_private_hosts=allow_private_hosts, max_response_bytes=max_response_bytes, ) - self._register_tools_batch( + registered = self._register_tools_batch( tools, detect_dependencies=detect_dependencies, min_confidence=min_confidence, spec=spec.raw, + source_label=source_label, + on_conflict=on_conflict, + relink_existing=relink_existing, ) - return tools + return registered def ingest_mcp_tools( self, @@ -464,9 +488,27 @@ def add_relation( target: str, relation: str | RelationType, weight: float = 1.0, + *, + confidence: str | Confidence | None = None, + conf_score: float | None = None, + layer: int | None = None, + evidence: str | None = None, ) -> None: - """Add a relation between two tools.""" - self._builder.add_relation(source, target, relation, weight) + """Add a relation between two tools. + + Optional graphify-style attrs are forwarded to ``OntologyBuilder``; + see ``OntologyBuilder.add_relation`` for semantics. + """ + self._builder.add_relation( + source, + target, + relation, + weight, + confidence=confidence, + conf_score=conf_score, + layer=layer, + evidence=evidence, + ) self._invalidate_retrieval() def add_domain(self, domain: str, description: str = "") -> None: @@ -923,33 +965,92 @@ def _register_tools_batch( detect_dependencies: bool = True, min_confidence: float = 0.7, spec: dict | None = None, - ) -> None: + source_label: str | None = None, + on_conflict: str = "overwrite", + relink_existing: bool = True, + ) -> list[ToolSchema]: """Register tools, assign categories, and detect dependencies. Shared logic for ingest_openapi, ingest_mcp_tools, and ingest_functions. 
+ Returns the list of tools that were actually registered (after any + conflict-driven rename or skip). """ + had_existing = bool(self._tools) + registered: list[ToolSchema] = [] categories_seen: set[str] = set() + for tool in tools: - self._tools[tool.name] = tool - self._builder.add_tool(tool) - if tool.domain: - if tool.domain not in categories_seen: - if not self._graph.has_node(tool.domain): - self._builder.add_category(tool.domain) - categories_seen.add(tool.domain) - self._builder.assign_category(tool.name, tool.domain) - - if detect_dependencies and len(tools) >= 2: + resolved = self._resolve_conflict(tool, on_conflict, source_label) + if resolved is None: + continue + if source_label: + resolved.metadata["source_label"] = source_label + self._tools[resolved.name] = resolved + self._builder.add_tool(resolved) + if resolved.domain: + if resolved.domain not in categories_seen: + if not self._graph.has_node(resolved.domain): + self._builder.add_category(resolved.domain) + categories_seen.add(resolved.domain) + self._builder.assign_category(resolved.name, resolved.domain) + registered.append(resolved) + + if detect_dependencies and registered: from graph_tool_call.analyze.dependency import detect_dependencies as _detect - kwargs: dict = {"min_confidence": min_confidence} - if spec: - kwargs["spec"] = spec - relations = _detect(tools, **kwargs) - for rel in relations: - self._builder.add_relation(rel.source, rel.target, rel.relation_type) + # Scope of detection: + # - First ingest, or relink disabled → only the new batch. + # - Incremental + relink_existing → union of new + all existing, + # so cross-source edges (e.g. order.* ↔ claim.*) are discovered. 
+ if had_existing and relink_existing and len(self._tools) >= 2: + scope = list(self._tools.values()) + else: + scope = registered + + if len(scope) >= 2: + kwargs: dict = {"min_confidence": min_confidence} + if spec: + kwargs["spec"] = spec + relations = _detect(scope, **kwargs) + for rel in relations: + self._builder.add_relation(rel.source, rel.target, rel.relation_type) self._invalidate_retrieval() + return registered + + def _resolve_conflict( + self, + tool: ToolSchema, + on_conflict: str, + source_label: str | None, + ) -> ToolSchema | None: + """Apply the *on_conflict* policy. Returns the tool to register, or None to skip. + + Mutates ``tool.name`` when prefix-renaming. + """ + if tool.name not in self._tools: + return tool + + if on_conflict == "overwrite": + return tool + if on_conflict == "skip": + return None + if on_conflict == "error": + raise ValueError( + f"Tool '{tool.name}' already exists " + f"(on_conflict='error', incoming source_label={source_label!r})" + ) + if on_conflict == "prefix": + prefix = source_label or "incoming" + new_name = f"{prefix}.{tool.name}" + # If the prefixed name also collides, fall through to overwrite — + # the caller has already chosen prefix as the deconfliction strategy. 
+ tool.name = new_name + return tool + raise ValueError( + f"Unknown on_conflict policy: {on_conflict!r} " + "(expected 'overwrite' | 'prefix' | 'skip' | 'error')" + ) # --- from_url --- @@ -1167,6 +1268,60 @@ def apply_conflicts(self, conflicts: list | None = None, *, min_confidence: floa self._invalidate_retrieval() return added + # --- source management (incremental ingest) --- + + def list_sources(self) -> list[str]: + """Return distinct ``source_label`` values across all registered tools.""" + seen: dict[str, None] = {} + for tool in self._tools.values(): + label = tool.metadata.get("source_label") if tool.metadata else None + if label and label not in seen: + seen[label] = None + return list(seen.keys()) + + def tools_by_source(self, source_label: str) -> list[ToolSchema]: + """Return all tools tagged with the given ``source_label``.""" + return [ + t + for t in self._tools.values() + if t.metadata and t.metadata.get("source_label") == source_label + ] + + def remove_source(self, source_label: str) -> int: + """Remove every tool tagged with *source_label* and its incident edges. + + Returns the number of tools removed. + """ + victims = [t.name for t in self.tools_by_source(source_label)] + for name in victims: + self._tools.pop(name, None) + if self._graph.has_node(name): + self._graph.remove_node(name) + if victims: + self._invalidate_retrieval() + return len(victims) + + def relink(self, *, min_confidence: float = 0.7) -> int: + """Re-run dependency detection across all currently registered tools. + + New relations are added to the existing graph. Existing edges are + preserved (the underlying graph engine deduplicates edges by + ``(source, target, relation)``). + + Returns the number of detected relations applied (including + previously known ones — use this as an upper bound, not a delta). 
+ """ + if len(self._tools) < 2: + return 0 + from graph_tool_call.analyze.dependency import detect_dependencies as _detect + + relations = _detect(list(self._tools.values()), min_confidence=min_confidence) + for rel in relations: + self._builder.add_relation(rel.source, rel.target, rel.relation_type) + if relations: + self._invalidate_retrieval() + return len(relations) + def analyze( self, *, @@ -1397,17 +1552,28 @@ def search_tools(query: str, top_k: int | None = None) -> str: """Search available tools by natural language query. Use this FIRST to find which tools are available for the task. - Returns tool names, descriptions, and required parameters. + Returns tool names, descriptions, required parameters, and + **dependency hints** (``prerequisites`` for tools that must be + called first, ``relations`` for tools used together or in order). + + Planning rule: + - Pick the single tool that best matches the user's goal. + - If its ``prerequisites`` are non-empty, call those first and + feed their results into the target tool's arguments. + - ``relations`` with type=precedes/requires imply call order. Args: query: Natural language search query (e.g. "add numbers", "get weather") top_k: Max number of results (optional) """ k = top_k if top_k is not None else default_top_k - results = graph_ref.retrieve(query, top_k=k) + # retrieve_with_scores 를 써야 _enrich_relations 가 채운 relations/prerequisites + # 가 살아남는다. retrieve() 는 ToolSchema 만 반환해 이 정보가 버려짐. + results = graph_ref.retrieve_with_scores(query, top_k=k) matched = [] - for schema in results: + for result in results: + schema = result.tool entry: dict[str, Any] = { "name": schema.name, "description": (schema.description or "")[:200], @@ -1422,6 +1588,22 @@ def search_tools(query: str, top_k: int | None = None) -> str: } for p in schema.parameters ] + # Dependency / ordering hints from graph edges. + # prerequisites: REQUIRES targets not in the result set — LLM + # should call these first. 
relations: edges among result set + # members, carrying human-readable hint strings. + if result.prerequisites: + entry["prerequisites"] = list(result.prerequisites) + if result.relations: + entry["relations"] = [ + { + "target": rel.target, + "type": rel.type, + "direction": rel.direction, + "hint": rel.hint, + } + for rel in result.relations + ] matched.append(entry) output = { @@ -1430,8 +1612,10 @@ def search_tools(query: str, top_k: int | None = None) -> str: "total_tools": len(graph_ref._tools), "tools": matched, "hint": ( - "Use call_tool to execute a tool. " - "Pass tool_name and arguments as a dict matching the parameters above." + "Pick ONE tool matching the user's goal. If its " + "'prerequisites' list is non-empty, call those tools " + "first and use their results to fill the target tool's " + "arguments. Then call_tool the target." ), } return json.dumps(output, ensure_ascii=False, indent=2) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..ff68b3a --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1,5 @@ +"""Internal scripts package — referenced by tests/test_release_script.py. + +Empty marker so Python treats ``scripts/`` as an importable package. +Not included in the published wheel (see ``pyproject.toml`` ``packages``). +""" diff --git a/tests/test_dependency_verbs.py b/tests/test_dependency_verbs.py new file mode 100644 index 0000000..756e8a4 --- /dev/null +++ b/tests/test_dependency_verbs.py @@ -0,0 +1,45 @@ +"""Unit tests for ``graph_tool_call.analyze.dependency`` verb mapping. + +특히 'reg' 약어가 'write' intent 로 분류되는지 확인 (리뷰 🟢 항목). 
def test_annotation_by_verb_covers_register_family():
    """``_ANNOTATION_BY_VERB`` must also cover the register family.

    It is the sibling vocabulary of ``_VERB_TO_INTENT``: tools such as
    ``registerUser`` / ``insertOrder`` / ``regGoodsApprove`` should be
    able to receive an MCP annotation (read_only_hint=False, ...).
    """
    from graph_tool_call.core.tool import _ANNOTATION_BY_VERB

    register_family = ("register", "regist", "reg", "insert")
    for verb in register_family:
        assert verb in _ANNOTATION_BY_VERB, (
            f"verb {verb!r} 누락 — _VERB_TO_INTENT 와 sibling vocabulary 불일치"
        )
        annotation = _ANNOTATION_BY_VERB[verb]
        assert annotation.read_only_hint is False
        assert annotation.destructive_hint is False
+ """ return json.dumps({"document_id": document_id, "action": action, "result": "processed"}) diff --git a/tests/test_io_contract.py b/tests/test_io_contract.py new file mode 100644 index 0000000..865b646 --- /dev/null +++ b/tests/test_io_contract.py @@ -0,0 +1,171 @@ +"""Unit tests for ``graph_tool_call.ingest.io_contract``. + +특히 query/path parameter 의 enum 추출 (리뷰에서 빠뜨려진 부분) 확인. +""" + +from __future__ import annotations + +from graph_tool_call.ingest.io_contract import ( + extract_consumes_for_operation, + extract_leaves, + extract_produces_for_operation, +) + +# ─── extract_leaves ── + + +def test_extract_leaves_object_with_primitives(): + schema = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name"], + } + leaves = extract_leaves(schema, base_path="$") + by_name = {leaf.field_name: leaf for leaf in leaves} + assert by_name["name"].required is True + assert by_name["name"].field_type == "string" + assert by_name["age"].required is False + + +def test_extract_leaves_array_of_objects(): + schema = { + "type": "array", + "items": { + "type": "object", + "properties": {"id": {"type": "string"}}, + }, + } + leaves = extract_leaves(schema, base_path="$.body") + paths = {leaf.json_path for leaf in leaves} + assert any("[*]" in p for p in paths), "array → [*] wildcard 경로" + + +def test_extract_leaves_captures_enum(): + schema = { + "type": "object", + "properties": { + "status": {"type": "string", "enum": ["pending", "shipped"]}, + }, + } + leaves = extract_leaves(schema, base_path="$") + status = next(leaf for leaf in leaves if leaf.field_name == "status") + assert status.enum == ["pending", "shipped"] + + +# ─── consumes — enum 추출 회귀 (리뷰 🟢 항목) ── + + +def test_query_param_enum_extracted_openapi3(): + """OpenAPI 3.x query param 의 schema.enum 이 FieldLeaf.enum 에 들어가야.""" + operation = { + "parameters": [ + { + "name": "sort", + "in": "query", + "required": True, + "schema": {"type": "string", 
def test_query_param_enum_extracted_swagger2():
    """A Swagger 2.0 query parameter's enum (declared at parameter level) is captured too."""
    enum_param = {
        "name": "type",
        "in": "query",
        "required": True,
        "type": "string",
        "enum": ["A", "B", "C"],
    }
    spec_operation = {
        "parameters": [enum_param],
        "responses": {"200": {"description": "OK"}},
    }
    consumed = extract_consumes_for_operation(spec_operation, is_swagger2=True)
    leaf_for_type = next(item for item in consumed if item.field_name == "type")
    assert leaf_for_type.enum == ["A", "B", "C"]
extract_produces_for_operation(operation) + paths = {leaf.json_path for leaf in leaves} + assert "$.data.id" in paths + + +def test_consumes_skips_optional_when_required_only(): + operation = { + "parameters": [ + {"name": "must", "in": "query", "required": True, "schema": {"type": "string"}}, + {"name": "maybe", "in": "query", "required": False, "schema": {"type": "string"}}, + ], + "responses": {"200": {"description": "OK"}}, + } + leaves = extract_consumes_for_operation(operation) + names = {leaf.field_name for leaf in leaves} + assert "must" in names + assert "maybe" not in names diff --git a/tests/test_plan_binding.py b/tests/test_plan_binding.py new file mode 100644 index 0000000..eee0ae9 --- /dev/null +++ b/tests/test_plan_binding.py @@ -0,0 +1,71 @@ +"""Unit tests for ``graph_tool_call.plan.binding``. + +binding placeholder resolution + error 동작. +""" + +from __future__ import annotations + +import pytest + +from graph_tool_call.plan.binding import BindingError, resolve_bindings + + +def test_literal_passes_through(): + assert resolve_bindings("hello", {}) == "hello" + assert resolve_bindings(42, {}) == 42 + assert resolve_bindings(None, {}) is None + + +def test_simple_lookup(): + ctx = {"s1": {"foo": "BAR"}} + assert resolve_bindings("${s1.foo}", ctx) == "BAR" + + +def test_full_step_object(): + ctx = {"s1": {"a": 1, "b": 2}} + assert resolve_bindings("${s1}", ctx) == {"a": 1, "b": 2} + + +def test_array_index(): + ctx = {"s1": {"items": [{"id": "A"}, {"id": "B"}]}} + assert resolve_bindings("${s1.items[0].id}", ctx) == "A" + assert resolve_bindings("${s1.items[1].id}", ctx) == "B" + + +def test_array_negative_index(): + ctx = {"s1": [10, 20, 30]} + assert resolve_bindings("${s1[-1]}", ctx) == 30 + + +def test_unknown_source_raises(): + with pytest.raises(BindingError, match="unknown source"): + resolve_bindings("${ghost.x}", {"s1": {}}) + + +def test_dict_walks_recursively(): + ctx = {"s1": {"v": 9}} + out = resolve_bindings( + {"a": "${s1.v}", "b": 
"literal", "nested": {"c": "${s1.v}"}}, + ctx, + ) + assert out == {"a": 9, "b": "literal", "nested": {"c": 9}} + + +def test_list_walks_recursively(): + ctx = {"s1": {"v": "X"}} + out = resolve_bindings(["${s1.v}", "lit", {"k": "${s1.v}"}], ctx) + assert out == ["X", "lit", {"k": "X"}] + + +def test_oob_index_raises(): + ctx = {"s1": [1, 2]} + with pytest.raises(BindingError, match="out of range"): + resolve_bindings("${s1[5]}", ctx) + + +def test_input_alias_lookup(): + """input / user_input 둘 다 같은 값 가리키도록 caller 가 등록한 케이스.""" + shared = {"keyword": "shoes"} + ctx = {"input": shared, "user_input": shared} + assert resolve_bindings("${input.keyword}", ctx) == "shoes" + assert resolve_bindings("${user_input.keyword}", ctx) == "shoes" diff --git a/tests/test_plan_runner.py b/tests/test_plan_runner.py new file mode 100644 index 0000000..a4cf216 --- /dev/null +++ b/tests/test_plan_runner.py @@ -0,0 +1,217 @@ +"""Unit tests for ``graph_tool_call.plan.runner``. + +리뷰 CRITICAL #1, #2 회귀 방지 + 핵심 동작 cover. +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from graph_tool_call.plan import ( + Plan, + PlanRunner, + PlanStep, +) +from graph_tool_call.plan.runner import ( + PlanAborted, + PlanCompleted, +) + + +def _echo(name: str, args: dict[str, Any]) -> dict[str, Any]: + return {"echoed": args, "tool": name} + + +# ─── CRITICAL #1: input_context 가 ${user_input.x} / ${input.x} 둘 다 resolve ── + + +def test_user_input_alias_resolves(): + """``${user_input.foo}`` 가 input_context["foo"] 로 resolve 되어야 한다. + + 이전엔 synthesizer 가 ${user_input.x} 만들고 runner 가 context["input"] 에만 + 심어서 첫 step 부터 BindingError 로 abort 됐던 케이스. 
+ """ + plan = Plan( + id="t", + goal="g", + steps=[ + PlanStep(id="s1", tool="echo", args={"foo": "${user_input.foo}"}), + ], + output_binding="${s1}", + ) + trace = PlanRunner(_echo).run(plan, input_context={"foo": "BAR"}) + assert trace.success, f"plan should succeed, got: {trace.failed_step}" + assert trace.steps[0].args_resolved == {"foo": "BAR"} + + +def test_input_alias_resolves_too(): + """``${input.foo}`` 도 동일 dict 가리켜야 한다 (backward compat).""" + plan = Plan( + id="t", + goal="g", + steps=[ + PlanStep(id="s1", tool="echo", args={"foo": "${input.foo}"}), + ], + output_binding="${s1}", + ) + trace = PlanRunner(_echo).run(plan, input_context={"foo": "BAR"}) + assert trace.success + assert trace.steps[0].args_resolved == {"foo": "BAR"} + + +def test_mixed_input_user_input_in_same_step(): + """한 step 에 ${input.x} 와 ${user_input.y} 가 섞여 있어도 둘 다 resolve.""" + plan = Plan( + id="t", + goal="g", + steps=[ + PlanStep( + id="s1", + tool="echo", + args={"a": "${input.x}", "b": "${user_input.y}"}, + ), + ], + ) + trace = PlanRunner(_echo).run(plan, input_context={"x": "X", "y": "Y"}) + assert trace.success + assert trace.steps[0].args_resolved == {"a": "X", "b": "Y"} + + +# ─── CRITICAL #2: ExecutionTrace.steps 가 누적 ── + + +def test_execution_trace_accumulates_steps(): + """run() 의 ExecutionTrace.steps 가 빈 리스트가 아니어야 한다. + + 이전엔 runner.py:289 의 pass 때문에 항상 [] 였던 케이스. 
def test_execution_trace_includes_failed_step():
    """On failure, the trace holds the failed step plus every step before it."""

    def flaky(name: str, args: dict[str, Any]) -> dict[str, Any]:
        # Only the tool named "boom" fails; everything else succeeds.
        if name != "boom":
            return {"ok": True}
        raise RuntimeError("simulated")

    step_specs = (("s1", "ok"), ("s2", "boom"), ("s3", "never_called"))
    plan = Plan(
        id="t",
        goal="g",
        steps=[PlanStep(id=step_id, tool=tool) for step_id, tool in step_specs],
    )
    trace = PlanRunner(flaky).run(plan)

    assert trace.success is False
    assert trace.failed_step == "s2"
    assert len(trace.steps) == 2, "실패까지의 step 만 누적 (s3 는 도달 안 함)"

    succeeded = trace.steps[0]
    assert succeeded.id == "s1"
    assert succeeded.error is None

    failed = trace.steps[1]
    assert failed.id == "s2"
    assert failed.error is not None
    assert "simulated" in failed.error["message"]
for e in PlanRunner(_echo).run_stream(plan) if isinstance(e, PlanCompleted)) + assert len(completed.trace_steps) == 1 + assert completed.trace_steps[0].id == "s1" + + +def test_plan_aborted_carries_trace_steps(): + """abort 시에도 PlanAborted 가 그때까지의 trace_steps 를 실어 보내야 함.""" + + def fail(name: str, args: dict[str, Any]) -> dict[str, Any]: + raise RuntimeError("boom") + + plan = Plan(id="t", goal="g", steps=[PlanStep(id="s1", tool="x")]) + aborted = next(e for e in PlanRunner(fail).run_stream(plan) if isinstance(e, PlanAborted)) + assert len(aborted.trace_steps) == 1 + assert aborted.trace_steps[0].error is not None + + +def test_binding_to_unknown_source_aborts(): + """존재하지 않는 step id 참조 → BindingError → abort.""" + plan = Plan( + id="t", + goal="g", + steps=[PlanStep(id="s1", tool="echo", args={"x": "${ghost.foo}"})], + ) + trace = PlanRunner(_echo).run(plan) + assert trace.success is False + assert trace.failed_step == "s1" + assert trace.steps[0].error["kind"] == "binding" + + +def test_output_binding_resolves_nested_path(): + """output_binding 이 step 응답 안의 nested path 를 가리킬 수 있어야.""" + plan = Plan( + id="t", + goal="g", + steps=[PlanStep(id="s1", tool="echo", args={"v": 42})], + output_binding="${s1.echoed.v}", + ) + trace = PlanRunner(_echo).run(plan) + assert trace.success + assert trace.output == 42 + + +def test_no_input_context_works_when_plan_has_no_input_binding(): + """input_context 안 줘도 ${input.x} 안 쓰면 동작.""" + plan = Plan( + id="t", + goal="g", + steps=[PlanStep(id="s1", tool="echo", args={"x": "literal"})], + ) + trace = PlanRunner(_echo).run(plan) + assert trace.success + + +def test_v1_only_supports_abort_on_error(): + """v1 PlanRunner 는 on_error='abort' 만 허용 — 다른 값은 ValueError.""" + with pytest.raises(ValueError): + PlanRunner(_echo, on_error="continue") diff --git a/tests/test_plan_synthesizer.py b/tests/test_plan_synthesizer.py new file mode 100644 index 0000000..d1793b9 --- /dev/null +++ b/tests/test_plan_synthesizer.py @@ -0,0 +1,168 @@ 
+"""Unit tests for ``graph_tool_call.plan.synthesizer``. + +Core synthesis scenarios + the user_input placeholder output of the Cycle/F2 fallback. +""" + +from __future__ import annotations + +import pytest + +from graph_tool_call.plan.synthesizer import ( + PathSynthesizer, + PlanSynthesisError, + _normalize_field_name, +) + + +def _basic_graph() -> dict: + """Contains: + - 'searchProduct': input=keyword, output=goodsNo (semantic=goods.id) + - 'getProductDetail': input=goodsNo (semantic=goods.id) → dependency + """ + return { + "tools": { + "searchProduct": { + "metadata": { + "method": "GET", + "path": "/api/v1/products", + "consumes": [{"field_name": "keyword", "kind": "data", "required": True}], + "produces": [ + { + "field_name": "goodsNo", + "json_path": "$.body.items[*].goodsNo", + "semantic_tag": "goods.id", + } + ], + "ai_metadata": { + "canonical_action": "search", + "primary_resource": "product", + }, + }, + }, + "getProductDetail": { + "metadata": { + "method": "GET", + "path": "/api/v1/products/{goodsNo}", + "consumes": [ + { + "field_name": "goodsNo", + "semantic_tag": "goods.id", + "kind": "data", + "required": True, + } + ], + "produces": [{"field_name": "name", "json_path": "$.body.name"}], + "ai_metadata": { + "canonical_action": "read", + "primary_resource": "product", + }, + }, + }, + }, + } + + +# ─── normalize_field_name ── + + +def test_normalize_field_name_collapses_separators(): + assert _normalize_field_name("ord_no") == "ordno" + assert _normalize_field_name("ORD-NO") == "ordno" + assert _normalize_field_name("ordNo") == "ordno" + + +def test_normalize_field_name_keeps_token_roots_distinct(): + """ord ≠ order — no token-level synonym mapping is done.""" + assert _normalize_field_name("ordNo") != _normalize_field_name("orderNo") + + +def test_normalize_field_name_empty(): + assert _normalize_field_name("") == "" + assert _normalize_field_name(None) == "" # type: ignore[arg-type] + + +# ─── synthesizer core behavior ── + + +def test_synthesize_uses_entity_when_available(): + """If the user supplied keyword as an entity, 
a single search step must be enough.""" + syn = PathSynthesizer(_basic_graph()) + plan = syn.synthesize(target="searchProduct", entities={"keyword": "shoes"}) + assert len(plan.steps) == 1 + assert plan.steps[0].tool == "searchProduct" + assert plan.steps[0].args == {"keyword": "shoes"} + + +def test_synthesize_chains_producer_when_entity_missing(): + """Calling getProductDetail requires goodsNo — searchProduct is the producer. + + Given only keyword as an entity, the chain is: searchProduct → getProductDetail. + After synthesis the step names are ordered as ``s1``/``s2`` and bindings are rewritten to match. + """ + syn = PathSynthesizer(_basic_graph()) + plan = syn.synthesize( + target="getProductDetail", + entities={"keyword": "shoes"}, + ) + assert len(plan.steps) == 2, "검색 + 상세조회 2-step chain" + assert plan.steps[0].tool == "searchProduct" + assert plan.steps[1].tool == "getProductDetail" + binding = plan.steps[1].args.get("goodsNo", "") + # after step_id reordering the binding is rewritten to ${s1...} — it points at the first step's output + assert binding.startswith("${"), "binding placeholder 형식이어야" + assert "s1" in binding, f"첫 step (s1) 출력 binding 이어야, got {binding}" + assert "goodsNo" in binding, "produces 필드 경로 포함" + + +def test_synthesize_falls_back_to_user_input_placeholder(): + """A required field with neither an entity nor a producer falls back to ``${user_input.X}``. + + Core behavior of F2 + Cycle policy B — surface the slot to the caller instead of aborting. + Because the runner registers input_context under the ``user_input`` alias, + the plan itself is still synthesized and works once the caller supplies the value at run time. 
+ """ + g = { + "tools": { + "needsX": { + "metadata": { + "consumes": [{"field_name": "mysteryField", "kind": "data", "required": True}], + "produces": [], + "ai_metadata": {"canonical_action": "read"}, + }, + }, + }, + } + syn = PathSynthesizer(g) + plan = syn.synthesize(target="needsX", entities={}) + assert len(plan.steps) == 1 + assert plan.steps[0].args == {"mysteryField": "${user_input.mysteryField}"} + + +def test_synthesize_unknown_target_raises(): + syn = PathSynthesizer(_basic_graph()) + with pytest.raises(PlanSynthesisError): + syn.synthesize(target="ghostTool", entities={}) + + +def test_synthesize_context_field_uses_collection_default(): + """A kind=context field is filled from context_defaults when no entity is given.""" + g = { + "tools": { + "needsLocale": { + "metadata": { + "consumes": [ + { + "field_name": "locale", + "kind": "context", + "required": True, + } + ], + "produces": [], + "ai_metadata": {"canonical_action": "read"}, + }, + }, + }, + } + syn = PathSynthesizer(g, context_defaults={"locale": "ko_KR"}) + plan = syn.synthesize(target="needsLocale", entities={}) + assert plan.steps[0].args == {"locale": "ko_KR"}