synaptic-memory/eval/run_all.py at main · PlateerLab/synaptic-memory · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Unified QA benchmark runner — run after every development cycle.

Runs all evaluation datasets (custom + public) through the synaptic
pipeline and produces a regression-aware comparison table.

Usage::

    # Full run (all datasets)
    uv run python eval/run_all.py

    # Quick run (custom only, skip large public datasets)
    uv run python eval/run_all.py --quick

    # Compare against last baseline
    uv run python eval/run_all.py --compare eval/results/baseline.json

Output::

    ┌──────────────────┬────────┬───────┬───────┬───────┬──────────┐
    │ Dataset          │ Corpus │  MRR  │ P@10  │ R@10  │ Status   │
    ├──────────────────┼────────┼───────┼───────┼───────┼──────────┤
    │ KRRA Easy        │ 19,720 │ 0.967 │ 0.496 │ 0.914 │ ✅       │
    │ KRRA Hard        │ 19,720 │ 0.507 │ 0.157 │ 0.633 │ ✅       │
    │ assort Easy      │ 13,909 │ 0.880 │ 0.100 │ 0.933 │ ✅       │
    │ assort Hard      │ 13,909 │ 0.127 │ 0.047 │ 0.267 │ ✅       │
    │ HotPotQA-200     │  1,990 │ 0.742 │       │       │ NEW      │
    │ Ko-StrategyQA    │  9,251 │ 0.317 │       │       │ NEW      │
    │ ...              │        │       │       │       │          │
    └──────────────────┴────────┴───────┴───────┴───────┴──────────┘
"""

from __future__ import annotations

import argparse
import asyncio
import json
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT))

from datetime import UTC

from synaptic.agent_loop import project_tool_result
from synaptic.graph import SynapticGraph
from tests.benchmark.metrics import BenchmarkResult

# Projection toggle for diagnostics. Exporting SYN_EVAL_NO_PROJECT=1 switches
# tool-result rendering back to the legacy ``json.dumps(result)[:5000]``
# truncation, which is what the α1-4 A/B measurement compares against. Any
# other value (or an unset variable) keeps projection on.
_PROJECT_ENABLED = os.getenv("SYN_EVAL_NO_PROJECT") != "1"


def _project_or_legacy(result: dict) -> str:
    """Serialize a tool ``result`` dict to a string.

    Delegates to ``project_tool_result`` while projection is enabled;
    otherwise returns the non-ASCII-preserving JSON dump of ``result``
    cut off at 5000 characters.
    """
    if not _PROJECT_ENABLED:
        # Legacy path kept for A/B comparison: plain JSON, hard character cap.
        legacy_dump = json.dumps(result, ensure_ascii=False)
        return legacy_dump[:5000]
    return project_tool_result(result)


# --- Dataset registry ---
# Filesystem anchors, all derived from the repository root.

# Benchmark JSON files referenced by the public dataset registry.
BENCHMARK_DIR = REPO_ROOT.joinpath("tests", "benchmark", "data")
# Root of the eval tree (custom graphs and query files live under eval/data).
EVAL_DIR = REPO_ROOT.joinpath("eval")
RESULTS_DIR = EVAL_DIR.joinpath("results")


@dataclass
class DatasetConfig:
    """Registry entry describing one evaluation dataset.

    Custom datasets point ``path`` at a pre-built SQLite graph and
    ``query_path`` at a separate ground-truth JSON file. Public datasets
    point ``path`` at a single benchmark JSON file and leave ``query_path``
    as ``None``.
    """

    name: str  # display name; also copied into the RunResult for this dataset
    path: Path  # dataset file: SQLite graph (custom) or benchmark JSON (public)
    query_path: Path | None = None  # None = queries embedded in dataset
    # JSON key names for the dataset layout.
    # NOTE(review): the runners visible in this file use hard-coded key
    # lookups and never read these five fields — confirm they are consumed
    # elsewhere before relying on them.
    corpus_key: str = "corpus"
    query_key: str = "queries"
    doc_id_key: str = "doc_id"
    text_key: str = "text"
    title_key: str = "title"
    k: int = 10  # metric cutoff; runners fetch k * 2 candidates and score the top k
    is_custom: bool = False  # custom = KRRA/assort, not public
    quick: bool = True  # include in --quick mode


# Custom datasets (KRRA, assort, X2BEE, ...). Every entry pairs a pre-built
# SQLite graph in ``eval/data`` with a ground-truth query file in
# ``eval/data/queries``.
def _custom_dataset(
    name: str, graph_file: str, query_file: str, *, quick: bool = True
) -> DatasetConfig:
    """Build the registry entry for one custom (SQLite-graph) dataset."""
    data_dir = EVAL_DIR / "data"
    return DatasetConfig(
        name=name,
        path=data_dir / graph_file,
        query_path=data_dir / "queries" / query_file,
        is_custom=True,
        quick=quick,
    )


CUSTOM_DATASETS = [
    # Easy / hard splits per corpus.
    _custom_dataset("KRRA Easy", "krra_graph.sqlite", "krra.json"),
    _custom_dataset("KRRA Hard", "krra_graph.sqlite", "krra_hard.json"),
    _custom_dataset("assort Easy", "assort_graph.sqlite", "assort.json"),
    _custom_dataset("assort Hard", "assort_graph.sqlite", "assort_hard.json"),
    _custom_dataset("X2BEE Easy", "x2bee_graph.sqlite", "x2bee.json"),
    _custom_dataset("X2BEE Hard", "x2bee_graph.sqlite", "x2bee_hard.json"),
    # Conversational query sets over the same graphs.
    _custom_dataset("KRRA Conv", "krra_graph.sqlite", "krra_conversational.json"),
    _custom_dataset("assort Conv", "assort_graph.sqlite", "assort_conversational.json"),
    _custom_dataset("X2BEE Conv", "x2bee_graph.sqlite", "x2bee_conversational.json"),
    # Phase 1: forward-looking cross-domain federation eval. Runs the agent
    # against the combined MetaCorpus (krra + assort + x2bee) and scores by
    # per-domain coverage instead of doc-id matching. Requires
    # ``eval/data/metacorpus.sqlite`` to exist — build with
    # ``uv run python eval/build_metacorpus.py``. Not in --quick by default
    # (forces an explicit opt-in via --agent-dataset "Cross-Domain").
    _custom_dataset("Cross-Domain", "metacorpus.sqlite", "cross_domain.json", quick=False),
    # Financial statutes/regulations corpus — 4,417 articles scraped from
    # law.go.kr. Heavily cross-referential; finreg multihop is the
    # cross-reference reasoning benchmark single-shot RAG structurally cannot
    # solve. Built via eval/datasets/{build,ingest,gen_finreg_queries}.py.
    # Opt-in (not --quick).
    _custom_dataset("finreg", "finreg_graph.sqlite", "finreg.json", quick=False),
    _custom_dataset(
        "finreg multihop", "finreg_graph.sqlite", "finreg_multihop.json", quick=False
    ),
]

# Public datasets, each a single benchmark JSON file under BENCHMARK_DIR.
# Rows are (display name, file name, included in --quick).
PUBLIC_DATASETS = [
    DatasetConfig(name=display_name, path=BENCHMARK_DIR / file_name, quick=in_quick)
    for display_name, file_name, in_quick in (
        ("HotPotQA-24", "hotpotqa_24.json", True),
        ("HotPotQA-200", "hotpotqa.json", False),
        ("Allganize RAG-ko", "allganize_rag_ko.json", True),
        ("Allganize RAG-Eval", "allganize_rag_eval.json", True),
        ("PublicHealthQA", "publichealthqa_ko.json", True),
        ("AutoRAG", "autorag_retrieval.json", True),
        ("KLUE-MRC", "klue_mrc.json", False),
        ("Ko-StrategyQA", "ko_strategyqa.json", False),
        # v0.18 verification corpora — English domain diversity for the
        # "is this reform truly general?" check. Kept out of quick mode so
        # CI / --quick stays fast; they run when --quick is omitted.
        ("2Wiki-dev", "2wiki_dev.json", False),
        ("MuSiQue-dev", "musique_dev.json", False),
        ("TREC-COVID", "trec_covid.json", False),
        ("FiQA", "fiqa.json", False),
        ("SciFact", "scifact.json", False),
    )
]


@dataclass
class RunResult:
    """Outcome of running one dataset: aggregate retrieval metrics, or an error.

    When a dataset cannot be run, only ``name`` and ``error`` are set and the
    metric fields keep their zero defaults.
    """

    name: str  # dataset display name, copied from DatasetConfig.name
    # The public runner stores the number of parsed documents here; the custom
    # runner stores the number of queries in the ground-truth file.
    # NOTE(review): the latter looks unintended — confirm.
    corpus_size: int = 0
    mrr: float = 0.0  # benchmark summary "mrr"
    p_at_k: float = 0.0  # benchmark summary "mean_precision@k"
    r_at_k: float = 0.0  # benchmark summary "mean_recall@k"
    ndcg: float = 0.0  # benchmark summary "mean_ndcg@k"
    hit_rate: str = ""  # formatted "hits/total"; a hit is a query with mrr > 0
    elapsed: float = 0.0  # seconds spent in the search loop
    error: str | None = None  # short reason the dataset was not run; None on success


# --- Custom dataset runner (SQLite graph) ---


def _load_table_query_hints(graph_path: Path) -> dict[str, list[str]] | None:
    """Best-effort lookup of ``table_query_hints`` for a custom corpus.

    v0.17.1 — a ``DomainProfile`` may exist at
    ``eval/data/profiles/{corpus}.toml``. The corpus name is the SQLite file
    stem without its ``_graph`` suffix (``assort_graph.sqlite`` → ``assort``).
    Returns ``None`` when there is no profile, the profile defines no hints,
    or the profile cannot be loaded.
    """
    corpus_stem = graph_path.stem.removesuffix("_graph")
    profile_path = Path(__file__).parent / "data" / "profiles" / f"{corpus_stem}.toml"
    if not profile_path.exists():
        return None
    try:
        from synaptic.extensions.domain_profile import DomainProfile

        profile = DomainProfile.load(profile_path)
        if profile.table_query_hints:
            return dict(profile.table_query_hints)
    except Exception as exc:
        # Hints are optional: a broken profile must not abort the run, but it
        # should not be swallowed silently either.
        print(
            f"  [warn] {profile_path.name}: domain profile ignored ({exc!r})",
            file=sys.stderr,
            flush=True,
        )
    return None


async def run_custom_dataset(
    cfg: DatasetConfig,
    embedder: object | None = None,
    reranker: object | None = None,
    use_flashrank: bool = False,
) -> RunResult:
    """Run a custom dataset against its pre-built SQLite graph.

    ``embedder`` / ``reranker`` are instantiated once in ``main()`` and
    shared across all datasets, so model weights load exactly once per
    suite run (critical for local --local-bge runs where bge-m3 load is
    ~10-15s on first call).

    Args:
        cfg: Dataset entry; ``cfg.path`` is the SQLite graph and
            ``cfg.query_path`` the ground-truth JSON with a ``queries`` list
            and an optional ``id_field`` (``"doc_id"`` by default, or
            ``"node_title"`` to score by node title).
        embedder: Optional embedder handed to ``EvidenceSearch``.
        reranker: Optional reranker handed to ``EvidenceSearch``.
        use_flashrank: Accepted for call-site compatibility; this function
            does not read it.

    Returns:
        A ``RunResult`` with aggregate metrics, or one carrying only
        ``error`` when the graph or the query file is missing.
    """
    if not cfg.path.exists():
        return RunResult(name=cfg.name, error="graph not found")
    if not cfg.query_path or not cfg.query_path.exists():
        return RunResult(name=cfg.name, error="queries not found")

    # Parse the ground truth before touching the database so a malformed
    # query file cannot leave an open connection behind.
    with open(cfg.query_path, encoding="utf-8") as f:
        gt = json.load(f)
    queries = gt.get("queries", [])
    id_field = gt.get("id_field", "doc_id")

    table_query_hints = _load_table_query_hints(cfg.path)

    from synaptic.backends.sqlite_graph import SqliteGraphBackend
    from synaptic.extensions.evidence_search import EvidenceSearch

    backend = SqliteGraphBackend(str(cfg.path))
    await backend.connect()

    bench = BenchmarkResult()
    try:
        searcher = EvidenceSearch(
            backend=backend,
            embedder=embedder,
            reranker=reranker,
            table_query_hints=table_query_hints,
        )

        # perf_counter is monotonic, so the duration is immune to clock jumps.
        t0 = time.perf_counter()

        for q in queries:
            relevant = set(q.get("relevant_docs", []))
            if not relevant:
                # Nothing to score against — the query is not evaluated.
                continue
            query_text = q.get("query", "")

            # Over-fetch (k * 2): several evidence rows can collapse onto the
            # same id once de-duplicated below.
            result = await searcher.search(query_text, k=cfg.k * 2, fts_seed_limit=30)

            if id_field == "node_title":
                candidates = (ev.node.title for ev in result.evidence)
            else:
                candidates = (
                    ev.document_id or (ev.node.properties or {}).get("doc_id", "")
                    for ev in result.evidence
                )
            # dict.fromkeys drops duplicates while keeping first-seen rank order.
            retrieved = list(dict.fromkeys(c for c in candidates if c))

            bench.add(
                query_id=q.get("qid", ""),
                query=query_text,
                retrieved=retrieved[: cfg.k],
                relevant=relevant,
                k=cfg.k,
            )

        elapsed = time.perf_counter() - t0
    finally:
        # Release the SQLite connection even when a search raises.
        await backend.close()

    summary = bench.summary()
    # NOTE(review): ``total`` counts every query in the file, including ones
    # skipped above for having no relevant docs, and it is also what gets
    # reported as ``corpus_size``. Kept as-is so existing baselines stay
    # comparable — confirm whether node count / evaluated count was intended.
    total = len(queries)
    hits = sum(1 for q in bench.queries if q.get("mrr", 0) > 0)

    return RunResult(
        name=cfg.name,
        corpus_size=total,
        mrr=summary.get("mrr", 0),
        p_at_k=summary.get("mean_precision@k", 0),
        r_at_k=summary.get("mean_recall@k", 0),
        ndcg=summary.get("mean_ndcg@k", 0),
        hit_rate=f"{hits}/{total}",
        elapsed=elapsed,
    )


# --- Public dataset runner (in-memory) ---


def _normalize_public_corpus(raw_corpus: object) -> list[tuple[str, str, str]]:
    """Flatten a benchmark corpus into ``(doc_id, title, text)`` tuples.

    Accepts either a ``{doc_id: doc}`` mapping (``doc`` being a dict or a
    plain string) or a list of document dicts. Entries of any other shape
    are dropped.
    """
    corpus: list[tuple[str, str, str]] = []
    if isinstance(raw_corpus, dict):
        for doc_id, doc in raw_corpus.items():
            if isinstance(doc, dict):
                corpus.append((str(doc_id), str(doc.get("title", "")), str(doc.get("text", ""))))
            elif isinstance(doc, str):
                corpus.append((str(doc_id), "", doc))
    elif isinstance(raw_corpus, list):
        for doc in raw_corpus:
            if not isinstance(doc, dict):
                continue
            corpus.append(
                (
                    str(doc.get("doc_id", doc.get("_id", doc.get("id", "")))),
                    str(doc.get("title", "")),
                    str(doc.get("text", doc.get("content", ""))),
                )
            )
    return corpus


def _relevant_id_set(raw: object) -> set[str] | None:
    """Turn a qrels entry (``{doc_id: score}`` or ``[doc_id, ...]``) into a set of str ids.

    Returns ``None`` for any other shape so the caller can skip the query.
    """
    if isinstance(raw, (dict, list)):
        # Iterating a dict yields its keys, i.e. the doc ids.
        return {str(item) for item in raw}
    return None


def _parse_public_queries(data: dict, queries: object) -> list[tuple[str, str, set[str]]]:
    """Extract ``(qid, text, relevant_ids)`` triples from a benchmark file.

    Supports the BEIR layout (``queries={qid: text}`` with top-level
    ``relevant_docs`` / ``qrels``) and the list layout (each query dict
    carries its own relevant ids). Queries with no text or no relevant ids
    are dropped.
    """
    parsed: list[tuple[str, str, set[str]]] = []
    if isinstance(queries, dict):
        qrels = data.get("relevant_docs", data.get("qrels", {}))
        for qid, text in queries.items():
            relevant = _relevant_id_set(qrels.get(qid, {}))
            if relevant and text:
                parsed.append((str(qid), str(text), relevant))
    elif isinstance(queries, list):
        for q in queries:
            qid = str(q.get("qid", q.get("query_id", q.get("_id", ""))))
            text = str(q.get("query", q.get("question", "")))
            relevant = _relevant_id_set(
                q.get("relevant_docs", q.get("answer_ids", q.get("positive_doc_ids", [])))
            )
            if relevant and text:
                parsed.append((qid, text, relevant))
    return parsed


async def _ingest_public_corpus(
    graph: SynapticGraph,
    corpus: list[tuple[str, str, str]],
    embedder: object | None,
) -> None:
    """Add every non-empty corpus document to ``graph``.

    When the embedder exposes ``embed_batch``, embeddings are pre-computed in
    batches and passed via ``embedding=`` so ``graph.add`` does not fall back
    to one embed call per node.
    """
    embeddings: list[list[float] | None] = [None] * len(corpus)
    if embedder is not None and hasattr(embedder, "embed_batch"):
        embed_inputs = [
            f"{title or doc_id}\n{(text or '')[:1500]}" for doc_id, title, text in corpus
        ]
        batch_size = 64
        for start in range(0, len(embed_inputs), batch_size):
            vecs = await embedder.embed_batch(embed_inputs[start : start + batch_size])
            for offset, vec in enumerate(vecs):
                # Explicit emptiness test: plain truthiness raises on
                # array-like vectors (e.g. NumPy arrays), should an embedder
                # return them.
                usable = vec is not None and len(vec) > 0
                embeddings[start + offset] = vec if usable else None

    for (doc_id, title, text), emb in zip(corpus, embeddings):
        if not text and not title:
            continue
        await graph.add(
            title=title or doc_id,
            content=text,
            properties={"doc_id": doc_id},
            embedding=emb,
        )


def _remove_public_temp_db(db_path: str) -> None:
    """Delete the per-benchmark temp database and its sidecar HNSW files."""
    for suffix in ("", ".hnsw", ".hnsw.meta.json"):
        Path(db_path + suffix).unlink(missing_ok=True)


async def run_public_dataset(
    cfg: DatasetConfig,
    embedder: object | None = None,
    reranker: object | None = None,
    entity_linker_cfg: tuple[int, float] | None = None,
    reference_linker: bool = False,
) -> RunResult:
    """Run a public benchmark dataset — full pipeline: ingest → index → search.

    The corpus is ingested into a throw-away ``SqliteGraphBackend`` stored in
    a temp file, which is closed and deleted before returning (also when the
    run fails). Shared ``embedder``/``reranker`` objects (instantiated once
    in ``main()``) avoid per-dataset model reload.

    Args:
        cfg: Dataset entry; ``cfg.path`` is the benchmark JSON file.
        embedder: Optional embedder; enables batched corpus embeddings and
            ``EvidenceSearch``.
        reranker: Optional reranker; also enables ``EvidenceSearch``.
        entity_linker_cfg: ``(min_df, max_df_ratio)``. When set, runs
            :class:`EntityLinker` post-hoc to build a DF-filtered phrase hub
            before search.
        reference_linker: When true, runs ``ReferenceLinker`` post-hoc and
            prints its edge statistics.

    Returns:
        A ``RunResult`` with aggregate metrics, or one carrying only
        ``error`` when the file is missing, empty, or unparseable.
    """
    if not cfg.path.exists():
        return RunResult(name=cfg.name, error="file not found")

    with open(cfg.path, encoding="utf-8") as f:
        data = json.load(f)

    raw_corpus = data.get("corpus", data.get("documents", []))
    queries = data.get("queries", [])
    if not raw_corpus or not queries:
        return RunResult(name=cfg.name, error="empty dataset")

    corpus = _normalize_public_corpus(raw_corpus)
    if not corpus:
        return RunResult(name=cfg.name, error="could not parse corpus")

    # Validate the queries BEFORE building the graph: bailing out afterwards
    # would waste the whole ingest and, without cleanup, strand an open
    # backend plus its temp file on disk.
    query_list = _parse_public_queries(data, queries)
    if not query_list:
        return RunResult(name=cfg.name, error="no valid queries")

    # v0.18-prep — public datasets switched to SqliteGraphBackend
    # (FTS5, C-implemented) instead of MemoryBackend (Python O(N) loop).
    # MemoryBackend FTS scaled badly on the new BEIR / 2Wiki / MuSiQue
    # corpora (2Wiki: 56k docs × 12k queries = 712M comparisons → ~7
    # hours per measurement). SqliteGraphBackend with FTS5 + HNSW is
    # 5-10× faster on the same workload. A temp file keeps the data dir
    # clean; it is created with delete=False, so it is removed explicitly in
    # the ``finally`` below.
    import tempfile

    from synaptic.backends.sqlite_graph import SqliteGraphBackend

    tmp_db = tempfile.NamedTemporaryFile(
        prefix=f"public_{cfg.name.replace(' ', '_')}_",
        suffix=".db",
        delete=False,
    )
    tmp_db.close()

    bench = BenchmarkResult()
    try:
        backend = SqliteGraphBackend(tmp_db.name)
        await backend.connect()
        try:
            graph = SynapticGraph(backend, embedder=embedder, reranker=reranker)
            await _ingest_public_corpus(graph, corpus, embedder)

            # Post-hoc DF-filtered entity linking.
            if entity_linker_cfg is not None:
                from synaptic.extensions.domain_profile import DomainProfile
                from synaptic.extensions.entity_linker import EntityLinker
                from synaptic.extensions.phrase_extractor import PhraseExtractor
                from synaptic.models import NodeKind as _NK

                min_df, max_df_ratio = entity_linker_cfg
                profile = DomainProfile(
                    name=f"{cfg.name}-eval",
                    locale="multi",
                    min_df=min_df,
                    max_df_ratio=max_df_ratio,
                )
                linker = EntityLinker(
                    extractor=PhraseExtractor(),
                    profile=profile,
                    max_links_per_source=15,
                )
                await linker.link(backend, source_kind=_NK.CONCEPT, embedder=embedder)

            # Post-hoc connective-pattern typed-edge extraction (Korean only).
            if reference_linker:
                from synaptic.extensions.domain_profile import DomainProfile
                from synaptic.extensions.reference_linker import ReferenceLinker
                from synaptic.models import NodeKind as _NK

                ref_linker = ReferenceLinker(
                    DomainProfile(name=f"{cfg.name}-eval", locale="multi")
                )
                ref_stats = await ref_linker.link(backend, source_kind=_NK.CONCEPT)
                print(
                    f"  [ReferenceLinker {cfg.name}] {ref_stats.edges_created} edges "
                    f"{ref_stats.by_kind} | raw={ref_stats.raw_matches} "
                    f"unresolved={ref_stats.unresolved} "
                    f"targets={ref_stats.target_index_size}",
                    flush=True,
                )

            # EvidenceSearch when an embedder or reranker is wired, else the
            # plain graph.search path.
            searcher = None
            if embedder is not None or reranker is not None:
                from synaptic.extensions.evidence_search import EvidenceSearch

                searcher = EvidenceSearch(backend=backend, embedder=embedder, reranker=reranker)

            # Only the search loop is timed; ingest and linking are excluded.
            t0 = time.perf_counter()

            for qid, query_text, relevant in query_list:
                if searcher:
                    result = await searcher.search(query_text, k=cfg.k * 2, fts_seed_limit=30)
                    candidates = (
                        ev.document_id or (ev.node.properties or {}).get("doc_id", "")
                        for ev in result.evidence
                    )
                else:
                    result = await graph.search(query_text, limit=cfg.k * 2)
                    candidates = (
                        (hit.node.properties or {}).get("doc_id", "") for hit in result.nodes
                    )
                # dict.fromkeys drops duplicates while keeping first-seen rank order.
                retrieved = list(dict.fromkeys(c for c in candidates if c))

                bench.add(
                    query_id=qid,
                    query=query_text,
                    retrieved=retrieved[: cfg.k],
                    relevant=relevant,
                    k=cfg.k,
                )

            elapsed = time.perf_counter() - t0
        finally:
            await backend.close()
    finally:
        # Runs on success and on failure, so temp files never accumulate.
        _remove_public_temp_db(tmp_db.name)

    summary = bench.summary()
    total_q = summary.get("total_queries", 0)
    hits = sum(1 for q in bench.queries if q.get("mrr", 0) > 0)

    return RunResult(
        name=cfg.name,
        corpus_size=len(corpus),
        mrr=summary.get("mrr", 0),
        p_at_k=summary.get("mean_precision@k", 0),
        r_at_k=summary.get("mean_recall@k", 0),
        ndcg=summary.get("mean_ndcg@k", 0),
        hit_rate=f"{hits}/{total_q}",
        elapsed=elapsed,
    )


# --- Multi-turn Agent Benchmark ---

# Prompt text for the multi-turn agent benchmark (presumably sent as the
# system message — the driver is not visible in this part of the file). It
# covers tool-selection rules, zero-result fallbacks, date handling,
# multi-hop chaining via from_ids, and worked Korean/English examples.
# The backslash right after the opening quotes keeps the prompt from
# starting with a blank line.
# NOTE(review): the prompt itself caps the agent at 15 tool calls; confirm
# the driver enforces the same limit.
AGENT_SYSTEM = """\
You are a research agent. Use the provided tools to answer the question.

## Tool selection (pick the RIGHT one first time)
- Text question → deep_search(query, category="relevant category from metadata")
- Price/date/attribute filter → filter_nodes(table, property, op, value)
- "가장 X한" / "top N" / "최대/최소" / "most / least / 최근" → top_nodes(table, sort_by, order, limit)
- "how many per X" / bucketed summary → aggregate_nodes(table, group_by, metric)
- "find related records" → join_related(from_value, fk_property, target_table)
- Find by name/text → filter_nodes(table, property=name_column, op="contains", value="keyword")

## English paraphrase / category-like queries — search FIRST
Pure-English descriptive phrases like "portable computing device",
"facial skincare product", "wireless headphones" are PARAPHRASES of
product names — they are NOT column values. There is no
filter_nodes(property="goods_nm", op="==") match for these because
goods_nm holds concrete brand+model strings.

For any English-only query with no exact column-value identifier
(price, date, product code, brand name), use ``search`` or
``deep_search`` FIRST. The vector retrieval will paraphrase-match the
descriptive phrase to actual product names. Only fall back to
``filter_nodes`` / ``top_nodes`` if the search path returns 0 results.

Q: "portable computing device"
→ search("portable computing device")  # vector matches laptop products

Q: "facial skincare product"
→ search("facial skincare product")  # vector matches mask / cream products

## Key rules
- Use the exact table and column names from the structured data metadata below
- ALWAYS use category filter when you can identify the topic from metadata
- You can call MULTIPLE tools in ONE turn for efficiency — this is
  strongly preferred when the calls are independent. For compound
  questions ("X의 Y와 Z") emit all the probes in one turn, read the
  merged results, then answer. Going one-tool-per-turn wastes the
  context budget and often times out before the full chain completes.
- Max 15 tool calls total. Be efficient.
- Respond in the same language as the question.

## Fallback when search returns 0 results
1. Try filter_nodes with op="contains" on text columns (e.g., product_name, goods_nm)
2. Try search with shorter/individual keywords from your query
3. Try search with translated terms (Korean ↔ English)

## Structured data patterns
- Node titles = table_name:pk_value (e.g., "products:12800000", "colors:1")
- Use FK relationships from metadata to chain queries across tables
- For cross-table questions: find source → join_related → target table

## Examples
Q: "말 복지 향상 프로그램"
→ deep_search(query="말 복지", category="복지 및 교육")

Q: "50만원 이상 고가 상품"
→ filter_nodes(table="pr_goods_base", property="sales_prc", op=">=", value="500000")

Q: "가장 많이 팔린 상품"
→ top_nodes(table="products", sort_by="cumulative_sales", order="desc", limit=1)

Q: "최근 방송 1위 상품"
→ top_nodes(table="broadcasts", sort_by="broadcast_date", order="desc", limit=1)

Q: "할인율 가장 높은 25SS 상품 3개"
→ top_nodes(table="products", sort_by="discount_rate", order="desc", limit=3,
       where_property="season", where_op="==", where_value="25SS")

Q: "5점 리뷰가 가장 많은 상품"
→ aggregate_nodes(table="feedback", group_by="goods_no", metric="count", where_property="score", where_op="==", where_value="5")

Q: "스마트폰 제품 찾기"
→ filter_nodes(table="pr_goods_base", property="goods_nm", op="contains", value="phone")

## Date queries — use starts_with or date_range or group_by_format
Q: "2023년 12월 판매 건수"
→ filter_nodes(table="sold_hist", property="sold_dtm", op="starts_with", value="2023-12")

Q: "2023년 여름(6-8월) 판매"
→ filter_nodes(table="sold_hist", property="sold_dtm", op="date_range", value="2023-06-01..2023-08-31")

Q: "월별 매출 추이"
→ aggregate_nodes(table="sold_hist", group_by="sold_dtm", group_by_format="YYYY-MM", metric="count")

## Multi-hop chaining — pass previous step's node_titles / group values as from_ids
Q: "가장 많이 팔린 상품의 리뷰"
Step 1: top_nodes(table="products", sort_by="cumulative_sales", order="desc", limit=1)
  → results[0]["title"] == "products:12800000", product_code="12800000"
Step 2: join_related(from_value="12800000", fk_property="product_code", target_table="reviews")
  → review rows for the top product

Q: "최근 가장 많이 팔린 상품 중 핏 만족도 높은 것"
Step 1: top_nodes(table="products", sort_by="cumulative_sales", order="desc", limit=10)
  → list of top-10 products by sales
Step 2: filter_nodes(from_ids=<those product titles>, property="fit_score", op=">=", value="4")
  → filtered subset
Step 3: top_nodes(from_ids=<survivors>, table="products", sort_by="cumulative_sales", order="desc", limit=1)
  → the single winning product

Q: "판매량 1위 상품의 리뷰 평점 평균"
Step 1: top_nodes(table="sold_hist", sort_by="sold_qunt", order="desc", limit=1)
  → results[0]["title"] == "pr_goods_base:G00001"
Step 2: aggregate_nodes(table="feedback", group_by="score", metric="count",
                         where_property="goods_no", where_op="==", where_value="G00001")

Q: "5점 리뷰 최다 상품 중 가장 저렴한 것"
Step 1: aggregate_nodes(table="feedback", group_by="goods_no", metric="count",
                         where_property="score", where_op="==", where_value="5")
  → groups sorted desc, top node_titles = ["pr_goods_base:G00857", "pr_goods_base:G00472", ...]
Step 2: top_nodes(from_ids=["pr_goods_base:G00857","pr_goods_base:G00472"], table="pr_goods_base",
                  sort_by="sales_prc", order="asc", limit=1)
  → the single cheapest among the top-review products

Q: "iPhone과 Galaxy Book의 판매 이력"
→ join_related(from_values=["G00007","G00003"], fk_property="goods_no", target_table="pr_goods_sold_hist")

## Language fallback
- If data contains English product names, try English keywords when Korean search returns 0
- Example: "치즈" returns 0 → try "cheese" instead

## Relative time references
- Words like "올해" / "내년도" / "this year" / "next year" should NOT be
  converted to literal year numbers in search queries. The corpus may
  span multiple years — a hard "2024" filter throws away valid matches.
- Search the topic WITHOUT the year first. Only narrow by year if the
  unfiltered topic search returns too many candidates AND you have
  evidence the user wants a specific year.

## "List all" / enumeration questions
- Queries like "X 목록", "X 상품 전체", "list all X" need the COMPLETE
  set, not one representative. Use ``filter_nodes(limit=100)`` (or
  higher) and keep scanning. The GT for these often has 5-10 specific
  rows; a limit=20 default plus a retry that narrows instead of
  widening will miss half of them.

## Multi-source questions
- Queries like "X 관련 자료", "X 관련 내용", "X 관련 정보" explicitly
  ask for MULTIPLE sources. A single document is rarely the complete
  answer.
- After the first deep_search / search returns 1-2 hits, run at least
  one more search with paraphrased keywords before concluding.

## When a tool returns 0 results
- Every tool that returns 0 results also returns a ``hints`` array with
  specific corrective actions (different operator, dropped WHERE,
  alternative column). READ those hints and follow the first one
  before reissuing the same query with minor changes — that's what
  wastes turns.
"""

def _build_agent_tools() -> list:
    """Assemble the tool descriptors stored in ``AGENT_TOOLS``.

    Every descriptor shares the same envelope::

        {"type": "function",
         "function": {"name", "description",
                      "parameters": {"type": "object", "properties", "required"}}}

    so only the name, description, parameter properties and required list
    are spelled out per tool. Each helper call returns a brand-new dict, so
    no two tools share a mutable object.

    Returns:
        A list of nine descriptors, in this order: deep_search, search,
        filter_nodes, aggregate_nodes, join_related, top_nodes,
        get_document, expand, follow.
    """

    def typed(kind, description=None):
        # Property spec: "type" first, then the optional "description".
        spec = {"type": kind}
        if description is not None:
            spec["description"] = description
        return spec

    def string_list(description):
        # Array-of-strings property (used by from_ids / from_values).
        return {"type": "array", "items": {"type": "string"}, "description": description}

    def tool(name, description, properties, required):
        # Wrap one tool's details in the common function envelope.
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
        }

    return [
        # --- Text retrieval ---
        tool(
            "deep_search",
            "Search + expand + read in ONE call.",
            {
                "query": typed("string"),
                "category": typed("string"),
            },
            ["query"],
        ),
        tool(
            "search",
            "Basic text search.",
            {
                "query": typed("string"),
            },
            ["query"],
        ),
        # --- Structured (table) tools ---
        tool(
            "filter_nodes",
            "Filter by property. Returns {total, showing, results}. Supports multi-hop chaining via from_ids.",
            {
                "table": typed("string", "Table name from metadata e.g. pr_goods_base"),
                "property": typed("string", "Column name e.g. sales_prc"),
                "op": typed(
                    "string",
                    ">=, <=, >, <, ==, !=, contains, starts_with, date_range",
                ),
                "value": typed(
                    "string",
                    "Value. For date_range: '2023-06-01..2023-08-31'. For starts_with: prefix like '2023-12'",
                ),
                "limit": typed(
                    "integer",
                    "Max results to return (default 20). Use higher for listings.",
                ),
                "cursor": typed(
                    "string",
                    "Pagination token from a prior call's next_cursor — pass to fetch the NEXT page when has_more=true. Use for 'list all X' queries.",
                ),
                "from_ids": string_list(
                    "Optional: restrict to these node titles/IDs (multi-hop chaining from previous step's results)",
                ),
            },
            ["property", "op", "value"],
        ),
        tool(
            "aggregate_nodes",
            "GROUP BY + COUNT/SUM/AVG/MAX/MIN with WHERE pre-filter, date bucketing, and multi-hop chaining.",
            {
                "table": typed("string", "Table name from metadata"),
                "group_by": typed("string", "Column to group by"),
                # The only property constrained by an enum; it has no description.
                "metric": {"type": "string", "enum": ["count", "sum", "avg", "max", "min"]},
                "metric_property": typed("string", "Numeric column for sum/avg/max/min"),
                "where_property": typed("string", "Pre-filter column e.g. score"),
                "where_op": typed(
                    "string",
                    "==, !=, >=, <=, >, <, contains, starts_with, date_range",
                ),
                "where_value": typed("string", "Pre-filter value e.g. 5"),
                "group_by_format": typed(
                    "string",
                    "Date bucket format: 'YYYY', 'YYYY-MM', 'YYYY-MM-DD'. Use for monthly/yearly aggregation on datetime columns.",
                ),
                "limit": typed("integer", "Max groups (default 50)"),
                "cursor": typed(
                    "string",
                    "Pagination token from a prior call's next_cursor.",
                ),
                "from_ids": string_list(
                    "Optional: restrict aggregation to these node titles/IDs (multi-hop chaining)",
                ),
            },
            ["group_by"],
        ),
        tool(
            "join_related",
            "FK lookup — find related records. Accepts single from_value OR list of from_values for batch JOIN.",
            {
                "from_value": typed("string", "Single FK value e.g. G00001"),
                "from_values": string_list(
                    "Multiple FK values for batch IN-clause JOIN (multi-hop chaining)",
                ),
                "fk_property": typed("string", "FK column e.g. goods_no"),
                "target_table": typed("string", "Target table e.g. pr_goods_sold_hist"),
                "limit": typed("integer", "Max results (default 20)"),
                "cursor": typed(
                    "string",
                    "Pagination token from a prior call's next_cursor.",
                ),
            },
            ["fk_property", "target_table"],
        ),
        tool(
            "top_nodes",
            (
                "Top-N rows of a table ordered by a column — single call for "
                "'가장 X한', 'top N', '최대/최소', '최근' questions. Returns "
                "node_title + sort_value on each result so you can chain "
                "directly into join_related or get_document."
            ),
            {
                "table": typed("string"),
                "sort_by": typed(
                    "string",
                    "Numeric column to order by, e.g. 'cumulative_sales'",
                ),
                "order": typed("string", "'desc' (default) or 'asc'"),
                "limit": typed("integer", "Max rows (default 5)"),
                "where_property": typed("string"),
                "where_op": typed("string"),
                "where_value": typed("string"),
                "cursor": typed(
                    "string",
                    "Pagination token from a prior call's next_cursor.",
                ),
                "from_ids": string_list(
                    "Optional — restrict ranking to these node_titles (multi-hop chaining)",
                ),
            },
            ["table", "sort_by"],
        ),
        # --- Document / graph navigation ---
        tool(
            "get_document",
            "Read a full document.",
            {
                "doc_id": typed("string"),
                "query": typed("string"),
            },
            ["doc_id"],
        ),
        tool(
            "expand",
            "Explore neighbours of a node most relevant to your question — pass `query` to rank them toward it; falls back to semantically-nearest nodes if the node has no graph links.",
            {
                "node_id": typed("string", "Node ID to expand from"),
                "query": typed(
                    "string",
                    "Your current question — ranks neighbours by relevance to it.",
                ),
            },
            ["node_id"],
        ),
        tool(
            "follow",
            "Follow a specific edge type from a node. Edge types: contains, part_of, next_chunk, related, mentions.",
            {
                "node_id": typed("string", "Source node ID"),
                "edge_kind": typed(
                    "string",
                    "Edge type to follow: related, contains, part_of, etc.",
                ),
            },
            ["node_id", "edge_kind"],
        ),
    ]


# Tool descriptors, built once at import time.
AGENT_TOOLS = _build_agent_tools()


def _extract_ids(data: dict, found_ids: set[str], known_tables: set[str] | None = None) -> None:
    """Extract ALL possible document identifiers from any tool result.

    Covers every tool's response structure:
    - evidence[].document_id, evidence[].properties.doc_id, evidence[].title
    - results[].properties.doc_id, results[].title
    - merged_evidence[].document_id
    - document_excerpts[].document.properties.doc_id
    - sub_results[].top_result.document_id
    - document.properties.doc_id (get_document)
    - chunks[].properties (get_document)
    - groups[].group (aggregate — group value may be a PK like goods_no)

    Args:
        known_tables: Set of actual table names from the graph (e.g. {"colors", "products"}).
            Used to resolve FK column stems to real table names for aggregate groups.
    """
    # Flat item lists
    for key in (
        "evidence",
        "results",
        "merged_evidence",
        "matches",
        "expanded_neighbours",
        "neighbours",
    ):
        for item in data.get(key, []):
            # Direct document_id field (from EvidenceAggregator)
            did = item.get("document_id", "")
            if did:
                found_ids.add(did)
            # properties.doc_id
            props = item.get("properties", {})
            did2 = props.get("doc_id", "")
            if did2:
                found_ids.add(did2)
            # title (for assort: "products:12800000")
            title = item.get("title", "")
            if title:
                found_ids.add(title)

    # document_excerpts (from deep_search)
    for excerpt in data.get("document_excerpts", []):
        doc = excerpt.get("document", {})
        did = doc.get("properties", {}).get("doc_id", "")
        if did:
            found_ids.add(did)
        title = doc.get("title", "")
        if title: