-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstructural_reference_linker.py
More file actions
198 lines (164 loc) · 7.89 KB
/
structural_reference_linker.py
File metadata and controls
198 lines (164 loc) · 7.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""StructuralReferenceLinker — LLM-free cross-reference edges (v0.24 WS-A).
Generalises the finreg-specific reference linker into a profile-driven,
corpus-agnostic pass. When a :class:`DomainProfile` declares a clean
target inventory, this writes ``REFERENCES`` edges between documents that
explicitly cite one another — turning multi-hop "follow the citation"
retrieval into a single graph hop (measured: finreg multi-hop 0% → 83%).
Why a clean-target gate
-----------------------
The v0.23 ``ReferenceLinker`` was a *measured negative*: on KRRA its
resolution targets were 70k noisy phrase-hub ENTITY nodes, so mapping a
reference token to the document it points at was a coin flip (~50%
precision). The mechanism was never wrong — the corpus was. A corpus
where every document carries a canonical, low-collision key (statute
article numbers, clause codes, manual section ids) is the opposite case.
This linker therefore *verifies* the target inventory is clean before
writing any edge: if the configured key property collides across
documents beyond a small tolerance, it gates itself off and writes
nothing. That makes the pass safe to run on any corpus — it enriches the
graph where it can and no-ops where it can't.
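Configuration (DomainProfile):
    reference_key_property       — node property holding the canonical key
    reference_scope_property     — optional; resolve within this scope only
    reference_token_pattern      — OPTIONAL compiled-regex override (its
                                   ``finditer`` is called on node content)
    reference_crossscope_pattern — optional compiled regex with named groups
                                   ``scope`` and ``key``, for citations that
                                   name their own target scope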
The matcher is **auto-derived from the corpus** — there is no need to
hand-write a regex per corpus. The linker reads every distinct value of
``reference_key_property`` actually present in the graph and builds an
exact alternation matcher from those strings (longest-first, so
"제30조의2" wins over "제30조"). A citation is whatever literally equals
a real key. ``reference_token_pattern`` exists only as an override for
the rare case where citations appear in a different surface form than
the stored key.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING
from synaptic.models import Edge, EdgeKind
if TYPE_CHECKING:
from synaptic.extensions.domain_profile import DomainProfile
from synaptic.protocols import StorageBackend
# Module-level logger, registered under a fixed name rather than ``__name__``.
logger = logging.getLogger("structural-reference-linker")
# Collision-rate ceiling for the clean-target gate. The rate is the number of
# surplus nodes sharing an already-used (scope, key) pair divided by the
# number of keyed nodes. The gate trips only when the rate is strictly
# greater than this value, so a rate of exactly 0.10 still passes.
_MAX_COLLISION_RATE = 0.10
@dataclass(slots=True)
class ReferenceLinkStats:
"""Outcome of one :meth:`StructuralReferenceLinker.link` run."""
nodes_scanned: int = 0
keyed_nodes: int = 0
edges_created: int = 0
raw_matches: int = 0
unresolved: int = 0
collision_rate: float = 0.0
gated: bool = False
gate_reason: str = ""
class StructuralReferenceLinker:
    """Profile-driven cross-reference edge builder.

    Reads its configuration from the profile passed to the constructor:
    ``reference_key_property`` (required), ``reference_scope_property``,
    ``reference_token_pattern`` and ``reference_crossscope_pattern``
    (all optional). The two patterns are used through ``finditer`` and
    must therefore be compiled regular expressions.
    """

    __slots__ = ("_profile",)

    def __init__(self, profile: DomainProfile) -> None:
        self._profile = profile

    def _enabled(self) -> bool:
        """Return True when the profile names a key property."""
        # Only the key property is required — the matcher is derived from
        # the corpus's own key values, so no hand-written regex is needed.
        return bool(self._profile.reference_key_property)

    @staticmethod
    def _build_index(
        nodes, key_prop: str, scope_prop: str | None
    ) -> dict[tuple[str, str], list[str]]:
        """Group node ids by ``(scope, key)``; nodes without a key are skipped."""
        index: dict[tuple[str, str], list[str]] = {}
        for n in nodes:
            props = n.properties or {}
            key = props.get(key_prop)
            if not key:
                continue
            if not isinstance(key, str):
                # Keys are compared against regex match text and fed to
                # re.escape, both of which need str. A numeric section id
                # would otherwise crash the matcher build or never resolve.
                key = str(key)
            scope = props.get(scope_prop, "") if scope_prop else ""
            index.setdefault((scope, key), []).append(n.id)
        return index

    @staticmethod
    def _build_matcher(keys) -> re.Pattern[str]:
        """Compile an exact alternation over ``keys``, longest alternative first."""
        # Longest-first so that "제30조의2" wins over its prefix "제30조".
        ordered = sorted(set(keys), key=len, reverse=True)
        return re.compile("|".join(re.escape(k) for k in ordered))

    @staticmethod
    def _cross_lookup_key(match: re.Match[str]) -> tuple[str, str]:
        """Return the ``(scope, key)`` lookup pair for a cross-scope match."""
        groups = match.groupdict()
        # groupdict() maps a named group that did not participate to None,
        # so a plain ``.get(name, "")`` default never applies to it.
        # Normalise both the "absent" and the "unmatched" case to "".
        return (groups.get("scope") or "", groups.get("key") or "")

    async def link(self, backend: StorageBackend) -> ReferenceLinkStats:
        """Scan every node, resolve reference tokens, write REFERENCES edges.

        Args:
            backend: storage backend; ``list_nodes`` is awaited once to
                read the graph and ``save_edges_batch`` is awaited once if
                any edge was produced.

        Returns:
            Stats describing what happened — including ``gated=True``
            when the profile has no key property, no node carries the key
            property, or the target inventory failed the cleanliness check.
        """
        stats = ReferenceLinkStats()
        if not self._enabled():
            stats.gated = True
            stats.gate_reason = "profile declares no reference_key_property"
            return stats

        p = self._profile
        key_prop = p.reference_key_property
        scope_prop = p.reference_scope_property

        # NOTE(review): the listing is capped at 500_000 nodes; presumably a
        # larger graph is silently truncated — confirm against the backend.
        nodes = await backend.list_nodes(kind=None, limit=500_000)
        stats.nodes_scanned = len(nodes)

        # --- Build the target index and measure collisions ---
        index = self._build_index(nodes, key_prop, scope_prop)
        stats.keyed_nodes = sum(len(ids) for ids in index.values())
        if stats.keyed_nodes == 0:
            stats.gated = True
            stats.gate_reason = f"no node carries property {key_prop!r}"
            return stats

        # Every node beyond the first that shares a (scope, key) is a collision.
        collided = sum(len(ids) - 1 for ids in index.values() if len(ids) > 1)
        stats.collision_rate = collided / stats.keyed_nodes
        if stats.collision_rate > _MAX_COLLISION_RATE:
            stats.gated = True
            stats.gate_reason = (
                f"target inventory too noisy: {stats.collision_rate:.0%} of "
                f"{key_prop!r} values collide (> {_MAX_COLLISION_RATE:.0%})"
            )
            logger.info("StructuralReferenceLinker gated — %s", stats.gate_reason)
            return stats

        # Unambiguous targets only: (scope, key) -> single node_id.
        resolved_index = {k: ids[0] for k, ids in index.items() if len(ids) == 1}

        # Matcher — derived from the corpus's own key values unless a
        # hand-written reference_token_pattern overrides it.
        if p.reference_token_pattern is not None:
            matcher = p.reference_token_pattern
        else:
            matcher = self._build_matcher(key for _scope, key in index)

        # --- Scan node text, resolve references, build edges ---
        edges: list[Edge] = []
        seen: set[tuple[str, str]] = set()

        def _emit(src_id: str, target_id: str | None) -> None:
            # Unresolved hits are counted; self-links and repeats of an
            # already-emitted (source, target) pair are dropped silently.
            if target_id is None:
                stats.unresolved += 1
                return
            if target_id == src_id or (src_id, target_id) in seen:
                return
            seen.add((src_id, target_id))
            edges.append(
                Edge(
                    source_id=src_id,
                    target_id=target_id,
                    kind=EdgeKind.REFERENCES,
                    weight=1.0,
                )
            )

        crossscope = p.reference_crossscope_pattern
        for n in nodes:
            content = n.content or ""
            scope = (n.properties or {}).get(scope_prop, "") if scope_prop else ""

            # Cross-scope first — citations that name their own target
            # scope ("「은행법」 제5조"). Record their spans so the
            # intra-scope matcher does not mis-resolve the "제5조" inside
            # them to the citing document's own scope.
            cross_spans: list[tuple[int, int]] = []
            if crossscope is not None:
                for m in crossscope.finditer(content):
                    stats.raw_matches += 1
                    cross_spans.append(m.span())
                    _emit(n.id, resolved_index.get(self._cross_lookup_key(m)))

            # Intra-scope — a bare key resolves within the citing
            # document's own scope, skipping spans already claimed above.
            for m in matcher.finditer(content):
                if any(s <= m.start() < e for s, e in cross_spans):
                    continue
                stats.raw_matches += 1
                _emit(n.id, resolved_index.get((scope, m.group(0))))

        if edges:
            await backend.save_edges_batch(edges)
            stats.edges_created = len(edges)

        logger.info(
            "StructuralReferenceLinker: %d REFERENCES edges (raw=%d unresolved=%d)",
            stats.edges_created,
            stats.raw_matches,
            stats.unresolved,
        )
        return stats