-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparse_method_classifier.py
More file actions
120 lines (99 loc) · 4.19 KB
/
parse_method_classifier.py
File metadata and controls
120 lines (99 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
gold_standard_sampler/parse_method_classifier.py
-------------------------------------------------
Determines the parse method (element_citation / mixed_citation / unstructured)
and source provenance from the format code and raw reference string.
Rules:
XML format
→ element-citation if the raw string contains an <element-citation> tag
→ mixed-citation if it contains <mixed-citation> or any JATS/NLM tag
with both structured content (tagged fields) AND free text nodes
→ unstructured as fallback (e.g. EGU references wrapped in XML but
containing only a plain text string)
TEXT format
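→ unstructured, source_provenance = arxiv (most common TEXT source)
NOTE: refining this provenance with filename heuristics (see _ARXIV_PATH_RE)
is intended but is not applied by classify_parse_method as written; every
TEXT reference is currently reported as arxiv.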
OCR format
→ unstructured, source_provenance = ocr
"""
from __future__ import annotations
import re
from typing import Tuple
from xml.etree import ElementTree as ET
from models import ParseMethod, SourceProvenance
# JATS/NLM element names whose presence signals structured (tagged) field
# content inside a reference.
_STRUCTURED_TAGS = frozenset({
    "element-citation",
    "mixed-citation",
    "surname",
    "given-names",
    "person-group",
    "year",
    "source",
    "volume",
    "fpage",
    "lpage",
    "article-title",
    "chapter-title",
    "pub-id",
    "publisher-name",
    "publisher-loc",
    "edition",
    "conf-name",
    "conf-date",
})

# Cheap pre-check for markup: a "<" immediately followed by an ASCII letter,
# i.e. the start of some XML tag.
_ANY_TAG_RE = re.compile(r"<[a-zA-Z]")

# Known arXiv path fragments in result file names (archive names, the word
# "arXiv", or NNNN.NNNN+ style identifiers); matched case-insensitively.
_ARXIV_PATH_RE = re.compile(
    "(" + "|".join((
        r"astro-ph",
        r"cond\.?mat",
        r"hep-[a-z]+",
        r"gr-qc",
        r"nucl-[a-z]+",
        r"quant-ph",
        r"math-ph",
        r"physics",
        r"nlin",
        r"cs\.",
        r"q-bio",
        r"q-fin",
        r"stat\.",
        r"eess\.",
        r"econ\.",
        r"arXiv",
        r"arxiv",
        r"\d{4}\.\d{4,}",
    )) + ")",
    flags=re.IGNORECASE,
)
def classify_parse_method(
    fmt: str,
    raw_string: str,
    matched_bibcode: str,
) -> Tuple[ParseMethod, SourceProvenance]:
    """
    Return (ParseMethod, SourceProvenance) for a single reference.

    Parameters:
        fmt: format code of the reference; surrounding whitespace and case
            are ignored. "OCR" and "TEXT" are handled explicitly; every
            other value is processed as XML.
        raw_string: the raw reference string. ``None`` or an empty string is
            treated like a plain string without markup.
        matched_bibcode: not used by the current implementation; kept so
            existing callers keep working.

    Returns:
        OCR  -> (UNSTRUCTURED, OCR)
        TEXT -> (UNSTRUCTURED, ARXIV)
        XML  -> provenance is always PUBLISHER_XML; the parse method is
            UNSTRUCTURED when the string has no tag, cannot be parsed, or
            contains none of the structured tags;
            ELEMENT_CITATION when an <element-citation> is present, or when
            structured tags are present without significant free text;
            MIXED_CITATION when structured tags and free text coexist.
    """
    fmt = fmt.strip().upper()
    if fmt == "OCR":
        return ParseMethod.UNSTRUCTURED, SourceProvenance.OCR
    if fmt == "TEXT":
        return ParseMethod.UNSTRUCTURED, SourceProvenance.ARXIV

    # Everything else is handled as XML.
    if not raw_string or not _ANY_TAG_RE.search(raw_string):
        # XML format file but this particular reference is a plain string
        # (e.g. EGU refplaintext fallback), or there is no string at all.
        return ParseMethod.UNSTRUCTURED, SourceProvenance.PUBLISHER_XML

    # The fragment is wrapped in a synthetic <root> so that several sibling
    # nodes still form one document. Prefixes commonly seen in JATS markup
    # are declared on the wrapper: without a declaration the parser rejects
    # e.g. xlink:href with "unbound prefix" and an otherwise well-formed
    # reference would be reported as malformed. No default namespace is set,
    # so unprefixed tag names are unaffected.
    wrapper_open = (
        "<root"
        ' xmlns:xlink="http://www.w3.org/1999/xlink"'
        ' xmlns:mml="http://www.w3.org/1998/Math/MathML"'
        ' xmlns:ali="http://www.niso.org/schemas/ali/1.0/"'
        ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
        ">"
    )
    try:
        root = ET.fromstring(f"{wrapper_open}{raw_string}</root>")
    except ET.ParseError:
        # Malformed XML — treat as unstructured
        return ParseMethod.UNSTRUCTURED, SourceProvenance.PUBLISHER_XML

    # One walk over the tree collects every tag name; the wrapper's own
    # "root" tag is not a structured tag, so including it is harmless.
    present_tags = {el.tag for el in root.iter()}

    if "element-citation" in present_tags:
        return ParseMethod.ELEMENT_CITATION, SourceProvenance.PUBLISHER_XML

    if _STRUCTURED_TAGS.isdisjoint(present_tags):
        # Markup is present but none of it is a structured JATS/NLM field.
        return ParseMethod.UNSTRUCTURED, SourceProvenance.PUBLISHER_XML

    # Structured tags exist: free text between/around them makes it mixed.
    if _has_significant_free_text(root):
        return ParseMethod.MIXED_CITATION, SourceProvenance.PUBLISHER_XML

    # All content is in tags — treat as element_citation even if the
    # enclosing tag is <mixed-citation>.
    return ParseMethod.ELEMENT_CITATION, SourceProvenance.PUBLISHER_XML
def _has_significant_free_text(root: ET.Element) -> bool:
    """
    Tell whether the tree under *root* carries free text next to its tags.

    Two places count as free text, and only when they contain something
    other than whitespace:
      * the tail of any element in the tree (text that follows its closing
        tag, before the next sibling or the parent's end);
      * the leading text of an element named "root" or "mixed-citation".

    Leading text of any other element is ignored, so a nominally mixed
    citation whose content is fully tagged yields False.
    """
    def _is_nonblank(fragment):
        # None, "" and whitespace-only strings are all insignificant.
        return bool(fragment) and bool(fragment.strip())

    text_containers = ("root", "mixed-citation")
    return any(
        _is_nonblank(node.tail)
        or (node.tag in text_containers and _is_nonblank(node.text))
        for node in root.iter()
    )