Skip to content

Commit 8bdeaa7

Browse files
authored
fix: robustify page filtering (#437)
* fix: robustify page filtering Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * cover validate_rules Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> --------- Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent 8feb09f commit 8bdeaa7

File tree

2 files changed

+119
-12
lines changed

2 files changed

+119
-12
lines changed

docling_core/types/doc/document.py

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6224,6 +6224,13 @@ def index(
62246224
self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
62256225
) -> None:
62266226

6227+
if page_nrs is not None and (
6228+
unavailable_page_nrs := page_nrs - set(doc.pages.keys())
6229+
):
6230+
raise ValueError(
6231+
f"The following page numbers are not present in the document: {unavailable_page_nrs}"
6232+
)
6233+
62276234
orig_ref_to_new_ref: dict[str, str] = {}
62286235
page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
62296236

@@ -6265,7 +6272,29 @@ def index(
62656272

62666273
if item.parent:
62676274
# set item's parent
6268-
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
6275+
# Map the item's original parent onto its counterpart among the indexed nodes.
new_parent_cref = orig_ref_to_new_ref.get(item.parent.cref)
if new_parent_cref is None:
    # The direct parent was not indexed (e.g. it was filtered out), so climb
    # the original tree until an indexed ancestor is found or the root is
    # passed.
    ancestor_ref = item.parent
    while new_parent_cref is None and ancestor_ref is not None:
        # A root node has no parent: stop here instead of dereferencing
        # None, so that the body fallback below stays reachable.
        ancestor_ref = ancestor_ref.resolve(doc).parent
        if ancestor_ref is not None:
            new_parent_cref = orig_ref_to_new_ref.get(ancestor_ref.cref)

    if new_parent_cref is not None:
        warnings.warn(
            f"Parent {item.parent.cref} not found in indexed nodes, "
            f"using ancestor {new_parent_cref} instead"
        )
    else:
        warnings.warn(
            "No ancestor found in indexed nodes, using body as parent"
        )
        new_parent_cref = "#/body"
6297+
62696298
new_item.parent = RefItem(cref=new_parent_cref)
62706299

62716300
# add item to parent's children
@@ -6355,38 +6384,54 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
63556384
res_doc._update_from_index(doc_index)
63566385
return res_doc
63576386

6358-
def _validate_rules(self):
6387+
def _validate_rules(self, raise_on_error: bool = True):
6388+
6389+
def _handle(error: Exception):
6390+
if raise_on_error:
6391+
raise error
6392+
else:
6393+
warnings.warn(str(error))
63596394

63606395
def validate_furniture(doc: DoclingDocument):
63616396
with warnings.catch_warnings():
63626397
warnings.simplefilter("ignore", category=DeprecationWarning)
63636398
has_furniture_children = len(doc.furniture.children) > 0
63646399
if has_furniture_children:
6365-
raise ValueError(
6366-
f"Deprecated furniture node {doc.furniture.self_ref} has children"
6400+
_handle(
6401+
ValueError(
6402+
f"Deprecated furniture node {doc.furniture.self_ref} has children"
6403+
),
63676404
)
63686405

63696406
def validate_list_group(doc: DoclingDocument, item: ListGroup):
63706407
for ref in item.children:
63716408
child = ref.resolve(doc)
63726409
if not isinstance(child, ListItem):
6373-
raise ValueError(
6374-
f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
6410+
_handle(
6411+
ValueError(
6412+
f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
6413+
),
63756414
)
63766415

63776416
def validate_list_item(doc: DoclingDocument, item: ListItem):
63786417
if item.parent is None:
6379-
raise ValueError(f"ListItem {item.self_ref} has no parent")
6380-
if not isinstance(item.parent.resolve(doc), ListGroup):
6381-
raise ValueError(
6382-
f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
6418+
_handle(
6419+
ValueError(f"ListItem {item.self_ref} has no parent"),
6420+
)
6421+
elif not isinstance(item.parent.resolve(doc), ListGroup):
6422+
_handle(
6423+
ValueError(
6424+
f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
6425+
),
63836426
)
63846427

63856428
def validate_group(doc: DoclingDocument, item: GroupItem):
63866429
if (
63876430
item.parent and not item.children
63886431
): # tolerate empty body, but not other groups
6389-
raise ValueError(f"Group {item.self_ref} has no children")
6432+
_handle(
6433+
ValueError(f"Group {item.self_ref} has no children"),
6434+
)
63906435

63916436
validate_furniture(self)
63926437

test/test_docling_doc.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
from collections import deque
34
from copy import deepcopy
45
from pathlib import Path
@@ -46,7 +47,7 @@
4647
TextItem,
4748
TitleItem,
4849
)
49-
from docling_core.types.doc.document import CURRENT_VERSION
50+
from docling_core.types.doc.document import CURRENT_VERSION, PageItem
5051

5152
from .test_data_gen_flag import GEN_TEST_DATA
5253

@@ -1904,3 +1905,64 @@ def test_filter_pages():
19041905
with open(exp_html_file, "r", encoding="utf-8") as f:
19051906
exp_html_data = f.read()
19061907
assert html_data == exp_html_data
1908+
1909+
1910+
def _create_doc_for_filtering():
    """Build a small two-page document shared by the filtering tests.

    Page 1 carries "Text 1", which additionally parents a childless group;
    page 2 carries "Text 2". Both text items hang directly off the body.
    """
    pages = {}
    for page_no in (1, 2):
        pages[page_no] = PageItem(
            page_no=page_no, size=Size(width=100, height=100), image=None
        )
    doc = DoclingDocument(name="", pages=pages)

    def _page_prov(page_no):
        # Fresh provenance object per item, anchored on the given page.
        return ProvenanceItem(
            page_no=page_no,
            bbox=BoundingBox(l=0, t=0, r=100, b=100),
            charspan=(0, 1),
        )

    first_text = doc.add_text(
        label=DocItemLabel.TEXT,
        text="Text 1",
        prov=_page_prov(1),
        parent=doc.body,
    )
    # Group without children whose parent is the page-1 text item.
    doc.add_group(parent=first_text)
    doc.add_text(
        label=DocItemLabel.TEXT,
        text="Text 2",
        prov=_page_prov(2),
        parent=doc.body,
    )
    return doc
1936+
1937+
1938+
def test_filter_pages_filtered_out_parent():
    """Keeping only page 2 warns that a missing parent was replaced by body."""
    expected_warning = (
        "Parent #/texts/0 not found in indexed nodes, using ancestor #/body instead"
    )
    doc = _create_doc_for_filtering()

    with pytest.warns(UserWarning, match=expected_warning):
        doc.filter(page_nrs={2})
1946+
1947+
1948+
def test_filter_invalid_pages():
    """Filtering on a page number absent from the document raises ValueError."""
    doc = _create_doc_for_filtering()
    # The message ends with a set repr ("{3}"), which has to be escaped
    # because ``match`` is interpreted as a regular expression.
    expected_error = re.escape(
        "The following page numbers are not present in the document: {3}"
    )
    with pytest.raises(ValueError, match=expected_error):
        doc.filter(page_nrs={3})
1957+
1958+
1959+
def test_validate_rules():
    """Exercise both reporting modes of ``_validate_rules`` on a childless group."""
    expected = "Group #/groups/0 has no children"
    doc = _create_doc_for_filtering()

    # Default mode: the rule violation is raised as an error.
    with pytest.raises(ValueError, match=expected):
        doc._validate_rules()

    # Non-raising mode: the same message is emitted as a warning instead.
    with pytest.warns(UserWarning, match=expected):
        doc._validate_rules(raise_on_error=False)

0 commit comments

Comments
 (0)