From 8562ccff8d79088266bfb14be1df53181456feac Mon Sep 17 00:00:00 2001
From: magdaaniol
Date: Mon, 23 Dec 2024 14:30:55 +0100
Subject: [PATCH 1/2] start paginating from 1

---
 spacy_layout/layout.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py
index 452aba1..6e0467b 100644
--- a/spacy_layout/layout.py
+++ b/spacy_layout/layout.py
@@ -91,7 +91,7 @@ def _result_to_doc(self, document: DoclingDocument) -> Doc:
         inputs = []
         pages = {
             (page.page_no): PageLayout(
-                page_no=page.page_no + 1,
+                page_no=page.page_no,
                 width=page.size.width if page.size else 0,
                 height=page.size.height if page.size else 0,
             )

From 42fa2f20f99e13aa99ac324dd2b6bad7edcfdc4d Mon Sep 17 00:00:00 2001
From: magdaaniol
Date: Mon, 23 Dec 2024 15:05:36 +0100
Subject: [PATCH 2/2] add test case

---
 tests/test_general.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/test_general.py b/tests/test_general.py
index 0601a55..2b80cfa 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -40,6 +40,24 @@ def test_general(path, nlp, span_labels):
         assert span.label_ in span_labels
         assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout)
 
+@pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)])
+def test_pages(path, pg_no, nlp):
+    """Check that get_pages numbers pages from 1 and fills the first page."""
+    # Expected number of spans on the first page, keyed by document page count.
+    first_page_span_counts = {6: 18, 1: 4}
+    layout = spaCyLayout(nlp)
+    doc = layout(path)
+    # This should not raise a KeyError when accessing the `pages` dict; a
+    # KeyError would mean the document layout and the span layouts disagree
+    # on page numbering.
+    result = layout.get_pages(doc)
+    assert len(result) == pg_no
+    first_page = result[0]
+    assert first_page[0].page_no == 1
+    # Fail loudly for an unlisted page count instead of skipping the check.
+    assert pg_no in first_page_span_counts
+    assert len(first_page[1]) == first_page_span_counts[pg_no]
+
 
 @pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE])
 @pytest.mark.parametrize("separator", ["\n\n", ""])