diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py
index 452aba1..6e0467b 100644
--- a/spacy_layout/layout.py
+++ b/spacy_layout/layout.py
@@ -91,7 +91,7 @@ def _result_to_doc(self, document: DoclingDocument) -> Doc:
         inputs = []
         pages = {
             (page.page_no): PageLayout(
-                page_no=page.page_no + 1,
+                page_no=page.page_no,
                 width=page.size.width if page.size else 0,
                 height=page.size.height if page.size else 0,
             )
diff --git a/tests/test_general.py b/tests/test_general.py
index 0601a55..2b80cfa 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -40,6 +40,23 @@ def test_general(path, nlp, span_labels):
         assert span.label_ in span_labels
         assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout)
 
+@pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)])
+def test_pages(path, pg_no, nlp):
+    """Check that `get_pages` yields one entry per page, the first numbered 1."""
+    layout = spaCyLayout(nlp)
+    doc = layout(path)
+    # This must not raise a KeyError when accessing the `pages` dict: a KeyError
+    # would mean the document layout and the span layouts disagree on page numbers.
+    result = layout.get_pages(doc)
+    assert len(result) == pg_no
+    assert result[0][0].page_no == 1
+    if pg_no == 6:
+        # The 6-page document has 18 spans on its first page.
+        assert len(result[0][1]) == 18
+    elif pg_no == 1:
+        # The 1-page document has 4 spans on its first page.
+        assert len(result[0][1]) == 4
+
 
 @pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE])
 @pytest.mark.parametrize("separator", ["\n\n", ""])