From b19b2e4d0e8ec65486f5a4f814f56817de41791d Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 11:08:50 -0600 Subject: [PATCH 01/11] Adding py.typed file. This tells type checkers like mypy that the package has type hints. new file: spacy_layout/py.typed --- spacy_layout/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 spacy_layout/py.typed diff --git a/spacy_layout/py.typed b/spacy_layout/py.typed new file mode 100644 index 0000000..e69de29 From 524bded5d1b104bed131520491d9ac383d657679 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Thu, 5 Feb 2026 11:23:25 -0600 Subject: [PATCH 02/11] Added pandas-stubs. This silences mypy error related to pandas usage. modified: requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 2f8e4ae..94e90b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ pandas # version range set by Docling srsly # version range set by spaCy # Dev requirements pytest +pandas-stubs \ No newline at end of file From 45bd0fcd8ee1358115529b9fe7da56be7d386667 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 11:16:58 -0600 Subject: [PATCH 03/11] Added return statement to :meth:`spaCyLayout.get_heading`. This silences a mypy error. 
modified: spacy_layout/layout.py --- spacy_layout/layout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index a699230..8a072cc 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -226,6 +226,7 @@ def get_heading(self, span: Span) -> Span | None: for candidate in spans[: span.id][::-1]: if candidate.label_ in self.headings: return candidate + return None def get_tables(self, doc: Doc) -> list[Span]: """Get all tables in the document.""" From b4c78db1927cf5bca7f4d400552c7ae9a9aaba8f Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 11:20:45 -0600 Subject: [PATCH 04/11] Assigned type to `page_spans` in :meth:`spaCyLayout.get_pages`. This silences a mypy error. modified: spacy_layout/layout.py --- spacy_layout/layout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index 8a072cc..03860d9 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -212,7 +212,7 @@ def get_pages(self, doc: Doc) -> list[tuple[PageLayout, list[Span]]]: """Get all pages and their layout spans.""" layout = doc._.get(self.attrs.doc_layout) pages = {page.page_no: page for page in layout.pages} - page_spans = {page.page_no: [] for page in layout.pages} + page_spans: dict[int, list[Span]] = {page.page_no: [] for page in layout.pages} for span in doc.spans[self.attrs.span_group]: span_layout = span._.get(self.attrs.span_layout) page_spans[span_layout.page_no].append(span) From 19ca2ae3419eae58d91f4acd9d91da9d998ee2ac Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 11:21:54 -0600 Subject: [PATCH 05/11] Added return statement to :meth:`spaCyLayout._get_span_layout`. This silences a mypy error. 
modified: spacy_layout/layout.py --- spacy_layout/layout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index 03860d9..854e5ef 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -207,6 +207,7 @@ def _get_span_layout( return SpanLayout( x=x, y=y, width=width, height=height, page_no=prov.page_no ) + return None def get_pages(self, doc: Doc) -> list[tuple[PageLayout, list[Span]]]: """Get all pages and their layout spans.""" From e90cc02fe8b850158d5893e432178537b1cb5ca7 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 11:34:45 -0600 Subject: [PATCH 06/11] Replace loop variable in :meth:`spaCyLayout._result_to_doc`. The original loop variable was used in two different conditions which altered the type. Introducing a separate variable for each condition keeps the type consistent and silences a mypy error. modified: spacy_layout/layout.py --- spacy_layout/layout.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index 854e5ef..c1cfc49 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -144,17 +144,17 @@ def _result_to_doc(self, document: DoclingDocument) -> Doc: # We want to iterate over the tree to get different elements in order for node, _ in document.iterate_items(): if node.self_ref in text_items: - item = text_items[node.self_ref] - if item.text == "": + text_item = text_items[node.self_ref] + if text_item.text == "": continue - inputs.append((item.text, item)) + inputs.append((text_item.text, text_item)) elif node.self_ref in table_items: - item = table_items[node.self_ref] + table_item = table_items[node.self_ref] if isinstance(self.display_table, str): table_text = self.display_table else: - table_text = self.display_table(item.export_to_dataframe()) - inputs.append((table_text, 
table_item)) doc = self._texts_to_doc(inputs, pages) doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()])) doc._.set(self.attrs.doc_markdown, document.export_to_markdown()) From c94aa7e611f21fa251ecf28d67d4b797977a5a95 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 12:46:04 -0600 Subject: [PATCH 07/11] Assigned type to `inputs` in :meth:`spaCyLayout._result_to_doc`. This silences a mypy error concerning the appending of two different types to `inputs`, `TextItem` from the loop variable `text_item`, and `TableItem` from the loop variable `table_item`. modified: spacy_layout/layout.py --- spacy_layout/layout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index c1cfc49..348ae37 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -14,7 +14,7 @@ import srsly from docling.datamodel.base_models import DocumentStream from docling.document_converter import DocumentConverter -from docling_core.types.doc.document import DoclingDocument +from docling_core.types.doc.document import DoclingDocument, TableItem, TextItem from docling_core.types.doc.labels import DocItemLabel from spacy.tokens import Doc, Span, SpanGroup @@ -130,7 +130,7 @@ def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream return DocumentStream(name="source", stream=BytesIO(source)) def _result_to_doc(self, document: DoclingDocument) -> Doc: - inputs = [] + inputs: list[tuple[str, TextItem | TableItem]] = [] pages = { (page.page_no): PageLayout( page_no=page.page_no, From a3bed4733faf899de08978864f7cfc12a8a307da Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 15:12:54 -0600 Subject: [PATCH 08/11] Cast variable to specific type in :meth:`spaCyLayout._texts_to_doc`. This silences a mypy error regarding local variable `item` not having the attribute :meth:`export_to_dataframe`. 
modified: spacy_layout/layout.py --- spacy_layout/layout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index 348ae37..b6600a2 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -189,6 +189,7 @@ def _texts_to_doc( layout = self._get_span_layout(item, pages) span._.set(self.attrs.span_layout, layout) if item.label in TABLE_ITEM_LABELS: + item = cast(TableItem, item) span._.set(self.attrs.span_data, item.export_to_dataframe()) spans.append(span) doc.spans[self.attrs.span_group] = SpanGroup( From 568ad0b7ed018c6228281794195617db267bb3ce Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 15:21:04 -0600 Subject: [PATCH 09/11] Adding file to hold config settings. new file: pyproject.toml --- pyproject.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e69de29 From 9f75ad8ef50373b0ee506a30f5c49e01a468ce19 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 4 Feb 2026 15:24:25 -0600 Subject: [PATCH 10/11] Added mypy config and ignored `srsly` import. Added "warn_unused_ignores" to alert devs when `srsly` is type checked. 
modified: pyproject.toml modified: spacy_layout/layout.py --- pyproject.toml | 2 ++ spacy_layout/layout.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e69de29..d4e88ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.mypy] +warn_unused_ignores = true diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index b6600a2..abf9955 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -11,7 +11,7 @@ overload, ) -import srsly +import srsly # type: ignore[import-untyped] from docling.datamodel.base_models import DocumentStream from docling.document_converter import DocumentConverter from docling_core.types.doc.document import DoclingDocument, TableItem, TextItem From 38731d0a4f96b0e59ab559cfd90d9ad78b74fe4c Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Thu, 5 Feb 2026 12:09:25 -0600 Subject: [PATCH 11/11] Assigned explicit type to `OBJ_TYPES`. This silences a mypy error related to the usage of `OBJ_TYPES` values in :func:`decode_obj`. modified: spacy_layout/util.py --- spacy_layout/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy_layout/util.py b/spacy_layout/util.py index 6b59ddb..d4a8d0c 100644 --- a/spacy_layout/util.py +++ b/spacy_layout/util.py @@ -10,7 +10,8 @@ from docling_core.types.doc.base import BoundingBox TYPE_ATTR = "__type__" -OBJ_TYPES = {"SpanLayout": SpanLayout, "DocLayout": DocLayout, "PageLayout": PageLayout} +Layouts = SpanLayout | DocLayout | PageLayout +OBJ_TYPES: dict[str, type[Layouts]] = {"SpanLayout": SpanLayout, "DocLayout": DocLayout, "PageLayout": PageLayout} def encode_obj(obj: Any, chain: Callable | None = None) -> Any: