explosion · ines · Mar 8, 2025 · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025
diff --git a/README.md b/README.md
@@ -205,7 +205,7 @@ doc = layout("./starcraft.pdf")
 
 #### <kbd>method</kbd> `spaCyLayout.pipe`
 
-Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale.
+Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The `as_tuples` argument behaves like the one in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe).
 
 ```python
 layout = spaCyLayout(nlp)
@@ -215,8 +215,9 @@ docs = layout.pipe(paths)
 
 | Argument | Type | Description |
 | --- | --- | --- |
-| `sources` | `Iterable[str \| Path \| bytes]` | Paths of documents to process or bytes. |
-| **YIELDS** | `Doc` | The processed spaCy `Doc` object. |
+| `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. |
+| `as_tuples` | `bool` | If set to `True`, `sources` should be an iterable of `(source, context)` tuples, and the method yields `(doc, context)` tuples instead of `Doc` objects. Defaults to `False`. |
+| **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. |
 
 ## 💡 Examples and code snippets
 

diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py
@@ -1,6 +1,15 @@
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Iterable, Iterator
+from typing import (
+    TYPE_CHECKING,
+    Callable,
+    Iterable,
+    Iterator,
+    Literal,
+    TypeVar,
+    cast,
+    overload,
+)
 
 import srsly
 from docling.datamodel.base_models import DocumentStream
@@ -18,6 +27,8 @@
     from pandas import DataFrame
     from spacy.language import Language
 
+# Type variable for the context objects passed through `pipe` alongside documents
+_AnyContext = TypeVar("_AnyContext")
 
 TABLE_PLACEHOLDER = "TABLE"
 TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
@@ -76,12 +87,60 @@ def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
             result = self.converter.convert(self._get_source(source)).document
         return self._result_to_doc(result)
 
-    def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
+    @overload
+    def pipe(
+        self,
+        sources: Iterable[str | Path | bytes],
+        as_tuples: Literal[False] = ...,
+    ) -> Iterator[Doc]: ...
+
+    @overload
+    def pipe(
+        self,
+        sources: Iterable[tuple[str | Path | bytes, _AnyContext]],
+        # No default: tuple input is only valid with an explicit as_tuples=True.
+        as_tuples: Literal[True],
+    ) -> Iterator[tuple[Doc, _AnyContext]]: ...
+
+    def pipe(
+        self,
+        sources: (
+            Iterable[str | Path | bytes]
+            | Iterable[tuple[str | Path | bytes, _AnyContext]]
+        ),
+        as_tuples: bool = False,
+    ) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]:
-        """Process multiple documents and create spaCy Doc objects."""
-        data = (self._get_source(source) for source in sources)
-        results = self.converter.convert_all(data)
-        for result in results:
-            yield self._result_to_doc(result.document)
+        """Process multiple documents and create spaCy Doc objects.
+
+        sources: Paths or raw bytes of the documents to process, or
+            (source, context) tuples if as_tuples is True. May be any
+            iterable, including a one-shot iterator such as a generator.
+        as_tuples: If True, read (source, context) tuples and yield
+            (doc, context) tuples, pairing each Doc with its input context.
+        YIELDS: The processed Doc objects, or (doc, context) tuples if
+            as_tuples is True.
+        """
+        if as_tuples:
+            # Imported here so the fix does not depend on the module's import block.
+            from itertools import tee
+
+            sources = cast(Iterable[tuple[str | Path | bytes, _AnyContext]], sources)
+            # Sources and contexts are read by two separate generators. tee()
+            # gives each its own view, so a one-shot iterator (e.g. a generator)
+            # is not consumed alternately and paired with the wrong context.
+            source_iter, context_iter = tee(sources)
+            data = (self._get_source(source) for source, _ in source_iter)
+            contexts = (context for _, context in context_iter)
+            results = self.converter.convert_all(data)
+            # Assumes convert_all yields one result per input, in input order.
+            for result, context in zip(results, contexts):
+                yield (self._result_to_doc(result.document), context)
+        else:
+            sources = cast(Iterable[str | Path | bytes], sources)
+            data = (self._get_source(source) for source in sources)
+            results = self.converter.convert_all(data)
+            for result in results:
+                yield self._result_to_doc(result.document)
 
     def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
         if isinstance(source, (str, Path)):

diff --git a/tests/test_general.py b/tests/test_general.py
@@ -74,6 +74,15 @@ def test_simple_pipe(nlp):
         assert len(doc.spans[layout.attrs.span_group]) == 4
 
 
+def test_simple_pipe_as_tuples(nlp):
+    """With as_tuples=True, pipe yields (doc, context) pairs in input order."""
+    layout = spaCyLayout(nlp)
+    inputs = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")]
+    pairs = list(layout.pipe(inputs, as_tuples=True))
+    assert all(len(doc.spans[layout.attrs.span_group]) == 4 for doc, _ in pairs)
+    assert [context for _, context in pairs] == ["pdf", "docx"]
+
+
 def test_table(nlp):
     layout = spaCyLayout(nlp)
     doc = layout(PDF_TABLE)