Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ doc = layout("./starcraft.pdf")

#### <kbd>method</kbd> `spaCyLayout.pipe`

Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale.
Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The `as_tuples` argument works like it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe).

```python
layout = spaCyLayout(nlp)
Expand All @@ -215,8 +215,9 @@ docs = layout.pipe(paths)

| Argument | Type | Description |
| --- | --- | --- |
| `sources` | `Iterable[str \| Path \| bytes]` | Paths of documents to process or bytes. |
| **YIELDS** | `Doc` | The processed spaCy `Doc` object. |
| `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. |
| `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. The method then yields `(doc, context)` tuples instead of plain `Doc` objects. Defaults to `False`. |
| **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. |

## 💡 Examples and code snippets

Expand Down
53 changes: 47 additions & 6 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Iterable, Iterator
from typing import (
TYPE_CHECKING,
Callable,
Iterable,
Iterator,
Literal,
TypeVar,
cast,
overload,
)

import srsly
from docling.datamodel.base_models import DocumentStream
Expand All @@ -18,6 +27,8 @@
from pandas import DataFrame
from spacy.language import Language

# Type variable for contexts piped with documents: it stands for the arbitrary
# per-source value that `pipe` accepts in `(source, context)` tuples and hands
# back unchanged in `(doc, context)` tuples.
_AnyContext = TypeVar("_AnyContext")

# NOTE(review): presumably the stand-in text used for tables; its use is
# outside this excerpt — confirm.
TABLE_PLACEHOLDER = "TABLE"
# Docling item labels grouped together under the "table" name: actual tables
# and document indexes.
TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
Expand Down Expand Up @@ -76,12 +87,42 @@ def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
result = self.converter.convert(self._get_source(source)).document
return self._result_to_doc(result)

def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
# Typing-only overload: when `as_tuples` is False (or omitted), `sources` is an
# iterable of plain inputs and the result is an iterator of `Doc` objects.
@overload
def pipe(
self,
sources: Iterable[str | Path | bytes],
as_tuples: Literal[False] = ...,
) -> Iterator[Doc]: ...

# Typing-only overload: when `as_tuples` is True, `sources` is an iterable of
# `(source, context)` tuples and the result is an iterator of `(Doc, context)`
# tuples; `_AnyContext` ties the output context type to the input one.
@overload
def pipe(
self,
sources: Iterable[tuple[str | Path | bytes, _AnyContext]],
as_tuples: Literal[True] = ...,
) -> Iterator[tuple[Doc, _AnyContext]]: ...

def pipe(
    self,
    sources: (
        Iterable[str | Path | bytes]
        | Iterable[tuple[str | Path | bytes, _AnyContext]]
    ),
    as_tuples: bool = False,
) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]:
    """Process multiple documents and create spaCy Doc objects.

    sources (Iterable): The inputs to convert (paths or bytes), or
        (source, context) tuples if as_tuples is True. One-shot iterators
        such as generators are supported in both modes.
    as_tuples (bool): If True, treat each input as a (source, context)
        tuple and yield (doc, context) tuples. Defaults to False.
    YIELDS (Doc | tuple[Doc, Any]): The processed Doc objects, or
        (doc, context) tuples if as_tuples is True, in input order.
    """
    if as_tuples:
        # Local import so the block does not depend on a module-level import.
        from itertools import tee

        # The cast targets are quoted: they only inform the type checker and
        # do not need to be evaluated at runtime.
        sources = cast("Iterable[tuple[str | Path | bytes, _AnyContext]]", sources)
        # The sources and the contexts are consumed by two separate
        # generators. Give each its own iterator over the input; iterating
        # the same one-shot iterable twice would make them steal items from
        # each other, mispairing documents and contexts and dropping inputs.
        source_stream, context_stream = tee(sources)
        data = (self._get_source(source) for source, _ in source_stream)
        contexts = (context for _, context in context_stream)
        results = self.converter.convert_all(data)
        for result, context in zip(results, contexts):
            yield (self._result_to_doc(result.document), context)
    else:
        sources = cast("Iterable[str | Path | bytes]", sources)
        data = (self._get_source(source) for source in sources)
        results = self.converter.convert_all(data)
        for result in results:
            yield self._result_to_doc(result.document)

def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
if isinstance(source, (str, Path)):
Expand Down
9 changes: 9 additions & 0 deletions tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ def test_simple_pipe(nlp):
assert len(doc.spans[layout.attrs.span_group]) == 4


def test_simple_pipe_as_tuples(nlp):
    """Check that `pipe(..., as_tuples=True)` pairs each Doc with its context."""
    pipeline = spaCyLayout(nlp)
    inputs = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")]
    outputs = list(pipeline.pipe(inputs, as_tuples=True))
    # Every converted document should carry four layout spans.
    span_counts = [len(doc.spans[pipeline.attrs.span_group]) for doc, _ in outputs]
    for count in span_counts:
        assert count == 4
    # Contexts must come back unchanged and in input order.
    returned_contexts = [pair[1] for pair in outputs]
    assert returned_contexts == ["pdf", "docx"]


def test_table(nlp):
layout = spaCyLayout(nlp)
doc = layout(PDF_TABLE)
Expand Down