
Conversation


@Nintorac Nintorac commented Nov 28, 2025

Great project! Thank you :)

Here is a quick and dirty AI-assisted sketch to resolve #443 - it still needs some tests and polish, but I wanted to get some initial feedback. Is this something you're interested in? Is there anything I've missed?

Cheers :)

Some manual test code; install upath with pip install universal_pathlib:

"""
Test if UPath works with Docling's save_as_json artifacts_dir parameter.

Usage:
1. Start MinIO: docker run -d \
    --name minio \
    -p 9000:9000 \
    -p 9001:9001 \
    -e MINIO_ROOT_USER=minioadmin \
    -e MINIO_ROOT_PASSWORD=minioadmin \
    minio/minio server /data --console-address ":9001"
2. Run: python test_upath_save.py
"""

import os
from pathlib import Path

from upath import UPath
from docling_core.types.doc import DoclingDocument, ImageRefMode

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


def test_type_compatibility():
    """Check if UPath passes isinstance checks."""
    local_path = Path("/tmp/test")
    s3_path = UPath("s3://bucket/test")

    print("=== Type Compatibility ===")
    print(f"Path instance check:     isinstance(Path(...), Path) = {isinstance(local_path, Path)}")
    print(f"UPath instance check:    isinstance(UPath(...), Path) = {isinstance(s3_path, Path)}")
    print(f"UPath has write_bytes:   {hasattr(s3_path, 'write_bytes')}")
    print(f"UPath has mkdir:         {hasattr(s3_path, 'mkdir')}")
    print(f"UPath has __truediv__:   {hasattr(s3_path, '__truediv__')}")
    print()


def convert_doc():
    """Convert a PDF with picture images enabled."""
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    result = converter.convert("../tests/data/pdf/2206.01062.pdf")
    return result.document


def test_local_path(doc: DoclingDocument):
    """Test save_as_json with regular Path."""
    print("=== Test: Local Path ===")
    output_dir = Path("scratch/local_test")
    output_dir.mkdir(parents=True, exist_ok=True)

    json_path = output_dir / "doc.json"
    artifacts_dir = output_dir / "images"

    try:
        doc.save_as_json(
            json_path,
            artifacts_dir=artifacts_dir,
            image_mode=ImageRefMode.REFERENCED
        )
        print(f"SUCCESS: Saved to {json_path}")
        print(f"  Images dir: {artifacts_dir}")
        if artifacts_dir.exists():
            images = list(artifacts_dir.glob("*.png"))
            print(f"  Image count: {len(images)}")
    except Exception as e:
        print(f"FAILED: {e}")
    print()


def test_upath_local(doc: DoclingDocument):
    """Test save_as_json with UPath (local filesystem)."""
    print("=== Test: UPath Local ===")
    output_dir = UPath("scratch/upath_local_test")
    output_dir.mkdir(parents=True, exist_ok=True)

    json_path = output_dir / "doc.json"
    artifacts_dir = output_dir / "images"

    try:
        doc.save_as_json(
            json_path,
            artifacts_dir=artifacts_dir,
            image_mode=ImageRefMode.REFERENCED
        )
        print(f"SUCCESS: Saved to {json_path}")
        print(f"  Images dir: {artifacts_dir}")
        if artifacts_dir.exists():
            images = list(artifacts_dir.glob("*.png"))
            print(f"  Image count: {len(images)}")
    except TypeError as e:
        print(f"TYPE ERROR (expected if Path type check fails): {e}")
    except Exception as e:
        print(f"FAILED: {type(e).__name__}: {e}")
    print()


def test_upath_s3(doc: DoclingDocument):
    """Test save_as_json with UPath pointing to MinIO/S3."""
    print("=== Test: UPath S3 (MinIO) ===")

    # Configure for local MinIO
    os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
    os.environ["AWS_ENDPOINT_URL"] = "http://localhost:9000"

    # Create bucket first
    bucket_path = UPath("s3://docling-test")
    try:
        bucket_path.mkdir(exist_ok=True)
        print(f"  Created bucket: {bucket_path}")
    except Exception as e:
        print(f"  Bucket creation: {e}")

    output_dir = UPath("s3://docling-test/output")
    json_path = output_dir / "doc.json"
    artifacts_dir = output_dir / "images"

    try:
        doc.save_as_json(
            json_path,
            artifacts_dir=artifacts_dir,
            image_mode=ImageRefMode.REFERENCED
        )
        print(f"SUCCESS: Saved to {json_path}")
        print(f"  Images dir: {artifacts_dir}")

        # List saved files
        print("  Files in S3:")
        for f in output_dir.glob("**/*"):
            if f.is_file():
                print(f"    {f}")
    except TypeError as e:
        print(f"TYPE ERROR (expected if Path type check fails): {e}")
    except Exception as e:
        print(f"FAILED: {type(e).__name__}: {e}")
    print()


def test_upath_s3_roundtrip(doc: DoclingDocument):
    """Test saving and loading document from S3."""
    print("=== Test: UPath S3 Round-trip ===")

    # Configure for local MinIO
    os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
    os.environ["AWS_ENDPOINT_URL"] = "http://localhost:9000"

    output_dir = UPath("s3://docling-test/roundtrip")
    json_path = output_dir / "doc.json"
    yaml_path = output_dir / "doc.yaml"

    try:
        # Save as JSON
        doc.save_as_json(json_path, image_mode=ImageRefMode.EMBEDDED)
        print(f"  Saved JSON to: {json_path}")

        # Save as YAML
        doc.save_as_yaml(yaml_path, image_mode=ImageRefMode.EMBEDDED)
        print(f"  Saved YAML to: {yaml_path}")

        # Load back from JSON
        loaded_json = DoclingDocument.load_from_json(json_path)
        print(f"  Loaded from JSON: {loaded_json.name}")
        print(f"    Pages: {len(loaded_json.pages)}, Pictures: {len(loaded_json.pictures)}")

        # Load back from YAML
        loaded_yaml = DoclingDocument.load_from_yaml(yaml_path)
        print(f"  Loaded from YAML: {loaded_yaml.name}")
        print(f"    Pages: {len(loaded_yaml.pages)}, Pictures: {len(loaded_yaml.pictures)}")

        # Verify content matches
        assert len(loaded_json.pages) == len(doc.pages), "Page count mismatch (JSON)"
        assert len(loaded_yaml.pages) == len(doc.pages), "Page count mismatch (YAML)"
        assert len(loaded_json.pictures) == len(doc.pictures), "Picture count mismatch (JSON)"
        assert len(loaded_yaml.pictures) == len(doc.pictures), "Picture count mismatch (YAML)"

        print("SUCCESS: Round-trip verified!")
    except Exception as e:
        print(f"FAILED: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
    print()


def test_manual_upath_save(doc: DoclingDocument):
    """If native doesn't work, test manual save with UPath."""
    print("=== Test: Manual UPath S3 Save ===")

    os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
    os.environ["AWS_ENDPOINT_URL"] = "http://localhost:9000"

    output_dir = UPath("s3://docling-test/manual")

    try:
        output_dir.mkdir(parents=True, exist_ok=True)

        # Serialize the document to JSON and write it directly through UPath
        json_content = doc.model_dump_json(indent=2)
        json_path = output_dir / "doc.json"
        json_path.write_text(json_content)
        print(f"  Saved JSON to: {json_path}")

        # Save images manually
        for i, picture in enumerate(doc.pictures[:3]):  # First 3
            if picture.image and picture.image.pil_image:
                from io import BytesIO
                buf = BytesIO()
                picture.image.pil_image.save(buf, format="PNG")
                img_path = output_dir / f"picture_{i}.png"
                img_path.write_bytes(buf.getvalue())
                print(f"  Saved image: {img_path}")

        print("SUCCESS: Manual save worked")
    except Exception as e:
        print(f"FAILED: {type(e).__name__}: {e}")
    print()


if __name__ == "__main__":
    test_type_compatibility()

    print("Converting document...")
    doc = convert_doc()
    print(f"Document has {len(doc.pages)} pages, {len(doc.pictures)} pictures\n")

    test_local_path(doc)
    test_upath_local(doc)
    test_upath_s3(doc)
    test_upath_s3_roundtrip(doc)
    test_manual_upath_save(doc)


github-actions bot commented Nov 28, 2025

DCO Check Failed

Hi @Nintorac, your pull request has failed the Developer Certificate of Origin (DCO) check.

This repository supports remediation commits, so you can fix this without rewriting history — but you must follow the required message format.


🛠 Quick Fix: Add a remediation commit

Run this command:

git commit --allow-empty -s -m "DCO Remediation Commit for Nintorac <Nintorac@users.noreply.github.com>

I, Nintorac <Nintorac@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 0ab64663e8039dc85cbfe01fc46f275be779c034"
git push

🔧 Advanced: Sign off each commit directly

For the latest commit:

git commit --amend --signoff
git push --force-with-lease

For multiple commits:

git rebase --signoff origin/main
git push --force-with-lease

More info: DCO check report


mergify bot commented Nov 28, 2025

Merge Protections

Your pull request matches the following merge protections and will not be merged until they are valid.

🔴 Require two reviewers for test updates

This rule is failing.

When test data is updated, we require two reviewers

  • #approved-reviews-by >= 2

🟢 Enforce conventional commit

Wonderful, this rule succeeded.

Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/

  • title ~= ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\(.+\))?(!)?:


dosubot bot commented Nov 28, 2025

Related Documentation

Checked 4 published document(s) in 1 knowledge base(s). No updates required.



cau-git commented Nov 28, 2025

@Nintorac this looks interesting, but we would need to validate the dependency impact it brings to docling-core. We are trying to keep docling-core dependencies light and make them optional where possible. Do you think UPath could work as an optional dependency? (I see it would directly impact the signatures and implementations of the save_as_... and load_from_... methods.)

Also, please note we have already created a layer for handling workloads on non-local file storage with docling-jobkit. It implements a number of connectors for different cloud storage providers.
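
For illustration, a minimal sketch of how the optional-dependency concern could be sidestepped, assuming the serialization helpers only touch the pathlib surface. The save_text function below is a simplified stand-in, not the actual docling-core implementation:

from pathlib import Path


def save_text(target: Path, payload: str) -> None:
    # Works for pathlib.Path and for upath.UPath alike, because UPath exposes
    # the same mkdir / write_text surface; this module never imports upath.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(payload)


if __name__ == "__main__":
    save_text(Path("scratch/demo.txt"), "hello")  # local filesystem

    try:
        from upath import UPath  # optional: only needed for remote/fsspec paths
        save_text(UPath("memory://demo/demo.txt"), "hello")  # fsspec in-memory backend
    except ImportError:
        pass  # universal_pathlib not installed; local-only usage still works

Under this scheme, upath would at most appear in a test or docs extra, never as a runtime dependency of docling-core.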


uq-os commented Nov 28, 2025

No need for any further dependencies. The main changes here are to make sure all filesystem calls run through pathlib; UPath implements the pathlib API, so the tests also only need to verify that things work with pathlib.Path.

I think if this library were to implement anything it would be optional and at the fsspec level, but this is a simple enough way to get compatibility with no extra dependencies and minimal code changes.

In case you haven't gone down the rabbit hole, fsspec is quite well supported:
https://duckdb.org/docs/stable/guides/python/filesystems
https://pandas.pydata.org/docs/user_guide/io.html#reading-writing-remote-files
https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html
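
As a hedged aside on the MinIO setup used in the test script above: instead of mutating os.environ, UPath can receive fsspec storage options directly as keyword arguments. A small sketch, assuming s3fs is installed and MinIO is running as described in the script's docstring; the bucket name is illustrative:

from upath import UPath

# Storage options are forwarded to the underlying fsspec filesystem (s3fs here),
# so no AWS_* environment variables are needed.
output_dir = UPath(
    "s3://docling-test/output",
    key="minioadmin",
    secret="minioadmin",
    client_kwargs={"endpoint_url": "http://localhost:9000"},
)

(output_dir / "hello.txt").write_text("hello from UPath")
print(list(output_dir.iterdir()))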

@Nintorac (Author)

Ah sorry, didn't mean to send the last message from that account


@ceberam ceberam left a comment


Thanks @Nintorac for creating this PR.
Please review the contributing guidelines and ensure that the PR follows the Conventional Commits convention and that the code checks pass.

@ceberam ceberam changed the title from "add fsspec support" to "feat: add fsspec support" on Dec 4, 2025
@dolfim-ibm (Contributor)

@Nintorac This is an interesting PR. For finalizing it, we need:

  1. Please make sure your commits follow the contributing guidelines and provide the proper sign-off
  2. Please address the mypy failures in the CI tests

Additionally (optional), we could add some tests which use the upath library, but then we would add it as a dependency only in the test group.
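
A minimal sketch of what such a test could look like, assuming universal_pathlib is added only to the test dependency group and that this PR's changes let save_as_json accept a UPath. The memory:// backend keeps the test free of cloud credentials, and the test name is illustrative:

import pytest

from docling_core.types.doc import DoclingDocument, ImageRefMode

# Skip cleanly if the optional test dependency is absent.
upath = pytest.importorskip("upath")


def test_save_as_json_accepts_upath():
    doc = DoclingDocument(name="dummy")
    out_dir = upath.UPath("memory://docling-tests/json")
    out_dir.mkdir(parents=True, exist_ok=True)

    json_path = out_dir / "doc.json"
    doc.save_as_json(json_path, image_mode=ImageRefMode.PLACEHOLDER)

    assert json_path.exists()
    roundtrip = DoclingDocument.load_from_json(json_path)
    assert roundtrip.name == doc.name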

Commit: Enable cloud storage backends (S3, GCS, Azure, etc.) for document serialization methods via fsspec/universal-pathlib integration.

Nintorac commented Dec 8, 2025

Sorry, got a little busy. I have fixed the commit message and added some testing :)

@dolfim-ibm (Contributor)

> Sorry, got a little busy. I have fixed the commit message and added some testing :)

Unfortunately the commit message still doesn't contain the sign-off line. FYI, we usually add it by doing the commit with git commit -s.
