Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ Pour exécuter le pipeline d'ingestion avec parsing Docling, installez aussi les
uv sync --group parsing
```

Pour exécuter l'étape d'embedding dans le pipeline d'ingestion :

- le modèle utilisé est `intfloat/multilingual-e5-base`
- `sentence-transformers` est requis (installé via `uv sync`)
- prévoir plus de RAM/temps au premier chargement du modèle

À l'usage, si vous utilisez VSCode, l'environnement virtuel sera automatiquement activé lorsque vous ouvrirez le projet. Sinon, il suffit de l'activer manuellement avec la commande suivante :

```bash
Expand Down
39 changes: 38 additions & 1 deletion eu_fact_force/ingestion/embedding.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,45 @@
from eu_fact_force.ingestion.models import DocumentChunk
from typing import Iterator

# Hugging Face model used to embed document chunks.
MODEL_ID = "intfloat/multilingual-e5-base"
# E5 models expect "passage: " for documents to index and "query: " for search queries (asymmetric retrieval).
PASSAGE_PREFIX = "passage: "
# Number of chunks encoded per model.encode() call.
EMBED_BATCH_SIZE = 32
# Lazily-initialized module-level singleton; populated by _get_model().
_MODEL = None


def _get_model():
    """Return the shared SentenceTransformer, loading it on first call."""
    global _MODEL
    if _MODEL is not None:
        return _MODEL
    # Imported lazily: sentence-transformers is heavy and only needed
    # when embeddings are actually computed.
    from sentence_transformers import SentenceTransformer

    _MODEL = SentenceTransformer(MODEL_ID)
    return _MODEL


def _iter_batches(items: list[DocumentChunk], batch_size: int) -> Iterator[list[DocumentChunk]]:
    """Yield successive slices of *items*, each at most *batch_size* long."""
    position = 0
    while position < len(items):
        yield items[position : position + batch_size]
        position += batch_size


def add_embeddings(chunks: list[DocumentChunk]) -> None:
    """
    Compute and persist embeddings for the given chunks.

    Only chunks that are already saved (``pk`` is set) and have non-blank
    content are embedded; all others are silently skipped. Embeddings are
    computed in batches of ``EMBED_BATCH_SIZE`` and written back to the DB
    with ``bulk_update``.
    """
    persisted_chunks = [
        chunk for chunk in chunks if chunk.pk is not None and chunk.content.strip()
    ]
    if not persisted_chunks:
        return

    model = _get_model()
    for batch in _iter_batches(persisted_chunks, EMBED_BATCH_SIZE):
        # E5-family models require the "passage: " prefix on documents being
        # indexed (asymmetric retrieval); search queries use "query: " instead.
        texts = [f"{PASSAGE_PREFIX}{chunk.content}" for chunk in batch]
        vectors = model.encode(
            texts,
            show_progress_bar=False,
            # Normalized vectors make cosine similarity a plain dot product.
            normalize_embeddings=True,
        )
        for chunk, vector in zip(batch, vectors):
            # encode() may return numpy arrays or plain sequences; store a list.
            chunk.embedding = vector.tolist() if hasattr(vector, "tolist") else list(vector)
        DocumentChunk.objects.bulk_update(batch, ["embedding"])
22 changes: 22 additions & 0 deletions eu_fact_force/ingestion/migrations/0002_documentchunk_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from django.db import migrations
from pgvector.django import VectorExtension, VectorField


class Migration(migrations.Migration):
    """Enable pgvector and add an embedding column to DocumentChunk."""

    dependencies = [
        ("ingestion", "0001_initial"),
    ]

    operations = [
        # Create the Postgres "vector" extension before any VectorField is added.
        VectorExtension(),
        migrations.AddField(
            model_name="documentchunk",
            name="embedding",
            # Nullable so existing rows remain valid until embeddings are backfilled.
            field=VectorField(
                blank=True,
                # Must stay in sync with EMBEDDING_DIMENSIONS in models.py.
                dimensions=768,
                help_text="Dense embedding vector for semantic retrieval.",
                null=True,
            ),
        ),
    ]
8 changes: 8 additions & 0 deletions eu_fact_force/ingestion/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

from django.core.files.storage import default_storage
from django.db import models
from pgvector.django import VectorField

from eu_fact_force.ingestion.s3 import save_file_to_s3

logger = logging.getLogger(__name__)
EMBEDDING_DIMENSIONS = 768


class TimeStampedModel(models.Model):
Expand Down Expand Up @@ -109,6 +111,12 @@ class DocumentChunk(TimeStampedModel):
order = models.PositiveIntegerField(
default=0, help_text="Order in the original file"
)
embedding = VectorField(
dimensions=EMBEDDING_DIMENSIONS,
null=True,
blank=True,
help_text="Dense embedding vector for semantic retrieval.",
)

class Meta:
app_label = "ingestion"
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies = [
"python-dotenv>=1.0",
"psycopg[binary]>=3.2",
"pgvector>=0.2.4",
"sentence-transformers>=5.2.3",
"django-storages[s3]>=1.14",
"boto3>=1.34",
"gunicorn>=25.1.0",
Expand All @@ -20,13 +21,14 @@ pythonpath = ["."]

[dependency-groups]
dev = [
"factory-boy>=3.3.0",
"jupyter>=1.1.1",
"pre-commit>=4.5.1",
"pytest>=9.0.2",
"pytest-django>=4.8",
"pytest-factoryboy>=2.6.0",
"ruff>=0.15.0",
"seaborn>=0.13.2",
"sentence-transformers>=5.2.3",
]
parsing = [
"docling>=2.73.1",
Expand Down
24 changes: 24 additions & 0 deletions tests/factories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Factories for test data."""

import factory
from factory.django import DjangoModelFactory

from eu_fact_force.ingestion.models import DocumentChunk, SourceFile


class SourceFileFactory(DjangoModelFactory):
    """Minimal SourceFile for tests: empty doi/s3_key, status already STORED."""

    class Meta:
        model = SourceFile

    doi = ""
    s3_key = ""
    status = SourceFile.Status.STORED


class DocumentChunkFactory(DjangoModelFactory):
    """DocumentChunk attached to a fresh SourceFile, with empty content by default."""

    class Meta:
        model = DocumentChunk

    source_file = factory.SubFactory(SourceFileFactory)
    content = ""
    order = 0
57 changes: 57 additions & 0 deletions tests/ingestion/test_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Tests for ingestion embedding persistence."""

import pytest

from eu_fact_force.ingestion import embedding as embedding_module
from eu_fact_force.ingestion.models import DocumentChunk
from tests.factories import DocumentChunkFactory, SourceFileFactory


class _FakeModel:
def __init__(self):
self.calls: list[list[str]] = []

def encode(self, texts, show_progress_bar, normalize_embeddings):
self.calls.append(list(texts))
return [[0.1] * 768 for _ in texts]


@pytest.mark.django_db
def test_add_embeddings_updates_persisted_chunks(monkeypatch):
    """Persisted, non-empty chunks get a 768-dim embedding written to the DB."""
    source_file = SourceFileFactory()
    first = DocumentChunkFactory(source_file=source_file, content="alpha", order=1)
    second = DocumentChunkFactory(source_file=source_file, content="beta", order=2)

    model = _FakeModel()
    monkeypatch.setattr(embedding_module, "_get_model", lambda: model)

    embedding_module.add_embeddings([first, second])

    for chunk in (first, second):
        chunk.refresh_from_db()
        assert len(chunk.embedding) == 768
        assert chunk.embedding[0] == pytest.approx(0.1)
    assert model.calls == [["passage: alpha", "passage: beta"]]


@pytest.mark.django_db
def test_add_embeddings_skips_unsaved_and_empty_chunks(monkeypatch):
    """Only persisted, non-empty chunks are embedded; unsaved and empty-content chunks are ignored."""
    source = SourceFileFactory()
    persisted = DocumentChunkFactory(source_file=source, content="ok", order=1)
    empty_chunk = DocumentChunk(source_file=source, content=" ", order=2)
    unsaved_chunk = DocumentChunk(source_file=source, content="temp", order=3)

    fake_model = _FakeModel()
    monkeypatch.setattr(embedding_module, "_get_model", lambda: fake_model)

    embedding_module.add_embeddings([persisted, empty_chunk, unsaved_chunk])

    # Inspect the whole table: it must hold exactly the one chunk persisted
    # above, proving neither empty_chunk nor unsaved_chunk was ever saved.
    assert list(DocumentChunk.objects.values_list("pk", flat=True)) == [persisted.pk]
    persisted.refresh_from_db()
    assert len(persisted.embedding) == 768
    assert persisted.embedding[0] == pytest.approx(0.1)
    # The model only ever saw the persisted chunk's (prefixed) content.
    assert fake_model.calls == [["passage: ok"]]
17 changes: 15 additions & 2 deletions tests/ingestion/test_run_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

from eu_fact_force.ingestion import parsing as parsing_module
from eu_fact_force.ingestion import services as services_module
from eu_fact_force.ingestion.models import DocumentChunk, SourceFile
from eu_fact_force.ingestion.services import run_pipeline

Expand All @@ -17,11 +18,22 @@ def test_run_pipeline_uses_readme_md(tmp_storage, monkeypatch):
readme_fn = PROJECT_ROOT / "README.md"
assert readme_fn.exists(), f"Test file must exist: {readme_fn}"

paragraph_1 = "A" * 700
paragraph_2 = "B" * 700
paragraph_3 = "C" * 700
parsed_text = f"{paragraph_1}\n\n{paragraph_2}\n\n{paragraph_3}"

monkeypatch.setattr(
parsing_module,
"_extract_text_from_source_file",
lambda _: "first paragraph\n\nsecond paragraph\n\nthird paragraph",
lambda _: parsed_text,
)
embedding_calls: list[list[str]] = []

def _capture_embeddings(chunks):
embedding_calls.append([chunk.content for chunk in chunks])

monkeypatch.setattr(services_module, "add_embeddings", _capture_embeddings)

source_file, _ = run_pipeline("README.md")

Expand All @@ -36,4 +48,5 @@ def test_run_pipeline_uses_readme_md(tmp_storage, monkeypatch):
DocumentChunk.objects.filter(source_file=source_file).order_by("order")
)
saved_chunk_contents = [chunk.content for chunk in saved_chunks]
assert saved_chunk_contents == ["first paragraph\n\nsecond paragraph\n\nthird paragraph"]
assert saved_chunk_contents == [paragraph_1, paragraph_2, paragraph_3]
assert embedding_calls == [saved_chunk_contents]
45 changes: 43 additions & 2 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.