Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ Pour exécuter le pipeline d'ingestion avec parsing Docling, installez aussi les
uv sync --group parsing
```

Pour exécuter l'étape d'embedding dans le pipeline d'ingestion :

- le modèle utilisé est `intfloat/multilingual-e5-base`
- `sentence-transformers` est requis (installé via `uv sync`)
- prévoir plus de RAM/temps au premier chargement du modèle

À l'usage, si vous utilisez VSCode, l'environnement virtuel sera automatiquement activé lorsque vous ouvrirez le projet. Sinon, il suffit de l'activer manuellement avec la commande suivante :

```bash
Expand Down
39 changes: 38 additions & 1 deletion eu_fact_force/ingestion/embedding.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,45 @@
from eu_fact_force.ingestion.models import DocumentChunk
from typing import Iterator

# Hugging Face model used to embed document chunks.
MODEL_ID = "intfloat/multilingual-e5-base"
# E5 models expect "passage: " for documents to index and "query: " for search queries (asymmetric retrieval).
PASSAGE_PREFIX = "passage: "
# Number of chunks encoded per model.encode() call.
EMBED_BATCH_SIZE = 32
# Lazily-initialized module-level singleton; populated by _get_model().
_MODEL = None


def _get_model():
    """Return the shared SentenceTransformer, loading it on first call."""
    global _MODEL
    if _MODEL is not None:
        return _MODEL
    # Imported lazily: sentence-transformers is heavy and only needed
    # when embeddings are actually computed.
    from sentence_transformers import SentenceTransformer

    _MODEL = SentenceTransformer(MODEL_ID)
    return _MODEL


def _iter_batches(items: list[DocumentChunk], batch_size: int) -> Iterator[list[DocumentChunk]]:
    """Yield successive slices of *items*, each at most *batch_size* long."""
    position = 0
    while position < len(items):
        yield items[position : position + batch_size]
        position += batch_size


def add_embeddings(chunks: list[DocumentChunk]) -> None:
    """
    Compute and persist embeddings for the given chunks.

    Only chunks that are already saved (``pk`` is set) and have non-blank
    content are embedded; all others are silently skipped. Embeddings are
    computed in batches of ``EMBED_BATCH_SIZE`` and written back to the DB
    with ``bulk_update``.
    """
    persisted_chunks = [
        chunk for chunk in chunks if chunk.pk is not None and chunk.content.strip()
    ]
    if not persisted_chunks:
        return

    model = _get_model()
    for batch in _iter_batches(persisted_chunks, EMBED_BATCH_SIZE):
        # E5-family models require the "passage: " prefix on documents being
        # indexed (asymmetric retrieval); search queries use "query: " instead.
        texts = [f"{PASSAGE_PREFIX}{chunk.content}" for chunk in batch]
        vectors = model.encode(
            texts,
            show_progress_bar=False,
            # Normalized vectors make cosine similarity a plain dot product.
            normalize_embeddings=True,
        )
        for chunk, vector in zip(batch, vectors):
            # encode() may return numpy arrays or plain sequences; store a list.
            chunk.embedding = vector.tolist() if hasattr(vector, "tolist") else list(vector)
        DocumentChunk.objects.bulk_update(batch, ["embedding"])
22 changes: 22 additions & 0 deletions eu_fact_force/ingestion/migrations/0002_documentchunk_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from django.db import migrations
from pgvector.django import VectorExtension, VectorField


class Migration(migrations.Migration):
    """Enable pgvector and add an embedding column to DocumentChunk."""

    dependencies = [
        ("ingestion", "0001_initial"),
    ]

    operations = [
        # Create the Postgres "vector" extension before any VectorField is added.
        VectorExtension(),
        migrations.AddField(
            model_name="documentchunk",
            name="embedding",
            # Nullable so existing rows remain valid until embeddings are backfilled.
            field=VectorField(
                blank=True,
                # Must stay in sync with EMBEDDING_DIMENSIONS in models.py.
                dimensions=768,
                help_text="Dense embedding vector for semantic retrieval.",
                null=True,
            ),
        ),
    ]
8 changes: 8 additions & 0 deletions eu_fact_force/ingestion/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

from django.core.files.storage import default_storage
from django.db import models
from pgvector.django import VectorField

from eu_fact_force.ingestion.s3 import save_file_to_s3

logger = logging.getLogger(__name__)
EMBEDDING_DIMENSIONS = 768


class TimeStampedModel(models.Model):
Expand Down Expand Up @@ -109,6 +111,12 @@ class DocumentChunk(TimeStampedModel):
order = models.PositiveIntegerField(
default=0, help_text="Order in the original file"
)
embedding = VectorField(
dimensions=EMBEDDING_DIMENSIONS,
null=True,
blank=True,
help_text="Dense embedding vector for semantic retrieval.",
)

class Meta:
app_label = "ingestion"
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies = [
"python-dotenv>=1.0",
"psycopg[binary]>=3.2",
"pgvector>=0.2.4",
"sentence-transformers>=5.2.3",
"django-storages[s3]>=1.14",
"boto3>=1.34",
"gunicorn>=25.1.0",
Expand All @@ -20,13 +21,14 @@ pythonpath = ["."]

[dependency-groups]
dev = [
"factory-boy>=3.3.0",
"jupyter>=1.1.1",
"pre-commit>=4.5.1",
"pytest>=9.0.2",
"pytest-django>=4.8",
"pytest-factoryboy>=2.6.0",
"ruff>=0.15.0",
"seaborn>=0.13.2",
"sentence-transformers>=5.2.3",
]
parsing = [
"docling>=2.73.1",
Expand Down
24 changes: 24 additions & 0 deletions tests/factories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Factories for test data."""

import factory
from factory.django import DjangoModelFactory

from eu_fact_force.ingestion.models import DocumentChunk, SourceFile


class SourceFileFactory(DjangoModelFactory):
    """Minimal SourceFile for tests: empty doi/s3_key, status already STORED."""

    class Meta:
        model = SourceFile

    doi = ""
    s3_key = ""
    status = SourceFile.Status.STORED


class DocumentChunkFactory(DjangoModelFactory):
    """DocumentChunk attached to a fresh SourceFile, with empty content by default."""

    class Meta:
        model = DocumentChunk

    source_file = factory.SubFactory(SourceFileFactory)
    content = ""
    order = 0
57 changes: 57 additions & 0 deletions tests/ingestion/test_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Tests for ingestion embedding persistence."""

import pytest

from eu_fact_force.ingestion import embedding as embedding_module
from eu_fact_force.ingestion.models import DocumentChunk
from tests.factories import DocumentChunkFactory, SourceFileFactory


class _FakeModel:
def __init__(self):
self.calls: list[list[str]] = []

def encode(self, texts, show_progress_bar, normalize_embeddings):
self.calls.append(list(texts))
return [[0.1] * 768 for _ in texts]


@pytest.mark.django_db
def test_add_embeddings_updates_persisted_chunks(monkeypatch):
    """Persisted, non-empty chunks get a 768-dim embedding written to the DB."""
    source_file = SourceFileFactory()
    first = DocumentChunkFactory(source_file=source_file, content="alpha", order=1)
    second = DocumentChunkFactory(source_file=source_file, content="beta", order=2)

    model = _FakeModel()
    monkeypatch.setattr(embedding_module, "_get_model", lambda: model)

    embedding_module.add_embeddings([first, second])

    for chunk in (first, second):
        chunk.refresh_from_db()
        assert len(chunk.embedding) == 768
        assert chunk.embedding[0] == pytest.approx(0.1)
    assert model.calls == [["passage: alpha", "passage: beta"]]


@pytest.mark.django_db
def test_add_embeddings_skips_unsaved_and_empty_chunks(monkeypatch):
    """Only persisted, non-empty chunks are embedded; unsaved and empty-content chunks are ignored."""
    source = SourceFileFactory()
    persisted = DocumentChunkFactory(source_file=source, content="ok", order=1)
    empty_chunk = DocumentChunk(source_file=source, content=" ", order=2)
    unsaved_chunk = DocumentChunk(source_file=source, content="temp", order=3)

    fake_model = _FakeModel()
    monkeypatch.setattr(embedding_module, "_get_model", lambda: fake_model)

    embedding_module.add_embeddings([persisted, empty_chunk, unsaved_chunk])

    # Inspect the whole table: it must hold exactly the one chunk persisted
    # above, proving neither empty_chunk nor unsaved_chunk was ever saved.
    assert list(DocumentChunk.objects.values_list("pk", flat=True)) == [persisted.pk]
    persisted.refresh_from_db()
    assert len(persisted.embedding) == 768
    assert persisted.embedding[0] == pytest.approx(0.1)
    # The model only ever saw the persisted chunk's (prefixed) content.
    assert fake_model.calls == [["passage: ok"]]
17 changes: 15 additions & 2 deletions tests/ingestion/test_run_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

from eu_fact_force.ingestion import parsing as parsing_module
from eu_fact_force.ingestion import services as services_module
from eu_fact_force.ingestion.models import DocumentChunk, SourceFile
from eu_fact_force.ingestion.services import run_pipeline

Expand All @@ -17,11 +18,22 @@ def test_run_pipeline_uses_readme_md(tmp_storage, monkeypatch):
readme_fn = PROJECT_ROOT / "README.md"
assert readme_fn.exists(), f"Test file must exist: {readme_fn}"

paragraph_1 = "A" * 700
paragraph_2 = "B" * 700
paragraph_3 = "C" * 700
parsed_text = f"{paragraph_1}\n\n{paragraph_2}\n\n{paragraph_3}"

monkeypatch.setattr(
parsing_module,
"_extract_text_from_source_file",
lambda _: "first paragraph\n\nsecond paragraph\n\nthird paragraph",
lambda _: parsed_text,
)
embedding_calls: list[list[str]] = []

def _capture_embeddings(chunks):
embedding_calls.append([chunk.content for chunk in chunks])

monkeypatch.setattr(services_module, "add_embeddings", _capture_embeddings)

source_file, _ = run_pipeline("README.md")

Expand All @@ -36,4 +48,5 @@ def test_run_pipeline_uses_readme_md(tmp_storage, monkeypatch):
DocumentChunk.objects.filter(source_file=source_file).order_by("order")
)
saved_chunk_contents = [chunk.content for chunk in saved_chunks]
assert saved_chunk_contents == ["first paragraph\n\nsecond paragraph\n\nthird paragraph"]
assert saved_chunk_contents == [paragraph_1, paragraph_2, paragraph_3]
assert embedding_calls == [saved_chunk_contents]
45 changes: 43 additions & 2 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.