2 changes: 1 addition & 1 deletion backend/app/api/docs/documents/upload.md
@@ -1,6 +1,6 @@
Upload a document to Kaapi.

- If only a file is provided, the document will be uploaded and stored, and its ID will be returned.
- If only a file is provided, the document will be uploaded and stored, and its ID will be returned. The maximum file size allowed for upload is 25 MB.
- If a target format is specified, a transformation job will also be created to transform the document into the target format in the background. The response will include both the uploaded document details and information about the transformation job.
- If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed.

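For illustration, a client call matching the documented behavior might look like the sketch below; the endpoint URL, form field names, and auth header are assumptions for this sketch, not taken from the actual API spec:

import requests

# Hypothetical upload call: URL, field names, and header are assumptions.
with open("report.pdf", "rb") as fh:
    response = requests.post(
        "https://kaapi.example.com/api/v1/documents/upload",
        headers={"Authorization": "Bearer <api-key>"},
        files={"file": fh},  # per the doc above, the file must be at most 25 MB
        data={
            "target_format": "markdown",  # optional: also queues a transformation job
            "callback_url": "https://example.com/hooks/document-ready",  # optional
        },
    )

response.raise_for_status()
print(response.json())  # expected to include the document ID and, if requested, the job details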
44 changes: 44 additions & 0 deletions backend/app/services/collections/create_collection.py
@@ -166,6 +166,7 @@ def execute_job(
    result = None
    creation_request = None
    provider = None
    storage = None

    with log_context(
        tag="collection",
@@ -221,6 +222,49 @@ def execute_job(
                organization_id=organization_id,
            )

        with Session(engine) as session:
            document_crud = DocumentCrud(session, project_id)
            flat_docs = document_crud.read_each(creation_request.documents)
            storage = get_cloud_storage(session=session, project_id=project_id)

        file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}

        backfill: list[tuple[UUID, float]] = []
        for doc in flat_docs:
            if doc.file_size_kb is None:
                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
                doc.file_size_kb = size_kb
                backfill.append((doc.id, size_kb))
Comment on lines +232 to +237

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
rg -nP -C2 '\bfile_size_kb\b' --type=py -g '!**/tests/**'

Repository: ProjectTech4DevAI/kaapi-backend

Length of output: 4257


🏁 Script executed:

#!/bin/bash
# Check the storage.py file around the get_file_size_kb function
wc -l backend/app/core/cloud/storage.py

Repository: ProjectTech4DevAI/kaapi-backend

Length of output: 112


🏁 Script executed:

#!/bin/bash
# Read lines 220-230 from storage.py to see get_file_size_kb implementation
sed -n '220,230p' backend/app/core/cloud/storage.py

Repository: ProjectTech4DevAI/kaapi-backend

Length of output: 662


Remove bare round(...) to preserve fractional KB precision.

storage.get_file_size_kb() returns a float already rounded to 2 decimals (e.g., 512.37). Wrapping it in a bare round(...) call rounds the value to the nearest integer, discarding the fractional precision. That contradicts both the declared type backfill: list[tuple[UUID, float]] and the Document.file_size_kb column type (Float in the database schema).
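A minimal illustration of the difference (the 512.37 value is made up):

size_kb = 512.37           # example of the float get_file_size_kb() is said to return
print(round(size_kb))      # 512, an int: the fractional part is discarded
print(round(size_kb, 2))   # 512.37: stays a float with two-decimal precision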

🔧 Proposed fix
-        backfill: list[tuple[UUID, float]] = []
-        for doc in flat_docs:
-            if doc.file_size_kb is None:
-                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
-                doc.file_size_kb = size_kb
-                backfill.append((doc.id, size_kb))
+        backfill: list[tuple[UUID, float]] = []
+        for doc in flat_docs:
+            if doc.file_size_kb is None:
+                size_kb = storage.get_file_size_kb(doc.object_store_url)
+                doc.file_size_kb = size_kb
+                backfill.append((doc.id, size_kb))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/services/collections/create_collection.py` around lines 176 -
181, The code is truncating fractional KB precision by wrapping
storage.get_file_size_kb(doc.object_store_url) in round(...); instead, remove
the bare round call and assign the returned float directly to doc.file_size_kb
and to the backfill tuple (which is declared as list[tuple[UUID, float]]), e.g.
call size_kb = storage.get_file_size_kb(...), set doc.file_size_kb = size_kb and
append (doc.id, size_kb) so the float precision is preserved.


        total_size_kb = sum(
            doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
        )
        total_size_mb = total_size_kb / 1024
Comment on lines +232 to +242

⚠️ Potential issue | 🟠 Major

Single storage failure aborts the entire collection job.

storage.get_file_size_kb(doc.object_store_url) is called synchronously per document with no per-document error handling. Per backend/app/core/cloud/storage.py:220-230, this wraps a head_object call that will raise on any error (missing object, IAM issue, transient S3 outage, mid-deletion, etc.). A single failure here propagates to the outer try at line 267 and marks the whole job FAILED — even for collections where most documents already have file_size_kb persisted or where one legacy doc is simply missing from S3.

Given this PR's goal is to backfill historical data, recovering gracefully from per-doc failures is likely the safer posture:

🛡️ Suggested per-doc guard
         backfill: list[tuple[UUID, float]] = []
         for doc in flat_docs:
             if doc.file_size_kb is None:
-                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
-                doc.file_size_kb = size_kb
-                backfill.append((doc.id, size_kb))
+                try:
+                    size_kb = storage.get_file_size_kb(doc.object_store_url)
+                except Exception as err:
+                    logger.warning(
+                        f"[execute_job] Failed to backfill file_size_kb | "
+                        f"{{'doc_id': '{doc.id}', 'error': '{err}'}}"
+                    )
+                    continue
+                doc.file_size_kb = size_kb
+                backfill.append((doc.id, size_kb))

Note: if you keep fail-fast semantics intentionally, the downstream if doc.file_size_kb is not None filter at line 184 and the TypeError-raising path in batch_documents become inconsistent with each other — pick one contract and document it.
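If the fail-fast contract is kept, a minimal sketch of making it explicit (the helper name, error type, and message here are illustrative, not existing code):

from uuid import UUID


def ensure_sizes_backfilled(documents) -> None:
    """Hypothetical pre-check for batch_documents: raise a deliberate,
    descriptive error instead of the incidental TypeError that surfaces
    when a None size is summed with an int."""
    missing: list[UUID] = [doc.id for doc in documents if doc.file_size_kb is None]
    if missing:
        raise ValueError(f"file_size_kb not backfilled for documents: {missing}")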

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/services/collections/create_collection.py` around lines 176 -
186, The backfill loop in create_collection.py currently calls
storage.get_file_size_kb(doc.object_store_url) without per-document error
handling, so a single storage error aborts the whole job; wrap the call inside a
try/except around the storage.get_file_size_kb invocation in the loop over
flat_docs (the block that assigns size_kb, sets doc.file_size_kb and appends to
backfill) to catch storage errors, log or record the failing doc.id and error,
skip that document (leave doc.file_size_kb as None) and continue processing the
rest; keep the existing total_size_kb calculation (which already ignores None)
so a single failure doesn't raise, and optionally add a note to record
failed_backfills for later retry/metric.


        with Session(engine) as session:
            if backfill:
                document_crud = DocumentCrud(session, project_id)
                for doc_id, size_kb in backfill:
                    doc = document_crud.read_one(doc_id)
                    doc.file_size_kb = size_kb
                    document_crud.update(doc)

            collection_job_crud = CollectionJobCrud(session, project_id)
            collection_job = collection_job_crud.read_one(job_uuid)
            collection_job = collection_job_crud.update(
                job_uuid,
                CollectionJobUpdate(
                    task_id=task_id,
                    status=CollectionJobStatus.PROCESSING,
                    total_size_mb=round(total_size_mb, 2),
                ),
            )

            provider = get_llm_provider(
                session=session,
                provider=creation_request.provider,
                project_id=project_id,
                organization_id=organization_id,
        with tracer.start_as_current_span("collections.create.provider"):
Comment on lines +225 to 268

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
python - <<'PY'
import ast
from pathlib import Path

path = Path("backend/app/services/collections/create_collection.py")
try:
    ast.parse(path.read_text(), filename=str(path))
except SyntaxError as err:
    print(f"{path}:{err.lineno}:{err.offset}: {err.msg}")
    raise SystemExit(1)

print("syntax ok")
PY

Repository: ProjectTech4DevAI/kaapi-backend

Length of output: 172


🏁 Script executed:

cat -n backend/app/services/collections/create_collection.py | sed -n '175,275p'

Repository: ProjectTech4DevAI/kaapi-backend

Length of output: 5065


Fix indentation and close the get_llm_provider() call to resolve syntax errors.

The try block starting at line 184 is incomplete—line 225 onwards should be indented within it, and the get_llm_provider() call at line 263 needs a closing parenthesis before the with tracer.start_as_current_span() statement at line 268.

Proposed structural fix
-        with Session(engine) as session:
-            document_crud = DocumentCrud(session, project_id)
-            flat_docs = document_crud.read_each(creation_request.documents)
-            storage = get_cloud_storage(session=session, project_id=project_id)
+            with Session(engine) as session:
+                document_crud = DocumentCrud(session, project_id)
+                flat_docs = document_crud.read_each(creation_request.documents)
+                storage = get_cloud_storage(session=session, project_id=project_id)

-        file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}
+            file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}

-        backfill: list[tuple[UUID, float]] = []
-        for doc in flat_docs:
-            if doc.file_size_kb is None:
-                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
-                doc.file_size_kb = size_kb
-                backfill.append((doc.id, size_kb))
+            backfill: list[tuple[UUID, float]] = []
+            for doc in flat_docs:
+                if doc.file_size_kb is None:
+                    size_kb = round(storage.get_file_size_kb(doc.object_store_url))
+                    doc.file_size_kb = size_kb
+                    backfill.append((doc.id, size_kb))

-        total_size_kb = sum(
-            doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
-        )
-        total_size_mb = total_size_kb / 1024
+            total_size_kb = sum(
+                doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
+            )
+            total_size_mb = total_size_kb / 1024

-        with Session(engine) as session:
-            if backfill:
-                document_crud = DocumentCrud(session, project_id)
-                for doc_id, size_kb in backfill:
-                    doc = document_crud.read_one(doc_id)
-                    doc.file_size_kb = size_kb
-                    document_crud.update(doc)
+            with Session(engine) as session:
+                if backfill:
+                    document_crud = DocumentCrud(session, project_id)
+                    for doc_id, size_kb in backfill:
+                        doc = document_crud.read_one(doc_id)
+                        doc.file_size_kb = size_kb
+                        document_crud.update(doc)

-            collection_job_crud = CollectionJobCrud(session, project_id)
-            collection_job = collection_job_crud.read_one(job_uuid)
-            collection_job = collection_job_crud.update(
-                job_uuid,
-                CollectionJobUpdate(
-                    task_id=task_id,
-                    status=CollectionJobStatus.PROCESSING,
-                    total_size_mb=round(total_size_mb, 2),
-                ),
-            )
+                collection_job_crud = CollectionJobCrud(session, project_id)
+                collection_job = collection_job_crud.read_one(job_uuid)
+                collection_job = collection_job_crud.update(
+                    job_uuid,
+                    CollectionJobUpdate(
+                        task_id=task_id,
+                        status=CollectionJobStatus.PROCESSING,
+                        total_size_mb=round(total_size_mb, 2),
+                    ),
+                )

-            provider = get_llm_provider(
-                session=session,
-                provider=creation_request.provider,
-                project_id=project_id,
-                organization_id=organization_id,
+                provider = get_llm_provider(
+                    session=session,
+                    provider=creation_request.provider,
+                    project_id=project_id,
+                    organization_id=organization_id,
+                )
🧰 Tools
🪛 Ruff (0.15.10)

[warning] 225-225: Expected except or finally after try block

(invalid-syntax)


[warning] 267-268: Expected ), found newline

(invalid-syntax)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/services/collections/create_collection.py` around lines 225 -
268, The code block starting after the try around the Session usage is
mis-indented and the get_llm_provider(...) call is missing its closing
parenthesis before the subsequent with tracer.start_as_current_span(...); fix by
moving the entire block (the with Session(engine) as session: block that
initializes DocumentCrud, computes backfill, updates documents and
collection_job) to be inside the try scope and add the closing ')' to the
get_llm_provider call so it becomes get_llm_provider(session=..., provider=...,
project_id=..., organization_id=...) immediately before the with
tracer.start_as_current_span("collections.create.provider") line; ensure
indentation of the following lines (provider variable use and the tracer
context) matches the surrounding try block.

result = provider.create(
collection_request=creation_request,
3 changes: 1 addition & 2 deletions backend/app/services/collections/helpers.py
@@ -19,7 +19,6 @@
MAX_DOC_SIZE_MB = 25 # 25 MB maximum per document

# Maximum batch size for uploading documents to vector store
# Derived from MAX_DOC_SIZE + buffer to ensure single docs always fit
MAX_BATCH_SIZE_KB = (MAX_DOC_SIZE_MB + 5) * 1024 # 30 MB in KB (25 + 5 MB buffer)
MAX_BATCH_COUNT = 200 # Maximum documents per batch

@@ -83,7 +82,7 @@ def batch_documents(documents: list[Document]) -> list[list[Document]]:
    current_batch_size_kb = 0

    for doc in documents:
        doc_size_kb = doc.file_size_kb or 0
        doc_size_kb = doc.file_size_kb

        would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB
        would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT
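For context, the greedy batching this function performs looks roughly like the sketch below; it is reconstructed from the visible excerpt, and everything not shown in the diff (the batch bookkeeping and the final flush) is an assumption:

def batch_documents(documents: list[Document]) -> list[list[Document]]:
    batches: list[list[Document]] = []
    current_batch: list[Document] = []
    current_batch_size_kb = 0.0

    for doc in documents:
        # With the `or 0` fallback removed, a None size now raises a TypeError
        # in the size check below instead of silently counting as 0 KB.
        doc_size_kb = doc.file_size_kb

        would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB
        would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT

        # Close the current batch before adding a doc that would break either limit.
        if current_batch and (would_exceed_size or would_exceed_count):
            batches.append(current_batch)
            current_batch = []
            current_batch_size_kb = 0.0

        current_batch.append(doc)
        current_batch_size_kb += doc_size_kb

    if current_batch:
        batches.append(current_batch)

    return batches

MAX_BATCH_SIZE_KB, MAX_BATCH_COUNT, and Document are the module-level names shown earlier in this file's diff.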
10 changes: 4 additions & 6 deletions backend/app/tests/services/collections/test_helpers.py
@@ -122,14 +122,12 @@ def test_batch_documents_mixed_size_batching() -> None:
    assert len(batches[2]) == 1  # 15 MB total


def test_batch_documents_with_none_file_size() -> None:
    """Test that documents with None file_size are treated as 0 bytes."""
def test_batch_documents_with_none_file_size_raises() -> None:
    """Test that documents with None file_size raise TypeError — sizes must be backfilled before batching."""
    docs = create_fake_documents(10, file_size_kb=None)
    batches = helpers.batch_documents(docs)

    # All files with None/0 size should fit in one batch (under both limits)
    assert len(batches) == 1
    assert len(batches[0]) == 10
    with pytest.raises(TypeError):
        helpers.batch_documents(docs)


def test_batch_documents_empty_input() -> None: