From 614801ccd8adb843c12e0617ac8a9931a391c439 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:06:57 +0530 Subject: [PATCH 01/20] collections: add dynamic batching --- ...columns_to_collection_job_and_documents.py | 53 ++++++++++++++++ backend/app/api/routes/collections.py | 14 +++++ backend/app/api/routes/documents.py | 4 ++ backend/app/crud/rag/open_ai.py | 24 ++----- backend/app/models/collection_job.py | 15 +++++ backend/app/models/document.py | 10 ++- .../services/collections/create_collection.py | 11 ++-- backend/app/services/collections/helpers.py | 62 ++++++++++++++----- .../services/collections/providers/openai.py | 7 +-- backend/app/services/documents/helpers.py | 24 ++++++- 10 files changed, 176 insertions(+), 48 deletions(-) create mode 100644 backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py new file mode 100644 index 000000000..c1008a3cb --- /dev/null +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -0,0 +1,53 @@ +"""add columns to collection job and documents table + +Revision ID: 050 +Revises: 049 +Create Date: 2026-03-25 10:09:47.318575 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes + + +# revision identifiers, used by Alembic. +revision = "050" +down_revision = "049" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "collection_jobs", + sa.Column( + "docs_num", + sa.Integer(), + nullable=True, + comment="Total number of documents to be processed in this job", + ), + ) + op.add_column( + "collection_jobs", + sa.Column( + "total_size", + sa.Integer(), + nullable=True, + comment="Total size of documents being uploaded to collection", + ), + ) + op.add_column( + "document", + sa.Column( + "file_size", + sa.Integer(), + nullable=True, + comment="Size of the document in bytes", + ), + ) + + +def downgrade(): + op.drop_column("document", "file_size") + op.drop_column("collection_jobs", "total_size") + op.drop_column("collection_jobs", "docs_num") diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 558e4d867..b4a03df1f 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -11,6 +11,7 @@ CollectionCrud, CollectionJobCrud, DocumentCollectionCrud, + DocumentCrud, ) from app.core.cloud import get_cloud_storage from app.models import ( @@ -95,12 +96,25 @@ def create_collection( if request.name: ensure_unique_name(session, current_user.project_.id, request.name) + # Calculate total size of all documents + document_crud = DocumentCrud(session, current_user.project_.id) + total_size = 0 + for doc_id in request.documents: + doc = document_crud.read_one(doc_id) + total_size += doc.file_size or 0 + + logger.info( + f"[create_collection] Calculated total size | {{'total_documents': {len(request.documents)}, 'total_size_bytes': {total_size}, 'total_size_mb': {round(total_size / (1024 * 1024), 2)}}}" + ) + collection_job_crud = CollectionJobCrud(session, current_user.project_.id) collection_job = collection_job_crud.create( CollectionJobCreate( action_type=CollectionActionType.CREATE, project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, + num_docs=len(request.documents), + total_size=total_size, ) ) diff --git a/backend/app/api/routes/documents.py
b/backend/app/api/routes/documents.py index 69c5b4895..c8e17f503 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -29,6 +29,7 @@ from app.core.cloud import get_cloud_storage from app.services.collections.helpers import pick_service_for_documennt from app.services.documents.helpers import ( + calculate_file_size, schedule_transformation, pre_transform_validation, build_document_schema, @@ -129,6 +130,8 @@ async def upload_doc( transformer=transformer, ) + file_size = await calculate_file_size(src) + storage = get_cloud_storage(session=session, project_id=current_user.project_.id) document_id = uuid4() object_store_url = storage.put(src, Path(str(document_id))) @@ -137,6 +140,7 @@ async def upload_doc( document = Document( id=document_id, fname=src.filename, + file_size=file_size, object_store_url=str(object_store_url), ) source_document = crud.update(document) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index cdb644abc..20620b9e2 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -143,24 +143,12 @@ def update( f"[OpenAIVectorStoreCrud.update] File upload completed | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}" ) if req.file_counts.completed != req.file_counts.total: - view = {x.fname: x for x in docs} - for i in self.read(vector_store_id): - if i.last_error is None: - fname = self.client.files.retrieve(i.id) - view.pop(fname) - - error = { - "error": "OpenAI document processing error", - "documents": list(view.values()), - } - try: - raise InterruptedError(json.dumps(error, cls=BaseModelEncoder)) - except InterruptedError as err: - logger.error( - f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'error': '{error['error']}', 'failed_documents': {len(error['documents'])}}}", - exc_info=True, - ) - raise + error_msg = f"OpenAI document processing error: {req.file_counts.completed}/{req.file_counts.total} files completed" + logger.error( + f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}", + exc_info=True, + ) + raise InterruptedError(error_msg) while files: f_obj = files.pop() diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index 7c55e8562..b0ec21249 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -53,6 +53,18 @@ class CollectionJob(SQLModel, table=True): description="Tracing ID for correlating logs and traces.", sa_column_kwargs={"comment": "Tracing ID for correlating logs and traces"}, ) + docs_num: int | None = Field( + description="Total number of documents to be processed in this job", + sa_column_kwargs={ + "comment": "Total number of documents to be processed in this job" + }, + ) + total_size: int | None = Field( + description="Total size of documents being uploaded to collection", + sa_column_kwargs={ + "comment": "Total size of documents being uploaded to collection" + }, + ) error_message: str | None = Field( default=None, sa_column=Column( @@ -106,6 +118,9 @@ class CollectionJobCreate(SQLModel): collection_id: UUID | None = None status: CollectionJobStatus action_type: CollectionActionType + batch_size: int | None = None + num_docs: int | None = None + total_size: int | None = None project_id: int diff 
--git a/backend/app/models/document.py b/backend/app/models/document.py index bffa7b39c..3fbd71da2 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -1,6 +1,8 @@ from datetime import datetime +from typing import Any from uuid import UUID, uuid4 +from pydantic import model_serializer from sqlmodel import Field, SQLModel from app.core.util import now @@ -41,6 +43,11 @@ class Document(DocumentBase, table=True): default=False, sa_column_kwargs={"comment": "Soft delete flag"}, ) + file_size: int | None = Field( + default=None, + description="The size of the document in bytes", + sa_column_kwargs={"comment": "Size of the document in bytes"}, + ) # Foreign keys source_document_id: UUID | None = Field( @@ -80,9 +87,6 @@ class DocumentPublic(DocumentBase): updated_at: datetime = Field( description="The timestamp when the document was last updated" ) - signed_url: str | None = Field( - default=None, description="A signed URL for accessing the document" - ) class TransformedDocumentPublic(DocumentPublic): diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index dd4016616..3f7b98537 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -203,9 +203,9 @@ def execute_job( flat_docs = document_crud.read_each(creation_request.documents) file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - file_sizes_kb = [ - storage.get_file_size_kb(doc.object_store_url) for doc in flat_docs - ] + file_sizes_bytes = [doc.file_size or 0 for doc in flat_docs] + total_size_bytes = sum(file_sizes_bytes) + total_size_mb = round(total_size_bytes / (1024 * 1024), 2) with Session(engine) as session: collection_crud = CollectionCrud(session, project_id) @@ -240,11 +240,12 @@ def execute_job( elapsed = time.time() - start_time logger.info( - "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Sizes: %s KB | Types: %s", + "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Total Size: %s MB (%s bytes) | Types: %s", collection_id, elapsed, len(flat_docs), - file_sizes_kb, + total_size_mb, + total_size_bytes, list(file_exts), ) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 6275ee40d..412eb332d 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -55,25 +55,57 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents( - document_crud: DocumentCrud, documents: List[UUID], batch_size: int -): - """Batch document IDs into chunks of size `batch_size`, load each via `DocumentCrud.read_each`, - and return a list of document batches.""" +def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): + """ + Batch documents dynamically based on size and count limits. 
+ + Creates a new batch when either: + - Total size reaches 30 MB (31,457,280 bytes) + - Document count reaches 200 + + Returns: + List of document batches + """ + + MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024 # 30 MB in bytes + MAX_BATCH_COUNT = 200 # Maximum documents per batch logger.info( - f"[batch_documents] Starting batch iteration for documents | {{'batch_size': {batch_size}, 'total_documents': {len(documents)}}}" + f"[batch_documents] Starting dynamic batch iteration | {{'total_documents': {len(documents)}, 'max_batch_size': '30 MB', 'max_batch_count': {MAX_BATCH_COUNT}}}" ) + docs_batches = [] - start, stop = 0, batch_size - while True: - view = documents[start:stop] - if not view: - break - batch_docs = document_crud.read_each(view) - docs_batches.append(batch_docs) - start = stop - stop += batch_size + current_batch = [] + current_batch_size = 0 + + for doc_id in documents: + doc = document_crud.read_one(doc_id) + doc_size = doc.file_size or 0 # file_size is in bytes + + would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES + would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT + + if current_batch and (would_exceed_size or would_exceed_count): + logger.info( + f"[batch_documents] Batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" + ) + docs_batches.append(current_batch) + current_batch = [] + current_batch_size = 0 + + current_batch.append(doc) + current_batch_size += doc_size + + if current_batch: + logger.info( + f"[batch_documents] Final batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" + ) + docs_batches.append(current_batch) + + logger.info( + f"[batch_documents] Batching complete | {{'total_batches': {len(docs_batches)}, 'total_documents': {len(documents)}}}" + ) + return docs_batches diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index bdab86d3a..1a130314a 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -31,12 +31,7 @@ def create( """ try: # Use user-provided batch_size, default to 10 if not set - batch_size = collection_request.batch_size or 10 - docs_batches = batch_documents( - document_crud, - collection_request.documents, - batch_size, - ) + docs_batches = batch_documents(document_crud, collection_request.documents) vector_store_crud = OpenAIVectorStoreCrud(self.client) vector_store = vector_store_crud.create() diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index cd941eb55..578380a5a 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -1,7 +1,7 @@ from typing import Optional, Tuple, Iterable, Union from uuid import UUID -from fastapi import HTTPException +from fastapi import HTTPException, UploadFile from app.services.doctransform.registry import ( get_available_transformers, @@ -23,6 +23,28 @@ ) +async def calculate_file_size(file: UploadFile) -> int: + """ + Calculate the size of an uploaded file in bytes. 
+ + Args: + file: The uploaded file from FastAPI + + Returns: + The size of the file in bytes + """ + if file.size: + return file.size + + # If size is not available, calculate by reading the file + await file.seek(0) + content = await file.read() + size_bytes = len(content) + await file.seek(0) # Reset to beginning for subsequent operations + + return size_bytes + + def pre_transform_validation( *, src_filename: str, From f8f9c2f290bd3e3cb42315a1f07f2c09295051eb Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:35:58 +0530 Subject: [PATCH 02/20] coderabbit reviews --- ...0_add_columns_to_collection_job_and_documents.py | 1 - backend/app/api/docs/collections/create.md | 7 ++++--- backend/app/api/routes/collections.py | 2 +- backend/app/models/collection.py | 1 + backend/app/models/collection_job.py | 5 +++-- .../app/services/collections/create_collection.py | 3 +-- backend/app/services/collections/helpers.py | 13 ++++--------- .../app/services/collections/providers/openai.py | 1 - .../routes/collections/test_create_collections.py | 3 --- .../collections/providers/test_openai_provider.py | 3 --- .../services/collections/test_create_collection.py | 10 ++-------- 11 files changed, 16 insertions(+), 33 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py index c1008a3cb..2925e0fa6 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -7,7 +7,6 @@ """ from alembic import op import sqlalchemy as sa -import sqlmodel.sql.sqltypes # revision identifiers, used by Alembic. diff --git a/backend/app/api/docs/collections/create.md b/backend/app/api/docs/collections/create.md index cc85ad36a..853f6d78e 100644 --- a/backend/app/api/docs/collections/create.md +++ b/backend/app/api/docs/collections/create.md @@ -3,9 +3,10 @@ pipeline: * Create a vector store from the document IDs you received after uploading your documents through the Documents module. -* The `batch_size` parameter controls how many documents are sent to OpenAI in a - single transaction when creating the vector store. This helps optimize the upload - process for large document sets. If not specified, the default value is **10**. +* Documents are automatically batched when creating the vector store to optimize + the upload process for large document sets. A new batch is created when either + the cumulative size reaches configured total size of documents given to upload to a vector store + or the document count reaches specific number of files in a btch, whichever limit is hit first. * [Deprecated] Attach the Vector Store to an OpenAI [Assistant](https://platform.openai.com/docs/api-reference/assistants). 
Use parameters in the request body relevant to an Assistant to flesh out diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index b4a03df1f..004251d6c 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -113,7 +113,7 @@ def create_collection( action_type=CollectionActionType.CREATE, project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, - num_docs=len(request.documents), + docs_num=len(request.documents), total_size=total_size, ) ) diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index f8b545404..0b00639d8 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -105,6 +105,7 @@ class CollectionOptions(SQLModel): batch_size: int = Field( default=10, description=( + "**[Deprecated]** " "Number of documents to send to OpenAI in a single " "transaction. See the `file_ids` parameter in the " "vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)." diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index b0ec21249..a4e857a1a 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -54,12 +54,14 @@ class CollectionJob(SQLModel, table=True): sa_column_kwargs={"comment": "Tracing ID for correlating logs and traces"}, ) docs_num: int | None = Field( + default=None, description="Total number of documents to be processed in this job", sa_column_kwargs={ "comment": "Total number of documents to be processed in this job" }, ) total_size: int | None = Field( + default=None, description="Total size of documents being uploaded to collection", sa_column_kwargs={ "comment": "Total size of documents being uploaded to collection" @@ -118,8 +120,7 @@ class CollectionJobCreate(SQLModel): collection_id: UUID | None = None status: CollectionJobStatus action_type: CollectionActionType - batch_size: int | None = None - num_docs: int | None = None + docs_num: int | None = None total_size: int | None = None project_id: int diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 3f7b98537..35b03da80 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -203,8 +203,7 @@ def execute_job( flat_docs = document_crud.read_each(creation_request.documents) file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - file_sizes_bytes = [doc.file_size or 0 for doc in flat_docs] - total_size_bytes = sum(file_sizes_bytes) + total_size_bytes = collection_job.total_size or 0 total_size_mb = round(total_size_bytes / (1024 * 1024), 2) with Session(engine) as session: diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 412eb332d..a242f7b8c 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -55,7 +55,9 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): +def batch_documents( + document_crud: DocumentCrud, documents: List[UUID] +) -> List[List[Document]]: """ Batch documents dynamically based on size and count limits. 
@@ -70,17 +72,13 @@ def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024 # 30 MB in bytes MAX_BATCH_COUNT = 200 # Maximum documents per batch - logger.info( - f"[batch_documents] Starting dynamic batch iteration | {{'total_documents': {len(documents)}, 'max_batch_size': '30 MB', 'max_batch_count': {MAX_BATCH_COUNT}}}" - ) - docs_batches = [] current_batch = [] current_batch_size = 0 for doc_id in documents: doc = document_crud.read_one(doc_id) - doc_size = doc.file_size or 0 # file_size is in bytes + doc_size = doc.file_size or 0 would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT @@ -97,9 +95,6 @@ def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): current_batch_size += doc_size if current_batch: - logger.info( - f"[batch_documents] Final batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" - ) docs_batches.append(current_batch) logger.info( diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index 1a130314a..558baf295 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -30,7 +30,6 @@ def create( Create OpenAI vector store with documents and optionally an assistant. """ try: - # Use user-provided batch_size, default to 10 if not set docs_batches = batch_documents(document_crud, collection_request.documents) vector_store_crud = OpenAIVectorStoreCrud(self.client) diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index 220cb4ee8..65e40f85f 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -26,7 +26,6 @@ def test_collection_creation_with_assistant_calls_start_job_and_returns_job( instructions="string", temperature=0.000001, documents=[UUID("f3e86a17-1e6f-41ec-b020-5b08eebef928")], - batch_size=10, callback_url=None, ) @@ -71,7 +70,6 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f creation_data = CreationRequest( temperature=0.000001, documents=[str(uuid4())], - batch_size=10, callback_url=None, ) @@ -109,7 +107,6 @@ def test_collection_creation_vector_only_request_validation_error( "model": "gpt-4o", "temperature": 0.000001, "documents": [str(uuid4())], - "batch_size": 10, "callback_url": None, } diff --git a/backend/app/tests/services/collections/providers/test_openai_provider.py b/backend/app/tests/services/collections/providers/test_openai_provider.py index 8ae028a0a..25f664146 100644 --- a/backend/app/tests/services/collections/providers/test_openai_provider.py +++ b/backend/app/tests/services/collections/providers/test_openai_provider.py @@ -18,7 +18,6 @@ def test_create_openai_vector_store_only() -> None: collection_request = SimpleNamespace( documents=["doc1", "doc2"], - batch_size=10, model=None, instructions=None, temperature=None, @@ -57,7 +56,6 @@ def test_create_openai_with_assistant() -> None: collection_request = SimpleNamespace( documents=["doc1"], - batch_size=10, model="gpt-4o", instructions="You are helpful", temperature=0.7, @@ -138,7 +136,6 @@ def test_create_propagates_exception() -> 
None: collection_request = SimpleNamespace( documents=["doc1"], - batch_size=10, model=None, instructions=None, temperature=None, diff --git a/backend/app/tests/services/collections/test_create_collection.py b/backend/app/tests/services/collections/test_create_collection.py index b6913a81f..f7479aa09 100644 --- a/backend/app/tests/services/collections/test_create_collection.py +++ b/backend/app/tests/services/collections/test_create_collection.py @@ -58,7 +58,6 @@ def test_start_job_creates_collection_job_and_schedules_task(db: Session) -> Non project = get_project(db) request = CreationRequest( documents=[UUID("f3e86a17-1e6f-41ec-b020-5b08eebef928")], - batch_size=10, callback_url=None, provider="openai", ) @@ -137,7 +136,7 @@ def test_execute_job_success_flow_updates_job_and_creates_collection( aws.client.put_object(Bucket=settings.AWS_S3_BUCKET, Key=str(s3_key), Body=b"test") sample_request = CreationRequest( - documents=[document.id], batch_size=10, callback_url=None, provider="openai" + documents=[document.id], callback_url=None, provider="openai" ) mock_get_llm_provider.return_value = get_mock_provider( @@ -204,9 +203,7 @@ def test_execute_job_assistant_create_failure_marks_failed_and_deletes_collectio collection_id=None, ) - req = CreationRequest( - documents=[], batch_size=10, callback_url=None, provider="openai" - ) + req = CreationRequest(documents=[], callback_url=None, provider="openai") mock_provider = get_mock_provider( llm_service_id="vs_123", llm_service_name="openai vector store" @@ -269,7 +266,6 @@ def test_execute_job_success_flow_callback_job_and_creates_collection( sample_request = CreationRequest( documents=[document.id], - batch_size=10, callback_url=callback_url, provider="openai", ) @@ -350,7 +346,6 @@ def test_execute_job_success_creates_collection_with_callback( sample_request = CreationRequest( documents=[document.id], - batch_size=10, callback_url=callback_url, provider="openai", ) @@ -434,7 +429,6 @@ def test_execute_job_failure_flow_callback_job_and_marks_failed( sample_request = CreationRequest( documents=[uuid.uuid4()], - batch_size=10, callback_url=callback_url, provider="openai", ) From f9b336525c3c11e751d2784284edf1196a27331b Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:38:25 +0530 Subject: [PATCH 03/20] coderabbit reviews --- backend/app/crud/rag/open_ai.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index 20620b9e2..77c0bbdc1 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -145,8 +145,7 @@ def update( if req.file_counts.completed != req.file_counts.total: error_msg = f"OpenAI document processing error: {req.file_counts.completed}/{req.file_counts.total} files completed" logger.error( - f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}", - exc_info=True, + f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}" ) raise InterruptedError(error_msg) From a074e36acd2feec78ffdc3465c2e6adad7a03069 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:42:24 +0530 Subject: [PATCH 04/20] coderabbit reviews --- backend/app/api/docs/collections/create.md | 2 +- backend/app/services/collections/helpers.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/backend/app/api/docs/collections/create.md b/backend/app/api/docs/collections/create.md index 853f6d78e..a47383f21 100644 --- a/backend/app/api/docs/collections/create.md +++ b/backend/app/api/docs/collections/create.md @@ -6,7 +6,7 @@ pipeline: * Documents are automatically batched when creating the vector store to optimize the upload process for large document sets. A new batch is created when either the cumulative size reaches configured total size of documents given to upload to a vector store - or the document count reaches specific number of files in a btch, whichever limit is hit first. + or the document count reaches specific number of files in a batch, whichever limit is hit first. * [Deprecated] Attach the Vector Store to an OpenAI [Assistant](https://platform.openai.com/docs/api-reference/assistants). Use parameters in the request body relevant to an Assistant to flesh out diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index a242f7b8c..1b61900b2 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -10,7 +10,7 @@ from app.crud import DocumentCrud, CollectionCrud from app.api.deps import SessionDep -from app.models import DocumentCollection, Collection, CollectionPublic +from app.models import DocumentCollection, Collection, CollectionPublic, Document logger = logging.getLogger(__name__) From 1106abc3040f4b4a7aafe3a2a0188e9c0d2224e5 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 15:56:53 +0530 Subject: [PATCH 05/20] fixing test cases --- .../collections/test_create_collections.py | 42 +++++++- .../services/collections/test_helpers.py | 99 ++++++++++++++----- 2 files changed, 111 insertions(+), 30 deletions(-) diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index 65e40f85f..189a79ccb 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -3,10 +3,11 @@ from typing import Any from fastapi.testclient import TestClient +from sqlmodel import Session from app.core.config import settings from app.tests.utils.auth import TestAuthContext -from app.models import CollectionJobStatus +from app.models import CollectionJobStatus, Document from app.models.collection import CreationRequest @@ -14,18 +15,39 @@ def _extract_metadata(body: dict) -> dict | None: return body.get("metadata") or body.get("meta") +def _create_test_document( + db: Session, project_id: int, file_size: int = 1024 +) -> Document: + """Helper to create a test document.""" + doc = Document( + id=uuid4(), + fname="test_document.txt", + object_store_url="s3://test-bucket/test_document.txt", + project_id=project_id, + file_size=file_size, + ) + db.add(doc) + db.commit() + db.refresh(doc) + return doc + + @patch("app.api.routes.collections.create_service.start_job") def test_collection_creation_with_assistant_calls_start_job_and_returns_job( mock_start_job: Any, client: TestClient, user_api_key_header: dict[str, str], user_api_key: TestAuthContext, + db: Session, ) -> None: + # Create a test document in the database + doc = _create_test_document(db, user_api_key.project_id, file_size=2048) + creation_data = CreationRequest( model="gpt-4o", instructions="string", temperature=0.000001, - documents=[UUID("f3e86a17-1e6f-41ec-b020-5b08eebef928")], + 
documents=[doc.id], callback_url=None, ) @@ -66,10 +88,14 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f client: TestClient, user_api_key_header: dict[str, str], user_api_key: TestAuthContext, + db: Session, ) -> None: + # Create a test document in the database + doc = _create_test_document(db, user_api_key.project_id, file_size=5120) + creation_data = CreationRequest( temperature=0.000001, - documents=[str(uuid4())], + documents=[doc.id], callback_url=None, ) @@ -101,12 +127,18 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f def test_collection_creation_vector_only_request_validation_error( - client: TestClient, user_api_key_header: dict[str, str] + client: TestClient, + user_api_key_header: dict[str, str], + user_api_key: TestAuthContext, + db: Session, ) -> None: + # Create a test document in the database + doc = _create_test_document(db, user_api_key.project_id) + payload = { "model": "gpt-4o", "temperature": 0.000001, - "documents": [str(uuid4())], + "documents": [str(doc.id)], "callback_url": None, } diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index f53271f18..02e64a4c6 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -46,46 +46,95 @@ def test_extract_error_message_handles_non_matching_bodies() -> None: class FakeDocumentCrud: - def __init__(self): + def __init__(self, file_size_per_doc=1024): + """ + Args: + file_size_per_doc: Size in bytes for each fake document (default 1 KB) + """ self.calls = [] - - def read_each(self, ids): - self.calls.append(list(ids)) - return [ - SimpleNamespace( - id=i, fname=f"{i}.txt", object_store_url=f"s3://bucket/{i}.txt" + self.file_size_per_doc = file_size_per_doc + self.documents = {} + + def read_one(self, doc_id): + """Simulate reading a single document by ID.""" + if doc_id not in self.documents: + self.documents[doc_id] = SimpleNamespace( + id=doc_id, + fname=f"{doc_id}.txt", + object_store_url=f"s3://bucket/{doc_id}.txt", + file_size=self.file_size_per_doc, ) - for i in ids - ] + self.calls.append(doc_id) + return self.documents[doc_id] -def test_batch_documents_even_chunks() -> None: - crud = FakeDocumentCrud() +def test_batch_documents_small_files_single_batch() -> None: + """Test that small files all fit in one batch (under 30 MB and under 200 docs).""" + crud = FakeDocumentCrud(file_size_per_doc=1024) # 1 KB per file ids = [uuid4() for _ in range(6)] - batches = helpers.batch_documents(crud, ids, batch_size=3) + batches = helpers.batch_documents(crud, ids) + + # All 6 small files should fit in one batch + assert len(batches) == 1 + assert len(batches[0]) == 6 + assert [d.id for d in batches[0]] == ids + + +def test_batch_documents_size_based_batching() -> None: + """Test that large files trigger size-based batching (30 MB limit).""" + # Each file is 20 MB, so max 1 file per batch (since 2 * 20 MB > 30 MB) + crud = FakeDocumentCrud(file_size_per_doc=20 * 1024 * 1024) + ids = [uuid4() for _ in range(3)] + batches = helpers.batch_documents(crud, ids) + + # Should create 3 batches, one for each 20 MB file + assert len(batches) == 3 + assert len(batches[0]) == 1 + assert len(batches[1]) == 1 + assert len(batches[2]) == 1 - # read_each called with chunks [0:3], [3:6] - assert crud.calls == [ids[0:3], ids[3:6]] - # output mirrors what read_each returned + +def test_batch_documents_count_based_batching() -> None: + 
"""Test that document count triggers batching (200 docs limit).""" + crud = FakeDocumentCrud(file_size_per_doc=100) # Small files + ids = [uuid4() for _ in range(250)] + batches = helpers.batch_documents(crud, ids) + + # Should create 2 batches: 200 + 50 assert len(batches) == 2 - assert [d.id for d in batches[0]] == ids[0:3] - assert [d.id for d in batches[1]] == ids[3:6] + assert len(batches[0]) == 200 + assert len(batches[1]) == 50 -def test_batch_documents_ragged_last_chunk() -> None: - crud = FakeDocumentCrud() +def test_batch_documents_mixed_size_batching() -> None: + """Test batching with files that fit multiple per batch but hit 30 MB limit.""" + # Each file is 15 MB, so 2 files = 30 MB (at limit), 3 files > 30 MB + crud = FakeDocumentCrud(file_size_per_doc=15 * 1024 * 1024) ids = [uuid4() for _ in range(5)] - batches = helpers.batch_documents(crud, ids, batch_size=2) + batches = helpers.batch_documents(crud, ids) + + # Should create 3 batches: [2 files, 2 files, 1 file] + assert len(batches) == 3 + assert len(batches[0]) == 2 # 30 MB total + assert len(batches[1]) == 2 # 30 MB total + assert len(batches[2]) == 1 # 15 MB total + + +def test_batch_documents_with_none_file_size() -> None: + """Test that documents with None file_size are treated as 0 bytes.""" + crud = FakeDocumentCrud(file_size_per_doc=None) + ids = [uuid4() for _ in range(10)] + batches = helpers.batch_documents(crud, ids) - assert crud.calls == [ids[0:2], ids[2:4], ids[4:5]] - assert [d.id for d in batches[0]] == ids[0:2] - assert [d.id for d in batches[1]] == ids[2:4] - assert [d.id for d in batches[2]] == ids[4:5] + # All files with None/0 size should fit in one batch (under both limits) + assert len(batches) == 1 + assert len(batches[0]) == 10 def test_batch_documents_empty_input() -> None: + """Test that empty input returns empty batches.""" crud = FakeDocumentCrud() - batches = helpers.batch_documents(crud, [], batch_size=3) + batches = helpers.batch_documents(crud, []) assert batches == [] assert crud.calls == [] From ff0726f7d1cafaa4e27c5202bd63c86bca353cf1 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:09:42 +0530 Subject: [PATCH 06/20] pr review fixes --- ...columns_to_collection_job_and_documents.py | 8 +- backend/app/api/docs/collections/create.md | 4 +- backend/app/api/routes/collections.py | 12 --- backend/app/api/routes/documents.py | 4 +- backend/app/models/collection_job.py | 8 +- backend/app/models/document.py | 8 +- .../services/collections/create_collection.py | 25 +++--- backend/app/services/collections/helpers.py | 39 ++++++---- .../services/collections/providers/base.py | 15 ++-- .../services/collections/providers/openai.py | 8 +- backend/app/services/documents/helpers.py | 18 ++--- .../services/collections/test_helpers.py | 77 +++++++++---------- 12 files changed, 101 insertions(+), 125 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py index 2925e0fa6..642c5445b 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -29,7 +29,7 @@ def upgrade(): op.add_column( "collection_jobs", sa.Column( - "total_size", + "total_size_mb", sa.Integer(), nullable=True, comment="Total size of documents being uploaded to collection", @@ -38,7 +38,7 @@ def upgrade(): op.add_column( "document", sa.Column( - "file_size", + 
"file_size_kb", sa.Integer(), nullable=True, comment="Size of the document in bytes", @@ -47,6 +47,6 @@ def upgrade(): def downgrade(): - op.drop_column("document", "file_size") - op.drop_column("collection_jobs", "total_size") + op.drop_column("document", "file_size_kb") + op.drop_column("collection_jobs", "total_size_mb") op.drop_column("collection_jobs", "docs_num") diff --git a/backend/app/api/docs/collections/create.md b/backend/app/api/docs/collections/create.md index a47383f21..bc94e7c45 100644 --- a/backend/app/api/docs/collections/create.md +++ b/backend/app/api/docs/collections/create.md @@ -5,8 +5,8 @@ pipeline: documents through the Documents module. * Documents are automatically batched when creating the vector store to optimize the upload process for large document sets. A new batch is created when either - the cumulative size reaches configured total size of documents given to upload to a vector store - or the document count reaches specific number of files in a batch, whichever limit is hit first. + the cumulative size reaches 30 MB of documents given to upload to a vector store + or the document count reaches 200 files in a batch, whichever limit is hit first. * [Deprecated] Attach the Vector Store to an OpenAI [Assistant](https://platform.openai.com/docs/api-reference/assistants). Use parameters in the request body relevant to an Assistant to flesh out diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 004251d6c..2aee46032 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -96,17 +96,6 @@ def create_collection( if request.name: ensure_unique_name(session, current_user.project_.id, request.name) - # Calculate total size of all documents - document_crud = DocumentCrud(session, current_user.project_.id) - total_size = 0 - for doc_id in request.documents: - doc = document_crud.read_one(doc_id) - total_size += doc.file_size or 0 - - logger.info( - f"[create_collection] Calculated total size | {{'total_documents': {len(request.documents)}, 'total_size_bytes': {total_size}, 'total_size_mb': {round(total_size / (1024 * 1024), 2)}}}" - ) - collection_job_crud = CollectionJobCrud(session, current_user.project_.id) collection_job = collection_job_crud.create( CollectionJobCreate( @@ -114,7 +103,6 @@ def create_collection( project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, docs_num=len(request.documents), - total_size=total_size, ) ) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index c8e17f503..7d32e417f 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -130,7 +130,7 @@ async def upload_doc( transformer=transformer, ) - file_size = await calculate_file_size(src) + file_size_kb = await calculate_file_size(src) storage = get_cloud_storage(session=session, project_id=current_user.project_.id) document_id = uuid4() @@ -140,7 +140,7 @@ async def upload_doc( document = Document( id=document_id, fname=src.filename, - file_size=file_size, + file_size_kb=file_size_kb, object_store_url=str(object_store_url), ) source_document = crud.update(document) diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index a4e857a1a..6caa7fd75 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -60,11 +60,11 @@ class CollectionJob(SQLModel, table=True): "comment": "Total number of documents to be processed in this job" }, ) - 
total_size: int | None = Field( + total_size_mb: float | None = Field( default=None, - description="Total size of documents being uploaded to collection", + description="Total size of documents being uploaded to collection in MB", sa_column_kwargs={ - "comment": "Total size of documents being uploaded to collection" + "comment": "Total size of documents being uploaded to collection in MB" }, ) error_message: str | None = Field( @@ -121,7 +121,6 @@ class CollectionJobCreate(SQLModel): status: CollectionJobStatus action_type: CollectionActionType docs_num: int | None = None - total_size: int | None = None project_id: int @@ -130,6 +129,7 @@ class CollectionJobUpdate(SQLModel): status: CollectionJobStatus | None = None error_message: str | None = None collection_id: UUID | None = None + total_size_mb: float | None = None trace_id: str | None = None diff --git a/backend/app/models/document.py b/backend/app/models/document.py index 3fbd71da2..12843e72a 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -1,8 +1,6 @@ from datetime import datetime -from typing import Any from uuid import UUID, uuid4 -from pydantic import model_serializer from sqlmodel import Field, SQLModel from app.core.util import now @@ -43,10 +41,10 @@ class Document(DocumentBase, table=True): default=False, sa_column_kwargs={"comment": "Soft delete flag"}, ) - file_size: int | None = Field( + file_size_kb: float | None = Field( default=None, - description="The size of the document in bytes", - sa_column_kwargs={"comment": "Size of the document in bytes"}, + description="The size of the document in kilobytes", + sa_column_kwargs={"comment": "Size of the document in kilobytes (KB)"}, ) # Foreign keys diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 35b03da80..c86d72d4d 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -168,6 +168,14 @@ def execute_job( job_uuid = UUID(job_id) + with Session(engine) as session: + document_crud = DocumentCrud(session, project_id) + flat_docs = document_crud.read_each(creation_request.documents) + + file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} + total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) + total_size_mb = total_size_kb / 1024 + with Session(engine) as session: collection_job_crud = CollectionJobCrud(session, project_id) collection_job = collection_job_crud.read_one(job_uuid) @@ -176,11 +184,11 @@ def execute_job( CollectionJobUpdate( task_id=task_id, status=CollectionJobStatus.PROCESSING, + total_size_mb=total_size_mb, ), ) storage = get_cloud_storage(session=session, project_id=project_id) - document_crud = DocumentCrud(session, project_id) provider = get_llm_provider( session=session, @@ -192,20 +200,12 @@ def execute_job( result = provider.create( collection_request=creation_request, storage=storage, - document_crud=document_crud, + documents=flat_docs, ) llm_service_id = result.llm_service_id llm_service_name = result.llm_service_name - with Session(engine) as session: - document_crud = DocumentCrud(session, project_id) - flat_docs = document_crud.read_each(creation_request.documents) - - file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." 
in doc.fname} + total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) + total_size_mb = total_size_kb / 1024 + with Session(engine) as session: collection_job_crud = CollectionJobCrud(session, project_id) collection_job = collection_job_crud.read_one(job_uuid) @@ -176,11 +184,11 @@ def execute_job( CollectionJobUpdate( task_id=task_id, status=CollectionJobStatus.PROCESSING, + total_size_mb=total_size_mb, ), ) storage = get_cloud_storage(session=session, project_id=project_id) - document_crud = DocumentCrud(session, project_id) provider = get_llm_provider( session=session, @@ -192,20 +200,12 @@ def execute_job( result = provider.create( collection_request=creation_request, storage=storage, - document_crud=document_crud, + documents=flat_docs, ) llm_service_id = result.llm_service_id llm_service_name = result.llm_service_name - with Session(engine) as session: - document_crud = DocumentCrud(session, project_id) - flat_docs = document_crud.read_each(creation_request.documents) - - file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - total_size_bytes = collection_job.total_size or 0 - total_size_mb = round(total_size_bytes / (1024 * 1024), 2) - with Session(engine) as session: collection_crud = CollectionCrud(session, project_id) @@ -239,12 +239,11 @@ def execute_job( elapsed = time.time() - start_time logger.info( - "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Total Size: %s MB (%s bytes) | Types: %s", + "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Total Size: %s MB | Types: %s", collection_id, elapsed, len(flat_docs), - total_size_mb, - total_size_bytes, + collection_job.total_size_mb, list(file_exts), ) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 1b61900b2..9fd35e2ed 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -8,13 +8,18 @@ from fastapi import HTTPException from sqlmodel import select -from app.crud import DocumentCrud, CollectionCrud +from app.crud import CollectionCrud from app.api.deps import SessionDep from app.models import DocumentCollection, Collection, CollectionPublic, Document logger = logging.getLogger(__name__) +# Constants for dynamic batching of documents +# uploaded to the OpenAI vector store +MAX_BATCH_SIZE_KB = 30 * 1024  # 30 MB in KB +MAX_BATCH_COUNT = 200 + def get_service_name(provider: str) -> str: """Get the collection service name for a provider.""" @@ -55,47 +60,47 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents( - document_crud: DocumentCrud, documents: List[UUID] -) -> List[List[Document]]: +def batch_documents(documents: List[Document]) -> List[List[Document]]: """ Batch documents dynamically based on size and count limits.
Creates a new batch when either: - - Total size reaches 30 MB (31,457,280 bytes) + - Total size reaches 30 MB (30,720 KB) - Document count reaches 200 + Args: + documents: List of Document objects to batch + Returns: List of document batches """ - MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024 # 30 MB in bytes - MAX_BATCH_COUNT = 200 # Maximum documents per batch - docs_batches = [] current_batch = [] - current_batch_size = 0 + current_batch_size_kb = 0 - for doc_id in documents: - doc = document_crud.read_one(doc_id) - doc_size = doc.file_size or 0 + for doc in documents: + doc_size_kb = doc.file_size_kb or 0 - would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES + would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT if current_batch and (would_exceed_size or would_exceed_count): + docs_batches.append(current_batch) logger.info( - f"[batch_documents] Batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" + f"[batch_documents] Batch completed | {{'batch_num': {len(docs_batches)}, 'doc_count': {len(current_batch)}, 'batch_size_mb': {round(current_batch_size_kb / 1024)}}}" ) - docs_batches.append(current_batch) current_batch = [] - current_batch_size = 0 + current_batch_size_kb = 0 current_batch.append(doc) - current_batch_size += doc_size + current_batch_size_kb += doc_size_kb if current_batch: docs_batches.append(current_batch) + logger.info( + f"[batch_documents] Final Batch completed | {{'batch_num': {len(docs_batches)}, 'doc_count': {len(current_batch)}, 'batch_size_mb': {round(current_batch_size_kb / 1024)}}}" + ) logger.info( f"[batch_documents] Batching complete | {{'total_batches': {len(docs_batches)}, 'total_documents': {len(documents)}}}" diff --git a/backend/app/services/collections/providers/base.py b/backend/app/services/collections/providers/base.py index d76fb6189..5607786b7 100644 --- a/backend/app/services/collections/providers/base.py +++ b/backend/app/services/collections/providers/base.py @@ -1,9 +1,8 @@ from abc import ABC, abstractmethod -from typing import Any +from typing import Any, List -from app.crud import DocumentCrud from app.core.cloud.storage import CloudStorage -from app.models import CreationRequest, Collection +from app.models import CreationRequest, Collection, Document class BaseProvider(ABC): @@ -32,21 +31,17 @@ def create( self, collection_request: CreationRequest, storage: CloudStorage, - document_crud: DocumentCrud, + documents: List[Document], ) -> Collection: """Create collection with documents and optionally an assistant. Args: collection_request: Collection parameters (name, description, document list, etc.) 
storage: Cloud storage instance for file access - document_crud: DocumentCrud instance for fetching documents - batch_size: Number of documents to process per batch - with_assistant: Whether to create an assistant/agent - assistant_options: Options for assistant creation (provider-specific) + documents: Pre-fetched list of Document objects to add to the collection Returns: - llm_service_id: ID of the resource to delete - llm_service_name: Name of the service (determines resource type) + Collection object with llm_service_id and llm_service_name populated """ raise NotImplementedError("Providers must implement execute method") diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index 558baf295..b0e418efd 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -1,13 +1,13 @@ import logging +from typing import List from openai import OpenAI from app.services.collections.providers import BaseProvider -from app.crud import DocumentCrud from app.core.cloud.storage import CloudStorage from app.crud.rag import OpenAIVectorStoreCrud, OpenAIAssistantCrud from app.services.collections.helpers import batch_documents, get_service_name -from app.models import CreationRequest, Collection +from app.models import CreationRequest, Collection, Document logger = logging.getLogger(__name__) @@ -24,13 +24,13 @@ def create( self, collection_request: CreationRequest, storage: CloudStorage, - document_crud: DocumentCrud, + documents: List[Document], ) -> Collection: """ Create OpenAI vector store with documents and optionally an assistant. """ try: - docs_batches = batch_documents(document_crud, collection_request.documents) + docs_batches = batch_documents(documents) vector_store_crud = OpenAIVectorStoreCrud(self.client) vector_store = vector_store_crud.create() diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index 578380a5a..78619a6e9 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -23,26 +23,24 @@ ) -async def calculate_file_size(file: UploadFile) -> int: +async def calculate_file_size(file: UploadFile) -> float: """ - Calculate the size of an uploaded file in bytes. + Calculate the size of an uploaded file in kilobytes. 
Args: file: The uploaded file from FastAPI Returns: - The size of the file in bytes + The size of the file in kilobytes (KB) as a whole number """ if file.size: - return file.size + return round(file.size / 1024) - # If size is not available, calculate by reading the file - await file.seek(0) - content = await file.read() - size_bytes = len(content) - await file.seek(0) # Reset to beginning for subsequent operations + file.file.seek(0, 2) + size_bytes = file.file.tell() + file.file.seek(0) - return size_bytes + return round(size_bytes / 1024) def pre_transform_validation( diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index 02e64a4c6..f2af4b4d9 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -45,47 +45,45 @@ def test_extract_error_message_handles_non_matching_bodies() -> None: # batch documents -class FakeDocumentCrud: - def __init__(self, file_size_per_doc=1024): - """ - Args: - file_size_per_doc: Size in bytes for each fake document (default 1 KB) - """ - self.calls = [] - self.file_size_per_doc = file_size_per_doc - self.documents = {} - - def read_one(self, doc_id): - """Simulate reading a single document by ID.""" - if doc_id not in self.documents: - self.documents[doc_id] = SimpleNamespace( - id=doc_id, - fname=f"{doc_id}.txt", - object_store_url=f"s3://bucket/{doc_id}.txt", - file_size=self.file_size_per_doc, - ) - self.calls.append(doc_id) - return self.documents[doc_id] +def create_fake_documents( + count: int, file_size_kb: float | None = 1 +) -> list[SimpleNamespace]: + """Create fake document objects for testing. + + Args: + count: Number of documents to create + file_size_kb: Size in KB for each document (default 1 KB) + + Returns: + List of SimpleNamespace objects mimicking Document objects + """ + return [ + SimpleNamespace( + id=uuid4(), + fname=f"doc_{i}.txt", + object_store_url=f"s3://bucket/doc_{i}.txt", + file_size_kb=file_size_kb, + ) + for i in range(count) + ] def test_batch_documents_small_files_single_batch() -> None: """Test that small files all fit in one batch (under 30 MB and under 200 docs).""" - crud = FakeDocumentCrud(file_size_per_doc=1024) # 1 KB per file - ids = [uuid4() for _ in range(6)] - batches = helpers.batch_documents(crud, ids) + docs = create_fake_documents(6, file_size_kb=1) # 1 KB per file + batches = helpers.batch_documents(docs) # All 6 small files should fit in one batch assert len(batches) == 1 assert len(batches[0]) == 6 - assert [d.id for d in batches[0]] == ids + assert [d.id for d in batches[0]] == [d.id for d in docs] def test_batch_documents_size_based_batching() -> None: """Test that large files trigger size-based batching (30 MB limit).""" - # Each file is 20 MB, so max 1 file per batch (since 2 * 20 MB > 30 MB) - crud = FakeDocumentCrud(file_size_per_doc=20 * 1024 * 1024) - ids = [uuid4() for _ in range(3)] - batches = helpers.batch_documents(crud, ids) + # Each file is 20 MB (20480 KB), so max 1 file per batch (since 2 * 20 MB > 30 MB) + docs = create_fake_documents(3, file_size_kb=20 * 1024) + batches = helpers.batch_documents(docs) # Should create 3 batches, one for each 20 MB file assert len(batches) == 3 @@ -96,9 +94,8 @@ def test_batch_documents_size_based_batching() -> None: def test_batch_documents_count_based_batching() -> None: """Test that document count triggers batching (200 docs limit).""" - crud = FakeDocumentCrud(file_size_per_doc=100) # Small files - ids = 
[uuid4() for _ in range(250)] - batches = helpers.batch_documents(crud, ids) + docs = create_fake_documents(250, file_size_kb=0.1) # Small files + batches = helpers.batch_documents(docs) # Should create 2 batches: 200 + 50 assert len(batches) == 2 @@ -108,10 +105,9 @@ def test_batch_documents_count_based_batching() -> None: def test_batch_documents_mixed_size_batching() -> None: """Test batching with files that fit multiple per batch but hit 30 MB limit.""" - # Each file is 15 MB, so 2 files = 30 MB (at limit), 3 files > 30 MB - crud = FakeDocumentCrud(file_size_per_doc=15 * 1024 * 1024) - ids = [uuid4() for _ in range(5)] - batches = helpers.batch_documents(crud, ids) + # Each file is 15 MB (15360 KB), so 2 files = 30 MB (at limit), 3 files > 30 MB + docs = create_fake_documents(5, file_size_kb=15 * 1024) + batches = helpers.batch_documents(docs) # Should create 3 batches: [2 files, 2 files, 1 file] assert len(batches) == 3 @@ -122,9 +118,8 @@ def test_batch_documents_mixed_size_batching() -> None: def test_batch_documents_with_none_file_size() -> None: """Test that documents with None file_size are treated as 0 bytes.""" - crud = FakeDocumentCrud(file_size_per_doc=None) - ids = [uuid4() for _ in range(10)] - batches = helpers.batch_documents(crud, ids) + docs = create_fake_documents(10, file_size_kb=None) + batches = helpers.batch_documents(docs) # All files with None/0 size should fit in one batch (under both limits) assert len(batches) == 1 @@ -133,10 +128,8 @@ def test_batch_documents_with_none_file_size() -> None: def test_batch_documents_empty_input() -> None: """Test that empty input returns empty batches.""" - crud = FakeDocumentCrud() - batches = helpers.batch_documents(crud, []) + batches = helpers.batch_documents([]) assert batches == [] - assert crud.calls == [] def test_ensure_unique_name_success(db: Session) -> None: From c7da9f02dd64c0769554d62276b0211e6dbece5c Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:18:41 +0530 Subject: [PATCH 07/20] coderabbit review --- backend/app/services/collections/providers/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/services/collections/providers/base.py b/backend/app/services/collections/providers/base.py index 5607786b7..36283d1fa 100644 --- a/backend/app/services/collections/providers/base.py +++ b/backend/app/services/collections/providers/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, List +from typing import Any from app.core.cloud.storage import CloudStorage from app.models import CreationRequest, Collection, Document @@ -31,7 +31,7 @@ def create( self, collection_request: CreationRequest, storage: CloudStorage, - documents: List[Document], + documents: list[Document], ) -> Collection: """Create collection with documents and optionally an assistant. 
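The greedy batching rule these patches converge on can be sketched independently of the app's models and CRUD layer. Below is a minimal sketch, not the code from `helpers.py` itself: the `Doc` dataclass is a hypothetical stand-in for `app.models.Document`, `batch` mirrors the logic of `batch_documents`, and the limits mirror `MAX_BATCH_SIZE_KB` and `MAX_BATCH_COUNT`.

```python
from dataclasses import dataclass

MAX_BATCH_SIZE_KB = 30 * 1024  # 30 MB expressed in KB
MAX_BATCH_COUNT = 200  # maximum number of documents per batch


@dataclass
class Doc:
    # Hypothetical stand-in for app.models.Document
    fname: str
    file_size_kb: float | None = None


def batch(docs: list[Doc]) -> list[list[Doc]]:
    """Greedily pack docs into batches under the size and count limits."""
    batches: list[list[Doc]] = []
    current: list[Doc] = []
    current_kb = 0.0
    for doc in docs:
        size_kb = doc.file_size_kb or 0  # a missing size is treated as 0 KB
        # Close the open batch if adding this doc would break either limit;
        # a single oversized doc still gets a batch of its own.
        if current and (
            current_kb + size_kb > MAX_BATCH_SIZE_KB
            or len(current) >= MAX_BATCH_COUNT
        ):
            batches.append(current)
            current, current_kb = [], 0.0
        current.append(doc)
        current_kb += size_kb
    if current:
        batches.append(current)
    return batches


# Three 20 MB files -> three single-file batches, since any two exceed 30 MB.
docs = [Doc(f"doc_{i}.pdf", 20 * 1024) for i in range(3)]
assert [len(b) for b in batch(docs)] == [1, 1, 1]
```

Note the strict greater-than on the size check: two 15 MB files share one batch (exactly 30 MB), the behavior `test_batch_documents_mixed_size_batching` pins down, while the count check closes a batch as soon as it already holds 200 documents.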
From 8437442dff3b4d96cc663246212e42146199b2a8 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:48:50 +0530 Subject: [PATCH 08/20] removing batch_size from request --- backend/app/api/routes/collections.py | 15 ++++++++------- backend/app/models/collection.py | 9 --------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 2aee46032..fd70f3957 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -120,14 +120,15 @@ def create_collection( with_assistant=with_assistant, ) - metadata = None + metadata = {} + if not with_assistant: - metadata = { - "note": ( - "This job will create a vector store only (no Assistant). " - "Assistant creation happens when both 'model' and 'instructions' are included." - ) - } + metadata["assistant_note"] = ( + "This job will create a vector store only (no Assistant). " + "Assistant creation happens when both 'model' and 'instructions' are included." + ) + + metadata = metadata if metadata else None return APIResponse.success_response( CollectionJobImmediatePublic.model_validate(collection_job), metadata=metadata diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 0b00639d8..4a77f0200 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -102,15 +102,6 @@ class CollectionOptions(SQLModel): documents: list[UUID] = Field( description="List of document IDs", ) - batch_size: int = Field( - default=10, - description=( - "**[Deprecated]** " - "Number of documents to send to OpenAI in a single " - "transaction. See the `file_ids` parameter in the " - "vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)." - ), - ) def model_post_init(self, __context: Any): self.documents = list(set(self.documents)) From f94998f06ecbbf7383f49214d3531f79085dde1a Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:58:47 +0530 Subject: [PATCH 09/20] for fixing test cases --- backend/app/api/routes/collections.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index fd70f3957..2aee46032 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -120,15 +120,14 @@ def create_collection( with_assistant=with_assistant, ) - metadata = {} - + metadata = None if not with_assistant: - metadata["assistant_note"] = ( - "This job will create a vector store only (no Assistant). " - "Assistant creation happens when both 'model' and 'instructions' are included." - ) - - metadata = metadata if metadata else None + metadata = { + "note": ( + "This job will create a vector store only (no Assistant). " + "Assistant creation happens when both 'model' and 'instructions' are included." 
+ ) + } return APIResponse.success_response( CollectionJobImmediatePublic.model_validate(collection_job), metadata=metadata From 5ec17be19d61be63da1ff89931db8565567b0fbe Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 27 Mar 2026 13:43:52 +0530 Subject: [PATCH 10/20] size limit on doc upload --- ...columns_to_collection_job_and_documents.py | 14 +++++++++-- backend/app/api/routes/collections.py | 1 + backend/app/api/routes/documents.py | 15 +++++++++++- backend/app/core/cloud/storage.py | 24 +++++++++++++++++++ backend/app/crud/rag/open_ai.py | 18 ++++---------- backend/app/models/collection.py | 10 ++++---- .../services/collections/create_collection.py | 2 +- backend/app/services/collections/helpers.py | 10 +++++--- 8 files changed, 68 insertions(+), 26 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py index 642c5445b..3ddcd00b1 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -30,7 +30,7 @@ def upgrade(): "collection_jobs", sa.Column( "total_size_mb", - sa.Integer(), + sa.Float(), nullable=True, comment="Total size of documents being uploaded to collection", ), @@ -39,14 +39,24 @@ def upgrade(): "document", sa.Column( "file_size_kb", - sa.Integer(), + sa.Float(), nullable=True, comment="Size of the document in bytes", ), ) + op.add_column( + "collection_jobs", + sa.Column( + "documents", + sa.JSON(), + nullable=True, + comment="JSON array of document UUIDs included in this job", + ), + ) def downgrade(): op.drop_column("document", "file_size_kb") op.drop_column("collection_jobs", "total_size_mb") op.drop_column("collection_jobs", "docs_num") + op.drop_column("collection_jobs", "documents") diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 2aee46032..332e1da8c 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -103,6 +103,7 @@ def create_collection( project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, docs_num=len(request.documents), + documents=[str(doc_id) for doc_id in request.documents], ) ) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 7d32e417f..a64592702 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -12,6 +12,7 @@ UploadFile, ) from fastapi import Path as FastPath +from fastapi import HTTPException from app.api.deps import AuthContextDep, SessionDep from app.api.permissions import Permission, require_permission @@ -27,7 +28,7 @@ DocTransformationJobPublic, ) from app.core.cloud import get_cloud_storage -from app.services.collections.helpers import pick_service_for_documennt +from app.services.collections.helpers import pick_service_for_documennt, MAX_DOC_SIZE_MB from app.services.documents.helpers import ( calculate_file_size, schedule_transformation, @@ -131,6 +132,18 @@ async def upload_doc( ) file_size_kb = await calculate_file_size(src) + file_size_mb = file_size_kb / 1024 + + if file_size_mb > MAX_DOC_SIZE_MB: + logger.warning( + f"[upload_doc] Document size exceeds limit | " + f"{{'filename': '{src.filename}', 'size_mb': {round(file_size_mb, 2)}, 'max_size_mb': {MAX_DOC_SIZE_MB}}}" + ) + raise HTTPException( + status_code=413, + detail=f"Document size ({round(file_size_mb, 2)} MB) exceeds the maximum 
allowed size of {MAX_DOC_SIZE_MB} MB. " + f"Please upload a smaller file.", + ) storage = get_cloud_storage(session=session, project_id=current_user.project_.id) document_id = uuid4() diff --git a/backend/app/core/cloud/storage.py b/backend/app/core/cloud/storage.py index 727380726..a57273a06 100644 --- a/backend/app/core/cloud/storage.py +++ b/backend/app/core/cloud/storage.py @@ -125,6 +125,11 @@ def stream(self, url: str) -> StreamingBody: """Stream a file from storage""" pass + @abstractmethod + def get(self, url: str) -> bytes: + """Get file contents as bytes (for files that fit in memory)""" + pass + @abstractmethod def get_file_size_kb(self, url: str) -> float: """Return the file size in KB""" @@ -193,6 +198,25 @@ def stream(self, url: str) -> StreamingBody: ) raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err + def get(self, url: str) -> bytes: + name = SimpleStorageName.from_url(url) + kwargs = asdict(name) + try: + body = self.aws.client.get_object(**kwargs).get("Body") + content = body.read() + logger.info( + f"[AmazonCloudStorage.get] File retrieved successfully | " + f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'size_bytes': {len(content)}}}" + ) + return content + except ClientError as err: + logger.error( + f"[AmazonCloudStorage.get] AWS get error | " + f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}", + exc_info=True, + ) + raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err + def get_file_size_kb(self, url: str) -> float: name = SimpleStorageName.from_url(url) kwargs = asdict(name) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index 77c0bbdc1..da3bdb198 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -1,6 +1,7 @@ import json import logging import functools as ft +from io import BytesIO from typing import Iterable from openai import OpenAI, OpenAIError @@ -121,15 +122,13 @@ def update( storage: CloudStorage, documents: Iterable[Document], ): - files = [] for docs in documents: + files = [] for d in docs: - f_obj = storage.stream(d.object_store_url) - - # monkey patch botocore.response.StreamingBody to make - # OpenAI happy + # Get file bytes and wrap in BytesIO for OpenAI API + content = storage.get(d.object_store_url) + f_obj = BytesIO(content) f_obj.name = d.fname - files.append(f_obj) logger.info( @@ -149,13 +148,6 @@ def update( ) raise InterruptedError(error_msg) - while files: - f_obj = files.pop() - f_obj.close() - logger.info( - f"[OpenAIVectorStoreCrud.update] Closed file stream | {{'vector_store_id': '{vector_store_id}', 'filename': '{f_obj.name}'}}" - ) - yield from docs def delete(self, vector_store_id: str, retries: int = 3): diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 4a77f0200..c0d3e3027 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -39,12 +39,10 @@ class Collection(SQLModel, table=True): description="Unique identifier for the collection", sa_column_kwargs={"comment": "Unique identifier for the collection"}, ) - provider: ProviderType = ( - Field( - nullable=False, - description="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'google', etc)", - sa_column_kwargs={"comment": "LLM provider used for this collection"}, - ), + provider: ProviderType = Field( + nullable=False, + description="LLM provider used for 
this collection (e.g., 'openai', 'bedrock', 'google', etc)", + sa_column_kwargs={"comment": "LLM provider used for this collection"}, ) llm_service_id: str = Field( nullable=False, diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index efa6ea3a8..4d897adae 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -173,7 +173,7 @@ def execute_job( file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) - total_size_mb = total_size_kb / 1024 + total_size_mb = round(total_size_kb / 1024, 2) with Session(engine) as session: collection_job_crud = CollectionJobCrud(session, project_id) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 9fd35e2ed..4d469ba68 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -16,9 +16,13 @@ logger = logging.getLogger(__name__) # Necessary Constants - -# for dynamic batching of documents to upload to openai vector store -MAX_BATCH_SIZE_KB = 30 * 1024 # 30 MB in KB -MAX_BATCH_COUNT = 200 +# Maximum individual document size (must be less than batch size) +MAX_DOC_SIZE_MB = 25 # 25 MB maximum per document + +# Maximum batch size for uploading documents to vector store +# Derived from MAX_DOC_SIZE + buffer to ensure single docs always fit +MAX_BATCH_SIZE_KB = (MAX_DOC_SIZE_MB + 5) * 1024 # 30 MB in KB (25 + 5 MB buffer) +MAX_BATCH_COUNT = 200 # Maximum documents per batch def get_service_name(provider: str) -> str: From 3246e880b8e85b05c850d624067b4085fecd3388 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 27 Mar 2026 14:58:48 +0530 Subject: [PATCH 11/20] test coverage increase --- .../services/collections/test_helpers.py | 114 +++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index f2af4b4d9..7cddaf305 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -11,7 +11,13 @@ from app.services.collections import helpers from app.tests.utils.utils import get_project from app.tests.utils.collection import get_vector_store_collection -from app.services.collections.helpers import ensure_unique_name +from app.services.collections.helpers import ( + ensure_unique_name, + get_service_name, + to_collection_public, +) +from app.models import Collection, ProviderType +from app.core.util import now def test_extract_error_message_parses_json_and_strips_prefix() -> None: @@ -167,3 +173,109 @@ def test_ensure_unique_name_conflict_with_vector_store_collection(db: Session) - assert exc.value.status_code == 409 assert "already exists" in exc.value.detail + + +# get_service_name + + +def test_get_service_name_openai() -> None: + """Test that OpenAI provider returns correct service name.""" + result = get_service_name("openai") + assert result == "openai vector store" + + +def test_get_service_name_case_insensitive() -> None: + """Test that provider name is case-insensitive.""" + assert get_service_name("OpenAI") == "openai vector store" + assert get_service_name("OPENAI") == "openai vector store" + assert get_service_name("OpEnAi") == "openai vector store" + + +def test_get_service_name_unknown_provider() -> None: + """Test 
that unknown providers return empty string.""" + assert get_service_name("unknown") == "" + assert get_service_name("bedrock") == "" # Commented out in the mapping + assert get_service_name("gemini") == "" # Commented out in the mapping + assert get_service_name("") == "" + + +# to_collection_public + + +def test_to_collection_public_vector_store() -> None: + """Test conversion of vector store collection to public model.""" + collection = Collection( + id=uuid4(), + project_id=1, + provider=ProviderType.openai, + llm_service_id="vs_123", + llm_service_name="openai vector store", # Matches get_service_name("openai") + name="Test Collection", + description="Test description", + inserted_at=now(), + updated_at=now(), + deleted_at=None, + ) + + result = to_collection_public(collection) + + # For vector store, should map to knowledge_base fields + assert result.id == collection.id + assert result.knowledge_base_id == "vs_123" + assert result.knowledge_base_provider == "openai vector store" + assert result.llm_service_id is None + assert result.llm_service_name is None + assert result.project_id == 1 + assert result.inserted_at == collection.inserted_at + assert result.updated_at == collection.updated_at + assert result.deleted_at is None + + +def test_to_collection_public_assistant() -> None: + """Test conversion of assistant collection to public model.""" + collection = Collection( + id=uuid4(), + project_id=2, + provider=ProviderType.openai, + llm_service_id="asst_456", + llm_service_name="gpt-4", # Does NOT match vector store name + name="Assistant Collection", + description="Assistant description", + inserted_at=now(), + updated_at=now(), + deleted_at=None, + ) + + result = to_collection_public(collection) + + # For assistant, should map to llm_service fields + assert result.id == collection.id + assert result.llm_service_id == "asst_456" + assert result.llm_service_name == "gpt-4" + assert result.knowledge_base_id is None + assert result.knowledge_base_provider is None + assert result.project_id == 2 + assert result.inserted_at == collection.inserted_at + assert result.updated_at == collection.updated_at + assert result.deleted_at is None + + +def test_to_collection_public_with_deleted_at() -> None: + """Test that deleted_at field is properly included when set.""" + deleted_time = now() + collection = Collection( + id=uuid4(), + project_id=3, + provider=ProviderType.openai, + llm_service_id="vs_789", + llm_service_name="openai vector store", + name="Deleted Collection", + description="Deleted", + inserted_at=now(), + updated_at=now(), + deleted_at=deleted_time, + ) + + result = to_collection_public(collection) + + assert result.deleted_at == deleted_time From 9be2e06665c9a38c82ebc3057d4208eeaae9b911 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 11:06:04 +0530 Subject: [PATCH 12/20] pr review fixes --- ..._columns_to_collection_job_and_documents.py | 18 +++++++++--------- backend/app/api/routes/collection_job.py | 1 - backend/app/api/routes/collections.py | 7 ++++--- backend/app/crud/rag/open_ai.py | 1 - backend/app/models/collection.py | 2 +- backend/app/models/collection_job.py | 10 +++++++++- .../services/collections/create_collection.py | 1 - backend/app/services/collections/helpers.py | 3 +-- .../collections/test_create_collections.py | 8 ++++---- 9 files changed, 28 insertions(+), 23 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py 
index 3ddcd00b1..ddbb77516 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -32,25 +32,25 @@ def upgrade(): "total_size_mb", sa.Float(), nullable=True, - comment="Total size of documents being uploaded to collection", + comment="Total size of documents being uploaded to collection in MB", ), ) op.add_column( - "document", + "collection_jobs", sa.Column( - "file_size_kb", - sa.Float(), + "documents", + sa.JSON(), nullable=True, - comment="Size of the document in bytes", + comment="List of documents given to make collection", ), ) op.add_column( - "collection_jobs", + "document", sa.Column( - "documents", - sa.JSON(), + "file_size_kb", + sa.Float(), nullable=True, - comment="JSON array of document UUIDs included in this job", + comment="Size of the document in kilobytes (KB)", ), ) diff --git a/backend/app/api/routes/collection_job.py b/backend/app/api/routes/collection_job.py index c586a3cc9..792814688 100644 --- a/backend/app/api/routes/collection_job.py +++ b/backend/app/api/routes/collection_job.py @@ -17,7 +17,6 @@ CollectionActionType, CollectionJobPublic, ) -from app.models.collection import CollectionPublic from app.utils import APIResponse, load_description from app.services.collections.helpers import extract_error_message, to_collection_public diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 332e1da8c..28155c9b5 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -11,7 +11,6 @@ CollectionCrud, CollectionJobCrud, DocumentCollectionCrud, - DocumentCrud, ) from app.core.cloud import get_cloud_storage from app.models import ( @@ -96,14 +95,16 @@ def create_collection( if request.name: ensure_unique_name(session, current_user.project_.id, request.name) + unique_documents = list(dict.fromkeys(request.documents)) + collection_job_crud = CollectionJobCrud(session, current_user.project_.id) collection_job = collection_job_crud.create( CollectionJobCreate( action_type=CollectionActionType.CREATE, project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, - docs_num=len(request.documents), - documents=[str(doc_id) for doc_id in request.documents], + docs_num=len(unique_documents), + documents=[str(doc_id) for doc_id in unique_documents], ) ) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index da3bdb198..cdae82440 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -8,7 +8,6 @@ from pydantic import BaseModel from app.core.cloud import CloudStorage -from app.core.config import settings from app.models import Document logger = logging.getLogger(__name__) diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index c0d3e3027..ccd606deb 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -4,7 +4,7 @@ from uuid import UUID, uuid4 from pydantic import HttpUrl, model_validator, model_serializer -from sqlalchemy import UniqueConstraint, Index, text +from sqlalchemy import Index, text from sqlmodel import Field, Relationship, SQLModel from app.core.util import now diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index 6caa7fd75..333ebfd14 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -2,7 +2,8 @@ from enum import Enum from uuid import UUID, uuid4 
-from sqlmodel import Column, Field, SQLModel, Text +from pydantic import field_validator +from sqlmodel import JSON, Column, Field, SQLModel, Text from app.core.util import now from app.models.collection import CollectionIDPublic, CollectionPublic @@ -73,6 +74,12 @@ class CollectionJob(SQLModel, table=True): Text, nullable=True, comment="Error message if the job failed" ), ) + documents: list[str] | None = Field( + default=None, + sa_column=Column( + JSON, nullable=True, comment="List of documents given to make collection" + ), + ) # Foreign keys collection_id: UUID | None = Field( @@ -122,6 +129,7 @@ class CollectionJobCreate(SQLModel): action_type: CollectionActionType docs_num: int | None = None project_id: int + documents: list[str] | None = None class CollectionJobUpdate(SQLModel): diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 4d897adae..eb37fd039 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -18,7 +18,6 @@ CollectionJob, Collection, CollectionJobUpdate, - CollectionPublic, CollectionJobPublic, CreationRequest, ) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 4d469ba68..6985ac78e 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -3,7 +3,6 @@ import ast import re from uuid import UUID -from typing import List from fastapi import HTTPException from sqlmodel import select @@ -64,7 +63,7 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents(documents: List[Document]) -> List[List[Document]]: +def batch_documents(documents: list[Document]) -> list[list[Document]]: """ Batch documents dynamically based on size and count limits. 
diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index 189a79ccb..b51631939 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -16,7 +16,7 @@ def _extract_metadata(body: dict) -> dict | None: def _create_test_document( - db: Session, project_id: int, file_size: int = 1024 + db: Session, project_id: int, file_size: float = 1 ) -> Document: """Helper to create a test document.""" doc = Document( @@ -24,7 +24,7 @@ def _create_test_document( fname="test_document.txt", object_store_url="s3://test-bucket/test_document.txt", project_id=project_id, - file_size=file_size, + file_size_kb=file_size, ) db.add(doc) db.commit() @@ -41,7 +41,7 @@ def test_collection_creation_with_assistant_calls_start_job_and_returns_job( db: Session, ) -> None: # Create a test document in the database - doc = _create_test_document(db, user_api_key.project_id, file_size=2048) + doc = _create_test_document(db, user_api_key.project_id, file_size=2) creation_data = CreationRequest( model="gpt-4o", @@ -91,7 +91,7 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f db: Session, ) -> None: # Create a test document in the database - doc = _create_test_document(db, user_api_key.project_id, file_size=5120) + doc = _create_test_document(db, user_api_key.project_id, file_size=5) creation_data = CreationRequest( temperature=0.000001, From db593e213d3aee373d05ebed6bbd4593be2c2bae Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 11:56:56 +0530 Subject: [PATCH 13/20] adding doc helper test cases --- .../app/tests/services/documents/__init__.py | 0 .../tests/services/documents/test_helpers.py | 83 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 backend/app/tests/services/documents/__init__.py create mode 100644 backend/app/tests/services/documents/test_helpers.py diff --git a/backend/app/tests/services/documents/__init__.py b/backend/app/tests/services/documents/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/tests/services/documents/test_helpers.py b/backend/app/tests/services/documents/test_helpers.py new file mode 100644 index 000000000..9d3eec363 --- /dev/null +++ b/backend/app/tests/services/documents/test_helpers.py @@ -0,0 +1,83 @@ +from io import BytesIO + +import pytest +from fastapi import UploadFile + +from app.services.documents.helpers import calculate_file_size + + +def make_upload_file(content: bytes, size: int | None = None) -> UploadFile: + """Create an UploadFile with the given content and optional pre-set size.""" + file = UploadFile(file=BytesIO(content), size=size) + return file + + +class TestCalculateFileSizeWithSizeAttribute: + @pytest.mark.anyio + async def test_uses_size_attribute_when_set(self) -> None: + """Uses file.size directly when it is provided.""" + file = make_upload_file(b"irrelevant", size=2048) + result = await calculate_file_size(file) + assert result == 2 # 2048 / 1024 = 2.0 + + @pytest.mark.anyio + async def test_rounds_fractional_kb(self) -> None: + """Rounds the result when size is not an exact multiple of 1024.""" + file = make_upload_file(b"irrelevant", size=1536) # 1.5 KB → rounds to 2 + result = await calculate_file_size(file) + assert result == 2 + + @pytest.mark.anyio + async def test_rounds_down_fractional_kb(self) -> None: + """Rounds down when fractional part is below 
.5.""" + file = make_upload_file(b"irrelevant", size=1300) # ~1.27 KB → rounds to 1 + result = await calculate_file_size(file) + assert result == 1 + + @pytest.mark.anyio + async def test_large_file_size(self) -> None: + """Correctly converts large sizes.""" + file = make_upload_file(b"irrelevant", size=10 * 1024 * 1024) # 10 MB + result = await calculate_file_size(file) + assert result == 10 * 1024 # 10240 KB + + +class TestCalculateFileSizeViaSeek: + @pytest.mark.anyio + async def test_falls_back_to_seek_when_size_is_none(self) -> None: + """Falls back to seek/tell when file.size is None.""" + content = b"x" * 2048 + file = make_upload_file(content, size=None) + result = await calculate_file_size(file) + assert result == 2 # 2048 / 1024 = 2 + + @pytest.mark.anyio + async def test_falls_back_to_seek_when_size_is_zero(self) -> None: + """Falls back to seek/tell when file.size is 0 (falsy).""" + content = b"x" * 3072 + file = make_upload_file(content, size=0) + result = await calculate_file_size(file) + assert result == 3 # 3072 / 1024 = 3 + + @pytest.mark.anyio + async def test_resets_file_pointer_after_seek(self) -> None: + """File pointer is back at position 0 after size calculation.""" + content = b"hello world" + file = make_upload_file(content, size=None) + await calculate_file_size(file) + assert file.file.tell() == 0 + + @pytest.mark.anyio + async def test_seek_with_fractional_kb(self) -> None: + """Rounds correctly when content size is not a multiple of 1024.""" + content = b"x" * 1600 # ~1.56 KB → rounds to 2 + file = make_upload_file(content, size=None) + result = await calculate_file_size(file) + assert result == 2 + + @pytest.mark.anyio + async def test_empty_file_via_seek(self) -> None: + """Returns 0 for an empty file when size is None.""" + file = make_upload_file(b"", size=None) + result = await calculate_file_size(file) + assert result == 0 From c775c90606e89bb953eb2636f76f2bf6a452fa30 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 12:44:06 +0530 Subject: [PATCH 14/20] making file size function non-async --- backend/app/api/routes/documents.py | 2 +- backend/app/services/documents/helpers.py | 2 +- .../tests/services/documents/test_helpers.py | 69 +++++++------------ 3 files changed, 25 insertions(+), 48 deletions(-) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 8d9dadc30..3c02ee4a6 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -131,7 +131,7 @@ async def upload_doc( transformer=transformer, ) - file_size_kb = await calculate_file_size(src) + file_size_kb = calculate_file_size(src) file_size_mb = file_size_kb / 1024 if file_size_mb > MAX_DOC_SIZE_MB: diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index 78619a6e9..7d78b6160 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -23,7 +23,7 @@ ) -async def calculate_file_size(file: UploadFile) -> float: +def calculate_file_size(file: UploadFile) -> float: """ Calculate the size of an uploaded file in kilobytes. 
diff --git a/backend/app/tests/services/documents/test_helpers.py b/backend/app/tests/services/documents/test_helpers.py index 9d3eec363..9a4b25344 100644 --- a/backend/app/tests/services/documents/test_helpers.py +++ b/backend/app/tests/services/documents/test_helpers.py @@ -1,6 +1,5 @@ from io import BytesIO -import pytest from fastapi import UploadFile from app.services.documents.helpers import calculate_file_size @@ -8,76 +7,54 @@ def make_upload_file(content: bytes, size: int | None = None) -> UploadFile: """Create an UploadFile with the given content and optional pre-set size.""" - file = UploadFile(file=BytesIO(content), size=size) - return file + return UploadFile(file=BytesIO(content), size=size) class TestCalculateFileSizeWithSizeAttribute: - @pytest.mark.anyio - async def test_uses_size_attribute_when_set(self) -> None: + def test_uses_size_attribute_when_set(self) -> None: """Uses file.size directly when it is provided.""" file = make_upload_file(b"irrelevant", size=2048) - result = await calculate_file_size(file) - assert result == 2 # 2048 / 1024 = 2.0 + assert calculate_file_size(file) == 2 # 2048 / 1024 = 2.0 - @pytest.mark.anyio - async def test_rounds_fractional_kb(self) -> None: + def test_rounds_fractional_kb(self) -> None: """Rounds the result when size is not an exact multiple of 1024.""" file = make_upload_file(b"irrelevant", size=1536) # 1.5 KB → rounds to 2 - result = await calculate_file_size(file) - assert result == 2 + assert calculate_file_size(file) == 2 - @pytest.mark.anyio - async def test_rounds_down_fractional_kb(self) -> None: + def test_rounds_down_fractional_kb(self) -> None: """Rounds down when fractional part is below .5.""" file = make_upload_file(b"irrelevant", size=1300) # ~1.27 KB → rounds to 1 - result = await calculate_file_size(file) - assert result == 1 + assert calculate_file_size(file) == 1 - @pytest.mark.anyio - async def test_large_file_size(self) -> None: + def test_large_file_size(self) -> None: """Correctly converts large sizes.""" file = make_upload_file(b"irrelevant", size=10 * 1024 * 1024) # 10 MB - result = await calculate_file_size(file) - assert result == 10 * 1024 # 10240 KB + assert calculate_file_size(file) == 10 * 1024 # 10240 KB class TestCalculateFileSizeViaSeek: - @pytest.mark.anyio - async def test_falls_back_to_seek_when_size_is_none(self) -> None: + def test_falls_back_to_seek_when_size_is_none(self) -> None: """Falls back to seek/tell when file.size is None.""" - content = b"x" * 2048 - file = make_upload_file(content, size=None) - result = await calculate_file_size(file) - assert result == 2 # 2048 / 1024 = 2 + file = make_upload_file(b"x" * 2048, size=None) + assert calculate_file_size(file) == 2 # 2048 / 1024 = 2 - @pytest.mark.anyio - async def test_falls_back_to_seek_when_size_is_zero(self) -> None: + def test_falls_back_to_seek_when_size_is_zero(self) -> None: """Falls back to seek/tell when file.size is 0 (falsy).""" - content = b"x" * 3072 - file = make_upload_file(content, size=0) - result = await calculate_file_size(file) - assert result == 3 # 3072 / 1024 = 3 + file = make_upload_file(b"x" * 3072, size=0) + assert calculate_file_size(file) == 3 # 3072 / 1024 = 3 - @pytest.mark.anyio - async def test_resets_file_pointer_after_seek(self) -> None: + def test_resets_file_pointer_after_seek(self) -> None: """File pointer is back at position 0 after size calculation.""" - content = b"hello world" - file = make_upload_file(content, size=None) - await calculate_file_size(file) + file = make_upload_file(b"hello 
world", size=None) + calculate_file_size(file) assert file.file.tell() == 0 - @pytest.mark.anyio - async def test_seek_with_fractional_kb(self) -> None: + def test_seek_with_fractional_kb(self) -> None: """Rounds correctly when content size is not a multiple of 1024.""" - content = b"x" * 1600 # ~1.56 KB → rounds to 2 - file = make_upload_file(content, size=None) - result = await calculate_file_size(file) - assert result == 2 + file = make_upload_file(b"x" * 1600, size=None) # ~1.56 KB → rounds to 2 + assert calculate_file_size(file) == 2 - @pytest.mark.anyio - async def test_empty_file_via_seek(self) -> None: + def test_empty_file_via_seek(self) -> None: """Returns 0 for an empty file when size is None.""" file = make_upload_file(b"", size=None) - result = await calculate_file_size(file) - assert result == 0 + assert calculate_file_size(file) == 0 From f809ce86041709e02ea5cf17cc0352188cd1a116 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 13:09:09 +0530 Subject: [PATCH 15/20] adding max time limit test case --- .../documents/test_route_document_upload.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/api/routes/documents/test_route_document_upload.py b/backend/app/tests/api/routes/documents/test_route_document_upload.py index 6f16b52b1..f5ccca242 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_upload.py +++ b/backend/app/tests/api/routes/documents/test_route_document_upload.py @@ -4,7 +4,7 @@ from pathlib import Path from tempfile import NamedTemporaryFile from urllib.parse import urlparse -from unittest.mock import patch +from unittest.mock import patch, MagicMock import pytest from moto import mock_aws @@ -301,6 +301,50 @@ def test_transformation_job_created_in_database( # Check that start_job was called with the right arguments assert "transformer_name" in kwargs or len(args) >= 4 + def test_upload_file_within_size_limit( + self, + db: Session, + route: Route, + scratch: Path, + uploader: WebUploader, + ) -> None: + """Test that a file within the size limit uploads successfully.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Mock calculate_file_size to return a value just under the 25 MB limit (in KB) + with patch( + "app.api.routes.documents.calculate_file_size", + return_value=25 * 1024 - 1, # 25 MB - 1 KB + ): + response = uploader.put(route, scratch) + + assert response.status_code == 200 + + def test_upload_file_exceeds_size_limit( + self, + db: Session, + route: Route, + scratch: Path, + uploader: WebUploader, + ) -> None: + """Test that a file exceeding 25 MB returns a 413 error.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Mock calculate_file_size to return a value over the 25 MB limit (in KB) + with patch( + "app.api.routes.documents.calculate_file_size", + return_value=25 * 1024 + 1, # 25 MB + 1 KB + ): + response = uploader.put(route, scratch) + + assert response.status_code == 413 + print("response =", response.json()) + error_detail = response.json()["error"] + assert "exceeds the maximum allowed size" in error_detail + assert "25" in error_detail + def test_upload_response_structure_without_transformation( self, db: Session, From 5e259287b714715e4a779d9e20426c9704451716 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 10 Apr 2026 16:18:01 +0530 Subject: [PATCH 16/20] fixng migration issue --- ...=> 051_add_columns_to_collection_job_and_documents.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename 
backend/app/alembic/versions/{050_add_columns_to_collection_job_and_documents.py => 051_add_columns_to_collection_job_and_documents.py} (95%) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py similarity index 95% rename from backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py rename to backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py index ddbb77516..86b0be3b4 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py @@ -1,7 +1,7 @@ """add columns to collection job and documents table -Revision ID: 050 -Revises: 049 +Revision ID: 051 +Revises: 050 Create Date: 2026-03-25 10:09:47.318575 """ @@ -10,8 +10,8 @@ # revision identifiers, used by Alembic. -revision = "050" -down_revision = "049" +revision = "051" +down_revision = "050" branch_labels = None depends_on = None From aa19cd595d554a18761599ada1a7298d171486a9 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 10 Apr 2026 16:56:44 +0530 Subject: [PATCH 17/20] small error --- backend/app/services/collections/providers/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index d73db2de1..f52e83394 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -6,7 +6,7 @@ from app.services.collections.providers import BaseProvider from app.core.cloud.storage import CloudStorage from app.crud.rag import OpenAIVectorStoreCrud, OpenAIAssistantCrud -from app.services.collections.helpers import get_service_name +from app.services.collections.helpers import get_service_name, batch_documents from app.models import CreationRequest, Collection, Document From 69f5c00bc673de4596df3e5ffab15d76bdca8ec0 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 10 Apr 2026 18:47:02 +0530 Subject: [PATCH 18/20] fixing test cases --- .../collections/providers/test_openai_provider.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/app/tests/services/collections/providers/test_openai_provider.py b/backend/app/tests/services/collections/providers/test_openai_provider.py index 68b2be1bd..b21577d49 100644 --- a/backend/app/tests/services/collections/providers/test_openai_provider.py +++ b/backend/app/tests/services/collections/providers/test_openai_provider.py @@ -24,7 +24,10 @@ def test_create_openai_vector_store_only() -> None: ) storage = MagicMock() - docs_batches = [["doc1"], ["doc2"]] + documents = [ + SimpleNamespace(file_size_kb=10), + SimpleNamespace(file_size_kb=20), + ] vector_store_id = generate_openai_id("vs_") with patch( @@ -37,7 +40,7 @@ def test_create_openai_vector_store_only() -> None: collection = provider.create( collection_request, storage, - docs_batches, + documents, ) assert isinstance(collection, Collection) @@ -57,7 +60,7 @@ def test_create_openai_with_assistant() -> None: ) storage = MagicMock() - docs_batches = [["doc1"]] + documents = [SimpleNamespace(file_size_kb=10)] vector_store_id = generate_openai_id("vs_") assistant_id = generate_openai_id("asst_") @@ -76,7 +79,7 @@ def test_create_openai_with_assistant() -> None: collection = provider.create( collection_request, storage, - docs_batches, + documents, 
) assert collection.llm_service_id == assistant_id @@ -140,5 +143,5 @@ def test_create_propagates_exception() -> None: provider.create( collection_request, MagicMock(), - [["doc1"]], + [SimpleNamespace(file_size_kb=10)], ) From a4087c8fd521585178f54c1cf0065ee0b7c03b5d Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 16 Apr 2026 14:11:10 +0530 Subject: [PATCH 19/20] fix alembic migration --- ...=> 052_add_columns_to_collection_job_and_documents.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{051_add_columns_to_collection_job_and_documents.py => 052_add_columns_to_collection_job_and_documents.py} (95%) diff --git a/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py similarity index 95% rename from backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py rename to backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py index 86b0be3b4..f6bf70dc3 100644 --- a/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py @@ -1,7 +1,7 @@ """add columns to collection job and documents table -Revision ID: 051 -Revises: 050 +Revision ID: 052 +Revises: 051 Create Date: 2026-03-25 10:09:47.318575 """ @@ -10,8 +10,8 @@ # revision identifiers, used by Alembic. -revision = "051" -down_revision = "050" +revision = "052" +down_revision = "051" branch_labels = None depends_on = None From c65616af62e3c2da351cda4980ec6b86960d4857 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 16 Apr 2026 14:14:03 +0530 Subject: [PATCH 20/20] fix alembic migration --- ...=> 053_add_columns_to_collection_job_and_documents.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{052_add_columns_to_collection_job_and_documents.py => 053_add_columns_to_collection_job_and_documents.py} (95%) diff --git a/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/053_add_columns_to_collection_job_and_documents.py similarity index 95% rename from backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py rename to backend/app/alembic/versions/053_add_columns_to_collection_job_and_documents.py index f6bf70dc3..de956f5d6 100644 --- a/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/053_add_columns_to_collection_job_and_documents.py @@ -1,7 +1,7 @@ """add columns to collection job and documents table -Revision ID: 052 -Revises: 051 +Revision ID: 053 +Revises: 052 Create Date: 2026-03-25 10:09:47.318575 """ @@ -10,8 +10,8 @@ # revision identifiers, used by Alembic. -revision = "052" -down_revision = "051" +revision = "053" +down_revision = "052" branch_labels = None depends_on = None