From 614801ccd8adb843c12e0617ac8a9931a391c439 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:06:57 +0530 Subject: [PATCH 01/20] collections: add dynamic batching --- ...columns_to_collection_job_and_documents.py | 53 ++++++++++++++++ backend/app/api/routes/collections.py | 14 +++++ backend/app/api/routes/documents.py | 4 ++ backend/app/crud/rag/open_ai.py | 24 ++----- backend/app/models/collection_job.py | 15 +++++ backend/app/models/document.py | 10 ++- .../services/collections/create_collection.py | 11 ++-- backend/app/services/collections/helpers.py | 62 ++++++++++++++----- .../services/collections/providers/openai.py | 7 +-- backend/app/services/documents/helpers.py | 24 ++++++- 10 files changed, 176 insertions(+), 48 deletions(-) create mode 100644 backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py new file mode 100644 index 000000000..c1008a3cb --- /dev/null +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -0,0 +1,53 @@ +"""add columns to collection job and documents table + +Revision ID: 050 +Revises: 049 +Create Date: 2026-03-25 10:09:47.318575 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes + + +# revision identifiers, used by Alembic. +revision = "050" +down_revision = "049" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "collection_jobs", + sa.Column( + "docs_num", + sa.Integer(), + nullable=True, + comment="Total number of documents to be processed in this job", + ), + ) + op.add_column( + "collection_jobs", + sa.Column( + "total_size", + sa.Integer(), + nullable=True, + comment="Total size of documents being uploaded to collection", + ), + ) + op.add_column( + "document", + sa.Column( + "file_size", + sa.Integer(), + nullable=True, + comment="Size of the document in bytes", + ), + ) + + +def downgrade(): + op.drop_column("document", "file_size") + op.drop_column("collection_jobs", "total_size") + op.drop_column("collection_jobs", "docs_num") diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 558e4d867..b4a03df1f 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -11,6 +11,7 @@ CollectionCrud, CollectionJobCrud, DocumentCollectionCrud, + DocumentCrud, ) from app.core.cloud import get_cloud_storage from app.models import ( @@ -95,12 +96,25 @@ def create_collection( if request.name: ensure_unique_name(session, current_user.project_.id, request.name) + # Calculate total size of all documents + document_crud = DocumentCrud(session, current_user.project_.id) + total_size = 0 + for doc_id in request.documents: + doc = document_crud.read_one(doc_id) + total_size += doc.file_size or 0 + + logger.info( + f"[create_collection] Calculated total size | {{'total_documents': {len(request.documents)}, 'total_size_bytes': {total_size}, 'total_size_mb': {round(total_size / (1024 * 1024), 2)}}}" + ) + collection_job_crud = CollectionJobCrud(session, current_user.project_.id) collection_job = collection_job_crud.create( CollectionJobCreate( action_type=CollectionActionType.CREATE, project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, + num_docs=len(request.documents), + total_size=total_size, ) ) diff --git a/backend/app/api/routes/documents.py
b/backend/app/api/routes/documents.py index 69c5b4895..c8e17f503 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -29,6 +29,7 @@ from app.core.cloud import get_cloud_storage from app.services.collections.helpers import pick_service_for_documennt from app.services.documents.helpers import ( + calculate_file_size, schedule_transformation, pre_transform_validation, build_document_schema, @@ -129,6 +130,8 @@ async def upload_doc( transformer=transformer, ) + file_size = await calculate_file_size(src) + storage = get_cloud_storage(session=session, project_id=current_user.project_.id) document_id = uuid4() object_store_url = storage.put(src, Path(str(document_id))) @@ -137,6 +140,7 @@ async def upload_doc( document = Document( id=document_id, fname=src.filename, + file_size=file_size, object_store_url=str(object_store_url), ) source_document = crud.update(document) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index cdb644abc..20620b9e2 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -143,24 +143,12 @@ def update( f"[OpenAIVectorStoreCrud.update] File upload completed | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}" ) if req.file_counts.completed != req.file_counts.total: - view = {x.fname: x for x in docs} - for i in self.read(vector_store_id): - if i.last_error is None: - fname = self.client.files.retrieve(i.id) - view.pop(fname) - - error = { - "error": "OpenAI document processing error", - "documents": list(view.values()), - } - try: - raise InterruptedError(json.dumps(error, cls=BaseModelEncoder)) - except InterruptedError as err: - logger.error( - f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'error': '{error['error']}', 'failed_documents': {len(error['documents'])}}}", - exc_info=True, - ) - raise + error_msg = f"OpenAI document processing error: {req.file_counts.completed}/{req.file_counts.total} files completed" + logger.error( + f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}", + exc_info=True, + ) + raise InterruptedError(error_msg) while files: f_obj = files.pop() diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index 7c55e8562..b0ec21249 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -53,6 +53,18 @@ class CollectionJob(SQLModel, table=True): description="Tracing ID for correlating logs and traces.", sa_column_kwargs={"comment": "Tracing ID for correlating logs and traces"}, ) + docs_num: int | None = Field( + description="Total number of documents to be processed in this job", + sa_column_kwargs={ + "comment": "Total number of documents to be processed in this job" + }, + ) + total_size: int | None = Field( + description="Total size of documents being uploaded to collection", + sa_column_kwargs={ + "comment": "Total size of documents being uploaded to collection" + }, + ) error_message: str | None = Field( default=None, sa_column=Column( @@ -106,6 +118,9 @@ class CollectionJobCreate(SQLModel): collection_id: UUID | None = None status: CollectionJobStatus action_type: CollectionActionType + batch_size: int | None = None + num_docs: int | None = None + total_size: int | None = None project_id: int diff 
--git a/backend/app/models/document.py b/backend/app/models/document.py index bffa7b39c..3fbd71da2 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -1,6 +1,8 @@ from datetime import datetime +from typing import Any from uuid import UUID, uuid4 +from pydantic import model_serializer from sqlmodel import Field, SQLModel from app.core.util import now @@ -41,6 +43,11 @@ class Document(DocumentBase, table=True): default=False, sa_column_kwargs={"comment": "Soft delete flag"}, ) + file_size: int | None = Field( + default=None, + description="The size of the document in bytes", + sa_column_kwargs={"comment": "Size of the document in bytes"}, + ) # Foreign keys source_document_id: UUID | None = Field( @@ -80,9 +87,6 @@ class DocumentPublic(DocumentBase): updated_at: datetime = Field( description="The timestamp when the document was last updated" ) - signed_url: str | None = Field( - default=None, description="A signed URL for accessing the document" - ) class TransformedDocumentPublic(DocumentPublic): diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index dd4016616..3f7b98537 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -203,9 +203,9 @@ def execute_job( flat_docs = document_crud.read_each(creation_request.documents) file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - file_sizes_kb = [ - storage.get_file_size_kb(doc.object_store_url) for doc in flat_docs - ] + file_sizes_bytes = [doc.file_size or 0 for doc in flat_docs] + total_size_bytes = sum(file_sizes_bytes) + total_size_mb = round(total_size_bytes / (1024 * 1024), 2) with Session(engine) as session: collection_crud = CollectionCrud(session, project_id) @@ -240,11 +240,12 @@ def execute_job( elapsed = time.time() - start_time logger.info( - "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Sizes: %s KB | Types: %s", + "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Total Size: %s MB (%s bytes) | Types: %s", collection_id, elapsed, len(flat_docs), - file_sizes_kb, + total_size_mb, + total_size_bytes, list(file_exts), ) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 6275ee40d..412eb332d 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -55,25 +55,57 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents( - document_crud: DocumentCrud, documents: List[UUID], batch_size: int -): - """Batch document IDs into chunks of size `batch_size`, load each via `DocumentCrud.read_each`, - and return a list of document batches.""" +def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): + """ + Batch documents dynamically based on size and count limits. 
+ + Creates a new batch when either: + - Total size reaches 30 MB (31,457,280 bytes) + - Document count reaches 200 + + Returns: + List of document batches + """ + + MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024 # 30 MB in bytes + MAX_BATCH_COUNT = 200 # Maximum documents per batch logger.info( - f"[batch_documents] Starting batch iteration for documents | {{'batch_size': {batch_size}, 'total_documents': {len(documents)}}}" + f"[batch_documents] Starting dynamic batch iteration | {{'total_documents': {len(documents)}, 'max_batch_size': '30 MB', 'max_batch_count': {MAX_BATCH_COUNT}}}" ) + docs_batches = [] - start, stop = 0, batch_size - while True: - view = documents[start:stop] - if not view: - break - batch_docs = document_crud.read_each(view) - docs_batches.append(batch_docs) - start = stop - stop += batch_size + current_batch = [] + current_batch_size = 0 + + for doc_id in documents: + doc = document_crud.read_one(doc_id) + doc_size = doc.file_size or 0 # file_size is in bytes + + would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES + would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT + + if current_batch and (would_exceed_size or would_exceed_count): + logger.info( + f"[batch_documents] Batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" + ) + docs_batches.append(current_batch) + current_batch = [] + current_batch_size = 0 + + current_batch.append(doc) + current_batch_size += doc_size + + if current_batch: + logger.info( + f"[batch_documents] Final batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" + ) + docs_batches.append(current_batch) + + logger.info( + f"[batch_documents] Batching complete | {{'total_batches': {len(docs_batches)}, 'total_documents': {len(documents)}}}" + ) + return docs_batches diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index bdab86d3a..1a130314a 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -31,12 +31,7 @@ def create( """ try: # Use user-provided batch_size, default to 10 if not set - batch_size = collection_request.batch_size or 10 - docs_batches = batch_documents( - document_crud, - collection_request.documents, - batch_size, - ) + docs_batches = batch_documents(document_crud, collection_request.documents) vector_store_crud = OpenAIVectorStoreCrud(self.client) vector_store = vector_store_crud.create() diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index cd941eb55..578380a5a 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -1,7 +1,7 @@ from typing import Optional, Tuple, Iterable, Union from uuid import UUID -from fastapi import HTTPException +from fastapi import HTTPException, UploadFile from app.services.doctransform.registry import ( get_available_transformers, @@ -23,6 +23,28 @@ ) +async def calculate_file_size(file: UploadFile) -> int: + """ + Calculate the size of an uploaded file in bytes. 
+ + Args: + file: The uploaded file from FastAPI + + Returns: + The size of the file in bytes + """ + if file.size: + return file.size + + # If size is not available, calculate by reading the file + await file.seek(0) + content = await file.read() + size_bytes = len(content) + await file.seek(0) # Reset to beginning for subsequent operations + + return size_bytes + + def pre_transform_validation( *, src_filename: str, From f8f9c2f290bd3e3cb42315a1f07f2c09295051eb Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:35:58 +0530 Subject: [PATCH 02/20] coderabbit reviews --- ...0_add_columns_to_collection_job_and_documents.py | 1 - backend/app/api/docs/collections/create.md | 7 ++++--- backend/app/api/routes/collections.py | 2 +- backend/app/models/collection.py | 1 + backend/app/models/collection_job.py | 5 +++-- .../app/services/collections/create_collection.py | 3 +-- backend/app/services/collections/helpers.py | 13 ++++--------- .../app/services/collections/providers/openai.py | 1 - .../routes/collections/test_create_collections.py | 3 --- .../collections/providers/test_openai_provider.py | 3 --- .../services/collections/test_create_collection.py | 10 ++-------- 11 files changed, 16 insertions(+), 33 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py index c1008a3cb..2925e0fa6 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -7,7 +7,6 @@ """ from alembic import op import sqlalchemy as sa -import sqlmodel.sql.sqltypes # revision identifiers, used by Alembic. diff --git a/backend/app/api/docs/collections/create.md b/backend/app/api/docs/collections/create.md index cc85ad36a..853f6d78e 100644 --- a/backend/app/api/docs/collections/create.md +++ b/backend/app/api/docs/collections/create.md @@ -3,9 +3,10 @@ pipeline: * Create a vector store from the document IDs you received after uploading your documents through the Documents module. -* The `batch_size` parameter controls how many documents are sent to OpenAI in a - single transaction when creating the vector store. This helps optimize the upload - process for large document sets. If not specified, the default value is **10**. +* Documents are automatically batched when creating the vector store to optimize + the upload process for large document sets. A new batch is created when either + the cumulative size reaches configured total size of documents given to upload to a vector store + or the document count reaches specific number of files in a btch, whichever limit is hit first. * [Deprecated] Attach the Vector Store to an OpenAI [Assistant](https://platform.openai.com/docs/api-reference/assistants). 
Use parameters in the request body relevant to an Assistant to flesh out diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index b4a03df1f..004251d6c 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -113,7 +113,7 @@ def create_collection( action_type=CollectionActionType.CREATE, project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, - num_docs=len(request.documents), + docs_num=len(request.documents), total_size=total_size, ) ) diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index f8b545404..0b00639d8 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -105,6 +105,7 @@ class CollectionOptions(SQLModel): batch_size: int = Field( default=10, description=( + "**[Deprecated]** " "Number of documents to send to OpenAI in a single " "transaction. See the `file_ids` parameter in the " "vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)." diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index b0ec21249..a4e857a1a 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -54,12 +54,14 @@ class CollectionJob(SQLModel, table=True): sa_column_kwargs={"comment": "Tracing ID for correlating logs and traces"}, ) docs_num: int | None = Field( + default=None, description="Total number of documents to be processed in this job", sa_column_kwargs={ "comment": "Total number of documents to be processed in this job" }, ) total_size: int | None = Field( + default=None, description="Total size of documents being uploaded to collection", sa_column_kwargs={ "comment": "Total size of documents being uploaded to collection" @@ -118,8 +120,7 @@ class CollectionJobCreate(SQLModel): collection_id: UUID | None = None status: CollectionJobStatus action_type: CollectionActionType - batch_size: int | None = None - num_docs: int | None = None + docs_num: int | None = None total_size: int | None = None project_id: int diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 3f7b98537..35b03da80 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -203,8 +203,7 @@ def execute_job( flat_docs = document_crud.read_each(creation_request.documents) file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - file_sizes_bytes = [doc.file_size or 0 for doc in flat_docs] - total_size_bytes = sum(file_sizes_bytes) + total_size_bytes = collection_job.total_size or 0 total_size_mb = round(total_size_bytes / (1024 * 1024), 2) with Session(engine) as session: diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 412eb332d..a242f7b8c 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -55,7 +55,9 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): +def batch_documents( + document_crud: DocumentCrud, documents: List[UUID] +) -> List[List[Document]]: """ Batch documents dynamically based on size and count limits. 
@@ -70,17 +72,13 @@ def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024 # 30 MB in bytes MAX_BATCH_COUNT = 200 # Maximum documents per batch - logger.info( - f"[batch_documents] Starting dynamic batch iteration | {{'total_documents': {len(documents)}, 'max_batch_size': '30 MB', 'max_batch_count': {MAX_BATCH_COUNT}}}" - ) - docs_batches = [] current_batch = [] current_batch_size = 0 for doc_id in documents: doc = document_crud.read_one(doc_id) - doc_size = doc.file_size or 0 # file_size is in bytes + doc_size = doc.file_size or 0 would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT @@ -97,9 +95,6 @@ def batch_documents(document_crud: DocumentCrud, documents: List[UUID]): current_batch_size += doc_size if current_batch: - logger.info( - f"[batch_documents] Final batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" - ) docs_batches.append(current_batch) logger.info( diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index 1a130314a..558baf295 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -30,7 +30,6 @@ def create( Create OpenAI vector store with documents and optionally an assistant. """ try: - # Use user-provided batch_size, default to 10 if not set docs_batches = batch_documents(document_crud, collection_request.documents) vector_store_crud = OpenAIVectorStoreCrud(self.client) diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index 220cb4ee8..65e40f85f 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -26,7 +26,6 @@ def test_collection_creation_with_assistant_calls_start_job_and_returns_job( instructions="string", temperature=0.000001, documents=[UUID("f3e86a17-1e6f-41ec-b020-5b08eebef928")], - batch_size=10, callback_url=None, ) @@ -71,7 +70,6 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f creation_data = CreationRequest( temperature=0.000001, documents=[str(uuid4())], - batch_size=10, callback_url=None, ) @@ -109,7 +107,6 @@ def test_collection_creation_vector_only_request_validation_error( "model": "gpt-4o", "temperature": 0.000001, "documents": [str(uuid4())], - "batch_size": 10, "callback_url": None, } diff --git a/backend/app/tests/services/collections/providers/test_openai_provider.py b/backend/app/tests/services/collections/providers/test_openai_provider.py index 8ae028a0a..25f664146 100644 --- a/backend/app/tests/services/collections/providers/test_openai_provider.py +++ b/backend/app/tests/services/collections/providers/test_openai_provider.py @@ -18,7 +18,6 @@ def test_create_openai_vector_store_only() -> None: collection_request = SimpleNamespace( documents=["doc1", "doc2"], - batch_size=10, model=None, instructions=None, temperature=None, @@ -57,7 +56,6 @@ def test_create_openai_with_assistant() -> None: collection_request = SimpleNamespace( documents=["doc1"], - batch_size=10, model="gpt-4o", instructions="You are helpful", temperature=0.7, @@ -138,7 +136,6 @@ def test_create_propagates_exception() -> 
None: collection_request = SimpleNamespace( documents=["doc1"], - batch_size=10, model=None, instructions=None, temperature=None, diff --git a/backend/app/tests/services/collections/test_create_collection.py b/backend/app/tests/services/collections/test_create_collection.py index b6913a81f..f7479aa09 100644 --- a/backend/app/tests/services/collections/test_create_collection.py +++ b/backend/app/tests/services/collections/test_create_collection.py @@ -58,7 +58,6 @@ def test_start_job_creates_collection_job_and_schedules_task(db: Session) -> Non project = get_project(db) request = CreationRequest( documents=[UUID("f3e86a17-1e6f-41ec-b020-5b08eebef928")], - batch_size=10, callback_url=None, provider="openai", ) @@ -137,7 +136,7 @@ def test_execute_job_success_flow_updates_job_and_creates_collection( aws.client.put_object(Bucket=settings.AWS_S3_BUCKET, Key=str(s3_key), Body=b"test") sample_request = CreationRequest( - documents=[document.id], batch_size=10, callback_url=None, provider="openai" + documents=[document.id], callback_url=None, provider="openai" ) mock_get_llm_provider.return_value = get_mock_provider( @@ -204,9 +203,7 @@ def test_execute_job_assistant_create_failure_marks_failed_and_deletes_collectio collection_id=None, ) - req = CreationRequest( - documents=[], batch_size=10, callback_url=None, provider="openai" - ) + req = CreationRequest(documents=[], callback_url=None, provider="openai") mock_provider = get_mock_provider( llm_service_id="vs_123", llm_service_name="openai vector store" @@ -269,7 +266,6 @@ def test_execute_job_success_flow_callback_job_and_creates_collection( sample_request = CreationRequest( documents=[document.id], - batch_size=10, callback_url=callback_url, provider="openai", ) @@ -350,7 +346,6 @@ def test_execute_job_success_creates_collection_with_callback( sample_request = CreationRequest( documents=[document.id], - batch_size=10, callback_url=callback_url, provider="openai", ) @@ -434,7 +429,6 @@ def test_execute_job_failure_flow_callback_job_and_marks_failed( sample_request = CreationRequest( documents=[uuid.uuid4()], - batch_size=10, callback_url=callback_url, provider="openai", ) From f9b336525c3c11e751d2784284edf1196a27331b Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:38:25 +0530 Subject: [PATCH 03/20] coderabbit reviews --- backend/app/crud/rag/open_ai.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index 20620b9e2..77c0bbdc1 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -145,8 +145,7 @@ def update( if req.file_counts.completed != req.file_counts.total: error_msg = f"OpenAI document processing error: {req.file_counts.completed}/{req.file_counts.total} files completed" logger.error( - f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}", - exc_info=True, + f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}" ) raise InterruptedError(error_msg) From a074e36acd2feec78ffdc3465c2e6adad7a03069 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 12:42:24 +0530 Subject: [PATCH 04/20] coderabbit reviews --- backend/app/api/docs/collections/create.md | 2 +- backend/app/services/collections/helpers.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/backend/app/api/docs/collections/create.md b/backend/app/api/docs/collections/create.md index 853f6d78e..a47383f21 100644 --- a/backend/app/api/docs/collections/create.md +++ b/backend/app/api/docs/collections/create.md @@ -6,7 +6,7 @@ pipeline: * Documents are automatically batched when creating the vector store to optimize the upload process for large document sets. A new batch is created when either the cumulative size reaches configured total size of documents given to upload to a vector store - or the document count reaches specific number of files in a btch, whichever limit is hit first. + or the document count reaches specific number of files in a batch, whichever limit is hit first. * [Deprecated] Attach the Vector Store to an OpenAI [Assistant](https://platform.openai.com/docs/api-reference/assistants). Use parameters in the request body relevant to an Assistant to flesh out diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index a242f7b8c..1b61900b2 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -10,7 +10,7 @@ from app.crud import DocumentCrud, CollectionCrud from app.api.deps import SessionDep -from app.models import DocumentCollection, Collection, CollectionPublic +from app.models import DocumentCollection, Collection, CollectionPublic, Document logger = logging.getLogger(__name__) From 1106abc3040f4b4a7aafe3a2a0188e9c0d2224e5 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 15:56:53 +0530 Subject: [PATCH 05/20] fixing test cases --- .../collections/test_create_collections.py | 42 +++++++- .../services/collections/test_helpers.py | 99 ++++++++++++++----- 2 files changed, 111 insertions(+), 30 deletions(-) diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index 65e40f85f..189a79ccb 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -3,10 +3,11 @@ from typing import Any from fastapi.testclient import TestClient +from sqlmodel import Session from app.core.config import settings from app.tests.utils.auth import TestAuthContext -from app.models import CollectionJobStatus +from app.models import CollectionJobStatus, Document from app.models.collection import CreationRequest @@ -14,18 +15,39 @@ def _extract_metadata(body: dict) -> dict | None: return body.get("metadata") or body.get("meta") +def _create_test_document( + db: Session, project_id: int, file_size: int = 1024 +) -> Document: + """Helper to create a test document.""" + doc = Document( + id=uuid4(), + fname="test_document.txt", + object_store_url="s3://test-bucket/test_document.txt", + project_id=project_id, + file_size=file_size, + ) + db.add(doc) + db.commit() + db.refresh(doc) + return doc + + @patch("app.api.routes.collections.create_service.start_job") def test_collection_creation_with_assistant_calls_start_job_and_returns_job( mock_start_job: Any, client: TestClient, user_api_key_header: dict[str, str], user_api_key: TestAuthContext, + db: Session, ) -> None: + # Create a test document in the database + doc = _create_test_document(db, user_api_key.project_id, file_size=2048) + creation_data = CreationRequest( model="gpt-4o", instructions="string", temperature=0.000001, - documents=[UUID("f3e86a17-1e6f-41ec-b020-5b08eebef928")], + 
documents=[doc.id], callback_url=None, ) @@ -66,10 +88,14 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f client: TestClient, user_api_key_header: dict[str, str], user_api_key: TestAuthContext, + db: Session, ) -> None: + # Create a test document in the database + doc = _create_test_document(db, user_api_key.project_id, file_size=5120) + creation_data = CreationRequest( temperature=0.000001, - documents=[str(uuid4())], + documents=[doc.id], callback_url=None, ) @@ -101,12 +127,18 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f def test_collection_creation_vector_only_request_validation_error( - client: TestClient, user_api_key_header: dict[str, str] + client: TestClient, + user_api_key_header: dict[str, str], + user_api_key: TestAuthContext, + db: Session, ) -> None: + # Create a test document in the database + doc = _create_test_document(db, user_api_key.project_id) + payload = { "model": "gpt-4o", "temperature": 0.000001, - "documents": [str(uuid4())], + "documents": [str(doc.id)], "callback_url": None, } diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index f53271f18..02e64a4c6 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -46,46 +46,95 @@ def test_extract_error_message_handles_non_matching_bodies() -> None: class FakeDocumentCrud: - def __init__(self): + def __init__(self, file_size_per_doc=1024): + """ + Args: + file_size_per_doc: Size in bytes for each fake document (default 1 KB) + """ self.calls = [] - - def read_each(self, ids): - self.calls.append(list(ids)) - return [ - SimpleNamespace( - id=i, fname=f"{i}.txt", object_store_url=f"s3://bucket/{i}.txt" + self.file_size_per_doc = file_size_per_doc + self.documents = {} + + def read_one(self, doc_id): + """Simulate reading a single document by ID.""" + if doc_id not in self.documents: + self.documents[doc_id] = SimpleNamespace( + id=doc_id, + fname=f"{doc_id}.txt", + object_store_url=f"s3://bucket/{doc_id}.txt", + file_size=self.file_size_per_doc, ) - for i in ids - ] + self.calls.append(doc_id) + return self.documents[doc_id] -def test_batch_documents_even_chunks() -> None: - crud = FakeDocumentCrud() +def test_batch_documents_small_files_single_batch() -> None: + """Test that small files all fit in one batch (under 30 MB and under 200 docs).""" + crud = FakeDocumentCrud(file_size_per_doc=1024) # 1 KB per file ids = [uuid4() for _ in range(6)] - batches = helpers.batch_documents(crud, ids, batch_size=3) + batches = helpers.batch_documents(crud, ids) + + # All 6 small files should fit in one batch + assert len(batches) == 1 + assert len(batches[0]) == 6 + assert [d.id for d in batches[0]] == ids + + +def test_batch_documents_size_based_batching() -> None: + """Test that large files trigger size-based batching (30 MB limit).""" + # Each file is 20 MB, so max 1 file per batch (since 2 * 20 MB > 30 MB) + crud = FakeDocumentCrud(file_size_per_doc=20 * 1024 * 1024) + ids = [uuid4() for _ in range(3)] + batches = helpers.batch_documents(crud, ids) + + # Should create 3 batches, one for each 20 MB file + assert len(batches) == 3 + assert len(batches[0]) == 1 + assert len(batches[1]) == 1 + assert len(batches[2]) == 1 - # read_each called with chunks [0:3], [3:6] - assert crud.calls == [ids[0:3], ids[3:6]] - # output mirrors what read_each returned + +def test_batch_documents_count_based_batching() -> None: + 
"""Test that document count triggers batching (200 docs limit).""" + crud = FakeDocumentCrud(file_size_per_doc=100) # Small files + ids = [uuid4() for _ in range(250)] + batches = helpers.batch_documents(crud, ids) + + # Should create 2 batches: 200 + 50 assert len(batches) == 2 - assert [d.id for d in batches[0]] == ids[0:3] - assert [d.id for d in batches[1]] == ids[3:6] + assert len(batches[0]) == 200 + assert len(batches[1]) == 50 -def test_batch_documents_ragged_last_chunk() -> None: - crud = FakeDocumentCrud() +def test_batch_documents_mixed_size_batching() -> None: + """Test batching with files that fit multiple per batch but hit 30 MB limit.""" + # Each file is 15 MB, so 2 files = 30 MB (at limit), 3 files > 30 MB + crud = FakeDocumentCrud(file_size_per_doc=15 * 1024 * 1024) ids = [uuid4() for _ in range(5)] - batches = helpers.batch_documents(crud, ids, batch_size=2) + batches = helpers.batch_documents(crud, ids) + + # Should create 3 batches: [2 files, 2 files, 1 file] + assert len(batches) == 3 + assert len(batches[0]) == 2 # 30 MB total + assert len(batches[1]) == 2 # 30 MB total + assert len(batches[2]) == 1 # 15 MB total + + +def test_batch_documents_with_none_file_size() -> None: + """Test that documents with None file_size are treated as 0 bytes.""" + crud = FakeDocumentCrud(file_size_per_doc=None) + ids = [uuid4() for _ in range(10)] + batches = helpers.batch_documents(crud, ids) - assert crud.calls == [ids[0:2], ids[2:4], ids[4:5]] - assert [d.id for d in batches[0]] == ids[0:2] - assert [d.id for d in batches[1]] == ids[2:4] - assert [d.id for d in batches[2]] == ids[4:5] + # All files with None/0 size should fit in one batch (under both limits) + assert len(batches) == 1 + assert len(batches[0]) == 10 def test_batch_documents_empty_input() -> None: + """Test that empty input returns empty batches.""" crud = FakeDocumentCrud() - batches = helpers.batch_documents(crud, [], batch_size=3) + batches = helpers.batch_documents(crud, []) assert batches == [] assert crud.calls == [] From ff0726f7d1cafaa4e27c5202bd63c86bca353cf1 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:09:42 +0530 Subject: [PATCH 06/20] pr review fixes --- ...columns_to_collection_job_and_documents.py | 8 +- backend/app/api/docs/collections/create.md | 4 +- backend/app/api/routes/collections.py | 12 --- backend/app/api/routes/documents.py | 4 +- backend/app/models/collection_job.py | 8 +- backend/app/models/document.py | 8 +- .../services/collections/create_collection.py | 25 +++--- backend/app/services/collections/helpers.py | 39 ++++++---- .../services/collections/providers/base.py | 15 ++-- .../services/collections/providers/openai.py | 8 +- backend/app/services/documents/helpers.py | 18 ++--- .../services/collections/test_helpers.py | 77 +++++++++---------- 12 files changed, 101 insertions(+), 125 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py index 2925e0fa6..642c5445b 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -29,7 +29,7 @@ def upgrade(): op.add_column( "collection_jobs", sa.Column( - "total_size", + "total_size_mb", sa.Integer(), nullable=True, comment="Total size of documents being uploaded to collection", @@ -38,7 +38,7 @@ def upgrade(): op.add_column( "document", sa.Column( - "file_size", + 
"file_size_kb", sa.Integer(), nullable=True, comment="Size of the document in bytes", @@ -47,6 +47,6 @@ def upgrade(): def downgrade(): - op.drop_column("document", "file_size") - op.drop_column("collection_jobs", "total_size") + op.drop_column("document", "file_size_kb") + op.drop_column("collection_jobs", "total_size_mb") op.drop_column("collection_jobs", "docs_num") diff --git a/backend/app/api/docs/collections/create.md b/backend/app/api/docs/collections/create.md index a47383f21..bc94e7c45 100644 --- a/backend/app/api/docs/collections/create.md +++ b/backend/app/api/docs/collections/create.md @@ -5,8 +5,8 @@ pipeline: documents through the Documents module. * Documents are automatically batched when creating the vector store to optimize the upload process for large document sets. A new batch is created when either - the cumulative size reaches configured total size of documents given to upload to a vector store - or the document count reaches specific number of files in a batch, whichever limit is hit first. + the cumulative size reaches 30 MB of documents given to upload to a vector store + or the document count reaches 200 files in a batch, whichever limit is hit first. * [Deprecated] Attach the Vector Store to an OpenAI [Assistant](https://platform.openai.com/docs/api-reference/assistants). Use parameters in the request body relevant to an Assistant to flesh out diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 004251d6c..2aee46032 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -96,17 +96,6 @@ def create_collection( if request.name: ensure_unique_name(session, current_user.project_.id, request.name) - # Calculate total size of all documents - document_crud = DocumentCrud(session, current_user.project_.id) - total_size = 0 - for doc_id in request.documents: - doc = document_crud.read_one(doc_id) - total_size += doc.file_size or 0 - - logger.info( - f"[create_collection] Calculated total size | {{'total_documents': {len(request.documents)}, 'total_size_bytes': {total_size}, 'total_size_mb': {round(total_size / (1024 * 1024), 2)}}}" - ) - collection_job_crud = CollectionJobCrud(session, current_user.project_.id) collection_job = collection_job_crud.create( CollectionJobCreate( @@ -114,7 +103,6 @@ def create_collection( project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, docs_num=len(request.documents), - total_size=total_size, ) ) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index c8e17f503..7d32e417f 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -130,7 +130,7 @@ async def upload_doc( transformer=transformer, ) - file_size = await calculate_file_size(src) + file_size_kb = await calculate_file_size(src) storage = get_cloud_storage(session=session, project_id=current_user.project_.id) document_id = uuid4() @@ -140,7 +140,7 @@ async def upload_doc( document = Document( id=document_id, fname=src.filename, - file_size=file_size, + file_size_kb=file_size_kb, object_store_url=str(object_store_url), ) source_document = crud.update(document) diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index a4e857a1a..6caa7fd75 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -60,11 +60,11 @@ class CollectionJob(SQLModel, table=True): "comment": "Total number of documents to be processed in this job" }, ) - 
total_size: int | None = Field( + total_size_mb: float | None = Field( default=None, - description="Total size of documents being uploaded to collection", + description="Total size of documents being uploaded to collection in MB", sa_column_kwargs={ - "comment": "Total size of documents being uploaded to collection" + "comment": "Total size of documents being uploaded to collection in MB" }, ) error_message: str | None = Field( @@ -121,7 +121,6 @@ class CollectionJobCreate(SQLModel): status: CollectionJobStatus action_type: CollectionActionType docs_num: int | None = None - total_size: int | None = None project_id: int @@ -130,6 +129,7 @@ class CollectionJobUpdate(SQLModel): status: CollectionJobStatus | None = None error_message: str | None = None collection_id: UUID | None = None + total_size_mb: float | None = None trace_id: str | None = None diff --git a/backend/app/models/document.py b/backend/app/models/document.py index 3fbd71da2..12843e72a 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -1,8 +1,6 @@ from datetime import datetime -from typing import Any from uuid import UUID, uuid4 -from pydantic import model_serializer from sqlmodel import Field, SQLModel from app.core.util import now @@ -43,10 +41,10 @@ class Document(DocumentBase, table=True): default=False, sa_column_kwargs={"comment": "Soft delete flag"}, ) - file_size: int | None = Field( + file_size_kb: float | None = Field( default=None, - description="The size of the document in bytes", - sa_column_kwargs={"comment": "Size of the document in bytes"}, + description="The size of the document in kilobytes", + sa_column_kwargs={"comment": "Size of the document in kilobytes (KB)"}, ) # Foreign keys diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 35b03da80..c86d72d4d 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -168,6 +168,14 @@ def execute_job( job_uuid = UUID(job_id) + with Session(engine) as session: + document_crud = DocumentCrud(session, project_id) + flat_docs = document_crud.read_each(creation_request.documents) + + file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} + total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) + total_size_mb = total_size_kb / 1024 + with Session(engine) as session: collection_job_crud = CollectionJobCrud(session, project_id) collection_job = collection_job_crud.read_one(job_uuid) @@ -176,11 +184,11 @@ def execute_job( CollectionJobUpdate( task_id=task_id, status=CollectionJobStatus.PROCESSING, + total_size_mb=total_size_mb, ), ) storage = get_cloud_storage(session=session, project_id=project_id) - document_crud = DocumentCrud(session, project_id) provider = get_llm_provider( session=session, @@ -192,20 +200,12 @@ def execute_job( result = provider.create( collection_request=creation_request, storage=storage, - document_crud=document_crud, + documents=flat_docs, ) llm_service_id = result.llm_service_id llm_service_name = result.llm_service_name - with Session(engine) as session: - document_crud = DocumentCrud(session, project_id) - flat_docs = document_crud.read_each(creation_request.documents) - - file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." 
in doc.fname} + total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) + total_size_mb = total_size_kb / 1024 + with Session(engine) as session: collection_job_crud = CollectionJobCrud(session, project_id) collection_job = collection_job_crud.read_one(job_uuid) @@ -176,11 +184,11 @@ def execute_job( CollectionJobUpdate( task_id=task_id, status=CollectionJobStatus.PROCESSING, + total_size_mb=total_size_mb, ), ) storage = get_cloud_storage(session=session, project_id=project_id) - document_crud = DocumentCrud(session, project_id) provider = get_llm_provider( session=session, @@ -192,20 +200,12 @@ def execute_job( result = provider.create( collection_request=creation_request, storage=storage, - document_crud=document_crud, + documents=flat_docs, ) llm_service_id = result.llm_service_id llm_service_name = result.llm_service_name - with Session(engine) as session: - document_crud = DocumentCrud(session, project_id) - flat_docs = document_crud.read_each(creation_request.documents) - - file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - total_size_bytes = collection_job.total_size or 0 - total_size_mb = round(total_size_bytes / (1024 * 1024), 2) - with Session(engine) as session: collection_crud = CollectionCrud(session, project_id) @@ -239,12 +239,11 @@ def execute_job( elapsed = time.time() - start_time logger.info( - "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Total Size: %s MB (%s bytes) | Types: %s", + "[create_collection.execute_job] Collection created: %s | Time: %.2fs | Files: %d | Total Size: %s MB | Types: %s", collection_id, elapsed, len(flat_docs), - total_size_mb, - total_size_bytes, + collection_job.total_size_mb, list(file_exts), ) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 1b61900b2..9fd35e2ed 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -8,13 +8,18 @@ from fastapi import HTTPException from sqlmodel import select -from app.crud import DocumentCrud, CollectionCrud +from app.crud import CollectionCrud from app.api.deps import SessionDep from app.models import DocumentCollection, Collection, CollectionPublic, Document logger = logging.getLogger(__name__) +# Constants for dynamic batching of documents +# uploaded to the OpenAI vector store +MAX_BATCH_SIZE_KB = 30 * 1024  # 30 MB in KB +MAX_BATCH_COUNT = 200 + def get_service_name(provider: str) -> str: """Get the collection service name for a provider.""" @@ -55,47 +60,47 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents( - document_crud: DocumentCrud, documents: List[UUID] -) -> List[List[Document]]: +def batch_documents(documents: List[Document]) -> List[List[Document]]: """ Batch documents dynamically based on size and count limits.
Creates a new batch when either: - - Total size reaches 30 MB (31,457,280 bytes) + - Total size reaches 30 MB (30,720 KB) - Document count reaches 200 + Args: + documents: List of Document objects to batch + Returns: List of document batches """ - MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024 # 30 MB in bytes - MAX_BATCH_COUNT = 200 # Maximum documents per batch - docs_batches = [] current_batch = [] - current_batch_size = 0 + current_batch_size_kb = 0 - for doc_id in documents: - doc = document_crud.read_one(doc_id) - doc_size = doc.file_size or 0 + for doc in documents: + doc_size_kb = doc.file_size_kb or 0 - would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES + would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT if current_batch and (would_exceed_size or would_exceed_count): + docs_batches.append(current_batch) logger.info( - f"[batch_documents] Batch completed | {{'batch_num': {len(docs_batches) + 1}, 'doc_count': {len(current_batch)}, 'batch_size_bytes': {current_batch_size}, 'batch_size_mb': {round(current_batch_size / (1024 * 1024), 2)}}}" + f"[batch_documents] Batch completed | {{'batch_num': {len(docs_batches)}, 'doc_count': {len(current_batch)}, 'batch_size_mb': {round(current_batch_size_kb / 1024)}}}" ) - docs_batches.append(current_batch) current_batch = [] - current_batch_size = 0 + current_batch_size_kb = 0 current_batch.append(doc) - current_batch_size += doc_size + current_batch_size_kb += doc_size_kb if current_batch: docs_batches.append(current_batch) + logger.info( + f"[batch_documents] Final Batch completed | {{'batch_num': {len(docs_batches)}, 'doc_count': {len(current_batch)}, 'batch_size_mb': {round(current_batch_size_kb / 1024)}}}" + ) logger.info( f"[batch_documents] Batching complete | {{'total_batches': {len(docs_batches)}, 'total_documents': {len(documents)}}}" diff --git a/backend/app/services/collections/providers/base.py b/backend/app/services/collections/providers/base.py index d76fb6189..5607786b7 100644 --- a/backend/app/services/collections/providers/base.py +++ b/backend/app/services/collections/providers/base.py @@ -1,9 +1,8 @@ from abc import ABC, abstractmethod -from typing import Any +from typing import Any, List -from app.crud import DocumentCrud from app.core.cloud.storage import CloudStorage -from app.models import CreationRequest, Collection +from app.models import CreationRequest, Collection, Document class BaseProvider(ABC): @@ -32,21 +31,17 @@ def create( self, collection_request: CreationRequest, storage: CloudStorage, - document_crud: DocumentCrud, + documents: List[Document], ) -> Collection: """Create collection with documents and optionally an assistant. Args: collection_request: Collection parameters (name, description, document list, etc.) 
storage: Cloud storage instance for file access - document_crud: DocumentCrud instance for fetching documents - batch_size: Number of documents to process per batch - with_assistant: Whether to create an assistant/agent - assistant_options: Options for assistant creation (provider-specific) + documents: Pre-fetched list of Document objects to add to the collection Returns: - llm_service_id: ID of the resource to delete - llm_service_name: Name of the service (determines resource type) + Collection object with llm_service_id and llm_service_name populated """ raise NotImplementedError("Providers must implement execute method") diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index 558baf295..b0e418efd 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -1,13 +1,13 @@ import logging +from typing import List from openai import OpenAI from app.services.collections.providers import BaseProvider -from app.crud import DocumentCrud from app.core.cloud.storage import CloudStorage from app.crud.rag import OpenAIVectorStoreCrud, OpenAIAssistantCrud from app.services.collections.helpers import batch_documents, get_service_name -from app.models import CreationRequest, Collection +from app.models import CreationRequest, Collection, Document logger = logging.getLogger(__name__) @@ -24,13 +24,13 @@ def create( self, collection_request: CreationRequest, storage: CloudStorage, - document_crud: DocumentCrud, + documents: List[Document], ) -> Collection: """ Create OpenAI vector store with documents and optionally an assistant. """ try: - docs_batches = batch_documents(document_crud, collection_request.documents) + docs_batches = batch_documents(documents) vector_store_crud = OpenAIVectorStoreCrud(self.client) vector_store = vector_store_crud.create() diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index 578380a5a..78619a6e9 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -23,26 +23,24 @@ ) -async def calculate_file_size(file: UploadFile) -> int: +async def calculate_file_size(file: UploadFile) -> float: """ - Calculate the size of an uploaded file in bytes. + Calculate the size of an uploaded file in kilobytes. 
Args: file: The uploaded file from FastAPI Returns: - The size of the file in bytes + The size of the file in kilobytes (KB) as a whole number """ if file.size: - return file.size + return round(file.size / 1024) - # If size is not available, calculate by reading the file - await file.seek(0) - content = await file.read() - size_bytes = len(content) - await file.seek(0) # Reset to beginning for subsequent operations + file.file.seek(0, 2) + size_bytes = file.file.tell() + file.file.seek(0) - return size_bytes + return round(size_bytes / 1024) def pre_transform_validation( diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index 02e64a4c6..f2af4b4d9 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -45,47 +45,45 @@ def test_extract_error_message_handles_non_matching_bodies() -> None: # batch documents -class FakeDocumentCrud: - def __init__(self, file_size_per_doc=1024): - """ - Args: - file_size_per_doc: Size in bytes for each fake document (default 1 KB) - """ - self.calls = [] - self.file_size_per_doc = file_size_per_doc - self.documents = {} - - def read_one(self, doc_id): - """Simulate reading a single document by ID.""" - if doc_id not in self.documents: - self.documents[doc_id] = SimpleNamespace( - id=doc_id, - fname=f"{doc_id}.txt", - object_store_url=f"s3://bucket/{doc_id}.txt", - file_size=self.file_size_per_doc, - ) - self.calls.append(doc_id) - return self.documents[doc_id] +def create_fake_documents( + count: int, file_size_kb: float | None = 1 +) -> list[SimpleNamespace]: + """Create fake document objects for testing. + + Args: + count: Number of documents to create + file_size_kb: Size in KB for each document (default 1 KB) + + Returns: + List of SimpleNamespace objects mimicking Document objects + """ + return [ + SimpleNamespace( + id=uuid4(), + fname=f"doc_{i}.txt", + object_store_url=f"s3://bucket/doc_{i}.txt", + file_size_kb=file_size_kb, + ) + for i in range(count) + ] def test_batch_documents_small_files_single_batch() -> None: """Test that small files all fit in one batch (under 30 MB and under 200 docs).""" - crud = FakeDocumentCrud(file_size_per_doc=1024) # 1 KB per file - ids = [uuid4() for _ in range(6)] - batches = helpers.batch_documents(crud, ids) + docs = create_fake_documents(6, file_size_kb=1) # 1 KB per file + batches = helpers.batch_documents(docs) # All 6 small files should fit in one batch assert len(batches) == 1 assert len(batches[0]) == 6 - assert [d.id for d in batches[0]] == ids + assert [d.id for d in batches[0]] == [d.id for d in docs] def test_batch_documents_size_based_batching() -> None: """Test that large files trigger size-based batching (30 MB limit).""" - # Each file is 20 MB, so max 1 file per batch (since 2 * 20 MB > 30 MB) - crud = FakeDocumentCrud(file_size_per_doc=20 * 1024 * 1024) - ids = [uuid4() for _ in range(3)] - batches = helpers.batch_documents(crud, ids) + # Each file is 20 MB (20480 KB), so max 1 file per batch (since 2 * 20 MB > 30 MB) + docs = create_fake_documents(3, file_size_kb=20 * 1024) + batches = helpers.batch_documents(docs) # Should create 3 batches, one for each 20 MB file assert len(batches) == 3 @@ -96,9 +94,8 @@ def test_batch_documents_size_based_batching() -> None: def test_batch_documents_count_based_batching() -> None: """Test that document count triggers batching (200 docs limit).""" - crud = FakeDocumentCrud(file_size_per_doc=100) # Small files - ids = 
[uuid4() for _ in range(250)] - batches = helpers.batch_documents(crud, ids) + docs = create_fake_documents(250, file_size_kb=0.1) # Small files + batches = helpers.batch_documents(docs) # Should create 2 batches: 200 + 50 assert len(batches) == 2 @@ -108,10 +105,9 @@ def test_batch_documents_count_based_batching() -> None: def test_batch_documents_mixed_size_batching() -> None: """Test batching with files that fit multiple per batch but hit 30 MB limit.""" - # Each file is 15 MB, so 2 files = 30 MB (at limit), 3 files > 30 MB - crud = FakeDocumentCrud(file_size_per_doc=15 * 1024 * 1024) - ids = [uuid4() for _ in range(5)] - batches = helpers.batch_documents(crud, ids) + # Each file is 15 MB (15360 KB), so 2 files = 30 MB (at limit), 3 files > 30 MB + docs = create_fake_documents(5, file_size_kb=15 * 1024) + batches = helpers.batch_documents(docs) # Should create 3 batches: [2 files, 2 files, 1 file] assert len(batches) == 3 @@ -122,9 +118,8 @@ def test_batch_documents_mixed_size_batching() -> None: def test_batch_documents_with_none_file_size() -> None: """Test that documents with None file_size are treated as 0 bytes.""" - crud = FakeDocumentCrud(file_size_per_doc=None) - ids = [uuid4() for _ in range(10)] - batches = helpers.batch_documents(crud, ids) + docs = create_fake_documents(10, file_size_kb=None) + batches = helpers.batch_documents(docs) # All files with None/0 size should fit in one batch (under both limits) assert len(batches) == 1 @@ -133,10 +128,8 @@ def test_batch_documents_with_none_file_size() -> None: def test_batch_documents_empty_input() -> None: """Test that empty input returns empty batches.""" - crud = FakeDocumentCrud() - batches = helpers.batch_documents(crud, []) + batches = helpers.batch_documents([]) assert batches == [] - assert crud.calls == [] def test_ensure_unique_name_success(db: Session) -> None: From c7da9f02dd64c0769554d62276b0211e6dbece5c Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:18:41 +0530 Subject: [PATCH 07/20] coderabbit review --- backend/app/services/collections/providers/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/services/collections/providers/base.py b/backend/app/services/collections/providers/base.py index 5607786b7..36283d1fa 100644 --- a/backend/app/services/collections/providers/base.py +++ b/backend/app/services/collections/providers/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, List +from typing import Any from app.core.cloud.storage import CloudStorage from app.models import CreationRequest, Collection, Document @@ -31,7 +31,7 @@ def create( self, collection_request: CreationRequest, storage: CloudStorage, - documents: List[Document], + documents: list[Document], ) -> Collection: """Create collection with documents and optionally an assistant. 
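The greedy batching rule these patches converge on can be sketched independently of the app's models and CRUD layer. Below is a minimal sketch, not the code from `helpers.py` itself: the `Doc` dataclass is a hypothetical stand-in for `app.models.Document`, `batch` mirrors the logic of `batch_documents`, and the limits mirror `MAX_BATCH_SIZE_KB` and `MAX_BATCH_COUNT`.

```python
from dataclasses import dataclass

MAX_BATCH_SIZE_KB = 30 * 1024  # 30 MB expressed in KB
MAX_BATCH_COUNT = 200  # maximum number of documents per batch


@dataclass
class Doc:
    # Hypothetical stand-in for app.models.Document
    fname: str
    file_size_kb: float | None = None


def batch(docs: list[Doc]) -> list[list[Doc]]:
    """Greedily pack docs into batches under the size and count limits."""
    batches: list[list[Doc]] = []
    current: list[Doc] = []
    current_kb = 0.0
    for doc in docs:
        size_kb = doc.file_size_kb or 0  # a missing size is treated as 0 KB
        # Close the open batch if adding this doc would break either limit;
        # a single oversized doc still gets a batch of its own.
        if current and (
            current_kb + size_kb > MAX_BATCH_SIZE_KB
            or len(current) >= MAX_BATCH_COUNT
        ):
            batches.append(current)
            current, current_kb = [], 0.0
        current.append(doc)
        current_kb += size_kb
    if current:
        batches.append(current)
    return batches


# Three 20 MB files -> three single-file batches, since any two exceed 30 MB.
docs = [Doc(f"doc_{i}.pdf", 20 * 1024) for i in range(3)]
assert [len(b) for b in batch(docs)] == [1, 1, 1]
```

Note the strict greater-than on the size check: two 15 MB files share one batch (exactly 30 MB), the behavior `test_batch_documents_mixed_size_batching` pins down, while the count check closes a batch as soon as it already holds 200 documents.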
From 8437442dff3b4d96cc663246212e42146199b2a8 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:48:50 +0530 Subject: [PATCH 08/20] removing batch_size from request --- backend/app/api/routes/collections.py | 15 ++++++++------- backend/app/models/collection.py | 9 --------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 2aee46032..fd70f3957 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -120,14 +120,15 @@ def create_collection( with_assistant=with_assistant, ) - metadata = None + metadata = {} + if not with_assistant: - metadata = { - "note": ( - "This job will create a vector store only (no Assistant). " - "Assistant creation happens when both 'model' and 'instructions' are included." - ) - } + metadata["assistant_note"] = ( + "This job will create a vector store only (no Assistant). " + "Assistant creation happens when both 'model' and 'instructions' are included." + ) + + metadata = metadata if metadata else None return APIResponse.success_response( CollectionJobImmediatePublic.model_validate(collection_job), metadata=metadata diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 0b00639d8..4a77f0200 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -102,15 +102,6 @@ class CollectionOptions(SQLModel): documents: list[UUID] = Field( description="List of document IDs", ) - batch_size: int = Field( - default=10, - description=( - "**[Deprecated]** " - "Number of documents to send to OpenAI in a single " - "transaction. See the `file_ids` parameter in the " - "vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)." - ), - ) def model_post_init(self, __context: Any): self.documents = list(set(self.documents)) From f94998f06ecbbf7383f49214d3531f79085dde1a Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 25 Mar 2026 19:58:47 +0530 Subject: [PATCH 09/20] for fixing test cases --- backend/app/api/routes/collections.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index fd70f3957..2aee46032 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -120,15 +120,14 @@ def create_collection( with_assistant=with_assistant, ) - metadata = {} - + metadata = None if not with_assistant: - metadata["assistant_note"] = ( - "This job will create a vector store only (no Assistant). " - "Assistant creation happens when both 'model' and 'instructions' are included." - ) - - metadata = metadata if metadata else None + metadata = { + "note": ( + "This job will create a vector store only (no Assistant). " + "Assistant creation happens when both 'model' and 'instructions' are included." 
+ ) + } return APIResponse.success_response( CollectionJobImmediatePublic.model_validate(collection_job), metadata=metadata From 5ec17be19d61be63da1ff89931db8565567b0fbe Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 27 Mar 2026 13:43:52 +0530 Subject: [PATCH 10/20] size limit on doc upload --- ...columns_to_collection_job_and_documents.py | 14 +++++++++-- backend/app/api/routes/collections.py | 1 + backend/app/api/routes/documents.py | 15 +++++++++++- backend/app/core/cloud/storage.py | 24 +++++++++++++++++++ backend/app/crud/rag/open_ai.py | 18 ++++---------- backend/app/models/collection.py | 10 ++++---- .../services/collections/create_collection.py | 2 +- backend/app/services/collections/helpers.py | 10 +++++--- 8 files changed, 68 insertions(+), 26 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py index 642c5445b..3ddcd00b1 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -30,7 +30,7 @@ def upgrade(): "collection_jobs", sa.Column( "total_size_mb", - sa.Integer(), + sa.Float(), nullable=True, comment="Total size of documents being uploaded to collection", ), @@ -39,14 +39,24 @@ def upgrade(): "document", sa.Column( "file_size_kb", - sa.Integer(), + sa.Float(), nullable=True, comment="Size of the document in bytes", ), ) + op.add_column( + "collection_jobs", + sa.Column( + "documents", + sa.JSON(), + nullable=True, + comment="JSON array of document UUIDs included in this job", + ), + ) def downgrade(): op.drop_column("document", "file_size_kb") op.drop_column("collection_jobs", "total_size_mb") op.drop_column("collection_jobs", "docs_num") + op.drop_column("collection_jobs", "documents") diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 2aee46032..332e1da8c 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -103,6 +103,7 @@ def create_collection( project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, docs_num=len(request.documents), + documents=[str(doc_id) for doc_id in request.documents], ) ) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 7d32e417f..a64592702 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -12,6 +12,7 @@ UploadFile, ) from fastapi import Path as FastPath +from fastapi import HTTPException from app.api.deps import AuthContextDep, SessionDep from app.api.permissions import Permission, require_permission @@ -27,7 +28,7 @@ DocTransformationJobPublic, ) from app.core.cloud import get_cloud_storage -from app.services.collections.helpers import pick_service_for_documennt +from app.services.collections.helpers import pick_service_for_documennt, MAX_DOC_SIZE_MB from app.services.documents.helpers import ( calculate_file_size, schedule_transformation, @@ -131,6 +132,18 @@ async def upload_doc( ) file_size_kb = await calculate_file_size(src) + file_size_mb = file_size_kb / 1024 + + if file_size_mb > MAX_DOC_SIZE_MB: + logger.warning( + f"[upload_doc] Document size exceeds limit | " + f"{{'filename': '{src.filename}', 'size_mb': {round(file_size_mb, 2)}, 'max_size_mb': {MAX_DOC_SIZE_MB}}}" + ) + raise HTTPException( + status_code=413, + detail=f"Document size ({round(file_size_mb, 2)} MB) exceeds the maximum 
allowed size of {MAX_DOC_SIZE_MB} MB. " + f"Please upload a smaller file.", + ) storage = get_cloud_storage(session=session, project_id=current_user.project_.id) document_id = uuid4() diff --git a/backend/app/core/cloud/storage.py b/backend/app/core/cloud/storage.py index 727380726..a57273a06 100644 --- a/backend/app/core/cloud/storage.py +++ b/backend/app/core/cloud/storage.py @@ -125,6 +125,11 @@ def stream(self, url: str) -> StreamingBody: """Stream a file from storage""" pass + @abstractmethod + def get(self, url: str) -> bytes: + """Get file contents as bytes (for files that fit in memory)""" + pass + @abstractmethod def get_file_size_kb(self, url: str) -> float: """Return the file size in KB""" @@ -193,6 +198,25 @@ def stream(self, url: str) -> StreamingBody: ) raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err + def get(self, url: str) -> bytes: + name = SimpleStorageName.from_url(url) + kwargs = asdict(name) + try: + body = self.aws.client.get_object(**kwargs).get("Body") + content = body.read() + logger.info( + f"[AmazonCloudStorage.get] File retrieved successfully | " + f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'size_bytes': {len(content)}}}" + ) + return content + except ClientError as err: + logger.error( + f"[AmazonCloudStorage.get] AWS get error | " + f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}", + exc_info=True, + ) + raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err + def get_file_size_kb(self, url: str) -> float: name = SimpleStorageName.from_url(url) kwargs = asdict(name) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index 77c0bbdc1..da3bdb198 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -1,6 +1,7 @@ import json import logging import functools as ft +from io import BytesIO from typing import Iterable from openai import OpenAI, OpenAIError @@ -121,15 +122,13 @@ def update( storage: CloudStorage, documents: Iterable[Document], ): - files = [] for docs in documents: + files = [] for d in docs: - f_obj = storage.stream(d.object_store_url) - - # monkey patch botocore.response.StreamingBody to make - # OpenAI happy + # Get file bytes and wrap in BytesIO for OpenAI API + content = storage.get(d.object_store_url) + f_obj = BytesIO(content) f_obj.name = d.fname - files.append(f_obj) logger.info( @@ -149,13 +148,6 @@ def update( ) raise InterruptedError(error_msg) - while files: - f_obj = files.pop() - f_obj.close() - logger.info( - f"[OpenAIVectorStoreCrud.update] Closed file stream | {{'vector_store_id': '{vector_store_id}', 'filename': '{f_obj.name}'}}" - ) - yield from docs def delete(self, vector_store_id: str, retries: int = 3): diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 4a77f0200..c0d3e3027 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -39,12 +39,10 @@ class Collection(SQLModel, table=True): description="Unique identifier for the collection", sa_column_kwargs={"comment": "Unique identifier for the collection"}, ) - provider: ProviderType = ( - Field( - nullable=False, - description="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'google', etc)", - sa_column_kwargs={"comment": "LLM provider used for this collection"}, - ), + provider: ProviderType = Field( + nullable=False, + description="LLM provider used for 
this collection (e.g., 'openai', 'bedrock', 'google', etc)", + sa_column_kwargs={"comment": "LLM provider used for this collection"}, ) llm_service_id: str = Field( nullable=False, diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index efa6ea3a8..4d897adae 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -173,7 +173,7 @@ def execute_job( file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) - total_size_mb = total_size_kb / 1024 + total_size_mb = round(total_size_kb / 1024, 2) with Session(engine) as session: collection_job_crud = CollectionJobCrud(session, project_id) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 9fd35e2ed..4d469ba68 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -16,9 +16,13 @@ logger = logging.getLogger(__name__) # Necessary Constants - -# for dynamic batching of documents to upload to openai vector store -MAX_BATCH_SIZE_KB = 30 * 1024 # 30 MB in KB -MAX_BATCH_COUNT = 200 +# Maximum individual document size (must be less than batch size) +MAX_DOC_SIZE_MB = 25 # 25 MB maximum per document + +# Maximum batch size for uploading documents to vector store +# Derived from MAX_DOC_SIZE + buffer to ensure single docs always fit +MAX_BATCH_SIZE_KB = (MAX_DOC_SIZE_MB + 5) * 1024 # 30 MB in KB (25 + 5 MB buffer) +MAX_BATCH_COUNT = 200 # Maximum documents per batch def get_service_name(provider: str) -> str: From 3246e880b8e85b05c850d624067b4085fecd3388 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 27 Mar 2026 14:58:48 +0530 Subject: [PATCH 11/20] test coverage increase --- .../services/collections/test_helpers.py | 114 +++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index f2af4b4d9..7cddaf305 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -11,7 +11,13 @@ from app.services.collections import helpers from app.tests.utils.utils import get_project from app.tests.utils.collection import get_vector_store_collection -from app.services.collections.helpers import ensure_unique_name +from app.services.collections.helpers import ( + ensure_unique_name, + get_service_name, + to_collection_public, +) +from app.models import Collection, ProviderType +from app.core.util import now def test_extract_error_message_parses_json_and_strips_prefix() -> None: @@ -167,3 +173,109 @@ def test_ensure_unique_name_conflict_with_vector_store_collection(db: Session) - assert exc.value.status_code == 409 assert "already exists" in exc.value.detail + + +# get_service_name + + +def test_get_service_name_openai() -> None: + """Test that OpenAI provider returns correct service name.""" + result = get_service_name("openai") + assert result == "openai vector store" + + +def test_get_service_name_case_insensitive() -> None: + """Test that provider name is case-insensitive.""" + assert get_service_name("OpenAI") == "openai vector store" + assert get_service_name("OPENAI") == "openai vector store" + assert get_service_name("OpEnAi") == "openai vector store" + + +def test_get_service_name_unknown_provider() -> None: + """Test 
that unknown providers return empty string.""" + assert get_service_name("unknown") == "" + assert get_service_name("bedrock") == "" # Commented out in the mapping + assert get_service_name("gemini") == "" # Commented out in the mapping + assert get_service_name("") == "" + + +# to_collection_public + + +def test_to_collection_public_vector_store() -> None: + """Test conversion of vector store collection to public model.""" + collection = Collection( + id=uuid4(), + project_id=1, + provider=ProviderType.openai, + llm_service_id="vs_123", + llm_service_name="openai vector store", # Matches get_service_name("openai") + name="Test Collection", + description="Test description", + inserted_at=now(), + updated_at=now(), + deleted_at=None, + ) + + result = to_collection_public(collection) + + # For vector store, should map to knowledge_base fields + assert result.id == collection.id + assert result.knowledge_base_id == "vs_123" + assert result.knowledge_base_provider == "openai vector store" + assert result.llm_service_id is None + assert result.llm_service_name is None + assert result.project_id == 1 + assert result.inserted_at == collection.inserted_at + assert result.updated_at == collection.updated_at + assert result.deleted_at is None + + +def test_to_collection_public_assistant() -> None: + """Test conversion of assistant collection to public model.""" + collection = Collection( + id=uuid4(), + project_id=2, + provider=ProviderType.openai, + llm_service_id="asst_456", + llm_service_name="gpt-4", # Does NOT match vector store name + name="Assistant Collection", + description="Assistant description", + inserted_at=now(), + updated_at=now(), + deleted_at=None, + ) + + result = to_collection_public(collection) + + # For assistant, should map to llm_service fields + assert result.id == collection.id + assert result.llm_service_id == "asst_456" + assert result.llm_service_name == "gpt-4" + assert result.knowledge_base_id is None + assert result.knowledge_base_provider is None + assert result.project_id == 2 + assert result.inserted_at == collection.inserted_at + assert result.updated_at == collection.updated_at + assert result.deleted_at is None + + +def test_to_collection_public_with_deleted_at() -> None: + """Test that deleted_at field is properly included when set.""" + deleted_time = now() + collection = Collection( + id=uuid4(), + project_id=3, + provider=ProviderType.openai, + llm_service_id="vs_789", + llm_service_name="openai vector store", + name="Deleted Collection", + description="Deleted", + inserted_at=now(), + updated_at=now(), + deleted_at=deleted_time, + ) + + result = to_collection_public(collection) + + assert result.deleted_at == deleted_time From 9be2e06665c9a38c82ebc3057d4208eeaae9b911 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 11:06:04 +0530 Subject: [PATCH 12/20] pr review fixes --- ..._columns_to_collection_job_and_documents.py | 18 +++++++++--------- backend/app/api/routes/collection_job.py | 1 - backend/app/api/routes/collections.py | 7 ++++--- backend/app/crud/rag/open_ai.py | 1 - backend/app/models/collection.py | 2 +- backend/app/models/collection_job.py | 10 +++++++++- .../services/collections/create_collection.py | 1 - backend/app/services/collections/helpers.py | 3 +-- .../collections/test_create_collections.py | 8 ++++---- 9 files changed, 28 insertions(+), 23 deletions(-) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py 
index 3ddcd00b1..ddbb77516 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py @@ -32,25 +32,25 @@ def upgrade(): "total_size_mb", sa.Float(), nullable=True, - comment="Total size of documents being uploaded to collection", + comment="Total size of documents being uploaded to collection in MB", ), ) op.add_column( - "document", + "collection_jobs", sa.Column( - "file_size_kb", - sa.Float(), + "documents", + sa.JSON(), nullable=True, - comment="Size of the document in bytes", + comment="List of documents given to make collection", ), ) op.add_column( - "collection_jobs", + "document", sa.Column( - "documents", - sa.JSON(), + "file_size_kb", + sa.Float(), nullable=True, - comment="JSON array of document UUIDs included in this job", + comment="Size of the document in kilobytes (KB)", ), ) diff --git a/backend/app/api/routes/collection_job.py b/backend/app/api/routes/collection_job.py index c586a3cc9..792814688 100644 --- a/backend/app/api/routes/collection_job.py +++ b/backend/app/api/routes/collection_job.py @@ -17,7 +17,6 @@ CollectionActionType, CollectionJobPublic, ) -from app.models.collection import CollectionPublic from app.utils import APIResponse, load_description from app.services.collections.helpers import extract_error_message, to_collection_public diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 332e1da8c..28155c9b5 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -11,7 +11,6 @@ CollectionCrud, CollectionJobCrud, DocumentCollectionCrud, - DocumentCrud, ) from app.core.cloud import get_cloud_storage from app.models import ( @@ -96,14 +95,16 @@ def create_collection( if request.name: ensure_unique_name(session, current_user.project_.id, request.name) + unique_documents = list(dict.fromkeys(request.documents)) + collection_job_crud = CollectionJobCrud(session, current_user.project_.id) collection_job = collection_job_crud.create( CollectionJobCreate( action_type=CollectionActionType.CREATE, project_id=current_user.project_.id, status=CollectionJobStatus.PENDING, - docs_num=len(request.documents), - documents=[str(doc_id) for doc_id in request.documents], + docs_num=len(unique_documents), + documents=[str(doc_id) for doc_id in unique_documents], ) ) diff --git a/backend/app/crud/rag/open_ai.py b/backend/app/crud/rag/open_ai.py index da3bdb198..cdae82440 100644 --- a/backend/app/crud/rag/open_ai.py +++ b/backend/app/crud/rag/open_ai.py @@ -8,7 +8,6 @@ from pydantic import BaseModel from app.core.cloud import CloudStorage -from app.core.config import settings from app.models import Document logger = logging.getLogger(__name__) diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index c0d3e3027..ccd606deb 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -4,7 +4,7 @@ from uuid import UUID, uuid4 from pydantic import HttpUrl, model_validator, model_serializer -from sqlalchemy import UniqueConstraint, Index, text +from sqlalchemy import Index, text from sqlmodel import Field, Relationship, SQLModel from app.core.util import now diff --git a/backend/app/models/collection_job.py b/backend/app/models/collection_job.py index 6caa7fd75..333ebfd14 100644 --- a/backend/app/models/collection_job.py +++ b/backend/app/models/collection_job.py @@ -2,7 +2,8 @@ from enum import Enum from uuid import UUID, uuid4 
-from sqlmodel import Column, Field, SQLModel, Text +from pydantic import field_validator +from sqlmodel import JSON, Column, Field, SQLModel, Text from app.core.util import now from app.models.collection import CollectionIDPublic, CollectionPublic @@ -73,6 +74,12 @@ class CollectionJob(SQLModel, table=True): Text, nullable=True, comment="Error message if the job failed" ), ) + documents: list[str] | None = Field( + default=None, + sa_column=Column( + JSON, nullable=True, comment="List of documents given to make collection" + ), + ) # Foreign keys collection_id: UUID | None = Field( @@ -122,6 +129,7 @@ class CollectionJobCreate(SQLModel): action_type: CollectionActionType docs_num: int | None = None project_id: int + documents: list[str] | None = None class CollectionJobUpdate(SQLModel): diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 4d897adae..eb37fd039 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -18,7 +18,6 @@ CollectionJob, Collection, CollectionJobUpdate, - CollectionPublic, CollectionJobPublic, CreationRequest, ) diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 4d469ba68..6985ac78e 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -3,7 +3,6 @@ import ast import re from uuid import UUID -from typing import List from fastapi import HTTPException from sqlmodel import select @@ -64,7 +63,7 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def batch_documents(documents: List[Document]) -> List[List[Document]]: +def batch_documents(documents: list[Document]) -> list[list[Document]]: """ Batch documents dynamically based on size and count limits. 
diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index 189a79ccb..b51631939 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -16,7 +16,7 @@ def _extract_metadata(body: dict) -> dict | None: def _create_test_document( - db: Session, project_id: int, file_size: int = 1024 + db: Session, project_id: int, file_size: float = 1 ) -> Document: """Helper to create a test document.""" doc = Document( @@ -24,7 +24,7 @@ def _create_test_document( fname="test_document.txt", object_store_url="s3://test-bucket/test_document.txt", project_id=project_id, - file_size=file_size, + file_size_kb=file_size, ) db.add(doc) db.commit() @@ -41,7 +41,7 @@ def test_collection_creation_with_assistant_calls_start_job_and_returns_job( db: Session, ) -> None: # Create a test document in the database - doc = _create_test_document(db, user_api_key.project_id, file_size=2048) + doc = _create_test_document(db, user_api_key.project_id, file_size=2) creation_data = CreationRequest( model="gpt-4o", @@ -91,7 +91,7 @@ def test_collection_creation_vector_only_adds_metadata_and_sets_with_assistant_f db: Session, ) -> None: # Create a test document in the database - doc = _create_test_document(db, user_api_key.project_id, file_size=5120) + doc = _create_test_document(db, user_api_key.project_id, file_size=5) creation_data = CreationRequest( temperature=0.000001, From db593e213d3aee373d05ebed6bbd4593be2c2bae Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 11:56:56 +0530 Subject: [PATCH 13/20] adding doc helper test cases --- .../app/tests/services/documents/__init__.py | 0 .../tests/services/documents/test_helpers.py | 83 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 backend/app/tests/services/documents/__init__.py create mode 100644 backend/app/tests/services/documents/test_helpers.py diff --git a/backend/app/tests/services/documents/__init__.py b/backend/app/tests/services/documents/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/tests/services/documents/test_helpers.py b/backend/app/tests/services/documents/test_helpers.py new file mode 100644 index 000000000..9d3eec363 --- /dev/null +++ b/backend/app/tests/services/documents/test_helpers.py @@ -0,0 +1,83 @@ +from io import BytesIO + +import pytest +from fastapi import UploadFile + +from app.services.documents.helpers import calculate_file_size + + +def make_upload_file(content: bytes, size: int | None = None) -> UploadFile: + """Create an UploadFile with the given content and optional pre-set size.""" + file = UploadFile(file=BytesIO(content), size=size) + return file + + +class TestCalculateFileSizeWithSizeAttribute: + @pytest.mark.anyio + async def test_uses_size_attribute_when_set(self) -> None: + """Uses file.size directly when it is provided.""" + file = make_upload_file(b"irrelevant", size=2048) + result = await calculate_file_size(file) + assert result == 2 # 2048 / 1024 = 2.0 + + @pytest.mark.anyio + async def test_rounds_fractional_kb(self) -> None: + """Rounds the result when size is not an exact multiple of 1024.""" + file = make_upload_file(b"irrelevant", size=1536) # 1.5 KB → rounds to 2 + result = await calculate_file_size(file) + assert result == 2 + + @pytest.mark.anyio + async def test_rounds_down_fractional_kb(self) -> None: + """Rounds down when fractional part is below 
.5.""" + file = make_upload_file(b"irrelevant", size=1300) # ~1.27 KB → rounds to 1 + result = await calculate_file_size(file) + assert result == 1 + + @pytest.mark.anyio + async def test_large_file_size(self) -> None: + """Correctly converts large sizes.""" + file = make_upload_file(b"irrelevant", size=10 * 1024 * 1024) # 10 MB + result = await calculate_file_size(file) + assert result == 10 * 1024 # 10240 KB + + +class TestCalculateFileSizeViaSeek: + @pytest.mark.anyio + async def test_falls_back_to_seek_when_size_is_none(self) -> None: + """Falls back to seek/tell when file.size is None.""" + content = b"x" * 2048 + file = make_upload_file(content, size=None) + result = await calculate_file_size(file) + assert result == 2 # 2048 / 1024 = 2 + + @pytest.mark.anyio + async def test_falls_back_to_seek_when_size_is_zero(self) -> None: + """Falls back to seek/tell when file.size is 0 (falsy).""" + content = b"x" * 3072 + file = make_upload_file(content, size=0) + result = await calculate_file_size(file) + assert result == 3 # 3072 / 1024 = 3 + + @pytest.mark.anyio + async def test_resets_file_pointer_after_seek(self) -> None: + """File pointer is back at position 0 after size calculation.""" + content = b"hello world" + file = make_upload_file(content, size=None) + await calculate_file_size(file) + assert file.file.tell() == 0 + + @pytest.mark.anyio + async def test_seek_with_fractional_kb(self) -> None: + """Rounds correctly when content size is not a multiple of 1024.""" + content = b"x" * 1600 # ~1.56 KB → rounds to 2 + file = make_upload_file(content, size=None) + result = await calculate_file_size(file) + assert result == 2 + + @pytest.mark.anyio + async def test_empty_file_via_seek(self) -> None: + """Returns 0 for an empty file when size is None.""" + file = make_upload_file(b"", size=None) + result = await calculate_file_size(file) + assert result == 0 From c775c90606e89bb953eb2636f76f2bf6a452fa30 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 12:44:06 +0530 Subject: [PATCH 14/20] making file size function non-async --- backend/app/api/routes/documents.py | 2 +- backend/app/services/documents/helpers.py | 2 +- .../tests/services/documents/test_helpers.py | 69 +++++++------------ 3 files changed, 25 insertions(+), 48 deletions(-) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 8d9dadc30..3c02ee4a6 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -131,7 +131,7 @@ async def upload_doc( transformer=transformer, ) - file_size_kb = await calculate_file_size(src) + file_size_kb = calculate_file_size(src) file_size_mb = file_size_kb / 1024 if file_size_mb > MAX_DOC_SIZE_MB: diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index 78619a6e9..7d78b6160 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -23,7 +23,7 @@ ) -async def calculate_file_size(file: UploadFile) -> float: +def calculate_file_size(file: UploadFile) -> float: """ Calculate the size of an uploaded file in kilobytes. 
diff --git a/backend/app/tests/services/documents/test_helpers.py b/backend/app/tests/services/documents/test_helpers.py index 9d3eec363..9a4b25344 100644 --- a/backend/app/tests/services/documents/test_helpers.py +++ b/backend/app/tests/services/documents/test_helpers.py @@ -1,6 +1,5 @@ from io import BytesIO -import pytest from fastapi import UploadFile from app.services.documents.helpers import calculate_file_size @@ -8,76 +7,54 @@ def make_upload_file(content: bytes, size: int | None = None) -> UploadFile: """Create an UploadFile with the given content and optional pre-set size.""" - file = UploadFile(file=BytesIO(content), size=size) - return file + return UploadFile(file=BytesIO(content), size=size) class TestCalculateFileSizeWithSizeAttribute: - @pytest.mark.anyio - async def test_uses_size_attribute_when_set(self) -> None: + def test_uses_size_attribute_when_set(self) -> None: """Uses file.size directly when it is provided.""" file = make_upload_file(b"irrelevant", size=2048) - result = await calculate_file_size(file) - assert result == 2 # 2048 / 1024 = 2.0 + assert calculate_file_size(file) == 2 # 2048 / 1024 = 2.0 - @pytest.mark.anyio - async def test_rounds_fractional_kb(self) -> None: + def test_rounds_fractional_kb(self) -> None: """Rounds the result when size is not an exact multiple of 1024.""" file = make_upload_file(b"irrelevant", size=1536) # 1.5 KB → rounds to 2 - result = await calculate_file_size(file) - assert result == 2 + assert calculate_file_size(file) == 2 - @pytest.mark.anyio - async def test_rounds_down_fractional_kb(self) -> None: + def test_rounds_down_fractional_kb(self) -> None: """Rounds down when fractional part is below .5.""" file = make_upload_file(b"irrelevant", size=1300) # ~1.27 KB → rounds to 1 - result = await calculate_file_size(file) - assert result == 1 + assert calculate_file_size(file) == 1 - @pytest.mark.anyio - async def test_large_file_size(self) -> None: + def test_large_file_size(self) -> None: """Correctly converts large sizes.""" file = make_upload_file(b"irrelevant", size=10 * 1024 * 1024) # 10 MB - result = await calculate_file_size(file) - assert result == 10 * 1024 # 10240 KB + assert calculate_file_size(file) == 10 * 1024 # 10240 KB class TestCalculateFileSizeViaSeek: - @pytest.mark.anyio - async def test_falls_back_to_seek_when_size_is_none(self) -> None: + def test_falls_back_to_seek_when_size_is_none(self) -> None: """Falls back to seek/tell when file.size is None.""" - content = b"x" * 2048 - file = make_upload_file(content, size=None) - result = await calculate_file_size(file) - assert result == 2 # 2048 / 1024 = 2 + file = make_upload_file(b"x" * 2048, size=None) + assert calculate_file_size(file) == 2 # 2048 / 1024 = 2 - @pytest.mark.anyio - async def test_falls_back_to_seek_when_size_is_zero(self) -> None: + def test_falls_back_to_seek_when_size_is_zero(self) -> None: """Falls back to seek/tell when file.size is 0 (falsy).""" - content = b"x" * 3072 - file = make_upload_file(content, size=0) - result = await calculate_file_size(file) - assert result == 3 # 3072 / 1024 = 3 + file = make_upload_file(b"x" * 3072, size=0) + assert calculate_file_size(file) == 3 # 3072 / 1024 = 3 - @pytest.mark.anyio - async def test_resets_file_pointer_after_seek(self) -> None: + def test_resets_file_pointer_after_seek(self) -> None: """File pointer is back at position 0 after size calculation.""" - content = b"hello world" - file = make_upload_file(content, size=None) - await calculate_file_size(file) + file = make_upload_file(b"hello 
world", size=None) + calculate_file_size(file) assert file.file.tell() == 0 - @pytest.mark.anyio - async def test_seek_with_fractional_kb(self) -> None: + def test_seek_with_fractional_kb(self) -> None: """Rounds correctly when content size is not a multiple of 1024.""" - content = b"x" * 1600 # ~1.56 KB → rounds to 2 - file = make_upload_file(content, size=None) - result = await calculate_file_size(file) - assert result == 2 + file = make_upload_file(b"x" * 1600, size=None) # ~1.56 KB → rounds to 2 + assert calculate_file_size(file) == 2 - @pytest.mark.anyio - async def test_empty_file_via_seek(self) -> None: + def test_empty_file_via_seek(self) -> None: """Returns 0 for an empty file when size is None.""" file = make_upload_file(b"", size=None) - result = await calculate_file_size(file) - assert result == 0 + assert calculate_file_size(file) == 0 From f809ce86041709e02ea5cf17cc0352188cd1a116 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 9 Apr 2026 13:09:09 +0530 Subject: [PATCH 15/20] adding max time limit test case --- .../documents/test_route_document_upload.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/api/routes/documents/test_route_document_upload.py b/backend/app/tests/api/routes/documents/test_route_document_upload.py index 6f16b52b1..f5ccca242 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_upload.py +++ b/backend/app/tests/api/routes/documents/test_route_document_upload.py @@ -4,7 +4,7 @@ from pathlib import Path from tempfile import NamedTemporaryFile from urllib.parse import urlparse -from unittest.mock import patch +from unittest.mock import patch, MagicMock import pytest from moto import mock_aws @@ -301,6 +301,50 @@ def test_transformation_job_created_in_database( # Check that start_job was called with the right arguments assert "transformer_name" in kwargs or len(args) >= 4 + def test_upload_file_within_size_limit( + self, + db: Session, + route: Route, + scratch: Path, + uploader: WebUploader, + ) -> None: + """Test that a file within the size limit uploads successfully.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Mock calculate_file_size to return a value just under the 25 MB limit (in KB) + with patch( + "app.api.routes.documents.calculate_file_size", + return_value=25 * 1024 - 1, # 25 MB - 1 KB + ): + response = uploader.put(route, scratch) + + assert response.status_code == 200 + + def test_upload_file_exceeds_size_limit( + self, + db: Session, + route: Route, + scratch: Path, + uploader: WebUploader, + ) -> None: + """Test that a file exceeding 25 MB returns a 413 error.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Mock calculate_file_size to return a value over the 25 MB limit (in KB) + with patch( + "app.api.routes.documents.calculate_file_size", + return_value=25 * 1024 + 1, # 25 MB + 1 KB + ): + response = uploader.put(route, scratch) + + assert response.status_code == 413 + print("response =", response.json()) + error_detail = response.json()["error"] + assert "exceeds the maximum allowed size" in error_detail + assert "25" in error_detail + def test_upload_response_structure_without_transformation( self, db: Session, From 5e259287b714715e4a779d9e20426c9704451716 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 10 Apr 2026 16:18:01 +0530 Subject: [PATCH 16/20] fixng migration issue --- ...=> 051_add_columns_to_collection_job_and_documents.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename 
backend/app/alembic/versions/{050_add_columns_to_collection_job_and_documents.py => 051_add_columns_to_collection_job_and_documents.py} (95%) diff --git a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py similarity index 95% rename from backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py rename to backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py index ddbb77516..86b0be3b4 100644 --- a/backend/app/alembic/versions/050_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py @@ -1,7 +1,7 @@ """add columns to collection job and documents table -Revision ID: 050 -Revises: 049 +Revision ID: 051 +Revises: 050 Create Date: 2026-03-25 10:09:47.318575 """ @@ -10,8 +10,8 @@ # revision identifiers, used by Alembic. -revision = "050" -down_revision = "049" +revision = "051" +down_revision = "050" branch_labels = None depends_on = None From aa19cd595d554a18761599ada1a7298d171486a9 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 10 Apr 2026 16:56:44 +0530 Subject: [PATCH 17/20] small error --- backend/app/services/collections/providers/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/collections/providers/openai.py b/backend/app/services/collections/providers/openai.py index d73db2de1..f52e83394 100644 --- a/backend/app/services/collections/providers/openai.py +++ b/backend/app/services/collections/providers/openai.py @@ -6,7 +6,7 @@ from app.services.collections.providers import BaseProvider from app.core.cloud.storage import CloudStorage from app.crud.rag import OpenAIVectorStoreCrud, OpenAIAssistantCrud -from app.services.collections.helpers import get_service_name +from app.services.collections.helpers import get_service_name, batch_documents from app.models import CreationRequest, Collection, Document From 69f5c00bc673de4596df3e5ffab15d76bdca8ec0 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 10 Apr 2026 18:47:02 +0530 Subject: [PATCH 18/20] fixing test cases --- .../collections/providers/test_openai_provider.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/app/tests/services/collections/providers/test_openai_provider.py b/backend/app/tests/services/collections/providers/test_openai_provider.py index 68b2be1bd..b21577d49 100644 --- a/backend/app/tests/services/collections/providers/test_openai_provider.py +++ b/backend/app/tests/services/collections/providers/test_openai_provider.py @@ -24,7 +24,10 @@ def test_create_openai_vector_store_only() -> None: ) storage = MagicMock() - docs_batches = [["doc1"], ["doc2"]] + documents = [ + SimpleNamespace(file_size_kb=10), + SimpleNamespace(file_size_kb=20), + ] vector_store_id = generate_openai_id("vs_") with patch( @@ -37,7 +40,7 @@ def test_create_openai_vector_store_only() -> None: collection = provider.create( collection_request, storage, - docs_batches, + documents, ) assert isinstance(collection, Collection) @@ -57,7 +60,7 @@ def test_create_openai_with_assistant() -> None: ) storage = MagicMock() - docs_batches = [["doc1"]] + documents = [SimpleNamespace(file_size_kb=10)] vector_store_id = generate_openai_id("vs_") assistant_id = generate_openai_id("asst_") @@ -76,7 +79,7 @@ def test_create_openai_with_assistant() -> None: collection = provider.create( collection_request, storage, - docs_batches, + documents, 
) assert collection.llm_service_id == assistant_id @@ -140,5 +143,5 @@ def test_create_propagates_exception() -> None: provider.create( collection_request, MagicMock(), - [["doc1"]], + [SimpleNamespace(file_size_kb=10)], ) From a4087c8fd521585178f54c1cf0065ee0b7c03b5d Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 16 Apr 2026 14:11:10 +0530 Subject: [PATCH 19/20] fix alembic migration --- ...=> 052_add_columns_to_collection_job_and_documents.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{051_add_columns_to_collection_job_and_documents.py => 052_add_columns_to_collection_job_and_documents.py} (95%) diff --git a/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py similarity index 95% rename from backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py rename to backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py index 86b0be3b4..f6bf70dc3 100644 --- a/backend/app/alembic/versions/051_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py @@ -1,7 +1,7 @@ """add columns to collection job and documents table -Revision ID: 051 -Revises: 050 +Revision ID: 052 +Revises: 051 Create Date: 2026-03-25 10:09:47.318575 """ @@ -10,8 +10,8 @@ # revision identifiers, used by Alembic. -revision = "051" -down_revision = "050" +revision = "052" +down_revision = "051" branch_labels = None depends_on = None From c65616af62e3c2da351cda4980ec6b86960d4857 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 16 Apr 2026 14:14:03 +0530 Subject: [PATCH 20/20] fix alembic migration --- ...=> 053_add_columns_to_collection_job_and_documents.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{052_add_columns_to_collection_job_and_documents.py => 053_add_columns_to_collection_job_and_documents.py} (95%) diff --git a/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py b/backend/app/alembic/versions/053_add_columns_to_collection_job_and_documents.py similarity index 95% rename from backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py rename to backend/app/alembic/versions/053_add_columns_to_collection_job_and_documents.py index f6bf70dc3..de956f5d6 100644 --- a/backend/app/alembic/versions/052_add_columns_to_collection_job_and_documents.py +++ b/backend/app/alembic/versions/053_add_columns_to_collection_job_and_documents.py @@ -1,7 +1,7 @@ """add columns to collection job and documents table -Revision ID: 052 -Revises: 051 +Revision ID: 053 +Revises: 052 Create Date: 2026-03-25 10:09:47.318575 """ @@ -10,8 +10,8 @@ # revision identifiers, used by Alembic. -revision = "052" -down_revision = "051" +revision = "053" +down_revision = "052" branch_labels = None depends_on = None