From 27dbfa7c197127dc9fedb8da61e25d498d71563a Mon Sep 17 00:00:00 2001 From: nishika26 Date: Mon, 17 Nov 2025 09:58:31 +0530 Subject: [PATCH 01/15] commiting everything all at once --- ...9a_alter_doc_transform_table_for_celery.py | 30 +++ .../app/api/routes/doc_transformation_job.py | 88 ++++-- backend/app/api/routes/documents.py | 46 ++-- backend/app/api/routes/fine_tuning.py | 2 +- backend/app/core/doctransform/service.py | 146 ---------- backend/app/crud/__init__.py | 4 +- backend/app/crud/document/__init__.py | 0 .../{ => document}/doc_transformation_job.py | 41 +-- backend/app/crud/{ => document}/document.py | 1 - backend/app/models/__init__.py | 6 +- backend/app/models/doc_transformation_job.py | 29 +- backend/app/models/document.py | 26 +- backend/app/services/collections/helpers.py | 2 +- backend/app/services/doctransform/job.py | 251 ++++++++++++++++++ .../doctransform/registry.py | 4 +- .../doctransform/transformer.py | 0 .../doctransform/zerox_transformer.py | 2 +- .../documents/test_route_document_info.py | 2 +- .../documents/test_route_document_list.py | 2 +- .../test_route_document_permanent_remove.py | 2 +- .../documents/test_route_document_remove.py | 2 +- .../documents/test_route_document_upload.py | 8 +- .../api/routes/test_doc_transformation_job.py | 2 +- .../core/doctransformer/test_registry.py | 3 +- .../doctransformer/test_service/conftest.py | 6 +- .../test_service/test_execute_job.py | 4 +- .../test_service/test_integration.py | 2 +- .../test_service/test_start_job.py | 4 +- .../core/doctransformer/test_service/utils.py | 2 +- .../tests/crud/test_doc_transformation_job.py | 3 +- 30 files changed, 481 insertions(+), 239 deletions(-) create mode 100644 backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py delete mode 100644 backend/app/core/doctransform/service.py create mode 100644 backend/app/crud/document/__init__.py rename backend/app/crud/{ => document}/doc_transformation_job.py (71%) rename backend/app/crud/{ => document}/document.py (99%) create mode 100644 backend/app/services/doctransform/job.py rename backend/app/{core => services}/doctransform/registry.py (96%) rename backend/app/{core => services}/doctransform/transformer.py (100%) rename backend/app/{core => services}/doctransform/zerox_transformer.py (97%) diff --git a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py new file mode 100644 index 000000000..3d9fe25cb --- /dev/null +++ b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py @@ -0,0 +1,30 @@ +"""alter doc transform table for celery + +Revision ID: eed36ae3c79a +Revises: 6fe772038a5a +Create Date: 2025-11-12 20:08:39.774862 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = 'eed36ae3c79a' +down_revision = '6fe772038a5a' +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column('doc_transformation_job', sa.Column('task_id', sqlmodel.sql.sqltypes.AutoString(), nullable=True)) + op.add_column('doc_transformation_job', sa.Column('trace_id', sqlmodel.sql.sqltypes.AutoString(), nullable=True)) + op.alter_column("doc_transformation_job", "created_at", new_column_name="inserted_at") + + +def downgrade(): + op.alter_column("doc_transformation_job", "inserted_at", new_column_name="created_at") + op.drop_column('doc_transformation_job', 'trace_id') + op.drop_column('doc_transformation_job', 'task_id') + \ No newline at end of file diff --git a/backend/app/api/routes/doc_transformation_job.py b/backend/app/api/routes/doc_transformation_job.py index fa40769b5..f8158664e 100644 --- a/backend/app/api/routes/doc_transformation_job.py +++ b/backend/app/api/routes/doc_transformation_job.py @@ -1,34 +1,56 @@ from uuid import UUID +import logging -from fastapi import APIRouter, HTTPException, Query, Path as FastPath +from fastapi import APIRouter, HTTPException, Query, Path from app.api.deps import CurrentUserOrgProject, SessionDep -from app.crud.doc_transformation_job import DocTransformationJobCrud -from app.models import DocTransformationJob, DocTransformationJobs +from app.crud import DocTransformationJobCrud, DocumentCrud +from app.models import DocTransformationJobPublic, DocTransformationJobsPublic, TransformedDocumentPublic from app.utils import APIResponse +from app.core.cloud import get_cloud_storage -router = APIRouter(prefix="/documents/transformations", tags=["doc_transformation_job"]) + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/documents", tags=["doc_transformation_job"]) @router.get( - "/{job_id}", + "/transformation/{job_id}", description="Get the status and details of a document transformation job.", - response_model=APIResponse[DocTransformationJob], + response_model=APIResponse[DocTransformationJobPublic], ) def get_transformation_job( session: SessionDep, current_user: CurrentUserOrgProject, - job_id: UUID = FastPath(description="Transformation job ID"), + job_id: UUID = Path(..., description="Transformation job ID"), + include_url: bool = Query(False, description="Include a signed URL for the transformed document"), ): - crud = DocTransformationJobCrud(session, current_user.project_id) - job = crud.read_one(job_id) - return APIResponse.success_response(job) + job_crud = DocTransformationJobCrud(session, current_user.project_id) + doc_crud = DocumentCrud(session, current_user.project_id) + + job = job_crud.read_one(job_id) + + transformed_doc_schema = None + if getattr(job, "transformed_document_id", None): + document = doc_crud.read_one(job.transformed_document_id) + transformed_doc_schema = TransformedDocumentPublic.model_validate( + document, from_attributes=True + ) + + if include_url: + storage = get_cloud_storage(session=session, project_id=current_user.project_id) + transformed_doc_schema.signed_url= storage.get_signed_url(document.object_store_url) + + job_schema = DocTransformationJobPublic.model_validate(job, from_attributes=True) + job_schema = job_schema.model_copy(update={"transformed_document": transformed_doc_schema}) + + return APIResponse.success_response(job_schema) @router.get( - "/", + "/transformation/", description="Get the status and details of multiple document transformation jobs by IDs.", - response_model=APIResponse[DocTransformationJobs], + response_model=APIResponse[DocTransformationJobsPublic], ) def get_multiple_transformation_jobs( session: SessionDep, @@ -36,10 +58,44 @@ def get_multiple_transformation_jobs( job_ids: list[UUID] = Query( description="List of transformation job IDs", min=1, max_length=100 ), + include_url: bool = Query(False, description="Include a signed URL for each transformed document"), ): - crud = DocTransformationJobCrud(session, project_id=current_user.project_id) - jobs = crud.read_each(set(job_ids)) - jobs_not_found = set(job_ids) - {job.id for job in jobs} + job_crud = DocTransformationJobCrud(session, project_id=current_user.project_id) + doc_crud = DocumentCrud(session, project_id=current_user.project_id) + + jobs = job_crud.read_each(set(job_ids)) + jobs_found_ids = {job.id for job in jobs} + jobs_not_found = set(job_ids) - jobs_found_ids + + storage = None + if include_url: + storage = get_cloud_storage(session=session, project_id=current_user.project_id) + + job_schemas: list[DocTransformationJobPublic] = [] + + for job in jobs: + transformed_doc_schema = None + + if getattr(job, "transformed_document_id", None): + document = doc_crud.read_one(job.transformed_document_id) + transformed_doc_schema = TransformedDocumentPublic.model_validate( + document, from_attributes=True + ) + + if include_url and storage: + transformed_doc_schema.signed_url = storage.get_signed_url( + document.object_store_url + ) + + job_schema = DocTransformationJobPublic.model_validate(job, from_attributes=True) + job_schema = job_schema.model_copy( + update={"transformed_document": transformed_doc_schema} + ) + job_schemas.append(job_schema) + return APIResponse.success_response( - DocTransformationJobs(jobs=jobs, jobs_not_found=list(jobs_not_found)) + DocTransformationJobsPublic( + jobs=job_schemas, + jobs_not_found=list(jobs_not_found), + ) ) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index ed5a431ce..009bf7dff 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -1,5 +1,6 @@ import logging from pathlib import Path +from typing import Union from uuid import UUID, uuid4 from fastapi import ( @@ -15,19 +16,21 @@ from app.api.deps import CurrentUserOrgProject, SessionDep from app.core.cloud import get_cloud_storage -from app.core.doctransform import service as transformation_service -from app.core.doctransform.registry import ( +from app.services.doctransform import job as transformation_job +from app.services.doctransform.registry import ( get_available_transformers, get_file_format, is_transformation_supported, resolve_transformer, ) -from app.crud import CollectionCrud, DocumentCrud +from app.crud import CollectionCrud, DocumentCrud, DocTransformationJobCrud from app.crud.rag import OpenAIAssistantCrud, OpenAIVectorStoreCrud from app.models import ( Document, DocumentPublic, + TransformedDocumentPublic, DocumentUploadResponse, + DocTransformJobCreate, Message, TransformationJobInfo, ) @@ -40,7 +43,7 @@ @router.get( - "/list", + "/", description=load_description("documents/list.md"), response_model=APIResponse[list[DocumentPublic]], ) @@ -56,14 +59,13 @@ def list_docs( @router.post( - "/upload", + "/", description=load_description("documents/upload.md"), response_model=APIResponse[DocumentUploadResponse], ) async def upload_doc( session: SessionDep, current_user: CurrentUserOrgProject, - background_tasks: BackgroundTasks, src: UploadFile = File(...), target_format: str | None = Form( @@ -74,6 +76,8 @@ async def upload_doc( | None = Form( None, description="Name of the transformer to apply when converting. " ), + callback_url: str + | None = Form("URL to call to report endpoint status") ): # Determine source file format try: @@ -120,21 +124,22 @@ async def upload_doc( job_info: TransformationJobInfo | None = None if target_format and actual_transformer: - job_id = transformation_service.start_job( + job_crud = DocTransformationJobCrud(session, current_user.project_id) + job = job_crud.create(DocTransformJobCreate(source_document_id=source_document.id)) + + transformation_job.start_job( db=session, + job_id=job.id, current_user=current_user, - source_document_id=source_document.id, transformer_name=actual_transformer, target_format=target_format, - background_tasks=background_tasks, + callback_url = callback_url ) job_info = TransformationJobInfo( message=f"Document accepted for transformation from {source_format} to {target_format}.", - job_id=str(job_id), - source_format=source_format, - target_format=target_format, + job_id=str(job.id), transformer=actual_transformer, - status_check_url=f"/documents/transformations/{job_id}", + status_check_url=f"/documents/transformations/{job.id}", ) document_schema = DocumentPublic.model_validate( @@ -151,7 +156,7 @@ async def upload_doc( @router.delete( - "/remove/{doc_id}", + "/{doc_id}", description=load_description("documents/delete.md"), response_model=APIResponse[Message], ) @@ -182,7 +187,7 @@ def remove_doc( @router.delete( - "/remove/{doc_id}/permanent", + "/{doc_id}/permanent", description=load_description("documents/permanent_delete.md"), response_model=APIResponse[Message], ) @@ -216,9 +221,9 @@ def permanent_delete_doc( @router.get( - "/info/{doc_id}", + "/{doc_id}", description=load_description("documents/info.md"), - response_model=APIResponse[DocumentPublic], + response_model=APIResponse[Union[DocumentPublic, TransformedDocumentPublic]], ) def doc_info( session: SessionDep, @@ -231,7 +236,12 @@ def doc_info( crud = DocumentCrud(session, current_user.project_id) document = crud.read_one(doc_id) - doc_schema = DocumentPublic.model_validate(document, from_attributes=True) + if document.source_document_id is None: + doc_schema = DocumentPublic.model_validate(document, from_attributes=True) + + else: + doc_schema = TransformedDocumentPublic.model_validate(document, from_attributes=True) + if include_url: storage = get_cloud_storage(session=session, project_id=current_user.project_id) doc_schema.signed_url = storage.get_signed_url(document.object_store_url) diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py index 203d6907a..33e14cbe7 100644 --- a/backend/app/api/routes/fine_tuning.py +++ b/backend/app/api/routes/fine_tuning.py @@ -18,7 +18,7 @@ ModelEvaluationStatus, ) from app.core.cloud import get_cloud_storage -from app.crud.document import DocumentCrud +from app.crud.document.document import DocumentCrud from app.utils import ( get_openai_client, handle_openai_error, diff --git a/backend/app/core/doctransform/service.py b/backend/app/core/doctransform/service.py deleted file mode 100644 index e119492f8..000000000 --- a/backend/app/core/doctransform/service.py +++ /dev/null @@ -1,146 +0,0 @@ -import tempfile -import shutil -import logging -from pathlib import Path -from uuid import uuid4, UUID - -from fastapi import BackgroundTasks, UploadFile -from tenacity import retry, wait_exponential, stop_after_attempt -from sqlmodel import Session -from starlette.datastructures import Headers - -from app.crud.doc_transformation_job import DocTransformationJobCrud -from app.crud.document import DocumentCrud -from app.models.document import Document -from app.models.doc_transformation_job import TransformationStatus -from app.models import User -from app.core.cloud import get_cloud_storage -from app.api.deps import CurrentUserOrgProject -from app.core.doctransform.registry import convert_document, FORMAT_TO_EXTENSION -from app.core.db import engine - -logger = logging.getLogger(__name__) - - -def start_job( - db: Session, - current_user: CurrentUserOrgProject, - source_document_id: UUID, - transformer_name: str, - target_format: str, - background_tasks: BackgroundTasks, -) -> UUID: - job_crud = DocTransformationJobCrud(session=db, project_id=current_user.project_id) - job = job_crud.create(source_document_id=source_document_id) - - # Extract the project ID before passing to background task - project_id = current_user.project_id - background_tasks.add_task( - execute_job, project_id, job.id, transformer_name, target_format - ) - logger.info( - f"[start_job] Job scheduled for document transformation | id: {job.id}, project_id: {project_id}" - ) - return job.id - - -@retry(wait=wait_exponential(multiplier=5, min=5, max=10), stop=stop_after_attempt(3)) -def execute_job( - project_id: int, - job_id: UUID, - transformer_name: str, - target_format: str, -): - tmp_dir: Path | None = None - try: - logger.info( - f"[execute_job started] Transformation Job started | job_id={job_id} | transformer_name={transformer_name} | target_format={target_format} | project_id={project_id}" - ) - - # Update job status to PROCESSING and fetch source document info - with Session(engine) as db: - job_crud = DocTransformationJobCrud(session=db, project_id=project_id) - job = job_crud.update_status(job_id, TransformationStatus.PROCESSING) - - doc_crud = DocumentCrud(session=db, project_id=project_id) - - source_doc = doc_crud.read_one(job.source_document_id) - - source_doc_id = source_doc.id - source_doc_fname = source_doc.fname - source_doc_object_store_url = source_doc.object_store_url - - storage = get_cloud_storage(session=db, project_id=project_id) - - # Download and transform document - body = storage.stream(source_doc_object_store_url) - tmp_dir = Path(tempfile.mkdtemp()) - tmp_in = tmp_dir / f"{source_doc_id}" - with open(tmp_in, "wb") as f: - shutil.copyfileobj(body, f) - - # prepare output file path - fname_no_ext = Path(source_doc_fname).stem - target_extension = FORMAT_TO_EXTENSION.get(target_format, f".{target_format}") - transformed_doc_id = uuid4() - tmp_out = tmp_dir / f"{fname_no_ext}{target_extension}" - - # transform document - now returns the output file path - convert_document(tmp_in, tmp_out, transformer_name) - - # Determine content type based on target format - content_type_map = {"markdown": "text/markdown; charset=utf-8"} - content_type = content_type_map.get(target_format, "text/plain") - - # upload transformed file and create document record - with open(tmp_out, "rb") as fobj: - file_upload = UploadFile( - filename=tmp_out.name, - file=fobj, - headers=Headers({"content-type": content_type}), - ) - dest = storage.put(file_upload, Path(str(transformed_doc_id))) - - # create new Document record - with Session(engine) as db: - new_doc = Document( - id=transformed_doc_id, - project_id=project_id, - fname=tmp_out.name, - object_store_url=str(dest), - source_document_id=source_doc_id, - ) - created = DocumentCrud(db, project_id).update(new_doc) - - job_crud = DocTransformationJobCrud(session=db, project_id=project_id) - job_crud.update_status( - job_id, - TransformationStatus.COMPLETED, - transformed_document_id=created.id, - ) - - logger.info( - f"[execute_job] Doc Transformation job completed | job_id={job_id} | transformed_doc_id={created.id} | project_id={project_id}" - ) - - except Exception as e: - logger.error( - f"Transformation job failed | job_id={job_id} | error={e}", exc_info=True - ) - try: - with Session(engine) as db: - job_crud = DocTransformationJobCrud(session=db, project_id=project_id) - job_crud.update_status( - job_id, TransformationStatus.FAILED, error_message=str(e) - ) - logger.info( - f"[execute_job] Doc Transformation job failed | job_id={job_id} | error={e}" - ) - except Exception as db_error: - logger.error( - f"Failed to update job status to FAILED | job_id={job_id} | db_error={db_error}" - ) - raise - finally: - if tmp_dir and tmp_dir.exists(): - shutil.rmtree(tmp_dir) diff --git a/backend/app/crud/__init__.py b/backend/app/crud/__init__.py index be5d36a40..658193106 100644 --- a/backend/app/crud/__init__.py +++ b/backend/app/crud/__init__.py @@ -6,9 +6,9 @@ ) from .collection.collection import CollectionCrud from .collection.collection_job import CollectionJobCrud -from .document import DocumentCrud +from .document.document import DocumentCrud from .document_collection import DocumentCollectionCrud -from .doc_transformation_job import DocTransformationJobCrud +from .document.doc_transformation_job import DocTransformationJobCrud from .jobs import JobCrud from .organization import ( diff --git a/backend/app/crud/document/__init__.py b/backend/app/crud/document/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/crud/doc_transformation_job.py b/backend/app/crud/document/doc_transformation_job.py similarity index 71% rename from backend/app/crud/doc_transformation_job.py rename to backend/app/crud/document/doc_transformation_job.py index 7edf5f8dd..e5c316e6c 100644 --- a/backend/app/crud/doc_transformation_job.py +++ b/backend/app/crud/document/doc_transformation_job.py @@ -1,9 +1,16 @@ import logging from uuid import UUID from typing import List, Optional -from sqlmodel import Session, select, and_, join + +from sqlmodel import Session, select, and_ + from app.crud import DocumentCrud -from app.models import DocTransformationJob, TransformationStatus +from app.models import ( + DocTransformationJob, + TransformationStatus, + DocTransformJobCreate, + DocTransformJobUpdate +) from app.models.document import Document from app.core.util import now from app.core.exception_handlers import HTTPException @@ -16,16 +23,14 @@ def __init__(self, session: Session, project_id: int): self.session = session self.project_id = project_id - def create(self, source_document_id: UUID) -> DocTransformationJob: - # Ensure the source document exists and is not deleted - DocumentCrud(self.session, self.project_id).read_one(source_document_id) + def create(self, payload: DocTransformJobCreate) -> DocTransformationJob: - job = DocTransformationJob(source_document_id=source_document_id) + job = DocTransformationJob(**payload.model_dump()) self.session.add(job) self.session.commit() self.session.refresh(job) logger.info( - f"[DocTransformationJobCrud.create] Created new transformation job | id: {job.id}, source_document_id: {source_document_id}" + f"[DocTransformationJobCrud.create] Created new transformation job | id: {job.id}, source_document_id: {job.source_document_id}" ) return job @@ -66,26 +71,26 @@ def read_each(self, job_ids: set[UUID]) -> list[DocTransformationJob]: jobs = self.session.exec(statement).all() return jobs - def update_status( + def update( self, job_id: UUID, - status: TransformationStatus, - *, - error_message: Optional[str] = None, - transformed_document_id: Optional[UUID] = None, + patch: DocTransformJobUpdate, ) -> DocTransformationJob: + """Update an existing doc transformation job and return the updated row.""" job = self.read_one(job_id) - job.status = status + + # Only apply fields that were explicitly set and not None + changes = patch.model_dump(exclude_unset=True, exclude_none=True) + for field, value in changes.items(): + setattr(job, field, value) + job.updated_at = now() - if error_message is not None: - job.error_message = error_message - if transformed_document_id is not None: - job.transformed_document_id = transformed_document_id self.session.add(job) self.session.commit() self.session.refresh(job) + logger.info( - f"[DocTransformationJobCrud.update_status] Updated job status | id: {job.id}, status: {status}" + f"[DocTransformationJobCrud.update_status] Updated job status | id: {job.id}" ) return job diff --git a/backend/app/crud/document.py b/backend/app/crud/document/document.py similarity index 99% rename from backend/app/crud/document.py rename to backend/app/crud/document/document.py index 8703d56a8..1443e385a 100644 --- a/backend/app/crud/document.py +++ b/backend/app/crud/document/document.py @@ -1,6 +1,5 @@ import logging from uuid import UUID -from typing import Optional, List from sqlmodel import Session, select, and_ diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index b2f294025..afa803aaa 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -32,13 +32,17 @@ from .document import ( Document, DocumentPublic, + DocTransformationJobPublic, + DocTransformationJobsPublic, + TransformedDocumentPublic, DocumentUploadResponse, TransformationJobInfo, ) from .doc_transformation_job import ( DocTransformationJob, - DocTransformationJobs, TransformationStatus, + DocTransformJobCreate, + DocTransformJobUpdate, ) from .document_collection import DocumentCollection diff --git a/backend/app/models/doc_transformation_job.py b/backend/app/models/doc_transformation_job.py index b968d8837..94643fe6c 100644 --- a/backend/app/models/doc_transformation_job.py +++ b/backend/app/models/doc_transformation_job.py @@ -2,10 +2,13 @@ from uuid import UUID, uuid4 from typing import Optional from datetime import datetime + from sqlmodel import SQLModel, Field + from app.core.util import now + class TransformationStatus(str, enum.Enum): PENDING = "PENDING" PROCESSING = "PROCESSING" @@ -18,15 +21,29 @@ class DocTransformationJob(SQLModel, table=True): id: UUID = Field(default_factory=uuid4, primary_key=True) source_document_id: UUID = Field(foreign_key="document.id") - transformed_document_id: Optional[UUID] = Field( + transformed_document_id: UUID | None = Field( default=None, foreign_key="document.id" ) status: TransformationStatus = Field(default=TransformationStatus.PENDING) - error_message: Optional[str] = Field(default=None) - created_at: datetime = Field(default_factory=now) + task_id: str = Field(nullable=True) + trace_id: str | None = Field( + default=None, description="Tracing ID for correlating logs and traces." + ) + error_message: str | None = Field(default=None) + inserted_at: datetime = Field(default_factory=now) updated_at: datetime = Field(default_factory=now) -class DocTransformationJobs(SQLModel): - jobs: list[DocTransformationJob] - jobs_not_found: list[UUID] +class DocTransformJobCreate(SQLModel): + source_document_id: UUID + + +class DocTransformJobUpdate(SQLModel): + transformed_document_id: UUID | None = None + task_id: str | None = None + status: TransformationStatus | None = None + error_message: str | None = None + trace_id: str | None = None + + + diff --git a/backend/app/models/document.py b/backend/app/models/document.py index 16d77a174..858a7940c 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -5,6 +5,7 @@ from sqlmodel import Field, SQLModel from app.core.util import now +from app.models.doc_transformation_job import TransformationStatus class DocumentBase(SQLModel): @@ -51,20 +52,20 @@ class DocumentPublic(DocumentBase): updated_at: datetime = Field( description="The timestamp when the document was last updated" ) - source_document_id: UUID | None = Field( - default=None, - description="The ID of the source document if this document is a transformation", - ) signed_url: str | None = Field( default=None, description="A signed URL for accessing the document" ) +class TransformedDocumentPublic(DocumentPublic): + source_document_id: UUID | None = Field( + default=None, + description="The ID of the source document if this document is a transformation", + ) class TransformationJobInfo(SQLModel): message: str job_id: UUID = Field(description="The unique identifier of the transformation job") - source_format: str = Field(description="The format of the source document") - target_format: str = Field(description="The format of the target document") + status: TransformationStatus transformer: str = Field(description="The name of the transformer used") status_check_url: str = Field( description="The URL to check the status of the transformation job" @@ -74,3 +75,16 @@ class TransformationJobInfo(SQLModel): class DocumentUploadResponse(DocumentPublic): signed_url: str = Field(description="A signed URL for accessing the document") transformation_job: TransformationJobInfo | None = None + + +class DocTransformationJobPublic(SQLModel): + id: UUID + source_document_id: UUID + status: TransformationStatus + transformed_document: TransformedDocumentPublic | None = None + error_message: str | None = None + + +class DocTransformationJobsPublic(SQLModel): + jobs: list[DocTransformationJobPublic] + jobs_not_found: list[UUID] diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 5ca1c6759..795b04cd6 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -8,7 +8,7 @@ from sqlmodel import select from openai import OpenAIError -from app.crud.document import DocumentCrud +from app.crud.document.document import DocumentCrud from app.models import DocumentCollection, Collection diff --git a/backend/app/services/doctransform/job.py b/backend/app/services/doctransform/job.py new file mode 100644 index 000000000..381ac9709 --- /dev/null +++ b/backend/app/services/doctransform/job.py @@ -0,0 +1,251 @@ +import tempfile +import shutil +import logging +from pathlib import Path +from uuid import uuid4, UUID + +from fastapi import UploadFile +from tenacity import retry, wait_exponential, stop_after_attempt +from sqlmodel import Session +from asgi_correlation_id import correlation_id +from starlette.datastructures import Headers + +from app.crud.document.doc_transformation_job import DocTransformationJobCrud +from app.crud.document.document import DocumentCrud +from app.models.document import Document +from app.models import ( + DocTransformJobUpdate, + TransformationStatus, + DocTransformationJobPublic, + TransformedDocumentPublic, + DocTransformationJob ) +from app.core.cloud import get_cloud_storage +from app.api.deps import CurrentUserOrgProject +from app.celery.utils import start_low_priority_job +from app.utils import send_callback, APIResponse +from app.services.doctransform.registry import convert_document, FORMAT_TO_EXTENSION +from app.core.db import engine + +logger = logging.getLogger(__name__) + + +def start_job( + db: Session, + current_user: CurrentUserOrgProject, + job_id: UUID, + transformer_name: str, + target_format: str, + callback_url: str +) -> UUID: + + trace_id = correlation_id.get() or "N/A" + job_crud = DocTransformationJobCrud(db, project_id=current_user.project_id) + job_crud.update(job_id, DocTransformJobUpdate(trace_id=trace_id)) + job=job_crud.read_one(job_id) + + project_id = current_user.project_id + + task_id = start_low_priority_job( + function_path="app.services.doctransform.job.execute_job", + project_id=project_id, + job_id=str(job.id), + source_document_id=str(job.source_document_id), + trace_id=trace_id, + transformer_name=transformer_name, + target_format=target_format, + callback_url = callback_url + ) + + logger.info( + f"[start_job] Job scheduled for document transformation | id: {job.id}, project_id: {project_id}, task_id: {task_id}" + ) + return job.id + + +def build_success_payload( + job: DocTransformationJob, + transformed_doc: Document, +) -> dict: + """ + { + "success": true, + "data": { job fields + transformed_document (full) }, + "error": null, + "metadata": null + } + """ + transformed_public = TransformedDocumentPublic.model_validate(transformed_doc) + job_public = DocTransformationJobPublic.model_validate( + job, + update={"transformed_document": transformed_public}, + ) + # keep error_message out of the data envelope + return APIResponse.success_response(job_public).model_dump( + mode="json", exclude={"data": {"error_message"}} + ) + + +def build_failure_payload(job: DocTransformationJob, error_message: str) -> dict: + """ + { + "success": false, + "data": { job fields, transformed_document: null }, + "error": "something went wrong", + "metadata": null + } + """ + # ensure transformed_document is explicitly null in the payload + job_public = DocTransformationJobPublic.model_validate( + job, + update={"transformed_document": None}, + ) + return APIResponse.failure_response(error_message, job_public).model_dump( + mode="json", + exclude={"data": {"error_message"}}, + ) + + +@retry(wait=wait_exponential(multiplier=5, min=5, max=10), stop=stop_after_attempt(3)) +def execute_job( + project_id: int, + job_id: str, + source_document_id: str, + transformer_name: str, + target_format: str, + task_id: str, + callback_url: str | None, + task_instance, +): + import time + + start_time = time.time() + tmp_dir: Path | None = None + job_uuid = UUID(job_id) + source_uuid = UUID(source_document_id) + + job_for_payload = None # keep latest job snapshot for payloads + + try: + logger.info( + "[doc_transform.execute_job] started | job_id=%s | transformer=%s | target=%s | project_id=%s", + job_uuid, transformer_name, target_format, project_id + ) + + # --- mark PROCESSING and fetch source + storage --- + with Session(engine) as db: + job_crud = DocTransformationJobCrud(session=db, project_id=project_id) + job_for_payload = job_crud.update( + job_uuid, + DocTransformJobUpdate(status=TransformationStatus.PROCESSING, task_id=task_id), + ) + + doc_crud = DocumentCrud(session=db, project_id=project_id) + source_doc = doc_crud.read_one(source_uuid) + + source_doc_id = source_doc.id + source_doc_fname = source_doc.fname + source_doc_object_store_url = source_doc.object_store_url + + storage = get_cloud_storage(session=db, project_id=project_id) + + # --- download source --- + body = storage.stream(source_doc_object_store_url) + tmp_dir = Path(tempfile.mkdtemp()) + tmp_in = tmp_dir / f"{source_doc_id}" + with open(tmp_in, "wb") as f: + shutil.copyfileobj(body, f) + + # --- transform --- + fname_no_ext = Path(source_doc_fname).stem + target_extension = FORMAT_TO_EXTENSION.get(target_format, f".{target_format}") + transformed_doc_id = uuid4() + tmp_out = tmp_dir / f"{fname_no_ext}{target_extension}" + + convert_document(tmp_in, tmp_out, transformer_name) + + # --- upload transformed file --- + content_type_map = {"markdown": "text/markdown; charset=utf-8"} + content_type = content_type_map.get(target_format, "text/plain") + + with open(tmp_out, "rb") as fobj: + file_upload = UploadFile( + filename=tmp_out.name, + file=fobj, + headers=Headers({"content-type": content_type}), + ) + dest = storage.put(file_upload, Path(str(transformed_doc_id))) + + # --- create Document row, mark job COMPLETE, build + send callback --- + with Session(engine) as db: + new_doc = Document( + id=transformed_doc_id, + project_id=project_id, + fname=tmp_out.name, + object_store_url=str(dest), + source_document_id=source_doc_id, + ) + # NOTE: your code used update(); keep it if your CRUD's update() upserts, else use create() + created = DocumentCrud(db, project_id).update(new_doc) + + job_crud = DocTransformationJobCrud(session=db, project_id=project_id) + job_for_payload = job_crud.update( + job_uuid, + DocTransformJobUpdate( + status=TransformationStatus.COMPLETED, + transformed_document_id=created.id, + ), + ) + + try: + signed_url = getattr(storage, "get_signed_url", None) + if callable(signed_url): + created.signed_url = signed_url(created.object_store_url) + except Exception: + pass + + success_payload = build_success_payload(job_for_payload, created) + + elapsed = time.time() - start_time + logger.info( + "[doc_transform.execute_job] completed | job_id=%s | transformed_doc_id=%s | time=%.2fs", + job_uuid, created.id, elapsed + ) + + if callback_url: + send_callback(callback_url, success_payload) + + except Exception as e: + logger.error( + "[doc_transform.execute_job] FAILED | job_id=%s | error=%s", + job_uuid, e, exc_info=True + ) + + # try to mark the job as FAILED and send failure callback + try: + with Session(engine) as db: + job_crud = DocTransformationJobCrud(session=db, project_id=project_id) + job_for_payload = job_crud.update( + job_uuid, + DocTransformJobUpdate(status=TransformationStatus.FAILED, error_message=str(e)), + ) + except Exception as db_error: + logger.error( + "[doc_transform.execute_job] failed to persist FAILED status | job_id=%s | db_error=%s", + job_uuid, db_error + ) + + if callback_url and job_for_payload: + try: + failure_payload = build_failure_payload(job_for_payload, str(e)) + send_callback(callback_url, failure_payload) + except Exception as cb_error: + logger.error( + "[doc_transform.execute_job] callback failed | job_id=%s | error=%s", + job_uuid, cb_error + ) + + # bubble up for caller/infra + raise + finally: + if tmp_dir and tmp_dir.exists(): + shutil.rmtree(tmp_dir) diff --git a/backend/app/core/doctransform/registry.py b/backend/app/services/doctransform/registry.py similarity index 96% rename from backend/app/core/doctransform/registry.py rename to backend/app/services/doctransform/registry.py index b2ba380f5..29c2fc251 100644 --- a/backend/app/core/doctransform/registry.py +++ b/backend/app/services/doctransform/registry.py @@ -1,7 +1,7 @@ from pathlib import Path -from app.core.doctransform.transformer import Transformer -from app.core.doctransform.zerox_transformer import ZeroxTransformer +from app.services.doctransform.transformer import Transformer +from app.services.doctransform.zerox_transformer import ZeroxTransformer class TransformationError(Exception): diff --git a/backend/app/core/doctransform/transformer.py b/backend/app/services/doctransform/transformer.py similarity index 100% rename from backend/app/core/doctransform/transformer.py rename to backend/app/services/doctransform/transformer.py diff --git a/backend/app/core/doctransform/zerox_transformer.py b/backend/app/services/doctransform/zerox_transformer.py similarity index 97% rename from backend/app/core/doctransform/zerox_transformer.py rename to backend/app/services/doctransform/zerox_transformer.py index a69f31474..321a6ba65 100644 --- a/backend/app/core/doctransform/zerox_transformer.py +++ b/backend/app/services/doctransform/zerox_transformer.py @@ -4,7 +4,7 @@ from pathlib import Path from pyzerox import zerox -from app.core.doctransform.transformer import Transformer +from app.services.doctransform.transformer import Transformer logger = logging.getLogger(__name__) diff --git a/backend/app/tests/api/routes/documents/test_route_document_info.py b/backend/app/tests/api/routes/documents/test_route_document_info.py index 5ed921431..3a49013fd 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_info.py +++ b/backend/app/tests/api/routes/documents/test_route_document_info.py @@ -14,7 +14,7 @@ @pytest.fixture def route(): - return Route("info") + return Route("") class TestDocumentRouteInfo: diff --git a/backend/app/tests/api/routes/documents/test_route_document_list.py b/backend/app/tests/api/routes/documents/test_route_document_list.py index 8b2ec7a73..a580fb2a7 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_list.py +++ b/backend/app/tests/api/routes/documents/test_route_document_list.py @@ -21,7 +21,7 @@ def pushq(self, key, value): @pytest.fixture def route(): - return QueryRoute("list") + return QueryRoute("") class TestDocumentRouteList: diff --git a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py index 179d247d6..57de20ad9 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py +++ b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py @@ -26,7 +26,7 @@ @pytest.fixture def route(): - return Route("remove") + return Route("") @pytest.fixture(scope="class") diff --git a/backend/app/tests/api/routes/documents/test_route_document_remove.py b/backend/app/tests/api/routes/documents/test_route_document_remove.py index 7c1e34769..d1aa37b3d 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_remove.py +++ b/backend/app/tests/api/routes/documents/test_route_document_remove.py @@ -17,7 +17,7 @@ @pytest.fixture def route(): - return Route("remove") + return Route("") class TestDocumentRouteRemove: diff --git a/backend/app/tests/api/routes/documents/test_route_document_upload.py b/backend/app/tests/api/routes/documents/test_route_document_upload.py index 320c1992a..e364c1b44 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_upload.py +++ b/backend/app/tests/api/routes/documents/test_route_document_upload.py @@ -73,7 +73,7 @@ def pdf_scratch(): @pytest.fixture def route(): - return Route("upload") + return Route("") @pytest.fixture @@ -151,7 +151,7 @@ def test_upload_without_transformation( assert "id" in response.data assert "fname" in response.data - @patch("app.core.doctransform.service.start_job") + @patch("app.services.doctransform.job.start_job") def test_upload_with_transformation( self, mock_start_job, @@ -186,7 +186,7 @@ def test_upload_with_transformation( ) assert "message" in transformation_job - @patch("app.core.doctransform.service.start_job") + @patch("app.services.doctransform.job.start_job") def test_upload_with_specific_transformer( self, mock_start_job, @@ -276,7 +276,7 @@ def test_upload_with_unsupported_file_extension( finally: unsupported_file.unlink() - @patch("app.core.doctransform.service.start_job") + @patch("app.services.doctransform.job.start_job") def test_transformation_job_created_in_database( self, mock_start_job, diff --git a/backend/app/tests/api/routes/test_doc_transformation_job.py b/backend/app/tests/api/routes/test_doc_transformation_job.py index 924600588..5f4d1ec6c 100644 --- a/backend/app/tests/api/routes/test_doc_transformation_job.py +++ b/backend/app/tests/api/routes/test_doc_transformation_job.py @@ -2,7 +2,7 @@ from sqlmodel import Session from app.core.config import settings -from app.crud.doc_transformation_job import DocTransformationJobCrud +from app.crud.document.doc_transformation_job import DocTransformationJobCrud from app.models import TransformationStatus from app.tests.utils.document import DocumentStore from app.tests.utils.auth import TestAuthContext diff --git a/backend/app/tests/core/doctransformer/test_registry.py b/backend/app/tests/core/doctransformer/test_registry.py index 7f8303fff..7e79a1b04 100644 --- a/backend/app/tests/core/doctransformer/test_registry.py +++ b/backend/app/tests/core/doctransformer/test_registry.py @@ -1,5 +1,6 @@ import pytest -from app.core.doctransform.registry import ( + +from app.services.doctransform.registry import ( get_file_format, get_supported_transformations, is_transformation_supported, diff --git a/backend/app/tests/core/doctransformer/test_service/conftest.py b/backend/app/tests/core/doctransformer/test_service/conftest.py index 75574be06..685d06083 100644 --- a/backend/app/tests/core/doctransformer/test_service/conftest.py +++ b/backend/app/tests/core/doctransformer/test_service/conftest.py @@ -32,9 +32,9 @@ def aws_credentials() -> None: @pytest.fixture def fast_execute_job() -> Generator[Callable[[int, UUID, str, str], Any], None, None]: """Create a version of execute_job without retry delays for faster testing.""" - from app.core.doctransform import service + from app.services.doctransform import job - original_execute_job = service.execute_job + original_execute_job = job.execute_job @retry( stop=stop_after_attempt(2), wait=wait_fixed(0.01) @@ -47,7 +47,7 @@ def fast_execute_job_func( project_id, job_id, transformer_name, target_format ) - with patch.object(service, "execute_job", fast_execute_job_func): + with patch.object(job, "execute_job", fast_execute_job_func): yield fast_execute_job_func diff --git a/backend/app/tests/core/doctransformer/test_service/test_execute_job.py b/backend/app/tests/core/doctransformer/test_service/test_execute_job.py index 5f80644aa..3d1df685c 100644 --- a/backend/app/tests/core/doctransformer/test_service/test_execute_job.py +++ b/backend/app/tests/core/doctransformer/test_service/test_execute_job.py @@ -11,8 +11,8 @@ from tenacity import RetryError from app.crud import DocTransformationJobCrud, DocumentCrud -from app.core.doctransform.registry import TransformationError -from app.core.doctransform.service import execute_job +from app.services.doctransform.registry import TransformationError +from app.services.doctransform.job import execute_job from app.core.exception_handlers import HTTPException from app.models import Document, Project, TransformationStatus from app.tests.core.doctransformer.test_service.utils import ( diff --git a/backend/app/tests/core/doctransformer/test_service/test_integration.py b/backend/app/tests/core/doctransformer/test_service/test_integration.py index 20df40862..e58be8a46 100644 --- a/backend/app/tests/core/doctransformer/test_service/test_integration.py +++ b/backend/app/tests/core/doctransformer/test_service/test_integration.py @@ -10,7 +10,7 @@ from sqlmodel import Session from app.crud import DocTransformationJobCrud, DocumentCrud -from app.core.doctransform.service import execute_job, start_job +from app.services.doctransform.job import execute_job, start_job from app.models import ( Document, DocTransformationJob, diff --git a/backend/app/tests/core/doctransformer/test_service/test_start_job.py b/backend/app/tests/core/doctransformer/test_service/test_start_job.py index 94a59dd12..d7aa006f9 100644 --- a/backend/app/tests/core/doctransformer/test_service/test_start_job.py +++ b/backend/app/tests/core/doctransformer/test_service/test_start_job.py @@ -8,8 +8,8 @@ from fastapi import BackgroundTasks from sqlmodel import Session -from app.core.doctransform.service import execute_job, start_job -from app.core.doctransform.registry import TRANSFORMERS +from app.services.doctransform.job import execute_job, start_job +from app.services.doctransform.registry import TRANSFORMERS from app.core.exception_handlers import HTTPException from app.models import ( Document, diff --git a/backend/app/tests/core/doctransformer/test_service/utils.py b/backend/app/tests/core/doctransformer/test_service/utils.py index 82451adde..277c5208c 100644 --- a/backend/app/tests/core/doctransformer/test_service/utils.py +++ b/backend/app/tests/core/doctransformer/test_service/utils.py @@ -9,7 +9,7 @@ from app.core.cloud import AmazonCloudStorageClient from app.core.config import settings -from app.core.doctransform.transformer import Transformer +from app.services.doctransform.transformer import Transformer from app.models import Document diff --git a/backend/app/tests/crud/test_doc_transformation_job.py b/backend/app/tests/crud/test_doc_transformation_job.py index 4f5f3ce68..63ffb4fab 100644 --- a/backend/app/tests/crud/test_doc_transformation_job.py +++ b/backend/app/tests/crud/test_doc_transformation_job.py @@ -1,6 +1,7 @@ import pytest from sqlmodel import Session -from app.crud.doc_transformation_job import DocTransformationJobCrud + +from app.crud.document.doc_transformation_job import DocTransformationJobCrud from app.models import TransformationStatus from app.core.exception_handlers import HTTPException from app.tests.utils.document import DocumentStore From 9e92a28a82d013c763e751dee3f4be3d95b646f6 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Mon, 17 Nov 2025 11:13:34 +0530 Subject: [PATCH 02/15] format fixing --- ...9a_alter_doc_transform_table_for_celery.py | 27 +++++--- .../app/api/routes/doc_transformation_job.py | 36 ++++++++--- backend/app/api/routes/documents.py | 17 +++--- .../crud/document/doc_transformation_job.py | 11 ++-- backend/app/models/doc_transformation_job.py | 4 -- backend/app/models/document.py | 6 +- backend/app/services/doctransform/job.py | 61 +++++++++++-------- 7 files changed, 100 insertions(+), 62 deletions(-) diff --git a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py index 3d9fe25cb..94c28798b 100644 --- a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py +++ b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py @@ -11,20 +11,29 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = 'eed36ae3c79a' -down_revision = '6fe772038a5a' +revision = "eed36ae3c79a" +down_revision = "6fe772038a5a" branch_labels = None depends_on = None def upgrade(): - op.add_column('doc_transformation_job', sa.Column('task_id', sqlmodel.sql.sqltypes.AutoString(), nullable=True)) - op.add_column('doc_transformation_job', sa.Column('trace_id', sqlmodel.sql.sqltypes.AutoString(), nullable=True)) - op.alter_column("doc_transformation_job", "created_at", new_column_name="inserted_at") + op.add_column( + "doc_transformation_job", + sa.Column("task_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.add_column( + "doc_transformation_job", + sa.Column("trace_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.alter_column( + "doc_transformation_job", "created_at", new_column_name="inserted_at" + ) def downgrade(): - op.alter_column("doc_transformation_job", "inserted_at", new_column_name="created_at") - op.drop_column('doc_transformation_job', 'trace_id') - op.drop_column('doc_transformation_job', 'task_id') - \ No newline at end of file + op.alter_column( + "doc_transformation_job", "inserted_at", new_column_name="created_at" + ) + op.drop_column("doc_transformation_job", "trace_id") + op.drop_column("doc_transformation_job", "task_id") diff --git a/backend/app/api/routes/doc_transformation_job.py b/backend/app/api/routes/doc_transformation_job.py index f8158664e..6ea514f45 100644 --- a/backend/app/api/routes/doc_transformation_job.py +++ b/backend/app/api/routes/doc_transformation_job.py @@ -1,11 +1,15 @@ from uuid import UUID import logging -from fastapi import APIRouter, HTTPException, Query, Path +from fastapi import APIRouter, HTTPException, Query, Path from app.api.deps import CurrentUserOrgProject, SessionDep from app.crud import DocTransformationJobCrud, DocumentCrud -from app.models import DocTransformationJobPublic, DocTransformationJobsPublic, TransformedDocumentPublic +from app.models import ( + DocTransformationJobPublic, + DocTransformationJobsPublic, + TransformedDocumentPublic, +) from app.utils import APIResponse from app.core.cloud import get_cloud_storage @@ -23,7 +27,9 @@ def get_transformation_job( session: SessionDep, current_user: CurrentUserOrgProject, job_id: UUID = Path(..., description="Transformation job ID"), - include_url: bool = Query(False, description="Include a signed URL for the transformed document"), + include_url: bool = Query( + False, description="Include a signed URL for the transformed document" + ), ): job_crud = DocTransformationJobCrud(session, current_user.project_id) doc_crud = DocumentCrud(session, current_user.project_id) @@ -34,15 +40,21 @@ def get_transformation_job( if getattr(job, "transformed_document_id", None): document = doc_crud.read_one(job.transformed_document_id) transformed_doc_schema = TransformedDocumentPublic.model_validate( - document, from_attributes=True - ) + document, from_attributes=True + ) if include_url: - storage = get_cloud_storage(session=session, project_id=current_user.project_id) - transformed_doc_schema.signed_url= storage.get_signed_url(document.object_store_url) + storage = get_cloud_storage( + session=session, project_id=current_user.project_id + ) + transformed_doc_schema.signed_url = storage.get_signed_url( + document.object_store_url + ) job_schema = DocTransformationJobPublic.model_validate(job, from_attributes=True) - job_schema = job_schema.model_copy(update={"transformed_document": transformed_doc_schema}) + job_schema = job_schema.model_copy( + update={"transformed_document": transformed_doc_schema} + ) return APIResponse.success_response(job_schema) @@ -58,7 +70,9 @@ def get_multiple_transformation_jobs( job_ids: list[UUID] = Query( description="List of transformation job IDs", min=1, max_length=100 ), - include_url: bool = Query(False, description="Include a signed URL for each transformed document"), + include_url: bool = Query( + False, description="Include a signed URL for each transformed document" + ), ): job_crud = DocTransformationJobCrud(session, project_id=current_user.project_id) doc_crud = DocumentCrud(session, project_id=current_user.project_id) @@ -87,7 +101,9 @@ def get_multiple_transformation_jobs( document.object_store_url ) - job_schema = DocTransformationJobPublic.model_validate(job, from_attributes=True) + job_schema = DocTransformationJobPublic.model_validate( + job, from_attributes=True + ) job_schema = job_schema.model_copy( update={"transformed_document": transformed_doc_schema} ) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 009bf7dff..a835479d2 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -76,8 +76,7 @@ async def upload_doc( | None = Form( None, description="Name of the transformer to apply when converting. " ), - callback_url: str - | None = Form("URL to call to report endpoint status") + callback_url: str | None = Form("URL to call to report endpoint status"), ): # Determine source file format try: @@ -125,15 +124,17 @@ async def upload_doc( job_info: TransformationJobInfo | None = None if target_format and actual_transformer: job_crud = DocTransformationJobCrud(session, current_user.project_id) - job = job_crud.create(DocTransformJobCreate(source_document_id=source_document.id)) - + job = job_crud.create( + DocTransformJobCreate(source_document_id=source_document.id) + ) + transformation_job.start_job( db=session, job_id=job.id, current_user=current_user, transformer_name=actual_transformer, target_format=target_format, - callback_url = callback_url + callback_url=callback_url, ) job_info = TransformationJobInfo( message=f"Document accepted for transformation from {source_format} to {target_format}.", @@ -237,10 +238,12 @@ def doc_info( document = crud.read_one(doc_id) if document.source_document_id is None: - doc_schema = DocumentPublic.model_validate(document, from_attributes=True) + doc_schema = DocumentPublic.model_validate(document, from_attributes=True) else: - doc_schema = TransformedDocumentPublic.model_validate(document, from_attributes=True) + doc_schema = TransformedDocumentPublic.model_validate( + document, from_attributes=True + ) if include_url: storage = get_cloud_storage(session=session, project_id=current_user.project_id) diff --git a/backend/app/crud/document/doc_transformation_job.py b/backend/app/crud/document/doc_transformation_job.py index e5c316e6c..0fd278013 100644 --- a/backend/app/crud/document/doc_transformation_job.py +++ b/backend/app/crud/document/doc_transformation_job.py @@ -6,10 +6,10 @@ from app.crud import DocumentCrud from app.models import ( - DocTransformationJob, - TransformationStatus, - DocTransformJobCreate, - DocTransformJobUpdate + DocTransformationJob, + TransformationStatus, + DocTransformJobCreate, + DocTransformJobUpdate, ) from app.models.document import Document from app.core.util import now @@ -24,7 +24,6 @@ def __init__(self, session: Session, project_id: int): self.project_id = project_id def create(self, payload: DocTransformJobCreate) -> DocTransformationJob: - job = DocTransformationJob(**payload.model_dump()) self.session.add(job) self.session.commit() @@ -74,7 +73,7 @@ def read_each(self, job_ids: set[UUID]) -> list[DocTransformationJob]: def update( self, job_id: UUID, - patch: DocTransformJobUpdate, + patch: DocTransformJobUpdate, ) -> DocTransformationJob: """Update an existing doc transformation job and return the updated row.""" job = self.read_one(job_id) diff --git a/backend/app/models/doc_transformation_job.py b/backend/app/models/doc_transformation_job.py index 94643fe6c..0438c7782 100644 --- a/backend/app/models/doc_transformation_job.py +++ b/backend/app/models/doc_transformation_job.py @@ -8,7 +8,6 @@ from app.core.util import now - class TransformationStatus(str, enum.Enum): PENDING = "PENDING" PROCESSING = "PROCESSING" @@ -44,6 +43,3 @@ class DocTransformJobUpdate(SQLModel): status: TransformationStatus | None = None error_message: str | None = None trace_id: str | None = None - - - diff --git a/backend/app/models/document.py b/backend/app/models/document.py index 858a7940c..81667ce81 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -56,12 +56,14 @@ class DocumentPublic(DocumentBase): default=None, description="A signed URL for accessing the document" ) + class TransformedDocumentPublic(DocumentPublic): source_document_id: UUID | None = Field( default=None, description="The ID of the source document if this document is a transformation", ) + class TransformationJobInfo(SQLModel): message: str job_id: UUID = Field(description="The unique identifier of the transformation job") @@ -78,8 +80,8 @@ class DocumentUploadResponse(DocumentPublic): class DocTransformationJobPublic(SQLModel): - id: UUID - source_document_id: UUID + id: UUID + source_document_id: UUID status: TransformationStatus transformed_document: TransformedDocumentPublic | None = None error_message: str | None = None diff --git a/backend/app/services/doctransform/job.py b/backend/app/services/doctransform/job.py index 381ac9709..0fd723844 100644 --- a/backend/app/services/doctransform/job.py +++ b/backend/app/services/doctransform/job.py @@ -13,12 +13,13 @@ from app.crud.document.doc_transformation_job import DocTransformationJobCrud from app.crud.document.document import DocumentCrud from app.models.document import Document -from app.models import ( - DocTransformJobUpdate, - TransformationStatus, +from app.models import ( + DocTransformJobUpdate, + TransformationStatus, DocTransformationJobPublic, TransformedDocumentPublic, - DocTransformationJob ) + DocTransformationJob, +) from app.core.cloud import get_cloud_storage from app.api.deps import CurrentUserOrgProject from app.celery.utils import start_low_priority_job @@ -35,26 +36,25 @@ def start_job( job_id: UUID, transformer_name: str, target_format: str, - callback_url: str + callback_url: str, ) -> UUID: - trace_id = correlation_id.get() or "N/A" job_crud = DocTransformationJobCrud(db, project_id=current_user.project_id) job_crud.update(job_id, DocTransformJobUpdate(trace_id=trace_id)) - job=job_crud.read_one(job_id) + job = job_crud.read_one(job_id) project_id = current_user.project_id task_id = start_low_priority_job( - function_path="app.services.doctransform.job.execute_job", - project_id=project_id, - job_id=str(job.id), - source_document_id=str(job.source_document_id), - trace_id=trace_id, - transformer_name=transformer_name, - target_format=target_format, - callback_url = callback_url - ) + function_path="app.services.doctransform.job.execute_job", + project_id=project_id, + job_id=str(job.id), + source_document_id=str(job.source_document_id), + trace_id=trace_id, + transformer_name=transformer_name, + target_format=target_format, + callback_url=callback_url, + ) logger.info( f"[start_job] Job scheduled for document transformation | id: {job.id}, project_id: {project_id}, task_id: {task_id}" @@ -128,7 +128,10 @@ def execute_job( try: logger.info( "[doc_transform.execute_job] started | job_id=%s | transformer=%s | target=%s | project_id=%s", - job_uuid, transformer_name, target_format, project_id + job_uuid, + transformer_name, + target_format, + project_id, ) # --- mark PROCESSING and fetch source + storage --- @@ -136,7 +139,9 @@ def execute_job( job_crud = DocTransformationJobCrud(session=db, project_id=project_id) job_for_payload = job_crud.update( job_uuid, - DocTransformJobUpdate(status=TransformationStatus.PROCESSING, task_id=task_id), + DocTransformJobUpdate( + status=TransformationStatus.PROCESSING, task_id=task_id + ), ) doc_crud = DocumentCrud(session=db, project_id=project_id) @@ -199,7 +204,7 @@ def execute_job( try: signed_url = getattr(storage, "get_signed_url", None) if callable(signed_url): - created.signed_url = signed_url(created.object_store_url) + created.signed_url = signed_url(created.object_store_url) except Exception: pass @@ -208,7 +213,9 @@ def execute_job( elapsed = time.time() - start_time logger.info( "[doc_transform.execute_job] completed | job_id=%s | transformed_doc_id=%s | time=%.2fs", - job_uuid, created.id, elapsed + job_uuid, + created.id, + elapsed, ) if callback_url: @@ -217,7 +224,9 @@ def execute_job( except Exception as e: logger.error( "[doc_transform.execute_job] FAILED | job_id=%s | error=%s", - job_uuid, e, exc_info=True + job_uuid, + e, + exc_info=True, ) # try to mark the job as FAILED and send failure callback @@ -226,12 +235,15 @@ def execute_job( job_crud = DocTransformationJobCrud(session=db, project_id=project_id) job_for_payload = job_crud.update( job_uuid, - DocTransformJobUpdate(status=TransformationStatus.FAILED, error_message=str(e)), + DocTransformJobUpdate( + status=TransformationStatus.FAILED, error_message=str(e) + ), ) except Exception as db_error: logger.error( "[doc_transform.execute_job] failed to persist FAILED status | job_id=%s | db_error=%s", - job_uuid, db_error + job_uuid, + db_error, ) if callback_url and job_for_payload: @@ -241,7 +253,8 @@ def execute_job( except Exception as cb_error: logger.error( "[doc_transform.execute_job] callback failed | job_id=%s | error=%s", - job_uuid, cb_error + job_uuid, + cb_error, ) # bubble up for caller/infra From cd2fd2c2b5b44579bd21aeb108afc859ee27ed81 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 09:27:26 +0530 Subject: [PATCH 03/15] all left changes and test case fixes --- backend/app/api/routes/documents.py | 8 +- backend/app/celery/tasks/job_execution.py | 4 +- backend/app/celery/utils.py | 4 +- backend/app/models/doc_transformation_job.py | 12 ++ backend/app/models/document.py | 2 +- backend/app/services/doctransform/job.py | 40 ++-- .../documents/test_route_document_upload.py | 3 - .../api/routes/test_doc_transformation_job.py | 183 ++++++++++------ .../test_service/test_start_job.py | 174 ---------------- .../test_crud_document_delete.py | 0 .../test_crud_document_read_many.py | 0 .../test_crud_document_read_one.py | 0 .../test_crud_document_update.py | 0 .../test_doc_transformation_job.py | 146 +++++++------ .../doctransformer/test_registry.py | 4 +- .../doctransformer/test_service/conftest.py | 0 .../test_service/test_execute_job.py | 88 +++++--- .../test_service/test_execute_job_errors.py | 69 ++++--- .../test_service/test_integration.py | 54 +++-- .../test_service/test_start_job.py | 195 ++++++++++++++++++ .../doctransformer/test_service/utils.py | 0 21 files changed, 570 insertions(+), 416 deletions(-) delete mode 100644 backend/app/tests/core/doctransformer/test_service/test_start_job.py rename backend/app/tests/crud/documents/{ => documents}/test_crud_document_delete.py (100%) rename backend/app/tests/crud/documents/{ => documents}/test_crud_document_read_many.py (100%) rename backend/app/tests/crud/documents/{ => documents}/test_crud_document_read_one.py (100%) rename backend/app/tests/crud/documents/{ => documents}/test_crud_document_update.py (100%) rename backend/app/tests/crud/{ => documents}/test_doc_transformation_job.py (62%) rename backend/app/tests/{core => services}/doctransformer/test_registry.py (95%) rename backend/app/tests/{core => services}/doctransformer/test_service/conftest.py (100%) rename backend/app/tests/{core => services}/doctransformer/test_service/test_execute_job.py (76%) rename backend/app/tests/{core => services}/doctransformer/test_service/test_execute_job_errors.py (74%) rename backend/app/tests/{core => services}/doctransformer/test_service/test_integration.py (78%) create mode 100644 backend/app/tests/services/doctransformer/test_service/test_start_job.py rename backend/app/tests/{core => services}/doctransformer/test_service/utils.py (100%) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index a835479d2..8632cdac6 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -32,6 +32,7 @@ DocumentUploadResponse, DocTransformJobCreate, Message, + TransformationStatus, TransformationJobInfo, ) from app.services.collections.helpers import pick_service_for_documennt @@ -128,7 +129,7 @@ async def upload_doc( DocTransformJobCreate(source_document_id=source_document.id) ) - transformation_job.start_job( + transformation_job_id = transformation_job.start_job( db=session, job_id=job.id, current_user=current_user, @@ -138,9 +139,10 @@ async def upload_doc( ) job_info = TransformationJobInfo( message=f"Document accepted for transformation from {source_format} to {target_format}.", - job_id=str(job.id), + job_id=str(transformation_job_id), + status=TransformationStatus.PENDING, transformer=actual_transformer, - status_check_url=f"/documents/transformations/{job.id}", + status_check_url=f"/documents/transformations/{transformation_job_id}", ) document_schema = DocumentPublic.model_validate( diff --git a/backend/app/celery/tasks/job_execution.py b/backend/app/celery/tasks/job_execution.py index 6464d2cfd..8274de960 100644 --- a/backend/app/celery/tasks/job_execution.py +++ b/backend/app/celery/tasks/job_execution.py @@ -22,7 +22,7 @@ def execute_high_priority_task( Use this for urgent operations that need immediate processing. Args: - function_path: Import path to the execute_job function (e.g., "app.core.doctransform.service.execute_job") + function_path: Import path to the execute_job function (e.g., "app.services.doctransform.service.execute_job") project_id: ID of the project executing the job job_id: ID of the job (should already exist in database) trace_id: Trace/correlation ID to preserve context across Celery tasks @@ -47,7 +47,7 @@ def execute_low_priority_task( Use this for background operations that can wait. Args: - function_path: Import path to the execute_job function (e.g., "app.core.doctransform.service.execute_job") + function_path: Import path to the execute_job function (e.g., "app.services.doctransform.service.execute_job") project_id: ID of the project executing the job job_id: ID of the job (should already exist in database) trace_id: Trace/correlation ID to preserve context across Celery tasks diff --git a/backend/app/celery/utils.py b/backend/app/celery/utils.py index 5e0d485a4..957c02d9a 100644 --- a/backend/app/celery/utils.py +++ b/backend/app/celery/utils.py @@ -22,7 +22,7 @@ def start_high_priority_job( Start a high priority job using Celery. Args: - function_path: Import path to the execute_job function (e.g., "app.core.doctransform.service.execute_job") + function_path: Import path to the execute_job function (e.g., "app.services.doctransform.service.execute_job") project_id: ID of the project executing the job job_id: ID of the job (should already exist in database) trace_id: Trace/correlation ID to preserve context across Celery tasks @@ -50,7 +50,7 @@ def start_low_priority_job( Start a low priority job using Celery. Args: - function_path: Import path to the execute_job function (e.g., "app.core.doctransform.service.execute_job") + function_path: Import path to the execute_job function (e.g., "app.services.doctransform.service.execute_job") project_id: ID of the project executing the job job_id: ID of the job (should already exist in database) trace_id: Trace/correlation ID to preserve context across Celery tasks diff --git a/backend/app/models/doc_transformation_job.py b/backend/app/models/doc_transformation_job.py index 0438c7782..e2c844288 100644 --- a/backend/app/models/doc_transformation_job.py +++ b/backend/app/models/doc_transformation_job.py @@ -32,6 +32,18 @@ class DocTransformationJob(SQLModel, table=True): inserted_at: datetime = Field(default_factory=now) updated_at: datetime = Field(default_factory=now) + @property + def job_id(self) -> UUID: + return self.id + + @property + def job_inserted_at(self) -> datetime: + return self.inserted_at + + @property + def job_updated_at(self) -> datetime: + return self.updated_at + class DocTransformJobCreate(SQLModel): source_document_id: UUID diff --git a/backend/app/models/document.py b/backend/app/models/document.py index 81667ce81..60d281423 100644 --- a/backend/app/models/document.py +++ b/backend/app/models/document.py @@ -80,7 +80,7 @@ class DocumentUploadResponse(DocumentPublic): class DocTransformationJobPublic(SQLModel): - id: UUID + job_id: UUID source_document_id: UUID status: TransformationStatus transformed_document: TransformedDocumentPublic | None = None diff --git a/backend/app/services/doctransform/job.py b/backend/app/services/doctransform/job.py index 0fd723844..1a282b57e 100644 --- a/backend/app/services/doctransform/job.py +++ b/backend/app/services/doctransform/job.py @@ -12,13 +12,14 @@ from app.crud.document.doc_transformation_job import DocTransformationJobCrud from app.crud.document.document import DocumentCrud -from app.models.document import Document from app.models import ( + Document, DocTransformJobUpdate, TransformationStatus, DocTransformationJobPublic, TransformedDocumentPublic, DocTransformationJob, + TransformedDocumentPublic, ) from app.core.cloud import get_cloud_storage from app.api.deps import CurrentUserOrgProject @@ -36,8 +37,8 @@ def start_job( job_id: UUID, transformer_name: str, target_format: str, - callback_url: str, -) -> UUID: + callback_url: str | None, +) -> str: trace_id = correlation_id.get() or "N/A" job_crud = DocTransformationJobCrud(db, project_id=current_user.project_id) job_crud.update(job_id, DocTransformJobUpdate(trace_id=trace_id)) @@ -64,7 +65,7 @@ def start_job( def build_success_payload( job: DocTransformationJob, - transformed_doc: Document, + transformed_doc: TransformedDocumentPublic, ) -> dict: """ { @@ -120,12 +121,13 @@ def execute_job( start_time = time.time() tmp_dir: Path | None = None - job_uuid = UUID(job_id) - source_uuid = UUID(source_document_id) job_for_payload = None # keep latest job snapshot for payloads try: + job_uuid = UUID(job_id) + source_uuid = UUID(source_document_id) + logger.info( "[doc_transform.execute_job] started | job_id=%s | transformer=%s | target=%s | project_id=%s", job_uuid, @@ -134,7 +136,6 @@ def execute_job( project_id, ) - # --- mark PROCESSING and fetch source + storage --- with Session(engine) as db: job_crud = DocTransformationJobCrud(session=db, project_id=project_id) job_for_payload = job_crud.update( @@ -180,7 +181,6 @@ def execute_job( ) dest = storage.put(file_upload, Path(str(transformed_doc_id))) - # --- create Document row, mark job COMPLETE, build + send callback --- with Session(engine) as db: new_doc = Document( id=transformed_doc_id, @@ -189,7 +189,6 @@ def execute_job( object_store_url=str(dest), source_document_id=source_doc_id, ) - # NOTE: your code used update(); keep it if your CRUD's update() upserts, else use create() created = DocumentCrud(db, project_id).update(new_doc) job_crud = DocTransformationJobCrud(session=db, project_id=project_id) @@ -201,14 +200,24 @@ def execute_job( ), ) + signed_url = None try: - signed_url = getattr(storage, "get_signed_url", None) - if callable(signed_url): - created.signed_url = signed_url(created.object_store_url) - except Exception: - pass + get_signed_url = getattr(storage, "get_signed_url", None) + if callable(get_signed_url): + signed_url = get_signed_url(created.object_store_url) + except Exception as e: + logger.warning( + "[doc_transform] failed to generate signed URL for doc %s: %s", + created.id, + e, + ) + + transformed_public = TransformedDocumentPublic.model_validate( + created, + update={"signed_url": signed_url} if signed_url else None, + ) - success_payload = build_success_payload(job_for_payload, created) + success_payload = build_success_payload(job_for_payload, transformed_public) elapsed = time.time() - start_time logger.info( @@ -229,7 +238,6 @@ def execute_job( exc_info=True, ) - # try to mark the job as FAILED and send failure callback try: with Session(engine) as db: job_crud = DocTransformationJobCrud(session=db, project_id=project_id) diff --git a/backend/app/tests/api/routes/documents/test_route_document_upload.py b/backend/app/tests/api/routes/documents/test_route_document_upload.py index e364c1b44..67451cf86 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_upload.py +++ b/backend/app/tests/api/routes/documents/test_route_document_upload.py @@ -177,8 +177,6 @@ def test_upload_with_transformation( transformation_job = response.data["transformation_job"] assert transformation_job["job_id"] == mock_job_id - assert transformation_job["source_format"] == "pdf" - assert transformation_job["target_format"] == "markdown" assert transformation_job["transformer"] == "zerox" # Default transformer assert ( transformation_job["status_check_url"] @@ -321,7 +319,6 @@ def test_upload_response_structure_without_transformation( "fname", "inserted_at", "updated_at", - "source_document_id", ] for field in required_fields: assert field in response.data diff --git a/backend/app/tests/api/routes/test_doc_transformation_job.py b/backend/app/tests/api/routes/test_doc_transformation_job.py index 5f4d1ec6c..a0b50aac4 100644 --- a/backend/app/tests/api/routes/test_doc_transformation_job.py +++ b/backend/app/tests/api/routes/test_doc_transformation_job.py @@ -3,7 +3,11 @@ from app.core.config import settings from app.crud.document.doc_transformation_job import DocTransformationJobCrud -from app.models import TransformationStatus +from app.models import ( + TransformationStatus, + DocTransformJobCreate, + DocTransformJobUpdate, +) from app.tests.utils.document import DocumentStore from app.tests.utils.auth import TestAuthContext @@ -12,24 +16,22 @@ class TestGetTransformationJob: def test_get_existing_job_success( self, client: TestClient, db: Session, user_api_key: TestAuthContext ): - """Test successfully retrieving an existing transformation job.""" document = DocumentStore(db, user_api_key.project_id).put() - job = DocTransformationJobCrud(db, user_api_key.project_id) - created_job = job.create(document.id) + crud = DocTransformationJobCrud(db, user_api_key.project_id) + created_job = crud.create(DocTransformJobCreate(source_document_id=document.id)) - response = client.get( - f"{settings.API_V1_STR}/documents/transformations/{created_job.id}", + resp = client.get( + f"{settings.API_V1_STR}/documents/transformation/{created_job.id}", headers={"X-API-KEY": user_api_key.key}, ) + assert resp.status_code == 200 + data = resp.json()["data"] - assert response.status_code == 200 - data = response.json() - assert "data" in data - assert data["data"]["id"] is not None - assert data["data"]["source_document_id"] == str(document.id) - assert data["data"]["status"] == TransformationStatus.PENDING - assert data["data"]["error_message"] is None - assert data["data"]["transformed_document_id"] is None + assert data["job_id"] is not None + assert data["source_document_id"] == str(document.id) + assert data["status"] == TransformationStatus.PENDING + assert data["error_message"] is None + assert data["transformed_document"] is None def test_get_nonexistent_job_404( self, client: TestClient, db: Session, user_api_key: TestAuthContext @@ -38,7 +40,7 @@ def test_get_nonexistent_job_404( fake_uuid = "00000000-0000-0000-0000-000000000001" response = client.get( - f"{settings.API_V1_STR}/documents/transformations/{fake_uuid}", + f"{settings.API_V1_STR}/documents/transformation/{fake_uuid}", headers={"X-API-KEY": user_api_key.key}, ) @@ -47,15 +49,13 @@ def test_get_nonexistent_job_404( def test_get_job_invalid_uuid_422( self, client: TestClient, user_api_key: TestAuthContext ): - """Test getting a job with invalid UUID format returns 422.""" - invalid_uuid = "not-a-uuid" - - response = client.get( - f"{settings.API_V1_STR}/documents/transformations/{invalid_uuid}", + resp = client.get( + f"{settings.API_V1_STR}/documents/transformation/not-a-uuid", headers={"X-API-KEY": user_api_key.key}, ) - - assert response.status_code == 422 + assert resp.status_code == 422 + body = resp.json() + assert "error" in body and "valid UUID" in body["error"] def test_get_job_different_project_404( self, @@ -68,11 +68,10 @@ def test_get_job_different_project_404( store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) - # Try to access with user from different project (superuser) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/{job.id}", + f"{settings.API_V1_STR}/documents/transformation/{job.id}", headers={"X-API-KEY": superuser_api_key.key}, ) @@ -86,24 +85,28 @@ def test_get_completed_job_with_result( crud = DocTransformationJobCrud(db, user_api_key.project_id) source_document = store.put() transformed_document = store.put() - job = crud.create(source_document.id) + job = crud.create(DocTransformJobCreate(source_document_id=source_document.id)) # Update job to completed status - crud.update_status( + crud.update( job.id, - TransformationStatus.COMPLETED, - transformed_document_id=transformed_document.id, + DocTransformJobUpdate( + status=TransformationStatus.COMPLETED, + transformed_document_id=transformed_document.id, + ), ) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/{job.id}", + f"{settings.API_V1_STR}/documents/transformation/{job.id}", headers={"X-API-KEY": user_api_key.key}, ) assert response.status_code == 200 data = response.json() assert data["data"]["status"] == TransformationStatus.COMPLETED - assert data["data"]["transformed_document_id"] == str(transformed_document.id) + assert data["data"]["transformed_document"]["id"] == str( + transformed_document.id + ) def test_get_failed_job_with_error( self, client: TestClient, db: Session, user_api_key: TestAuthContext @@ -112,14 +115,19 @@ def test_get_failed_job_with_error( store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) error_msg = "Transformation failed due to invalid format" # Update job to failed status - crud.update_status(job.id, TransformationStatus.FAILED, error_message=error_msg) + crud.update( + job.id, + DocTransformJobUpdate( + status=TransformationStatus.FAILED, error_message=error_msg + ), + ) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/{job.id}", + f"{settings.API_V1_STR}/documents/transformation/{job.id}", headers={"X-API-KEY": user_api_key.key}, ) @@ -137,11 +145,14 @@ def test_get_multiple_jobs_success( store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) documents = store.fill(3) - jobs = [crud.create(doc.id) for doc in documents] + jobs = [ + crud.create(DocTransformJobCreate(source_document_id=doc.id)) + for doc in documents + ] job_ids_params = "&".join(f"job_ids={job.id}" for job in jobs) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?{job_ids_params}", + f"{settings.API_V1_STR}/documents/transformation/?{job_ids_params}", headers={"X-API-KEY": user_api_key.key}, ) @@ -151,7 +162,7 @@ def test_get_multiple_jobs_success( assert len(data["data"]["jobs"]) == 3 assert len(data["data"]["jobs_not_found"]) == 0 - returned_ids = {job["id"] for job in data["data"]["jobs"]} + returned_ids = {job["job_id"] for job in data["data"]["jobs"]} expected_ids = {str(job.id) for job in jobs} assert returned_ids == expected_ids @@ -162,7 +173,10 @@ def test_get_mixed_existing_nonexisting_jobs( store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) documents = store.fill(2) - jobs = [crud.create(doc.id) for doc in documents] + jobs = [ + crud.create(DocTransformJobCreate(source_document_id=doc.id)) + for doc in documents + ] fake_uuid = "00000000-0000-0000-0000-000000000001" job_ids_params = ( @@ -170,7 +184,7 @@ def test_get_mixed_existing_nonexisting_jobs( ) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?{job_ids_params}", + f"{settings.API_V1_STR}/documents/transformation/?{job_ids_params}", headers={"X-API-KEY": user_api_key.key}, ) @@ -185,69 +199,97 @@ def test_get_jobs_with_empty_string( ): """Test retrieving jobs with empty job_ids parameter.""" response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?job_ids=", + f"{settings.API_V1_STR}/documents/transformation/?job_ids=", headers={"X-API-KEY": user_api_key.key}, ) assert response.status_code == 422 + body = response.json() + assert body["success"] is False + assert body["data"] is None + assert "error" in body + assert "valid UUID" in body["error"] or "expected length" in body["error"] + assert "job_ids" in body["error"] def test_get_jobs_with_whitespace_only( self, client: TestClient, user_api_key: TestAuthContext ): """Test retrieving jobs with whitespace-only job_ids.""" response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?job_ids= ", + f"{settings.API_V1_STR}/documents/transformation/?job_ids= ", headers={"X-API-KEY": user_api_key.key}, ) assert response.status_code == 422 + body = response.json() + assert body["success"] is False + assert body["data"] is None + assert "error" in body + assert "valid UUID" in body["error"] def test_get_jobs_invalid_uuid_format_422( self, client: TestClient, user_api_key: TestAuthContext ): - """Test that invalid UUID format returns 422.""" + """Invalid UUID format should return 422 (validation error).""" invalid_uuid = "not-a-uuid" response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?job_ids={invalid_uuid}", + f"{settings.API_V1_STR}/documents/transformation/?job_ids={invalid_uuid}", headers={"X-API-KEY": user_api_key.key}, ) assert response.status_code == 422 - data = response.json() - assert "Input should be a valid UUID" in data["error"] + body = response.json() + assert body["success"] is False + assert body["data"] is None + assert "error" in body + assert "valid UUID" in body["error"] or "expected length" in body["error"] + assert "job_ids" in body["error"] def test_get_jobs_mixed_valid_invalid_uuid_422( self, client: TestClient, db: Session, user_api_key: TestAuthContext ): - """Test that mixed valid/invalid UUIDs returns 422.""" + """Mixed valid/invalid UUIDs should return 422.""" store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) job_ids_params = f"job_ids={job.id}&job_ids=not-a-uuid" response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?{job_ids_params}", + f"{settings.API_V1_STR}/documents/transformation/?{job_ids_params}", headers={"X-API-KEY": user_api_key.key}, ) assert response.status_code == 422 - data = response.json() - assert "Input should be a valid UUID" in data["error"] - assert "job_ids" in data["error"] + body = response.json() + assert body["success"] is False + assert body["data"] is None + assert "error" in body + assert "job_ids" in body["error"] + assert ( + "valid UUID" in body["error"] + or "invalid character" in body["error"] + or "invalid length" in body["error"] + ) def test_get_jobs_missing_parameter_422( self, client: TestClient, user_api_key: TestAuthContext ): - """Test that missing job_ids parameter returns empty results.""" + """Missing job_ids parameter should 422 (Query(min=1)).""" response = client.get( - f"{settings.API_V1_STR}/documents/transformations/", + f"{settings.API_V1_STR}/documents/transformation/", headers={"X-API-KEY": user_api_key.key}, ) assert response.status_code == 422 + body = response.json() + assert body["success"] is False + assert body["data"] is None + assert "error" in body + assert "Field required" in body["error"] + assert "job_ids" in body["error"] def test_get_jobs_different_project_not_found( self, @@ -256,15 +298,14 @@ def test_get_jobs_different_project_not_found( user_api_key: TestAuthContext, superuser_api_key: TestAuthContext, ): - """Test that jobs from different projects are not returned.""" + """Jobs from different projects are not returned.""" store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) - # Try to access with user from different project (superuser) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?job_ids={job.id}", + f"{settings.API_V1_STR}/documents/transformation/?job_ids={job.id}", headers={"X-API-KEY": superuser_api_key.key}, ) @@ -281,22 +322,33 @@ def test_get_jobs_with_various_statuses( store = DocumentStore(db, user_api_key.project_id) crud = DocTransformationJobCrud(db, user_api_key.project_id) documents = store.fill(4) - jobs = [crud.create(doc.id) for doc in documents] + jobs = [ + crud.create(DocTransformJobCreate(source_document_id=doc.id)) + for doc in documents + ] - crud.update_status(jobs[1].id, TransformationStatus.PROCESSING) - crud.update_status( + crud.update( + jobs[1].id, DocTransformJobUpdate(status=TransformationStatus.PROCESSING) + ) + crud.update( jobs[2].id, - TransformationStatus.COMPLETED, - transformed_document_id=documents[2].id, + DocTransformJobUpdate( + status=TransformationStatus.COMPLETED, + transformed_document_id=documents[2].id, + ), ) - crud.update_status( - jobs[3].id, TransformationStatus.FAILED, error_message="Test error" + crud.update( + jobs[3].id, + DocTransformJobUpdate( + status=TransformationStatus.FAILED, + error_message="Test error", + ), ) job_ids_params = "&".join(f"job_ids={job.id}" for job in jobs) response = client.get( - f"{settings.API_V1_STR}/documents/transformations/?{job_ids_params}", + f"{settings.API_V1_STR}/documents/transformation/?{job_ids_params}", headers={"X-API-KEY": user_api_key.key}, ) @@ -304,7 +356,6 @@ def test_get_jobs_with_various_statuses( data = response.json() assert len(data["data"]["jobs"]) == 4 - # Check that all statuses are represented statuses = {job["status"] for job in data["data"]["jobs"]} expected_statuses = { TransformationStatus.PENDING, diff --git a/backend/app/tests/core/doctransformer/test_service/test_start_job.py b/backend/app/tests/core/doctransformer/test_service/test_start_job.py deleted file mode 100644 index d7aa006f9..000000000 --- a/backend/app/tests/core/doctransformer/test_service/test_start_job.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Tests for the start_job function in document transformation service. -""" -from typing import Tuple -from uuid import uuid4 - -import pytest -from fastapi import BackgroundTasks -from sqlmodel import Session - -from app.services.doctransform.job import execute_job, start_job -from app.services.doctransform.registry import TRANSFORMERS -from app.core.exception_handlers import HTTPException -from app.models import ( - Document, - DocTransformationJob, - Project, - TransformationStatus, - UserProjectOrg, -) -from app.tests.core.doctransformer.test_service.utils import ( - DocTransformTestBase, - MockTestTransformer, -) - - -class TestStartJob(DocTransformTestBase): - """Test cases for the start_job function.""" - - def test_start_job_success( - self, - db: Session, - current_user: UserProjectOrg, - test_document: Tuple[Document, Project], - background_tasks: BackgroundTasks, - ) -> None: - """Test successful job creation and scheduling.""" - document, _ = test_document - job_id = start_job( - db=db, - current_user=current_user, - source_document_id=document.id, - transformer_name="test-transformer", - target_format="markdown", - background_tasks=background_tasks, - ) - - job = db.get(DocTransformationJob, job_id) - assert job is not None - assert job.source_document_id == document.id - assert job.status == TransformationStatus.PENDING - assert job.error_message is None - assert job.transformed_document_id is None - - assert len(background_tasks.tasks) == 1 - task = background_tasks.tasks[0] - assert task.func == execute_job - assert task.args[0] == current_user.project_id - assert task.args[1] == job_id - assert task.args[2] == "test-transformer" - assert task.args[3] == "markdown" - - def test_start_job_with_nonexistent_document( - self, - db: Session, - current_user: UserProjectOrg, - background_tasks: BackgroundTasks, - ) -> None: - """Test job creation with non-existent document raises error.""" - nonexistent_id = uuid4() - - with pytest.raises(HTTPException) as exc_info: - start_job( - db=db, - current_user=current_user, - source_document_id=nonexistent_id, - transformer_name="test-transformer", - target_format="markdown", - background_tasks=background_tasks, - ) - - assert exc_info.value.status_code == 404 - assert "Document not found" in str(exc_info.value.detail) - - def test_start_job_with_deleted_document( - self, - db: Session, - current_user: UserProjectOrg, - test_document: Tuple[Document, Project], - background_tasks: BackgroundTasks, - ) -> None: - """Test job creation with deleted document raises error.""" - document, _ = test_document - - document.is_deleted = True - db.add(document) - db.commit() - - with pytest.raises(HTTPException) as exc_info: - start_job( - db=db, - current_user=current_user, - source_document_id=document.id, - transformer_name="test-transformer", - target_format="markdown", - background_tasks=background_tasks, - ) - - assert exc_info.value.status_code == 404 - assert "Document not found" in str(exc_info.value.detail) - - def test_start_job_with_different_formats( - self, - db: Session, - current_user: UserProjectOrg, - test_document: Tuple[Document, Project], - background_tasks: BackgroundTasks, - monkeypatch, - ) -> None: - """Test job creation with different target formats.""" - # Add the test transformer to the registry for this test - monkeypatch.setitem(TRANSFORMERS, "test", MockTestTransformer) - - document, _ = test_document - formats = ["markdown", "text", "html"] - - for target_format in formats: - job_id = start_job( - db=db, - current_user=current_user, - source_document_id=document.id, - transformer_name="test", - target_format=target_format, - background_tasks=background_tasks, - ) - - job = db.get(DocTransformationJob, job_id) - assert job is not None - assert job.status == TransformationStatus.PENDING - - task = background_tasks.tasks[-1] - assert task.args[3] == target_format - - @pytest.mark.parametrize("transformer_name", ["test"]) - def test_start_job_with_different_transformers( - self, - db: Session, - current_user: UserProjectOrg, - test_document: Tuple[Document, Project], - background_tasks: BackgroundTasks, - transformer_name: str, - monkeypatch, - ) -> None: - """Test job creation with different transformer names.""" - # Add the test transformer to the registry for this test - monkeypatch.setitem(TRANSFORMERS, "test", MockTestTransformer) - - document, _ = test_document - - job_id = start_job( - db=db, - current_user=current_user, - source_document_id=document.id, - transformer_name=transformer_name, - target_format="markdown", - background_tasks=background_tasks, - ) - - job = db.get(DocTransformationJob, job_id) - assert job is not None - assert job.status == TransformationStatus.PENDING - - task = background_tasks.tasks[-1] - assert task.args[2] == transformer_name diff --git a/backend/app/tests/crud/documents/test_crud_document_delete.py b/backend/app/tests/crud/documents/documents/test_crud_document_delete.py similarity index 100% rename from backend/app/tests/crud/documents/test_crud_document_delete.py rename to backend/app/tests/crud/documents/documents/test_crud_document_delete.py diff --git a/backend/app/tests/crud/documents/test_crud_document_read_many.py b/backend/app/tests/crud/documents/documents/test_crud_document_read_many.py similarity index 100% rename from backend/app/tests/crud/documents/test_crud_document_read_many.py rename to backend/app/tests/crud/documents/documents/test_crud_document_read_many.py diff --git a/backend/app/tests/crud/documents/test_crud_document_read_one.py b/backend/app/tests/crud/documents/documents/test_crud_document_read_one.py similarity index 100% rename from backend/app/tests/crud/documents/test_crud_document_read_one.py rename to backend/app/tests/crud/documents/documents/test_crud_document_read_one.py diff --git a/backend/app/tests/crud/documents/test_crud_document_update.py b/backend/app/tests/crud/documents/documents/test_crud_document_update.py similarity index 100% rename from backend/app/tests/crud/documents/test_crud_document_update.py rename to backend/app/tests/crud/documents/documents/test_crud_document_update.py diff --git a/backend/app/tests/crud/test_doc_transformation_job.py b/backend/app/tests/crud/documents/test_doc_transformation_job.py similarity index 62% rename from backend/app/tests/crud/test_doc_transformation_job.py rename to backend/app/tests/crud/documents/test_doc_transformation_job.py index 63ffb4fab..e3df0a006 100644 --- a/backend/app/tests/crud/test_doc_transformation_job.py +++ b/backend/app/tests/crud/documents/test_doc_transformation_job.py @@ -1,8 +1,15 @@ import pytest from sqlmodel import Session +from sqlalchemy.exc import IntegrityError from app.crud.document.doc_transformation_job import DocTransformationJobCrud -from app.models import TransformationStatus +from app.models import ( + TransformationStatus, + DocTransformJobCreate, + DocTransformJobUpdate, +) +from app.core.config import settings +from app.tests.utils.auth import TestAuthContext from app.core.exception_handlers import HTTPException from app.tests.utils.document import DocumentStore from app.tests.utils.utils import get_project, SequentialUuidGenerator @@ -24,55 +31,53 @@ class TestDocTransformationJobCrudCreate: def test_can_create_job_with_valid_document( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test creating a transformation job with a valid source document.""" document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) assert job.id is not None assert job.source_document_id == document.id assert job.status == TransformationStatus.PENDING assert job.error_message is None assert job.transformed_document_id is None - assert job.created_at is not None + assert job.inserted_at is not None assert job.updated_at is not None def test_cannot_create_job_with_invalid_document( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test that creating a job with non-existent document raises an error.""" + """With FK enforced, creating with a non-existent document should fail at commit.""" invalid_id = next(SequentialUuidGenerator()) - with pytest.raises(HTTPException) as exc_info: - crud.create(invalid_id) - - assert exc_info.value.status_code == 404 - assert "Document not found" in str(exc_info.value.detail) + with pytest.raises(IntegrityError): + crud.create(DocTransformJobCreate(source_document_id=invalid_id)) def test_cannot_create_job_with_deleted_document( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test that creating a job with a deleted document raises an error.""" + """ + Creation itself will succeed (FK exists), but later reads should treat it as not found + because read filters out deleted documents. + """ document = store.put() - # Mark document as deleted document.is_deleted = True db.add(document) db.commit() + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) + # read_one should 404 due to is_deleted=True on joined document with pytest.raises(HTTPException) as exc_info: - crud.create(document.id) - + crud.read_one(job.id) assert exc_info.value.status_code == 404 - assert "Document not found" in str(exc_info.value.detail) + assert "Transformation job not found" in str(exc_info.value.detail) class TestDocTransformationJobCrudReadOne: def test_can_read_existing_job( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test reading an existing transformation job.""" document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) result = crud.read_one(job.id) @@ -83,7 +88,6 @@ def test_can_read_existing_job( def test_cannot_read_nonexistent_job( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test that reading a non-existent job raises an error.""" invalid_id = next(SequentialUuidGenerator()) with pytest.raises(HTTPException) as exc_info: @@ -95,11 +99,9 @@ def test_cannot_read_nonexistent_job( def test_cannot_read_job_with_deleted_document( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test that reading a job whose source document is deleted raises an error.""" document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) - # Mark document as deleted document.is_deleted = True db.add(document) db.commit() @@ -113,12 +115,10 @@ def test_cannot_read_job_with_deleted_document( def test_cannot_read_job_from_different_project( self, db: Session, store: DocumentStore ): - """Test that reading a job from a different project raises an error.""" document = store.put() job_crud = DocTransformationJobCrud(db, store.project.id) - job = job_crud.create(document.id) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - # Try to read from different project other_project = create_test_project(db) other_crud = DocTransformationJobCrud(db, other_project.id) @@ -133,9 +133,11 @@ class TestDocTransformationJobCrudReadEach: def test_can_read_multiple_existing_jobs( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test reading multiple existing transformation jobs.""" documents = store.fill(3) - jobs = [crud.create(doc.id) for doc in documents] + jobs = [ + crud.create(DocTransformJobCreate(source_document_id=doc.id)) + for doc in documents + ] job_ids = {job.id for job in jobs} results = crud.read_each(job_ids) @@ -147,40 +149,37 @@ def test_can_read_multiple_existing_jobs( def test_read_partial_existing_jobs( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test reading a mix of existing and non-existing jobs.""" documents = store.fill(2) - jobs = [crud.create(doc.id) for doc in documents] + jobs = [ + crud.create(DocTransformJobCreate(source_document_id=doc.id)) + for doc in documents + ] job_ids = {job.id for job in jobs} - job_ids.add(next(SequentialUuidGenerator())) # Add non-existent ID + job_ids.add(next(SequentialUuidGenerator())) # non-existent results = crud.read_each(job_ids) - assert len(results) == 2 # Only existing jobs returned + assert len(results) == 2 result_ids = {job.id for job in results} assert result_ids == {job.id for job in jobs} def test_read_empty_job_set( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test reading an empty set of job IDs.""" results = crud.read_each(set()) - assert len(results) == 0 def test_cannot_read_jobs_from_different_project( self, db: Session, store: DocumentStore ): - """Test that jobs from different projects are not returned.""" document = store.put() job_crud = DocTransformationJobCrud(db, store.project.id) - job = job_crud.create(document.id) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - # Try to read from different project other_project = get_project(db, name="Dalgo") other_crud = DocTransformationJobCrud(db, other_project.id) results = other_crud.read_each({job.id}) - assert len(results) == 0 @@ -188,58 +187,66 @@ class TestDocTransformationJobCrudUpdateStatus: def test_can_update_status_to_processing( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test updating job status to processing.""" document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) - updated_job = crud.update_status(job.id, TransformationStatus.PROCESSING) + updated = crud.update( + job.id, + DocTransformJobUpdate(status=TransformationStatus.PROCESSING), + ) - assert updated_job.id == job.id - assert updated_job.status == TransformationStatus.PROCESSING - assert updated_job.updated_at >= job.updated_at + assert updated.id == job.id + assert updated.status == TransformationStatus.PROCESSING + assert updated.updated_at >= job.updated_at def test_can_update_status_to_completed_with_result( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test updating job status to completed with transformed document.""" source_document = store.put() transformed_document = store.put() - job = crud.create(source_document.id) + job = crud.create(DocTransformJobCreate(source_document_id=source_document.id)) - updated_job = crud.update_status( + updated = crud.update( job.id, - TransformationStatus.COMPLETED, - transformed_document_id=transformed_document.id, + DocTransformJobUpdate( + status=TransformationStatus.COMPLETED, + transformed_document_id=transformed_document.id, + ), ) - assert updated_job.status == TransformationStatus.COMPLETED - assert updated_job.transformed_document_id == transformed_document.id - assert updated_job.error_message is None + assert updated.status == TransformationStatus.COMPLETED + assert updated.transformed_document_id == transformed_document.id + assert updated.error_message is None def test_can_update_status_to_failed_with_error( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test updating job status to failed with error message.""" document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) error_msg = "Transformation failed due to invalid format" - updated_job = crud.update_status( - job.id, TransformationStatus.FAILED, error_message=error_msg + updated = crud.update( + job.id, + DocTransformJobUpdate( + status=TransformationStatus.FAILED, + error_message=error_msg, + ), ) - assert updated_job.status == TransformationStatus.FAILED - assert updated_job.error_message == error_msg - assert updated_job.transformed_document_id is None + assert updated.status == TransformationStatus.FAILED + assert updated.error_message == error_msg + assert updated.transformed_document_id is None def test_cannot_update_nonexistent_job( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test that updating a non-existent job raises an error.""" invalid_id = next(SequentialUuidGenerator()) with pytest.raises(HTTPException) as exc_info: - crud.update_status(invalid_id, TransformationStatus.PROCESSING) + crud.update( + invalid_id, + DocTransformJobUpdate(status=TransformationStatus.PROCESSING), + ) assert exc_info.value.status_code == 404 assert "Transformation job not found" in str(exc_info.value.detail) @@ -247,17 +254,20 @@ def test_cannot_update_nonexistent_job( def test_update_preserves_existing_fields( self, db: Session, store: DocumentStore, crud: DocTransformationJobCrud ): - """Test that updating status preserves other fields when not specified.""" + """Fields not present in the patch must be preserved by `update`.""" document = store.put() - job = crud.create(document.id) + job = crud.create(DocTransformJobCreate(source_document_id=document.id)) - # First update with error message - crud.update_status( - job.id, TransformationStatus.FAILED, error_message="Initial error" + crud.update( + job.id, + DocTransformJobUpdate( + status=TransformationStatus.FAILED, error_message="Initial error" + ), ) - # Second update without error message - should preserve it - updated_job = crud.update_status(job.id, TransformationStatus.PROCESSING) + updated = crud.update( + job.id, DocTransformJobUpdate(status=TransformationStatus.PROCESSING) + ) - assert updated_job.status == TransformationStatus.PROCESSING - assert updated_job.error_message == "Initial error" # Should be preserved + assert updated.status == TransformationStatus.PROCESSING + assert updated.error_message == "Initial error" diff --git a/backend/app/tests/core/doctransformer/test_registry.py b/backend/app/tests/services/doctransformer/test_registry.py similarity index 95% rename from backend/app/tests/core/doctransformer/test_registry.py rename to backend/app/tests/services/doctransformer/test_registry.py index 7e79a1b04..0b1909f4e 100644 --- a/backend/app/tests/core/doctransformer/test_registry.py +++ b/backend/app/tests/services/doctransformer/test_registry.py @@ -20,7 +20,7 @@ def patched_transformations(monkeypatch): ("pdf", "markdown"): {"default": "zerox", "zerox": "zerox"}, } monkeypatch.setattr( - "app.core.doctransform.registry.SUPPORTED_TRANSFORMATIONS", mapping + "app.services.doctransform.registry.SUPPORTED_TRANSFORMATIONS", mapping ) return mapping @@ -47,7 +47,7 @@ def test_get_supported_transformations(patched_transformations): def test_is_transformation_supported(monkeypatch): monkeypatch.setattr( - "app.core.doctransform.registry.SUPPORTED_TRANSFORMATIONS", + "app.services.doctransform.registry.SUPPORTED_TRANSFORMATIONS", {("docx", "pdf"): {"default": "pandoc"}}, ) assert is_transformation_supported("docx", "pdf") diff --git a/backend/app/tests/core/doctransformer/test_service/conftest.py b/backend/app/tests/services/doctransformer/test_service/conftest.py similarity index 100% rename from backend/app/tests/core/doctransformer/test_service/conftest.py rename to backend/app/tests/services/doctransformer/test_service/conftest.py diff --git a/backend/app/tests/core/doctransformer/test_service/test_execute_job.py b/backend/app/tests/services/doctransformer/test_service/test_execute_job.py similarity index 76% rename from backend/app/tests/core/doctransformer/test_service/test_execute_job.py rename to backend/app/tests/services/doctransformer/test_service/test_execute_job.py index 3d1df685c..2daf0101c 100644 --- a/backend/app/tests/core/doctransformer/test_service/test_execute_job.py +++ b/backend/app/tests/services/doctransformer/test_service/test_execute_job.py @@ -14,8 +14,8 @@ from app.services.doctransform.registry import TransformationError from app.services.doctransform.job import execute_job from app.core.exception_handlers import HTTPException -from app.models import Document, Project, TransformationStatus -from app.tests.core.doctransformer.test_service.utils import ( +from app.models import Document, Project, TransformationStatus, DocTransformationJob +from app.tests.services.doctransformer.test_service.utils import ( DocTransformTestBase, MockTestTransformer, ) @@ -50,23 +50,27 @@ def test_execute_job_success( # Create transformation job job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) - db.commit() + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) # Mock the Session to use our existing database session with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format=target_format, + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify job completion @@ -101,9 +105,10 @@ def test_execute_job_with_nonexistent_job( nonexistent_job_id = uuid4() with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None @@ -112,9 +117,13 @@ def test_execute_job_with_nonexistent_job( with pytest.raises((HTTPException, RetryError)): fast_execute_job( project_id=project.id, - job_id=nonexistent_job_id, + job_id=str(nonexistent_job_id), + source_document_id=str(uuid4()), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) @mock_aws @@ -131,13 +140,13 @@ def test_execute_job_with_missing_source_document( # Create job but don't upload document to S3 job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) - db.commit() + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None @@ -145,11 +154,14 @@ def test_execute_job_with_missing_source_document( with pytest.raises(Exception): fast_execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) - # Verify job was marked as failed db.refresh(job) assert job.status == TransformationStatus.FAILED @@ -170,16 +182,17 @@ def test_execute_job_with_transformer_error( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) db.commit() # Mock convert_document to raise TransformationError with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.service.convert_document" + "app.services.doctransform.job.convert_document" ) as mock_convert, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None @@ -189,12 +202,15 @@ def test_execute_job_with_transformer_error( with pytest.raises((TransformationError, RetryError)): fast_execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) - # Verify job was marked as failed db.refresh(job) assert job.status == TransformationStatus.FAILED assert "Mock transformation error" in job.error_message @@ -211,23 +227,27 @@ def test_execute_job_status_transitions( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) initial_status = job.status - db.commit() with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify status progression by checking final job state @@ -258,13 +278,12 @@ def test_execute_job_with_different_content_types( expected_extension, ) in format_extensions: job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) - db.commit() + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", + "app.services.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db @@ -272,11 +291,14 @@ def test_execute_job_with_different_content_types( execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format=target_format, + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) - # Verify transformation completed and check file extension db.refresh(job) assert job.status == TransformationStatus.COMPLETED diff --git a/backend/app/tests/core/doctransformer/test_service/test_execute_job_errors.py b/backend/app/tests/services/doctransformer/test_service/test_execute_job_errors.py similarity index 74% rename from backend/app/tests/core/doctransformer/test_service/test_execute_job_errors.py rename to backend/app/tests/services/doctransformer/test_service/test_execute_job_errors.py index e719cd5ce..8b8b84576 100644 --- a/backend/app/tests/core/doctransformer/test_service/test_execute_job_errors.py +++ b/backend/app/tests/services/doctransformer/test_service/test_execute_job_errors.py @@ -2,6 +2,7 @@ Tests for retry mechanisms and error handling in document transformation service. """ from io import BytesIO +from uuid import uuid4 from typing import Any, Callable, Tuple from unittest.mock import patch @@ -10,12 +11,12 @@ from sqlmodel import Session from app.crud import DocTransformationJobCrud -from app.models import Document, Project, TransformationStatus -from app.tests.core.doctransformer.test_service.utils import ( +from app.models import Document, Project, TransformationStatus, DocTransformationJob +from app.tests.services.doctransformer.test_service.utils import ( DocTransformTestBase, MockTestTransformer, ) -from app.tests.core.doctransformer.test_service.utils import ( +from app.tests.services.doctransformer.test_service.utils import ( create_failing_convert_document, create_persistent_failing_convert_document, ) @@ -38,16 +39,17 @@ def test_execute_job_with_storage_error( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) db.commit() # Mock storage.put to raise an error with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.service.get_cloud_storage" + "app.services.doctransform.job.get_cloud_storage" ) as mock_storage_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None @@ -59,9 +61,13 @@ def test_execute_job_with_storage_error( with pytest.raises(Exception): fast_execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify job was marked as failed @@ -84,30 +90,34 @@ def test_execute_job_retry_mechanism( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) db.commit() # Create a side effect that fails once then succeeds (fast retry will only try 2 times) failing_convert_document = create_failing_convert_document(fail_count=1) with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.service.convert_document", + "app.services.doctransform.job.convert_document", side_effect=failing_convert_document, ), patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None fast_execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) - # Verify the function was retried and eventually succeeded db.refresh(job) assert job.status == TransformationStatus.COMPLETED @@ -126,7 +136,7 @@ def test_execute_job_exhausted_retries( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) db.commit() # Mock convert_document to always fail @@ -135,12 +145,13 @@ def test_execute_job_exhausted_retries( ) with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.service.convert_document", + "app.services.doctransform.job.convert_document", side_effect=persistent_failing_convert_document, ), patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None @@ -148,11 +159,14 @@ def test_execute_job_exhausted_retries( with pytest.raises(Exception): fast_execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) - # Verify job was marked as failed after retries db.refresh(job) assert job.status == TransformationStatus.FAILED @@ -172,20 +186,21 @@ def test_execute_job_database_error_during_completion( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) db.commit() with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None # Mock DocumentCrud.update to fail when creating the transformed document with patch( - "app.core.doctransform.service.DocumentCrud" + "app.services.doctransform.job.DocumentCrud" ) as mock_doc_crud_class: mock_doc_crud_instance = mock_doc_crud_class.return_value mock_doc_crud_instance.read_one.return_value = ( @@ -198,9 +213,13 @@ def test_execute_job_database_error_during_completion( with pytest.raises(Exception): fast_execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify job was marked as failed diff --git a/backend/app/tests/core/doctransformer/test_service/test_integration.py b/backend/app/tests/services/doctransformer/test_service/test_integration.py similarity index 78% rename from backend/app/tests/core/doctransformer/test_service/test_integration.py rename to backend/app/tests/services/doctransformer/test_service/test_integration.py index e58be8a46..6f6c86089 100644 --- a/backend/app/tests/core/doctransformer/test_service/test_integration.py +++ b/backend/app/tests/services/doctransformer/test_service/test_integration.py @@ -2,10 +2,10 @@ Integration tests for document transformation service. """ from typing import Tuple +from uuid import uuid4 from unittest.mock import patch import pytest -from fastapi import BackgroundTasks from moto import mock_aws from sqlmodel import Session @@ -18,7 +18,7 @@ TransformationStatus, UserProjectOrg, ) -from app.tests.core.doctransformer.test_service.utils import ( +from app.tests.services.doctransformer.test_service.utils import ( DocTransformTestBase, MockTestTransformer, ) @@ -37,6 +37,9 @@ def test_execute_job_end_to_end_workflow( aws = self.setup_aws_s3() self.create_s3_document_content(aws, document) + job_crud = DocTransformationJobCrud(session=db, project_id=project.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + # Start job using the service current_user = UserProjectOrg( id=1, @@ -44,35 +47,36 @@ def test_execute_job_end_to_end_workflow( project_id=project.id, organization_id=project.organization_id, ) - background_tasks = BackgroundTasks() - job_id = start_job( + returned_job_id = start_job( db=db, current_user=current_user, - source_document_id=document.id, + job_id=job.id, transformer_name="test", target_format="markdown", - background_tasks=background_tasks, + callback_url=None, ) - - # Verify job was created - job = db.get(DocTransformationJob, job_id) - assert job.status == TransformationStatus.PENDING + assert job.id == returned_job_id # Execute the job manually (simulating background execution) with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer} + "app.services.doctransform.registry.TRANSFORMERS", + {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify complete workflow @@ -100,16 +104,16 @@ def test_execute_job_concurrent_jobs( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) jobs = [] for i in range(3): - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) jobs.append(job) db.commit() # Execute all jobs for job in jobs: with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", + "app.services.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db @@ -117,9 +121,13 @@ def test_execute_job_concurrent_jobs( execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format="markdown", + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify all jobs completed successfully @@ -144,16 +152,16 @@ def test_multiple_format_transformations( # Create jobs for different formats job_crud = DocTransformationJobCrud(session=db, project_id=project.id) for target_format in formats: - job = job_crud.create(source_document_id=document.id) + job = job_crud.create(DocTransformationJob(source_document_id=document.id)) jobs.append((job, target_format)) db.commit() # Execute all jobs for job, target_format in jobs: with patch( - "app.core.doctransform.service.Session" + "app.services.doctransform.job.Session" ) as mock_session_class, patch( - "app.core.doctransform.registry.TRANSFORMERS", + "app.services.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db @@ -161,9 +169,13 @@ def test_multiple_format_transformations( execute_job( project_id=project.id, - job_id=job.id, + job_id=str(job.id), + source_document_id=str(document.id), transformer_name="test", target_format=target_format, + task_id=str(uuid4()), + callback_url=None, + task_instance=None, ) # Verify all jobs completed successfully with correct formats diff --git a/backend/app/tests/services/doctransformer/test_service/test_start_job.py b/backend/app/tests/services/doctransformer/test_service/test_start_job.py new file mode 100644 index 000000000..17ee431bc --- /dev/null +++ b/backend/app/tests/services/doctransformer/test_service/test_start_job.py @@ -0,0 +1,195 @@ +""" +Tests for the start_job function in document transformation service. +""" +import pytest +from unittest.mock import patch +from typing import Tuple +from uuid import uuid4 + +from fastapi import BackgroundTasks +from sqlmodel import Session + +from app.services.doctransform.job import execute_job, start_job +from app.services.doctransform.registry import TRANSFORMERS +from app.core.exception_handlers import HTTPException +from app.crud import DocTransformationJobCrud +from app.models import ( + Document, + DocTransformationJob, + Project, + TransformationStatus, + UserProjectOrg, +) +from app.tests.services.doctransformer.test_service.utils import ( + DocTransformTestBase, + MockTestTransformer, +) + + +class TestStartJob(DocTransformTestBase): + """Test cases for the start_job function.""" + + def _create_job(self, db: Session, project_id: int, source_document_id): + job = DocTransformationJob(source_document_id=source_document_id) + job = DocTransformationJobCrud(db, project_id=project_id).create(job) + db.commit() + return job + + def test_start_job_success( + self, + db: Session, + current_user: UserProjectOrg, + test_document: tuple[Document, Project], + ) -> None: + """start_job should enqueue execute_job with correct kwargs and return the same job id.""" + document, _project = test_document + + job = self._create_job(db, current_user.project_id, document.id) + + with patch( + "app.services.doctransform.job.start_low_priority_job" + ) as mock_schedule: + mock_schedule.return_value = "fake-task-id" + + returned_job_id = start_job( + db=db, + current_user=current_user, + job_id=job.id, + transformer_name="test-transformer", + target_format="markdown", + callback_url=None, + ) + + assert returned_job_id == job.id + + job = db.get(DocTransformationJob, job.id) + assert job is not None + assert job.source_document_id == document.id + assert job.status == TransformationStatus.PENDING + assert job.error_message is None + assert job.transformed_document_id is None + + mock_schedule.assert_called_once() + kwargs = mock_schedule.call_args.kwargs + assert kwargs["function_path"] == "app.services.doctransform.job.execute_job" + assert kwargs["project_id"] == current_user.project_id + assert kwargs["job_id"] == str(job.id) + assert kwargs["source_document_id"] == str(job.source_document_id) + assert kwargs["transformer_name"] == "test-transformer" + assert kwargs["target_format"] == "markdown" + assert kwargs["callback_url"] is None + + def test_start_job_with_nonexistent_document( + self, + db: Session, + current_user: UserProjectOrg, + ) -> None: + """ + Previously: start_job validated document and raised 404. + Now: start_job expects an existing job; the equivalent negative case is a non-existent JOB. + """ + nonexistent_job_id = uuid4() + + with pytest.raises(HTTPException): + with patch( + "app.services.doctransform.job.start_low_priority_job" + ) as mock_schedule: + mock_schedule.return_value = "fake-task-id" + start_job( + db=db, + current_user=current_user, + job_id=nonexistent_job_id, + transformer_name="test-transformer", + target_format="markdown", + callback_url=None, + ) + + def test_start_job_with_different_formats( + self, + db: Session, + current_user: UserProjectOrg, + test_document: tuple[Document, Project], + monkeypatch, + ) -> None: + """Ensure start_job passes target_format through to the scheduler.""" + monkeypatch.setitem(TRANSFORMERS, "test", MockTestTransformer) + + document, _ = test_document + formats = ["markdown", "text", "html"] + + with patch( + "app.services.doctransform.job.start_low_priority_job" + ) as mock_schedule: + mock_schedule.return_value = "fake-task-id" + + for target_format in formats: + job = self._create_job(db, current_user.project_id, document.id) + + returned_job_id = start_job( + db=db, + current_user=current_user, + job_id=job.id, + transformer_name="test", + target_format=target_format, + callback_url=None, + ) + + # job row still PENDING + job = db.get(DocTransformationJob, job.id) + assert job is not None + assert job.status == TransformationStatus.PENDING + + # scheduler called with correct kwargs + kwargs = mock_schedule.call_args.kwargs + assert kwargs["target_format"] == target_format + assert ( + kwargs["function_path"] + == "app.services.doctransform.job.execute_job" + ) + assert kwargs["project_id"] == current_user.project_id + assert kwargs["job_id"] == str(job.id) + assert kwargs["source_document_id"] == str(job.source_document_id) + assert kwargs["transformer_name"] == "test" + assert returned_job_id == job.id # new start_job returns the same UUID + + @pytest.mark.parametrize("transformer_name", ["test"]) + def test_start_job_with_different_transformers( + self, + db: Session, + current_user: UserProjectOrg, + test_document: tuple[Document, Project], + transformer_name: str, + monkeypatch, + ) -> None: + """Ensure start_job passes transformer_name through to the scheduler.""" + monkeypatch.setitem(TRANSFORMERS, "test", MockTestTransformer) + + document, _ = test_document + job = self._create_job(db, current_user.project_id, document.id) + + with patch( + "app.services.doctransform.job.start_low_priority_job" + ) as mock_schedule: + mock_schedule.return_value = "fake-task-id" + + returned_job_id = start_job( + db=db, + current_user=current_user, + job_id=job.id, + transformer_name=transformer_name, + target_format="markdown", + callback_url=None, + ) + + job = db.get(DocTransformationJob, job.id) + assert job is not None + assert job.status == TransformationStatus.PENDING + + kwargs = mock_schedule.call_args.kwargs + assert kwargs["transformer_name"] == transformer_name + assert kwargs["target_format"] == "markdown" + assert kwargs["function_path"] == "app.services.doctransform.job.execute_job" + assert kwargs["project_id"] == current_user.project_id + assert kwargs["job_id"] == str(job.id) + assert kwargs["source_document_id"] == str(job.source_document_id) + assert returned_job_id == job.id diff --git a/backend/app/tests/core/doctransformer/test_service/utils.py b/backend/app/tests/services/doctransformer/test_service/utils.py similarity index 100% rename from backend/app/tests/core/doctransformer/test_service/utils.py rename to backend/app/tests/services/doctransformer/test_service/utils.py From cdc34e47e8bf7963f04a6b9ebc53978a948a233b Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 12:51:02 +0530 Subject: [PATCH 04/15] fixing fast execute job --- .../doctransformer/test_service/conftest.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/backend/app/tests/services/doctransformer/test_service/conftest.py b/backend/app/tests/services/doctransformer/test_service/conftest.py index 685d06083..47c2aef02 100644 --- a/backend/app/tests/services/doctransformer/test_service/conftest.py +++ b/backend/app/tests/services/doctransformer/test_service/conftest.py @@ -40,11 +40,25 @@ def fast_execute_job() -> Generator[Callable[[int, UUID, str, str], Any], None, stop=stop_after_attempt(2), wait=wait_fixed(0.01) ) # Very fast retry for tests def fast_execute_job_func( - project_id: int, job_id: UUID, transformer_name: str, target_format: str + project_id: int, + job_id: str, + source_document_id: str, + transformer_name: str, + target_format: str, + task_id: str, + callback_url: str | None, + task_instance, ) -> Any: # Call the original function's implementation without the decorator return original_execute_job.__wrapped__( - project_id, job_id, transformer_name, target_format + project_id, + job_id, + source_document_id, + transformer_name, + target_format, + task_id, + callback_url, + task_instance, ) with patch.object(job, "execute_job", fast_execute_job_func): From 9225aa99c12f75971b0891c08289699ee582ccd6 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 13:37:08 +0530 Subject: [PATCH 05/15] ci fail fixing --- backend/app/services/doctransform/job.py | 3 +-- .../services/doctransformer/test_service/test_integration.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/services/doctransform/job.py b/backend/app/services/doctransform/job.py index 1a282b57e..d298d1ec3 100644 --- a/backend/app/services/doctransform/job.py +++ b/backend/app/services/doctransform/job.py @@ -1,5 +1,6 @@ import tempfile import shutil +import time import logging from pathlib import Path from uuid import uuid4, UUID @@ -117,8 +118,6 @@ def execute_job( callback_url: str | None, task_instance, ): - import time - start_time = time.time() tmp_dir: Path | None = None diff --git a/backend/app/tests/services/doctransformer/test_service/test_integration.py b/backend/app/tests/services/doctransformer/test_service/test_integration.py index 6f6c86089..63d6de6cd 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_integration.py +++ b/backend/app/tests/services/doctransformer/test_service/test_integration.py @@ -88,6 +88,7 @@ def test_execute_job_end_to_end_workflow( document_crud = DocumentCrud(session=db, project_id=project.id) transformed_doc = document_crud.read_one(job.transformed_document_id) assert transformed_doc.source_document_id == document.id + print("Transformed fname:", transformed_doc.fname) assert "" in transformed_doc.fname @mock_aws From 1759942eb23285e3571c98dff33574212e85f4fc Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 13:56:56 +0530 Subject: [PATCH 06/15] ci failing fix trial --- .../test_service/test_integration.py | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/backend/app/tests/services/doctransformer/test_service/test_integration.py b/backend/app/tests/services/doctransformer/test_service/test_integration.py index 63d6de6cd..c256d3fbb 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_integration.py +++ b/backend/app/tests/services/doctransformer/test_service/test_integration.py @@ -40,7 +40,6 @@ def test_execute_job_end_to_end_workflow( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) job = job_crud.create(DocTransformationJob(source_document_id=document.id)) - # Start job using the service current_user = UserProjectOrg( id=1, email="test@example.com", @@ -48,26 +47,26 @@ def test_execute_job_end_to_end_workflow( organization_id=project.organization_id, ) - returned_job_id = start_job( - db=db, - current_user=current_user, - job_id=job.id, - transformer_name="test", - target_format="markdown", - callback_url=None, - ) - assert job.id == returned_job_id - - # Execute the job manually (simulating background execution) with patch( - "app.services.doctransform.job.Session" - ) as mock_session_class, patch( + "app.services.collections.create_collection.start_low_priority_job", + return_value="fake-task-id", + ), patch("app.services.doctransform.job.Session") as mock_session_class, patch( "app.services.doctransform.registry.TRANSFORMERS", {"test": MockTestTransformer}, ): mock_session_class.return_value.__enter__.return_value = db mock_session_class.return_value.__exit__.return_value = None + returned_job_id = start_job( + db=db, + current_user=current_user, + job_id=job.id, + transformer_name="test", + target_format="markdown", + callback_url=None, + ) + assert job.id == returned_job_id + execute_job( project_id=project.id, job_id=str(job.id), @@ -79,16 +78,13 @@ def test_execute_job_end_to_end_workflow( task_instance=None, ) - # Verify complete workflow db.refresh(job) assert job.status == TransformationStatus.COMPLETED assert job.transformed_document_id is not None - # Verify transformed document exists and is valid document_crud = DocumentCrud(session=db, project_id=project.id) transformed_doc = document_crud.read_one(job.transformed_document_id) assert transformed_doc.source_document_id == document.id - print("Transformed fname:", transformed_doc.fname) assert "" in transformed_doc.fname @mock_aws @@ -109,7 +105,6 @@ def test_execute_job_concurrent_jobs( jobs.append(job) db.commit() - # Execute all jobs for job in jobs: with patch( "app.services.doctransform.job.Session" @@ -131,7 +126,6 @@ def test_execute_job_concurrent_jobs( task_instance=None, ) - # Verify all jobs completed successfully for job in jobs: db.refresh(job) assert job.status == TransformationStatus.COMPLETED @@ -150,14 +144,12 @@ def test_multiple_format_transformations( formats = ["markdown", "text", "html"] jobs = [] - # Create jobs for different formats job_crud = DocTransformationJobCrud(session=db, project_id=project.id) for target_format in formats: job = job_crud.create(DocTransformationJob(source_document_id=document.id)) jobs.append((job, target_format)) db.commit() - # Execute all jobs for job, target_format in jobs: with patch( "app.services.doctransform.job.Session" @@ -179,7 +171,6 @@ def test_multiple_format_transformations( task_instance=None, ) - # Verify all jobs completed successfully with correct formats document_crud = DocumentCrud(session=db, project_id=project.id) for i, (job, target_format) in enumerate(jobs): db.refresh(job) @@ -188,7 +179,6 @@ def test_multiple_format_transformations( transformed_doc = document_crud.read_one(job.transformed_document_id) assert transformed_doc is not None - # Verify correct file extension based on format if target_format == "markdown": assert transformed_doc.fname.endswith(".md") elif target_format == "text": From 34405cb3b7e8e4d7fca8434f489aca610a600773 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 14:12:39 +0530 Subject: [PATCH 07/15] ci failing fix trial --- .../services/doctransformer/test_service/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/tests/services/doctransformer/test_service/test_integration.py b/backend/app/tests/services/doctransformer/test_service/test_integration.py index c256d3fbb..72686904c 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_integration.py +++ b/backend/app/tests/services/doctransformer/test_service/test_integration.py @@ -48,7 +48,7 @@ def test_execute_job_end_to_end_workflow( ) with patch( - "app.services.collections.create_collection.start_low_priority_job", + "app.services.doctransform.job.start_low_priority_job", return_value="fake-task-id", ), patch("app.services.doctransform.job.Session") as mock_session_class, patch( "app.services.doctransform.registry.TRANSFORMERS", From ef76fc7202c6ddb1d67fdf95872bf3a1db5c5b3f Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 14:58:48 +0530 Subject: [PATCH 08/15] code rabbit pr review --- backend/app/models/doc_transformation_job.py | 2 +- backend/app/services/doctransform/job.py | 3 +-- .../{test_service => test_job}/conftest.py | 10 +++++---- .../test_execute_job.py | 21 +++++++------------ .../test_execute_job_errors.py | 14 ++++++------- .../test_integration.py | 10 ++++----- .../test_start_job.py | 9 ++++---- .../{test_service => test_job}/utils.py | 0 8 files changed, 32 insertions(+), 37 deletions(-) rename backend/app/tests/services/doctransformer/{test_service => test_job}/conftest.py (93%) rename backend/app/tests/services/doctransformer/{test_service => test_job}/test_execute_job.py (93%) rename backend/app/tests/services/doctransformer/{test_service => test_job}/test_execute_job_errors.py (94%) rename backend/app/tests/services/doctransformer/{test_service => test_job}/test_integration.py (94%) rename backend/app/tests/services/doctransformer/{test_service => test_job}/test_start_job.py (96%) rename backend/app/tests/services/doctransformer/{test_service => test_job}/utils.py (100%) diff --git a/backend/app/models/doc_transformation_job.py b/backend/app/models/doc_transformation_job.py index e2c844288..5431f7e5f 100644 --- a/backend/app/models/doc_transformation_job.py +++ b/backend/app/models/doc_transformation_job.py @@ -24,7 +24,7 @@ class DocTransformationJob(SQLModel, table=True): default=None, foreign_key="document.id" ) status: TransformationStatus = Field(default=TransformationStatus.PENDING) - task_id: str = Field(nullable=True) + task_id: str | None = Field(default=None, nullable=True) trace_id: str | None = Field( default=None, description="Tracing ID for correlating logs and traces." ) diff --git a/backend/app/services/doctransform/job.py b/backend/app/services/doctransform/job.py index d298d1ec3..a8951e19e 100644 --- a/backend/app/services/doctransform/job.py +++ b/backend/app/services/doctransform/job.py @@ -20,7 +20,6 @@ DocTransformationJobPublic, TransformedDocumentPublic, DocTransformationJob, - TransformedDocumentPublic, ) from app.core.cloud import get_cloud_storage from app.api.deps import CurrentUserOrgProject @@ -39,7 +38,7 @@ def start_job( transformer_name: str, target_format: str, callback_url: str | None, -) -> str: +) -> UUID: trace_id = correlation_id.get() or "N/A" job_crud = DocTransformationJobCrud(db, project_id=current_user.project_id) job_crud.update(job_id, DocTransformJobUpdate(trace_id=trace_id)) diff --git a/backend/app/tests/services/doctransformer/test_service/conftest.py b/backend/app/tests/services/doctransformer/test_job/conftest.py similarity index 93% rename from backend/app/tests/services/doctransformer/test_service/conftest.py rename to backend/app/tests/services/doctransformer/test_job/conftest.py index 47c2aef02..edba7ec9d 100644 --- a/backend/app/tests/services/doctransformer/test_service/conftest.py +++ b/backend/app/tests/services/doctransformer/test_job/conftest.py @@ -4,7 +4,6 @@ import os from typing import Any, Callable, Generator, Tuple from unittest.mock import patch -from uuid import UUID import pytest from fastapi import BackgroundTasks @@ -12,7 +11,7 @@ from tenacity import retry, stop_after_attempt, wait_fixed from app.crud import get_project_by_id -from app.models import User +from app.services.doctransform import job from app.core.config import settings from app.models import Document, Project, UserProjectOrg from app.tests.utils.document import DocumentStore @@ -30,9 +29,12 @@ def aws_credentials() -> None: @pytest.fixture -def fast_execute_job() -> Generator[Callable[[int, UUID, str, str], Any], None, None]: +def fast_execute_job() -> ( + Generator[ + Callable[[int, str, str, str, str, str, str | None, Any], Any], None, None + ] +): """Create a version of execute_job without retry delays for faster testing.""" - from app.services.doctransform import job original_execute_job = job.execute_job diff --git a/backend/app/tests/services/doctransformer/test_service/test_execute_job.py b/backend/app/tests/services/doctransformer/test_job/test_execute_job.py similarity index 93% rename from backend/app/tests/services/doctransformer/test_service/test_execute_job.py rename to backend/app/tests/services/doctransformer/test_job/test_execute_job.py index 2daf0101c..14149a16b 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_execute_job.py +++ b/backend/app/tests/services/doctransformer/test_job/test_execute_job.py @@ -14,8 +14,8 @@ from app.services.doctransform.registry import TransformationError from app.services.doctransform.job import execute_job from app.core.exception_handlers import HTTPException -from app.models import Document, Project, TransformationStatus, DocTransformationJob -from app.tests.services.doctransformer.test_service.utils import ( +from app.models import Document, Project, TransformationStatus, DocTransformJobCreate +from app.tests.services.doctransformer.test_job.utils import ( DocTransformTestBase, MockTestTransformer, ) @@ -48,11 +48,9 @@ def test_execute_job_success( source_content = b"This is a test document for transformation." self.create_s3_document_content(aws, document, source_content) - # Create transformation job job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - # Mock the Session to use our existing database session with patch( "app.services.doctransform.job.Session" ) as mock_session_class, patch( @@ -73,13 +71,11 @@ def test_execute_job_success( task_instance=None, ) - # Verify job completion db.refresh(job) assert job.status == TransformationStatus.COMPLETED assert job.transformed_document_id is not None assert job.error_message is None - # Verify transformed document document_crud = DocumentCrud(session=db, project_id=project.id) transformed_doc = document_crud.read_one(job.transformed_document_id) assert transformed_doc is not None @@ -88,7 +84,6 @@ def test_execute_job_success( assert transformed_doc.source_document_id == document.id assert transformed_doc.object_store_url is not None - # Verify transformed content in S3 self.verify_s3_content(aws, transformed_doc) @mock_aws @@ -140,7 +135,7 @@ def test_execute_job_with_missing_source_document( # Create job but don't upload document to S3 job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) with patch( "app.services.doctransform.job.Session" @@ -162,7 +157,7 @@ def test_execute_job_with_missing_source_document( callback_url=None, task_instance=None, ) - # Verify job was marked as failed + db.refresh(job) assert job.status == TransformationStatus.FAILED assert job.error_message is not None @@ -182,7 +177,7 @@ def test_execute_job_with_transformer_error( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) db.commit() # Mock convert_document to raise TransformationError @@ -227,7 +222,7 @@ def test_execute_job_status_transitions( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) initial_status = job.status with patch( @@ -278,7 +273,7 @@ def test_execute_job_with_different_content_types( expected_extension, ) in format_extensions: job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) with patch( "app.services.doctransform.job.Session" diff --git a/backend/app/tests/services/doctransformer/test_service/test_execute_job_errors.py b/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py similarity index 94% rename from backend/app/tests/services/doctransformer/test_service/test_execute_job_errors.py rename to backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py index 8b8b84576..41a9b0115 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_execute_job_errors.py +++ b/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py @@ -11,12 +11,12 @@ from sqlmodel import Session from app.crud import DocTransformationJobCrud -from app.models import Document, Project, TransformationStatus, DocTransformationJob -from app.tests.services.doctransformer.test_service.utils import ( +from app.models import Document, Project, TransformationStatus, DocTransformJobCreate +from app.tests.services.doctransformer.test_job.utils import ( DocTransformTestBase, MockTestTransformer, ) -from app.tests.services.doctransformer.test_service.utils import ( +from app.tests.services.doctransformer.test_job.utils import ( create_failing_convert_document, create_persistent_failing_convert_document, ) @@ -39,7 +39,7 @@ def test_execute_job_with_storage_error( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) db.commit() # Mock storage.put to raise an error @@ -90,7 +90,7 @@ def test_execute_job_retry_mechanism( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) db.commit() # Create a side effect that fails once then succeeds (fast retry will only try 2 times) @@ -136,7 +136,7 @@ def test_execute_job_exhausted_retries( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) db.commit() # Mock convert_document to always fail @@ -186,7 +186,7 @@ def test_execute_job_database_error_during_completion( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) db.commit() with patch( diff --git a/backend/app/tests/services/doctransformer/test_service/test_integration.py b/backend/app/tests/services/doctransformer/test_job/test_integration.py similarity index 94% rename from backend/app/tests/services/doctransformer/test_service/test_integration.py rename to backend/app/tests/services/doctransformer/test_job/test_integration.py index 72686904c..d2d39b085 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_integration.py +++ b/backend/app/tests/services/doctransformer/test_job/test_integration.py @@ -13,12 +13,12 @@ from app.services.doctransform.job import execute_job, start_job from app.models import ( Document, - DocTransformationJob, Project, TransformationStatus, UserProjectOrg, + DocTransformJobCreate, ) -from app.tests.services.doctransformer.test_service.utils import ( +from app.tests.services.doctransformer.test_job.utils import ( DocTransformTestBase, MockTestTransformer, ) @@ -38,7 +38,7 @@ def test_execute_job_end_to_end_workflow( self.create_s3_document_content(aws, document) job_crud = DocTransformationJobCrud(session=db, project_id=project.id) - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) current_user = UserProjectOrg( id=1, @@ -101,7 +101,7 @@ def test_execute_job_concurrent_jobs( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) jobs = [] for i in range(3): - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) jobs.append(job) db.commit() @@ -146,7 +146,7 @@ def test_multiple_format_transformations( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) for target_format in formats: - job = job_crud.create(DocTransformationJob(source_document_id=document.id)) + job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) jobs.append((job, target_format)) db.commit() diff --git a/backend/app/tests/services/doctransformer/test_service/test_start_job.py b/backend/app/tests/services/doctransformer/test_job/test_start_job.py similarity index 96% rename from backend/app/tests/services/doctransformer/test_service/test_start_job.py rename to backend/app/tests/services/doctransformer/test_job/test_start_job.py index 17ee431bc..d09439f7c 100644 --- a/backend/app/tests/services/doctransformer/test_service/test_start_job.py +++ b/backend/app/tests/services/doctransformer/test_job/test_start_job.py @@ -3,13 +3,11 @@ """ import pytest from unittest.mock import patch -from typing import Tuple from uuid import uuid4 -from fastapi import BackgroundTasks from sqlmodel import Session -from app.services.doctransform.job import execute_job, start_job +from app.services.doctransform.job import start_job from app.services.doctransform.registry import TRANSFORMERS from app.core.exception_handlers import HTTPException from app.crud import DocTransformationJobCrud @@ -19,8 +17,9 @@ Project, TransformationStatus, UserProjectOrg, + DocTransformJobCreate, ) -from app.tests.services.doctransformer.test_service.utils import ( +from app.tests.services.doctransformer.test_job.utils import ( DocTransformTestBase, MockTestTransformer, ) @@ -30,7 +29,7 @@ class TestStartJob(DocTransformTestBase): """Test cases for the start_job function.""" def _create_job(self, db: Session, project_id: int, source_document_id): - job = DocTransformationJob(source_document_id=source_document_id) + job = DocTransformJobCreate(source_document_id=source_document_id) job = DocTransformationJobCrud(db, project_id=project_id).create(job) db.commit() return job diff --git a/backend/app/tests/services/doctransformer/test_service/utils.py b/backend/app/tests/services/doctransformer/test_job/utils.py similarity index 100% rename from backend/app/tests/services/doctransformer/test_service/utils.py rename to backend/app/tests/services/doctransformer/test_job/utils.py From 0b0eb4c8d2f65e3a377bf52e65df1f4676a94ae4 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 15:08:54 +0530 Subject: [PATCH 09/15] code rabbit pr review --- .../services/doctransformer/test_job/test_execute_job.py | 1 - .../doctransformer/test_job/test_execute_job_errors.py | 4 ---- .../services/doctransformer/test_job/test_integration.py | 2 -- .../tests/services/doctransformer/test_job/test_start_job.py | 1 - 4 files changed, 8 deletions(-) diff --git a/backend/app/tests/services/doctransformer/test_job/test_execute_job.py b/backend/app/tests/services/doctransformer/test_job/test_execute_job.py index 14149a16b..3fb8b4844 100644 --- a/backend/app/tests/services/doctransformer/test_job/test_execute_job.py +++ b/backend/app/tests/services/doctransformer/test_job/test_execute_job.py @@ -178,7 +178,6 @@ def test_execute_job_with_transformer_error( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - db.commit() # Mock convert_document to raise TransformationError with patch( diff --git a/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py b/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py index 41a9b0115..42a40f7ca 100644 --- a/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py +++ b/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py @@ -40,7 +40,6 @@ def test_execute_job_with_storage_error( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - db.commit() # Mock storage.put to raise an error with patch( @@ -91,7 +90,6 @@ def test_execute_job_retry_mechanism( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - db.commit() # Create a side effect that fails once then succeeds (fast retry will only try 2 times) failing_convert_document = create_failing_convert_document(fail_count=1) @@ -137,7 +135,6 @@ def test_execute_job_exhausted_retries( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - db.commit() # Mock convert_document to always fail persistent_failing_convert_document = ( @@ -187,7 +184,6 @@ def test_execute_job_database_error_during_completion( job_crud = DocTransformationJobCrud(session=db, project_id=project.id) job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) - db.commit() with patch( "app.services.doctransform.job.Session" diff --git a/backend/app/tests/services/doctransformer/test_job/test_integration.py b/backend/app/tests/services/doctransformer/test_job/test_integration.py index d2d39b085..3282b772c 100644 --- a/backend/app/tests/services/doctransformer/test_job/test_integration.py +++ b/backend/app/tests/services/doctransformer/test_job/test_integration.py @@ -103,7 +103,6 @@ def test_execute_job_concurrent_jobs( for i in range(3): job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) jobs.append(job) - db.commit() for job in jobs: with patch( @@ -148,7 +147,6 @@ def test_multiple_format_transformations( for target_format in formats: job = job_crud.create(DocTransformJobCreate(source_document_id=document.id)) jobs.append((job, target_format)) - db.commit() for job, target_format in jobs: with patch( diff --git a/backend/app/tests/services/doctransformer/test_job/test_start_job.py b/backend/app/tests/services/doctransformer/test_job/test_start_job.py index d09439f7c..60e3dadee 100644 --- a/backend/app/tests/services/doctransformer/test_job/test_start_job.py +++ b/backend/app/tests/services/doctransformer/test_job/test_start_job.py @@ -31,7 +31,6 @@ class TestStartJob(DocTransformTestBase): def _create_job(self, db: Session, project_id: int, source_document_id): job = DocTransformJobCreate(source_document_id=source_document_id) job = DocTransformationJobCrud(db, project_id=project_id).create(job) - db.commit() return job def test_start_job_success( From 6dd6609bc6b59447f77bf8d7f41d842085dbbb1a Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 18 Nov 2025 15:15:40 +0530 Subject: [PATCH 10/15] small change --- .../doctransformer/test_job/test_execute_job.py | 12 +++++++++--- .../test_job/test_execute_job_errors.py | 16 ++++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/backend/app/tests/services/doctransformer/test_job/test_execute_job.py b/backend/app/tests/services/doctransformer/test_job/test_execute_job.py index 3fb8b4844..43993d0cc 100644 --- a/backend/app/tests/services/doctransformer/test_job/test_execute_job.py +++ b/backend/app/tests/services/doctransformer/test_job/test_execute_job.py @@ -92,7 +92,9 @@ def test_execute_job_with_nonexistent_job( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, UUID, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test job execution with non-existent job ID.""" _, project = test_document @@ -127,7 +129,9 @@ def test_execute_job_with_missing_source_document( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, UUID, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test job execution when source document is missing from S3.""" document, project = test_document @@ -169,7 +173,9 @@ def test_execute_job_with_transformer_error( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, UUID, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test job execution when transformer raises an error.""" document, project = test_document diff --git a/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py b/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py index 42a40f7ca..8e4759f10 100644 --- a/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py +++ b/backend/app/tests/services/doctransformer/test_job/test_execute_job_errors.py @@ -31,7 +31,9 @@ def test_execute_job_with_storage_error( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, Any, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test job execution when S3 upload fails.""" document, project = test_document @@ -81,7 +83,9 @@ def test_execute_job_retry_mechanism( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, Any, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test that retry mechanism works for transient failures.""" document, project = test_document @@ -126,7 +130,9 @@ def test_execute_job_exhausted_retries( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, Any, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test behavior when all retry attempts are exhausted.""" document, project = test_document @@ -175,7 +181,9 @@ def test_execute_job_database_error_during_completion( self, db: Session, test_document: Tuple[Document, Project], - fast_execute_job: Callable[[int, Any, str, str], Any], + fast_execute_job: Callable[ + [int, str, str, str, str, str, str | None, Any], Any + ], ) -> None: """Test handling of database errors when updating job completion.""" document, project = test_document From 7317a6f027f2fc926b716a57052207dccebfb200 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 19 Nov 2025 12:31:03 +0530 Subject: [PATCH 11/15] alembic prev migration change --- .../eed36ae3c79a_alter_doc_transform_table_for_celery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py index 94c28798b..f9a796dd8 100644 --- a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py +++ b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "eed36ae3c79a" -down_revision = "6fe772038a5a" +down_revision = "633e69806207" branch_labels = None depends_on = None From f50b5eafc9649c337dfe4863ea9afb162aff6229 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 20 Nov 2025 17:52:03 +0530 Subject: [PATCH 12/15] pr review changes --- ...9a_alter_doc_transform_table_for_celery.py | 2 +- backend/app/api/main.py | 2 +- .../app/api/routes/doc_transformation_job.py | 78 +++---- backend/app/api/routes/documents.py | 140 ++++++------- backend/app/models/doc_transformation_job.py | 4 +- backend/app/services/doctransform/job.py | 6 +- backend/app/services/documents/helpers.py | 191 ++++++++++++++++++ 7 files changed, 285 insertions(+), 138 deletions(-) create mode 100644 backend/app/services/documents/helpers.py diff --git a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py index f9a796dd8..104ebde0d 100644 --- a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py +++ b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py @@ -1,7 +1,7 @@ """alter doc transform table for celery Revision ID: eed36ae3c79a -Revises: 6fe772038a5a +Revises: 633e69806207 Create Date: 2025-11-12 20:08:39.774862 """ diff --git a/backend/app/api/main.py b/backend/app/api/main.py index e2b473f92..64de2e873 100644 --- a/backend/app/api/main.py +++ b/backend/app/api/main.py @@ -4,8 +4,8 @@ api_keys, assistants, collections, - documents, doc_transformation_job, + documents, login, llm, organization, diff --git a/backend/app/api/routes/doc_transformation_job.py b/backend/app/api/routes/doc_transformation_job.py index 6ea514f45..dac5e131a 100644 --- a/backend/app/api/routes/doc_transformation_job.py +++ b/backend/app/api/routes/doc_transformation_job.py @@ -11,15 +11,16 @@ TransformedDocumentPublic, ) from app.utils import APIResponse +from app.services.documents.helpers import build_job_schema, build_job_schemas from app.core.cloud import get_cloud_storage logger = logging.getLogger(__name__) -router = APIRouter(prefix="/documents", tags=["doc_transformation_job"]) +router = APIRouter(prefix="/documents/transformation", tags=["documents"]) @router.get( - "/transformation/{job_id}", + "/{job_id}", description="Get the status and details of a document transformation job.", response_model=APIResponse[DocTransformationJobPublic], ) @@ -35,32 +36,23 @@ def get_transformation_job( doc_crud = DocumentCrud(session, current_user.project_id) job = job_crud.read_one(job_id) - - transformed_doc_schema = None - if getattr(job, "transformed_document_id", None): - document = doc_crud.read_one(job.transformed_document_id) - transformed_doc_schema = TransformedDocumentPublic.model_validate( - document, from_attributes=True - ) - - if include_url: - storage = get_cloud_storage( - session=session, project_id=current_user.project_id - ) - transformed_doc_schema.signed_url = storage.get_signed_url( - document.object_store_url - ) - - job_schema = DocTransformationJobPublic.model_validate(job, from_attributes=True) - job_schema = job_schema.model_copy( - update={"transformed_document": transformed_doc_schema} + storage = ( + get_cloud_storage(session=session, project_id=current_user.project_id) + if include_url + else None ) + job_schema = build_job_schema( + job=job, + doc_crud=doc_crud, + include_url=include_url, + storage=storage, + ) return APIResponse.success_response(job_schema) @router.get( - "/transformation/", + "/", description="Get the status and details of multiple document transformation jobs by IDs.", response_model=APIResponse[DocTransformationJobsPublic], ) @@ -68,7 +60,10 @@ def get_multiple_transformation_jobs( session: SessionDep, current_user: CurrentUserOrgProject, job_ids: list[UUID] = Query( - description="List of transformation job IDs", min=1, max_length=100 + ..., + description="List of transformation job IDs", + min_items=1, + max_items=100, ), include_url: bool = Query( False, description="Include a signed URL for each transformed document" @@ -81,33 +76,18 @@ def get_multiple_transformation_jobs( jobs_found_ids = {job.id for job in jobs} jobs_not_found = set(job_ids) - jobs_found_ids - storage = None - if include_url: - storage = get_cloud_storage(session=session, project_id=current_user.project_id) - - job_schemas: list[DocTransformationJobPublic] = [] - - for job in jobs: - transformed_doc_schema = None - - if getattr(job, "transformed_document_id", None): - document = doc_crud.read_one(job.transformed_document_id) - transformed_doc_schema = TransformedDocumentPublic.model_validate( - document, from_attributes=True - ) - - if include_url and storage: - transformed_doc_schema.signed_url = storage.get_signed_url( - document.object_store_url - ) + storage = ( + get_cloud_storage(session=session, project_id=current_user.project_id) + if include_url + else None + ) - job_schema = DocTransformationJobPublic.model_validate( - job, from_attributes=True - ) - job_schema = job_schema.model_copy( - update={"transformed_document": transformed_doc_schema} - ) - job_schemas.append(job_schema) + job_schemas = build_job_schemas( + jobs=jobs, + doc_crud=doc_crud, + include_url=include_url, + storage=storage, + ) return APIResponse.success_response( DocTransformationJobsPublic( diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 8632cdac6..3b93946ca 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -5,10 +5,8 @@ from fastapi import ( APIRouter, - BackgroundTasks, File, Form, - HTTPException, Query, UploadFile, ) @@ -16,26 +14,23 @@ from app.api.deps import CurrentUserOrgProject, SessionDep from app.core.cloud import get_cloud_storage -from app.services.doctransform import job as transformation_job -from app.services.doctransform.registry import ( - get_available_transformers, - get_file_format, - is_transformation_supported, - resolve_transformer, -) -from app.crud import CollectionCrud, DocumentCrud, DocTransformationJobCrud +from app.crud import CollectionCrud, DocumentCrud from app.crud.rag import OpenAIAssistantCrud, OpenAIVectorStoreCrud from app.models import ( Document, DocumentPublic, TransformedDocumentPublic, DocumentUploadResponse, - DocTransformJobCreate, Message, - TransformationStatus, TransformationJobInfo, ) from app.services.collections.helpers import pick_service_for_documennt +from app.services.documents.helpers import ( + schedule_transformation, + pre_transform_validation, + build_document_schema, + build_document_schemas, +) from app.utils import APIResponse, get_openai_client, load_description @@ -46,17 +41,32 @@ @router.get( "/", description=load_description("documents/list.md"), - response_model=APIResponse[list[DocumentPublic]], + response_model=APIResponse[list[Union[DocumentPublic, TransformedDocumentPublic]]], ) def list_docs( session: SessionDep, current_user: CurrentUserOrgProject, skip: int = Query(0, ge=0), limit: int = Query(100, gt=0, le=100), + include_url: bool = Query( + False, description="Include a signed URL to access each document" + ), ): crud = DocumentCrud(session, current_user.project_id) - data = crud.read_many(skip, limit) - return APIResponse.success_response(data) + documents = crud.read_many(skip, limit) + + storage = ( + get_cloud_storage(session=session, project_id=current_user.project_id) + if include_url and documents + else None + ) + + results = build_document_schemas( + documents=documents, + include_url=include_url, + storage=storage, + ) + return APIResponse.success_response(results) @router.post( @@ -71,47 +81,23 @@ async def upload_doc( target_format: str | None = Form( None, - description="Desired output format for the uploaded document (e.g., pdf, docx, txt). ", + description="Desired output format for the uploaded document (e.g., pdf, docx, txt).", ), transformer: str | None = Form( - None, description="Name of the transformer to apply when converting. " + None, description="Name of the transformer to apply when converting." ), - callback_url: str | None = Form("URL to call to report endpoint status"), + callback_url: str + | None = Form(None, description="URL to call to report endpoint status"), ): - # Determine source file format - try: - source_format = get_file_format(src.filename) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - - # validate if transformation is possible or not - if target_format: - if not is_transformation_supported(source_format, target_format): - raise HTTPException( - status_code=400, - detail=f"Transformation from {source_format} to {target_format} is not supported", - ) - - # Resolve the transformer to use - if not transformer: - transformer = "default" - try: - actual_transformer = resolve_transformer( - source_format, target_format, transformer - ) - except ValueError as e: - available_transformers = get_available_transformers( - source_format, target_format - ) - raise HTTPException( - status_code=400, - detail=f"{str(e)}. Available transformers: {list(available_transformers.keys())}", - ) + source_format, actual_transformer = pre_transform_validation( + src_filename=src.filename, + target_format=target_format, + transformer=transformer, + ) storage = get_cloud_storage(session=session, project_id=current_user.project_id) document_id = uuid4() - object_store_url = storage.put(src, Path(str(document_id))) crud = DocumentCrud(session, current_user.project_id) @@ -122,28 +108,16 @@ async def upload_doc( ) source_document = crud.update(document) - job_info: TransformationJobInfo | None = None - if target_format and actual_transformer: - job_crud = DocTransformationJobCrud(session, current_user.project_id) - job = job_crud.create( - DocTransformJobCreate(source_document_id=source_document.id) - ) - - transformation_job_id = transformation_job.start_job( - db=session, - job_id=job.id, - current_user=current_user, - transformer_name=actual_transformer, - target_format=target_format, - callback_url=callback_url, - ) - job_info = TransformationJobInfo( - message=f"Document accepted for transformation from {source_format} to {target_format}.", - job_id=str(transformation_job_id), - status=TransformationStatus.PENDING, - transformer=actual_transformer, - status_check_url=f"/documents/transformations/{transformation_job_id}", - ) + job_info: TransformationJobInfo | None = schedule_transformation( + session=session, + project_id=current_user.project_id, + current_user=current_user, + source_format=source_format, + target_format=target_format, + actual_transformer=actual_transformer, + source_document_id=source_document.id, + callback_url=callback_url, + ) document_schema = DocumentPublic.model_validate( source_document, from_attributes=True @@ -151,10 +125,11 @@ async def upload_doc( document_schema.signed_url = storage.get_signed_url( source_document.object_store_url ) + response = DocumentUploadResponse( - **document_schema.model_dump(), transformation_job=job_info + **document_schema.model_dump(), + transformation_job=job_info, ) - return APIResponse.success_response(response) @@ -239,16 +214,15 @@ def doc_info( crud = DocumentCrud(session, current_user.project_id) document = crud.read_one(doc_id) - if document.source_document_id is None: - doc_schema = DocumentPublic.model_validate(document, from_attributes=True) - - else: - doc_schema = TransformedDocumentPublic.model_validate( - document, from_attributes=True - ) - - if include_url: - storage = get_cloud_storage(session=session, project_id=current_user.project_id) - doc_schema.signed_url = storage.get_signed_url(document.object_store_url) + storage = ( + get_cloud_storage(session=session, project_id=current_user.project_id) + if include_url + else None + ) + doc_schema = build_document_schema( + document=document, + include_url=include_url, + storage=storage, + ) return APIResponse.success_response(doc_schema) diff --git a/backend/app/models/doc_transformation_job.py b/backend/app/models/doc_transformation_job.py index 5431f7e5f..139825ee0 100644 --- a/backend/app/models/doc_transformation_job.py +++ b/backend/app/models/doc_transformation_job.py @@ -1,9 +1,9 @@ import enum from uuid import UUID, uuid4 -from typing import Optional from datetime import datetime from sqlmodel import SQLModel, Field +from pydantic import ConfigDict from app.core.util import now @@ -48,6 +48,8 @@ def job_updated_at(self) -> datetime: class DocTransformJobCreate(SQLModel): source_document_id: UUID + model_config = ConfigDict(extra="forbid") + class DocTransformJobUpdate(SQLModel): transformed_document_id: UUID | None = None diff --git a/backend/app/services/doctransform/job.py b/backend/app/services/doctransform/job.py index a8951e19e..8245b213b 100644 --- a/backend/app/services/doctransform/job.py +++ b/backend/app/services/doctransform/job.py @@ -38,7 +38,7 @@ def start_job( transformer_name: str, target_format: str, callback_url: str | None, -) -> UUID: +) -> str: trace_id = correlation_id.get() or "N/A" job_crud = DocTransformationJobCrud(db, project_id=current_user.project_id) job_crud.update(job_id, DocTransformJobUpdate(trace_id=trace_id)) @@ -66,7 +66,7 @@ def start_job( def build_success_payload( job: DocTransformationJob, transformed_doc: TransformedDocumentPublic, -) -> dict: +): """ { "success": true, @@ -86,7 +86,7 @@ def build_success_payload( ) -def build_failure_payload(job: DocTransformationJob, error_message: str) -> dict: +def build_failure_payload(job: DocTransformationJob, error_message: str): """ { "success": false, diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py new file mode 100644 index 000000000..ff653400e --- /dev/null +++ b/backend/app/services/documents/helpers.py @@ -0,0 +1,191 @@ +from typing import Optional, Tuple, Iterable, Union +from uuid import UUID + +from fastapi import HTTPException + +from app.services.doctransform.registry import ( + get_available_transformers, + get_file_format, + is_transformation_supported, + resolve_transformer, +) +from app.crud import DocTransformationJobCrud, DocumentCrud +from app.services.doctransform import job as transformation_job +from app.models import ( + DocTransformJobCreate, + TransformationStatus, + TransformationJobInfo, + Document, + DocumentPublic, + DocTransformationJob, + DocTransformationJobPublic, + TransformedDocumentPublic, +) + + +def pre_transform_validation( + *, + src_filename: str, + target_format: str | None, + transformer: str | None, +) -> Tuple[str, str | None]: + """ + Everything BEFORE storage: + - detect source_format + - validate (source -> target) support if target requested + - resolve actual transformer (or None if no target_format) + + Returns: (source_format, actual_transformer_or_none) + Raises: HTTPException(400) on client errors. + """ + try: + source_format = get_file_format(src_filename) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + actual_transformer: Optional[str] = None + if target_format: + if not is_transformation_supported(source_format, target_format): + raise HTTPException( + status_code=400, + detail=f"Transformation from {source_format} to {target_format} is not supported", + ) + + candidate = transformer or "default" + try: + actual_transformer = resolve_transformer( + source_format, target_format, candidate + ) + except ValueError as e: + available = get_available_transformers(source_format, target_format) + raise HTTPException( + status_code=400, + detail=f"{e}. Available transformers: {list(available.keys())}", + ) + + return source_format, actual_transformer + + +def schedule_transformation( + *, + session, + project_id: UUID, + current_user, + source_format: str, + target_format: str | None, + actual_transformer: str | None, + source_document_id: UUID, + callback_url: str | None, +) -> TransformationJobInfo | None: + """ + Everything AFTER the document row is persisted: + - if target was requested and a transformer was resolved, + create the job and enqueue it; return job_info. + - otherwise return None. + """ + if not (target_format and actual_transformer): + return None + + job_crud = DocTransformationJobCrud(session, project_id) + job = job_crud.create(DocTransformJobCreate(source_document_id=source_document_id)) + + transformation_job_id = transformation_job.start_job( + db=session, + job_id=job.id, + current_user=current_user, + transformer_name=actual_transformer, + target_format=target_format, + callback_url=callback_url, + ) + + return TransformationJobInfo( + message=f"Document accepted for transformation from {source_format} to {target_format}.", + job_id=str(transformation_job_id), + status=TransformationStatus.PENDING, + transformer=actual_transformer, + status_check_url=f"/documents/transformations/{transformation_job_id}", + ) + + +PublicDoc = Union[DocumentPublic, TransformedDocumentPublic] + + +def _to_public_schema(doc: Document) -> PublicDoc: + if doc.source_document_id is None: + return DocumentPublic.model_validate(doc, from_attributes=True) + return TransformedDocumentPublic.model_validate(doc, from_attributes=True) + + +def build_document_schema( + *, + document: Document, + include_url: bool, + storage: object | None, +) -> PublicDoc: + schema = _to_public_schema(document) + if include_url and storage: + schema.signed_url = storage.get_signed_url(document.object_store_url) + return schema + + +def build_document_schemas( + *, + documents: Iterable[Document], + include_url: bool, + storage: object | None, +) -> list[PublicDoc]: + out: list[PublicDoc] = [] + for doc in documents: + schema = _to_public_schema(doc) + if include_url and storage: + schema.signed_url = storage.get_signed_url(doc.object_store_url) + out.append(schema) + return out + + +def build_job_schema( + *, + job: DocTransformationJob, + doc_crud: DocumentCrud, + include_url: bool, + storage: object | None, +) -> DocTransformationJobPublic: + """Build a single job schema, optionally attaching a signed URL.""" + transformed_doc_schema: TransformedDocumentPublic | None = None + object_url: str | None = None + + if job.transformed_document_id is not None: + doc = doc_crud.read_one(job.transformed_document_id) + transformed_doc_schema = TransformedDocumentPublic.model_validate( + doc, from_attributes=True + ) + object_url = doc.object_store_url + + if include_url and storage and object_url: + transformed_doc_schema.signed_url = storage.get_signed_url(object_url) + + job_schema = DocTransformationJobPublic.model_validate(job, from_attributes=True) + return job_schema.model_copy( + update={"transformed_document": transformed_doc_schema} + ) + + +def build_job_schemas( + *, + jobs: Iterable[DocTransformationJob], + doc_crud: DocumentCrud, + include_url: bool, + storage: object | None, +) -> list[DocTransformationJobPublic]: + """Build many job schemas efficiently.""" + out: list[DocTransformationJobPublic] = [] + for job in jobs: + out.append( + build_job_schema( + job=job, + doc_crud=doc_crud, + include_url=include_url, + storage=storage, + ) + ) + return out From ac79b08c45ac7671500a72da492b0bb4fe6880c8 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 20 Nov 2025 18:06:14 +0530 Subject: [PATCH 13/15] coderabbit pr review fix --- backend/app/services/documents/helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/services/documents/helpers.py b/backend/app/services/documents/helpers.py index ff653400e..7319830bf 100644 --- a/backend/app/services/documents/helpers.py +++ b/backend/app/services/documents/helpers.py @@ -69,7 +69,7 @@ def pre_transform_validation( def schedule_transformation( *, session, - project_id: UUID, + project_id: int, current_user, source_format: str, target_format: str | None, @@ -103,7 +103,7 @@ def schedule_transformation( job_id=str(transformation_job_id), status=TransformationStatus.PENDING, transformer=actual_transformer, - status_check_url=f"/documents/transformations/{transformation_job_id}", + status_check_url=f"/documents/transformation/{transformation_job_id}", ) @@ -154,7 +154,7 @@ def build_job_schema( transformed_doc_schema: TransformedDocumentPublic | None = None object_url: str | None = None - if job.transformed_document_id is not None: + if job.transformed_document_id: doc = doc_crud.read_one(job.transformed_document_id) transformed_doc_schema = TransformedDocumentPublic.model_validate( doc, from_attributes=True From 55a60f3e7737ac494108811ab197e9d909dbeae6 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 20 Nov 2025 18:15:58 +0530 Subject: [PATCH 14/15] test case fix --- .../tests/api/routes/documents/test_route_document_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/tests/api/routes/documents/test_route_document_upload.py b/backend/app/tests/api/routes/documents/test_route_document_upload.py index 67451cf86..a16eaea25 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_upload.py +++ b/backend/app/tests/api/routes/documents/test_route_document_upload.py @@ -180,7 +180,7 @@ def test_upload_with_transformation( assert transformation_job["transformer"] == "zerox" # Default transformer assert ( transformation_job["status_check_url"] - == f"/documents/transformations/{mock_job_id}" + == f"/documents/transformation/{mock_job_id}" ) assert "message" in transformation_job From f6e32ab8cb6af27c0d051b59776e28095f6c2be7 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Sun, 23 Nov 2025 11:01:18 +0530 Subject: [PATCH 15/15] alembic revision fix --- .../eed36ae3c79a_alter_doc_transform_table_for_celery.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py index 104ebde0d..5f3ff2151 100644 --- a/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py +++ b/backend/app/alembic/versions/eed36ae3c79a_alter_doc_transform_table_for_celery.py @@ -1,7 +1,7 @@ """alter doc transform table for celery Revision ID: eed36ae3c79a -Revises: 633e69806207 +Revises: ecda6b144627 Create Date: 2025-11-12 20:08:39.774862 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "eed36ae3c79a" -down_revision = "633e69806207" +down_revision = "ecda6b144627" branch_labels = None depends_on = None