Collections: Save file size of previously uploaded documents #769
Changes from all commits: 1eabb00, 7f5d86f, fa853c3, 8e3d29d, bed1d1a, d02bac8, 8b7556c, 9d5d171, 9cae069, 166cb9a
backend/app/services/collections/create_collection.py

```diff
@@ -166,6 +166,7 @@ def execute_job(
     result = None
     creation_request = None
     provider = None
+    storage = None

     with log_context(
         tag="collection",
@@ -221,6 +222,49 @@ def execute_job(
         organization_id=organization_id,
     )

+    with Session(engine) as session:
+        document_crud = DocumentCrud(session, project_id)
+        flat_docs = document_crud.read_each(creation_request.documents)
+        storage = get_cloud_storage(session=session, project_id=project_id)
+
+        file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}
+
+        backfill: list[tuple[UUID, float]] = []
+        for doc in flat_docs:
+            if doc.file_size_kb is None:
+                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
+                doc.file_size_kb = size_kb
+                backfill.append((doc.id, size_kb))
+
+        total_size_kb = sum(
+            doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
+        )
+        total_size_mb = total_size_kb / 1024
```
Comment on lines +232 to +242

Single storage failure aborts the entire collection job.

Given this PR's goal is to backfill historical data, recovering gracefully from per-doc failures is likely the safer posture:

🛡️ Suggested per-doc guard

```diff
 backfill: list[tuple[UUID, float]] = []
 for doc in flat_docs:
     if doc.file_size_kb is None:
-        size_kb = round(storage.get_file_size_kb(doc.object_store_url))
-        doc.file_size_kb = size_kb
-        backfill.append((doc.id, size_kb))
+        try:
+            size_kb = storage.get_file_size_kb(doc.object_store_url)
+        except Exception as err:
+            logger.warning(
+                f"[execute_job] Failed to backfill file_size_kb | "
+                f"{{'doc_id': '{doc.id}', 'error': '{err}'}}"
+            )
+            continue
+        doc.file_size_kb = size_kb
+        backfill.append((doc.id, size_kb))
```

Note: if you keep fail-fast semantics intentionally, the downstream …
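For illustration, here is a small, self-contained sketch of how the guarded loop interacts with the total-size computation that follows it; the `Doc` dataclass and `fake_get_file_size_kb` helper are stand-ins invented for this example, not part of the codebase.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Doc:
    # Hypothetical stand-in for the real Document model.
    id: str
    file_size_kb: Optional[float] = None


def fake_get_file_size_kb(doc_id: str) -> float:
    # Stand-in for storage.get_file_size_kb(); fails for one document.
    if doc_id == "doc-2":
        raise RuntimeError("object not found")
    return 512.37


flat_docs = [Doc("doc-1"), Doc("doc-2"), Doc("doc-3", file_size_kb=100.0)]

backfill: list[tuple[str, float]] = []
for doc in flat_docs:
    if doc.file_size_kb is None:
        try:
            size_kb = fake_get_file_size_kb(doc.id)
        except Exception:
            continue  # skip this document instead of aborting the whole job
        doc.file_size_kb = size_kb
        backfill.append((doc.id, size_kb))

total_size_kb = sum(d.file_size_kb for d in flat_docs if d.file_size_kb is not None)
print(backfill)                        # [('doc-1', 512.37)]
print(round(total_size_kb / 1024, 2))  # 0.6, what would become total_size_mb
```

A document whose lookup fails keeps `file_size_kb = None`, so it simply contributes nothing to the total; the downstream sum already filters out `None` values.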
```diff
+
+    with Session(engine) as session:
+        if backfill:
+            document_crud = DocumentCrud(session, project_id)
+            for doc_id, size_kb in backfill:
+                doc = document_crud.read_one(doc_id)
+                doc.file_size_kb = size_kb
+                document_crud.update(doc)
+
+        collection_job_crud = CollectionJobCrud(session, project_id)
+        collection_job = collection_job_crud.read_one(job_uuid)
+        collection_job = collection_job_crud.update(
+            job_uuid,
+            CollectionJobUpdate(
+                task_id=task_id,
+                status=CollectionJobStatus.PROCESSING,
+                total_size_mb=round(total_size_mb, 2),
+            ),
+        )
+
         provider = get_llm_provider(
             session=session,
             provider=creation_request.provider,
             project_id=project_id,
             organization_id=organization_id,
         with tracer.start_as_current_span("collections.create.provider"):
```
|
Comment on lines
+225
to
268
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
python - <<'PY'
import ast
from pathlib import Path
path = Path("backend/app/services/collections/create_collection.py")
try:
ast.parse(path.read_text(), filename=str(path))
except SyntaxError as err:
print(f"{path}:{err.lineno}:{err.offset}: {err.msg}")
raise SystemExit(1)
print("syntax ok")
PYRepository: ProjectTech4DevAI/kaapi-backend Length of output: 172 🏁 Script executed: cat -n backend/app/services/collections/create_collection.py | sed -n '175,275p'Repository: ProjectTech4DevAI/kaapi-backend Length of output: 5065 Fix indentation and close the The Proposed structural fix- with Session(engine) as session:
- document_crud = DocumentCrud(session, project_id)
- flat_docs = document_crud.read_each(creation_request.documents)
- storage = get_cloud_storage(session=session, project_id=project_id)
+ with Session(engine) as session:
+ document_crud = DocumentCrud(session, project_id)
+ flat_docs = document_crud.read_each(creation_request.documents)
+ storage = get_cloud_storage(session=session, project_id=project_id)
- file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}
+ file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}
- backfill: list[tuple[UUID, float]] = []
- for doc in flat_docs:
- if doc.file_size_kb is None:
- size_kb = round(storage.get_file_size_kb(doc.object_store_url))
- doc.file_size_kb = size_kb
- backfill.append((doc.id, size_kb))
+ backfill: list[tuple[UUID, float]] = []
+ for doc in flat_docs:
+ if doc.file_size_kb is None:
+ size_kb = round(storage.get_file_size_kb(doc.object_store_url))
+ doc.file_size_kb = size_kb
+ backfill.append((doc.id, size_kb))
- total_size_kb = sum(
- doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
- )
- total_size_mb = total_size_kb / 1024
+ total_size_kb = sum(
+ doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
+ )
+ total_size_mb = total_size_kb / 1024
- with Session(engine) as session:
- if backfill:
- document_crud = DocumentCrud(session, project_id)
- for doc_id, size_kb in backfill:
- doc = document_crud.read_one(doc_id)
- doc.file_size_kb = size_kb
- document_crud.update(doc)
+ with Session(engine) as session:
+ if backfill:
+ document_crud = DocumentCrud(session, project_id)
+ for doc_id, size_kb in backfill:
+ doc = document_crud.read_one(doc_id)
+ doc.file_size_kb = size_kb
+ document_crud.update(doc)
- collection_job_crud = CollectionJobCrud(session, project_id)
- collection_job = collection_job_crud.read_one(job_uuid)
- collection_job = collection_job_crud.update(
- job_uuid,
- CollectionJobUpdate(
- task_id=task_id,
- status=CollectionJobStatus.PROCESSING,
- total_size_mb=round(total_size_mb, 2),
- ),
- )
+ collection_job_crud = CollectionJobCrud(session, project_id)
+ collection_job = collection_job_crud.read_one(job_uuid)
+ collection_job = collection_job_crud.update(
+ job_uuid,
+ CollectionJobUpdate(
+ task_id=task_id,
+ status=CollectionJobStatus.PROCESSING,
+ total_size_mb=round(total_size_mb, 2),
+ ),
+ )
- provider = get_llm_provider(
- session=session,
- provider=creation_request.provider,
- project_id=project_id,
- organization_id=organization_id,
+ provider = get_llm_provider(
+ session=session,
+ provider=creation_request.provider,
+ project_id=project_id,
+ organization_id=organization_id,
+ )🧰 Tools🪛 Ruff (0.15.10)[warning] 225-225: Expected (invalid-syntax) [warning] 267-268: Expected (invalid-syntax) 🤖 Prompt for AI Agents |
```diff
             result = provider.create(
                 collection_request=creation_request,
```
🧩 Analysis chain

🏁 Script executed:
Repository: ProjectTech4DevAI/kaapi-backend
Length of output: 4257

🏁 Script executed:
Repository: ProjectTech4DevAI/kaapi-backend
Length of output: 112

🏁 Script executed:
Repository: ProjectTech4DevAI/kaapi-backend
Length of output: 662

Remove bare `round(...)` to preserve fractional KB precision.

`storage.get_file_size_kb()` returns a float already rounded to 2 decimals (e.g., 512.37). Wrapping it in a bare `round(...)` collapses it to an integer, causing precision loss. This contradicts the declared type `backfill: list[tuple[UUID, float]]` and the `Document.file_size_kb` column type (`Float` in the database schema).

🔧 Proposed fix
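The committable suggestion itself isn't captured above; judging from the comment, it presumably amounts to dropping the bare `round(...)` so the float returned by `get_file_size_kb()` is stored unchanged, roughly:

```diff
         backfill: list[tuple[UUID, float]] = []
         for doc in flat_docs:
             if doc.file_size_kb is None:
-                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
+                size_kb = storage.get_file_size_kb(doc.object_store_url)
                 doc.file_size_kb = size_kb
                 backfill.append((doc.id, size_kb))
```

This keeps the stored value consistent with the `float`-typed `backfill` list and the `Float` column; the `round(total_size_mb, 2)` on the job update still limits the reported total to two decimals.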