diff --git a/app/editor/tab.py b/app/editor/tab.py index 71db5b9..d8ee081 100644 --- a/app/editor/tab.py +++ b/app/editor/tab.py @@ -78,6 +78,33 @@ def _MODE_DEFS(self): def _DRAW_COLORS(self): return {t(k): v for k, v in zip(self._DRAW_COLORS_KEYS, self._DRAW_COLORS_VALS)} + @property + def _user_pending(self) -> list: + """Pending edits that represent USER changes since loading the PDF. + + ``_load_existing_annotations`` mirrors notes already embedded in + the PDF into ``self._pending`` with ``_existing=True`` so the + canvas can render their bubbles. Without this filter, just + opening any PDF with notes would (a) make ``closeEvent`` prompt + about "unsaved changes" the user never made and (b) trigger the + Forms-mode "has pending edits" warning. + + R11 C6: ``delete_annot`` edits raised by the canvas context menu + on an existing note are also tagged with ``_existing=True`` (see + ``_on_note_deleted`` — the flag carries through so undo can put + the original back). A plain ``_existing`` filter therefore + silently dropped real user deletions: ``_run`` warned "no + pending edits" and ``closeEvent`` skipped the unsaved-changes + prompt. The deletion was lost on save. We now keep + ``delete_annot`` edits regardless of ``_existing`` because they + always represent the user's explicit intent to remove an + annotation. + """ + return [ + e for e in self._pending + if not e.get("_existing") or e.get("type") == "delete_annot" + ] + def __init__(self, status_fn): super().__init__() self._status = status_fn @@ -651,8 +678,13 @@ def _load_existing_annotations(self): "_annot_type": annot.type[0], "_annot_bbox": [r.x0, r.y0, r.x1, r.y1], }) - self._pending_list.addItem( - t("edit.status.note_label", n=page_idx + 1)) + # R11 #3: do NOT add existing notes to the + # _pending_list UI — that list represents + # edits THIS session that have not been + # applied yet. Showing pre-existing notes + # there made the user think they had + # unsaved work the moment they opened a + # PDF with sticky notes. count += 1 self._status(t("edit.status.note_loaded", count=count, total=total_annots)) @@ -1106,7 +1138,12 @@ def _run(self): if self._mode_idx == _MODE_FORMS: # If there are also pending edits, warn — Forms apply uses # pypdf and would silently drop the in-memory edits otherwise. - if self._pending: + # Use _user_pending so pre-existing notes loaded from the PDF + # don't trigger a false-positive warning. delete_annot edits + # ARE included even when carrying _existing=True (see the + # _user_pending docstring) so the warning still fires when + # the user has removed an existing note. + if self._user_pending: reply = QMessageBox.question( self, t("msg.warning"), t("editor.forms.has_pending"), @@ -1118,7 +1155,11 @@ def _run(self): return self._apply_forms(out) return - if not self._pending: + # R11 #3: filter pre-existing notes mirrored from the PDF so + # clicking Apply on a freshly-opened PDF (with only loaded + # annotations and no user edits) shows "no pending edits" + # instead of re-saving the same content unchanged. + if not self._user_pending: QMessageBox.warning(self, t("msg.warning"), t("msg.no_pending")); return try: import fitz @@ -1275,56 +1316,65 @@ def _fitz_permissions_of(doc) -> int: def _apply_forms(self, out): try: from pypdf import PdfWriter, PdfReader - _r = PdfReader(self._doc_path) - was_encrypted = bool(_r.is_encrypted) - if was_encrypted and self._pdf_password: - _r.decrypt(self._pdf_password) - # If input was encrypted, ask the user how to save the output. - encrypt_choice = "plaintext" - if was_encrypted and self._pdf_password: - encrypt_choice = self._prompt_encryption_choice() - if encrypt_choice is None: + # R11 #8: open the source via an explicit ``with open(...)`` so + # the file handle is closed deterministically when the block + # exits — previously ``PdfReader(self._doc_path)`` held an + # internal stream alive until garbage collection, which on + # Windows blocked another tool from renaming/overwriting the + # same file. We do all writer work INSIDE the with-block so + # the lazy reads triggered by writer.append / write happen + # while the stream is still valid. + with open(self._doc_path, "rb") as _src: + _r = PdfReader(_src) + was_encrypted = bool(_r.is_encrypted) + if was_encrypted and self._pdf_password: + _r.decrypt(self._pdf_password) + # If input was encrypted, ask the user how to save. + encrypt_choice = "plaintext" + if was_encrypted and self._pdf_password: + encrypt_choice = self._prompt_encryption_choice() + if encrypt_choice is None: + return + writer = PdfWriter(); writer.append(_r) + fields = {self._form_table.item(r, 0).text(): + (self._form_table.item(r, 1).text() if self._form_table.item(r, 1) else "") + for r in range(self._form_table.rowCount())} + # R10 #6: pypdf's update_page_form_field_values raises + # PyPdfError("No /AcroForm dictionary in PDF…") on PDFs + # without form fields. The user hits this whenever they + # click Apply in Forms mode on a regular PDF; the cryptic + # error message looked like an internal crash. Detect + # up-front and short-circuit with a friendly status + # instead, leaving the file untouched. + if "/AcroForm" not in writer._root_object: + self._status(t("editor.forms.no_fields")) + self._form_status.setText(t("editor.forms.no_fields")) return - writer = PdfWriter(); writer.append(_r) - fields = {self._form_table.item(r, 0).text(): - (self._form_table.item(r, 1).text() if self._form_table.item(r, 1) else "") - for r in range(self._form_table.rowCount())} - # R10 #6: pypdf's update_page_form_field_values raises - # PyPdfError("No /AcroForm dictionary in PDF…") on PDFs - # without form fields. The user hits this whenever they - # click Apply in Forms mode on a regular PDF; the cryptic - # error message looked like an internal crash. Detect - # up-front and short-circuit with a friendly status - # instead, leaving the file untouched. - if "/AcroForm" not in writer._root_object: - self._status(t("editor.forms.no_fields")) - self._form_status.setText(t("editor.forms.no_fields")) - return - for page in writer.pages: - # auto_regenerate=True so the rendered widget appearance - # actually picks up the new value when viewed in a third- - # party viewer (Adobe etc.) that doesn't render NeedAppearances. - writer.update_page_form_field_values(page, fields, auto_regenerate=True) - if encrypt_choice == "protect" and self._pdf_password: - # Documented limitation: owner == user; original owner - # password is not recoverable from the input file. - writer.encrypt( - user_password=self._pdf_password, - owner_password=self._pdf_password, - algorithm="AES-256", - ) - _log.info( - "Re-encrypted forms output with user password as owner") - fd, tmp = tempfile.mkstemp(prefix=".pdfapps_save_", suffix=".pdf", - dir=os.path.dirname(out) or ".") - os.close(fd) - try: - with open(tmp, "wb") as f: writer.write(f) - os.replace(tmp, out) - except Exception: - try: os.unlink(tmp) - except OSError: pass - raise + for page in writer.pages: + # auto_regenerate=True so the rendered widget appearance + # actually picks up the new value when viewed in a third- + # party viewer (Adobe etc.) that doesn't render NeedAppearances. + writer.update_page_form_field_values(page, fields, auto_regenerate=True) + if encrypt_choice == "protect" and self._pdf_password: + # Documented limitation: owner == user; original owner + # password is not recoverable from the input file. + writer.encrypt( + user_password=self._pdf_password, + owner_password=self._pdf_password, + algorithm="AES-256", + ) + _log.info( + "Re-encrypted forms output with user password as owner") + fd, tmp = tempfile.mkstemp(prefix=".pdfapps_save_", suffix=".pdf", + dir=os.path.dirname(out) or ".") + os.close(fd) + try: + with open(tmp, "wb") as f: writer.write(f) + os.replace(tmp, out) + except Exception: + try: os.unlink(tmp) + except OSError: pass + raise self._status(t("edit.status.form_saved", path=out)) QMessageBox.information(self, t("msg.done"), t("msg.form_saved", path=out)) except Exception as e: diff --git a/app/i18n.py b/app/i18n.py index bd2907a..89faecd 100644 --- a/app/i18n.py +++ b/app/i18n.py @@ -211,7 +211,23 @@ def get_recent_files() -> list[str]: recents = cfg.get("recent_files", []) except Exception: return [] - return [p for p in recents if isinstance(p, str) and os.path.isfile(p)] + # R11 M3: use lexists instead of isfile so OneDrive Files-On-Demand + # placeholders aren't force-downloaded on every startup. isfile + # triggers a sync that can freeze the UI for seconds the first time + # the app starts on a OneDrive folder. Note: UNC paths to offline + # SMB shares may still cause a network round-trip but won't trigger + # placeholder hydration like isfile would. + valid = [p for p in recents + if isinstance(p, str) and os.path.lexists(p)] + # R11 M3: writeback when entries were dropped so the next call has + # less to re-check on disk. Routed through _update_config to keep + # serialization with concurrent writers (add_recent_file etc.). + if len(valid) != len([p for p in recents if isinstance(p, str)]): + try: + _update_config(lambda c: c.__setitem__("recent_files", valid)) + except Exception: + pass + return valid def add_recent_file(path: str): diff --git a/app/tools/ocr.py b/app/tools/ocr.py index aa0e427..aec03f5 100644 --- a/app/tools/ocr.py +++ b/app/tools/ocr.py @@ -316,7 +316,15 @@ def do_work(_self): except OSError: pass raise else: - pdf_pages = [] + # R11 I1: stream pages to the writer as they're + # processed instead of accumulating all per-page + # PDF bytes in a list before the final write. + # This reduces peak memory usage for large PDFs; + # the exact savings depend on pypdf's internal + # page copy behavior. Each per-page BytesIO is + # dropped as soon as writer.append() finishes + # copying its objects. + writer = PdfWriter() for i, page in enumerate(doc): if _self.is_cancelled(): return None @@ -334,12 +342,16 @@ def do_work(_self): pix = fitz.Pixmap(fitz.csRGB, pix) img = Image.frombytes( "RGB", (pix.width, pix.height), pix.samples) - pdf_pages.append( - pytesseract.image_to_pdf_or_hocr( - img, lang=lang, extension="pdf")) - writer = PdfWriter() - for page_bytes in pdf_pages: - writer.append(PdfReader(_io.BytesIO(page_bytes))) + page_bytes = pytesseract.image_to_pdf_or_hocr( + img, lang=lang, extension="pdf") + writer.append( + PdfReader(_io.BytesIO(page_bytes))) + # Drop large per-page buffers now; without + # this they linger until the next loop + # iteration rebinds the locals. + pix = None + img.close() + del page_bytes self._atomic_pdf_write( writer, out_path, sources=[pdf_path]) finally: diff --git a/app/tools/split.py b/app/tools/split.py index f03f013..890d237 100644 --- a/app/tools/split.py +++ b/app/tools/split.py @@ -131,7 +131,13 @@ def _run(self): r=r + 1, start=start, end=end)); continue w = PdfWriter() for p in range(start - 1, end): w.add_page(reader.pages[p]) - with open(os.path.join(out_dir, name), "wb") as f: w.write(f) + out_path = os.path.join(out_dir, name) + try: + self._atomic_pdf_write(w, out_path, sources=[pdf_path]) + except Exception as e: + errors.append(t("tool.split.row_invalid", + r=r + 1, start=start, end=end) + f" — {e}") + continue generated.append(name) if errors: QMessageBox.warning(self, t("msg.warning"), t("tool.split.skipped", errors="\n".join(errors))) if generated: diff --git a/app/translations.json b/app/translations.json index 5ac7138..6af3822 100644 --- a/app/translations.json +++ b/app/translations.json @@ -164,6 +164,7 @@ "msg.warning": "Warning", "msg.confirm": "Confirm", "tool.err.same_source_output": "Output path cannot be the same as the source PDF. Choose a different filename.", + "tool.err.bad_page_input": "Invalid page input: '{text}' — use formats like '1', '3-5', or '1,3,5-7'.", "msg.error": "Error", "msg.unexpected": "Something went wrong. Click 'Show Details' below for the technical error, and the full traceback has been written to the log file.", "msg.done": "Done", @@ -762,6 +763,7 @@ "msg.warning": "Aviso", "msg.confirm": "Confirmar", "tool.err.same_source_output": "O caminho de saída não pode ser o mesmo que o PDF de origem. Escolhe outro nome de ficheiro.", + "tool.err.bad_page_input": "Entrada de páginas inválida: '{text}' — usa formatos como '1', '3-5' ou '1,3,5-7'.", "msg.error": "Erro", "msg.unexpected": "Algo correu mal. Clique em 'Mostrar Detalhes' em baixo para ver o erro técnico; o traceback completo foi escrito no ficheiro de log.", "msg.done": "Concluído", @@ -1360,6 +1362,7 @@ "msg.warning": "Aviso", "msg.confirm": "Confirmar", "tool.err.same_source_output": "La ruta de salida no puede ser la misma que el PDF de origen. Elige otro nombre de archivo.", + "tool.err.bad_page_input": "Entrada de páginas no válida: '{text}' — usa formatos como '1', '3-5' o '1,3,5-7'.", "msg.error": "Error", "msg.unexpected": "Algo salió mal. Haga clic en 'Mostrar detalles' abajo para ver el error técnico; el rastreo completo se ha escrito en el archivo de registro.", "msg.done": "Hecho", @@ -1958,6 +1961,7 @@ "msg.warning": "Avertissement", "msg.confirm": "Confirmer", "tool.err.same_source_output": "Le chemin de sortie ne peut pas être identique au PDF source. Choisissez un autre nom de fichier.", + "tool.err.bad_page_input": "Entrée de pages invalide : '{text}' — utilisez des formats comme '1', '3-5' ou '1,3,5-7'.", "msg.error": "Erreur", "msg.unexpected": "Une erreur s'est produite. Cliquez sur « Afficher les détails » ci-dessous pour voir l'erreur technique ; la trace complète a été écrite dans le fichier journal.", "msg.done": "Terminé", @@ -2556,6 +2560,7 @@ "msg.warning": "Warnung", "msg.confirm": "Bestätigen", "tool.err.same_source_output": "Der Ausgabepfad darf nicht mit der Quell-PDF identisch sein. Wählen Sie einen anderen Dateinamen.", + "tool.err.bad_page_input": "Ungültige Seiteneingabe: '{text}' — verwenden Sie Formate wie '1', '3-5' oder '1,3,5-7'.", "msg.error": "Fehler", "msg.unexpected": "Etwas ist schiefgegangen. Klicken Sie unten auf 'Details anzeigen', um den technischen Fehler zu sehen; das vollständige Traceback wurde in die Protokolldatei geschrieben.", "msg.done": "Fertig", @@ -3154,6 +3159,7 @@ "msg.warning": "警告", "msg.confirm": "确认", "tool.err.same_source_output": "输出路径不能与源 PDF 相同。请选择其他文件名。", + "tool.err.bad_page_input": "无效的页面输入:'{text}' — 请使用 '1'、'3-5' 或 '1,3,5-7' 等格式。", "msg.error": "错误", "msg.unexpected": "出了点问题。点击下方的“显示详情”查看技术错误;完整的堆栈跟踪已写入日志文件。", "msg.done": "完成", @@ -3752,6 +3758,7 @@ "msg.warning": "Avviso", "msg.confirm": "Conferma", "tool.err.same_source_output": "Il percorso di output non può coincidere con il PDF di origine. Scegli un nome file diverso.", + "tool.err.bad_page_input": "Input pagine non valido: '{text}' — usa formati come '1', '3-5' o '1,3,5-7'.", "msg.error": "Errore", "msg.unexpected": "Qualcosa è andato storto. Fare clic su 'Mostra dettagli' qui sotto per vedere l'errore tecnico; il traceback completo è stato scritto nel file di registro.", "msg.done": "Fatto", @@ -4350,6 +4357,7 @@ "msg.warning": "Waarschuwing", "msg.confirm": "Bevestigen", "tool.err.same_source_output": "Het uitvoerpad mag niet hetzelfde zijn als de bron-PDF. Kies een andere bestandsnaam.", + "tool.err.bad_page_input": "Ongeldige pagina-invoer: '{text}' — gebruik formaten zoals '1', '3-5' of '1,3,5-7'.", "msg.error": "Fout", "msg.unexpected": "Er is iets misgegaan. Klik hieronder op 'Details weergeven' om de technische fout te zien; de volledige traceback is naar het logbestand geschreven.", "msg.done": "Voltooid", diff --git a/app/utils.py b/app/utils.py index 74c0063..d7348c0 100644 --- a/app/utils.py +++ b/app/utils.py @@ -72,14 +72,26 @@ def parse_pages(text: str, total: int) -> list: part = part.strip() if not part: continue + try: + if "-" in part: + a, b = part.split("-", 1) + a_int, b_int = int(a), int(b) + else: + a_int = b_int = int(part) + except ValueError as exc: + # int() raised "invalid literal for int() with base 10" — + # re-raise with a translated, user-actionable message so + # show_error() surfaces something useful instead of the + # raw Python error. + raise ValueError( + t("tool.err.bad_page_input", text=part)) from exc if "-" in part: - a, b = part.split("-", 1) - a_int, b_int = int(a), int(b) if b_int - a_int + 1 > _MAX_PAGES: - raise ValueError(f"Range too large: {a}-{b} (max {_MAX_PAGES})") + raise ValueError( + f"Range too large: {a_int}-{b_int} (max {_MAX_PAGES})") pages.extend(range(a_int - 1, b_int)) else: - pages.append(int(part) - 1) + pages.append(a_int - 1) if len(pages) > _MAX_PAGES: raise ValueError(f"Too many pages selected (max {_MAX_PAGES})") invalid = [p for p in pages if p < 0 or p >= total] diff --git a/app/window.py b/app/window.py index 90fefab..b277454 100644 --- a/app/window.py +++ b/app/window.py @@ -858,6 +858,15 @@ def _clear_recent(self): _update_config(lambda cfg: cfg.__setitem__("recent_files", [])) except Exception: pass + # R11 M5: mirror the _load_and_track refresh pattern so every + # open viewer's recents list updates immediately. Without this + # the user clears recents but the placeholder pane still shows + # the old entries until the next reload. + for v in self._viewers: + refresh = getattr(v, "_refresh_recents", None) + if callable(refresh): + with contextlib.suppress(Exception): + refresh() def _set_status(self, msg: str): self._sb.showMessage(msg) @@ -1011,10 +1020,20 @@ def dropEvent(self, e): return continue if os.path.isdir(path): - for pdf in sorted(glob.glob(os.path.join(path, "*.pdf"))): - self._load_and_track(pdf) - # Also pick up uppercase .PDF on case-sensitive FSes - for pdf in sorted(glob.glob(os.path.join(path, "*.PDF"))): + # Case-insensitive walk avoids double-loading on Windows / + # macOS HFS+ where glob("*.pdf") and glob("*.PDF") return + # the same files. Single os.listdir pass + extension filter + # works correctly on both case-sensitive and -insensitive FSes. + try: + entries = os.listdir(path) + except OSError: + entries = [] + pdfs = sorted( + os.path.join(path, f) for f in entries + if f.lower().endswith(".pdf") + and os.path.isfile(os.path.join(path, f)) + ) + for pdf in pdfs: self._load_and_track(pdf) continue if path.lower().endswith(".pdf"): @@ -1186,7 +1205,11 @@ def closeEvent(self, event): # Check for unsaved edits in the editor tool (drawings, redactions, # text edits, signatures, notes added but not yet applied/saved) edit_w = self.stack.widget(self._edit_tool_idx()) - if edit_w and getattr(edit_w, "_pending", None): + # Use _user_pending so we don't prompt about pre-existing + # annotations that the editor mirrored into _pending only for + # canvas rendering (loading a PDF with notes should not look + # like "unsaved edits"). + if edit_w and getattr(edit_w, "_user_pending", None): ans = QMessageBox.question( self, t("msg.warning"), t("pipeline.unsaved_prompt"), QMessageBox.StandardButton.Discard diff --git a/tests/test_audit_r11.py b/tests/test_audit_r11.py new file mode 100644 index 0000000..99f781e --- /dev/null +++ b/tests/test_audit_r11.py @@ -0,0 +1,444 @@ +"""Source-level + behavioral regression tests for PR-G / Round 11 fixes. + +Bug map (PR-G worklist): + #1 split.py no atomic write (same-source truncation) (HIGH A6) + #2 Drop folder duplicates on case-insensitive FS (HIGH N5) + #3 Editor _pending filter for existing notes (HIGH N2) + #4 parse_pages friendly error (MED N4) + #5 OCR streaming writer (memory regression) (MED I1) + #6 _clear_recent UI refresh (MED M5) + #7 get_recent_files writeback + lexists for OneDrive (MED M3) + #8 _apply_forms PdfReader stream leak (MED Extra) +""" + +from __future__ import annotations + +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest + +ROOT = Path(__file__).resolve().parent.parent + + +def _read(rel: str) -> str: + return (ROOT / rel).read_text(encoding="utf-8") + + +# ── #1 — split.py uses _atomic_pdf_write ──────────────────────────────── + + +def test_split_uses_atomic_pdf_write(): + """split.py per-row writes must route through _atomic_pdf_write so + a same-source overwrite is rejected up front and lazy reads from + PdfReader are not truncated mid-flight.""" + src = _read("app/tools/split.py") + assert "_atomic_pdf_write" in src, ( + "split.py must call self._atomic_pdf_write to inherit the " + "same-source guard + tempfile/os.replace path." + ) + # The raw direct-write pattern must be gone. + assert "with open(os.path.join(out_dir, name), \"wb\") as f: w.write(f)" \ + not in src, "Old direct-write pattern still present in split.py" + + +def test_split_passes_input_as_source_to_atomic_write(): + """The atomic write call must pass the input pdf as a source so the + _check_not_same_path layer can reject same-source-output.""" + src = _read("app/tools/split.py") + # Look for the call site with the sources kwarg. + assert "sources=[pdf_path]" in src + + +# ── #2 — Drop folder dedup on case-insensitive FS ──────────────────────── + + +def test_window_drop_folder_does_not_double_glob(): + """The pre-fix code globbed *.pdf AND *.PDF which double-loaded every + file on Windows / HFS+. The fix uses os.listdir + extension filter.""" + src = _read("app/window.py") + # The two-glob pattern must be gone. + assert "glob.glob(os.path.join(path, \"*.PDF\"))" not in src + # The replacement must be there. + assert "os.listdir(path)" in src + # And case-insensitive extension match. + assert ".lower().endswith(\".pdf\")" in src + + +def test_window_drop_folder_dedup_behavioral(tmp_path): + """Behavioral: simulate the new walk pattern against a directory + containing case-mixed PDF filenames. The result must contain each + file exactly once even on filesystems where uppercase and lowercase + globs both match the same on-disk file.""" + d = tmp_path + # Create files in lower + uppercase. On Windows / HFS+ these are + # the same file; we still test that the walk pattern doesn't yield + # duplicates when the FS is case-sensitive. + (d / "a.pdf").write_bytes(b"%PDF-1.4") + # Use a distinct uppercase name to avoid same-file collision on + # case-insensitive FSes. + (d / "B.PDF").write_bytes(b"%PDF-1.4") + entries = os.listdir(str(d)) + pdfs = sorted( + os.path.join(str(d), f) for f in entries + if f.lower().endswith(".pdf") + and os.path.isfile(os.path.join(str(d), f)) + ) + # No duplicates allowed. + assert len(pdfs) == len(set(pdfs)) + # Both files present. + assert any(p.endswith("a.pdf") for p in pdfs) + assert any(p.endswith("B.PDF") for p in pdfs) + + +# ── #3 — Editor _user_pending filters existing notes ──────────────────── + + +def test_editor_has_user_pending_property(): + """The editor must expose _user_pending that filters out _existing + annotations mirrored from the loaded PDF.""" + src = _read("app/editor/tab.py") + assert "_user_pending" in src + assert "if not e.get(\"_existing\")" in src + + +def test_window_closeevent_uses_user_pending(): + """The window's closeEvent guard must use _user_pending so just + opening a PDF with existing notes does not prompt 'unsaved'.""" + src = _read("app/window.py") + # The guard must reference _user_pending, not raw _pending. + assert "getattr(edit_w, \"_user_pending\", None)" in src + assert "getattr(edit_w, \"_pending\", None)" not in src + + +def test_editor_forms_warning_uses_user_pending(): + """Forms-mode 'you have pending edits' warning must filter existing + annotations but keep delete_annot (which may carry _existing=True + when the user removed a pre-existing note via the canvas menu).""" + src = _read("app/editor/tab.py") + forms_block_start = src.find("_MODE_FORMS:\n # If there are also pending edits") + assert forms_block_start > 0 + block = src[forms_block_start:forms_block_start + 800] + assert "self._user_pending" in block + + +def test_user_pending_excludes_existing_keeps_user_edits(): + """Behavioral: a pending list with one existing note + one user + deletion + one user draw must yield only the user edits.""" + sample = [ + {"type": "note", "page": 0, "text": "loaded from disk", + "_existing": True}, + {"type": "delete_annot", "page": 0, "annot_type": 0, + "bbox": [0, 0, 10, 10]}, + {"type": "draw", "page": 1, "points": [(0, 0), (1, 1)]}, + ] + user_only = [e for e in sample if not e.get("_existing")] + assert len(user_only) == 2 + assert {e["type"] for e in user_only} == {"delete_annot", "draw"} + + +def test_editor_run_uses_user_pending_for_empty_check(): + """The _run() short-circuit ('no pending edits') must use + _user_pending so opening a PDF with sticky notes and clicking + Apply doesn't silently re-save unchanged.""" + src = _read("app/editor/tab.py") + run_start = src.find(" def _run(self):") + next_def = src.find(" def ", run_start + 1) + body = src[run_start:next_def] + assert "if not self._user_pending:" in body + assert "msg.no_pending" in body + + +def test_editor_pending_list_skips_existing_notes(): + """_load_existing_annotations must NOT push pre-existing notes + into the _pending_list UI — that widget is for THIS session's + unsaved edits only. The note overlay (canvas rendering) is + untouched; only the visible 'pending edits' list is filtered.""" + src = _read("app/editor/tab.py") + load_start = src.find("def _load_existing_annotations(self)") + next_def = src.find(" def ", load_start + 1) + body = src[load_start:next_def] + # The old addItem line must be gone from the existing-notes branch. + # We check that 'self._pending_list.addItem' is not called in the + # branch that sets `_existing`: True. + assert "\"_existing\": True" in body + # Look around the _existing=True dict for an addItem — there must be + # none in the same indentation block. + existing_idx = body.find("\"_existing\": True") + # Scan the next ~600 chars after the dict closes for an addItem call + # before the next outer-level statement (e.g. count += 1 or next + # `for`); the test is permissive because formatting can shift. + tail = body[existing_idx:existing_idx + 800] + # The cleanup comment marker we placed identifies the intentional + # removal so a future refactor that re-adds the line is caught. + assert "do NOT add existing notes to the" in tail + + +# ── R11 C6 — _user_pending must keep delete_annot even for existing ───── + + +def _make_user_pending_stub(): + """Tiny stub so the _user_pending property can be exercised without + spinning up the full Qt tab. Only the attributes it touches matter.""" + from app.editor.tab import TabEditar + + class _Stub: + _user_pending = TabEditar._user_pending + + def __init__(self): + self._pending: list = [] + + return _Stub() + + +def test_user_pending_keeps_delete_annot_even_if_existing(): + """Regression for reviewer R11 C6: ``delete_annot`` edits for + existing notes are also tagged with ``_existing=True`` (so undo can + restore the original). The previous ``_user_pending`` filter dropped + them, which made ``_run`` warn 'no pending edits' and ``closeEvent`` + skip the unsaved-changes prompt — silently losing the deletion.""" + s = _make_user_pending_stub() + s._pending = [ + # Mirrored on load — must be filtered. + {"type": "note", "text": "original", "_existing": True}, + # User right-clicked the loaded note and chose Delete. The + # _existing flag carries through so _undo can put the original + # back on the canvas. MUST stay in _user_pending. + {"type": "delete_annot", "page": 0, "bbox": (0, 0, 10, 10), + "_existing": True, + "_original_note": {"type": "note", "text": "original"}}, + # Brand-new note typed by the user. MUST stay in _user_pending. + {"type": "note", "text": "new user note"}, + ] + user_pending = s._user_pending + assert len(user_pending) == 2 + types = [e["type"] for e in user_pending] + assert "delete_annot" in types + assert "note" in types + # The original existing note is filtered out. + assert not any(e.get("text") == "original" for e in user_pending) + + +def test_delete_existing_note_then_apply_persists(): + """End-to-end flow: user opens a PDF with one sticky note (mirrored + into ``_pending`` by ``_load_existing_annotations`` with + ``_existing=True``), right-clicks → Delete (which appends a + ``delete_annot`` edit also carrying ``_existing=True``), then clicks + Apply. ``_user_pending`` MUST surface the deletion so the save loop + processes it instead of hitting the no_changes warning.""" + s = _make_user_pending_stub() + # _load_existing_annotations behavior. + s._pending = [{ + "type": "note", "text": "loaded", "page": 0, + "rect": (0, 0, 10, 10), "_existing": True, + }] + # Context-menu delete (cf. _on_note_deleted at app/editor/tab.py + # ~line 1078): the delete_annot edit carries _existing=True so that + # undo can roll back to the original note. + s._pending.append({ + "type": "delete_annot", + "page": 0, + "bbox": (0, 0, 10, 10), + "_existing": True, + "_original_note": s._pending[0], + }) + # Apply click — the _run guard reads _user_pending. + user_pending = s._user_pending + # The delete MUST be visible to the save loop. + assert any(e["type"] == "delete_annot" for e in user_pending), ( + "delete_annot with _existing=True was filtered out — the " + "user's deletion would be silently lost on save") + + +# ── #4 — parse_pages friendly error ───────────────────────────────────── + + +def test_parse_pages_raises_friendly_message_for_invalid_input(): + """Garbage input like 'abc' or '1-' must surface the translated + bad_page_input message instead of the raw int() ValueError.""" + from app.utils import parse_pages + + with pytest.raises(ValueError) as exc: + parse_pages("abc", 10) + msg = str(exc.value) + # The friendly message embeds the offending substring. + assert "abc" in msg + # And does NOT leak the python internal phrasing. + assert "invalid literal for int" not in msg + + +def test_parse_pages_raises_friendly_message_for_open_range(): + from app.utils import parse_pages + with pytest.raises(ValueError) as exc: + parse_pages("1-", 10) + msg = str(exc.value) + assert "1-" in msg + assert "invalid literal for int" not in msg + + +def test_parse_pages_still_accepts_valid_input(): + """Regression guard — the new try/except must not break the happy + path or the dedupe/sort contract.""" + from app.utils import parse_pages + assert parse_pages("3,1,2,3", 10) == [0, 1, 2] + assert parse_pages("1-3", 10) == [0, 1, 2] + assert parse_pages("1,4-5", 10) == [0, 3, 4] + + +def test_parse_pages_out_of_range_still_works(): + """Out-of-range error message format must be preserved.""" + from app.utils import parse_pages + with pytest.raises(ValueError) as exc: + parse_pages("99", 10) + assert "out of range" in str(exc.value).lower() + + +# ── #5 — OCR streaming writer ─────────────────────────────────────────── + + +def test_ocr_no_longer_accumulates_pdf_pages_list(): + """The old pattern collected all per-page PDF bytes in a list before + writing — a 100-pg @ 300dpi job spent ~500MB RAM. The new code + appends each page directly into the writer.""" + src = _read("app/tools/ocr.py") + # Old accumulation list must be gone. + assert "pdf_pages = []" not in src + assert "pdf_pages.append" not in src + # New streaming pattern: writer.append inside the per-page loop. + # The marker is that writer is instantiated before the loop and + # writer.append happens inside the same scope as the for-page loop. + assert "writer = PdfWriter()" in src + # And we drop the per-page buffers explicitly. + assert "img.close()" in src + + +# ── #6 — _clear_recent refreshes UI ───────────────────────────────────── + + +def test_clear_recent_refreshes_open_viewers(): + """_clear_recent must iterate self._viewers and call _refresh_recents + on each, mirroring the pattern in _load_and_track.""" + src = _read("app/window.py") + clear_start = src.find("def _clear_recent(self)") + next_def = src.find("def ", clear_start + 1) + body = src[clear_start:next_def] + assert "_refresh_recents" in body, ( + "_clear_recent must refresh recents in all open viewers (M5)." + ) + assert "self._viewers" in body + + +# ── #7 — get_recent_files writeback + lexists ─────────────────────────── + + +def test_get_recent_files_uses_lexists(): + """OneDrive Files-On-Demand: isfile triggers a placeholder download. + lexists does not. The fix swaps isfile for lexists.""" + src = _read("app/i18n.py") + func_start = src.find("def get_recent_files()") + func_end = src.find("def ", func_start + 1) + body = src[func_start:func_end] + assert "os.path.lexists" in body + # The old isfile call must be gone from non-comment lines. Strip + # comments before searching so mentions of the old API in the + # explanatory comment don't trip this assertion. + code_lines = [ + ln for ln in body.splitlines() + if not ln.lstrip().startswith("#") + ] + code_only = "\n".join(code_lines) + assert "os.path.isfile" not in code_only + + +def test_get_recent_files_writeback_when_entries_dropped(tmp_path, monkeypatch): + """Behavioral: when a stale entry is filtered out, get_recent_files + must writeback the trimmed list so we don't re-check disk forever.""" + # Stub config path to a temp file. + cfg = tmp_path / "cfg.json" + real = str(tmp_path / "real.pdf") + Path(real).write_bytes(b"%PDF-1.4") + cfg.write_text(json.dumps({ + "recent_files": [real, str(tmp_path / "missing.pdf")], + }), encoding="utf-8") + + import app.i18n as i18n + monkeypatch.setattr(i18n, "_CONFIG_PATH", str(cfg)) + + result = i18n.get_recent_files() + assert result == [real] + # Writeback should have trimmed the list on disk. + written = json.loads(cfg.read_text(encoding="utf-8")) + assert written["recent_files"] == [real] + + +def test_get_recent_files_no_writeback_when_all_valid(tmp_path, monkeypatch): + """Don't churn the config file when nothing changed.""" + cfg = tmp_path / "cfg.json" + real1 = str(tmp_path / "a.pdf") + real2 = str(tmp_path / "b.pdf") + Path(real1).write_bytes(b"%PDF-1.4") + Path(real2).write_bytes(b"%PDF-1.4") + cfg.write_text(json.dumps({"recent_files": [real1, real2]}), + encoding="utf-8") + mtime_before = cfg.stat().st_mtime_ns + + import app.i18n as i18n + monkeypatch.setattr(i18n, "_CONFIG_PATH", str(cfg)) + + result = i18n.get_recent_files() + assert result == [real1, real2] + # No writeback when valid == recents. + assert cfg.stat().st_mtime_ns == mtime_before + + +# ── #8 — _apply_forms closes PdfReader stream ─────────────────────────── + + +def test_apply_forms_uses_with_open_for_source(): + """_apply_forms must wrap the source PdfReader in `with open(...)` so + the file handle is released deterministically — leaving it pinned + by GC blocks Windows from renaming the file from another tool.""" + src = _read("app/editor/tab.py") + apply_start = src.find("def _apply_forms(self, out)") + next_def = src.find(" def ", apply_start + 1) + body = src[apply_start:next_def] + assert "with open(self._doc_path, \"rb\")" in body + # PdfReader must take the stream (not the raw path). + assert "PdfReader(_src)" in body + + +def test_apply_forms_writer_work_happens_inside_with_block(): + """The writer.append / .write_*** calls must be inside the with-block + so the source stream is still alive when pypdf reads lazily.""" + src = _read("app/editor/tab.py") + apply_start = src.find("def _apply_forms(self, out)") + next_def = src.find(" def ", apply_start + 1) + body = src[apply_start:next_def] + # Find the with-block start and verify writer.write inside. + with_pos = body.find("with open(self._doc_path") + assert with_pos > 0 + # All the writer interactions should appear after the with-line. + assert body.find("writer = PdfWriter()", with_pos) > with_pos + assert body.find("writer.write(f)", with_pos) > with_pos + + +# ── i18n parity ───────────────────────────────────────────────────────── + + +def test_translations_parity_for_new_key(): + """The new tool.err.bad_page_input key must exist in all 8 languages.""" + data = json.loads(_read("app/translations.json")) + expected_langs = {"en", "pt", "es", "fr", "de", "zh", "it", "nl"} + assert set(data.keys()) >= expected_langs + for lang in expected_langs: + assert "tool.err.bad_page_input" in data[lang], ( + f"Missing translation key in '{lang}'" + ) + # All translations must reference the {text} placeholder so + # parse_pages's `.format()` call resolves cleanly. + assert "{text}" in data[lang]["tool.err.bad_page_input"], ( + f"Translation in '{lang}' is missing the {{text}} placeholder" + )