Skip to content

Commit 47e8097

Browse files
committed
fix: handle lone Unicode surrogates in artifact persistence
External tool outputs (e.g. Exa Search) can contain lone Unicode surrogate characters (U+D800..U+DFFF), which are invalid in both UTF-8 and JSON. They caused a UnicodeEncodeError in _serialize_payload_bytes, which surfaced as "Tool storage unavailable" in the chat UI.
- Add _sanitize_surrogates() to recursively strip lone surrogates and null bytes from payload dicts before serialization, using the same encode-surrogatepass/decode-ignore technique as OWUI's sanitize_text_for_db.
- Move _prepare_rows_for_storage inside the try-except in _db_persist so that serialization errors return [] instead of crashing streaming.
- Add 5 regression tests covering surrogates, null bytes, valid Unicode preservation, recursive sanitization, and round-trip safety.
1 parent a903e47 commit 47e8097

2 files changed

Lines changed: 92 additions & 3 deletions

File tree

open_webui_openrouter_pipe/storage/persistence.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -687,9 +687,37 @@ def _should_encrypt(self, item_type: str) -> bool:
687687
return True
688688
return (item_type or "").lower() == "reasoning"
689689

690+
@staticmethod
691+
def _sanitize_surrogates(obj: Any) -> Any:
692+
"""Recursively strip lone surrogates and null bytes from all strings.
693+
694+
Lone surrogates (U+D800..U+DFFF) are invalid in both UTF-8 and JSON,
695+
and cause ``UnicodeEncodeError`` / ``json.dumps`` failures. External
696+
tool outputs (e.g. web-search results) may contain them when source
697+
data has encoding issues.
698+
699+
Uses the same encode-surrogatepass/decode-ignore technique as
700+
Open WebUI's ``sanitize_text_for_db`` (``open_webui.utils.misc``).
701+
"""
702+
if isinstance(obj, str):
703+
# Remove null bytes, then strip lone surrogates
704+
cleaned = obj.replace("\x00", "")
705+
try:
706+
return cleaned.encode("utf-8", errors="surrogatepass").decode(
707+
"utf-8", errors="ignore"
708+
)
709+
except (UnicodeEncodeError, UnicodeDecodeError):
710+
return cleaned
711+
if isinstance(obj, dict):
712+
return {k: ArtifactStore._sanitize_surrogates(v) for k, v in obj.items()}
713+
if isinstance(obj, list):
714+
return [ArtifactStore._sanitize_surrogates(v) for v in obj]
715+
return obj
716+
690717
def _serialize_payload_bytes(self, payload: dict[str, Any]) -> bytes:
    """Encode ``payload`` as compact UTF-8 JSON bytes.

    The payload is passed through ``_sanitize_surrogates`` first so the
    final UTF-8 encode is not handed characters that were stripped there.
    """
    compact_json = json.dumps(
        self._sanitize_surrogates(payload),
        ensure_ascii=False,
        separators=(",", ":"),
    )
    return compact_json.encode("utf-8")
693721

694722
def _maybe_compress_payload(self, serialized: bytes) -> tuple[bytes, bool]:
695723
"""Compress serialized bytes when LZ4 is available and thresholds are met."""
@@ -1018,9 +1046,8 @@ async def _db_persist(self, rows: list[dict[str, Any]]) -> list[str]:
10181046
for row in rows:
10191047
row.setdefault("id", generate_item_id())
10201048

1021-
self._prepare_rows_for_storage(rows)
1022-
10231049
try:
1050+
self._prepare_rows_for_storage(rows)
10241051
if self._redis_enabled:
10251052
return await self._redis_enqueue_rows(rows)
10261053
return await self._db_persist_direct(rows, user_id=user_id)

tests/test_persistence.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,68 @@ def test_prepare_rows_for_storage_non_dict_payload(pipe_instance):
585585
assert rows[1]["payload"] == 123
586586

587587

588+
def test_sanitize_surrogates_strips_lone_surrogates():
    """_sanitize_surrogates removes lone surrogates that break UTF-8 encoding of JSON."""
    import json

    from open_webui_openrouter_pipe.storage.persistence import ArtifactStore

    # Lone high surrogate (the exact character from the Exa Search crash)
    text_with_surrogate = "Hello \ud835 world"
    result = ArtifactStore._sanitize_surrogates(text_with_surrogate)

    # Only the surrogate is dropped; the spaces on either side remain.
    assert result == "Hello  world"

    # Must be safe for JSON + UTF-8 and survive a full round trip.
    encoded = json.dumps(result, ensure_ascii=False).encode("utf-8")
    assert isinstance(encoded, bytes)
    assert json.loads(encoded.decode("utf-8")) == result
601+
602+
603+
def test_sanitize_surrogates_preserves_valid_text():
    """Well-formed Unicode (emoji, CJK, accented Latin) passes through unchanged."""
    from open_webui_openrouter_pipe.storage.persistence import ArtifactStore

    original_text = "Hello 🌍 world 你好 café"
    sanitized_text = ArtifactStore._sanitize_surrogates(original_text)
    assert sanitized_text == original_text
609+
610+
611+
def test_sanitize_surrogates_recursive():
    """_sanitize_surrogates recursively cleans nested dicts and lists."""
    from open_webui_openrouter_pipe.storage.persistence import ArtifactStore

    payload = {
        "output": "result \ud835 text",
        "nested": {"deep": ["item \ud800"]},
    }
    result = ArtifactStore._sanitize_surrogates(payload)

    # Pin the whole structure: only the surrogates disappear, nothing else.
    assert result == {
        "output": "result  text",
        "nested": {"deep": ["item "]},
    }

    # Non-string values pass through
    assert ArtifactStore._sanitize_surrogates(42) == 42
    assert ArtifactStore._sanitize_surrogates(None) is None
625+
626+
627+
def test_serialize_payload_bytes_with_surrogates(pipe_instance):
    """_serialize_payload_bytes handles payloads containing lone surrogates."""
    import json

    store = pipe_instance._artifact_store
    payload = {"output": "math \ud835 symbols", "type": "function_call_output"}
    result = store._serialize_payload_bytes(payload)
    assert isinstance(result, bytes)

    # Must round-trip through JSON with only the surrogate removed.
    parsed = json.loads(result.decode("utf-8"))
    assert parsed == {"output": "math  symbols", "type": "function_call_output"}
638+
639+
640+
def test_sanitize_surrogates_strips_null_bytes():
    """Null bytes (PostgreSQL incompatible) are removed along with surrogates."""
    from open_webui_openrouter_pipe.storage.persistence import ArtifactStore

    sanitized = ArtifactStore._sanitize_surrogates("before\x00after")
    assert "\x00" not in sanitized
    assert sanitized == "beforeafter"
648+
649+
588650
# -----------------------------------------------------------------------------
589651
# Tests for DB Persist Edge Cases (lines 812, 916, 975, 980)
590652
# -----------------------------------------------------------------------------

0 commit comments

Comments
 (0)