From a7a2650309b322422881771693095728c7e9d632 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyin Date: Sat, 16 May 2026 00:22:35 +0300 Subject: [PATCH] agent: detect dropped RSP_DATA packets in read_memory via seq gap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `FlashAgentClient.read_memory` would silently return short data when a RSP_DATA packet got CRC-rejected mid-stream by `recv_packet`. Surfaced backing up an hi3516av300:emmc camera's partitions over the new agent eMMC reader: at 921600 baud a 9 MiB stream consistently dropped ~5% of packets, and the client appended the surviving bytes contiguously without realising data was missing, then accepted the agent's final ACK_OK as success. Two back-to-back dumps disagreed on kernel/romfs SHAs while uboot/env (both small enough to land in single packet bursts) were byte-identical. The agent already stamps the 2-byte seq prefix on every RSP_DATA in `agent/main.c::handle_read` (pkt[0..1] = seq). Just validate it on the host side. On a gap, raise loudly with the offset, the expected and received seq numbers, and the actual addr the gap landed at — the caller can chunk + retry. Also added a final size check at ACK_OK: if `len(received) != size`, raise. Catches the (rare) case of agent ACKing early without any seq gap on the way. Verified end-to-end with the hi3516av300:emmc partition dump: - 115200 baud + 64-KiB chunks + byte-level idle-drain in the host retry path → 0 retries, both two back-to-back dumps SHA256-identical across all four partitions (uboot 320 KiB, env 64 KiB, kernel 2240 KiB, romfs 9216 KiB). - 921600 baud single-shot → seq gap is now raised after the first lost packet (was: silent short read). 3 new tests in TestReadMemorySeqValidation: - test_normal_sequential_seq_passes (regression: clean seq stream still works) - test_seq_gap_raises (the bug fix) - test_size_mismatch_at_ack_raises (belt-and-braces total-size check) Followup work captured in kaeru: `agent-read-seq-gap-detection-2026-05-16`. The host transport's chunked-retry helper (which lived in /tmp/backup_emmc.py for this validation) is a reasonable candidate for a public `FlashAgentClient.read_memory_chunked` if more callers hit the same problem. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/defib/agent/client.py | 24 ++++++++++++ tests/test_agent_protocol.py | 76 ++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/src/defib/agent/client.py b/src/defib/agent/client.py index fd5b0a9..67116ec 100644 --- a/src/defib/agent/client.py +++ b/src/defib/agent/client.py @@ -318,12 +318,28 @@ async def read_memory( await send_packet(self._transport, CMD_READ, payload) received = bytearray() + # Track the 16-bit seq prefix the agent stamps on every RSP_DATA + # (`pkt[0..1] = seq` in agent/main.c handle_read). Without this, + # a COBS-CRC-rejected packet was silently dropped by recv_packet + # and read_memory returned short data with no error — see kaeru + # `agent-read-silent-packet-loss-on-long-uart-streams-2026-05-15`. + expected_seq = 0 while True: cmd, data = await recv_packet(self._transport, timeout=60.0) if cmd == RSP_READY: continue elif cmd == RSP_DATA and len(data) > 2: + seq = data[0] | (data[1] << 8) + if seq != expected_seq: + raise RuntimeError( + f"Read packet seq gap at offset {len(received)} " + f"(addr={addr + len(received):#010x}): expected " + f"seq={expected_seq}, got seq={seq}; likely UART " + f"corruption on a long stream. Try a smaller read " + f"or a lower baud." + ) received.extend(data[2:]) + expected_seq = (expected_seq + 1) & 0xFFFF if on_progress: on_progress(len(received), size) elif cmd == RSP_ACK: @@ -334,6 +350,14 @@ async def read_memory( f"Read rejected by agent: status=0x{status:02x} " f"({detail}); addr={addr:#010x} size={size}" ) + # Final size check — catches the case where the agent + # ACKs early but we never saw a seq gap (shouldn't happen + # with the seq-check above, but cheap insurance). + if len(received) != size: + raise RuntimeError( + f"Read returned {len(received)} bytes but agent " + f"ACKed for {size}; addr={addr:#010x}" + ) break else: raise RuntimeError(f"Unexpected response: cmd=0x{cmd:02x}") diff --git a/tests/test_agent_protocol.py b/tests/test_agent_protocol.py index 66ef909..c9bdb1f 100644 --- a/tests/test_agent_protocol.py +++ b/tests/test_agent_protocol.py @@ -871,3 +871,79 @@ async def test_crc32_raises_on_flash_error_ack(self): t.enqueue_rx(make_device_packet(RSP_ACK, bytes([ACK_FLASH_ERROR]))) with pytest.raises(RuntimeError, match="CRC32 rejected"): await client.crc32(0x10100000, 64) + + +class TestReadMemorySeqValidation: + """When recv_packet silently drops a CRC-invalid RSP_DATA packet + mid-stream during a long read (e.g. UART corruption on a multi-MB + eMMC dump), read_memory must NOT return short data and pretend + success. It must spot the gap via the seq prefix the agent stamps + on every RSP_DATA and raise loudly so the caller can chunk + retry.""" + + @pytest.mark.asyncio + async def test_normal_sequential_seq_passes(self): + # seq=0,1,2 followed by ACK_OK — must succeed cleanly. + from defib.transport.mock import MockTransport + from defib.agent.client import FlashAgentClient + + t = MockTransport(flush_clears_buffer=False) + t.enqueue_rx(make_device_packet(RSP_READY, b"DEFIB")) + client = FlashAgentClient(t) + assert await client.connect(timeout=1.0) + + # 3 packets of 4 bytes each => 12 bytes total + for seq in range(3): + seq_bytes = struct.pack("