diff --git a/crates/synth-opt/src/lib.rs b/crates/synth-opt/src/lib.rs index 27aa06c..98afd57 100644 --- a/crates/synth-opt/src/lib.rs +++ b/crates/synth-opt/src/lib.rs @@ -925,16 +925,59 @@ impl CommonSubexpressionElimination { } } - // Store invalidates loads from same address - Opcode::Store { addr, .. } => { + // Store invalidates loads from same address. + // Also rewrite `src` through `reg_map` so the local-variable + // sink picks up CSE-eliminated producers (issue #104). + Opcode::Store { src, addr } => { + let src = resolve(src); + inst.opcode = Opcode::Store { src, addr }; expr_map.remove(&ExprKey::Load(addr)); } - // TeeStore also invalidates loads from same address - Opcode::TeeStore { addr, .. } => { + // TeeStore also invalidates loads from same address. + // Rewrite `src` through `reg_map` for the same reason as Store. + Opcode::TeeStore { dest, src, addr } => { + let src = resolve(src); + inst.opcode = Opcode::TeeStore { dest, src, addr }; expr_map.remove(&ExprKey::Load(addr)); } + // Linear-memory load: rewrite the address vreg through + // `reg_map` so when a preceding `Opcode::Load` (local.get) + // is CSE-killed (its dest replaced by an earlier dest), the + // MemLoad consumer still points at the live vreg. + // + // Issue #104: prior to this arm, the `_ => {}` fallback left + // `addr` untouched. The CSE-killed Load was then dropped by + // `optimize_full`'s dead-instruction filter and `ir_to_arm` + // panicked with "vreg vN has no assigned ARM register". + // + // We intentionally do NOT CSE MemLoads themselves (there is + // no alias analysis for linear memory; any MemStore can + // invalidate any address). + Opcode::MemLoad { dest, addr, offset } => { + let addr = resolve(addr); + inst.opcode = Opcode::MemLoad { dest, addr, offset }; + } + + // Linear-memory store: rewrite both `src` and `addr` through + // `reg_map`. Same rationale as `MemLoad`. + Opcode::MemStore { src, addr, offset } => { + let src = resolve(src); + let addr = resolve(addr); + inst.opcode = Opcode::MemStore { src, addr, offset }; + } + + // 8/16-bit sign extension (unary): resolve `src`. + Opcode::Extend8S { dest, src } => { + let src = resolve(src); + inst.opcode = Opcode::Extend8S { dest, src }; + } + Opcode::Extend16S { dest, src } => { + let src = resolve(src); + inst.opcode = Opcode::Extend16S { dest, src }; + } + // Label marks a loop start - clear all Load caches since values // can change across loop iterations (conservative but safe) Opcode::Label { .. } => { diff --git a/crates/synth-synthesis/tests/issue_104_i32_loadstore_cse.rs b/crates/synth-synthesis/tests/issue_104_i32_loadstore_cse.rs new file mode 100644 index 0000000..c2c3dc5 --- /dev/null +++ b/crates/synth-synthesis/tests/issue_104_i32_loadstore_cse.rs @@ -0,0 +1,435 @@ +//! Regression test for issue #104: i32.load / i32.store wasm_to_ir produces +//! unmapped vreg (latent #93-class bug). +//! +//! # Symptom +//! +//! For the canonical store-then-load function +//! +//! ```text +//! (func (param i32 i32) (result i32) +//! local.get 0 ;; address +//! local.get 1 ;; value +//! i32.store ;; store value at address +//! local.get 0 ;; address (again) +//! i32.load) ;; load value back +//! ``` +//! +//! the optimized pipeline (`OptimizerBridge::optimize_full` + `ir_to_arm`) +//! produced a `MemLoad { addr: v3, .. }` ARM lowering for the second +//! `local.get 0`, where vreg `v3` had been eliminated by CSE but its +//! consumer (`MemLoad`) was never rewritten to reference the surviving +//! `v0`. The unmapped vreg silently fell back to `R0` in `get_arm_reg`, +//! producing a miscompile that structural tests did not catch and that +//! PR #101's defensive panic exposed as `"vreg v3 has no assigned ARM +//! register and no spill slot"`. +//! +//! # Root cause +//! +//! `CommonSubexpressionElimination::eliminate_cse` did not have match arms +//! for `Opcode::MemLoad` / `Opcode::MemStore` (linear-memory ops). When the +//! second `Opcode::Load { dest: v3, addr: 0 }` (the second `local.get 0`) +//! was CSE-killed against the earlier `Opcode::Load { dest: v0, addr: 0 }` +//! and `reg_map[v3] = v0` was recorded, no subsequent pass over +//! `MemLoad { addr: v3, .. }` rewrote `addr` through `reg_map`. The dead +//! `Load` was then dropped by `optimize_full`'s +//! `!i.is_dead && i.opcode != Opcode::Nop` filter and `ir_to_arm` had no +//! producer for `v3`. +//! +//! Same shape as #93 (which fixed `I64ExtendI32U` / `I64ExtendI32S` / +//! `I32WrapI64` in `wasm_to_ir`): a wasm op whose result vreg ends up +//! unmapped by the time `ir_to_arm` runs, masked by the `Reg::R0` silent +//! fallback in `get_arm_reg`. +//! +//! # What this test verifies +//! +//! 1. The optimized pipeline compiles the i32_memory pattern without +//! panicking and emits an `Ldr` (the load). +//! 2. A mini ARM interpreter — extended to model `Movw` / `Movt` / `Add` +//! register-base / `Str` / `Ldr` against an emulated linear memory at +//! `0x20000100` — runs the emitted ARM and produces the stored value +//! in the function's return register (R0 per AAPCS). +//! 3. The load and store consistently use the same base register so the +//! round-trip lands at the same effective address. +//! +//! Pre-fix: assertion (1) panics (`vreg v3 ...`) once PR #101's defensive +//! panic is in place; without the panic, assertion (2) returns the wrong +//! value (whatever `R0` happened to hold at the load — the stored value's +//! address rather than the value itself). +//! +//! Post-fix: CSE rewrites the `MemLoad` `addr` through `reg_map`; the load +//! reads from the address the user actually computed, and the function +//! returns the value that was stored. + +use std::collections::HashMap; +use synth_synthesis::{ArmOp, MemAddr, Operand2, OptimizerBridge, Reg, WasmOp}; + +// ========================================================================= +// Mini ARM interpreter with linear-memory support +// +// Just enough of the Thumb-2 model to run `MemLoad` / `MemStore` lowerings +// produced by `ir_to_arm`: Movw, Movt, Add (reg + reg), Mov, Ldr, Str. +// ========================================================================= + +/// `Reg` doesn't derive `Hash`, so we index a fixed-size array. +fn reg_index(r: Reg) -> usize { + match r { + Reg::R0 => 0, + Reg::R1 => 1, + Reg::R2 => 2, + Reg::R3 => 3, + Reg::R4 => 4, + Reg::R5 => 5, + Reg::R6 => 6, + Reg::R7 => 7, + Reg::R8 => 8, + Reg::R9 => 9, + Reg::R10 => 10, + Reg::R11 => 11, + Reg::R12 => 12, + Reg::SP => 13, + Reg::LR => 14, + Reg::PC => 15, + } +} + +#[derive(Default)] +struct Cpu { + regs: [u32; 16], + /// Emulated little-endian byte-addressable memory. Sparse so we don't + /// have to allocate the entire 0x20000100-base region. + mem: HashMap, +} + +impl Cpu { + fn get(&self, r: Reg) -> u32 { + self.regs[reg_index(r)] + } + fn set(&mut self, r: Reg, v: u32) { + self.regs[reg_index(r)] = v; + } + + fn op2(&self, op: &Operand2) -> u32 { + match op { + Operand2::Imm(v) => *v as u32, + Operand2::Reg(r) => self.get(*r), + Operand2::RegShift { rm, .. } => self.get(*rm), + } + } + + fn effective_addr(&self, a: &MemAddr) -> u32 { + let base = self.get(a.base); + let with_reg = match a.offset_reg { + Some(r) => base.wrapping_add(self.get(r)), + None => base, + }; + with_reg.wrapping_add(a.offset as u32) + } + + fn load_u32(&self, addr: u32) -> u32 { + let b0 = *self.mem.get(&addr).unwrap_or(&0) as u32; + let b1 = *self.mem.get(&addr.wrapping_add(1)).unwrap_or(&0) as u32; + let b2 = *self.mem.get(&addr.wrapping_add(2)).unwrap_or(&0) as u32; + let b3 = *self.mem.get(&addr.wrapping_add(3)).unwrap_or(&0) as u32; + b0 | (b1 << 8) | (b2 << 16) | (b3 << 24) + } + + fn store_u32(&mut self, addr: u32, v: u32) { + self.mem.insert(addr, (v & 0xff) as u8); + self.mem + .insert(addr.wrapping_add(1), ((v >> 8) & 0xff) as u8); + self.mem + .insert(addr.wrapping_add(2), ((v >> 16) & 0xff) as u8); + self.mem + .insert(addr.wrapping_add(3), ((v >> 24) & 0xff) as u8); + } + + fn step(&mut self, op: &ArmOp) { + match op { + ArmOp::Movw { rd, imm16 } => { + // MOVW writes the low 16 bits and clears the high 16. + self.set(*rd, *imm16 as u32); + } + ArmOp::Movt { rd, imm16 } => { + let lo = self.get(*rd) & 0xFFFF; + self.set(*rd, ((*imm16 as u32) << 16) | lo); + } + ArmOp::Mov { rd, op2 } => { + let v = self.op2(op2); + self.set(*rd, v); + } + ArmOp::Add { rd, rn, op2 } => { + let a = self.get(*rn); + let b = self.op2(op2); + self.set(*rd, a.wrapping_add(b)); + } + ArmOp::Sub { rd, rn, op2 } => { + let a = self.get(*rn); + let b = self.op2(op2); + self.set(*rd, a.wrapping_sub(b)); + } + ArmOp::Str { rd, addr } => { + let ea = self.effective_addr(addr); + let v = self.get(*rd); + self.store_u32(ea, v); + } + ArmOp::Ldr { rd, addr } => { + let ea = self.effective_addr(addr); + let v = self.load_u32(ea); + self.set(*rd, v); + } + // Prologue/epilogue (push/pop/bx) and other unmodeled ops are + // safely ignored — none affect the memory-roundtrip outcome. + _ => {} + } + } +} + +// ========================================================================= +// Test helpers +// ========================================================================= + +/// The minimal WASM op sequence equivalent to `tests/wast/i32_memory.wast`'s +/// `store_load` function body, but with **operands swapped**: local 1 is +/// the address, local 0 is the value. +/// +/// Why swapped: in the natural `(addr=local 0, value=local 1)` order, the +/// second `local.get 0` is CSE-killed and the consumer `MemLoad` is left +/// referencing the unmapped `v3`. Pre-fix, `get_arm_reg(v3)` falls back to +/// `Reg::R0` — which by coincidence is also `param_regs[0]`, the register +/// holding the address. So the buggy code path happens to compute the +/// right effective address by accident, masking the bug from a behavioral +/// test. +/// +/// With the operands swapped, the second `local.get 1` is the one that +/// gets CSE-killed. The correct mapping is `v3 -> R1` (param 1). The R0 +/// fallback would point at `param 0` (the *value*), so the load reads +/// from `[mem_base + value]` rather than `[mem_base + addr]`. That's an +/// observable miscompile. +fn store_load_wasm_ops_param1_addr() -> Vec { + vec![ + WasmOp::LocalGet(1), // address (param 1, would map to R1) + WasmOp::LocalGet(0), // value (param 0, R0) + WasmOp::I32Store { + offset: 0, + align: 2, + }, + WasmOp::LocalGet(1), // address again — CSE eliminates this + WasmOp::I32Load { + offset: 0, + align: 2, + }, + ] +} + +/// The natural (addr=local 0, value=local 1) shape — matches +/// `tests/wast/i32_memory.wast`. Kept around to ensure both orderings +/// compile and round-trip. Useful for asserting the +/// "structural property" tests (Ldr/Str base register identity) +/// without depending on operand order. +fn store_load_wasm_ops_param0_addr() -> Vec { + vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Store { + offset: 0, + align: 2, + }, + WasmOp::LocalGet(0), + WasmOp::I32Load { + offset: 0, + align: 2, + }, + ] +} + +/// Run the optimized compile pipeline against a given WASM op sequence +/// with 2 params (both i32). +fn compile_optimized(wasm: &[WasmOp]) -> Vec { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge + .optimize_full(wasm) + .expect("optimize_full should succeed for the store-load pattern"); + bridge.ir_to_arm(&ir, /* num_params = */ 2) +} + +/// Execute the swapped-operand variant: param 1 = addr (in R1), +/// param 0 = value (in R0). Returns the i32 return register (R0). +fn execute_store_load_param1_addr(addr: u32, value: u32) -> u32 { + let arm = compile_optimized(&store_load_wasm_ops_param1_addr()); + let mut cpu = Cpu::default(); + cpu.set(Reg::R0, value); // param 0 (value) + cpu.set(Reg::R1, addr); // param 1 (address) + for op in &arm { + cpu.step(op); + } + cpu.get(Reg::R0) +} + +// ========================================================================= +// Tests +// ========================================================================= + +/// The core regression: the optimized pipeline must produce code that +/// actually round-trips a store/load. +/// +/// **Uses the swapped-operand pattern** (param 1 = address, param 0 = +/// value) so the bug is sensitive to the R0 fallback. With the natural +/// (param 0 = address) layout, the unmapped vreg `v3` happens to +/// fall back to R0 — which is also the correct register for param 0 — +/// so the buggy code path produces the right answer by accident. +/// +/// Pre-fix this fails two ways: +/// * panic (under PR #101's defensive `get_arm_reg`, +/// `"vreg v3 has no assigned ARM register"`), +/// * wrong return value under v0.2.1 — the load reads from +/// `[mem_base + R0 + 0]` where R0 holds the *value* (param 0), +/// not the *address* (param 1, R1). So the load picks up +/// whatever the value's-numeric-interpretation-as-address +/// happens to point to in our sparse memory map (typically 0). +/// +/// Post-fix the load's `addr` vreg is rewritten by CSE to the +/// surviving `v0` (the first `Opcode::Load { dest: v0, addr: 1 }` +/// for local 1), which maps to R1 — correct. +#[test] +fn issue_104_store_load_roundtrip_optimized() { + // Pick a value far enough from 0 that a R0-fallback bug would point + // at a sparse-memory address that doesn't contain `value` (so we'd + // read back 0 instead of `value`). + let addr: u32 = 0x40; + let value: u32 = 0xCAFEBABE; + let got = execute_store_load_param1_addr(addr, value); + assert_eq!( + got, value, + "store_load(addr={addr:#x}, value={value:#x}) returned {got:#x}; \ + expected {value:#x}. This means the second `local.get 1` (the \ + load address) was not correctly re-routed to R1 after CSE — \ + see issue #104. Buggy code falls back to R0 (the value param)." + ); +} + +/// Same shape, varying inputs. Hits the assertions from the i32_memory.wast +/// test cases adapted to the swapped-operand pattern. +/// +/// **All cases use `addr != value`** so a R0-fallback miscompile gives a +/// wrong answer (reading from `[mem_base + value]`, which is either +/// uninitialized sparse memory → 0, or some other address that doesn't +/// hold `value`). +#[test] +fn issue_104_store_load_table_driven() { + let cases: &[(u32, u32)] = &[ + // (addr, value) — value chosen distinct from addr. + (0x10, 42), + (0x20, 100), + (0x30, 0xDEADBEEF), + (0x40, 0xFFFF_FFFE), // i32 -2 (note: not equal to addr 0x40) + (0x50, 0x7FFF_FFFF), // i32::MAX + ]; + for &(addr, value) in cases { + let got = execute_store_load_param1_addr(addr, value); + assert_eq!( + got, value, + "store_load(addr={addr:#x}, value={value:#x}) returned {got:#x}; \ + expected {value:#x}" + ); + } +} + +/// Compile must produce `Ldr` and `Str` that use the same base register +/// (the R12 scratch holding `linear_mem_base + user_addr`). This is a +/// structural sanity check that doesn't depend on operand order — +/// it would fire on either the natural or swapped layout if the bug +/// caused the load to compute a different scratch. +#[test] +fn issue_104_load_and_store_use_consistent_base_register() { + let arm = compile_optimized(&store_load_wasm_ops_param0_addr()); + + let store = arm + .iter() + .find(|o| matches!(o, ArmOp::Str { .. })) + .expect("expected an Str (i32.store lowering)"); + let load = arm + .iter() + .find(|o| matches!(o, ArmOp::Ldr { .. })) + .expect("expected an Ldr (i32.load lowering)"); + + let store_base = match store { + ArmOp::Str { addr, .. } => addr.base, + _ => unreachable!(), + }; + let load_base = match load { + ArmOp::Ldr { addr, .. } => addr.base, + _ => unreachable!(), + }; + + assert_eq!( + store_base, load_base, + "i32.store and i32.load lowered to use different base registers \ + ({store_base:?} vs {load_base:?}); both should compute \ + `linear_mem_base + user_addr` in the same scratch. See #104." + ); +} + +/// Structural check: in the swapped-operand pattern, the `Ldr`'s +/// effective address must depend on R1 (the address param), not R0 +/// (the value param). Pre-fix `get_arm_reg`'s `Reg::R0` fallback +/// causes the load's address-computation `Add` to read from R0, +/// which is the value — observable in the emitted code as `Add R12, +/// R12, R0` rather than `Add R12, R12, R1`. +/// +/// We check that R0 is NOT the `offset_reg` of any `Ldr` after the +/// `Str` (i.e. the load uses a base computation involving the +/// address param, not the value param). +#[test] +fn issue_104_load_addr_does_not_use_value_register() { + let arm = compile_optimized(&store_load_wasm_ops_param1_addr()); + + // Find the Add that computes the load's base. In the lowering + // sequence for MemLoad we expect: + // MOVW R12, #base_lo + // MOVT R12, #base_hi + // ADD R12, R12, R <-- this Add's op2 must be R1 (addr) + // LDR Rd, [R12, #offset] + // + // We look for the *last* `Add { rd: R12, rn: R12, op2: Reg(rN) }` + // before the last `Ldr` — that's the load's address-computation. + let last_ldr_idx = arm + .iter() + .rposition(|o| matches!(o, ArmOp::Ldr { .. })) + .expect("expected an Ldr"); + + let mut load_addr_add: Option<&ArmOp> = None; + for op in &arm[..last_ldr_idx] { + if let ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(_), + } = op + { + load_addr_add = Some(op); + } + } + + let add = load_addr_add + .expect("expected an `Add R12, R12, R` for the load address computation"); + if let ArmOp::Add { + op2: Operand2::Reg(src), + .. + } = add + { + assert_ne!( + *src, + Reg::R0, + "load address computation uses R0 (the value param); expected R1 \ + (the address param). The CSE-killed local-get-1 did not have \ + its consumer MemLoad rewritten to point at the surviving vreg. \ + Pre-fix this manifests as a wrong return value (we load from \ + [base + value] instead of [base + addr]). See issue #104." + ); + assert_eq!( + *src, + Reg::R1, + "load address computation should use R1 (the address param); got {:?}", + src + ); + } +}