From fafd98e3f39177555cfd5a38aec6662f24992276 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier
Date: Fri, 15 May 2026 15:59:39 +0200
Subject: [PATCH] =?UTF-8?q?feat(riscv):=20i64=20Phase=201=20=E2=80=94=20ty?=
 =?UTF-8?q?ped=20vstack=20+=20i64=20arithmetic=20/=20logic=20/=20compares?=
 =?UTF-8?q?=20/=20loads=20/=20stores?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactors the RV32IMAC selector's virtual stack from `Vec<Reg>` to
`Vec<VstackVal>`, a typed enum, so i64 values can live on the stack as a
`(lo, hi)` register pair. RV32 doesn't require consecutive pairs (unlike
ARM's LDRD/STRD), so any two distinct temps work.

New typed helpers replace the untyped `push_val`/`pop_val`/`pop_pair`:

    push_i32 / push_i64 / pop_i32 / pop_i64 / pop_pair_i32 / pop_pair_i64

A new `SelectorError::StackTypeMismatch` variant surfaces selector-internal
type bugs (rather than silently mixing halves).

Phase 1 i64 ops implemented (selector-only — encoder already supports every
base instruction needed):

    I64Const:       two emit_load_imm sequences (lo + hi)
    I64Add:         add lo, sltu carry, add hi, add hi+carry
    I64Sub:         sltu borrow, sub lo, sub hi, sub hi-borrow
    I64And/Or/Xor:  pairwise on lo and hi
    I64Eq / I64Ne:  xor diffs, or them, then sltiu/sltu → i32 0/1
    I64Eqz:         or halves, sltiu → i32 0/1
    I64ExtendI32U:  hi = 0
    I64ExtendI32S:  hi = srai src, 31
    I32WrapI64:     zero-instruction; lo continues as i32
    I64Load:        lw lo @offset, lw hi @offset+4
    I64Store:       sw lo @offset, sw hi @offset+4

The return epilogue now handles both i32 (a0) and i64 (a0=lo, a1=hi) return
values, matching the RV32 psABI for 64-bit returns.

Out of scope (Phase 2): i64 mul/div/rem (runtime helpers), i64 shifts /
rotates / clz / ctz / popcnt (shamt branching at 32-bit boundary), i64
ordered compares (lt/le/gt/ge S+U — hi-then-lo ladder), i64 sign-extending
sub-word loads, I64Extend{8,16,32}S, sub-word i64 stores. These remain at
the existing `_ => Unsupported` arm and error cleanly.

Validation:
- cargo test --package synth-backend-riscv: 98 → 110 passing (+12 new)
- cargo clippy --package synth-backend-riscv -- -D warnings: clean
- cargo fmt --check: clean
- cargo build --workspace: clean

Co-Authored-By: Claude Opus 4.7
---
 crates/synth-backend-riscv/src/selector.rs | 787 +++++++++++++++++++--
 1 file changed, 740 insertions(+), 47 deletions(-)

diff --git a/crates/synth-backend-riscv/src/selector.rs b/crates/synth-backend-riscv/src/selector.rs
index 7541629..9cf2043 100644
--- a/crates/synth-backend-riscv/src/selector.rs
+++ b/crates/synth-backend-riscv/src/selector.rs
@@ -5,12 +5,20 @@
 //! flow (block / loop / if / br / br_if), and local variable access.
 //!
 //! Out of scope (see `select_simple` doc comments for the full list):
-//! - i64 (handled by `select_i64.rs` in a follow-up PR)
+//! - i64 multiply / divide / remainder / shifts / rotates / count-leading-or-
+//!   trailing-zeros / popcount / signed-and-unsigned compare ladders /
+//!   sign-extending sub-word loads (Phase 2 — needs runtime helpers and
+//!   shamt branching at the 32-bit boundary)
 //! - F32/F64 (RV32F/D — not yet wired)
 //! - br_table (lowered in B3 alongside jump tables)
 //! - Cross-function calls (need linker-resolvable Call ops + relocations)
 //! - Component Model lifting/lowering
 //!
+//! i64 representation: on RV32, an i64 value is held in a *register pair*
+//! `(lo, hi)` where `lo` is bits [31:0] and `hi` is bits [63:32]. The two
+//! halves don't need to be consecutive registers (unlike ARM's LDRD/STRD
+//! pair requirement) — any two distinct temporaries work.
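+//! For example, `i64.const 0x1_0000_0001` lowers to `lo = 1` and `hi = 1`,
+//! each half materialized with its own `emit_load_imm` sequence.
+//!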
 //! Memory model: the wasm linear-memory base lives in `s11` (x27) — chosen
 //! because it's callee-saved in the standard RV32 psABI calling convention,
 //! so leaf-style functions can rely on it without a re-load. The startup
@@ -37,6 +45,34 @@ pub enum SelectorError {
     #[error("br depth {depth} out of range (control stack height {height})")]
     BrOutOfRange { depth: u32, height: usize },
+
+    #[error("stack type mismatch at op {op:?}: expected {expected}, found {found} on top of stack")]
+    StackTypeMismatch {
+        op: WasmOp,
+        expected: &'static str,
+        found: &'static str,
+    },
+}
+
+/// A value sitting on the selector's virtual stack. RV32 is a 32-bit ISA so
+/// i32s map to a single register, while i64s require a `(lo, hi)` pair (see
+/// the module-level docs for the bit-ordering convention).
+#[derive(Debug, Clone, Copy)]
+enum VstackVal {
+    I32(Reg),
+    I64 { lo: Reg, hi: Reg },
+}
+
+/// An i64 register pair: `.0` is `lo` (bits [31:0]), `.1` is `hi` (bits [63:32]).
+type I64Pair = (Reg, Reg);
+
+impl VstackVal {
+    fn type_name(self) -> &'static str {
+        match self {
+            VstackVal::I32(_) => "i32",
+            VstackVal::I64 { .. } => "i64",
+        }
+    }
 }
 
 /// Output of the selector.
@@ -112,8 +148,9 @@ enum FrameKind {
 struct Selector {
     out: Vec<RiscVOp>,
-    /// Virtual stack of registers holding wasm values.
-    vstack: Vec<Reg>,
+    /// Virtual stack of values produced by lowering. i32s occupy a single
+    /// register; i64s occupy a `(lo, hi)` register pair.
+    vstack: Vec<VstackVal>,
     /// Control-flow frames; index 0 is the outermost.
     ctrl: Vec,
     /// Argument registers for the current function's params (a0..a7).
@@ -171,19 +208,69 @@ impl Selector {
         r
     }
 
-    fn push_val(&mut self, r: Reg) {
-        self.vstack.push(r);
+    fn push_i32(&mut self, r: Reg) {
+        self.vstack.push(VstackVal::I32(r));
     }
 
-    fn pop_val(&mut self, op: &WasmOp) -> Result<Reg, SelectorError> {
+    fn push_i64(&mut self, lo: Reg, hi: Reg) {
+        self.vstack.push(VstackVal::I64 { lo, hi });
+    }
+
+    fn pop_any(&mut self, op: &WasmOp) -> Result<VstackVal, SelectorError> {
         self.vstack
             .pop()
             .ok_or_else(|| SelectorError::StackUnderflow(op.clone()))
     }
 
-    fn pop_pair(&mut self, op: &WasmOp) -> Result<(Reg, Reg), SelectorError> {
-        let rhs = self.pop_val(op)?;
-        let lhs = self.pop_val(op)?;
+    /// Pop the top of stack and assert it's an i32. Errors with
+    /// `StackTypeMismatch` if the top is an i64 (caller's bug, never the
+    /// wasm input's bug — by this point the wasm has been validated).
+    fn pop_i32(&mut self, op: &WasmOp) -> Result<Reg, SelectorError> {
+        let v = self.pop_any(op)?;
+        match v {
+            VstackVal::I32(r) => Ok(r),
+            other => {
+                // Restore for diagnostic determinism — the function will
+                // bail out anyway, but we don't want subsequent code to
+                // see a different stack layout if someone catches the err.
+                self.vstack.push(other);
+                Err(SelectorError::StackTypeMismatch {
+                    op: op.clone(),
+                    expected: "i32",
+                    found: other.type_name(),
+                })
+            }
+        }
+    }
+
+    /// Pop the top of stack and assert it's an i64, returning `(lo, hi)`.
+    fn pop_i64(&mut self, op: &WasmOp) -> Result<I64Pair, SelectorError> {
+        let v = self.pop_any(op)?;
+        match v {
+            VstackVal::I64 { lo, hi } => Ok((lo, hi)),
+            other => {
+                self.vstack.push(other);
+                Err(SelectorError::StackTypeMismatch {
+                    op: op.clone(),
+                    expected: "i64",
+                    found: other.type_name(),
+                })
+            }
+        }
+    }
+
+    /// Pop two i32s; returns `(lhs, rhs)` in wasm push-order (lhs was pushed
+    /// first, rhs second — i.e. rhs is on top of stack).
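+    /// For example, `i32.sub` lowers via `let (lhs, rhs) = self.pop_pair_i32(op)?`
+    /// and emits `sub rd, lhs, rhs`; wasm's `a b i32.sub` computes `a - b`,
+    /// so the order matters for every non-commutative op.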
+ fn pop_pair_i32(&mut self, op: &WasmOp) -> Result<(Reg, Reg), SelectorError> { + let rhs = self.pop_i32(op)?; + let lhs = self.pop_i32(op)?; + Ok((lhs, rhs)) + } + + /// Pop two i64s; returns `((lhs_lo, lhs_hi), (rhs_lo, rhs_hi))`. + fn pop_pair_i64(&mut self, op: &WasmOp) -> Result<(I64Pair, I64Pair), SelectorError> { + let rhs = self.pop_i64(op)?; + let lhs = self.pop_i64(op)?; Ok((lhs, rhs)) } @@ -209,7 +296,14 @@ impl Selector { I32Const(v) => { let dst = self.alloc_temp(); emit_load_imm(&mut self.out, dst, *v); - self.push_val(dst); + self.push_i32(dst); + } + I64Const(v) => { + let lo = self.alloc_temp(); + let hi = self.alloc_temp(); + emit_load_imm(&mut self.out, lo, *v as i32); + emit_load_imm(&mut self.out, hi, (*v >> 32) as i32); + self.push_i64(lo, hi); } // ─── Arithmetic ───────────────────────────────────────────── @@ -248,6 +342,27 @@ impl Selector { I32LeU => self.lower_cmp_unsigned_ge(op, true)?, I32GeU => self.lower_cmp_unsigned_ge(op, false)?, + // ─── i64 arithmetic / logic ───────────────────────────────── + I64Add => self.lower_i64_add(op)?, + I64Sub => self.lower_i64_sub(op)?, + I64And => self.lower_i64_bitwise(op, |rd, rs1, rs2| RiscVOp::And { rd, rs1, rs2 })?, + I64Or => self.lower_i64_bitwise(op, |rd, rs1, rs2| RiscVOp::Or { rd, rs1, rs2 })?, + I64Xor => self.lower_i64_bitwise(op, |rd, rs1, rs2| RiscVOp::Xor { rd, rs1, rs2 })?, + + // ─── i64 comparisons (result is an i32 0/1) ───────────────── + I64Eq => self.lower_i64_eq(op, false)?, + I64Ne => self.lower_i64_eq(op, true)?, + I64Eqz => self.lower_i64_eqz(op)?, + + // ─── i64 / i32 conversions ────────────────────────────────── + I64ExtendI32U => self.lower_i64_extend_i32_u(op)?, + I64ExtendI32S => self.lower_i64_extend_i32_s(op)?, + I32WrapI64 => self.lower_i32_wrap_i64(op)?, + + // ─── i64 memory ───────────────────────────────────────────── + I64Load { offset, align: _ } => self.lower_i64_load(op, *offset)?, + I64Store { offset, align: _ } => self.lower_i64_store(op, *offset)?, + // ─── Memory ───────────────────────────────────────────────── I32Load { offset, align: _ } => self.lower_load_word(op, *offset)?, I32Load8S { offset, align: _ } => { @@ -268,7 +383,9 @@ impl Selector { // ─── Stack manipulation ───────────────────────────────────── Drop => { - self.pop_val(op)?; + // Drop is type-agnostic: pop whatever's on top (i32 or i64). + // Popping an i64 discards the whole pair in one shot. + self.pop_any(op)?; } Nop => {} Unreachable => { @@ -333,12 +450,14 @@ impl Selector { // we don't materialize the frame — emit a clear error instead. return Err(SelectorError::Unsupported(op.clone())); } - self.push_val(dst); + // Phase 1: locals are always treated as i32. i64 locals would need + // two arg-register slots and live outside the scope of this PR. + self.push_i32(dst); Ok(()) } fn lower_local_set(&mut self, idx: u32, op: &WasmOp) -> Result<(), SelectorError> { - let src = self.pop_val(op)?; + let src = self.pop_i32(op)?; if (idx as usize) < self.arg_regs.len() { // mv arg, src self.out.push(RiscVOp::Addi { @@ -353,11 +472,23 @@ impl Selector { } fn lower_local_tee(&mut self, idx: u32, op: &WasmOp) -> Result<(), SelectorError> { - // tee = set + get; the value remains on the stack. - let src = *self + // tee = set + get; the value remains on the stack. Phase 1 only + // handles i32 locals — see lower_local_set for the same restriction. 
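+        // e.g. `i32.const 7 ; local.tee 0` stores 7 into local 0 and also
+        // leaves the 7 on the stack for the next consumer.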
+ let top = self .vstack .last() + .copied() .ok_or_else(|| SelectorError::StackUnderflow(op.clone()))?; + let src = match top { + VstackVal::I32(r) => r, + other => { + return Err(SelectorError::StackTypeMismatch { + op: op.clone(), + expected: "i32", + found: other.type_name(), + }); + } + }; if (idx as usize) < self.arg_regs.len() { self.out.push(RiscVOp::Addi { rd: self.arg_regs[idx as usize], @@ -376,10 +507,10 @@ impl Selector { where F: FnOnce(Reg, Reg, Reg) -> RiscVOp, { - let (rs1, rs2) = self.pop_pair(op)?; + let (rs1, rs2) = self.pop_pair_i32(op)?; let rd = self.alloc_temp(); self.out.push(build(rd, rs1, rs2)); - self.push_val(rd); + self.push_i32(rd); Ok(()) } @@ -389,7 +520,7 @@ impl Selector { where F: FnOnce(Reg, Reg, Reg) -> RiscVOp, { - let (rs1, rs2) = self.pop_pair(op)?; + let (rs1, rs2) = self.pop_pair_i32(op)?; let rd = self.alloc_temp(); let ok_label = self.fresh_label("Ldiv_ok"); // bne rs2, zero, Ldiv_ok → skip trap when divisor != 0 @@ -404,14 +535,14 @@ impl Selector { name: ok_label.clone(), }); self.out.push(build(rd, rs1, rs2)); - self.push_val(rd); + self.push_i32(rd); Ok(()) } // ────────── Comparisons ────────── fn lower_eqz(&mut self, op: &WasmOp) -> Result<(), SelectorError> { - let src = self.pop_val(op)?; + let src = self.pop_i32(op)?; let dst = self.alloc_temp(); // sltiu dst, src, 1 → 1 iff src == 0 self.out.push(RiscVOp::Sltiu { @@ -419,12 +550,12 @@ impl Selector { rs1: src, imm: 1, }); - self.push_val(dst); + self.push_i32(dst); Ok(()) } fn lower_cmp_eq(&mut self, op: &WasmOp, invert: bool) -> Result<(), SelectorError> { - let (lhs, rhs) = self.pop_pair(op)?; + let (lhs, rhs) = self.pop_pair_i32(op)?; let diff = self.alloc_temp(); // xor diff, lhs, rhs → 0 iff equal self.out.push(RiscVOp::Xor { @@ -448,22 +579,22 @@ impl Selector { imm: 1, }); } - self.push_val(dst); + self.push_i32(dst); Ok(()) } fn lower_cmp_signed_lt(&mut self, op: &WasmOp, swap: bool) -> Result<(), SelectorError> { - let (a, b) = self.pop_pair(op)?; + let (a, b) = self.pop_pair_i32(op)?; let dst = self.alloc_temp(); let (rs1, rs2) = if swap { (b, a) } else { (a, b) }; self.out.push(RiscVOp::Slt { rd: dst, rs1, rs2 }); - self.push_val(dst); + self.push_i32(dst); Ok(()) } fn lower_cmp_signed_ge(&mut self, op: &WasmOp, swap: bool) -> Result<(), SelectorError> { // a >= b = !(a < b) ; le maps via swap. 
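+        // e.g. I32GeS pops (a, b) and emits `slt lt, a, b ; sltiu dst, lt, 1`;
+        // I32LeS swaps the operands first, since a <= b == !(b < a).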
- let (a, b) = self.pop_pair(op)?; + let (a, b) = self.pop_pair_i32(op)?; let lt = self.alloc_temp(); let dst = self.alloc_temp(); let (rs1, rs2) = if swap { (b, a) } else { (a, b) }; @@ -474,21 +605,21 @@ impl Selector { rs1: lt, imm: 1, }); - self.push_val(dst); + self.push_i32(dst); Ok(()) } fn lower_cmp_unsigned_lt(&mut self, op: &WasmOp, swap: bool) -> Result<(), SelectorError> { - let (a, b) = self.pop_pair(op)?; + let (a, b) = self.pop_pair_i32(op)?; let dst = self.alloc_temp(); let (rs1, rs2) = if swap { (b, a) } else { (a, b) }; self.out.push(RiscVOp::Sltu { rd: dst, rs1, rs2 }); - self.push_val(dst); + self.push_i32(dst); Ok(()) } fn lower_cmp_unsigned_ge(&mut self, op: &WasmOp, swap: bool) -> Result<(), SelectorError> { - let (a, b) = self.pop_pair(op)?; + let (a, b) = self.pop_pair_i32(op)?; let lt = self.alloc_temp(); let dst = self.alloc_temp(); let (rs1, rs2) = if swap { (b, a) } else { (a, b) }; @@ -498,14 +629,14 @@ impl Selector { rs1: lt, imm: 1, }); - self.push_val(dst); + self.push_i32(dst); Ok(()) } // ────────── Memory ────────── fn lower_load_word(&mut self, op: &WasmOp, offset: u32) -> Result<(), SelectorError> { - let addr = self.pop_val(op)?; + let addr = self.pop_i32(op)?; let dst = self.alloc_temp(); // tmp = base + addr let tmp = self.alloc_temp(); @@ -520,7 +651,7 @@ impl Selector { rs1: tmp, imm: offset_to_imm(offset)?, }); - self.push_val(dst); + self.push_i32(dst); Ok(()) } @@ -530,7 +661,7 @@ impl Selector { offset: u32, kind: LoadKind, ) -> Result<(), SelectorError> { - let addr = self.pop_val(op)?; + let addr = self.pop_i32(op)?; let dst = self.alloc_temp(); let tmp = self.alloc_temp(); self.out.push(RiscVOp::Add { @@ -562,7 +693,7 @@ impl Selector { }, }; self.out.push(op_built); - self.push_val(dst); + self.push_i32(dst); Ok(()) } @@ -572,8 +703,8 @@ impl Selector { offset: u32, kind: StoreKind, ) -> Result<(), SelectorError> { - let value = self.pop_val(op)?; - let addr = self.pop_val(op)?; + let value = self.pop_i32(op)?; + let addr = self.pop_i32(op)?; let tmp = self.alloc_temp(); self.out.push(RiscVOp::Add { rd: tmp, @@ -605,7 +736,7 @@ impl Selector { // ────────── Control flow ────────── fn lower_if(&mut self, op: &WasmOp) -> Result<(), SelectorError> { - let cond = self.pop_val(op)?; + let cond = self.pop_i32(op)?; let else_label = self.fresh_label("Lelse"); let end_label = self.fresh_label("Lif_end"); // beq cond, zero, Lelse → skip the then-branch when cond is false @@ -687,7 +818,7 @@ impl Selector { } fn lower_br_if(&mut self, depth: u32, op: &WasmOp) -> Result<(), SelectorError> { - let cond = self.pop_val(op)?; + let cond = self.pop_i32(op)?; let target_label = self.target_label_for_depth(depth)?; // bne cond, zero, target — branch when condition is true (non-zero) self.out.push(RiscVOp::Branch { @@ -721,15 +852,40 @@ impl Selector { } /// Emit `mv a0, top; ret` — the function epilogue. + /// + /// For i64 returns the wasm ABI puts the lo half in `a0` and the hi half + /// in `a1` (matches the RISC-V psABI for 64-bit return values on RV32). + /// We only emit the moves when the source isn't already in the target + /// return register, to avoid a redundant `addi`. 
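+    ///
+    /// e.g. a body ending in `i64.const 0x1_0000_0002` returns with
+    /// a0 = 2 (lo) and a1 = 1 (hi).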
 fn emit_return_epilogue(&mut self) {
-        if let Some(&top) = self.vstack.last()
-            && top != Reg::A0
-        {
-            self.out.push(RiscVOp::Addi {
-                rd: Reg::A0,
-                rs1: top,
-                imm: 0,
-            });
+        if let Some(top) = self.vstack.last().copied() {
+            match top {
+                VstackVal::I32(r) => {
+                    if r != Reg::A0 {
+                        self.out.push(RiscVOp::Addi {
+                            rd: Reg::A0,
+                            rs1: r,
+                            imm: 0,
+                        });
+                    }
+                }
+                VstackVal::I64 { lo, hi } => {
+                    if lo != Reg::A0 {
+                        self.out.push(RiscVOp::Addi {
+                            rd: Reg::A0,
+                            rs1: lo,
+                            imm: 0,
+                        });
+                    }
+                    if hi != Reg::A1 {
+                        self.out.push(RiscVOp::Addi {
+                            rd: Reg::A1,
+                            rs1: hi,
+                            imm: 0,
+                        });
+                    }
+                }
+            }
         }
         self.out.push(RiscVOp::Jalr {
             rd: Reg::ZERO,
@@ -737,6 +893,265 @@
             imm: 0,
         });
     }
+
+    // ────────── i64 lowerings (Phase 1) ──────────
+    //
+    // On RV32 an i64 lives in a register pair `(lo, hi)` per the convention
+    // in the module docs. Each operation pops one or two pairs from vstack,
+    // emits the equivalent sequence on the 32-bit halves, and pushes the
+    // result (either as another i64 pair or as an i32 0/1 for comparisons).
+
+    /// 64-bit add with carry: `lo = al+bl ; carry = (lo < bl) ; hi = ah+bh+carry`.
+    fn lower_i64_add(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let ((al, ah), (bl, bh)) = self.pop_pair_i64(op)?;
+        let lo = self.alloc_temp();
+        let carry = self.alloc_temp();
+        let hi_sum = self.alloc_temp();
+        let hi = self.alloc_temp();
+        // lo = al + bl
+        self.out.push(RiscVOp::Add {
+            rd: lo,
+            rs1: al,
+            rs2: bl,
+        });
+        // carry = (lo < bl): the 32-bit sum wrapped iff it ended up below
+        // one of its addends.
+        self.out.push(RiscVOp::Sltu {
+            rd: carry,
+            rs1: lo,
+            rs2: bl,
+        });
+        // hi_sum = ah + bh
+        self.out.push(RiscVOp::Add {
+            rd: hi_sum,
+            rs1: ah,
+            rs2: bh,
+        });
+        // hi = hi_sum + carry
+        self.out.push(RiscVOp::Add {
+            rd: hi,
+            rs1: hi_sum,
+            rs2: carry,
+        });
+        self.push_i64(lo, hi);
+        Ok(())
+    }
+
+    /// 64-bit sub with borrow: `borrow = (al < bl) ; lo = al-bl ; hi = ah-bh-borrow`.
+    fn lower_i64_sub(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let ((al, ah), (bl, bh)) = self.pop_pair_i64(op)?;
+        let borrow = self.alloc_temp();
+        let lo = self.alloc_temp();
+        let hi_diff = self.alloc_temp();
+        let hi = self.alloc_temp();
+        // borrow = (al < bl): the low subtraction will wrap iff al < bl.
+        self.out.push(RiscVOp::Sltu {
+            rd: borrow,
+            rs1: al,
+            rs2: bl,
+        });
+        // lo = al - bl
+        self.out.push(RiscVOp::Sub {
+            rd: lo,
+            rs1: al,
+            rs2: bl,
+        });
+        // hi_diff = ah - bh
+        self.out.push(RiscVOp::Sub {
+            rd: hi_diff,
+            rs1: ah,
+            rs2: bh,
+        });
+        // hi = hi_diff - borrow
+        self.out.push(RiscVOp::Sub {
+            rd: hi,
+            rs1: hi_diff,
+            rs2: borrow,
+        });
+        self.push_i64(lo, hi);
+        Ok(())
+    }
+
+    /// Pairwise bitwise op: `build` is applied once to the lo halves and
+    /// once to the hi halves.
+    fn lower_i64_bitwise<F>(&mut self, op: &WasmOp, build: F) -> Result<(), SelectorError>
+    where
+        F: Fn(Reg, Reg, Reg) -> RiscVOp,
+    {
+        let ((al, ah), (bl, bh)) = self.pop_pair_i64(op)?;
+        let lo = self.alloc_temp();
+        let hi = self.alloc_temp();
+        self.out.push(build(lo, al, bl));
+        self.out.push(build(hi, ah, bh));
+        self.push_i64(lo, hi);
+        Ok(())
+    }
+
+    /// i64 eq / ne — diff both halves, or them together, then sltiu / sltu.
+    /// Result is an i32 0/1 value.
+    fn lower_i64_eq(&mut self, op: &WasmOp, invert: bool) -> Result<(), SelectorError> {
+        let ((al, ah), (bl, bh)) = self.pop_pair_i64(op)?;
+        let d_lo = self.alloc_temp();
+        let d_hi = self.alloc_temp();
+        let d = self.alloc_temp();
+        let dst = self.alloc_temp();
+        // d_lo = al ^ bl ; d_hi = ah ^ bh ; d = d_lo | d_hi → 0 iff equal
+        self.out.push(RiscVOp::Xor {
+            rd: d_lo,
+            rs1: al,
+            rs2: bl,
+        });
+        self.out.push(RiscVOp::Xor {
+            rd: d_hi,
+            rs1: ah,
+            rs2: bh,
+        });
+        self.out.push(RiscVOp::Or {
+            rd: d,
+            rs1: d_lo,
+            rs2: d_hi,
+        });
+        if invert {
+            // ne: dst = (0 < d) → 1 iff any half differs
+            self.out.push(RiscVOp::Sltu {
+                rd: dst,
+                rs1: Reg::ZERO,
+                rs2: d,
+            });
+        } else {
+            // eq: dst = (d < 1) → 1 iff d == 0
+            self.out.push(RiscVOp::Sltiu {
+                rd: dst,
+                rs1: d,
+                imm: 1,
+            });
+        }
+        self.push_i32(dst);
+        Ok(())
+    }
+
+    /// i64 eqz: or the halves together, then sltiu → i32 0/1.
+    fn lower_i64_eqz(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let (lo, hi) = self.pop_i64(op)?;
+        let d = self.alloc_temp();
+        let dst = self.alloc_temp();
+        // d = lo | hi → 0 iff both halves are zero
+        self.out.push(RiscVOp::Or {
+            rd: d,
+            rs1: lo,
+            rs2: hi,
+        });
+        // dst = (d < 1) → 1 iff d == 0
+        self.out.push(RiscVOp::Sltiu {
+            rd: dst,
+            rs1: d,
+            imm: 1,
+        });
+        self.push_i32(dst);
+        Ok(())
+    }
+
+    /// Zero-extend i32 → i64: lo = src, hi = 0.
+    fn lower_i64_extend_i32_u(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let src = self.pop_i32(op)?;
+        let hi = self.alloc_temp();
+        // hi = 0 (via `addi hi, zero, 0` — no dedicated li-zero op).
+        self.out.push(RiscVOp::Addi {
+            rd: hi,
+            rs1: Reg::ZERO,
+            imm: 0,
+        });
+        self.push_i64(src, hi);
+        Ok(())
+    }
+
+    /// Sign-extend i32 → i64: lo = src, hi = sra(src, 31).
+    /// SRA by 31 produces all-ones when src is negative, all-zeros otherwise.
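+    /// e.g. src = 5 gives (lo, hi) = (5, 0); src = -5 gives
+    /// (lo, hi) = (0xFFFF_FFFB, 0xFFFF_FFFF), the i64 encoding of -5.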
+    fn lower_i64_extend_i32_s(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let src = self.pop_i32(op)?;
+        let hi = self.alloc_temp();
+        self.out.push(RiscVOp::Srai {
+            rd: hi,
+            rs1: src,
+            shamt: 31,
+        });
+        self.push_i64(src, hi);
+        Ok(())
+    }
+
+    /// Wrap i64 → i32: keep lo, drop hi. No instructions emitted — the lo
+    /// register simply continues to live on the value stack as an i32.
+    fn lower_i32_wrap_i64(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let (lo, _hi) = self.pop_i64(op)?;
+        self.push_i32(lo);
+        Ok(())
+    }
+
+    /// i64 load: two word-loads at `offset` and `offset+4`. Little-endian
+    /// memory layout, matching wasm's spec (lo at lower address).
+    fn lower_i64_load(&mut self, op: &WasmOp, offset: u32) -> Result<(), SelectorError> {
+        let addr = self.pop_i32(op)?;
+        let tmp = self.alloc_temp();
+        let lo = self.alloc_temp();
+        let hi = self.alloc_temp();
+        // Single address calculation reused for both word loads.
+        self.out.push(RiscVOp::Add {
+            rd: tmp,
+            rs1: LINEAR_MEM_BASE,
+            rs2: addr,
+        });
+        let imm_lo = offset_to_imm(offset)?;
+        // The high word lives at `offset + 4`; check the same imm12 range.
+        let imm_hi = offset_to_imm(offset.checked_add(4).ok_or(
+            SelectorError::ImmediateTooLarge {
+                value: offset as i64 + 4,
+                context: "i64 load high-word offset",
+            },
+        )?)?;
+        self.out.push(RiscVOp::Lw {
+            rd: lo,
+            rs1: tmp,
+            imm: imm_lo,
+        });
+        self.out.push(RiscVOp::Lw {
+            rd: hi,
+            rs1: tmp,
+            imm: imm_hi,
+        });
+        self.push_i64(lo, hi);
+        Ok(())
+    }
+
+    /// i64 store: pops value (i64) and addr (i32), then two `sw` at offset
+    /// and offset+4. Little-endian, matching wasm's spec.
+    fn lower_i64_store(&mut self, op: &WasmOp, offset: u32) -> Result<(), SelectorError> {
+        let (lo, hi) = self.pop_i64(op)?;
+        let addr = self.pop_i32(op)?;
+        let tmp = self.alloc_temp();
+        self.out.push(RiscVOp::Add {
+            rd: tmp,
+            rs1: LINEAR_MEM_BASE,
+            rs2: addr,
+        });
+        let imm_lo = offset_to_imm(offset)?;
+        let imm_hi = offset_to_imm(offset.checked_add(4).ok_or(
+            SelectorError::ImmediateTooLarge {
+                value: offset as i64 + 4,
+                context: "i64 store high-word offset",
+            },
+        )?)?;
+        self.out.push(RiscVOp::Sw {
+            rs1: tmp,
+            rs2: lo,
+            imm: imm_lo,
+        });
+        self.out.push(RiscVOp::Sw {
+            rs1: tmp,
+            rs2: hi,
+            imm: imm_hi,
+        });
+        Ok(())
+    }
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -1259,4 +1674,282 @@ mod tests {
         );
         assert!(matches!(r, Err(SelectorError::ImmediateTooLarge { .. })));
     }
+
+    // ──────────── i64 Phase-1 tests ────────────
+    //
+    // These tests assert the *shape* of the emitted sequence (op counts, kinds,
+    // and select fields), which is the right granularity for a selector — we
+    // don't want to over-pin register allocation choices.
+
+    /// Helper: build the op sequence for an i64 test scenario. Appends an
+    /// `End` so the function epilogue is emitted; tests work with the full
+    /// output. `num_params` controls how many arg registers are available
+    /// (use 1+ for sequences that contain `LocalGet`).
+    fn run_i64_with_params(seq: &[WasmOp], num_params: u32) -> Vec<RiscVOp> {
+        let mut full = seq.to_vec();
+        full.push(WasmOp::End);
+        s(&full, num_params)
+    }
+
+    /// Most i64 tests use only consts → no LocalGet → no arg regs needed.
+    fn run_i64(seq: &[WasmOp]) -> Vec<RiscVOp> {
+        run_i64_with_params(seq, 0)
+    }
+
+    #[test]
+    fn i64_const_emits_two_load_imm_sequences() {
+        // I64Const(0x1_0000_0001) → lo = 1, hi = 1. Each half goes through
+        // emit_load_imm (here both fit in the addi short path), giving 2
+        // Addi-from-ZERO ops.
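+        // In asm terms: `addi lo, zero, 1 ; addi hi, zero, 1` (both halves
+        // of 0x1_0000_0001 are 1, and both fit the 12-bit immediate).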
+        let out = run_i64(&[WasmOp::I64Const(0x1_0000_0001), WasmOp::Drop]);
+        let imm_loads = count(&out, |op| {
+            matches!(op, RiscVOp::Addi { rs1: Reg::ZERO, .. })
+        });
+        // 2 for the i64 const (lo + hi), nothing else (Drop emits no code).
+        assert_eq!(imm_loads, 2);
+    }
+
+    #[test]
+    fn i64_add_emits_add_sltu_add_add_pattern() {
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Add,
+            WasmOp::Drop,
+        ]);
+        // Skip past the const-materialization Addi/Lui ops; everything from
+        // the first Add onward is the I64Add sequence, with the epilogue's
+        // Jalr trailing it.
+        let from_add: Vec<&RiscVOp> = out
+            .iter()
+            .skip_while(|o| !matches!(o, RiscVOp::Add { .. }))
+            .collect();
+        // Expected: Add, Sltu, Add, Add, then function epilogue's Jalr.
+        assert!(matches!(from_add[0], RiscVOp::Add { .. }));
+        assert!(matches!(from_add[1], RiscVOp::Sltu { .. }));
+        assert!(matches!(from_add[2], RiscVOp::Add { .. }));
+        assert!(matches!(from_add[3], RiscVOp::Add { .. }));
+    }
+
+    #[test]
+    fn i64_sub_emits_borrow_pattern() {
+        let out = run_i64(&[
+            WasmOp::I64Const(10),
+            WasmOp::I64Const(3),
+            WasmOp::I64Sub,
+            WasmOp::Drop,
+        ]);
+        // Expected: Sltu (borrow), Sub (lo), Sub (hi diff), Sub (hi - borrow)
+        let from_sub: Vec<&RiscVOp> = out
+            .iter()
+            .skip_while(|o| !matches!(o, RiscVOp::Sltu { .. }))
+            .collect();
+        assert!(matches!(from_sub[0], RiscVOp::Sltu { .. }));
+        assert!(matches!(from_sub[1], RiscVOp::Sub { .. }));
+        assert!(matches!(from_sub[2], RiscVOp::Sub { .. }));
+        assert!(matches!(from_sub[3], RiscVOp::Sub { .. }));
+    }
+
+    #[test]
+    fn i64_and_or_xor_each_emit_two_ops() {
+        // I64And: two And ops on lo/hi
+        let out_and = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64And,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out_and, |op| matches!(op, RiscVOp::And { .. })),
+            2,
+            "I64And should emit 2 And ops"
+        );
+
+        let out_or = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Or,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out_or, |op| matches!(op, RiscVOp::Or { .. })),
+            2,
+            "I64Or should emit 2 Or ops"
+        );
+
+        let out_xor = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Xor,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out_xor, |op| matches!(op, RiscVOp::Xor { .. })),
+            2,
+            "I64Xor should emit 2 Xor ops"
+        );
+    }
+
+    #[test]
+    fn i64_eq_emits_xor_xor_or_sltiu() {
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Eq,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Xor { .. })),
+            2,
+            "I64Eq emits two Xors (one per half)"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Or { .. })),
+            1,
+            "I64Eq ors the half-diffs together"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })),
+            1,
+            "I64Eq compares the combined diff with 1 (sltiu)"
+        );
+    }
+
+    #[test]
+    fn i64_ne_emits_xor_xor_or_sltu() {
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Ne,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Xor { .. })), 2);
+        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Or { .. })), 1);
+        assert_eq!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Sltu { rs1: Reg::ZERO, .. }
+            )),
+            1,
+            "I64Ne uses sltu rd, zero, diff"
+        );
+    }
+
+    #[test]
+    fn i64_eqz_emits_or_sltiu() {
+        // I64Eqz pops i64 and pushes i32 — verify by checking we don't get a
+        // StackTypeMismatch on a subsequent i32 consumer.
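+        // Expected shape: `or d, lo, hi ; sltiu dst, d, 1` from the I64Eqz,
+        // then a second `sltiu` from the I32Eqz below.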
+        let out = run_i64(&[
+            WasmOp::I64Const(0),
+            WasmOp::I64Eqz,
+            // After Eqz the stack should hold an i32; an I32Eqz consumer
+            // confirms the type-state on the vstack.
+            WasmOp::I32Eqz,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Or { .. })),
+            1,
+            "I64Eqz emits a single Or"
+        );
+        // Two Sltiu(imm=1): one from I64Eqz, one from the I32Eqz that follows.
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })),
+            2,
+        );
+    }
+
+    #[test]
+    fn i64_extend_i32_u_pushes_zero_hi() {
+        // I64ExtendI32U: hi = 0 (via `addi rd, ZERO, 0`). The presence of an
+        // extra Addi-from-ZERO with imm=0 is the giveaway.
+        let out = run_i64(&[
+            WasmOp::I32Const(5), // small enough to take the addi short path
+            WasmOp::I64ExtendI32U,
+            WasmOp::Drop,
+        ]);
+        // We get: addi (i32const 5), addi (hi=0). Both have rs1=ZERO; the
+        // hi-zero load uses imm=0.
+        assert!(
+            out.iter().any(|op| matches!(
+                op,
+                RiscVOp::Addi {
+                    rs1: Reg::ZERO,
+                    imm: 0,
+                    ..
+                }
+            )),
+            "expected addi rd, zero, 0 to zero the hi half"
+        );
+    }
+
+    #[test]
+    fn i64_extend_i32_s_emits_sra_31() {
+        let out = run_i64(&[WasmOp::I32Const(5), WasmOp::I64ExtendI32S, WasmOp::Drop]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })),
+            1,
+            "I64ExtendI32S uses srai by 31 to sign-extend"
+        );
+    }
+
+    #[test]
+    fn i32_wrap_i64_drops_hi() {
+        // I32WrapI64 emits zero new instructions; the lo half just continues
+        // to live on the value stack as an i32. We verify the op count is
+        // exactly what the surrounding const+drop emits, with no leftover.
+        let baseline = run_i64(&[WasmOp::I64Const(42), WasmOp::Drop]);
+        let with_wrap = run_i64(&[WasmOp::I64Const(42), WasmOp::I32WrapI64, WasmOp::Drop]);
+        assert_eq!(
+            baseline.len(),
+            with_wrap.len(),
+            "I32WrapI64 must not emit any instructions"
+        );
+    }
+
+    #[test]
+    fn i64_load_emits_two_lw_at_offset_and_offset_plus_4() {
+        let out = run_i64_with_params(
+            &[
+                WasmOp::LocalGet(0), // address (treated as i32)
+                WasmOp::I64Load {
+                    offset: 16,
+                    align: 3,
+                },
+                WasmOp::Drop,
+            ],
+            1,
+        );
+        // We expect two Lw ops with imms 16 and 20.
+        let lws: Vec<i32> = out
+            .iter()
+            .filter_map(|op| match op {
+                RiscVOp::Lw { imm, .. } => Some(*imm),
+                _ => None,
+            })
+            .collect();
+        assert_eq!(lws, vec![16, 20], "I64Load emits lw @offset and @offset+4");
+    }
+
+    #[test]
+    fn i64_store_emits_two_sw_at_offset_and_offset_plus_4() {
+        let out = run_i64_with_params(
+            &[
+                WasmOp::LocalGet(0), // address
+                WasmOp::I64Const(0xDEADBEEF_CAFEBABE_u64 as i64),
+                WasmOp::I64Store {
+                    offset: 8,
+                    align: 3,
+                },
+            ],
+            1,
+        );
+        let sws: Vec<i32> = out
+            .iter()
+            .filter_map(|op| match op {
+                RiscVOp::Sw { imm, .. } => Some(*imm),
+                _ => None,
+            })
+            .collect();
+        assert_eq!(sws, vec![8, 12], "I64Store emits sw @offset and @offset+4");
+    }
 }