diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml
index a52e91693..1bae6b290 100644
--- a/.github/actions/multi-functest/action.yml
+++ b/.github/actions/multi-functest/action.yml
@@ -310,7 +310,10 @@ runs:
           nix-verbose: ${{ inputs.nix-verbose }}
           gh_token: ${{ inputs.gh_token }}
           custom_shell: ${{ inputs.custom_shell }}
-          cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32"
+          # The RV32-IM arithmetic backend is experimental and not picked
+          # up by native/meta.h's defaults; select it explicitly here.
+          # No-op for OPT=0 builds (MLD_CONFIG_ARITH_BACKEND_FILE is unused).
+          cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32 -DMLD_CONFIG_ARITH_BACKEND_FILE=\\\\\\\"native/rv32im/meta.h\\\\\\\""
           ldflags: ${{ inputs.ldflags }}
           cross_prefix: riscv32-unknown-linux-gnu-
           exec_wrapper: qemu-riscv32
@@ -327,4 +330,3 @@ runs:
           rng_fail: ${{ inputs.rng_fail }}
           extra_args: ${{ inputs.extra_args }}
           extra_env: ${{ inputs.extra_env }}
-
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3e89855ed..9c5f0bb1b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -155,8 +155,8 @@ jobs:
           check_namespace: 'false'
       - name: build + test (cross, opt)
         uses: ./.github/actions/multi-functest
-        # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests
-        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }}
+        # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests
+        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }}
         with:
           nix-shell: ${{ matrix.target.nix_shell }}
           nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }}
@@ -165,8 +165,8 @@ jobs:
           opt: 'opt'
       - name: build + test (cross, opt, +debug)
         uses: ./.github/actions/multi-functest
-        # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests
-        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }}
+        # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests
+        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }}
         with:
           nix-shell: ${{ matrix.target.nix_shell }}
           nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }}
@@ -846,7 +846,7 @@ jobs:
           - system: macos-latest
             nix_cache: 'true'
             nix_shell: 'hol_light-cross-x86_64'
-            extra_args: '--force-cross'
+            extra_args: '--force-cross aarch64 x86_64'
           # TODO: autogen does not yet work on macos15-intel (#1304)
           # - system: macos-15-intel
           #   nix_cache: 'false'
@@ -854,11 +854,11 @@ jobs:
           - system: ubuntu-latest
             nix_shell: 'hol_light-cross-aarch64'
             nix_cache: 'true'
-            extra_args: '--force-cross'
+            extra_args: '--force-cross aarch64 x86_64'
           - system: ubuntu-24.04-arm
             nix_shell: 'hol_light-cross-x86_64'
             nix_cache: 'true'
-            extra_args: '--force-cross'
+            extra_args: '--force-cross aarch64 x86_64'
     runs-on: ${{ matrix.target.system }}
     name: Check object code in HOL-Light proofs
     steps:
diff --git a/README.md b/README.md
index 56a16630d..1a0ad4618 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ mldsa-native allows developers to support ML-DSA with minimal performance and ma
 
 **Maintainability and Safety:** Memory safety, type safety and absence of various classes of timing leakage are automatically checked on every change, using a combination of static model checking (using CBMC) and dynamic instrumentation (using valgrind). This reduces review and maintenance burden and accelerates safe code delivery. See [Formal Verification](#formal-verification) and [Security](#security).
 
-**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64. See [Design](#design).
+**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64, and experimental backends for Armv8.1-M and RV32-IM. See [Design](#design).
 
 ## Quickstart for Ubuntu
 
@@ -92,6 +92,7 @@ mldsa-native currently offers the following backends:
 * 64-bit Arm backend (using Neon)
 * 64-bit Intel/AMD backend (using AVX2)
 * 32-bit Armv8.1-M backend (using Helium/MVE). This is still experimental and disabled by default.
+* 32-bit RISC-V backend (RV32-IM, base integer + M-extension only). This is still experimental and disabled by default.
 
 If you'd like contribute new backends, please reach out!
 
diff --git a/dev/riscv32/meta.h b/dev/riscv32/meta.h
new file mode 100644
index 000000000..a83cd62c1
--- /dev/null
+++ b/dev/riscv32/meta.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_META_H
+#define MLD_NATIVE_RV32IM_META_H
+
+/* Set of primitives that this backend replaces */
+#define MLD_USE_NATIVE_NTT
+#define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+
+/* Identifier for this backend so that source and assembly files
+ * in the build can be appropriately guarded. */
+#define MLD_ARITH_BACKEND_RV32IM
+
+
+#if !defined(__ASSEMBLER__)
+#include "../api.h"
+#include "src/arith_native_rv32im.h"
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
+{
+  mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
+{
+  mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
+    int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
+{
+  mld_poly_pointwise_montgomery_rv32im_asm(a, b);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+#endif /* !__ASSEMBLER__ */
+#endif /* !MLD_NATIVE_RV32IM_META_H */
diff --git a/dev/riscv32/src/arith_native_rv32im.h b/dev/riscv32/src/arith_native_rv32im.h
new file mode 100644
index 000000000..03da705ad
--- /dev/null
+++ b/dev/riscv32/src/arith_native_rv32im.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+
+#include "../../../cbmc.h"
+#include "../../../common.h"
+
+#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas)
+
+/*
+ * Zeta table for the RV32-IM backend, shared by the forward and inverse NTT.
+ *
+ * 255 logical entries (510 words), each a (zeta, zeta * QINV mod 2^32) pair,
+ * with zeta in Montgomery form (i.e. R * w^{bitrev_8(k)} mod q, R = 2^32).
+ * The order matches the consumption order of the 2+2+2+2 forward NTT.
+ */
+MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510];
+
+#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm)
+void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+  requires(zetas == mld_rv32im_ntt_zetas)
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, 9 * MLDSA_Q))
+);
+
+#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm)
+void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+  requires(zetas == mld_rv32im_ntt_zetas)
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+);
+
+#define mld_poly_pointwise_montgomery_rv32im_asm \
+  MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b)
+__contract__(
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N))
+  /* check-magic: off */
+  requires(array_abs_bound(a, 0, MLDSA_N, 75423753))   /* MLD_NTT_BOUND */
+  requires(array_abs_bound(b, 0, MLDSA_N, 75423753))
+  /* check-magic: on */
+  assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q))
+);
+
+#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */
diff --git a/dev/riscv32/src/intt_rv32im_asm.S b/dev/riscv32/src/intt_rv32im_asm.S
new file mode 100644
index 000000000..04a7f6c68
--- /dev/null
+++ b/dev/riscv32/src/intt_rv32im_asm.S
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT.
+ *
+ * Layered structure: 2+2+2+2 (mirror of the forward NTT, with passes
+ * applied in reverse layer order). Each pass merges two C-layers into a
+ * radix-4 inner kernel that holds 4 coefficients in registers.
+ *
+ *   inv-pass-1: C-layers 8, 7   (inner stride =  4 B,  64 outer iters)
+ *   inv-pass-2: C-layers 6, 5   (inner stride = 16 B,  16 outer iters)
+ *   inv-pass-3: C-layers 4, 3   (inner stride = 64 B,   4 outer iters)
+ *   inv-pass-4: C-layers 2, 1   (inner stride = 256 B,  1 outer iter )
+ *
+ * Twiddles: this routine reuses `mld_rv32im_ntt_zetas` (the forward-NTT
+ * table). The forward pass-(5-k) consumes its 3*N_outer pairs in
+ * outer order 0,1,...,N-1; the inv pass-k requires the *same* zetas but
+ * in reverse outer order, with the two "hi" zetas swapped. We implement
+ * this by initializing zeta_ptr at the end of each pass region and
+ * subtracting 24 bytes per outer iter; within the iter the lo zeta is
+ * read from offset 0 and the hi zetas from offsets 8/16 swapped via the
+ * GS kernel argument order. The negation that the C reference applies
+ * (`-mld_zetas[k]`) is absorbed by the GS butterfly form
+ *      a' = a + b
+ *      b' = montmul(b - a, +zeta)
+ * which produces the same result as the canonical
+ *      t  = a; a' = t + b; b' = montmul(t - b, -zeta).
+ *
+ * Modular arithmetic: standard signed Montgomery (3-mul kernel
+ *   m = low(a*z'), r = hi(a*z) - hi(m*q)
+ * ), matching the forward NTT.
+ *
+ * Final scaling: after the four passes, every coefficient is multiplied
+ * by  f = 41978 = 2^{64-8} mod q  (Montgomery-form, accounts for both
+ * 2^{-8} of the inverse NTT and the 2^32 left over from intermediate
+ * reductions). Implemented as a simple post-loop.
+ *
+ * Bounds (after each inv-pass):
+ *
+ *   start                       :  |coef| < q          (= 1*q)
+ *   after inv-pass-1 (C-L 8,7)  :  |coef| < 4*q
+ *   after inv-pass-2 (C-L 6,5)  :  |coef| < 16*q
+ *   after inv-pass-3 (C-L 4,3)  :  |coef| < 64*q
+ *   after inv-pass-4 (C-L 2,1)  :  |coef| < 256*q   (~ 2^31, fits int32)
+ *   after final fqscale         :  |coef| < q
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+/* simpasm: header-end */
+
+/*****************************************************************
+ * Register aliases
+ *****************************************************************/
+
+/* Arguments */
+#define in_ptr      a0
+#define zeta_ptr    a1
+
+/* Working pointers / counters */
+#define data        t2
+#define outer_end   t3
+#define inner_end   t4
+#define scale_end   t5      /* end pointer for final-scaling loop  */
+
+/* Coefficient registers */
+#define ca          a2
+#define cb          a3
+#define cc          a4
+#define cd          a5
+
+/* Butterfly temporaries */
+#define tmp0        a6
+#define tmp1        a7
+
+/* Loaded zeta pair registers */
+#define zeta_lo     s0
+#define zeta_lo_tw  s1
+#define zeta_h0     s2
+#define zeta_h0_tw  s3
+#define zeta_h1     s4
+#define zeta_h1_tw  s5
+
+/* Constants */
+#define q           s6      /* MLDSA_Q = 8380417            */
+#define f           s7      /* fqscale: 41978               */
+#define f_tw        s8      /* fqscale * QINV mod 2^32      */
+
+/*****************************************************************
+ * Macros
+ *****************************************************************/
+
+/* montmul rd, ra, rb, rb_tw, rt :
+ *
+ *   rd = (ra * rb) * R^-1 mod q  (signed Montgomery, R = 2^32), |rd| < q.
+ *   rb_tw is the twist (rb * QINV) mod 2^32; `q` must hold MLDSA_Q.
+ *   Clobbers: rt (written first, so it must differ from rd, ra and rb).
+ */
+.macro montmul rd, ra, rb, rb_tw, rt
+        mul   \rt, \ra, \rb_tw    /* m = low(ra * rb_tw)       */
+        mulh  \rd, \ra, \rb       /* hi(ra * rb)               */
+        mulh  \rt, \rt, q         /* hi(m * q)                 */
+        sub   \rd, \rd, \rt
+.endm
+
+/* gs_bfly ra, rb, rzeta, rzeta_tw, rt0, rt1 :
+ *
+ *   t  = rb - ra
+ *   ra = ra + rb
+ *   rb = montmul(t, +rzeta)
+ *
+ * Gentleman-Sande butterfly. The sum output satisfies
+ * |ra'| <= |ra| + |rb| (at most twice the larger input bound), while
+ * the product output satisfies |rb'| < q (Montgomery bound).
+ *
+ * The algebraic equivalence with the C reference's
+ *      t = ra; ra = t + rb; rb = montmul(t - rb, -zeta)
+ * follows from
+ *      montmul(t - rb, -zeta) = -montmul(t - rb, +zeta)
+ *                             = montmul(rb - t, +zeta)
+ *                             = montmul(rb - ra, +zeta)        (t == ra)
+ * which is what this macro computes. This lets us reuse the
+ * (un-negated) forward-NTT zeta table.
+ *
+ * Clobbers: rt0, rt1.
+ */
+.macro gs_bfly ra, rb, rzeta, rzeta_tw, rt0, rt1
+        sub  \rt0, \rb, \ra
+        add  \ra,  \ra, \rb
+        montmul \rb, \rt0, \rzeta, \rzeta_tw, \rt1
+.endm
+
+/* gs_radix4 stride (in bytes) :
+ *
+ * Loads the four coefficients at byte offsets [0, s, 2s, 3s] from `data`,
+ * runs two layers of GS butterflies with the currently loaded zetas, and
+ * stores them back in place. Clobbers: ca, cb, cc, cd, tmp0, tmp1.
+ *
+ * Within a single inv-pass:
+ *   - "Inner" layer (the smaller-stride C-layer, run first) pairs
+ *     (a,b) and (c,d). The C reference uses two distinct zetas here
+ *     (k = (1<<L_in)-1-2o and (1<<L_in)-2-2o), which appear in our
+ *     table in fwd order as (h0, h1). With the cursor walked
+ *     backward, position offsets remain (h0=8, h1=16); the inv
+ *     consumption order swaps them: (a,b) gets h1, (c,d) gets h0.
+ *   - "Outer" layer (the larger-stride C-layer, run second) pairs
+ *     (a,c) and (b,d) with a single shared zeta = lo.
+ */
+.macro gs_radix4 stride
+        lw   ca, 0(data)
+        lw   cb, (1*\stride)(data)
+        lw   cc, (2*\stride)(data)
+        lw   cd, (3*\stride)(data)
+
+        /* Inner C-layer (smaller stride): (a,b) gets h1, (c,d) gets h0. */
+        gs_bfly ca, cb, zeta_h1, zeta_h1_tw, tmp0, tmp1
+        gs_bfly cc, cd, zeta_h0, zeta_h0_tw, tmp0, tmp1
+
+        /* Outer C-layer (larger stride): (a,c) and (b,d), shared lo. */
+        gs_bfly ca, cc, zeta_lo, zeta_lo_tw, tmp0, tmp1
+        gs_bfly cb, cd, zeta_lo, zeta_lo_tw, tmp0, tmp1
+
+        sw   ca, 0(data)
+        sw   cb, (1*\stride)(data)
+        sw   cc, (2*\stride)(data)
+        sw   cd, (3*\stride)(data)
+.endm
+
+/* load_outer_zetas_rev:
+ *
+ *   zeta_ptr -= 24
+ *   load (lo, lo_tw, h0, h0_tw, h1, h1_tw) from [zeta_ptr+0..+23]
+ *
+ * Walks the forward-NTT zeta table backward, one outer iteration's
+ * three (zeta, twist) pairs (3 * 8 = 24 bytes) per invocation.
+ */
+.macro load_outer_zetas_rev
+        addi zeta_ptr, zeta_ptr, -24
+        lw   zeta_lo,    0(zeta_ptr)
+        lw   zeta_lo_tw, 4(zeta_ptr)
+        lw   zeta_h0,    8(zeta_ptr)
+        lw   zeta_h0_tw, 12(zeta_ptr)
+        lw   zeta_h1,    16(zeta_ptr)
+        lw   zeta_h1_tw, 20(zeta_ptr)
+.endm
+
+/* Save/restore callee-saved s0..s8; 48 B frame keeps sp 16-byte aligned. */
+.macro save_regs
+        addi sp, sp, -48
+        sw   s0,  0(sp)
+        sw   s1,  4(sp)
+        sw   s2,  8(sp)
+        sw   s3, 12(sp)
+        sw   s4, 16(sp)
+        sw   s5, 20(sp)
+        sw   s6, 24(sp)
+        sw   s7, 28(sp)
+        sw   s8, 32(sp)
+.endm
+
+.macro restore_regs
+        lw   s0,  0(sp)
+        lw   s1,  4(sp)
+        lw   s2,  8(sp)
+        lw   s3, 12(sp)
+        lw   s4, 16(sp)
+        lw   s5, 20(sp)
+        lw   s6, 24(sp)
+        lw   s7, 28(sp)
+        lw   s8, 32(sp)
+        addi sp, sp, 48
+.endm
+
+/*****************************************************************
+ * Function
+ *****************************************************************/
+
+        .text
+        .global MLD_ASM_NAMESPACE(intt_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(intt_rv32im_asm)
+
+        save_regs
+
+        /* q = 8380417 = 0x007FE001 = (0x7FE << 12) + 1 */
+        lui  q, 0x7FE
+        addi q, q, 1
+
+        /* Position zeta_ptr at the END of the table (one past last entry):
+         * 255 pairs = 510 int32 = 2040 bytes (within addi's imm range). */
+        addi zeta_ptr, zeta_ptr, 2040
+
+        /***************************************************
+         * inv-pass-1: C-layers 8, 7.
+         *   64 outer iters, 1 inner iter each, stride = 4 B.
+         *   Each outer iter handles 4 consecutive coefficients.
+         *
+         * Consumes fwd-pass-4's 64 zeta sets in reverse order.
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+intt_rv32im_p1_outer:
+        load_outer_zetas_rev
+        gs_radix4 4
+        addi data, data, 16
+        bne  data, outer_end, intt_rv32im_p1_outer
+
+        /***************************************************
+         * inv-pass-2: C-layers 6, 5.
+         *   16 outer iters, 4 inner iters each, stride = 16 B.
+         *   Each outer block is 64 B (= 16 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+intt_rv32im_p2_outer:
+        load_outer_zetas_rev
+        addi inner_end, data, 16          /* 4 * 4 B */
+intt_rv32im_p2_inner:
+        gs_radix4 16
+        addi data, data, 4
+        bne  data, inner_end, intt_rv32im_p2_inner
+        addi data, data, (64 - 16)        /* skip to next 64 B block */
+        bne  data, outer_end, intt_rv32im_p2_outer
+
+        /***************************************************
+         * inv-pass-3: C-layers 4, 3.
+         *   4 outer iters, 16 inner iters each, stride = 64 B.
+         *   Each outer block is 256 B (= 64 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+intt_rv32im_p3_outer:
+        load_outer_zetas_rev
+        addi inner_end, data, 64          /* 16 * 4 B */
+intt_rv32im_p3_inner:
+        gs_radix4 64
+        addi data, data, 4
+        bne  data, inner_end, intt_rv32im_p3_inner
+        addi data, data, (256 - 64)       /* skip to next 256 B block */
+        bne  data, outer_end, intt_rv32im_p3_outer
+
+        /***************************************************
+         * inv-pass-4: C-layers 2, 1.
+         *   1 outer iter, 64 inner iters, stride = 256 B.
+         ***************************************************/
+        load_outer_zetas_rev
+        mv   data, in_ptr
+        addi inner_end, in_ptr, 256       /* 64 * 4 B */
+intt_rv32im_p4_inner:
+        gs_radix4 256
+        addi data, data, 4
+        bne  data, inner_end, intt_rv32im_p4_inner
+
+        /***************************************************
+         * Final scaling: each coefficient *= 41978  (Montgomery).
+         *
+         * f    = 41978 = 2^{64-8} mod q (R^2 * 2^{-8}, with R = 2^32)
+         * f_tw = (f * QINV) mod 2^32
+         *      = -8395782 = 0xFF7FE3FA
+         *
+         * Neither constant fits a 12-bit immediate; GAS expands each
+         * `li` pseudoinstruction below into a lui+addi pair.
+         ***************************************************/
+        li   f,    41978
+        li   f_tw, -8395782              /* (41978 * QINV) mod 2^32 */
+
+        mv   data, in_ptr
+        addi scale_end, in_ptr, 1024
+intt_rv32im_scale:
+        lw   ca, 0(data)
+        montmul cb, ca, f, f_tw, tmp0
+        sw   cb, 0(data)
+        addi data, data, 4
+        bne  data, scale_end, intt_rv32im_scale
+
+        restore_regs
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef in_ptr
+#undef zeta_ptr
+#undef data
+#undef outer_end
+#undef inner_end
+#undef scale_end
+#undef ca
+#undef cb
+#undef cc
+#undef cd
+#undef tmp0
+#undef tmp1
+#undef zeta_lo
+#undef zeta_lo_tw
+#undef zeta_h0
+#undef zeta_h0_tw
+#undef zeta_h1
+#undef zeta_h1_tw
+#undef q
+#undef f
+#undef f_tw
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/dev/riscv32/src/ntt_rv32im_asm.S b/dev/riscv32/src/ntt_rv32im_asm.S
new file mode 100644
index 000000000..28de44e06
--- /dev/null
+++ b/dev/riscv32/src/ntt_rv32im_asm.S
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT.
+ *
+ * Layered structure: 2+2+2+2 (four passes, each merging two layers, with
+ * a radix-4 inner kernel holding 4 coefficients in registers).
+ *
+ * Modular arithmetic: standard signed Montgomery multiplication.
+ * Each zeta is provided in Montgomery form (R * w^{bitrev(k)} mod q,
+ * R = 2^32) along with its precomputed twist  z' = (z * QINV) mod 2^32,
+ * so a Montgomery multiply is 3 multiplies + 1 sub:
+ *
+ *   m  = low(a * z')
+ *   r  = hi(a * z) - hi(m * q)
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+/* simpasm: header-end */
+
+/*****************************************************************
+ * Register aliases (RV32 GAS lacks `.req`; use cpp #defines).
+ *****************************************************************/
+
+/* Arguments */
+#define in_ptr      a0      /* base of int32_t r[256]       */
+#define zeta_ptr    a1      /* zeta cursor                  */
+
+/* Working pointers / counters */
+#define data        t2      /* inner data cursor            */
+#define outer_end   t3      /* end address for outer loop   */
+#define inner_end   t4      /* end address for inner loop   */
+
+/* Coefficient registers (caller-saved) */
+#define ca          a2
+#define cb          a3
+#define cc          a4
+#define cd          a5
+
+/* Butterfly temporaries (caller-saved) */
+#define tmp0        a6
+#define tmp1        a7
+
+/* Loaded zeta pair registers (callee-saved; loaded once per outer iter,
+ * used across the inner loop). */
+#define zeta_lo     s0
+#define zeta_lo_tw  s1
+#define zeta_h0     s2
+#define zeta_h0_tw  s3
+#define zeta_h1     s4
+#define zeta_h1_tw  s5
+
+/* Constants */
+#define q           s6      /* MLDSA_Q = 8380417            */
+
+/*****************************************************************
+ * Macros
+ *****************************************************************/
+
+/* montmul rd, ra, rb, rb_tw, rt :
+ *
+ *   rd = (ra * rb) * R^-1 mod q  (signed Montgomery, R = 2^32).
+ *
+ * Uses the precomputed twist  rb_tw = (rb * QINV) mod 2^32  so that
+ *   (ra * rb * QINV) mod 2^32  ==  low(ra * rb_tw),
+ * giving the standard 3-mul kernel
+ *   m  = low(ra * rb_tw)
+ *   rd = hi(ra * rb) - hi(m * q).
+ *
+ * Reads the `q` register alias, which must already hold MLDSA_Q.
+ * Bound: |rd| < q (signed Montgomery bound). Clobbers: rt.
+ */
+.macro montmul rd, ra, rb, rb_tw, rt
+        mul   \rt, \ra, \rb_tw    /* low(ra * rb_tw) = m       */
+        mulh  \rd, \ra, \rb       /* hi(ra * rb)               */
+        mulh  \rt, \rt, q         /* hi(m * q)                 */
+        sub   \rd, \rd, \rt
+.endm
+
+/* ct_bfly ra, rb, rzeta, rzeta_tw, rt0, rt1 :
+ *
+ *   t  = montmul(rb, rzeta)
+ *   rb = ra - t
+ *   ra = ra + t
+ *
+ * Cooley-Tukey butterfly: |t| < q, so both outputs are bounded by |ra| + q.
+ * Clobbers: rt0, rt1.
+ */
+.macro ct_bfly ra, rb, rzeta, rzeta_tw, rt0, rt1
+        montmul \rt0, \rb, \rzeta, \rzeta_tw, \rt1
+        sub  \rb, \ra, \rt0
+        add  \ra, \ra, \rt0
+.endm
+
+/* radix4_kernel stride (in bytes):
+ * Loads four coefficients at byte offsets [0, s, 2s, 3s] from `data`, runs
+ * two CT layers with the loaded zeta pairs, and stores them back in place.
+ * Clobbers: ca, cb, cc, cd, tmp0, tmp1.
+ */
+.macro radix4_kernel stride
+        lw   ca, 0(data)
+        lw   cb, (1*\stride)(data)
+        lw   cc, (2*\stride)(data)
+        lw   cd, (3*\stride)(data)
+
+        /* "Lo" layer: pair (ca,cc) and (cb,cd), both with zeta_lo. */
+        ct_bfly ca, cc, zeta_lo, zeta_lo_tw, tmp0, tmp1
+        ct_bfly cb, cd, zeta_lo, zeta_lo_tw, tmp0, tmp1
+
+        /* "Hi" layer: (ca,cb) with zeta_h0, (cc,cd) with zeta_h1. */
+        ct_bfly ca, cb, zeta_h0, zeta_h0_tw, tmp0, tmp1
+        ct_bfly cc, cd, zeta_h1, zeta_h1_tw, tmp0, tmp1
+
+        sw   ca, 0(data)
+        sw   cb, (1*\stride)(data)
+        sw   cc, (2*\stride)(data)
+        sw   cd, (3*\stride)(data)
+.endm
+
+/* load_outer_zetas: load the 3 (zeta, twist) pairs (24 bytes) of one outer
+ * iteration from `zeta_ptr`, then advance `zeta_ptr` past them. */
+.macro load_outer_zetas
+        lw   zeta_lo,    0(zeta_ptr)
+        lw   zeta_lo_tw, 4(zeta_ptr)
+        lw   zeta_h0,    8(zeta_ptr)
+        lw   zeta_h0_tw, 12(zeta_ptr)
+        lw   zeta_h1,    16(zeta_ptr)
+        lw   zeta_h1_tw, 20(zeta_ptr)
+        addi zeta_ptr, zeta_ptr, 24
+.endm
+
+/* Save/restore callee-saved s0..s6; 32 B frame keeps sp 16-byte aligned. */
+.macro save_regs
+        addi sp, sp, -32
+        sw   s0,  0(sp)
+        sw   s1,  4(sp)
+        sw   s2,  8(sp)
+        sw   s3, 12(sp)
+        sw   s4, 16(sp)
+        sw   s5, 20(sp)
+        sw   s6, 24(sp)
+.endm
+
+.macro restore_regs
+        lw   s0,  0(sp)
+        lw   s1,  4(sp)
+        lw   s2,  8(sp)
+        lw   s3, 12(sp)
+        lw   s4, 16(sp)
+        lw   s5, 20(sp)
+        lw   s6, 24(sp)
+        addi sp, sp, 32
+.endm
+
+/*****************************************************************
+ * Function
+ *****************************************************************/
+
+        .text
+        .global MLD_ASM_NAMESPACE(ntt_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(ntt_rv32im_asm)
+
+        save_regs
+
+        /* q = 8380417 = 0x007FE001.
+         * lui writes its 20-bit immediate shifted left by 12, so lui q, 0x7FE
+         * yields 0x007FE000; addi q, q, 1 then adds the low 12 bits (0x001).
+         * (The addi immediate stays within the signed 12-bit range.) */
+        lui  q, 0x7FE
+        addi q, q, 1
+
+        /***************************************************
+         * Pass 1: C-layers 1, 2.
+         *   1 outer iter, 64 inner iters, butterfly stride = 256 B.
+         ***************************************************/
+        load_outer_zetas
+        mv   data, in_ptr
+        addi inner_end, in_ptr, 256       /* 64 * 4 B */
+ntt_rv32im_p1_loop:
+        radix4_kernel 256
+        addi data, data, 4
+        bne  data, inner_end, ntt_rv32im_p1_loop
+
+        /***************************************************
+         * Pass 2: C-layers 3, 4.
+         *   4 outer iters, 16 inner iters each, stride = 64 B.
+         *   Each outer block is 256 B (= 64 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+ntt_rv32im_p2_outer:
+        load_outer_zetas
+        addi inner_end, data, 64          /* 16 * 4 B */
+ntt_rv32im_p2_inner:
+        radix4_kernel 64
+        addi data, data, 4
+        bne  data, inner_end, ntt_rv32im_p2_inner
+        addi data, data, (256 - 64)       /* skip to next 256 B block */
+        bne  data, outer_end, ntt_rv32im_p2_outer
+
+        /***************************************************
+         * Pass 3: C-layers 5, 6.
+         *   16 outer iters, 4 inner iters each, stride = 16 B.
+         *   Each outer block is 64 B (= 16 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+ntt_rv32im_p3_outer:
+        load_outer_zetas
+        addi inner_end, data, 16          /* 4 * 4 B */
+ntt_rv32im_p3_inner:
+        radix4_kernel 16
+        addi data, data, 4
+        bne  data, inner_end, ntt_rv32im_p3_inner
+        addi data, data, (64 - 16)        /* skip to next 64 B block */
+        bne  data, outer_end, ntt_rv32im_p3_outer
+
+        /***************************************************
+         * Pass 4: C-layers 7, 8.
+         *   64 outer iters, 1 inner iter each, stride = 4 B.
+         *   Each outer iter handles 4 consecutive coefficients.
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+ntt_rv32im_p4_outer:
+        load_outer_zetas
+        radix4_kernel 4
+        addi data, data, 16
+        bne  data, outer_end, ntt_rv32im_p4_outer
+
+        restore_regs
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef in_ptr
+#undef zeta_ptr
+#undef data
+#undef outer_end
+#undef inner_end
+#undef ca
+#undef cb
+#undef cc
+#undef cd
+#undef tmp0
+#undef tmp1
+#undef zeta_lo
+#undef zeta_lo_tw
+#undef zeta_h0
+#undef zeta_h0_tw
+#undef zeta_h1
+#undef zeta_h1_tw
+#undef q
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S
new file mode 100644
index 000000000..4d51c9afc
--- /dev/null
+++ b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery
+ * reduction. Computes
+ *
+ *     a[i] = (a[i] * b[i]) * R^-1  mod q,    R = 2^32, |result| < q,
+ *
+ * for i in 0..256, in-place in a.
+ *
+ * Modular arithmetic: standard signed Montgomery reduction. Unlike the
+ * NTT, neither operand is constant, so we can't precompute a twisted
+ * form -- the kernel uses 4 multiplies per coefficient:
+ *
+ *     plo = low (a * b)            ; mul
+ *     m   = low (plo * QINV)       ; mul (low 32 of (plo * QINV))
+ *     phi = high(a * b)            ; mulh
+ *     mh  = high(m * q)            ; mulh
+ *     r   = phi - mh               ; sub
+ *
+ * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9*q. The product
+ * is bounded by (9q)^2 < 2^31 * q, well within the safe input range
+ * for `mld_montgomery_reduce` (which is |a| <= 2^31 * q).
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+/* simpasm: header-end */
+
+/*****************************************************************
+ * Register aliases
+ *****************************************************************/
+
+/* Arguments (each pointer is advanced by 4 bytes per loop iteration) */
+#define a_ptr       a0          /* in/out: a[], overwritten in place   */
+#define b_ptr       a1          /* in:     b[], only read              */
+
+/* Loop control */
+#define a_end       t0          /* end-of-array sentinel for a_ptr */
+
+/* Per-coef working set (caller-saved) */
+#define a_val       a2          /* a[i] when loaded; result when stored */
+#define b_val       a3          /* b[i]                                 */
+#define plo         a4          /* low  32 bits of a[i] * b[i]          */
+#define phi         a5          /* high 32 bits of a[i] * b[i] (signed) */
+#define mlo         a6          /* low  32 bits of plo * qinv           */
+#define mhi         a7          /* high 32 bits of mlo * q     (signed) */
+
+/* Constants (callee-saved; spilled to the stack by the function prologue) */
+#define q           s0          /* MLDSA_Q = 8380417           */
+#define qinv        s1          /* QINV    = 58728449          */
+
+/*****************************************************************
+ * Function
+ *****************************************************************/
+
+        .text
+        .global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm)
+        /* Save s0/s1. 16 B frame (8 B used) keeps sp 16-byte aligned (psABI). */
+        addi sp, sp, -16
+        sw   s0, 0(sp)
+        sw   s1, 4(sp)
+
+        /* q    = 0x007FE001 */
+        lui  q, 0x7FE
+        addi q, q, 1
+        /* qinv = 0x03802001 = 58728449
+         *   lui qinv, 0x3802; addi qinv, qinv, 1  -> 0x03802001 */
+        lui  qinv, 0x3802
+        addi qinv, qinv, 1
+
+        addi a_end, a_ptr, 1024     /* 256 * 4 bytes */
+
+poly_pointwise_montgomery_rv32im_loop:
+        lw   a_val, 0(a_ptr)
+        lw   b_val, 0(b_ptr)
+
+        /* Standard signed Montgomery reduction of a*b:
+         *   plo = (a*b)   low 32
+         *   mlo = plo*QINV low 32
+         *   phi = (a*b)   high 32  (signed)
+         *   mhi = mlo*q   high 32  (signed)
+         *   res = phi - mhi
+         */
+        mul   plo, a_val, b_val
+        mul   mlo, plo,   qinv
+        mulh  phi, a_val, b_val
+        mulh  mhi, mlo,   q
+        sub   a_val, phi, mhi
+
+        sw   a_val, 0(a_ptr)        /* result replaces a[i] */
+
+        addi a_ptr, a_ptr, 4
+        addi b_ptr, b_ptr, 4
+        bne  a_ptr, a_end, poly_pointwise_montgomery_rv32im_loop
+        /* Restore callee-saved registers and release the 16 B frame. */
+        lw   s0, 0(sp)
+        lw   s1, 4(sp)
+        addi sp, sp, 16
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef a_ptr
+#undef b_ptr
+#undef a_end
+#undef a_val
+#undef b_val
+#undef plo
+#undef phi
+#undef mlo
+#undef mhi
+#undef q
+#undef qinv
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/dev/riscv32/src/rv32im_zetas.c b/dev/riscv32/src/rv32im_zetas.c
new file mode 100644
index 000000000..6b7d67e2a
--- /dev/null
+++ b/dev/riscv32/src/rv32im_zetas.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "arith_native_rv32im.h"
+
+/*
+ * Table of zeta values used in the RV32-IM forward NTT.
+ * Each entry is a (zeta, zeta * QINV mod 2^32) pair, with
+ * zeta in Montgomery form. See autogen for details.
+ */
+MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t
+    mld_rv32im_ntt_zetas[510] = {
+        25847,    1830765815,  -2608894, -1929875198, -518909,  -1927777021,
+        237124,   1640767044,  1826347,  308362795,   2353451,  -1815525077,
+        -777960,  1477910808,  -359251,  -1374673747, -2091905, -1091570561,
+        -876248,  1612161320,  3119733,  -1929495947, -2884855, 515185417,
+        466468,   1640734244,  3111497,  -285697463,  2680103,  625853735,
+        2725464,  1727305304,  2706023,  -1846138265, 95776,    -1631226336,
+        1024112,  2082316400,  3077325,  -1404529459, 3530437,  1838055109,
+        -1079900, -1364982364, -1661693, 1594295555,  -3592148, -1076973524,
+        3585928,  858240904,   -2537516, -1898723372, 3915439,  -594436433,
+        -549488,  1806278032,  -3861115, -202001019,  -3043716, -475984260,
+        -1119584, 222489248,   3574422,  -561427818,  -2867647, 1797021249,
+        2619752,  -346752664,  3539968,  -1061813248, -300467,  2059733581,
+        -2108549, 684667771,   2348700,  -1661512036, -539299,  -1104976547,
+        -2118186, 1654287830,  -1699267, -1750224323, -1643818, -901666090,
+        -3859737, -878576921,  3505694,  418987550,   -3821735, 1831915353,
+        -1399561, -1257667337, 3507263,  -1925356481, -2140649, 992097815,
+        -3277672, -748618600,  -1600420, 879957084,   3699596,  2024403852,
+        1757237,  329347125,   811944,   1484874664,  531354,   -1636082790,
+        -19422,   1837364258,  954230,   -285388938,  3881043,  -1983539117,
+        4010497,  -1443016191, 3900724,  -1495136972, -2556880, -950076368,
+        280005,   -1170414139, 2071892,  -1714807468, -2797779, -952438995,
+        -3930395, -1574918427, 2091667,  -898413,     3407706,  991903578,
+        -1528703, -654783359,  2316500,  1363007700,  3817976,  746144248,
+        -3677745, 1350681039,  -3342478, -1363460238, 2244091,  912367099,
+        -3041255, -1974159335, -2446433, 30313375,    -3562462, -1420958686,
+        -1452451, -2143979939, 266997,   -605900043,  2434439,  -44694137,
+        3475950,  1651689966,  -1235728, -326425360,  3513181,  2032221021,
+        2176455,  1599739335,  -3520352, 2027833504,  -3759364, 1176904444,
+        -1585221, 140455867,   -1197226, 1683520342,  -3193378, 1904936414,
+        -1257611, -1285853323, 900702,   14253662,    1859098,  -421552614,
+        1939314,  -1039411342, 909542,   -517299994,  819034,   1257750362,
+        -4083598, -993005454,  495491,   1014493059,  -1613174, -818371958,
+        -1000202, 1955560694,  -43260,   2027935492,  -522500,  1926727420,
+        -3190144, -1440787840, -655327,  863641633,   -3122442, 1747917558,
+        -3157330, 1529189038,  2031748,  -1372618620, 3207046,  1931587462,
+        -3632928, 568627424,   -3556995, 1819892093,  -525098,  -325927722,
+        126922,   -2131021878, -768622,  128353682,   -3595838, 1258381762,
+        3412210,  -783134478,  342297,   2124962073,  286988,   908452108,
+        -983419,  -247357819,  -2437823, -1123881663, 4108315,  885133339,
+        2147896,  -588790216,  3437287,  -1223601433, -3342277, 1851023419,
+        2715295,  1518161567,  1735879,  137583815,   203044,   1629985060,
+        -2967645, 289871779,   2842341,  -1920467227, 2691481,  -1176751719,
+        -3693493, -86965173,   -2590150, -635454918,  1265009,  1967222129,
+        -411027,  -1262003603, 4055324,  -1637785316, 1247620,  -1354528380,
+        -2477047, 1708872713,  2486353,  -642772911,  1595974,  6363718,
+        -671102,  2135294594,  -3767016, -1536588520, 1250494,  -72690498,
+        -1228525, 1787797779,  2635921,  45766801,    -3548272, -1287922800,
+        -22981,   -1018755525, -2994039, 694382729,   1869119,  -314284737,
+        -1308169, 1638590967,  1903435,  671509323,   -1050970, 1136965286,
+        -381987,  -889861155,  -1333058, 235104446,   1237275,  985022747,
+        1349076,  -120646188,  -3318210, -2070602178, -1430225, 1779436847,
+        1852771,  1665705315,  -451100,  -1045062172, 1312455,  963438279,
+        -1430430, -1669960606, 3306115,  419615363,   -1962642, 1116720494,
+        -3343383, 1321868265,  -1279661, 831969619,   1917081,  -1078959975,
+        264944,   -916321552,  -2546312, 1216882040,  -1374803, 1042326957,
+        508951,   1225434135,  1500165,  -300448763,  777191,   604552167,
+        3097992,  1155548552,  2235880,  -270590488,  3406031,  1405999311,
+        44288,    -1784632064, -542412,  756955444,   -2831860, -1021949428,
+        -1100098, 2143745726,  -1671176, -1276805128, -1846953, 713994583,
+        904516,   666258756,   -2584293, -260312805,  -3724270, 608791570,
+        3958618,  1210558298,  594136,   371462360,   -3776993, 940195359,
+        -3724342, 675310538,   -2013608, 1554794072,  2432395,  173440395,
+        -8578,    -1261461890, 2454455,  -1357098057, -164721,  -1542497137,
+        1653064,  -1555941048, 1957272,  1339088280,  3369112,  -2126092136,
+        -3249728, -318346816,  185531,   -384158533,  -1207385, 2061661095,
+        2389356,  -1999506068, -3183426, -2040058690, 162844,   -1316619236,
+        -210977,  628664287,   1616392,  827959816,   3014001,  -883155599,
+        759969,   -1499481951, 810149,   -853476187,  1652634,  -1039370342,
+        -1316856, -1729304568, -3694233, -596344473,  -1799107, 1726753853,
+        189548,   -695180180,  -3038916, -2047270596, 3523897,  6087993,
+        -3553272, 1422575624,  3866901,  702390549,   269760,   -1547952704,
+        3159746,  -1375177022, 2213111,  -1723816713, -975884,  -110126092,
+        -1851402, 1424130038,  1717735,  -279505433,  472078,   394851342,
+        -2409325, 1777179795,  -426683,  -1591599803, 1723600,  565464272,
+        -177440,  -1185330464, -1803090, -260424530,  1910376,  283780712,
+        1315589,  334803717,   -1667432, -440824168,  -1104333, -1758099917,
+        1341330,  235321234,   -260646,  -71875110,   -3833893, 776003547,
+        1285669,  -178766299,  -2939036, 1119856484,  -2235985, -1600929361,
+        -1584928, 168022240,   -420899,  -1208667171, -2286327, 1123958025,
+        -812732,  -518252220,  183443,   1544891539,  -976891,  879867909,
+        -1439742, 1206536194,  1612842,  -1499603926, -3545687, 201262505,
+        -3019102, 1957047970,  -554416,  155290192,   3919660,  -1809756372,
+        -3881060, 985155484,   -48306,   2036925262,  -1362209, 1934038751,
+        -3628969, 1146323031,  3937738,  -973777462,  1400424,  400711272,
+        3839961,  -894060583,  -846154,  -540420426,  1976782,  374860238,
+};
+
+#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(rv32im_zetas)
+
+#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/flake.nix b/flake.nix
index 7714eec95..8c4f7683f 100644
--- a/flake.nix
+++ b/flake.nix
@@ -158,7 +158,7 @@
 
           # autogen shell with cross compiler for the "other" architecture
           devShells.cross-autogen = util.mkShell {
-            packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; }
+            packages = builtins.attrValues { inherit (config.packages) linters toolchain_riscv32; inherit (pkgs) gcc-arm-embedded; }
               ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ]
               ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ];
           };
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index ea9901768..7265c63eb 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -95,6 +95,9 @@
 #include "src/native/x86_64/src/rej_uniform_eta4_avx2.c"
 #include "src/native/x86_64/src/rej_uniform_table.c"
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+#include "src/native/rv32im/src/rv32im_zetas.c"
+#endif
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
@@ -772,5 +775,22 @@
 #undef MLD_NATIVE_X86_64_SRC_CONSTS_H
 #undef mld_qdata
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+/*
+ * Undefine macros from native code (Arith, RV32IM)
+ */
+/* mldsa/src/native/rv32im/meta.h */
+#undef MLD_ARITH_BACKEND_RV32IM
+#undef MLD_NATIVE_RV32IM_META_H
+#undef MLD_USE_NATIVE_INTT
+#undef MLD_USE_NATIVE_NTT
+#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */
+#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#undef mld_intt_rv32im_asm
+#undef mld_ntt_rv32im_asm
+#undef mld_poly_pointwise_montgomery_rv32im_asm
+#undef mld_rv32im_ntt_zetas
+#endif /* MLD_SYS_RISCV32 */
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */
diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S
index 5e3c2d0de..3b0fa704e 100644
--- a/mldsa/mldsa_native_asm.S
+++ b/mldsa/mldsa_native_asm.S
@@ -88,6 +88,11 @@
 #include "src/native/x86_64/src/pointwise_avx2_asm.S"
 #include "src/native/x86_64/src/poly_caddq_avx2_asm.S"
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+#include "src/native/rv32im/src/intt_rv32im_asm.S"
+#include "src/native/rv32im/src/ntt_rv32im_asm.S"
+#include "src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S"
+#endif
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
@@ -779,5 +784,22 @@
 #undef MLD_NATIVE_X86_64_SRC_CONSTS_H
 #undef mld_qdata
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+/*
+ * Undefine macros from native code (Arith, RV32IM)
+ */
+/* mldsa/src/native/rv32im/meta.h */
+#undef MLD_ARITH_BACKEND_RV32IM
+#undef MLD_NATIVE_RV32IM_META_H
+#undef MLD_USE_NATIVE_INTT
+#undef MLD_USE_NATIVE_NTT
+#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */
+#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#undef mld_intt_rv32im_asm
+#undef mld_ntt_rv32im_asm
+#undef mld_poly_pointwise_montgomery_rv32im_asm
+#undef mld_rv32im_ntt_zetas
+#endif /* MLD_SYS_RISCV32 */
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */
diff --git a/mldsa/src/native/meta.h b/mldsa/src/native/meta.h
index 0b46dd579..248974a90 100644
--- a/mldsa/src/native/meta.h
+++ b/mldsa/src/native/meta.h
@@ -20,4 +20,10 @@
 #include "x86_64/meta.h"
 #endif
 
+/* The RV32-IM arithmetic backend is experimental and under review, so it is
+ * not enabled by default; opt in via MLD_CONFIG_ARITH_BACKEND_FILE. */
+/* #if defined(MLD_SYS_RISCV32) */
+/* #include "rv32im/meta.h" */
+/* #endif */
+
 #endif /* !MLD_NATIVE_META_H */
diff --git a/mldsa/src/native/rv32im/meta.h b/mldsa/src/native/rv32im/meta.h
new file mode 100644
index 000000000..a83cd62c1
--- /dev/null
+++ b/mldsa/src/native/rv32im/meta.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_META_H
+#define MLD_NATIVE_RV32IM_META_H
+
+/* Set of primitives that this backend replaces */
+#define MLD_USE_NATIVE_NTT
+#define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+
+/* Identifier for this backend so that source and assembly files
+ * in the build can be appropriately guarded. */
+#define MLD_ARITH_BACKEND_RV32IM
+
+
+#if !defined(__ASSEMBLER__)
+#include "../api.h"
+#include "src/arith_native_rv32im.h"
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
+{
+  mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas); /* in-place forward NTT */
+  return MLD_NATIVE_FUNC_SUCCESS; /* the only value this wrapper returns */
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
+{
+  mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas); /* reuses fwd table */
+  return MLD_NATIVE_FUNC_SUCCESS; /* the only value this wrapper returns */
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
+    int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
+{
+  mld_poly_pointwise_montgomery_rv32im_asm(a, b); /* product overwrites a */
+  return MLD_NATIVE_FUNC_SUCCESS; /* the only value this wrapper returns */
+}
+
+#endif /* !__ASSEMBLER__ */
+#endif /* !MLD_NATIVE_RV32IM_META_H */
diff --git a/mldsa/src/native/rv32im/src/arith_native_rv32im.h b/mldsa/src/native/rv32im/src/arith_native_rv32im.h
new file mode 100644
index 000000000..03da705ad
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/arith_native_rv32im.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+
+#include "../../../cbmc.h"
+#include "../../../common.h"
+
+#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas)
+
+/*
+ * Zeta table for the RV32-IM backend, shared by forward and inverse NTT.
+ * 255 logical entries, each a (zeta, zeta * QINV mod 2^32) pair, with
+ * zeta in Montgomery form (i.e. R * w^{bitrev_8(k)} mod q where R = 2^32).
+ * Stored in the consumption order of the 2+2+2+2 forward NTT; the inverse
+ * NTT walks the same table back to front.
+ */
+MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510];
+
+#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm)
+void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas) /* r: in/out */
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) /* input bound: q */
+  requires(zetas == mld_rv32im_ntt_zetas) /* no other table is accepted */
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, 9 * MLDSA_Q)) /* output bound: 9q */
+);
+
+#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm)
+void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas) /* r: in/out */
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) /* input bound: q */
+  requires(zetas == mld_rv32im_ntt_zetas) /* same table as the forward NTT */
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) /* output bound: q */
+);
+
+#define mld_poly_pointwise_montgomery_rv32im_asm \
+  MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b)
+__contract__(
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N))
+  /* check-magic: off */
+  requires(array_abs_bound(a, 0, MLDSA_N, 75423753))   /* MLD_NTT_BOUND */
+  requires(array_abs_bound(b, 0, MLDSA_N, 75423753))   /* = 9 * MLDSA_Q  */
+  /* check-magic: on */
+  assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) /* b is only read */
+  ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) /* output bound: q */
+);
+
+#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */
diff --git a/mldsa/src/native/rv32im/src/intt_rv32im_asm.S b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S
new file mode 100644
index 000000000..81ed5716c
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT.
+ *
+ * Layered structure: 2+2+2+2 (mirror of the forward NTT, with passes
+ * applied in reverse layer order). Each pass merges two C-layers into a
+ * radix-4 inner kernel that holds 4 coefficients in registers.
+ *
+ *   inv-pass-1: C-layers 8, 7   (inner stride =  4 B,  64 outer iters)
+ *   inv-pass-2: C-layers 6, 5   (inner stride = 16 B,  16 outer iters)
+ *   inv-pass-3: C-layers 4, 3   (inner stride = 64 B,   4 outer iters)
+ *   inv-pass-4: C-layers 2, 1   (inner stride = 256 B,  1 outer iter )
+ *
+ * Twiddles: this routine reuses `mld_rv32im_ntt_zetas` (the forward-NTT
+ * table). The forward pass-(5-k) consumes its 3*N_outer pairs in
+ * outer order 0,1,...,N-1; the inv pass-k requires the *same* zetas but
+ * in reverse outer order, with the two "hi" zetas swapped. We implement
+ * this by initializing zeta_ptr at the end of each pass region and
+ * subtracting 24 bytes per outer iter; within the iter the lo zeta is
+ * read from offset 0 and the hi zetas from offsets 8/16 swapped via the
+ * GS kernel argument order. The negation that the C reference applies
+ * (`-mld_zetas[k]`) is absorbed by the GS butterfly form
+ *      a' = a + b
+ *      b' = montmul(b - a, +zeta)
+ * which produces the same result as the canonical
+ *      t  = a; a' = t + b; b' = montmul(t - b, -zeta).
+ *
+ * Modular arithmetic: standard signed Montgomery (3-mul kernel
+ *   m = low(a*z'), r = hi(a*z) - hi(m*q)
+ * ), matching the forward NTT.
+ *
+ * Final scaling: after the four passes, every coefficient is multiplied
+ * by  f = 41978 = 2^{64-8} mod q  (Montgomery-form, accounts for both
+ * 2^{-8} of the inverse NTT and the 2^32 left over from intermediate
+ * reductions). Implemented as a simple post-loop.
+ *
+ * Bounds (after each inv-pass):
+ *
+ *   start                       :  |coef| < q          (= 1*q)
+ *   after inv-pass-1 (C-L 8,7)  :  |coef| < 4*q
+ *   after inv-pass-2 (C-L 6,5)  :  |coef| < 16*q
+ *   after inv-pass-3 (C-L 4,3)  :  |coef| < 64*q
+ *   after inv-pass-4 (C-L 2,1)  :  |coef| < 256*q   (~ 2^31, fits int32)
+ *   after final fqscale         :  |coef| < q
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/intt_rv32im_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(intt_rv32im_asm)
+MLD_ASM_FN_SYMBOL(intt_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x30
+        .cfi_adjust_cfa_offset 0x30
+        sw s0, 0x0(sp)
+        sw s1, 0x4(sp)
+        sw s2, 0x8(sp)
+        sw s3, 0xc(sp)
+        sw s4, 0x10(sp)
+        sw s5, 0x14(sp)
+        sw s6, 0x18(sp)
+        sw s7, 0x1c(sp)
+        sw s8, 0x20(sp)
+        lui s6, 0x7fe
+        addi s6, s6, 0x1
+        addi a1, a1, 0x7f8
+        mv t2, a0
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p1_outer:
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        lw a2, 0x0(t2)
+        lw a3, 0x4(t2)
+        lw a4, 0x8(t2)
+        lw a5, 0xc(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mul a7, a6, s5
+        mulh a3, a6, s4
+        mulh a7, a7, s6
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mul a7, a6, s3
+        mulh a5, a6, s2
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mul a7, a6, s1
+        mulh a4, a6, s0
+        mulh a7, a7, s6
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mul a7, a6, s1
+        mulh a5, a6, s0
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x4(t2)
+        sw a4, 0x8(t2)
+        sw a5, 0xc(t2)
+        addi t2, t2, 0x10
+        bne t2, t3, Lintt_rv32im_p1_outer
+        mv t2, a0
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p2_outer:
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi t4, t2, 0x10
+
+Lintt_rv32im_p2_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x10(t2)
+        lw a4, 0x20(t2)
+        lw a5, 0x30(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mul a7, a6, s5
+        mulh a3, a6, s4
+        mulh a7, a7, s6
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mul a7, a6, s3
+        mulh a5, a6, s2
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mul a7, a6, s1
+        mulh a4, a6, s0
+        mulh a7, a7, s6
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mul a7, a6, s1
+        mulh a5, a6, s0
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x10(t2)
+        sw a4, 0x20(t2)
+        sw a5, 0x30(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p2_inner
+        addi t2, t2, 0x30
+        bne t2, t3, Lintt_rv32im_p2_outer
+        mv t2, a0
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p3_outer:
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi t4, t2, 0x40
+
+Lintt_rv32im_p3_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x40(t2)
+        lw a4, 0x80(t2)
+        lw a5, 0xc0(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mul a7, a6, s5
+        mulh a3, a6, s4
+        mulh a7, a7, s6
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mul a7, a6, s3
+        mulh a5, a6, s2
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mul a7, a6, s1
+        mulh a4, a6, s0
+        mulh a7, a7, s6
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mul a7, a6, s1
+        mulh a5, a6, s0
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x40(t2)
+        sw a4, 0x80(t2)
+        sw a5, 0xc0(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p3_inner
+        addi t2, t2, 0xc0
+        bne t2, t3, Lintt_rv32im_p3_outer
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        mv t2, a0
+        addi t4, a0, 0x100
+
+Lintt_rv32im_p4_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x100(t2)
+        lw a4, 0x200(t2)
+        lw a5, 0x300(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mul a7, a6, s5
+        mulh a3, a6, s4
+        mulh a7, a7, s6
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mul a7, a6, s3
+        mulh a5, a6, s2
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mul a7, a6, s1
+        mulh a4, a6, s0
+        mulh a7, a7, s6
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mul a7, a6, s1
+        mulh a5, a6, s0
+        mulh a7, a7, s6
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x100(t2)
+        sw a4, 0x200(t2)
+        sw a5, 0x300(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p4_inner
+        lui s7, 0xa
+        addi s7, s7, 0x3fa
+        lui s8, 0xff7fe
+        addi s8, s8, 0x3fa
+        mv t2, a0
+        addi t5, a0, 0x400
+
+Lintt_rv32im_scale:
+        lw a2, 0x0(t2)
+        mul a6, a2, s8
+        mulh a3, a2, s7
+        mulh a6, a6, s6
+        sub a3, a3, a6
+        sw a3, 0x0(t2)
+        addi t2, t2, 0x4
+        bne t2, t5, Lintt_rv32im_scale
+        lw s0, 0x0(sp)
+        lw s1, 0x4(sp)
+        lw s2, 0x8(sp)
+        lw s3, 0xc(sp)
+        lw s4, 0x10(sp)
+        lw s5, 0x14(sp)
+        lw s6, 0x18(sp)
+        lw s7, 0x1c(sp)
+        lw s8, 0x20(sp)
+        addi sp, sp, 0x30
+        .cfi_adjust_cfa_offset -0x30
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(intt_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S
new file mode 100644
index 000000000..9a61090a5
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT.
+ *
+ * Layered structure: 2+2+2+2 (four passes, each merging two layers, with
+ * a radix-4 inner kernel holding 4 coefficients in registers).
+ *
+ * Modular arithmetic: standard signed Montgomery multiplication.
+ * Each zeta is provided in Montgomery form (R * w^{bitrev(k)} mod q,
+ * R = 2^32) along with its precomputed twist  z' = (z * QINV) mod 2^32,
+ * so a Montgomery multiply is 3 multiplies + 1 sub:
+ *
+ *   m  = low(a * z')
+ *   r  = hi(a * z) - hi(m * q)
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/ntt_rv32im_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(ntt_rv32im_asm)
+MLD_ASM_FN_SYMBOL(ntt_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x20
+        .cfi_adjust_cfa_offset 0x20
+        sw s0, 0x0(sp)
+        sw s1, 0x4(sp)
+        sw s2, 0x8(sp)
+        sw s3, 0xc(sp)
+        sw s4, 0x10(sp)
+        sw s5, 0x14(sp)
+        sw s6, 0x18(sp)
+        lui s6, 0x7fe
+        addi s6, s6, 0x1
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        mv t2, a0
+        addi t4, a0, 0x100
+
+Lntt_rv32im_p1_loop:
+        lw a2, 0x0(t2)
+        lw a3, 0x100(t2)
+        lw a4, 0x200(t2)
+        lw a5, 0x300(t2)
+        mul a7, a4, s1
+        mulh a6, a4, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s1
+        mulh a6, a5, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mul a7, a3, s3
+        mulh a6, a3, s2
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s5
+        mulh a6, a5, s4
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x100(t2)
+        sw a4, 0x200(t2)
+        sw a5, 0x300(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p1_loop
+        mv t2, a0
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p2_outer:
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        addi t4, t2, 0x40
+
+Lntt_rv32im_p2_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x40(t2)
+        lw a4, 0x80(t2)
+        lw a5, 0xc0(t2)
+        mul a7, a4, s1
+        mulh a6, a4, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s1
+        mulh a6, a5, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mul a7, a3, s3
+        mulh a6, a3, s2
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s5
+        mulh a6, a5, s4
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x40(t2)
+        sw a4, 0x80(t2)
+        sw a5, 0xc0(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p2_inner
+        addi t2, t2, 0xc0
+        bne t2, t3, Lntt_rv32im_p2_outer
+        mv t2, a0
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p3_outer:
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        addi t4, t2, 0x10
+
+Lntt_rv32im_p3_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x10(t2)
+        lw a4, 0x20(t2)
+        lw a5, 0x30(t2)
+        mul a7, a4, s1
+        mulh a6, a4, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s1
+        mulh a6, a5, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mul a7, a3, s3
+        mulh a6, a3, s2
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s5
+        mulh a6, a5, s4
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x10(t2)
+        sw a4, 0x20(t2)
+        sw a5, 0x30(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p3_inner
+        addi t2, t2, 0x30
+        bne t2, t3, Lntt_rv32im_p3_outer
+        mv t2, a0
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p4_outer:
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        lw a2, 0x0(t2)
+        lw a3, 0x4(t2)
+        lw a4, 0x8(t2)
+        lw a5, 0xc(t2)
+        mul a7, a4, s1
+        mulh a6, a4, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s1
+        mulh a6, a5, s0
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mul a7, a3, s3
+        mulh a6, a3, s2
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mul a7, a5, s5
+        mulh a6, a5, s4
+        mulh a7, a7, s6
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x4(t2)
+        sw a4, 0x8(t2)
+        sw a5, 0xc(t2)
+        addi t2, t2, 0x10
+        bne t2, t3, Lntt_rv32im_p4_outer
+        lw s0, 0x0(sp)
+        lw s1, 0x4(sp)
+        lw s2, 0x8(sp)
+        lw s3, 0xc(sp)
+        lw s4, 0x10(sp)
+        lw s5, 0x14(sp)
+        lw s6, 0x18(sp)
+        addi sp, sp, 0x20
+        .cfi_adjust_cfa_offset -0x20
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(ntt_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S
new file mode 100644
index 000000000..606a13379
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery
+ * reduction. Computes
+ *
+ *     a[i] = (a[i] * b[i]) * R^-1  mod q,    R = 2^32, |result| < q,
+ *
+ * for i in 0..256, in-place in a.
+ *
+ * Modular arithmetic: standard signed Montgomery reduction. Unlike the
+ * NTT, neither operand is constant, so we can't precompute a twisted
+ * form -- the kernel uses 4 multiplies per coefficient:
+ *
+ *     plo = low (a * b)            ; mul
+ *     m   = low (plo * QINV)       ; mul (low 32 of (plo * QINV))
+ *     phi = high(a * b)            ; mulh
+ *     mh  = high(m * q)            ; mulh
+ *     r   = phi - mh               ; sub
+ *
+ * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9*q. The product
+ * is bounded by (9q)^2 < 2^31 * q, well within the safe input range
+ * for `mld_montgomery_reduce` (which is |a| <= 2^31 * q).
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x8
+        .cfi_adjust_cfa_offset 0x8
+        sw s0, 0x0(sp)
+        sw s1, 0x4(sp)
+        lui s0, 0x7fe
+        addi s0, s0, 0x1
+        lui s1, 0x3802
+        addi s1, s1, 0x1
+        addi t0, a0, 0x400
+
+Lpoly_pointwise_montgomery_rv32im_loop:
+        lw a2, 0x0(a0)
+        lw a3, 0x0(a1)
+        mul a4, a2, a3
+        mul a6, a4, s1
+        mulh a5, a2, a3
+        mulh a7, a6, s0
+        sub a2, a5, a7
+        sw a2, 0x0(a0)
+        addi a0, a0, 0x4
+        addi a1, a1, 0x4
+        bne a0, t0, Lpoly_pointwise_montgomery_rv32im_loop
+        lw s0, 0x0(sp)
+        lw s1, 0x4(sp)
+        addi sp, sp, 0x8
+        .cfi_adjust_cfa_offset -0x8
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(poly_pointwise_montgomery_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/rv32im_zetas.c b/mldsa/src/native/rv32im/src/rv32im_zetas.c
new file mode 100644
index 000000000..6b7d67e2a
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/rv32im_zetas.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "arith_native_rv32im.h"
+
+/*
+ * Table of zeta values used in the RV32-IM forward NTT.
+ * Each entry is a (zeta, zeta * QINV mod 2^32) pair, with
+ * zeta in Montgomery form. See autogen for details.
+ */
+MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t
+    mld_rv32im_ntt_zetas[510] = {
+        25847,    1830765815,  -2608894, -1929875198, -518909,  -1927777021,
+        237124,   1640767044,  1826347,  308362795,   2353451,  -1815525077,
+        -777960,  1477910808,  -359251,  -1374673747, -2091905, -1091570561,
+        -876248,  1612161320,  3119733,  -1929495947, -2884855, 515185417,
+        466468,   1640734244,  3111497,  -285697463,  2680103,  625853735,
+        2725464,  1727305304,  2706023,  -1846138265, 95776,    -1631226336,
+        1024112,  2082316400,  3077325,  -1404529459, 3530437,  1838055109,
+        -1079900, -1364982364, -1661693, 1594295555,  -3592148, -1076973524,
+        3585928,  858240904,   -2537516, -1898723372, 3915439,  -594436433,
+        -549488,  1806278032,  -3861115, -202001019,  -3043716, -475984260,
+        -1119584, 222489248,   3574422,  -561427818,  -2867647, 1797021249,
+        2619752,  -346752664,  3539968,  -1061813248, -300467,  2059733581,
+        -2108549, 684667771,   2348700,  -1661512036, -539299,  -1104976547,
+        -2118186, 1654287830,  -1699267, -1750224323, -1643818, -901666090,
+        -3859737, -878576921,  3505694,  418987550,   -3821735, 1831915353,
+        -1399561, -1257667337, 3507263,  -1925356481, -2140649, 992097815,
+        -3277672, -748618600,  -1600420, 879957084,   3699596,  2024403852,
+        1757237,  329347125,   811944,   1484874664,  531354,   -1636082790,
+        -19422,   1837364258,  954230,   -285388938,  3881043,  -1983539117,
+        4010497,  -1443016191, 3900724,  -1495136972, -2556880, -950076368,
+        280005,   -1170414139, 2071892,  -1714807468, -2797779, -952438995,
+        -3930395, -1574918427, 2091667,  -898413,     3407706,  991903578,
+        -1528703, -654783359,  2316500,  1363007700,  3817976,  746144248,
+        -3677745, 1350681039,  -3342478, -1363460238, 2244091,  912367099,
+        -3041255, -1974159335, -2446433, 30313375,    -3562462, -1420958686,
+        -1452451, -2143979939, 266997,   -605900043,  2434439,  -44694137,
+        3475950,  1651689966,  -1235728, -326425360,  3513181,  2032221021,
+        2176455,  1599739335,  -3520352, 2027833504,  -3759364, 1176904444,
+        -1585221, 140455867,   -1197226, 1683520342,  -3193378, 1904936414,
+        -1257611, -1285853323, 900702,   14253662,    1859098,  -421552614,
+        1939314,  -1039411342, 909542,   -517299994,  819034,   1257750362,
+        -4083598, -993005454,  495491,   1014493059,  -1613174, -818371958,
+        -1000202, 1955560694,  -43260,   2027935492,  -522500,  1926727420,
+        -3190144, -1440787840, -655327,  863641633,   -3122442, 1747917558,
+        -3157330, 1529189038,  2031748,  -1372618620, 3207046,  1931587462,
+        -3632928, 568627424,   -3556995, 1819892093,  -525098,  -325927722,
+        126922,   -2131021878, -768622,  128353682,   -3595838, 1258381762,
+        3412210,  -783134478,  342297,   2124962073,  286988,   908452108,
+        -983419,  -247357819,  -2437823, -1123881663, 4108315,  885133339,
+        2147896,  -588790216,  3437287,  -1223601433, -3342277, 1851023419,
+        2715295,  1518161567,  1735879,  137583815,   203044,   1629985060,
+        -2967645, 289871779,   2842341,  -1920467227, 2691481,  -1176751719,
+        -3693493, -86965173,   -2590150, -635454918,  1265009,  1967222129,
+        -411027,  -1262003603, 4055324,  -1637785316, 1247620,  -1354528380,
+        -2477047, 1708872713,  2486353,  -642772911,  1595974,  6363718,
+        -671102,  2135294594,  -3767016, -1536588520, 1250494,  -72690498,
+        -1228525, 1787797779,  2635921,  45766801,    -3548272, -1287922800,
+        -22981,   -1018755525, -2994039, 694382729,   1869119,  -314284737,
+        -1308169, 1638590967,  1903435,  671509323,   -1050970, 1136965286,
+        -381987,  -889861155,  -1333058, 235104446,   1237275,  985022747,
+        1349076,  -120646188,  -3318210, -2070602178, -1430225, 1779436847,
+        1852771,  1665705315,  -451100,  -1045062172, 1312455,  963438279,
+        -1430430, -1669960606, 3306115,  419615363,   -1962642, 1116720494,
+        -3343383, 1321868265,  -1279661, 831969619,   1917081,  -1078959975,
+        264944,   -916321552,  -2546312, 1216882040,  -1374803, 1042326957,
+        508951,   1225434135,  1500165,  -300448763,  777191,   604552167,
+        3097992,  1155548552,  2235880,  -270590488,  3406031,  1405999311,
+        44288,    -1784632064, -542412,  756955444,   -2831860, -1021949428,
+        -1100098, 2143745726,  -1671176, -1276805128, -1846953, 713994583,
+        904516,   666258756,   -2584293, -260312805,  -3724270, 608791570,
+        3958618,  1210558298,  594136,   371462360,   -3776993, 940195359,
+        -3724342, 675310538,   -2013608, 1554794072,  2432395,  173440395,
+        -8578,    -1261461890, 2454455,  -1357098057, -164721,  -1542497137,
+        1653064,  -1555941048, 1957272,  1339088280,  3369112,  -2126092136,
+        -3249728, -318346816,  185531,   -384158533,  -1207385, 2061661095,
+        2389356,  -1999506068, -3183426, -2040058690, 162844,   -1316619236,
+        -210977,  628664287,   1616392,  827959816,   3014001,  -883155599,
+        759969,   -1499481951, 810149,   -853476187,  1652634,  -1039370342,
+        -1316856, -1729304568, -3694233, -596344473,  -1799107, 1726753853,
+        189548,   -695180180,  -3038916, -2047270596, 3523897,  6087993,
+        -3553272, 1422575624,  3866901,  702390549,   269760,   -1547952704,
+        3159746,  -1375177022, 2213111,  -1723816713, -975884,  -110126092,
+        -1851402, 1424130038,  1717735,  -279505433,  472078,   394851342,
+        -2409325, 1777179795,  -426683,  -1591599803, 1723600,  565464272,
+        -177440,  -1185330464, -1803090, -260424530,  1910376,  283780712,
+        1315589,  334803717,   -1667432, -440824168,  -1104333, -1758099917,
+        1341330,  235321234,   -260646,  -71875110,   -3833893, 776003547,
+        1285669,  -178766299,  -2939036, 1119856484,  -2235985, -1600929361,
+        -1584928, 168022240,   -420899,  -1208667171, -2286327, 1123958025,
+        -812732,  -518252220,  183443,   1544891539,  -976891,  879867909,
+        -1439742, 1206536194,  1612842,  -1499603926, -3545687, 201262505,
+        -3019102, 1957047970,  -554416,  155290192,   3919660,  -1809756372,
+        -3881060, 985155484,   -48306,   2036925262,  -1362209, 1934038751,
+        -3628969, 1146323031,  3937738,  -973777462,  1400424,  400711272,
+        3839961,  -894060583,  -846154,  -540420426,  1976782,  374860238,
+};
+
+#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(rv32im_zetas)
+
+#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/scripts/autogen b/scripts/autogen
index c2ddfd822..15aead747 100755
--- a/scripts/autogen
+++ b/scripts/autogen
@@ -958,6 +958,92 @@ def _fmt_indexed_rows(data):
         yield ",".join(map(str, row)) + f" /* {i} */,"
 
 
+def prepare_root_for_montgomery(root):
+    """Build the constant pair used for signed Montgomery multiplication.
+
+    Given a constant `root` that the code Montgomery-multiplies with,
+    return (z, z'): z is the signed canonical form of `root`, and
+    z' = (z * QINV) mod 2^32, reinterpreted as a signed 32-bit value,
+    is the twisted constant used in the low-mul part of the reduction.
+
+    QINV = pow(MLDSA_Q, -1, 2^32) (matches mldsa/src/reduce.h)."""
+
+    QINV = 58728449  # pow(MLDSA_Q, -1, 2^32)
+    WORD = 1 << 32
+    HALF = 1 << 31
+
+    z = signed_reduce(root)
+
+    # Offset by 2^31, reduce mod 2^32, undo offset: result in [-2^31, 2^31).
+    z_twisted = (z * QINV + HALF) % WORD - HALF
+    return z, z_twisted
+
+
+def gen_rv32im_root_for_block(layer, block):
+    """Return the (z, z') pair for the forward-NTT zeta of (layer, block).
+    The root is root_of_unity^bitreverse(2^layer + block, 8) mod modulus,
+    scaled by montgomery_factor because the RV32-IM butterfly multiplies
+    with standard signed Montgomery multiplication."""
+    exponent = bitreverse(2**layer + block, 8)
+    scaled_root = montgomery_factor * pow(root_of_unity, exponent, modulus)
+    return prepare_root_for_montgomery(scaled_root)
+
+
+def gen_rv32im_fwd_ntt_zetas():
+    """Yield the zeta stream consumed by the 2+2+2+2 forward NTT.
+
+    The stream is flat: every (z, z') pair contributes two consecutive
+    integers, z followed by z'.
+
+    The NTT runs as 4 passes, each merging two layers (0-indexed):
+    pass p covers layers lo = 2p and hi = 2p + 1 and has 2^lo outer
+    iterations. Outer iteration o of a pass emits three pairs, in
+    this order:
+
+        layer lo, block o
+        layer hi, block 2*o
+        layer hi, block 2*o + 1
+
+    Overall (1 + 4 + 16 + 64) = 85 outer iterations * 3 pairs = 255
+    pairs, i.e. 510 integers."""
+    for lo in range(0, 8, 2):
+        hi = lo + 1
+        for o in range(2**lo):
+            for layer, block in ((lo, o), (hi, 2 * o), (hi, 2 * o + 1)):
+                yield from gen_rv32im_root_for_block(layer, block)
+
+
+def gen_rv32im_zeta_file():
+    """Render the RV32-IM zeta table source and pass it to update_file."""
+
+    def gen():
+        yield from gen_header()
+        preamble = (
+            '#include "../../../common.h"',
+            "",
+            "#if defined(MLD_ARITH_BACKEND_RV32IM) && \\",
+            "    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)",
+            "",
+            '#include "arith_native_rv32im.h"',
+            "",
+            "/*",
+            " * Table of zeta values used in the RV32-IM forward NTT.",
+            " * Each entry is a (zeta, zeta * QINV mod 2^32) pair, with",
+            " * zeta in Montgomery form. See autogen for details.",
+            " */",
+        )
+        yield from preamble
+        yield from emit_c_array(
+            "const int32_t",
+            "mld_rv32im_ntt_zetas",
+            gen_rv32im_fwd_ntt_zetas(),
+        )
+        # Preprocessor fallback emitted for builds without this backend.
+        yield from ("", "#else", "", "MLD_EMPTY_CU(rv32im_zetas)", "", "#endif", "")
+
+    update_file("dev/riscv32/src/rv32im_zetas.c", "\n".join(gen()))
+
+
 def gen_aarch64_zeta_file():
     def gen():
         yield from gen_header()
@@ -1844,6 +1930,10 @@ def riscv64(c):
     return "/riscv64/" in c
 
 
+def rv32im(c):
+    return "/rv32im/" in c  # true when c contains an rv32im/ path component
+
+
 def armv81m(c):
     return "/armv81m/" in c
 
@@ -1889,12 +1979,17 @@ def native_arith_riscv64(c):
     return native_arith(c) and riscv64(c)
 
 
+def native_arith_rv32im(c):
+    return native_arith(c) and rv32im(c)  # native_arith narrowed to rv32im/
+
+
 def native_arith_core(c):
     return (
         native_arith(c)
         and not native_arith_x86_64(c)
         and not native_arith_aarch64(c)
         and not native_arith_riscv64(c)
+        and not native_arith_rv32im(c)
     )
 
 
@@ -2001,6 +2096,11 @@ def gen_macro_undefs(extra_notes=None):
         filt=native_arith_x86_64, desc="native code (Arith, X86_64)"
     )
     yield "#endif"
+    yield "#if defined(MLD_SYS_RISCV32)"
+    yield from gen_monolithic_undef_all_core(
+        filt=native_arith_rv32im, desc="native code (Arith, RV32IM)"
+    )
+    yield "#endif"
     yield "#endif"
     yield "#endif"
     yield ""
@@ -2078,6 +2178,10 @@ def gen_monolithic_source_file():
         for c in filter(native_arith_x86_64, c_sources):
             yield f'#include "{c}"'
         yield "#endif"
+        yield "#if defined(MLD_SYS_RISCV32)"
+        for c in filter(native_arith_rv32im, c_sources):
+            yield f'#include "{c}"'
+        yield "#endif"
         yield "#endif"
         yield ""
         yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)"
@@ -2161,6 +2265,10 @@ def gen_monolithic_asm_file():
         for c in filter(native_arith_x86_64, asm_sources):
             yield f'#include "{c}"'
         yield "#endif"
+        yield "#if defined(MLD_SYS_RISCV32)"
+        for c in filter(native_arith_rv32im, asm_sources):
+            yield f'#include "{c}"'
+        yield "#endif"
         yield "#endif"
         yield ""
         yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)"
@@ -2306,7 +2414,10 @@ def check_macro_typos():
                 return True
 
         # 5. AWS-LC importer patch
-        if is_autogen or filename == "integration/awslc/awslc.patch":
+        if is_autogen or filename in [
+            "integration/awslc/pre_import.patch",
+            "integration/awslc/post_import.patch",
+        ]:
             return True
 
         if is_autogen or filename == "mldsa/src/common.h":
@@ -2483,9 +2594,14 @@ def update_via_simpasm(
     outfile=None,
     cflags=None,
     preserve_header=True,
-    force_cross=False,
+    force_cross=None,
     x86_64_syntax="att",
 ):
+    # force_cross: set of source architectures for which a missing cross
+    # toolchain is a hard error rather than a silent skip. Pass None or an
+    # empty set to skip silently for every arch.
+    if force_cross is None:
+        force_cross = set()
     _, infile = os.path.split(infile_full)
     if outfile is None:
         outfile = infile
@@ -2502,6 +2618,8 @@ def update_via_simpasm(
         source_arch = "x86_64"
     elif "armv81m" in infile_full:
         source_arch = "armv81m"
+    elif "riscv32" in infile_full or "rv32im" in infile_full:
+        source_arch = "riscv32"
     else:
         raise Exception(f"Could not detect architecture of source file {infile_full}.")
     # Check native architecture
@@ -2515,7 +2633,15 @@ def update_via_simpasm(
         cross_prefix = "arm-none-eabi-"
         cross_gcc = cross_prefix + "gcc"
         if shutil.which(cross_gcc) is None:
-            if force_cross is False:
+            if source_arch not in force_cross:
+                return
+            raise Exception(f"Could not find cross toolchain {cross_prefix}")
+    # RISC-V 32-bit is always cross-compiled.
+    elif source_arch == "riscv32":
+        cross_prefix = "riscv32-unknown-linux-gnu-"
+        cross_gcc = cross_prefix + "gcc"
+        if shutil.which(cross_gcc) is None:
+            if source_arch not in force_cross:
                 return
             raise Exception(f"Could not find cross toolchain {cross_prefix}")
     elif native_arch != source_arch:
@@ -2523,7 +2649,7 @@ def update_via_simpasm(
         cross_gcc = cross_prefix + "gcc"
         # Check if cross-compiler is present
         if shutil.which(cross_gcc) is None:
-            if force_cross is False:
+            if source_arch not in force_cross:
                 return
             raise Exception(f"Could not find cross toolchain {cross_prefix}")
     else:
@@ -2536,6 +2662,8 @@ def update_via_simpasm(
                 arch = "aarch64"
             elif "armv81m" in infile_full:
                 arch = "armv81m"
+            elif "riscv32" in infile_full or "rv32im" in infile_full:
+                arch = "riscv32"
             else:
                 arch = "x86_64"
 
@@ -2836,7 +2964,7 @@ def synchronize_backend(in_dir, out_dir, delete=False, no_simplify=False, **kwar
 
 def synchronize_backends(
     *,
-    force_cross=False,
+    force_cross=None,
     clean=False,
     delete=False,
     no_simplify=False,
@@ -2865,6 +2993,14 @@ def synchronize_backends(
             ),
         )
 
+        update_via_copy(
+            "dev/riscv32/meta.h",
+            "mldsa/src/native/rv32im/meta.h",
+            transform=lambda c: adjust_header_guard_for_filename(
+                c, "mldsa/src/native/rv32im/meta.h"
+            ),
+        )
+
     synchronize_backend(
         f"dev/aarch64_{ty}/src",
         "mldsa/src/native/aarch64/src",
@@ -2946,6 +3082,14 @@ def synchronize_backends(
         no_simplify=no_simplify,
         cflags="-Idev/fips202/armv81m -Imldsa/src/fips202/native/armv81m -march=armv8.1-m.main+mve -mthumb",
     )
+    synchronize_backend(
+        "dev/riscv32/src",
+        "mldsa/src/native/rv32im/src",
+        delete=delete,
+        force_cross=force_cross,
+        no_simplify=no_simplify,
+        cflags="-Idev/riscv32/src -Imldsa/src/native/rv32im/src -march=rv32im -mabi=ilp32",
+    )
 
 
 def adjust_header_guard_for_filename(content, header_file):
@@ -3547,7 +3691,9 @@ def update_bytecode_in_proof_script(filepath, bytecode):
     update_file(filepath, updated_content)
 
 
-def update_hol_light_bytecode_for_arch(arch, force_cross=False):
+def update_hol_light_bytecode_for_arch(arch, force_cross=None):
+    if force_cross is None:
+        force_cross = set()
     source_arch = arch
     if platform.machine().lower() in ["arm64", "aarch64"]:
         native_arch = "aarch64"
@@ -3559,7 +3705,7 @@ def update_hol_light_bytecode_for_arch(arch, force_cross=False):
         cross_gcc = cross_prefix + "gcc"
         # Check if cross-compiler is present
         if shutil.which(cross_gcc) is None:
-            if force_cross is False:
+            if source_arch not in force_cross:
                 return
             raise Exception(f"Could not find cross toolchain {cross_prefix}")
 
@@ -3581,7 +3727,7 @@ def update_hol_light_bytecode_for_arch(arch, force_cross=False):
         update_bytecode_in_proof_script(ml_file, bytecode)
 
 
-def update_hol_light_bytecode(force_cross=False):
+def update_hol_light_bytecode(force_cross=None):
     """Update HOL Light proof files with bytecode from make dump_bytecode."""
     update_hol_light_bytecode_for_arch("aarch64", force_cross=force_cross)
     update_hol_light_bytecode_for_arch("x86_64", force_cross=force_cross)
@@ -3983,7 +4129,19 @@ def _main():
     parser.add_argument("--slothy", nargs="*", default=None, choices=slothy_choices)
     parser.add_argument("--aarch64-clean", default=False, action="store_true")
     parser.add_argument("--no-simplify", default=False, action="store_true")
-    parser.add_argument("--force-cross", default=False, action="store_true")
+    KNOWN_CROSS_ARCHS = ["aarch64", "x86_64", "armv81m", "riscv32"]
+    parser.add_argument(
+        "--force-cross",
+        nargs="*",
+        default=None,
+        choices=KNOWN_CROSS_ARCHS,
+        metavar="ARCH",
+        help=(
+            "Architectures whose missing cross toolchain should fail the "
+            "run instead of being silently skipped. With no argument, "
+            "applies to all of: " + ", ".join(KNOWN_CROSS_ARCHS) + "."
+        ),
+    )
     parser.add_argument(
         "--x86-64-syntax",
         type=str,
@@ -4017,11 +4175,22 @@ def _main():
     if args.slothy == []:
         args.slothy = slothy_choices
 
+    # Normalize --force-cross into a set:
+    #   absent          -> empty set (silently skip every missing toolchain)
+    #   no value        -> all known archs (legacy --force-cross behavior)
+    #   explicit list   -> just those archs
+    if args.force_cross is None:
+        force_cross = set()
+    elif args.force_cross == []:
+        force_cross = set(KNOWN_CROSS_ARCHS)
+    else:
+        force_cross = set(args.force_cross)
+
     def sync_backends():
         synchronize_backends(
             clean=args.aarch64_clean,
             no_simplify=args.no_simplify,
-            force_cross=args.force_cross,
+            force_cross=force_cross,
             x86_64_syntax=args.x86_64_syntax,
         )
 
@@ -4029,7 +4198,7 @@ def _main():
         synchronize_backends(
             clean=args.aarch64_clean,
             delete=True,
-            force_cross=args.force_cross,
+            force_cross=force_cross,
             no_simplify=args.no_simplify,
             x86_64_syntax=args.x86_64_syntax,
         )
@@ -4037,6 +4206,7 @@ def _main():
     def gen_zeta_tables():
         gen_c_zeta_file()
         gen_aarch64_zeta_file()
+        gen_rv32im_zeta_file()
         gen_aarch64_hol_light_zeta_file()
         gen_aarch64_rej_uniform_table()
         gen_aarch64_rej_uniform_eta_table()
@@ -4080,7 +4250,7 @@ def _main():
         ("Complete final backend synchronization", sync_backends_final),
         (
             "Update HOL Light bytecode",
-            partial(update_hol_light_bytecode, force_cross=args.force_cross),
+            partial(update_hol_light_bytecode, force_cross=force_cross),
             args.update_hol_light_bytecode,
         ),
         ("Generate monolithic source files", gen_monolithic),
diff --git a/scripts/cfify b/scripts/cfify
index 126a900fe..0e4178c9f 100755
--- a/scripts/cfify
+++ b/scripts/cfify
@@ -127,6 +127,19 @@ ARMV81M_ADD_SP_PATTERN = re.compile(
 ARMV81M_BX_LR_PATTERN = re.compile(r"(\s*)bx\s+lr\s*$", re.IGNORECASE)
 
 
+# -----------------------------------------------------------------------------
+# riscv32 module-scope constants
+# -----------------------------------------------------------------------------
+# `addi sp, sp, -OFF` allocates; `addi sp, sp, OFF` (no sign) frees.
+RISCV32_SUB_SP_PATTERN = re.compile(
+    r"(\s*)addi\s+sp,\s*sp,\s*-(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE
+)  # groups: (indent, OFF without its minus sign); OFF is hex or decimal
+RISCV32_ADD_SP_PATTERN = re.compile(
+    r"(\s*)addi\s+sp,\s*sp,\s*(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE
+)  # groups: (indent, OFF); a leading `-` never matches this pattern
+RISCV32_RET_PATTERN = re.compile(r"(\s*)ret\s*$", re.IGNORECASE)
+
+
 def armv81m_parse_reg(s):
     """Parse a single register token, returning its canonical name
     (e.g. 'r14' -> 'lr'). Raises ValueError on unrecognised input."""
@@ -443,6 +456,44 @@ def add_cfi_directives(text, arch):
                 i += 1
                 continue
 
+        elif arch == "riscv32":
+            # addi sp, sp, -OFF — stack allocation
+            match = RISCV32_SUB_SP_PATTERN.match(line)
+            if match:
+                indent, offset_str = match.groups()
+                offset = (
+                    int(offset_str, 16)
+                    if offset_str.lower().startswith("0x")
+                    else int(offset_str)
+                )
+                result.append(line)
+                result.append(f"{indent}.cfi_adjust_cfa_offset {offset:#x}")
+                i += 1
+                continue
+
+            # addi sp, sp, +OFF — stack deallocation
+            match = RISCV32_ADD_SP_PATTERN.match(line)
+            if match:
+                indent, offset_str = match.groups()
+                offset = (
+                    int(offset_str, 16)
+                    if offset_str.lower().startswith("0x")
+                    else int(offset_str)
+                )
+                result.append(line)
+                result.append(f"{indent}.cfi_adjust_cfa_offset -{offset:#x}")
+                i += 1
+                continue
+
+            # ret — function return
+            match = RISCV32_RET_PATTERN.match(line)
+            if match:
+                indent = match.group(1)
+                result.append(line)
+                result.append(f"{indent}.cfi_endproc")
+                i += 1
+                continue
+
         result.append(line)
         i += 1
 
@@ -462,7 +513,7 @@ def main():
     )
     parser.add_argument(
         "--arch",
-        choices=["aarch64", "x86_64", "armv81m"],
+        choices=["aarch64", "x86_64", "armv81m", "riscv32"],
         default="aarch64",
         help="Target architecture (default: aarch64)",
     )
diff --git a/scripts/simpasm b/scripts/simpasm
index dc34079a1..62cca9adf 100755
--- a/scripts/simpasm
+++ b/scripts/simpasm
@@ -256,6 +256,11 @@ def simplify(logger, args, asm_input, asm_output=None):
         # Armv8.1-M requires explicit triple for Thumb disassembly
         if args.arch == "armv81m":
             cmd += ["--triple=thumbv8.1m.main-none-eabi"]
+        # RISC-V 32-bit ILP32 needs an explicit triple so llvm-objdump
+        # decodes the M extension (mul/mulh) instead of marking them
+        # as illegal.
+        if args.arch == "riscv32":
+            cmd += ["--triple=riscv32", "--mattr=+m"]
 
         # Add syntax option if specified
         if args.syntax and args.syntax.lower() != "att":
diff --git a/test/mk/components.mk b/test/mk/components.mk
index 67698aabe..00df4d67c 100644
--- a/test/mk/components.mk
+++ b/test/mk/components.mk
@@ -10,7 +10,7 @@ endif
 
 SOURCES += $(wildcard mldsa/src/*.c)
 ifeq ($(OPT),1)
-	SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS])
+	SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS]) $(wildcard mldsa/src/native/rv32im/src/*.[csS])
 	CFLAGS += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
 endif