diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index a52e91693..1bae6b290 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -310,7 +310,10 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32" + # The RV32-IM arithmetic backend is experimental and not picked + # up by native/meta.h's defaults; select it explicitly here. + # No-op for OPT=0 builds (MLD_CONFIG_ARITH_BACKEND_FILE is unused). + cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32 -DMLD_CONFIG_ARITH_BACKEND_FILE=\\\\\\\"native/rv32im/meta.h\\\\\\\"" ldflags: ${{ inputs.ldflags }} cross_prefix: riscv32-unknown-linux-gnu- exec_wrapper: qemu-riscv32 @@ -327,4 +330,3 @@ runs: rng_fail: ${{ inputs.rng_fail }} extra_args: ${{ inputs.extra_args }} extra_env: ${{ inputs.extra_env }} - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e89855ed..9c5f0bb1b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -155,8 +155,8 @@ jobs: check_namespace: 'false' - name: build + test (cross, opt) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} @@ -165,8 +165,8 @@ jobs: opt: 'opt' - name: build + test (cross, opt, +debug) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, 
riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} @@ -846,7 +846,7 @@ jobs: - system: macos-latest nix_cache: 'true' nix_shell: 'hol_light-cross-x86_64' - extra_args: '--force-cross' + extra_args: '--force-cross aarch64 x86_64' # TODO: autogen does not yet work on macos15-intel (#1304) # - system: macos-15-intel # nix_cache: 'false' @@ -854,11 +854,11 @@ jobs: - system: ubuntu-latest nix_shell: 'hol_light-cross-aarch64' nix_cache: 'true' - extra_args: '--force-cross' + extra_args: '--force-cross aarch64 x86_64' - system: ubuntu-24.04-arm nix_shell: 'hol_light-cross-x86_64' nix_cache: 'true' - extra_args: '--force-cross' + extra_args: '--force-cross aarch64 x86_64' runs-on: ${{ matrix.target.system }} name: Check object code in HOL-Light proofs steps: diff --git a/README.md b/README.md index 56a16630d..1a0ad4618 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ mldsa-native allows developers to support ML-DSA with minimal performance and ma **Maintainability and Safety:** Memory safety, type safety and absence of various classes of timing leakage are automatically checked on every change, using a combination of static model checking (using CBMC) and dynamic instrumentation (using valgrind). This reduces review and maintenance burden and accelerates safe code delivery. See [Formal Verification](#formal-verification) and [Security](#security). -**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. 
mldsa-native comes with backends for AArch64 and x86-64. See [Design](#design). +**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64, and experimental backends for Armv8.1-M and RV32-IM. See [Design](#design). ## Quickstart for Ubuntu @@ -92,6 +92,7 @@ mldsa-native currently offers the following backends: * 64-bit Arm backend (using Neon) * 64-bit Intel/AMD backend (using AVX2) * 32-bit Armv8.1-M backend (using Helium/MVE). This is still experimental and disabled by default. +* 32-bit RISC-V backend (RV32-IM, base integer + M-extension only). This is still experimental and disabled by default. If you'd like contribute new backends, please reach out! diff --git a/dev/riscv32/meta.h b/dev/riscv32/meta.h new file mode 100644 index 000000000..a83cd62c1 --- /dev/null +++ b/dev/riscv32/meta.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_RV32IM_META_H +#define MLD_NATIVE_RV32IM_META_H + +/* Set of primitives that this backend replaces */ +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLD_ARITH_BACKEND_RV32IM + + +#if !defined(__ASSEMBLER__) +#include "../api.h" +#include "src/arith_native_rv32im.h" + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N]) +{ + mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N]) +{ + mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_pointwise_montgomery_native( + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) +{ + mld_poly_pointwise_montgomery_rv32im_asm(a, b); + return MLD_NATIVE_FUNC_SUCCESS; +} + +#endif /* !__ASSEMBLER__ */ +#endif /* !MLD_NATIVE_RV32IM_META_H */ diff --git a/dev/riscv32/src/arith_native_rv32im.h b/dev/riscv32/src/arith_native_rv32im.h new file mode 100644 index 000000000..03da705ad --- /dev/null +++ b/dev/riscv32/src/arith_native_rv32im.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H + +#include "../../../cbmc.h" +#include "../../../common.h" + +#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas) + +/* + * Forward NTT zeta table for the RV32-IM backend. + * + * 255 logical entries, each a (zeta, zeta * QINV mod 2^32) pair, with + * zeta in Montgomery form (i.e. R * w^{bitrev_8(k)} mod q where R = 2^32). + * The order matches the consumption order of the 2+2+2+2 forward NTT. 
+ */ +MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510]; + +#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm) +void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(r, 0, MLDSA_N, 9 * MLDSA_Q)) +); + +#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm) +void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) +); + +#define mld_poly_pointwise_montgomery_rv32im_asm \ + MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) +void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) + /* check-magic: off */ + requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) /* MLD_NTT_BOUND */ + requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) + /* check-magic: on */ + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) +); + +#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */ diff --git a/dev/riscv32/src/intt_rv32im_asm.S b/dev/riscv32/src/intt_rv32im_asm.S new file mode 100644 index 000000000..04a7f6c68 --- /dev/null +++ b/dev/riscv32/src/intt_rv32im_asm.S @@ -0,0 +1,348 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA inverse NTT. 
+ * + * Layered structure: 2+2+2+2 (mirror of the forward NTT, with passes + * applied in reverse layer order). Each pass merges two C-layers into a + * radix-4 inner kernel that holds 4 coefficients in registers. + * + * inv-pass-1: C-layers 8, 7 (inner stride = 4 B, 64 outer iters) + * inv-pass-2: C-layers 6, 5 (inner stride = 16 B, 16 outer iters) + * inv-pass-3: C-layers 4, 3 (inner stride = 64 B, 4 outer iters) + * inv-pass-4: C-layers 2, 1 (inner stride = 256 B, 1 outer iter ) + * + * Twiddles: this routine reuses `mld_rv32im_ntt_zetas` (the forward-NTT + * table). The forward pass-(5-k) consumes its 3*N_outer pairs in + * outer order 0,1,...,N-1; the inv pass-k requires the *same* zetas but + * in reverse outer order, with the two "hi" zetas swapped. We implement + * this by initializing zeta_ptr at the end of each pass region and + * subtracting 24 bytes per outer iter; within the iter the lo zeta is + * read from offset 0 and the hi zetas from offsets 8/16 swapped via the + * GS kernel argument order. The negation that the C reference applies + * (`-mld_zetas[k]`) is absorbed by the GS butterfly form + * a' = a + b + * b' = montmul(b - a, +zeta) + * which produces the same result as the canonical + * t = a; a' = t + b; b' = montmul(t - b, -zeta). + * + * Modular arithmetic: standard signed Montgomery (3-mul kernel + * m = low(a*z'), r = hi(a*z) - hi(m*q) + * ), matching the forward NTT. + * + * Final scaling: after the four passes, every coefficient is multiplied + * by f = 41978 = 2^{64-8} mod q (Montgomery-form, accounts for both + * 2^{-8} of the inverse NTT and the 2^32 left over from intermediate + * reductions). Implemented as a simple post-loop. 
+ * + * Bounds (after each inv-pass): + * + * start : |coef| < q (= 1*q) + * after inv-pass-1 (C-L 8,7) : |coef| < 4*q + * after inv-pass-2 (C-L 6,5) : |coef| < 16*q + * after inv-pass-3 (C-L 4,3) : |coef| < 64*q + * after inv-pass-4 (C-L 2,1) : |coef| < 256*q (~ 2^31, fits int32) + * after final fqscale : |coef| < q + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +/***************************************************************** + * Register aliases + *****************************************************************/ + +/* Arguments */ +#define in_ptr a0 +#define zeta_ptr a1 + +/* Working pointers / counters */ +#define data t2 +#define outer_end t3 +#define inner_end t4 +#define scale_end t5 /* end pointer for final-scaling loop */ + +/* Coefficient registers */ +#define ca a2 +#define cb a3 +#define cc a4 +#define cd a5 + +/* Butterfly temporaries */ +#define tmp0 a6 +#define tmp1 a7 + +/* Loaded zeta pair registers */ +#define zeta_lo s0 +#define zeta_lo_tw s1 +#define zeta_h0 s2 +#define zeta_h0_tw s3 +#define zeta_h1 s4 +#define zeta_h1_tw s5 + +/* Constants */ +#define q s6 /* MLDSA_Q = 8380417 */ +#define f s7 /* fqscale: 41978 */ +#define f_tw s8 /* fqscale * QINV mod 2^32 */ + +/***************************************************************** + * Macros + *****************************************************************/ + +/* montmul rd, ra, rb, rb_tw, rt + * + * rd = (ra * rb) * R^-1 mod q (signed Montgomery, R = 2^32). + * |rd| < q. + * Clobbers: rt. + */ +.macro montmul rd, ra, rb, rb_tw, rt + mul \rt, \ra, \rb_tw + mulh \rd, \ra, \rb + mulh \rt, \rt, q + sub \rd, \rd, \rt +.endm + +/* gs_bfly ra, rb, rzeta, rzeta_tw, rt0, rt1 : + * + * t = rb - ra + * ra = ra + rb + * rb = montmul(t, +rzeta) + * + * Gentleman-Sande butterfly. 
Each application grows |coef| by + * a factor of 2 (or by q, whichever is greater): the additive part + * doubles, the multiplicative part is bounded by q. + * + * The algebraic equivalence with the C reference's + * t = ra; ra = t + rb; rb = montmul(t - rb, -zeta) + * follows from + * montmul(t - rb, -zeta) = -montmul(t - rb, +zeta) + * = montmul(rb - t, +zeta) + * = montmul(rb - ra, +zeta) (t == ra) + * which is what this macro computes. This lets us reuse the + * (un-negated) forward-NTT zeta table. + * + * Clobbers: rt0, rt1. + */ +.macro gs_bfly ra, rb, rzeta, rzeta_tw, rt0, rt1 + sub \rt0, \rb, \ra + add \ra, \ra, \rb + montmul \rb, \rt0, \rzeta, \rzeta_tw, \rt1 +.endm + +/* gs_radix4 stride : + * + * Reads four coefficients from offsets [0, s, 2s, 3s] of `data`, + * applies the inverse-NTT radix-4 kernel using the loaded zetas, + * writes them back. + * + * Within a single inv-pass: + * - "Inner" layer (the smaller-stride C-layer, run first) pairs + * (a,b) and (c,d). The C reference uses two distinct zetas here + * (k = (1< 0x007FE001. + * (Keeps everything within signed 12-bit immediate range.) */ + lui q, 0x7FE + addi q, q, 1 + + /*************************************************** + * Pass 1: C-layers 1, 2. + * 1 outer iter, 64 inner iters, butterfly stride = 256 B. + ***************************************************/ + load_outer_zetas + mv data, in_ptr + addi inner_end, in_ptr, 256 /* 64 * 4 B */ +ntt_rv32im_p1_loop: + radix4_kernel 256 + addi data, data, 4 + bne data, inner_end, ntt_rv32im_p1_loop + + /*************************************************** + * Pass 2: C-layers 3, 4. + * 4 outer iters, 16 inner iters each, stride = 64 B. + * Each outer block is 256 B (= 64 coefs). 
+ ***************************************************/ + mv data, in_ptr + addi outer_end, in_ptr, 1024 +ntt_rv32im_p2_outer: + load_outer_zetas + addi inner_end, data, 64 /* 16 * 4 B */ +ntt_rv32im_p2_inner: + radix4_kernel 64 + addi data, data, 4 + bne data, inner_end, ntt_rv32im_p2_inner + addi data, data, (256 - 64) /* skip to next 256 B block */ + bne data, outer_end, ntt_rv32im_p2_outer + + /*************************************************** + * Pass 3: C-layers 5, 6. + * 16 outer iters, 4 inner iters each, stride = 16 B. + * Each outer block is 64 B (= 16 coefs). + ***************************************************/ + mv data, in_ptr + addi outer_end, in_ptr, 1024 +ntt_rv32im_p3_outer: + load_outer_zetas + addi inner_end, data, 16 /* 4 * 4 B */ +ntt_rv32im_p3_inner: + radix4_kernel 16 + addi data, data, 4 + bne data, inner_end, ntt_rv32im_p3_inner + addi data, data, (64 - 16) /* skip to next 64 B block */ + bne data, outer_end, ntt_rv32im_p3_outer + + /*************************************************** + * Pass 4: C-layers 7, 8. + * 64 outer iters, 1 inner iter each, stride = 4 B. + * Each outer iter handles 4 consecutive coefficients. + ***************************************************/ + mv data, in_ptr + addi outer_end, in_ptr, 1024 +ntt_rv32im_p4_outer: + load_outer_zetas + radix4_kernel 4 + addi data, data, 16 + bne data, outer_end, ntt_rv32im_p4_outer + + restore_regs + ret + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef in_ptr +#undef zeta_ptr +#undef data +#undef outer_end +#undef inner_end +#undef ca +#undef cb +#undef cc +#undef cd +#undef tmp0 +#undef tmp1 +#undef zeta_lo +#undef zeta_lo_tw +#undef zeta_h0 +#undef zeta_h0_tw +#undef zeta_h1 +#undef zeta_h1_tw +#undef q + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S new file mode 100644 index 000000000..4d51c9afc --- /dev/null +++ b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S @@ -0,0 +1,123 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery + * reduction. Computes + * + * a[i] = (a[i] * b[i]) * R^-1 mod q, R = 2^32, |result| < q, + * + * for i in 0..256, in-place in a. + * + * Modular arithmetic: standard signed Montgomery reduction. Unlike the + * NTT, neither operand is constant, so we can't precompute a twisted + * form -- the kernel uses 4 multiplies per coefficient: + * + * plo = low (a * b) ; mul + * m = low (plo * QINV) ; mul (low 32 of (plo * QINV)) + * phi = high(a * b) ; mulh + * mh = high(m * q) ; mulh + * r = phi - mh ; sub + * + * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9*q. The product + * is bounded by (9q)^2 < 2^31 * q, well within the safe input range + * for `mld_montgomery_reduce` (which is |a| <= 2^31 * q). 
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +/***************************************************************** + * Register aliases + *****************************************************************/ + +/* Arguments */ +#define a_ptr a0 +#define b_ptr a1 + +/* Loop control */ +#define a_end t0 /* end-of-array sentinel for a_ptr */ + +/* Per-coef working set (caller-saved) */ +#define a_val a2 +#define b_val a3 +#define plo a4 +#define phi a5 +#define mlo a6 +#define mhi a7 + +/* Constants (callee-saved; spilled to the stack frame below) */ +#define q s0 /* MLDSA_Q = 8380417 */ +#define qinv s1 /* QINV = 58728449 */ + +/***************************************************************** + * Function + *****************************************************************/ + + .text + .global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) + .balign 4 +MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm) + + addi sp, sp, -16 /* 16 B frame: the RISC-V psABI requires sp to stay 16-byte aligned */ + sw s0, 0(sp) /* save callee-saved s0 (q) */ + sw s1, 4(sp) /* save callee-saved s1 (qinv); bytes 8..15 are alignment padding */ + + /* q = 0x007FE001 */ + lui q, 0x7FE + addi q, q, 1 + /* qinv = 0x03802001 = 58728449 + * lui qinv, 0x3802; addi qinv, qinv, 1 -> 0x03802001 */ + lui qinv, 0x3802 + addi qinv, qinv, 1 + + addi a_end, a_ptr, 1024 /* 256 * 4 bytes */ + +poly_pointwise_montgomery_rv32im_loop: + lw a_val, 0(a_ptr) + lw b_val, 0(b_ptr) + + /* Standard signed Montgomery reduction of a*b: + * plo = (a*b) low 32 + * mlo = plo*QINV low 32 + * phi = (a*b) high 32 (signed) + * mhi = mlo*q high 32 (signed) + * res = phi - mhi + */ + mul plo, a_val, b_val + mul mlo, plo, qinv + mulh phi, a_val, b_val + mulh mhi, mlo, q + sub a_val, phi, mhi + + sw a_val, 0(a_ptr) /* result overwrites a[i] in place */ + + addi a_ptr, a_ptr, 4 + addi b_ptr, b_ptr, 4 + bne a_ptr, a_end, poly_pointwise_montgomery_rv32im_loop + + lw s0, 0(sp) /* restore callee-saved registers */ + lw s1, 4(sp) + addi sp, sp, 16 /* release the 16 B frame */ + ret + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef a_ptr +#undef b_ptr +#undef a_end +#undef a_val +#undef b_val +#undef plo +#undef phi +#undef mlo +#undef mhi +#undef q +#undef qinv + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/riscv32/src/rv32im_zetas.c b/dev/riscv32/src/rv32im_zetas.c new file mode 100644 index 000000000..6b7d67e2a --- /dev/null +++ b/dev/riscv32/src/rv32im_zetas.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "arith_native_rv32im.h" + +/* + * Table of zeta values used in the RV32-IM forward NTT. + * Each entry is a (zeta, zeta * QINV mod 2^32) pair, with + * zeta in Montgomery form. See autogen for details. 
+ */ +MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t + mld_rv32im_ntt_zetas[510] = { + 25847, 1830765815, -2608894, -1929875198, -518909, -1927777021, + 237124, 1640767044, 1826347, 308362795, 2353451, -1815525077, + -777960, 1477910808, -359251, -1374673747, -2091905, -1091570561, + -876248, 1612161320, 3119733, -1929495947, -2884855, 515185417, + 466468, 1640734244, 3111497, -285697463, 2680103, 625853735, + 2725464, 1727305304, 2706023, -1846138265, 95776, -1631226336, + 1024112, 2082316400, 3077325, -1404529459, 3530437, 1838055109, + -1079900, -1364982364, -1661693, 1594295555, -3592148, -1076973524, + 3585928, 858240904, -2537516, -1898723372, 3915439, -594436433, + -549488, 1806278032, -3861115, -202001019, -3043716, -475984260, + -1119584, 222489248, 3574422, -561427818, -2867647, 1797021249, + 2619752, -346752664, 3539968, -1061813248, -300467, 2059733581, + -2108549, 684667771, 2348700, -1661512036, -539299, -1104976547, + -2118186, 1654287830, -1699267, -1750224323, -1643818, -901666090, + -3859737, -878576921, 3505694, 418987550, -3821735, 1831915353, + -1399561, -1257667337, 3507263, -1925356481, -2140649, 992097815, + -3277672, -748618600, -1600420, 879957084, 3699596, 2024403852, + 1757237, 329347125, 811944, 1484874664, 531354, -1636082790, + -19422, 1837364258, 954230, -285388938, 3881043, -1983539117, + 4010497, -1443016191, 3900724, -1495136972, -2556880, -950076368, + 280005, -1170414139, 2071892, -1714807468, -2797779, -952438995, + -3930395, -1574918427, 2091667, -898413, 3407706, 991903578, + -1528703, -654783359, 2316500, 1363007700, 3817976, 746144248, + -3677745, 1350681039, -3342478, -1363460238, 2244091, 912367099, + -3041255, -1974159335, -2446433, 30313375, -3562462, -1420958686, + -1452451, -2143979939, 266997, -605900043, 2434439, -44694137, + 3475950, 1651689966, -1235728, -326425360, 3513181, 2032221021, + 2176455, 1599739335, -3520352, 2027833504, -3759364, 1176904444, + -1585221, 140455867, -1197226, 1683520342, -3193378, 
1904936414, + -1257611, -1285853323, 900702, 14253662, 1859098, -421552614, + 1939314, -1039411342, 909542, -517299994, 819034, 1257750362, + -4083598, -993005454, 495491, 1014493059, -1613174, -818371958, + -1000202, 1955560694, -43260, 2027935492, -522500, 1926727420, + -3190144, -1440787840, -655327, 863641633, -3122442, 1747917558, + -3157330, 1529189038, 2031748, -1372618620, 3207046, 1931587462, + -3632928, 568627424, -3556995, 1819892093, -525098, -325927722, + 126922, -2131021878, -768622, 128353682, -3595838, 1258381762, + 3412210, -783134478, 342297, 2124962073, 286988, 908452108, + -983419, -247357819, -2437823, -1123881663, 4108315, 885133339, + 2147896, -588790216, 3437287, -1223601433, -3342277, 1851023419, + 2715295, 1518161567, 1735879, 137583815, 203044, 1629985060, + -2967645, 289871779, 2842341, -1920467227, 2691481, -1176751719, + -3693493, -86965173, -2590150, -635454918, 1265009, 1967222129, + -411027, -1262003603, 4055324, -1637785316, 1247620, -1354528380, + -2477047, 1708872713, 2486353, -642772911, 1595974, 6363718, + -671102, 2135294594, -3767016, -1536588520, 1250494, -72690498, + -1228525, 1787797779, 2635921, 45766801, -3548272, -1287922800, + -22981, -1018755525, -2994039, 694382729, 1869119, -314284737, + -1308169, 1638590967, 1903435, 671509323, -1050970, 1136965286, + -381987, -889861155, -1333058, 235104446, 1237275, 985022747, + 1349076, -120646188, -3318210, -2070602178, -1430225, 1779436847, + 1852771, 1665705315, -451100, -1045062172, 1312455, 963438279, + -1430430, -1669960606, 3306115, 419615363, -1962642, 1116720494, + -3343383, 1321868265, -1279661, 831969619, 1917081, -1078959975, + 264944, -916321552, -2546312, 1216882040, -1374803, 1042326957, + 508951, 1225434135, 1500165, -300448763, 777191, 604552167, + 3097992, 1155548552, 2235880, -270590488, 3406031, 1405999311, + 44288, -1784632064, -542412, 756955444, -2831860, -1021949428, + -1100098, 2143745726, -1671176, -1276805128, -1846953, 713994583, + 904516, 666258756, 
-2584293, -260312805, -3724270, 608791570, + 3958618, 1210558298, 594136, 371462360, -3776993, 940195359, + -3724342, 675310538, -2013608, 1554794072, 2432395, 173440395, + -8578, -1261461890, 2454455, -1357098057, -164721, -1542497137, + 1653064, -1555941048, 1957272, 1339088280, 3369112, -2126092136, + -3249728, -318346816, 185531, -384158533, -1207385, 2061661095, + 2389356, -1999506068, -3183426, -2040058690, 162844, -1316619236, + -210977, 628664287, 1616392, 827959816, 3014001, -883155599, + 759969, -1499481951, 810149, -853476187, 1652634, -1039370342, + -1316856, -1729304568, -3694233, -596344473, -1799107, 1726753853, + 189548, -695180180, -3038916, -2047270596, 3523897, 6087993, + -3553272, 1422575624, 3866901, 702390549, 269760, -1547952704, + 3159746, -1375177022, 2213111, -1723816713, -975884, -110126092, + -1851402, 1424130038, 1717735, -279505433, 472078, 394851342, + -2409325, 1777179795, -426683, -1591599803, 1723600, 565464272, + -177440, -1185330464, -1803090, -260424530, 1910376, 283780712, + 1315589, 334803717, -1667432, -440824168, -1104333, -1758099917, + 1341330, 235321234, -260646, -71875110, -3833893, 776003547, + 1285669, -178766299, -2939036, 1119856484, -2235985, -1600929361, + -1584928, 168022240, -420899, -1208667171, -2286327, 1123958025, + -812732, -518252220, 183443, 1544891539, -976891, 879867909, + -1439742, 1206536194, 1612842, -1499603926, -3545687, 201262505, + -3019102, 1957047970, -554416, 155290192, 3919660, -1809756372, + -3881060, 985155484, -48306, 2036925262, -1362209, 1934038751, + -3628969, 1146323031, 3937738, -973777462, 1400424, 400711272, + 3839961, -894060583, -846154, -540420426, 1976782, 374860238, +}; + +#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(rv32im_zetas) + +#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/flake.nix b/flake.nix index 7714eec95..8c4f7683f 100644 --- a/flake.nix +++ b/flake.nix @@ -158,7 +158,7 @@ # 
autogen shell with cross compiler for the "other" architecture devShells.cross-autogen = util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; } + packages = builtins.attrValues { inherit (config.packages) linters toolchain_riscv32; inherit (pkgs) gcc-arm-embedded; } ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ] ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ]; }; diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index ea9901768..7265c63eb 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -95,6 +95,9 @@ #include "src/native/x86_64/src/rej_uniform_eta4_avx2.c" #include "src/native/x86_64/src/rej_uniform_table.c" #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +#include "src/native/rv32im/src/rv32im_zetas.c" +#endif #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -772,5 +775,22 @@ #undef MLD_NATIVE_X86_64_SRC_CONSTS_H #undef mld_qdata #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +/* + * Undefine macros from native code (Arith, RV32IM) + */ +/* mldsa/src/native/rv32im/meta.h */ +#undef MLD_ARITH_BACKEND_RV32IM +#undef MLD_NATIVE_RV32IM_META_H +#undef MLD_USE_NATIVE_INTT +#undef MLD_USE_NATIVE_NTT +#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY +/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */ +#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#undef mld_intt_rv32im_asm +#undef mld_ntt_rv32im_asm +#undef mld_poly_pointwise_montgomery_rv32im_asm +#undef mld_rv32im_ntt_zetas +#endif /* MLD_SYS_RISCV32 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index 5e3c2d0de..3b0fa704e 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -88,6 +88,11 @@ #include 
"src/native/x86_64/src/pointwise_avx2_asm.S" #include "src/native/x86_64/src/poly_caddq_avx2_asm.S" #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +#include "src/native/rv32im/src/intt_rv32im_asm.S" +#include "src/native/rv32im/src/ntt_rv32im_asm.S" +#include "src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S" +#endif #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -779,5 +784,22 @@ #undef MLD_NATIVE_X86_64_SRC_CONSTS_H #undef mld_qdata #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +/* + * Undefine macros from native code (Arith, RV32IM) + */ +/* mldsa/src/native/rv32im/meta.h */ +#undef MLD_ARITH_BACKEND_RV32IM +#undef MLD_NATIVE_RV32IM_META_H +#undef MLD_USE_NATIVE_INTT +#undef MLD_USE_NATIVE_NTT +#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY +/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */ +#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#undef mld_intt_rv32im_asm +#undef mld_ntt_rv32im_asm +#undef mld_poly_pointwise_montgomery_rv32im_asm +#undef mld_rv32im_ntt_zetas +#endif /* MLD_SYS_RISCV32 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mldsa/src/native/meta.h b/mldsa/src/native/meta.h index 0b46dd579..248974a90 100644 --- a/mldsa/src/native/meta.h +++ b/mldsa/src/native/meta.h @@ -20,4 +20,10 @@ #include "x86_64/meta.h" #endif +/* We do not yet include the arithmetic backend for RV32-IM by default + * as it is still experimental and undergoing review. 
*/ +/* #if defined(MLD_SYS_RISCV32) */ +/* #include "rv32im/meta.h" */ +/* #endif */ + #endif /* !MLD_NATIVE_META_H */ diff --git a/mldsa/src/native/rv32im/meta.h b/mldsa/src/native/rv32im/meta.h new file mode 100644 index 000000000..a83cd62c1 --- /dev/null +++ b/mldsa/src/native/rv32im/meta.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_RV32IM_META_H +#define MLD_NATIVE_RV32IM_META_H + +/* Set of primitives that this backend replaces */ +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLD_ARITH_BACKEND_RV32IM + + +#if !defined(__ASSEMBLER__) +#include "../api.h" +#include "src/arith_native_rv32im.h" + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N]) +{ + mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N]) +{ + mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_pointwise_montgomery_native( + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) +{ + mld_poly_pointwise_montgomery_rv32im_asm(a, b); + return MLD_NATIVE_FUNC_SUCCESS; +} + +#endif /* !__ASSEMBLER__ */ +#endif /* !MLD_NATIVE_RV32IM_META_H */ diff --git a/mldsa/src/native/rv32im/src/arith_native_rv32im.h b/mldsa/src/native/rv32im/src/arith_native_rv32im.h new file mode 100644 index 000000000..03da705ad --- /dev/null +++ b/mldsa/src/native/rv32im/src/arith_native_rv32im.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef 
MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H + +#include "../../../cbmc.h" +#include "../../../common.h" + +#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas) + +/* + * Forward NTT zeta table for the RV32-IM backend. + * + * 255 logical entries, each a (zeta, zeta * QINV mod 2^32) pair, with + * zeta in Montgomery form (i.e. R * w^{bitrev_8(k)} mod q where R = 2^32). + * The order matches the consumption order of the 2+2+2+2 forward NTT. + */ +MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510]; + +#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm) +void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(r, 0, MLDSA_N, 9 * MLDSA_Q)) +); + +#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm) +void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) +); + +#define mld_poly_pointwise_montgomery_rv32im_asm \ + MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) +void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) + /* check-magic: off */ + requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) /* MLD_NTT_BOUND */ + requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) + /* check-magic: on */ + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) +); + +#endif /* 
!MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */ diff --git a/mldsa/src/native/rv32im/src/intt_rv32im_asm.S b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S new file mode 100644 index 000000000..81ed5716c --- /dev/null +++ b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S @@ -0,0 +1,306 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA inverse NTT. + * + * Layered structure: 2+2+2+2 (mirror of the forward NTT, with passes + * applied in reverse layer order). Each pass merges two C-layers into a + * radix-4 inner kernel that holds 4 coefficients in registers. + * + * inv-pass-1: C-layers 8, 7 (inner stride = 4 B, 64 outer iters) + * inv-pass-2: C-layers 6, 5 (inner stride = 16 B, 16 outer iters) + * inv-pass-3: C-layers 4, 3 (inner stride = 64 B, 4 outer iters) + * inv-pass-4: C-layers 2, 1 (inner stride = 256 B, 1 outer iter ) + * + * Twiddles: this routine reuses `mld_rv32im_ntt_zetas` (the forward-NTT + * table). The forward pass-(5-k) consumes its 3*N_outer pairs in + * outer order 0,1,...,N-1; the inv pass-k requires the *same* zetas but + * in reverse outer order, with the two "hi" zetas swapped. We implement + * this by initializing zeta_ptr at the end of each pass region and + * subtracting 24 bytes per outer iter; within the iter the lo zeta is + * read from offset 0 and the hi zetas from offsets 8/16 swapped via the + * GS kernel argument order. The negation that the C reference applies + * (`-mld_zetas[k]`) is absorbed by the GS butterfly form + * a' = a + b + * b' = montmul(b - a, +zeta) + * which produces the same result as the canonical + * t = a; a' = t + b; b' = montmul(t - b, -zeta). + * + * Modular arithmetic: standard signed Montgomery (3-mul kernel + * m = low(a*z'), r = hi(a*z) - hi(m*q) + * ), matching the forward NTT. 
+ * + * Final scaling: after the four passes, every coefficient is multiplied + * by f = 41978 = 2^{64-8} mod q (Montgomery-form, accounts for both + * 2^{-8} of the inverse NTT and the 2^32 left over from intermediate + * reductions). Implemented as a simple post-loop. + * + * Bounds (after each inv-pass): + * + * start : |coef| < q (= 1*q) + * after inv-pass-1 (C-L 8,7) : |coef| < 4*q + * after inv-pass-2 (C-L 6,5) : |coef| < 16*q + * after inv-pass-3 (C-L 4,3) : |coef| < 64*q + * after inv-pass-4 (C-L 2,1) : |coef| < 256*q (~ 2^31, fits int32) + * after final fqscale : |coef| < q + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/intt_rv32im_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(intt_rv32im_asm) +MLD_ASM_FN_SYMBOL(intt_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x30 + .cfi_adjust_cfa_offset 0x30 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + sw s2, 0x8(sp) + sw s3, 0xc(sp) + sw s4, 0x10(sp) + sw s5, 0x14(sp) + sw s6, 0x18(sp) + sw s7, 0x1c(sp) + sw s8, 0x20(sp) + lui s6, 0x7fe + addi s6, s6, 0x1 + addi a1, a1, 0x7f8 + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p1_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + lw a2, 0x0(t2) + lw a3, 0x4(t2) + lw a4, 0x8(t2) + lw a5, 0xc(t2) + sub a6, a3, a2 + add a2, a2, a3 + mul a7, a6, s5 + mulh a3, a6, s4 + mulh a7, a7, s6 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mul a7, a6, s3 + mulh a5, a6, s2 + mulh a7, a7, s6 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mul a7, a6, s1 + mulh a4, a6, s0 + mulh a7, a7, s6 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mul a7, a6, s1 + mulh a5, a6, s0 + mulh a7, a7, s6 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x4(t2) + sw a4, 0x8(t2) + sw a5, 
0xc(t2) + addi t2, t2, 0x10 + bne t2, t3, Lintt_rv32im_p1_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p2_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi t4, t2, 0x10 + +Lintt_rv32im_p2_inner: + lw a2, 0x0(t2) + lw a3, 0x10(t2) + lw a4, 0x20(t2) + lw a5, 0x30(t2) + sub a6, a3, a2 + add a2, a2, a3 + mul a7, a6, s5 + mulh a3, a6, s4 + mulh a7, a7, s6 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mul a7, a6, s3 + mulh a5, a6, s2 + mulh a7, a7, s6 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mul a7, a6, s1 + mulh a4, a6, s0 + mulh a7, a7, s6 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mul a7, a6, s1 + mulh a5, a6, s0 + mulh a7, a7, s6 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x10(t2) + sw a4, 0x20(t2) + sw a5, 0x30(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p2_inner + addi t2, t2, 0x30 + bne t2, t3, Lintt_rv32im_p2_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p3_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi t4, t2, 0x40 + +Lintt_rv32im_p3_inner: + lw a2, 0x0(t2) + lw a3, 0x40(t2) + lw a4, 0x80(t2) + lw a5, 0xc0(t2) + sub a6, a3, a2 + add a2, a2, a3 + mul a7, a6, s5 + mulh a3, a6, s4 + mulh a7, a7, s6 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mul a7, a6, s3 + mulh a5, a6, s2 + mulh a7, a7, s6 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mul a7, a6, s1 + mulh a4, a6, s0 + mulh a7, a7, s6 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mul a7, a6, s1 + mulh a5, a6, s0 + mulh a7, a7, s6 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x40(t2) + sw a4, 0x80(t2) + sw a5, 0xc0(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p3_inner + addi t2, t2, 0xc0 + bne t2, t3, Lintt_rv32im_p3_outer + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + mv t2, a0 
+ addi t4, a0, 0x100 + +Lintt_rv32im_p4_inner: + lw a2, 0x0(t2) + lw a3, 0x100(t2) + lw a4, 0x200(t2) + lw a5, 0x300(t2) + sub a6, a3, a2 + add a2, a2, a3 + mul a7, a6, s5 + mulh a3, a6, s4 + mulh a7, a7, s6 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mul a7, a6, s3 + mulh a5, a6, s2 + mulh a7, a7, s6 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mul a7, a6, s1 + mulh a4, a6, s0 + mulh a7, a7, s6 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mul a7, a6, s1 + mulh a5, a6, s0 + mulh a7, a7, s6 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x100(t2) + sw a4, 0x200(t2) + sw a5, 0x300(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p4_inner + lui s7, 0xa + addi s7, s7, 0x3fa + lui s8, 0xff7fe + addi s8, s8, 0x3fa + mv t2, a0 + addi t5, a0, 0x400 + +Lintt_rv32im_scale: + lw a2, 0x0(t2) + mul a6, a2, s8 + mulh a3, a2, s7 + mulh a6, a6, s6 + sub a3, a3, a6 + sw a3, 0x0(t2) + addi t2, t2, 0x4 + bne t2, t5, Lintt_rv32im_scale + lw s0, 0x0(sp) + lw s1, 0x4(sp) + lw s2, 0x8(sp) + lw s3, 0xc(sp) + lw s4, 0x10(sp) + lw s5, 0x14(sp) + lw s6, 0x18(sp) + lw s7, 0x1c(sp) + lw s8, 0x20(sp) + addi sp, sp, 0x30 + .cfi_adjust_cfa_offset -0x30 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(intt_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S new file mode 100644 index 000000000..9a61090a5 --- /dev/null +++ b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S @@ -0,0 +1,255 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA forward NTT. + * + * Layered structure: 2+2+2+2 (four passes, each merging two layers, with + * a radix-4 inner kernel holding 4 coefficients in registers). + * + * Modular arithmetic: standard signed Montgomery multiplication. 
+ * Each zeta is provided in Montgomery form (R * w^{bitrev(k)} mod q, + * R = 2^32) along with its precomputed twist z' = (z * QINV) mod 2^32, + * so a Montgomery multiply is 3 multiplies + 1 sub: + * + * m = low(a * z') + * r = hi(a * z) - hi(m * q) + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/ntt_rv32im_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(ntt_rv32im_asm) +MLD_ASM_FN_SYMBOL(ntt_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x20 + .cfi_adjust_cfa_offset 0x20 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + sw s2, 0x8(sp) + sw s3, 0xc(sp) + sw s4, 0x10(sp) + sw s5, 0x14(sp) + sw s6, 0x18(sp) + lui s6, 0x7fe + addi s6, s6, 0x1 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + mv t2, a0 + addi t4, a0, 0x100 + +Lntt_rv32im_p1_loop: + lw a2, 0x0(t2) + lw a3, 0x100(t2) + lw a4, 0x200(t2) + lw a5, 0x300(t2) + mul a7, a4, s1 + mulh a6, a4, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mul a7, a5, s1 + mulh a6, a5, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mul a7, a3, s3 + mulh a6, a3, s2 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mul a7, a5, s5 + mulh a6, a5, s4 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x100(t2) + sw a4, 0x200(t2) + sw a5, 0x300(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p1_loop + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p2_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + addi t4, t2, 0x40 + +Lntt_rv32im_p2_inner: + lw a2, 0x0(t2) + lw a3, 0x40(t2) + lw a4, 0x80(t2) + lw a5, 0xc0(t2) + mul a7, a4, s1 + mulh a6, 
a4, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mul a7, a5, s1 + mulh a6, a5, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mul a7, a3, s3 + mulh a6, a3, s2 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mul a7, a5, s5 + mulh a6, a5, s4 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x40(t2) + sw a4, 0x80(t2) + sw a5, 0xc0(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p2_inner + addi t2, t2, 0xc0 + bne t2, t3, Lntt_rv32im_p2_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p3_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + addi t4, t2, 0x10 + +Lntt_rv32im_p3_inner: + lw a2, 0x0(t2) + lw a3, 0x10(t2) + lw a4, 0x20(t2) + lw a5, 0x30(t2) + mul a7, a4, s1 + mulh a6, a4, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mul a7, a5, s1 + mulh a6, a5, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mul a7, a3, s3 + mulh a6, a3, s2 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mul a7, a5, s5 + mulh a6, a5, s4 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x10(t2) + sw a4, 0x20(t2) + sw a5, 0x30(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p3_inner + addi t2, t2, 0x30 + bne t2, t3, Lntt_rv32im_p3_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p4_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + lw a2, 0x0(t2) + lw a3, 0x4(t2) + lw a4, 0x8(t2) + lw a5, 0xc(t2) + mul a7, a4, s1 + mulh a6, a4, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mul a7, a5, s1 + mulh a6, a5, s0 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mul a7, a3, s3 + mulh a6, a3, s2 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a3, 
a2, a6 + add a2, a2, a6 + mul a7, a5, s5 + mulh a6, a5, s4 + mulh a7, a7, s6 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x4(t2) + sw a4, 0x8(t2) + sw a5, 0xc(t2) + addi t2, t2, 0x10 + bne t2, t3, Lntt_rv32im_p4_outer + lw s0, 0x0(sp) + lw s1, 0x4(sp) + lw s2, 0x8(sp) + lw s3, 0xc(sp) + lw s4, 0x10(sp) + lw s5, 0x14(sp) + lw s6, 0x18(sp) + addi sp, sp, 0x20 + .cfi_adjust_cfa_offset -0x20 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(ntt_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S new file mode 100644 index 000000000..606a13379 --- /dev/null +++ b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S @@ -0,0 +1,79 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery + * reduction. Computes + * + * a[i] = (a[i] * b[i]) * R^-1 mod q, R = 2^32, |result| < q, + * + * for i in 0..256, in-place in a. + * + * Modular arithmetic: standard signed Montgomery reduction. Unlike the + * NTT, neither operand is constant, so we can't precompute a twisted + * form -- the kernel uses 4 multiplies per coefficient: + * + * plo = low (a * b) ; mul + * m = low (plo * QINV) ; mul (low 32 of (plo * QINV)) + * phi = high(a * b) ; mulh + * mh = high(m * q) ; mulh + * r = phi - mh ; sub + * + * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9*q. The product + * is bounded by (9q)^2 < 2^31 * q, well within the safe input range + * for `mld_montgomery_reduce` (which is |a| <= 2^31 * q). 
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) +MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x10 /* 16-byte frame keeps sp aligned as the RISC-V psABI requires */ + .cfi_adjust_cfa_offset 0x10 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + lui s0, 0x7fe + addi s0, s0, 0x1 + lui s1, 0x3802 + addi s1, s1, 0x1 + addi t0, a0, 0x400 + +Lpoly_pointwise_montgomery_rv32im_loop: + lw a2, 0x0(a0) + lw a3, 0x0(a1) + mul a4, a2, a3 + mul a6, a4, s1 + mulh a5, a2, a3 + mulh a7, a6, s0 + sub a2, a5, a7 + sw a2, 0x0(a0) + addi a0, a0, 0x4 + addi a1, a1, 0x4 + bne a0, t0, Lpoly_pointwise_montgomery_rv32im_loop + lw s0, 0x0(sp) + lw s1, 0x4(sp) + addi sp, sp, 0x10 + .cfi_adjust_cfa_offset -0x10 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(poly_pointwise_montgomery_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/rv32im_zetas.c b/mldsa/src/native/rv32im/src/rv32im_zetas.c new file mode 100644 index 000000000..6b7d67e2a --- /dev/null +++ b/mldsa/src/native/rv32im/src/rv32im_zetas.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "arith_native_rv32im.h" + +/* + * Table of zeta values used in the RV32-IM forward NTT.
+ * Each entry is a (zeta, zeta * QINV mod 2^32) pair, with + * zeta in Montgomery form. See autogen for details. + */ +MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t + mld_rv32im_ntt_zetas[510] = { + 25847, 1830765815, -2608894, -1929875198, -518909, -1927777021, + 237124, 1640767044, 1826347, 308362795, 2353451, -1815525077, + -777960, 1477910808, -359251, -1374673747, -2091905, -1091570561, + -876248, 1612161320, 3119733, -1929495947, -2884855, 515185417, + 466468, 1640734244, 3111497, -285697463, 2680103, 625853735, + 2725464, 1727305304, 2706023, -1846138265, 95776, -1631226336, + 1024112, 2082316400, 3077325, -1404529459, 3530437, 1838055109, + -1079900, -1364982364, -1661693, 1594295555, -3592148, -1076973524, + 3585928, 858240904, -2537516, -1898723372, 3915439, -594436433, + -549488, 1806278032, -3861115, -202001019, -3043716, -475984260, + -1119584, 222489248, 3574422, -561427818, -2867647, 1797021249, + 2619752, -346752664, 3539968, -1061813248, -300467, 2059733581, + -2108549, 684667771, 2348700, -1661512036, -539299, -1104976547, + -2118186, 1654287830, -1699267, -1750224323, -1643818, -901666090, + -3859737, -878576921, 3505694, 418987550, -3821735, 1831915353, + -1399561, -1257667337, 3507263, -1925356481, -2140649, 992097815, + -3277672, -748618600, -1600420, 879957084, 3699596, 2024403852, + 1757237, 329347125, 811944, 1484874664, 531354, -1636082790, + -19422, 1837364258, 954230, -285388938, 3881043, -1983539117, + 4010497, -1443016191, 3900724, -1495136972, -2556880, -950076368, + 280005, -1170414139, 2071892, -1714807468, -2797779, -952438995, + -3930395, -1574918427, 2091667, -898413, 3407706, 991903578, + -1528703, -654783359, 2316500, 1363007700, 3817976, 746144248, + -3677745, 1350681039, -3342478, -1363460238, 2244091, 912367099, + -3041255, -1974159335, -2446433, 30313375, -3562462, -1420958686, + -1452451, -2143979939, 266997, -605900043, 2434439, -44694137, + 3475950, 1651689966, -1235728, -326425360, 3513181, 2032221021, + 
2176455, 1599739335, -3520352, 2027833504, -3759364, 1176904444, + -1585221, 140455867, -1197226, 1683520342, -3193378, 1904936414, + -1257611, -1285853323, 900702, 14253662, 1859098, -421552614, + 1939314, -1039411342, 909542, -517299994, 819034, 1257750362, + -4083598, -993005454, 495491, 1014493059, -1613174, -818371958, + -1000202, 1955560694, -43260, 2027935492, -522500, 1926727420, + -3190144, -1440787840, -655327, 863641633, -3122442, 1747917558, + -3157330, 1529189038, 2031748, -1372618620, 3207046, 1931587462, + -3632928, 568627424, -3556995, 1819892093, -525098, -325927722, + 126922, -2131021878, -768622, 128353682, -3595838, 1258381762, + 3412210, -783134478, 342297, 2124962073, 286988, 908452108, + -983419, -247357819, -2437823, -1123881663, 4108315, 885133339, + 2147896, -588790216, 3437287, -1223601433, -3342277, 1851023419, + 2715295, 1518161567, 1735879, 137583815, 203044, 1629985060, + -2967645, 289871779, 2842341, -1920467227, 2691481, -1176751719, + -3693493, -86965173, -2590150, -635454918, 1265009, 1967222129, + -411027, -1262003603, 4055324, -1637785316, 1247620, -1354528380, + -2477047, 1708872713, 2486353, -642772911, 1595974, 6363718, + -671102, 2135294594, -3767016, -1536588520, 1250494, -72690498, + -1228525, 1787797779, 2635921, 45766801, -3548272, -1287922800, + -22981, -1018755525, -2994039, 694382729, 1869119, -314284737, + -1308169, 1638590967, 1903435, 671509323, -1050970, 1136965286, + -381987, -889861155, -1333058, 235104446, 1237275, 985022747, + 1349076, -120646188, -3318210, -2070602178, -1430225, 1779436847, + 1852771, 1665705315, -451100, -1045062172, 1312455, 963438279, + -1430430, -1669960606, 3306115, 419615363, -1962642, 1116720494, + -3343383, 1321868265, -1279661, 831969619, 1917081, -1078959975, + 264944, -916321552, -2546312, 1216882040, -1374803, 1042326957, + 508951, 1225434135, 1500165, -300448763, 777191, 604552167, + 3097992, 1155548552, 2235880, -270590488, 3406031, 1405999311, + 44288, -1784632064, -542412, 
756955444, -2831860, -1021949428, + -1100098, 2143745726, -1671176, -1276805128, -1846953, 713994583, + 904516, 666258756, -2584293, -260312805, -3724270, 608791570, + 3958618, 1210558298, 594136, 371462360, -3776993, 940195359, + -3724342, 675310538, -2013608, 1554794072, 2432395, 173440395, + -8578, -1261461890, 2454455, -1357098057, -164721, -1542497137, + 1653064, -1555941048, 1957272, 1339088280, 3369112, -2126092136, + -3249728, -318346816, 185531, -384158533, -1207385, 2061661095, + 2389356, -1999506068, -3183426, -2040058690, 162844, -1316619236, + -210977, 628664287, 1616392, 827959816, 3014001, -883155599, + 759969, -1499481951, 810149, -853476187, 1652634, -1039370342, + -1316856, -1729304568, -3694233, -596344473, -1799107, 1726753853, + 189548, -695180180, -3038916, -2047270596, 3523897, 6087993, + -3553272, 1422575624, 3866901, 702390549, 269760, -1547952704, + 3159746, -1375177022, 2213111, -1723816713, -975884, -110126092, + -1851402, 1424130038, 1717735, -279505433, 472078, 394851342, + -2409325, 1777179795, -426683, -1591599803, 1723600, 565464272, + -177440, -1185330464, -1803090, -260424530, 1910376, 283780712, + 1315589, 334803717, -1667432, -440824168, -1104333, -1758099917, + 1341330, 235321234, -260646, -71875110, -3833893, 776003547, + 1285669, -178766299, -2939036, 1119856484, -2235985, -1600929361, + -1584928, 168022240, -420899, -1208667171, -2286327, 1123958025, + -812732, -518252220, 183443, 1544891539, -976891, 879867909, + -1439742, 1206536194, 1612842, -1499603926, -3545687, 201262505, + -3019102, 1957047970, -554416, 155290192, 3919660, -1809756372, + -3881060, 985155484, -48306, 2036925262, -1362209, 1934038751, + -3628969, 1146323031, 3937738, -973777462, 1400424, 400711272, + 3839961, -894060583, -846154, -540420426, 1976782, 374860238, +}; + +#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(rv32im_zetas) + +#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff 
--git a/scripts/autogen b/scripts/autogen index c2ddfd822..15aead747 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -958,6 +958,92 @@ def _fmt_indexed_rows(data): yield ",".join(map(str, row)) + f" /* {i} */," +def prepare_root_for_montgomery(root): + """Takes a constant that the code needs to Montgomery-multiply with, + and returns the pair (z, z'), where z is the signed canonical form + of the input, and z' = (z * QINV) mod 2^32 is the twisted constant + used in the low-mul part of the standard signed Montgomery + multiplication. + + QINV = pow(MLDSA_Q, -1, 2^32) (matches mldsa/src/reduce.h).""" + + QINV = 58728449 # pow(MLDSA_Q, -1, 2^32) + + z = signed_reduce(root) + + # Compute (z * QINV) mod 2^32, then reinterpret as int32. + z_twisted = (z * QINV) & 0xFFFFFFFF + if z_twisted >= (1 << 31): + z_twisted -= 1 << 32 + + return z, z_twisted + + +def gen_rv32im_root_for_block(layer, block): + """Forward NTT zeta for the given (layer, block), in Montgomery form, + returned as the (z, z') pair consumed by the RV32-IM assembly.""" + log = bitreverse(pow(2, layer) + block, 8) + # Montgomery factor R = 2^32 mod q is folded in, since the RV32-IM + # butterfly uses standard signed Montgomery multiplication. + root = pow(root_of_unity, log, modulus) * montgomery_factor + return prepare_root_for_montgomery(root) + + +def gen_rv32im_fwd_ntt_zetas(): + """Yield (z, z') pairs in the order consumed by the 2+2+2+2 forward NTT. + + Each of the 4 passes (L1+L2, L3+L4, L5+L6, L7+L8) emits one set of + 3 pairs per outer iteration. 
Layers are 0-indexed here: + + pass p uses layers (lo, hi) = (2p, 2p+1) + + For outer index o in pass p: + zeta_lo = layer lo, block o + zeta_hi0 = layer hi, block 2*o + zeta_hi1 = layer hi, block 2*o + 1 + + Total: 1 + 4 + 16 + 64 = 85 outer iters * 3 pairs = 255 pairs.""" + for p in range(4): + lo = 2 * p + hi = 2 * p + 1 + n_outer = 1 << lo # 1, 4, 16, 64 + for o in range(n_outer): + yield from gen_rv32im_root_for_block(lo, o) + yield from gen_rv32im_root_for_block(hi, 2 * o + 0) + yield from gen_rv32im_root_for_block(hi, 2 * o + 1) + + +def gen_rv32im_zeta_file(): + def gen(): + yield from gen_header() + yield '#include "../../../common.h"' + yield "" + yield "#if defined(MLD_ARITH_BACKEND_RV32IM) && \\" + yield " !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)" + yield "" + yield '#include "arith_native_rv32im.h"' + yield "" + yield "/*" + yield " * Table of zeta values used in the RV32-IM forward NTT." + yield " * Each entry is a (zeta, zeta * QINV mod 2^32) pair, with" + yield " * zeta in Montgomery form. See autogen for details." 
+ yield " */" + yield from emit_c_array( + "const int32_t", + "mld_rv32im_ntt_zetas", + gen_rv32im_fwd_ntt_zetas(), + ) + yield "" + yield "#else" + yield "" + yield "MLD_EMPTY_CU(rv32im_zetas)" + yield "" + yield "#endif" + yield "" + + update_file("dev/riscv32/src/rv32im_zetas.c", "\n".join(gen())) + + def gen_aarch64_zeta_file(): def gen(): yield from gen_header() @@ -1844,6 +1930,10 @@ def riscv64(c): return "/riscv64/" in c +def rv32im(c): + return "/rv32im/" in c + + def armv81m(c): return "/armv81m/" in c @@ -1889,12 +1979,17 @@ def native_arith_riscv64(c): return native_arith(c) and riscv64(c) +def native_arith_rv32im(c): + return native_arith(c) and rv32im(c) + + def native_arith_core(c): return ( native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c) and not native_arith_riscv64(c) + and not native_arith_rv32im(c) ) @@ -2001,6 +2096,11 @@ def gen_macro_undefs(extra_notes=None): filt=native_arith_x86_64, desc="native code (Arith, X86_64)" ) yield "#endif" + yield "#if defined(MLD_SYS_RISCV32)" + yield from gen_monolithic_undef_all_core( + filt=native_arith_rv32im, desc="native code (Arith, RV32IM)" + ) + yield "#endif" yield "#endif" yield "#endif" yield "" @@ -2078,6 +2178,10 @@ def gen_monolithic_source_file(): for c in filter(native_arith_x86_64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLD_SYS_RISCV32)" + for c in filter(native_arith_rv32im, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2161,6 +2265,10 @@ def gen_monolithic_asm_file(): for c in filter(native_arith_x86_64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLD_SYS_RISCV32)" + for c in filter(native_arith_rv32im, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2306,7 +2414,10 @@ def check_macro_typos(): return True # 5. 
AWS-LC importer patch - if is_autogen or filename == "integration/awslc/awslc.patch": + if is_autogen or filename in [ + "integration/awslc/pre_import.patch", + "integration/awslc/post_import.patch", + ]: return True if is_autogen or filename == "mldsa/src/common.h": @@ -2483,9 +2594,14 @@ def update_via_simpasm( outfile=None, cflags=None, preserve_header=True, - force_cross=False, + force_cross=None, x86_64_syntax="att", ): + # force_cross: set of source architectures for which a missing cross + # toolchain is a hard error rather than a silent skip. Pass None or an + # empty set to skip silently for every arch. + if force_cross is None: + force_cross = set() _, infile = os.path.split(infile_full) if outfile is None: outfile = infile @@ -2502,6 +2618,8 @@ def update_via_simpasm( source_arch = "x86_64" elif "armv81m" in infile_full: source_arch = "armv81m" + elif "riscv32" in infile_full or "rv32im" in infile_full: + source_arch = "riscv32" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -2515,7 +2633,15 @@ def update_via_simpasm( cross_prefix = "arm-none-eabi-" cross_gcc = cross_prefix + "gcc" if shutil.which(cross_gcc) is None: - if force_cross is False: + if source_arch not in force_cross: + return + raise Exception(f"Could not find cross toolchain {cross_prefix}") + # RISC-V 32-bit is always cross-compiled. 
+ elif source_arch == "riscv32": + cross_prefix = "riscv32-unknown-linux-gnu-" + cross_gcc = cross_prefix + "gcc" + if shutil.which(cross_gcc) is None: + if source_arch not in force_cross: return raise Exception(f"Could not find cross toolchain {cross_prefix}") elif native_arch != source_arch: @@ -2523,7 +2649,7 @@ def update_via_simpasm( cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: - if force_cross is False: + if source_arch not in force_cross: return raise Exception(f"Could not find cross toolchain {cross_prefix}") else: @@ -2536,6 +2662,8 @@ def update_via_simpasm( arch = "aarch64" elif "armv81m" in infile_full: arch = "armv81m" + elif "riscv32" in infile_full or "rv32im" in infile_full: + arch = "riscv32" else: arch = "x86_64" @@ -2836,7 +2964,7 @@ def synchronize_backend(in_dir, out_dir, delete=False, no_simplify=False, **kwar def synchronize_backends( *, - force_cross=False, + force_cross=None, clean=False, delete=False, no_simplify=False, @@ -2865,6 +2993,14 @@ def synchronize_backends( ), ) + update_via_copy( + "dev/riscv32/meta.h", + "mldsa/src/native/rv32im/meta.h", + transform=lambda c: adjust_header_guard_for_filename( + c, "mldsa/src/native/rv32im/meta.h" + ), + ) + synchronize_backend( f"dev/aarch64_{ty}/src", "mldsa/src/native/aarch64/src", @@ -2946,6 +3082,14 @@ def synchronize_backends( no_simplify=no_simplify, cflags="-Idev/fips202/armv81m -Imldsa/src/fips202/native/armv81m -march=armv8.1-m.main+mve -mthumb", ) + synchronize_backend( + "dev/riscv32/src", + "mldsa/src/native/rv32im/src", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Idev/riscv32/src -Imldsa/src/native/rv32im/src -march=rv32im -mabi=ilp32", + ) def adjust_header_guard_for_filename(content, header_file): @@ -3547,7 +3691,9 @@ def update_bytecode_in_proof_script(filepath, bytecode): update_file(filepath, updated_content) -def update_hol_light_bytecode_for_arch(arch, force_cross=False): 
+def update_hol_light_bytecode_for_arch(arch, force_cross=None): + if force_cross is None: + force_cross = set() source_arch = arch if platform.machine().lower() in ["arm64", "aarch64"]: native_arch = "aarch64" @@ -3559,7 +3705,7 @@ def update_hol_light_bytecode_for_arch(arch, force_cross=False): cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: - if force_cross is False: + if source_arch not in force_cross: return raise Exception(f"Could not find cross toolchain {cross_prefix}") @@ -3581,7 +3727,7 @@ def update_hol_light_bytecode_for_arch(arch, force_cross=False): update_bytecode_in_proof_script(ml_file, bytecode) -def update_hol_light_bytecode(force_cross=False): +def update_hol_light_bytecode(force_cross=None): """Update HOL Light proof files with bytecode from make dump_bytecode.""" update_hol_light_bytecode_for_arch("aarch64", force_cross=force_cross) update_hol_light_bytecode_for_arch("x86_64", force_cross=force_cross) @@ -3983,7 +4129,19 @@ def _main(): parser.add_argument("--slothy", nargs="*", default=None, choices=slothy_choices) parser.add_argument("--aarch64-clean", default=False, action="store_true") parser.add_argument("--no-simplify", default=False, action="store_true") - parser.add_argument("--force-cross", default=False, action="store_true") + KNOWN_CROSS_ARCHS = ["aarch64", "x86_64", "armv81m", "riscv32"] + parser.add_argument( + "--force-cross", + nargs="*", + default=None, + choices=KNOWN_CROSS_ARCHS, + metavar="ARCH", + help=( + "Architectures whose missing cross toolchain should fail the " + "run instead of being silently skipped. With no argument, " + "applies to all of: " + ", ".join(KNOWN_CROSS_ARCHS) + "." 
+ ), + ) parser.add_argument( "--x86-64-syntax", type=str, @@ -4017,11 +4175,22 @@ def _main(): if args.slothy == []: args.slothy = slothy_choices + # Normalize --force-cross into a set: + # absent -> empty set (silently skip every missing toolchain) + # no value -> all known archs (legacy --force-cross behavior) + # explicit list -> just those archs + if args.force_cross is None: + force_cross = set() + elif args.force_cross == []: + force_cross = set(KNOWN_CROSS_ARCHS) + else: + force_cross = set(args.force_cross) + def sync_backends(): synchronize_backends( clean=args.aarch64_clean, no_simplify=args.no_simplify, - force_cross=args.force_cross, + force_cross=force_cross, x86_64_syntax=args.x86_64_syntax, ) @@ -4029,7 +4198,7 @@ def _main(): synchronize_backends( clean=args.aarch64_clean, delete=True, - force_cross=args.force_cross, + force_cross=force_cross, no_simplify=args.no_simplify, x86_64_syntax=args.x86_64_syntax, ) @@ -4037,6 +4206,7 @@ def _main(): def gen_zeta_tables(): gen_c_zeta_file() gen_aarch64_zeta_file() + gen_rv32im_zeta_file() gen_aarch64_hol_light_zeta_file() gen_aarch64_rej_uniform_table() gen_aarch64_rej_uniform_eta_table() @@ -4080,7 +4250,7 @@ def _main(): ("Complete final backend synchronization", sync_backends_final), ( "Update HOL Light bytecode", - partial(update_hol_light_bytecode, force_cross=args.force_cross), + partial(update_hol_light_bytecode, force_cross=force_cross), args.update_hol_light_bytecode, ), ("Generate monolithic source files", gen_monolithic), diff --git a/scripts/cfify b/scripts/cfify index 126a900fe..0e4178c9f 100755 --- a/scripts/cfify +++ b/scripts/cfify @@ -127,6 +127,19 @@ ARMV81M_ADD_SP_PATTERN = re.compile( ARMV81M_BX_LR_PATTERN = re.compile(r"(\s*)bx\s+lr\s*$", re.IGNORECASE) +# ----------------------------------------------------------------------------- +# riscv32 module-scope constants +# ----------------------------------------------------------------------------- +# `addi sp, sp, -OFF` (allocate) and 
`addi sp, sp, +OFF` (free). +RISCV32_SUB_SP_PATTERN = re.compile( + r"(\s*)addi\s+sp,\s*sp,\s*-(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE +) +RISCV32_ADD_SP_PATTERN = re.compile( + r"(\s*)addi\s+sp,\s*sp,\s*(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE +) +RISCV32_RET_PATTERN = re.compile(r"(\s*)ret\s*$", re.IGNORECASE) + + def armv81m_parse_reg(s): """Parse a single register token, returning its canonical name (e.g. 'r14' -> 'lr'). Raises ValueError on unrecognised input.""" @@ -443,6 +456,44 @@ def add_cfi_directives(text, arch): i += 1 continue + elif arch == "riscv32": + # addi sp, sp, -OFF — stack allocation + match = RISCV32_SUB_SP_PATTERN.match(line) + if match: + indent, offset_str = match.groups() + offset = ( + int(offset_str, 16) + if offset_str.lower().startswith("0x") + else int(offset_str) + ) + result.append(line) + result.append(f"{indent}.cfi_adjust_cfa_offset {offset:#x}") + i += 1 + continue + + # addi sp, sp, +OFF — stack deallocation + match = RISCV32_ADD_SP_PATTERN.match(line) + if match: + indent, offset_str = match.groups() + offset = ( + int(offset_str, 16) + if offset_str.lower().startswith("0x") + else int(offset_str) + ) + result.append(line) + result.append(f"{indent}.cfi_adjust_cfa_offset -{offset:#x}") + i += 1 + continue + + # ret — function return + match = RISCV32_RET_PATTERN.match(line) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_endproc") + i += 1 + continue + result.append(line) i += 1 @@ -462,7 +513,7 @@ def main(): ) parser.add_argument( "--arch", - choices=["aarch64", "x86_64", "armv81m"], + choices=["aarch64", "x86_64", "armv81m", "riscv32"], default="aarch64", help="Target architecture (default: aarch64)", ) diff --git a/scripts/simpasm b/scripts/simpasm index dc34079a1..62cca9adf 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -256,6 +256,11 @@ def simplify(logger, args, asm_input, asm_output=None): # Armv8.1-M requires explicit triple for Thumb disassembly if args.arch == "armv81m": 
cmd += ["--triple=thumbv8.1m.main-none-eabi"] + # RISC-V 32-bit ILP32 needs an explicit triple so llvm-objdump + # decodes the M extension (mul/mulh) instead of marking them + # as illegal. + if args.arch == "riscv32": + cmd += ["--triple=riscv32", "--mattr=+m"] # Add syntax option if specified if args.syntax and args.syntax.lower() != "att": diff --git a/test/mk/components.mk b/test/mk/components.mk index 67698aabe..00df4d67c 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -10,7 +10,7 @@ endif SOURCES += $(wildcard mldsa/src/*.c) ifeq ($(OPT),1) - SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS]) $(wildcard mldsa/src/native/rv32im/src/*.[csS]) CFLAGS += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif