diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h index 78e98390c..dec3f5d57 100644 --- a/dev/aarch64_clean/meta.h +++ b/dev/aarch64_clean/meta.h @@ -210,10 +210,9 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf) defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( - int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], - const int32_t in1[MLDSA_N]) + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) { - mld_poly_pointwise_montgomery_asm(out, in0, in1); + mld_poly_pointwise_montgomery_asm(a, b); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/dev/aarch64_clean/src/arith_native_aarch64.h b/dev/aarch64_clean/src/arith_native_aarch64.h index 08230c39f..f366b1935 100644 --- a/dev/aarch64_clean/src/arith_native_aarch64.h +++ b/dev/aarch64_clean/src/arith_native_aarch64.h @@ -150,19 +150,17 @@ void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_poly_pointwise_montgomery_asm \ MLD_NAMESPACE(poly_pointwise_montgomery_asm) -void mld_poly_pointwise_montgomery_asm(int32_t *r, const int32_t *a, - const int32_t *b) +void mld_poly_pointwise_montgomery_asm(int32_t *a, const int32_t *b) /* This must be kept in sync with the HOL-Light specification * in proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml */ __contract__( - requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) /* check-magic: off */ requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) - assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) - ensures(array_abs_bound(r, 0, MLDSA_N, 8380417)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, 8380417)) /* check-magic: on */ ); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/dev/aarch64_clean/src/pointwise_montgomery.S b/dev/aarch64_clean/src/pointwise_montgomery.S index 9bb5273c7..4e63a3fcd 100644 --- a/dev/aarch64_clean/src/pointwise_montgomery.S +++ b/dev/aarch64_clean/src/pointwise_montgomery.S @@ -22,9 +22,8 @@ smull2 \dh\().2d, \a\().4s, \b\().4s .endm - out_ptr .req x0 - a_ptr .req x1 - b_ptr .req x2 + a_ptr .req x0 + b_ptr .req x1 count .req x3 wtmp .req w3 @@ -89,10 +88,10 @@ MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) mov count, #(MLDSA_N / 4) poly_pointwise_montgomery_loop_start: + ldr q_a_0, [a_ptr, #0*16] ldr q_a_1, [a_ptr, #1*16] ldr q_a_2, [a_ptr, #2*16] ldr q_a_3, [a_ptr, #3*16] - ldr q_a_0, [a_ptr], #4*16 ldr q_b_1, [b_ptr, #1*16] ldr q_b_2, [b_ptr, #2*16] @@ -116,17 +115,16 @@ poly_pointwise_montgomery_loop_start: // // See description of mld_montgomery_reduce() in mldsa/src/reduce.h. - str q_c_1, [out_ptr, #1*16] - str q_c_2, [out_ptr, #2*16] - str q_c_3, [out_ptr, #3*16] - str q_c_0, [out_ptr], #4*16 + str q_c_1, [a_ptr, #1*16] + str q_c_2, [a_ptr, #2*16] + str q_c_3, [a_ptr, #3*16] + str q_c_0, [a_ptr], #4*16 subs count, count, #4 cbnz count, poly_pointwise_montgomery_loop_start ret - .unreq out_ptr .unreq a_ptr .unreq b_ptr .unreq count diff --git a/dev/aarch64_opt/meta.h b/dev/aarch64_opt/meta.h index 78e98390c..dec3f5d57 100644 --- a/dev/aarch64_opt/meta.h +++ b/dev/aarch64_opt/meta.h @@ -210,10 +210,9 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf) defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( - int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], - const int32_t in1[MLDSA_N]) + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) { - mld_poly_pointwise_montgomery_asm(out, in0, in1); + mld_poly_pointwise_montgomery_asm(a, b); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/dev/aarch64_opt/src/arith_native_aarch64.h b/dev/aarch64_opt/src/arith_native_aarch64.h index 08230c39f..f366b1935 100644 --- a/dev/aarch64_opt/src/arith_native_aarch64.h +++ b/dev/aarch64_opt/src/arith_native_aarch64.h @@ -150,19 +150,17 @@ void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_poly_pointwise_montgomery_asm \ MLD_NAMESPACE(poly_pointwise_montgomery_asm) -void mld_poly_pointwise_montgomery_asm(int32_t *r, const int32_t *a, - const int32_t *b) +void mld_poly_pointwise_montgomery_asm(int32_t *a, const int32_t *b) /* This must be kept in sync with the HOL-Light specification * in proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml */ __contract__( - requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) /* check-magic: off */ requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) - assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) - ensures(array_abs_bound(r, 0, MLDSA_N, 8380417)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, 8380417)) /* check-magic: on */ ); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/dev/aarch64_opt/src/pointwise_montgomery.S b/dev/aarch64_opt/src/pointwise_montgomery.S index 9bb5273c7..4e63a3fcd 100644 --- a/dev/aarch64_opt/src/pointwise_montgomery.S +++ b/dev/aarch64_opt/src/pointwise_montgomery.S @@ -22,9 +22,8 @@ smull2 \dh\().2d, \a\().4s, \b\().4s .endm - out_ptr .req x0 - a_ptr .req x1 - b_ptr .req x2 + a_ptr .req x0 + b_ptr .req x1 count .req x3 wtmp .req w3 @@ -89,10 +88,10 @@ MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) mov count, #(MLDSA_N / 4) poly_pointwise_montgomery_loop_start: + ldr q_a_0, [a_ptr, #0*16] ldr q_a_1, [a_ptr, #1*16] ldr q_a_2, [a_ptr, #2*16] ldr q_a_3, [a_ptr, #3*16] - ldr q_a_0, [a_ptr], #4*16 ldr q_b_1, [b_ptr, #1*16] ldr q_b_2, [b_ptr, #2*16] @@ -116,17 +115,16 @@ poly_pointwise_montgomery_loop_start: // // See description of mld_montgomery_reduce() in mldsa/src/reduce.h. - str q_c_1, [out_ptr, #1*16] - str q_c_2, [out_ptr, #2*16] - str q_c_3, [out_ptr, #3*16] - str q_c_0, [out_ptr], #4*16 + str q_c_1, [a_ptr, #1*16] + str q_c_2, [a_ptr, #2*16] + str q_c_3, [a_ptr, #3*16] + str q_c_0, [a_ptr], #4*16 subs count, count, #4 cbnz count, poly_pointwise_montgomery_loop_start ret - .unreq out_ptr .unreq a_ptr .unreq b_ptr .unreq count diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h index 71f3b3460..23600f9d5 100644 --- a/dev/x86_64/meta.h +++ b/dev/x86_64/meta.h @@ -263,13 +263,13 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( - int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) { if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_pointwise_avx2(c, a, b, mld_qdata); + mld_pointwise_avx2(a, b, mld_qdata); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h index af71a7140..b5fd4e9b8 100644 --- a/dev/x86_64/src/arith_native_x86_64.h +++ b/dev/x86_64/src/arith_native_x86_64.h @@ -122,20 +122,18 @@ void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2) -void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b, - const int32_t *qdata) +void mld_pointwise_avx2(int32_t *a, const int32_t *b, const int32_t *qdata) /* This must be kept in sync with the HOL-Light specification * in proofs/hol_light/x86_64/proofs/mldsa_pointwise.ml */ __contract__( - requires(memory_no_alias(c, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) /* check-magic: off */ requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) requires(qdata == mld_qdata) - assigns(memory_slice(c, sizeof(int32_t) * MLDSA_N)) - ensures(array_abs_bound(c, 0, MLDSA_N, 8380417)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, 8380417)) /* check-magic: on */ ); diff --git a/dev/x86_64/src/pointwise.S b/dev/x86_64/src/pointwise.S index df89d4d31..cf3e3930c 100644 --- a/dev/x86_64/src/pointwise.S +++ b/dev/x86_64/src/pointwise.S @@ -28,35 +28,35 @@ .text /* - * void mld_pointwise_avx2(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata) + * void mld_pointwise_avx2(__m256i *a, const __m256i *b, const __m256i *qdata) * - * Pointwise multiplication of polynomials in NTT domain with Montgomery reduction + * Pointwise multiplication of polynomials in NTT domain with Montgomery + * reduction. Destructive in the first argument: a := a * b * R^{-1} mod q. * * Arguments: - * rdi: pointer to output polynomial c - * rsi: pointer to input polynomial a - * rdx: pointer to input polynomial b - * rcx: pointer to qdata constants + * rdi: pointer to first input/output polynomial a + * rsi: pointer to second input polynomial b + * rdx: pointer to qdata constants */ .balign 4 .global MLD_ASM_NAMESPACE(pointwise_avx2) MLD_ASM_FN_SYMBOL(pointwise_avx2) // Load constants - vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] - vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] + vmovdqa ymm0, [rdx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4] + vmovdqa ymm1, [rdx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4] xor eax, eax pointwise_avx2_looptop1: // Handle 24 = 3*8 coefficients per iteration // Load - vmovdqa ymm2, [rsi] - vmovdqa ymm4, [rsi + 32] - vmovdqa ymm6, [rsi + 64] - vmovdqa ymm10, [rdx] - vmovdqa ymm12, [rdx + 32] - vmovdqa ymm14, [rdx + 64] + vmovdqa ymm2, [rdi] + vmovdqa ymm4, [rdi + 32] + vmovdqa ymm6, [rdi + 64] + vmovdqa ymm10, [rsi] + vmovdqa ymm12, [rsi + 32] + vmovdqa ymm14, [rsi + 64] vpsrlq ymm3, ymm2, 32 vpsrlq ymm5, ymm4, 32 vmovshdup ymm7, ymm6 @@ -122,7 +122,6 @@ pointwise_avx2_looptop1: add rdi, 96 add rsi, 96 - add rdx, 96 add eax, 1 cmp eax, 10 jb pointwise_avx2_looptop1 @@ -131,10 +130,10 @@ pointwise_avx2_looptop1: // Handle the last 256 % 24 = 16 = 2*8 coefficients, left over by the loop // Load - vmovdqa ymm2, [rsi] - vmovdqa ymm4, [rsi + 32] - vmovdqa ymm10, [rdx] - vmovdqa ymm12, [rdx + 32] + vmovdqa ymm2, [rdi] + vmovdqa ymm4, [rdi + 32] + vmovdqa ymm10, [rsi] + vmovdqa ymm12, [rsi + 32] vpsrlq ymm3, ymm2, 32 vpsrlq ymm5, ymm4, 32 vmovshdup ymm11, ymm10 diff --git a/integration/opentitan/reduce_alloc.patch b/integration/opentitan/reduce_alloc.patch index 418968a19..67c7ab992 100644 --- a/integration/opentitan/reduce_alloc.patch +++ b/integration/opentitan/reduce_alloc.patch @@ -11,22 +11,22 @@ index be11f20..0000000 100644 - kOtcryptoMldsa44WorkBufferKeypairWords = 32992 / sizeof(uint32_t), - kOtcryptoMldsa44WorkBufferSignWords = 32448 / sizeof(uint32_t), - kOtcryptoMldsa44WorkBufferVerifyWords = 22464 / sizeof(uint32_t), -+ kOtcryptoMldsa44WorkBufferKeypairWords = 14624 / sizeof(uint32_t), -+ kOtcryptoMldsa44WorkBufferSignWords = 14144 / sizeof(uint32_t), -+ kOtcryptoMldsa44WorkBufferVerifyWords = 20416 / sizeof(uint32_t), ++ kOtcryptoMldsa44WorkBufferKeypairWords = 13600 / sizeof(uint32_t), ++ kOtcryptoMldsa44WorkBufferSignWords = 13120 / sizeof(uint32_t), ++ kOtcryptoMldsa44WorkBufferVerifyWords = 19392 / sizeof(uint32_t), - kOtcryptoMldsa65WorkBufferKeypairWords = 46304 / sizeof(uint32_t), - kOtcryptoMldsa65WorkBufferSignWords = 44768 / sizeof(uint32_t), - kOtcryptoMldsa65WorkBufferVerifyWords = 30720 / sizeof(uint32_t), -+ kOtcryptoMldsa65WorkBufferKeypairWords = 20768 / sizeof(uint32_t), -+ kOtcryptoMldsa65WorkBufferSignWords = 18272 / sizeof(uint32_t), -+ kOtcryptoMldsa65WorkBufferVerifyWords = 27648 / sizeof(uint32_t), ++ kOtcryptoMldsa65WorkBufferKeypairWords = 19744 / sizeof(uint32_t), ++ kOtcryptoMldsa65WorkBufferSignWords = 17248 / sizeof(uint32_t), ++ kOtcryptoMldsa65WorkBufferVerifyWords = 26624 / sizeof(uint32_t), - kOtcryptoMldsa87WorkBufferKeypairWords = 62688 / sizeof(uint32_t), - kOtcryptoMldsa87WorkBufferSignWords = 59104 / sizeof(uint32_t), - kOtcryptoMldsa87WorkBufferVerifyWords = 41216 / sizeof(uint32_t), -+ kOtcryptoMldsa87WorkBufferKeypairWords = 26912 / sizeof(uint32_t), -+ kOtcryptoMldsa87WorkBufferSignWords = 22368 / sizeof(uint32_t), -+ kOtcryptoMldsa87WorkBufferVerifyWords = 36096 / sizeof(uint32_t), ++ kOtcryptoMldsa87WorkBufferKeypairWords = 25888 / sizeof(uint32_t), ++ kOtcryptoMldsa87WorkBufferSignWords = 21344 / sizeof(uint32_t), ++ kOtcryptoMldsa87WorkBufferVerifyWords = 35072 / sizeof(uint32_t), }; diff --git a/mldsa/mldsa_native.h b/mldsa/mldsa_native.h index 6303cc539..aeb843f20 100644 --- a/mldsa/mldsa_native.h +++ b/mldsa/mldsa_native.h @@ -954,21 +954,21 @@ int MLD_API_NAMESPACE(pk_from_sk)( #define MLD_TOTAL_ALLOC_87_SIGN 108224 #define MLD_TOTAL_ALLOC_87_VERIFY 91360 #else /* MLD_API_LEGACY_CONFIG || !MLD_CONFIG_REDUCE_RAM */ -#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 14624 -#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 24160 -#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 22752 -#define MLD_TOTAL_ALLOC_44_SIGN 14144 -#define MLD_TOTAL_ALLOC_44_VERIFY 20416 -#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 20768 -#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 32928 -#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 31968 -#define MLD_TOTAL_ALLOC_65_SIGN 18272 -#define MLD_TOTAL_ALLOC_65_VERIFY 27648 -#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 26912 -#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 43328 -#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 42208 -#define MLD_TOTAL_ALLOC_87_SIGN 22368 -#define MLD_TOTAL_ALLOC_87_VERIFY 36096 +#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 13600 +#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 23136 +#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 21728 +#define MLD_TOTAL_ALLOC_44_SIGN 13120 +#define MLD_TOTAL_ALLOC_44_VERIFY 19392 +#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 19744 +#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 31904 +#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 30944 +#define MLD_TOTAL_ALLOC_65_SIGN 17248 +#define MLD_TOTAL_ALLOC_65_VERIFY 26624 +#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 25888 +#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 42304 +#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 41184 +#define MLD_TOTAL_ALLOC_87_SIGN 21344 +#define MLD_TOTAL_ALLOC_87_VERIFY 35072 #endif /* !(MLD_API_LEGACY_CONFIG || !MLD_CONFIG_REDUCE_RAM) */ /* check-magic: on */ diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h index 78e98390c..dec3f5d57 100644 --- a/mldsa/src/native/aarch64/meta.h +++ b/mldsa/src/native/aarch64/meta.h @@ -210,10 +210,9 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf) defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( - int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], - const int32_t in1[MLDSA_N]) + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) { - mld_poly_pointwise_montgomery_asm(out, in0, in1); + mld_poly_pointwise_montgomery_asm(a, b); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/mldsa/src/native/aarch64/src/arith_native_aarch64.h b/mldsa/src/native/aarch64/src/arith_native_aarch64.h index 08230c39f..f366b1935 100644 --- a/mldsa/src/native/aarch64/src/arith_native_aarch64.h +++ b/mldsa/src/native/aarch64/src/arith_native_aarch64.h @@ -150,19 +150,17 @@ void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_poly_pointwise_montgomery_asm \ MLD_NAMESPACE(poly_pointwise_montgomery_asm) -void mld_poly_pointwise_montgomery_asm(int32_t *r, const int32_t *a, - const int32_t *b) +void mld_poly_pointwise_montgomery_asm(int32_t *a, const int32_t *b) /* This must be kept in sync with the HOL-Light specification * in proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml */ __contract__( - requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) /* check-magic: off */ requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) - assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) - ensures(array_abs_bound(r, 0, MLDSA_N, 8380417)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, 8380417)) /* check-magic: on */ ); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/mldsa/src/native/aarch64/src/pointwise_montgomery.S b/mldsa/src/native/aarch64/src/pointwise_montgomery.S index 32ae865ec..d2f57cb31 100644 --- a/mldsa/src/native/aarch64/src/pointwise_montgomery.S +++ b/mldsa/src/native/aarch64/src/pointwise_montgomery.S @@ -28,14 +28,14 @@ MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) mov x3, #0x40 // =64 Lpoly_pointwise_montgomery_loop_start: - ldr q17, [x1, #0x10] - ldr q18, [x1, #0x20] - ldr q19, [x1, #0x30] - ldr q16, [x1], #0x40 - ldr q21, [x2, #0x10] - ldr q22, [x2, #0x20] - ldr q23, [x2, #0x30] - ldr q20, [x2], #0x40 + ldr q16, [x0] + ldr q17, [x0, #0x10] + ldr q18, [x0, #0x20] + ldr q19, [x0, #0x30] + ldr q21, [x1, #0x10] + ldr q22, [x1, #0x20] + ldr q23, [x1, #0x30] + ldr q20, [x1], #0x40 smull v24.2d, v16.2s, v20.2s smull2 v25.2d, v16.4s, v20.4s smull v26.2d, v17.2s, v21.2s diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h index 3cc61350f..efc8e057d 100644 --- a/mldsa/src/native/api.h +++ b/mldsa/src/native/api.h @@ -521,30 +521,27 @@ __contract__( * Name: mld_poly_pointwise_montgomery_native * * Description: Pointwise multiplication of polynomials in NTT domain - * with Montgomery reduction. + * with Montgomery reduction. Destructive in the first argument. * - * Computes c[i] = a[i] * b[i] * R^(-1) mod q for all i, + * Computes a[i] = a[i] * b[i] * R^(-1) mod q for all i, * where R = 2^32. * - * Arguments: - int32_t c[MLDSA_N]: output polynomial - * - const int32_t a[MLDSA_N]: first input polynomial + * Arguments: - int32_t a[MLDSA_N]: first input/output polynomial * - const int32_t b[MLDSA_N]: second input polynomial **************************************************/ MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( - int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) __contract__( requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) - requires(memory_no_alias(c, sizeof(int32_t) * MLDSA_N)) requires(array_abs_bound(a, 0, MLDSA_N, MLD_NTT_BOUND)) requires(array_abs_bound(b, 0, MLDSA_N, MLD_NTT_BOUND)) - assigns(memory_slice(c, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) - ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(c, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(a, 0, MLDSA_N, MLD_NTT_BOUND)) ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(b, 0, MLDSA_N, MLD_NTT_BOUND)) - ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(c, MLDSA_N)) ); #endif /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */ #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h index 71f3b3460..23600f9d5 100644 --- a/mldsa/src/native/x86_64/meta.h +++ b/mldsa/src/native/x86_64/meta.h @@ -263,13 +263,13 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( - int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) { if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_pointwise_avx2(c, a, b, mld_qdata); + mld_pointwise_avx2(a, b, mld_qdata); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h index af71a7140..b5fd4e9b8 100644 --- a/mldsa/src/native/x86_64/src/arith_native_x86_64.h +++ b/mldsa/src/native/x86_64/src/arith_native_x86_64.h @@ -122,20 +122,18 @@ void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2) -void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b, - const int32_t *qdata) +void mld_pointwise_avx2(int32_t *a, const int32_t *b, const int32_t *qdata) /* This must be kept in sync with the HOL-Light specification * in proofs/hol_light/x86_64/proofs/mldsa_pointwise.ml */ __contract__( - requires(memory_no_alias(c, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) /* check-magic: off */ requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) requires(qdata == mld_qdata) - assigns(memory_slice(c, sizeof(int32_t) * MLDSA_N)) - ensures(array_abs_bound(c, 0, MLDSA_N, 8380417)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, 8380417)) /* check-magic: on */ ); diff --git a/mldsa/src/native/x86_64/src/pointwise.S b/mldsa/src/native/x86_64/src/pointwise.S index 718e7178b..b6a4263d8 100644 --- a/mldsa/src/native/x86_64/src/pointwise.S +++ b/mldsa/src/native/x86_64/src/pointwise.S @@ -32,17 +32,17 @@ MLD_ASM_FN_SYMBOL(pointwise_avx2) .cfi_startproc - vmovdqa 0x20(%rcx), %ymm0 - vmovdqa (%rcx), %ymm1 + vmovdqa 0x20(%rdx), %ymm0 + vmovdqa (%rdx), %ymm1 xorl %eax, %eax Lpointwise_avx2_looptop1: - vmovdqa (%rsi), %ymm2 - vmovdqa 0x20(%rsi), %ymm4 - vmovdqa 0x40(%rsi), %ymm6 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vmovdqa 0x40(%rdx), %ymm14 + vmovdqa (%rdi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa (%rsi), %ymm10 + vmovdqa 0x20(%rsi), %ymm12 + vmovdqa 0x40(%rsi), %ymm14 vpsrlq $0x20, %ymm2, %ymm3 vpsrlq $0x20, %ymm4, %ymm5 vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7] @@ -84,14 +84,13 @@ Lpointwise_avx2_looptop1: vmovdqa %ymm6, 0x40(%rdi) addq $0x60, %rdi addq $0x60, %rsi - addq $0x60, %rdx addl $0x1, %eax cmpl $0xa, %eax jb Lpointwise_avx2_looptop1 - vmovdqa (%rsi), %ymm2 - vmovdqa 0x20(%rsi), %ymm4 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 + vmovdqa (%rdi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa (%rsi), %ymm10 + vmovdqa 0x20(%rsi), %ymm12 vpsrlq $0x20, %ymm2, %ymm3 vpsrlq $0x20, %ymm4, %ymm5 vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c index 4aead9774..7fadfb683 100644 --- a/mldsa/src/poly.c +++ b/mldsa/src/poly.c @@ -460,17 +460,15 @@ void mld_poly_invntt_tomont(mld_poly *a) #if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) || \ defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) -MLD_STATIC_TESTABLE void mld_poly_pointwise_montgomery_c(mld_poly *c, - const mld_poly *a, +MLD_STATIC_TESTABLE void mld_poly_pointwise_montgomery_c(mld_poly *a, const mld_poly *b) __contract__( requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(b, sizeof(mld_poly))) - requires(memory_no_alias(c, sizeof(mld_poly))) requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) requires(array_abs_bound(b->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) - assigns(memory_slice(c, sizeof(mld_poly))) - ensures(array_abs_bound(c->coeffs, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(a, sizeof(mld_poly))) + ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q)) ) { unsigned int i; @@ -480,31 +478,31 @@ __contract__( for (i = 0; i < MLDSA_N; ++i) __loop__( invariant(i <= MLDSA_N) - invariant(array_abs_bound(c->coeffs, 0, i, MLDSA_Q)) + invariant(array_abs_bound(a->coeffs, 0, i, MLDSA_Q)) + invariant(array_abs_bound(a->coeffs, i, MLDSA_N, MLD_NTT_BOUND)) decreases(MLDSA_N - i) ) { - c->coeffs[i] = mld_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); + a->coeffs[i] = mld_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); } - mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q); + mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q); } MLD_INTERNAL_API -void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a, - const mld_poly *b) +void mld_poly_pointwise_montgomery(mld_poly *a, const mld_poly *b) { #if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) int ret; mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND); mld_assert_abs_bound(b->coeffs, MLDSA_N, MLD_NTT_BOUND); - ret = mld_poly_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs); + ret = mld_poly_pointwise_montgomery_native(a->coeffs, b->coeffs); if (ret == MLD_NATIVE_FUNC_SUCCESS) { - mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q); + mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q); return; } #endif /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */ - mld_poly_pointwise_montgomery_c(c, a, b); + mld_poly_pointwise_montgomery_c(a, b); } #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ diff --git a/mldsa/src/poly.h b/mldsa/src/poly.h index 6b7385a9d..aa7ebd12f 100644 --- a/mldsa/src/poly.h +++ b/mldsa/src/poly.h @@ -184,23 +184,22 @@ __contract__( * * Description: Pointwise multiplication of polynomials in NTT domain * representation and multiplication of resulting polynomial - * by 2^{-32}. + * by 2^{-32}. Destructive in the first argument. * - * Arguments: - mld_poly *c: pointer to output polynomial - * - const mld_poly *a: pointer to first input polynomial + * Arguments: - mld_poly *a: pointer to first input/output polynomial. + * On entry, holds the first multiplicand; on exit, holds + * the product a * b * 2^{-32}. * - const mld_poly *b: pointer to second input polynomial **************************************************/ MLD_INTERNAL_API -void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a, - const mld_poly *b) +void mld_poly_pointwise_montgomery(mld_poly *a, const mld_poly *b) __contract__( requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(b, sizeof(mld_poly))) - requires(memory_no_alias(c, sizeof(mld_poly))) requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) requires(array_abs_bound(b->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) - assigns(memory_slice(c, sizeof(mld_poly))) - ensures(array_abs_bound(c->coeffs, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(a, sizeof(mld_poly))) + ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q)) ); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ diff --git a/mldsa/src/polyvec.c b/mldsa/src/polyvec.c index a628cb1e4..53fc8303c 100644 --- a/mldsa/src/polyvec.c +++ b/mldsa/src/polyvec.c @@ -408,23 +408,23 @@ void mld_polyveck_invntt_tomont(mld_polyveck *v) #if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_INTERNAL_API -void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *r, const mld_poly *a, - const mld_polyveck *v) +void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *v, const mld_poly *a) { unsigned int i; mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_NTT_BOUND); for (i = 0; i < MLDSA_K; ++i) __loop__( - assigns(i, memory_slice(r, sizeof(mld_polyveck))) + assigns(i, memory_slice(v, sizeof(mld_polyveck))) invariant(i <= MLDSA_K) - invariant(forall(k2, 0, i, array_abs_bound(r->vec[k2].coeffs, 0, MLDSA_N, MLDSA_Q))) + invariant(forall(k2, 0, i, array_abs_bound(v->vec[k2].coeffs, 0, MLDSA_N, MLDSA_Q))) + invariant(forall(k3, i, MLDSA_K, array_abs_bound(v->vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) decreases(MLDSA_K - i) ) { - mld_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + mld_poly_pointwise_montgomery(&v->vec[i], a); } - mld_assert_abs_bound_2d(r->vec, MLDSA_K, MLDSA_N, MLDSA_Q); + mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLDSA_Q); } #endif /* !MLD_CONFIG_NO_VERIFY_API */ diff --git a/mldsa/src/polyvec.h b/mldsa/src/polyvec.h index 41e349450..025ff0edb 100644 --- a/mldsa/src/polyvec.h +++ b/mldsa/src/polyvec.h @@ -324,23 +324,21 @@ __contract__( * * Description: Pointwise multiplication of a polynomial vector of length * MLDSA_K by a single polynomial in NTT domain and multiplication - * of the resulting polynomial vector by 2^{-32}. + * of the resulting polynomial vector by 2^{-32}. Destructive in + * the first argument: v[i] := v[i] * a * 2^{-32}. * - * Arguments: - mld_polyveck *r: pointer to output vector - * - mld_poly *a: pointer to input polynomial - * - mld_polyveck *v: pointer to input vector + * Arguments: - mld_polyveck *v: pointer to input/output vector + * - const mld_poly *a: pointer to input polynomial **************************************************/ MLD_INTERNAL_API -void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *r, const mld_poly *a, - const mld_polyveck *v) +void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *v, const mld_poly *a) __contract__( - requires(memory_no_alias(r, sizeof(mld_polyveck))) - requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(v, sizeof(mld_polyveck))) + requires(memory_no_alias(a, sizeof(mld_poly))) requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) requires(forall(k0, 0, MLDSA_K, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - assigns(memory_slice(r, sizeof(mld_polyveck))) - ensures(forall(k1, 0, MLDSA_K, array_abs_bound(r->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q))) + assigns(memory_slice(v, sizeof(mld_polyveck))) + ensures(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q))) ); #endif /* !MLD_CONFIG_NO_VERIFY_API */ diff --git a/mldsa/src/polyvec_lazy.c b/mldsa/src/polyvec_lazy.c index 5508a6e16..04ffdb6b8 100644 --- a/mldsa/src/polyvec_lazy.c +++ b/mldsa/src/polyvec_lazy.c @@ -206,8 +206,8 @@ void mld_polyvec_matrix_pointwise_montgomery_row_lazy(mld_poly *t_row, MLD_ALIGN uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; mld_memcpy(seed_ext, mat->rho, MLDSA_SEEDBYTES); - mld_polymat_expand_entry(&mat->cur, seed_ext, 0, (uint8_t)i); - mld_poly_pointwise_montgomery(t_row, &mat->cur, &v->vec[0]); + mld_polymat_expand_entry(t_row, seed_ext, 0, (uint8_t)i); + mld_poly_pointwise_montgomery(t_row, &v->vec[0]); for (l = 1; l < MLDSA_L; ++l) __loop__( @@ -220,11 +220,8 @@ void mld_polyvec_matrix_pointwise_montgomery_row_lazy(mld_poly *t_row, ) { mld_polymat_expand_entry(&mat->cur, seed_ext, (uint8_t)l, (uint8_t)i); - /* TODO: if mld_poly_pointwise_montgomery's CBMC and HOL Light specs - * are strengthened to permit aliasing, the product can be written - * in place into mat->cur and the separate mat->tmp field dropped. */ - mld_poly_pointwise_montgomery(&mat->tmp, &mat->cur, &v->vec[l]); - mld_poly_add(t_row, &mat->tmp); + mld_poly_pointwise_montgomery(&mat->cur, &v->vec[l]); + mld_poly_add(t_row, &mat->cur); } mld_poly_reduce(t_row); @@ -287,15 +284,16 @@ void mld_polyvec_matrix_pointwise_montgomery_yvec_lazy(mld_polyveck *w, decreases(MLDSA_K - k) ) { - mld_polymat_expand_entry(&mat->cur, seed_ext, (uint8_t)l, (uint8_t)k); if (l == 0) { - mld_poly_pointwise_montgomery(&w->vec[k], &mat->cur, y_ntt); + mld_polymat_expand_entry(&w->vec[k], seed_ext, 0, (uint8_t)k); + mld_poly_pointwise_montgomery(&w->vec[k], y_ntt); } else { - mld_poly_pointwise_montgomery(&mat->tmp, &mat->cur, y_ntt); - mld_poly_add(&w->vec[k], &mat->tmp); + mld_polymat_expand_entry(&mat->cur, seed_ext, (uint8_t)l, (uint8_t)k); + mld_poly_pointwise_montgomery(&mat->cur, y_ntt); + mld_poly_add(&w->vec[k], &mat->cur); } } } diff --git a/mldsa/src/polyvec_lazy.h b/mldsa/src/polyvec_lazy.h index 1ff946062..4db4185ab 100644 --- a/mldsa/src/polyvec_lazy.h +++ b/mldsa/src/polyvec_lazy.h @@ -396,12 +396,10 @@ typedef struct #endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ /* Lazy: store seed, sample elements on demand. - * cur holds the on-demand sampled matrix element A[k][l]. - * tmp is scratch space for the per-element pointwise product. */ + * cur holds the on-demand sampled matrix element A[k][l]. */ typedef struct { mld_poly cur; - mld_poly tmp; uint8_t rho[MLDSA_SEEDBYTES]; } mld_polymat_lazy; @@ -530,7 +528,7 @@ __contract__( * with pointwise multiplication and multiplication by 2^{-32}. * Input vector must be in NTT domain representation; the matrix * entries are sampled on demand from the seed stored in mat->rho, - * using mat->cur and mat->tmp as scratch. + * using mat->cur as scratch. * Output coefficients are bounded by MLDSA_Q in absolute value. * * Arguments: - mld_poly *t_row: pointer to output row polynomial diff --git a/mldsa/src/sign.c b/mldsa/src/sign.c index 2899a5a58..7eb4b2f2a 100644 --- a/mldsa/src/sign.c +++ b/mldsa/src/sign.c @@ -516,8 +516,8 @@ __contract__( decreases(MLDSA_L - i) ) { - mld_sk_s1hat_get_poly(tmp, s1hat, i); - mld_poly_pointwise_montgomery(z, cp, tmp); + mld_sk_s1hat_get_poly(z, s1hat, i); + mld_poly_pointwise_montgomery(z, cp); mld_poly_invntt_tomont(z); mld_yvec_get_poly(tmp, y, i); mld_poly_add(z, tmp); @@ -680,8 +680,8 @@ __contract__( ) { /* Compute cs2[k] and subtract from w0[k] */ - mld_sk_s2hat_get_poly(t, s2hat, k); - mld_poly_pointwise_montgomery(z, cp, t); + mld_sk_s2hat_get_poly(z, s2hat, k); + mld_poly_pointwise_montgomery(z, cp); mld_poly_invntt_tomont(z); /* TODO: Remove this workaround for CBMC performance issues */ @@ -702,8 +702,8 @@ __contract__( } /* Compute ct0[k], check norm, and add to w0[k] */ - mld_sk_t0hat_get_poly(t, t0hat, k); - mld_poly_pointwise_montgomery(z, cp, t); + mld_sk_t0hat_get_poly(z, t0hat, k); + mld_poly_pointwise_montgomery(z, cp); mld_poly_invntt_tomont(z); mld_poly_reduce(z); @@ -1125,19 +1125,19 @@ int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, mld_poly_ntt(cp); mld_polyveck_shiftl(t1); mld_polyveck_ntt(t1); - mld_polyveck_pointwise_poly_montgomery(tmp, cp, t1); + mld_polyveck_pointwise_poly_montgomery(t1, cp); mld_polyvec_matrix_expand(mat, rho); mld_polyvecl_ntt(z); - mld_polyvec_matrix_pointwise_montgomery(w1, mat, z); - mld_polyveck_sub(w1, tmp); - mld_polyveck_reduce(w1); - mld_polyveck_invntt_tomont(w1); + mld_polyvec_matrix_pointwise_montgomery(tmp, mat, z); + mld_polyveck_sub(tmp, t1); + mld_polyveck_reduce(tmp); + mld_polyveck_invntt_tomont(tmp); /* Reconstruct w1 */ - mld_polyveck_caddq(w1); - mld_polyveck_use_hint(tmp, w1, h); - mld_polyveck_pack_w1(buf, tmp); + mld_polyveck_caddq(tmp); + mld_polyveck_use_hint(w1, tmp, h); + mld_polyveck_pack_w1(buf, w1); /* Call random oracle and verify challenge */ mld_H(c2, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, buf, MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0); diff --git a/proofs/cbmc/pointwise_native_aarch64/pointwise_native_aarch64_harness.c b/proofs/cbmc/pointwise_native_aarch64/pointwise_native_aarch64_harness.c index 0ca00db2c..2202f6358 100644 --- a/proofs/cbmc/pointwise_native_aarch64/pointwise_native_aarch64_harness.c +++ b/proofs/cbmc/pointwise_native_aarch64/pointwise_native_aarch64_harness.c @@ -6,13 +6,12 @@ #include "params.h" -int mld_poly_pointwise_montgomery_native(int32_t c[MLDSA_N], - const int32_t a[MLDSA_N], +int mld_poly_pointwise_montgomery_native(int32_t a[MLDSA_N], const int32_t b[MLDSA_N]); void harness(void) { - int32_t *c, *a, *b; + int32_t *a, *b; int t; - t = mld_poly_pointwise_montgomery_native(c, a, b); + t = mld_poly_pointwise_montgomery_native(a, b); } diff --git a/proofs/cbmc/pointwise_native_x86_64/pointwise_native_x86_64_harness.c b/proofs/cbmc/pointwise_native_x86_64/pointwise_native_x86_64_harness.c index 0ca00db2c..2202f6358 100644 --- a/proofs/cbmc/pointwise_native_x86_64/pointwise_native_x86_64_harness.c +++ b/proofs/cbmc/pointwise_native_x86_64/pointwise_native_x86_64_harness.c @@ -6,13 +6,12 @@ #include "params.h" -int mld_poly_pointwise_montgomery_native(int32_t c[MLDSA_N], - const int32_t a[MLDSA_N], +int mld_poly_pointwise_montgomery_native(int32_t a[MLDSA_N], const int32_t b[MLDSA_N]); void harness(void) { - int32_t *c, *a, *b; + int32_t *a, *b; int t; - t = mld_poly_pointwise_montgomery_native(c, a, b); + t = mld_poly_pointwise_montgomery_native(a, b); } diff --git a/proofs/cbmc/poly_pointwise_montgomery/poly_pointwise_montgomery_harness.c b/proofs/cbmc/poly_pointwise_montgomery/poly_pointwise_montgomery_harness.c index d885c2d5d..1a5234de5 100644 --- a/proofs/cbmc/poly_pointwise_montgomery/poly_pointwise_montgomery_harness.c +++ b/proofs/cbmc/poly_pointwise_montgomery/poly_pointwise_montgomery_harness.c @@ -5,6 +5,6 @@ void harness(void) { - mld_poly *a, *b, *c; - mld_poly_pointwise_montgomery(c, a, b); + mld_poly *a, *b; + mld_poly_pointwise_montgomery(a, b); } diff --git a/proofs/cbmc/poly_pointwise_montgomery_c/poly_pointwise_montgomery_c_harness.c b/proofs/cbmc/poly_pointwise_montgomery_c/poly_pointwise_montgomery_c_harness.c index 4ff441fb6..34b998808 100644 --- a/proofs/cbmc/poly_pointwise_montgomery_c/poly_pointwise_montgomery_c_harness.c +++ b/proofs/cbmc/poly_pointwise_montgomery_c/poly_pointwise_montgomery_c_harness.c @@ -4,11 +4,11 @@ #include "poly.h" // Prototype for the function under test -void mld_poly_pointwise_montgomery_c(mld_poly *c, mld_poly *a, mld_poly *b); +void mld_poly_pointwise_montgomery_c(mld_poly *a, const mld_poly *b); void harness(void) { - mld_poly *a, *b, *c; - mld_poly_pointwise_montgomery_c(c, a, b); + mld_poly *a, *b; + mld_poly_pointwise_montgomery_c(a, b); } diff --git a/proofs/cbmc/poly_pointwise_montgomery_native/poly_pointwise_montgomery_native_harness.c b/proofs/cbmc/poly_pointwise_montgomery_native/poly_pointwise_montgomery_native_harness.c index d885c2d5d..1a5234de5 100644 --- a/proofs/cbmc/poly_pointwise_montgomery_native/poly_pointwise_montgomery_native_harness.c +++ b/proofs/cbmc/poly_pointwise_montgomery_native/poly_pointwise_montgomery_native_harness.c @@ -5,6 +5,6 @@ void harness(void) { - mld_poly *a, *b, *c; - mld_poly_pointwise_montgomery(c, a, b); + mld_poly *a, *b; + mld_poly_pointwise_montgomery(a, b); } diff --git a/proofs/cbmc/polyveck_pointwise_poly_montgomery/polyveck_pointwise_poly_montgomery_harness.c b/proofs/cbmc/polyveck_pointwise_poly_montgomery/polyveck_pointwise_poly_montgomery_harness.c index 485acf8ce..b41e40b63 100644 --- a/proofs/cbmc/polyveck_pointwise_poly_montgomery/polyveck_pointwise_poly_montgomery_harness.c +++ b/proofs/cbmc/polyveck_pointwise_poly_montgomery/polyveck_pointwise_poly_montgomery_harness.c @@ -5,7 +5,7 @@ void harness(void) { - mld_polyveck *a, *b; - mld_poly *c; - mld_polyveck_pointwise_poly_montgomery(a, c, b); + mld_polyveck *v; + mld_poly *a; + mld_polyveck_pointwise_poly_montgomery(v, a); } diff --git a/proofs/hol_light/aarch64/mldsa/mldsa_pointwise.S b/proofs/hol_light/aarch64/mldsa/mldsa_pointwise.S index a7b735b9e..5a8c8f527 100644 --- a/proofs/hol_light/aarch64/mldsa/mldsa_pointwise.S +++ b/proofs/hol_light/aarch64/mldsa/mldsa_pointwise.S @@ -28,14 +28,14 @@ PQCP_MLDSA_NATIVE_MLDSA44_poly_pointwise_montgomery_asm: mov x3, #0x40 // =64 Lpoly_pointwise_montgomery_loop_start: - ldr q17, [x1, #0x10] - ldr q18, [x1, #0x20] - ldr q19, [x1, #0x30] - ldr q16, [x1], #0x40 - ldr q21, [x2, #0x10] - ldr q22, [x2, #0x20] - ldr q23, [x2, #0x30] - ldr q20, [x2], #0x40 + ldr q16, [x0] + ldr q17, [x0, #0x10] + ldr q18, [x0, #0x20] + ldr q19, [x0, #0x30] + ldr q21, [x1, #0x10] + ldr q22, [x1, #0x20] + ldr q23, [x1, #0x30] + ldr q20, [x1], #0x40 smull v24.2d, v16.2s, v20.2s smull2 v25.2d, v16.4s, v20.4s smull v26.2d, v17.2s, v21.2s diff --git a/proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml b/proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml index 30d71efdc..c4a9936b1 100644 --- a/proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml +++ b/proofs/hol_light/aarch64/proofs/mldsa_pointwise.ml @@ -26,14 +26,14 @@ let mldsa_pointwise_mc = define_assert_from_elf 0x72a07003; (* arm_MOVK W3 (word 896) 16 *) 0x4e040c61; (* arm_DUP_GEN Q1 X3 32 128 *) 0xd2800803; (* arm_MOV X3 (rvalue (word 64)) *) - 0x3dc00431; (* arm_LDR Q17 X1 (Immediate_Offset (word 16)) *) - 0x3dc00832; (* arm_LDR Q18 X1 (Immediate_Offset (word 32)) *) - 0x3dc00c33; (* arm_LDR Q19 X1 (Immediate_Offset (word 48)) *) - 0x3cc40430; (* arm_LDR Q16 X1 (Postimmediate_Offset (word 64)) *) - 0x3dc00455; (* arm_LDR Q21 X2 (Immediate_Offset (word 16)) *) - 0x3dc00856; (* arm_LDR Q22 X2 (Immediate_Offset (word 32)) *) - 0x3dc00c57; (* arm_LDR Q23 X2 (Immediate_Offset (word 48)) *) - 0x3cc40454; (* arm_LDR Q20 X2 (Postimmediate_Offset (word 64)) *) + 0x3dc00010; (* arm_LDR Q16 X0 (Immediate_Offset (word 0)) *) + 0x3dc00411; (* arm_LDR Q17 X0 (Immediate_Offset (word 16)) *) + 0x3dc00812; (* arm_LDR Q18 X0 (Immediate_Offset (word 32)) *) + 0x3dc00c13; (* arm_LDR Q19 X0 (Immediate_Offset (word 48)) *) + 0x3dc00435; (* arm_LDR Q21 X1 (Immediate_Offset (word 16)) *) + 0x3dc00836; (* arm_LDR Q22 X1 (Immediate_Offset (word 32)) *) + 0x3dc00c37; (* arm_LDR Q23 X1 (Immediate_Offset (word 48)) *) + 0x3cc40434; (* arm_LDR Q20 X1 (Postimmediate_Offset (word 64)) *) 0x0eb4c218; (* arm_SMULL_VEC Q24 Q16 Q20 32 *) 0x4eb4c219; (* arm_SMULL2_VEC Q25 Q16 Q20 32 *) 0x0eb5c23a; (* arm_SMULL_VEC Q26 Q17 Q21 32 *) @@ -79,14 +79,13 @@ let MLDSA_POINTWISE_EXEC = ARM_MK_EXEC_RULE mldsa_pointwise_mc;; (* ========================================================================= *) let MLDSA_POINTWISE_CORRECT = prove - (`!r a b x y pc. - nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (r, 1024) /\ - nonoverlapping (r, 1024) (a, 1024) /\ - nonoverlapping (r, 1024) (b, 1024) + (`!a b x y pc. + nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (a, 1024) /\ + nonoverlapping (a, 1024) (b, 1024) ==> ensures arm (\s. aligned_bytes_loaded s (word pc) mldsa_pointwise_mc /\ read PC s = word pc /\ - C_ARGUMENTS [r; a; b] s /\ + C_ARGUMENTS [a; b] s /\ (!i. i < 256 ==> abs(ival(x i)) <= &75423752) /\ (!i. i < 256 ==> abs(ival(y i)) <= &75423752) /\ (!i. i < 256 ==> @@ -95,16 +94,16 @@ let MLDSA_POINTWISE_CORRECT = prove read(memory :> bytes32(word_add b (word(4 * i)))) s = y i)) (\s. read PC s = word(pc + 0xC4) /\ (!i. i < 256 ==> - let zi = read(memory :> bytes32(word_add r (word(4 * i)))) s in + let zi = read(memory :> bytes32(word_add a (word(4 * i)))) s in (ival zi == mldsa_pointwise (ival o x) (ival o y) i) (mod &8380417) /\ abs(ival zi) <= &8380416)) (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, - MAYCHANGE [memory :> bytes(r, 1024)])`, + MAYCHANGE [memory :> bytes(a, 1024)])`, (* Setup *) MAP_EVERY X_GEN_TAC - [`r:int64`; `a:int64`; `b:int64`; + [`a:int64`; `b:int64`; `x:num->int32`; `y:num->int32`; `pc:num`] THEN REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; C_ARGUMENTS; NONOVERLAPPING_CLAUSES; ALL; @@ -211,15 +210,14 @@ let MLDSA_POINTWISE_CORRECT = prove (* ========================================================================= *) let MLDSA_POINTWISE_SUBROUTINE_CORRECT = prove - (`!r a b x y pc returnaddress. - nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (r, 1024) /\ - nonoverlapping (r, 1024) (a, 1024) /\ - nonoverlapping (r, 1024) (b, 1024) + (`!a b x y pc returnaddress. + nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (a, 1024) /\ + nonoverlapping (a, 1024) (b, 1024) ==> ensures arm (\s. aligned_bytes_loaded s (word pc) mldsa_pointwise_mc /\ read PC s = word pc /\ read X30 s = returnaddress /\ - C_ARGUMENTS [r; a; b] s /\ + C_ARGUMENTS [a; b] s /\ (!i. i < 256 ==> abs(ival(x i)) <= &75423752) /\ (!i. i < 256 ==> abs(ival(y i)) <= &75423752) /\ (!i. i < 256 ==> @@ -228,12 +226,12 @@ let MLDSA_POINTWISE_SUBROUTINE_CORRECT = prove read(memory :> bytes32(word_add b (word(4 * i)))) s = y i)) (\s. read PC s = returnaddress /\ (!i. i < 256 ==> - let zi = read(memory :> bytes32(word_add r (word(4 * i)))) s in + let zi = read(memory :> bytes32(word_add a (word(4 * i)))) s in (ival zi == mldsa_pointwise (ival o x) (ival o y) i) (mod &8380417) /\ abs(ival zi) <= &8380416)) (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, - MAYCHANGE [memory :> bytes(r, 1024)])`, + MAYCHANGE [memory :> bytes(a, 1024)])`, REWRITE_TAC[fst MLDSA_POINTWISE_EXEC] THEN ARM_ADD_RETURN_NOSTACK_TAC MLDSA_POINTWISE_EXEC (REWRITE_RULE[fst MLDSA_POINTWISE_EXEC] MLDSA_POINTWISE_CORRECT));; @@ -253,25 +251,24 @@ let full_spec,public_vars = mk_safety_spec let MLDSA_POINTWISE_SUBROUTINE_SAFE = time prove (`exists f_events. - forall e r a b pc returnaddress. - nonoverlapping (word pc,LENGTH mldsa_pointwise_mc) (r,1024) /\ - nonoverlapping (r,1024) (a,1024) /\ - nonoverlapping (r,1024) (b,1024) + forall e a b pc returnaddress. + nonoverlapping (word pc,LENGTH mldsa_pointwise_mc) (a,1024) /\ + nonoverlapping (a,1024) (b,1024) ==> ensures arm (\s. aligned_bytes_loaded s (word pc) mldsa_pointwise_mc /\ read PC s = word pc /\ read X30 s = returnaddress /\ - C_ARGUMENTS [r; a; b] s /\ + C_ARGUMENTS [a; b] s /\ read events s = e) (\s. read PC s = returnaddress /\ (exists e2. read events s = APPEND e2 e /\ - e2 = f_events a b r pc returnaddress /\ - memaccess_inbounds e2 [a,1024; b,1024; r,1024] - [r,1024])) + e2 = f_events b a pc returnaddress /\ + memaccess_inbounds e2 [a,1024; b,1024] + [a,1024])) (\s s'. true)`, ASSERT_CONCL_TAC full_spec THEN PROVE_SAFETY_SPEC_TAC ~public_vars:public_vars MLDSA_POINTWISE_EXEC);; diff --git a/proofs/hol_light/aarch64/proofs/subroutine_signatures.ml b/proofs/hol_light/aarch64/proofs/subroutine_signatures.ml index 125613025..d8f5344b8 100644 --- a/proofs/hol_light/aarch64/proofs/subroutine_signatures.ml +++ b/proofs/hol_light/aarch64/proofs/subroutine_signatures.ml @@ -30,18 +30,16 @@ let subroutine_signatures = [ ); ("mldsa_pointwise", ([(*args*) - ("r", "int32_t[static 256]", (*is const?*)"false"); - ("a", "int32_t[static 256]", (*is const?*)"true"); + ("a", "int32_t[static 256]", (*is const?*)"false"); ("b", "int32_t[static 256]", (*is const?*)"true"); ], "void", [(* input buffers *) ("a", "256"(* num elems *), 4(* elem bytesize *)); ("b", "256"(* num elems *), 4(* elem bytesize *)); - ("r", "256"(* num elems *), 4(* elem bytesize *)); ], [(* output buffers *) - ("r", "256"(* num elems *), 4(* elem bytesize *)); + ("a", "256"(* num elems *), 4(* elem bytesize *)); ], [(* temporary buffers *) ]) diff --git a/proofs/hol_light/x86_64/mldsa/mldsa_pointwise.S b/proofs/hol_light/x86_64/mldsa/mldsa_pointwise.S index 65044083c..0c352f6f6 100644 --- a/proofs/hol_light/x86_64/mldsa/mldsa_pointwise.S +++ b/proofs/hol_light/x86_64/mldsa/mldsa_pointwise.S @@ -35,17 +35,17 @@ PQCP_MLDSA_NATIVE_MLDSA44_pointwise_avx2: .cfi_startproc endbr64 - vmovdqa 0x20(%rcx), %ymm0 - vmovdqa (%rcx), %ymm1 + vmovdqa 0x20(%rdx), %ymm0 + vmovdqa (%rdx), %ymm1 xorl %eax, %eax Lpointwise_avx2_looptop1: - vmovdqa (%rsi), %ymm2 - vmovdqa 0x20(%rsi), %ymm4 - vmovdqa 0x40(%rsi), %ymm6 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 - vmovdqa 0x40(%rdx), %ymm14 + vmovdqa (%rdi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa (%rsi), %ymm10 + vmovdqa 0x20(%rsi), %ymm12 + vmovdqa 0x40(%rsi), %ymm14 vpsrlq $0x20, %ymm2, %ymm3 vpsrlq $0x20, %ymm4, %ymm5 vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7] @@ -87,14 +87,13 @@ Lpointwise_avx2_looptop1: vmovdqa %ymm6, 0x40(%rdi) addq $0x60, %rdi addq $0x60, %rsi - addq $0x60, %rdx addl $0x1, %eax cmpl $0xa, %eax jb Lpointwise_avx2_looptop1 - vmovdqa (%rsi), %ymm2 - vmovdqa 0x20(%rsi), %ymm4 - vmovdqa (%rdx), %ymm10 - vmovdqa 0x20(%rdx), %ymm12 + vmovdqa (%rdi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa (%rsi), %ymm10 + vmovdqa 0x20(%rsi), %ymm12 vpsrlq $0x20, %ymm2, %ymm3 vpsrlq $0x20, %ymm4, %ymm5 vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] diff --git a/proofs/hol_light/x86_64/proofs/mldsa_pointwise.ml b/proofs/hol_light/x86_64/proofs/mldsa_pointwise.ml index fd46b7632..1bf1d340c 100644 --- a/proofs/hol_light/x86_64/proofs/mldsa_pointwise.ml +++ b/proofs/hol_light/x86_64/proofs/mldsa_pointwise.ml @@ -20,20 +20,20 @@ let mldsa_pointwise_mc = define_assert_from_elf "mldsa_pointwise_mc" "x86_64/mld (*** BYTECODE START ***) [ 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) - 0xc5; 0xfd; 0x6f; 0x41; 0x20; - (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rcx,32))) *) - 0xc5; 0xfd; 0x6f; 0x09; (* VMOVDQA (%_% ymm1) (Memop Word256 (%% (rcx,0))) *) + 0xc5; 0xfd; 0x6f; 0x42; 0x20; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rdx,32))) *) + 0xc5; 0xfd; 0x6f; 0x0a; (* VMOVDQA (%_% ymm1) (Memop Word256 (%% (rdx,0))) *) 0x31; 0xc0; (* XOR (% eax) (% eax) *) - 0xc5; 0xfd; 0x6f; 0x16; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,0))) *) - 0xc5; 0xfd; 0x6f; 0x66; 0x20; - (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rsi,32))) *) - 0xc5; 0xfd; 0x6f; 0x76; 0x40; - (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rsi,64))) *) - 0xc5; 0x7d; 0x6f; 0x12; (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdx,0))) *) - 0xc5; 0x7d; 0x6f; 0x62; 0x20; - (* VMOVDQA (%_% ymm12) (Memop Word256 (%% (rdx,32))) *) - 0xc5; 0x7d; 0x6f; 0x72; 0x40; - (* VMOVDQA (%_% ymm14) (Memop Word256 (%% (rdx,64))) *) + 0xc5; 0xfd; 0x6f; 0x17; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0xfd; 0x6f; 0x67; 0x20; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xfd; 0x6f; 0x77; 0x40; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x16; (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rsi,0))) *) + 0xc5; 0x7d; 0x6f; 0x66; 0x20; + (* VMOVDQA (%_% ymm12) (Memop Word256 (%% (rsi,32))) *) + 0xc5; 0x7d; 0x6f; 0x76; 0x40; + (* VMOVDQA (%_% ymm14) (Memop Word256 (%% (rsi,64))) *) 0xc5; 0xe5; 0x73; 0xd2; 0x20; (* VPSRLQ (%_% ymm3) (%_% ymm2) (Imm8 (word 32)) *) 0xc5; 0xd5; 0x73; 0xd4; 0x20; @@ -111,17 +111,16 @@ let mldsa_pointwise_mc = define_assert_from_elf "mldsa_pointwise_mc" "x86_64/mld (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm6) *) 0x48; 0x83; 0xc7; 0x60; (* ADD (% rdi) (Imm8 (word 96)) *) 0x48; 0x83; 0xc6; 0x60; (* ADD (% rsi) (Imm8 (word 96)) *) - 0x48; 0x83; 0xc2; 0x60; (* ADD (% rdx) (Imm8 (word 96)) *) 0x83; 0xc0; 0x01; (* ADD (% eax) (Imm8 (word 1)) *) 0x83; 0xf8; 0x0a; (* CMP (% eax) (Imm8 (word 10)) *) - 0x0f; 0x82; 0x07; 0xff; 0xff; 0xff; - (* JB (Imm32 (word 4294967047)) *) - 0xc5; 0xfd; 0x6f; 0x16; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,0))) *) - 0xc5; 0xfd; 0x6f; 0x66; 0x20; - (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rsi,32))) *) - 0xc5; 0x7d; 0x6f; 0x12; (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdx,0))) *) - 0xc5; 0x7d; 0x6f; 0x62; 0x20; - (* VMOVDQA (%_% ymm12) (Memop Word256 (%% (rdx,32))) *) + 0x0f; 0x82; 0x0b; 0xff; 0xff; 0xff; + (* JB (Imm32 (word 4294967051)) *) + 0xc5; 0xfd; 0x6f; 0x17; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0xfd; 0x6f; 0x67; 0x20; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0x7d; 0x6f; 0x16; (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rsi,0))) *) + 0xc5; 0x7d; 0x6f; 0x66; 0x20; + (* VMOVDQA (%_% ymm12) (Memop Word256 (%% (rsi,32))) *) 0xc5; 0xe5; 0x73; 0xd2; 0x20; (* VPSRLQ (%_% ymm3) (%_% ymm2) (Imm8 (word 32)) *) 0xc5; 0xd5; 0x73; 0xd4; 0x20; @@ -185,25 +184,20 @@ let MLDSA_POINTWISE_TMC_EXEC = X86_MK_CORE_EXEC_RULE mldsa_pointwise_tmc;; (* ========================================================================= *) let MLDSA_POINTWISE_CORRECT = prove - (`!r a b consts x y pc. - aligned 32 r /\ + (`!a b consts x y pc. aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ - nonoverlapping (word pc, 0x0199) (r, 1024) /\ - nonoverlapping (word pc, 0x0199) (a, 1024) /\ - nonoverlapping (word pc, 0x0199) (b, 1024) /\ - nonoverlapping (word pc, 0x0199) (consts, 2496) /\ - nonoverlapping (r, 1024) (a, 1024) /\ - nonoverlapping (r, 1024) (b, 1024) /\ - nonoverlapping (r, 1024) (consts, 2496) /\ + nonoverlapping (word pc, 0x0195) (a, 1024) /\ + nonoverlapping (word pc, 0x0195) (b, 1024) /\ + nonoverlapping (word pc, 0x0195) (consts, 2496) /\ nonoverlapping (a, 1024) (b, 1024) /\ nonoverlapping (a, 1024) (consts, 2496) /\ nonoverlapping (b, 1024) (consts, 2496) ==> ensures x86 (\s. bytes_loaded s (word pc) (BUTLAST mldsa_pointwise_tmc) /\ read RIP s = word pc /\ - C_ARGUMENTS [r; a; b; consts] s /\ + C_ARGUMENTS [a; b; consts] s /\ wordlist_from_memory(consts,624) s = MAP (iword: int -> 32 word) mldsa_complete_qdata /\ (!i. i < 256 ==> abs(ival(x i)) <= &75423752) /\ @@ -212,9 +206,9 @@ let MLDSA_POINTWISE_CORRECT = prove read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ (!i. i < 256 ==> read(memory :> bytes32(word_add b (word(4 * i)))) s = y i)) - (\s. read RIP s = word(pc + 0x0198) /\ + (\s. read RIP s = word(pc + 0x0194) /\ (!i. i < 256 ==> - let zi = read(memory :> bytes32(word_add r (word(4 * i)))) s in + let zi = read(memory :> bytes32(word_add a (word(4 * i)))) s in (ival zi == mldsa_pointwise (ival o x) (ival o y) i) (mod &8380417) /\ abs(ival zi) <= &8380416)) @@ -222,11 +216,11 @@ let MLDSA_POINTWISE_CORRECT = prove MAYCHANGE [ZMM0; ZMM1; ZMM2; ZMM3; ZMM4; ZMM5; ZMM6; ZMM7; ZMM8; ZMM9; ZMM10; ZMM11; ZMM12; ZMM13; ZMM14; ZMM15] ,, MAYCHANGE [RAX] ,, MAYCHANGE SOME_FLAGS ,, - MAYCHANGE [memory :> bytes(r, 1024)])`, + MAYCHANGE [memory :> bytes(a, 1024)])`, (* Setup - strip quantifiers, introduce preconditions *) MAP_EVERY X_GEN_TAC - [`r:int64`; `a:int64`; `b:int64`; `consts:int64`; + [`a:int64`; `b:int64`; `consts:int64`; `x:num->int32`; `y:num->int32`; `pc:num`] THEN REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; C_ARGUMENTS; NONOVERLAPPING_CLAUSES; ALL] THEN @@ -255,13 +249,6 @@ let MLDSA_POINTWISE_CORRECT = prove GHOST_INTRO_TAC `init_ymm14:int256` `read YMM14` THEN GHOST_INTRO_TAC `init_ymm15:int256` `read YMM15` THEN - (* Ghost reads from output region r (32 x 256-bit = 1024 bytes) *) - MAP_EVERY (fun n -> - let vname = "init_r" ^ string_of_int n in - GHOST_INTRO_TAC (mk_var(vname, `:int256`)) - (subst[mk_small_numeral(32*n),`n:num`] - `read (memory :> bytes256(word_add r (word n)))`)) - (0--31) THEN ENSURES_INIT_TAC "s0" THEN (* Merge memory reads from array a *) @@ -314,17 +301,17 @@ let MLDSA_POINTWISE_CORRECT = prove CONV_TAC INT_REDUCE_CONV]; ALL_TAC] THEN - (* Execute all 543 instructions with SIMD simplification *) + (* Execute all 533 instructions with SIMD simplification *) MAP_EVERY (fun n -> X86_STEPS_TAC MLDSA_POINTWISE_TMC_EXEC [n] THEN SIMD_SIMPLIFY_TAC[mldsa_pointwise_montred]) - (1--543) THEN + (1--533) THEN ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN (* Split bytes256 -> bytes32 *) REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o CONV_RULE(READ_MEMORY_SPLIT_CONV 3) o - check (can (term_match [] `read qqq s543:int256 = xxx`) o concl))) THEN + check (can (term_match [] `read qqq s533:int256 = xxx`) o concl))) THEN (* Expand output cases, substitute, collapse subwords, fold *) CONV_TAC(TOP_DEPTH_CONV EXPAND_CASES_CONV) THEN @@ -383,22 +370,16 @@ let MLDSA_POINTWISE_CORRECT = prove (* ========================================================================= *) let MLDSA_POINTWISE_NOIBT_SUBROUTINE_CORRECT = prove - (`!r a b consts x y pc stackpointer returnaddress. - aligned 32 r /\ + (`!a b consts x y pc stackpointer returnaddress. aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ - nonoverlapping (word pc,LENGTH mldsa_pointwise_tmc) (r, 1024) /\ nonoverlapping (word pc,LENGTH mldsa_pointwise_tmc) (a, 1024) /\ nonoverlapping (word pc,LENGTH mldsa_pointwise_tmc) (b, 1024) /\ nonoverlapping (word pc,LENGTH mldsa_pointwise_tmc) (consts, 2496) /\ - nonoverlapping (r, 1024) (a, 1024) /\ - nonoverlapping (r, 1024) (b, 1024) /\ - nonoverlapping (r, 1024) (consts, 2496) /\ nonoverlapping (a, 1024) (b, 1024) /\ nonoverlapping (a, 1024) (consts, 2496) /\ nonoverlapping (b, 1024) (consts, 2496) /\ - nonoverlapping (stackpointer, 8) (r, 1024) /\ nonoverlapping (stackpointer, 8) (a, 1024) /\ nonoverlapping (stackpointer, 8) (b, 1024) /\ nonoverlapping (stackpointer, 8) (consts, 2496) @@ -407,7 +388,7 @@ let MLDSA_POINTWISE_NOIBT_SUBROUTINE_CORRECT = prove read RIP s = word pc /\ read RSP s = stackpointer /\ read (memory :> bytes64 stackpointer) s = returnaddress /\ - C_ARGUMENTS [r; a; b; consts] s /\ + C_ARGUMENTS [a; b; consts] s /\ wordlist_from_memory(consts,624) s = MAP (iword: int -> 32 word) mldsa_complete_qdata /\ (!i. i < 256 ==> abs(ival(x i)) <= &75423752) /\ @@ -419,34 +400,28 @@ let MLDSA_POINTWISE_NOIBT_SUBROUTINE_CORRECT = prove (\s. read RIP s = returnaddress /\ read RSP s = word_add stackpointer (word 8) /\ (!i. i < 256 ==> - let zi = read(memory :> bytes32(word_add r (word(4 * i)))) s in + let zi = read(memory :> bytes32(word_add a (word(4 * i)))) s in (ival zi == mldsa_pointwise (ival o x) (ival o y) i) (mod &8380417) /\ abs(ival zi) <= &8380416)) (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, - MAYCHANGE [memory :> bytes(r, 1024)])`, + MAYCHANGE [memory :> bytes(a, 1024)])`, let TWEAK_CONV = ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV in CONV_TAC TWEAK_CONV THEN X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_pointwise_tmc (CONV_RULE TWEAK_CONV MLDSA_POINTWISE_CORRECT));; let MLDSA_POINTWISE_SUBROUTINE_CORRECT = prove - (`!r a b consts x y pc stackpointer returnaddress. - aligned 32 r /\ + (`!a b consts x y pc stackpointer returnaddress. aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ - nonoverlapping (word pc,LENGTH mldsa_pointwise_mc) (r, 1024) /\ nonoverlapping (word pc,LENGTH mldsa_pointwise_mc) (a, 1024) /\ nonoverlapping (word pc,LENGTH mldsa_pointwise_mc) (b, 1024) /\ nonoverlapping (word pc,LENGTH mldsa_pointwise_mc) (consts, 2496) /\ - nonoverlapping (r, 1024) (a, 1024) /\ - nonoverlapping (r, 1024) (b, 1024) /\ - nonoverlapping (r, 1024) (consts, 2496) /\ nonoverlapping (a, 1024) (b, 1024) /\ nonoverlapping (a, 1024) (consts, 2496) /\ nonoverlapping (b, 1024) (consts, 2496) /\ - nonoverlapping (stackpointer, 8) (r, 1024) /\ nonoverlapping (stackpointer, 8) (a, 1024) /\ nonoverlapping (stackpointer, 8) (b, 1024) /\ nonoverlapping (stackpointer, 8) (consts, 2496) @@ -455,7 +430,7 @@ let MLDSA_POINTWISE_SUBROUTINE_CORRECT = prove read RIP s = word pc /\ read RSP s = stackpointer /\ read (memory :> bytes64 stackpointer) s = returnaddress /\ - C_ARGUMENTS [r; a; b; consts] s /\ + C_ARGUMENTS [a; b; consts] s /\ wordlist_from_memory(consts,624) s = MAP (iword: int -> 32 word) mldsa_complete_qdata /\ (!i. i < 256 ==> abs(ival(x i)) <= &75423752) /\ @@ -467,12 +442,12 @@ let MLDSA_POINTWISE_SUBROUTINE_CORRECT = prove (\s. read RIP s = returnaddress /\ read RSP s = word_add stackpointer (word 8) /\ (!i. i < 256 ==> - let zi = read(memory :> bytes32(word_add r (word(4 * i)))) s in + let zi = read(memory :> bytes32(word_add a (word(4 * i)))) s in (ival zi == mldsa_pointwise (ival o x) (ival o y) i) (mod &8380417) /\ abs(ival zi) <= &8380416)) (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, - MAYCHANGE [memory :> bytes(r, 1024)])`, + MAYCHANGE [memory :> bytes(a, 1024)])`, let TWEAK_CONV = ONCE_DEPTH_CONV WORDLIST_FROM_MEMORY_CONV in CONV_TAC TWEAK_CONV THEN MATCH_ACCEPT_TAC(ADD_IBT_RULE @@ -501,16 +476,13 @@ let MLDSA_POINTWISE_SAFE = let MLDSA_POINTWISE_NOIBT_SUBROUTINE_SAFE = time prove (`exists f_events. - forall e r a b consts pc stackpointer returnaddress. - aligned 32 r /\ aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ - nonoverlapping (word pc, LENGTH mldsa_pointwise_tmc) (r, 1024) /\ + forall e a b consts pc stackpointer returnaddress. + aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ nonoverlapping (word pc, LENGTH mldsa_pointwise_tmc) (a, 1024) /\ nonoverlapping (word pc, LENGTH mldsa_pointwise_tmc) (b, 1024) /\ nonoverlapping (word pc, LENGTH mldsa_pointwise_tmc) (consts, 2496) /\ - nonoverlapping (r, 1024) (a, 1024) /\ nonoverlapping (r, 1024) (b, 1024) /\ - nonoverlapping (r, 1024) (consts, 2496) /\ nonoverlapping (a, 1024) (b, 1024) /\ + nonoverlapping (a, 1024) (b, 1024) /\ nonoverlapping (a, 1024) (consts, 2496) /\ nonoverlapping (b, 1024) (consts, 2496) /\ - nonoverlapping (stackpointer, 8) (r, 1024) /\ nonoverlapping (stackpointer, 8) (a, 1024) /\ nonoverlapping (stackpointer, 8) (b, 1024) /\ nonoverlapping (stackpointer, 8) (consts, 2496) @@ -518,30 +490,27 @@ let MLDSA_POINTWISE_NOIBT_SUBROUTINE_SAFE = time prove (\s. bytes_loaded s (word pc) mldsa_pointwise_tmc /\ read RIP s = word pc /\ read RSP s = stackpointer /\ read (memory :> bytes64 stackpointer) s = returnaddress /\ - C_ARGUMENTS [r; a; b; consts] s /\ read events s = e) + C_ARGUMENTS [a; b; consts] s /\ read events s = e) (\s. read RIP s = returnaddress /\ read RSP s = word_add stackpointer (word 8) /\ (exists e2. read events s = APPEND e2 e /\ - e2 = f_events r a b consts pc stackpointer returnaddress /\ + e2 = f_events b consts a pc stackpointer returnaddress /\ memaccess_inbounds e2 - [a,1024; b,1024; consts,2496; r,1024; stackpointer,8] - [r,1024; stackpointer,8])) + [a,1024; b,1024; consts,2496; stackpointer,8] + [a,1024; stackpointer,8])) (\s s'. true)`, X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_pointwise_tmc MLDSA_POINTWISE_SAFE THEN DISCHARGE_SAFETY_PROPERTY_TAC);; let MLDSA_POINTWISE_SUBROUTINE_SAFE = time prove (`exists f_events. - forall e r a b consts pc stackpointer returnaddress. - aligned 32 r /\ aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ - nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (r, 1024) /\ + forall e a b consts pc stackpointer returnaddress. + aligned 32 a /\ aligned 32 b /\ aligned 32 consts /\ nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (a, 1024) /\ nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (b, 1024) /\ nonoverlapping (word pc, LENGTH mldsa_pointwise_mc) (consts, 2496) /\ - nonoverlapping (r, 1024) (a, 1024) /\ nonoverlapping (r, 1024) (b, 1024) /\ - nonoverlapping (r, 1024) (consts, 2496) /\ nonoverlapping (a, 1024) (b, 1024) /\ + nonoverlapping (a, 1024) (b, 1024) /\ nonoverlapping (a, 1024) (consts, 2496) /\ nonoverlapping (b, 1024) (consts, 2496) /\ - nonoverlapping (stackpointer, 8) (r, 1024) /\ nonoverlapping (stackpointer, 8) (a, 1024) /\ nonoverlapping (stackpointer, 8) (b, 1024) /\ nonoverlapping (stackpointer, 8) (consts, 2496) @@ -549,13 +518,13 @@ let MLDSA_POINTWISE_SUBROUTINE_SAFE = time prove (\s. bytes_loaded s (word pc) mldsa_pointwise_mc /\ read RIP s = word pc /\ read RSP s = stackpointer /\ read (memory :> bytes64 stackpointer) s = returnaddress /\ - C_ARGUMENTS [r; a; b; consts] s /\ read events s = e) + C_ARGUMENTS [a; b; consts] s /\ read events s = e) (\s. read RIP s = returnaddress /\ read RSP s = word_add stackpointer (word 8) /\ (exists e2. read events s = APPEND e2 e /\ - e2 = f_events r a b consts pc stackpointer returnaddress /\ + e2 = f_events b consts a pc stackpointer returnaddress /\ memaccess_inbounds e2 - [a,1024; b,1024; consts,2496; r,1024; stackpointer,8] - [r,1024; stackpointer,8])) + [a,1024; b,1024; consts,2496; stackpointer,8] + [a,1024; stackpointer,8])) (\s s'. true)`, MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_POINTWISE_NOIBT_SUBROUTINE_SAFE));; diff --git a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml index c790d6e66..0668457c9 100644 --- a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml +++ b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml @@ -12,8 +12,7 @@ let subroutine_signatures = [ ("mldsa_pointwise_x86", ([(*args*) - ("c", "int32_t[static 256]", (*is const?*)"false"); - ("a", "int32_t[static 256]", (*is const?*)"true"); + ("a", "int32_t[static 256]", (*is const?*)"false"); ("b", "int32_t[static 256]", (*is const?*)"true"); ("qdata", "int32_t[static 624]", (*is const?*)"true"); ], @@ -24,7 +23,7 @@ let subroutine_signatures = [ ("qdata", "624"(* num elems *), 4(* elem bytesize *)); ], [(* output buffers *) - ("c", "256"(* num elems *), 4(* elem bytesize *)); + ("a", "256"(* num elems *), 4(* elem bytesize *)); ], [(* temporary buffers *) ]) diff --git a/test/src/test_unit.c b/test/src/test_unit.c index 31db1ba50..b3f531153 100644 --- a/test/src/test_unit.c +++ b/test/src/test_unit.c @@ -48,8 +48,7 @@ void mld_poly_use_hint_c(mld_poly *b, const mld_poly *a, const mld_poly *h); #endif uint32_t mld_poly_chknorm_c(const mld_poly *a, int32_t B); #if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) -void mld_poly_pointwise_montgomery_c(mld_poly *c, const mld_poly *a, - const mld_poly *b); +void mld_poly_pointwise_montgomery_c(mld_poly *a, const mld_poly *b); #endif void mld_polyvecl_pointwise_acc_montgomery_c(mld_poly *w, const mld_polyvecl *u, const mld_polyvecl *v); @@ -536,8 +535,10 @@ static int test_poly_pointwise_montgomery_core(const mld_poly *poly_a, { mld_poly test_poly_c, ref_poly_c; - mld_poly_pointwise_montgomery(&test_poly_c, poly_a, poly_b); - mld_poly_pointwise_montgomery_c(&ref_poly_c, poly_a, poly_b); + test_poly_c = *poly_a; + ref_poly_c = *poly_a; + mld_poly_pointwise_montgomery(&test_poly_c, poly_b); + mld_poly_pointwise_montgomery_c(&ref_poly_c, poly_b); CHECK(compare_i32_arrays(test_poly_c.coeffs, ref_poly_c.coeffs, MLDSA_N, test_name, poly_a->coeffs));